In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import time
from scipy.stats import itemfreq
import random
import os.path

import theano
import lasagne
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.externals import joblib
from sklearn.svm import SVC

from nolearn.lasagne import visualize

# import user defined load_data to build input data
from load_data import Data
from utils import save_network
from model_predictions import build_cnn
from model_predictions import generate_features
from model_predictions import extract_features

# Set this to your own base directory. It must contain three
# sub-directories: model, data, word2vec.
# Keep the trailing slash: the paths used in this notebook are built by
# plain string concatenation onto FILE_PATH.
FILE_PATH = '../files/'

### Use Sports data

In [None]:
# Sport name -> integer id (presumably the value that ends up in the
# 'class' column -- TODO confirm in load_data.Data)
sports_dic = {
    'basketball': 1,
    'hockey': 2,
    'baseball': 3,
    'tennis': 4,
    'volleyball': 5,
}
sp_data = Data(sports_dic, FILE_PATH)

# Load the data
sp_df = sp_data.csv_df(['text'])

# Pre-process the data; the hashtag of each sport ('#basketball', ...) is
# handed over as rm_list. The return value is not used, so pre_process
# presumably modifies sp_df in place -- TODO confirm.
rm_hashtags = ['#' + sport for sport in sports_dic]
sp_data.pre_process(sp_df, rm_list=rm_hashtags)

# Write the processed frame to csv and preview the first rows
sp_df.to_csv(FILE_PATH + 'data/all_sports.csv', index=False)
sp_df.head()

In [None]:
# If you want to save the processed file to csv under a second name.
# The path is built from FILE_PATH (instead of a hard-coded '../files/') so
# that it follows the base directory configured at the top of the notebook.
sp_df.to_csv(FILE_PATH + 'data/all_sports_new.csv', index=False)

In [None]:
# let's take a look at the number of rows per class
sp_df['class'].value_counts()

In [None]:
# uncomment the lines below if you want to balance the classes

# sp_df = sp_data.balance_class(sp_df)
# # and check again
# sp_df['class'].value_counts()

In [None]:
# train or load the model (requested with size=600; presumably the word-vector
# model, given the method name -- see load_data.Data.build_wordvec).
# The returned object is what convert2vec receives as `model` further down.
model = sp_data.build_wordvec(size=600, verbose=False)

In [None]:
# Name passed to both the conversion and the save step
vec_name = 'sports-600'

# Length of the longest sentence in our data; this decides the padding
max_len = sp_data.max_len(sp_df)

# Convert the sports data to vectors with the model built above, then save it
data = sp_data.convert2vec(sp_df, max_len, model, name=vec_name)
# data = sp_data.standarize(data)  # disabled; originally written for the airline data
sp_data.save_vec(data, name=vec_name)

In [None]:
# you can uncomment this to check if the wordvec makes sense
# model.wv.most_similar(positive=['woman', 'king'], negative=['man'])

### Create the data that gets fed into the classifier

In [None]:
# Read the dimensions from the first and the last two axes rather than
# unpacking data.shape into three names: `data` is reassigned to a 4-D array
# below, so a three-way unpack would raise ValueError when this cell is re-run.
N = data.shape[0]
M, D = data.shape[-2:]
# Single-argument print(...) produces the same output on Python 2 and 3.
print("N, M, D: {} {} {}".format(N, M, D))

# Insert a singleton axis -> (N, 1, M, D) and cast to theano's configured
# float type ("theano needs this way", per the original note).
# On a re-run the array already has this shape and the reshape is a no-op.
data = data.reshape(-1, 1, M, D).astype(theano.config.floatX)

# int8 labels shifted down by one ("seems like theano also needs this");
# presumably this turns class ids starting at 1 into ids starting at 0.
label = np.int8(sp_df['class']) - 1
print(data.shape)
print(label.shape)

In [None]:
# train our model or load model if it exists
def train_cnn(net, X_train, y_train, model_name='nn_cnn001'):
    """Restore saved parameters into ``net`` or, failing that, train it.

    The parameter file is looked up at ``FILE_PATH + 'model/' + model_name``.
    If that file exists its contents are loaded into ``net`` and no training
    takes place. Otherwise ``net`` is fitted on the given data and its
    parameters are written to that path.

    Args:
        net: network object providing ``fit``, ``save_params_to`` and
            ``load_params_from``.
        X_train: training inputs, forwarded unchanged to ``net.fit``.
        y_train: training targets, forwarded unchanged to ``net.fit``.
        model_name: file name of the parameter file inside the model directory.

    Returns:
        None. ``net`` is modified in place.
    """
    params_path = FILE_PATH + 'model/' + model_name

    if not os.path.isfile(params_path):
        # Nothing saved yet: train the network and keep the result on disk
        net.fit(X_train, y_train)
        net.save_params_to(params_path)
        return

    print ("Loading existing model ...")
    net.load_params_from(params_path)

In [None]:
# Build the network from the input dimensions M and D
cnn = build_cnn(M, D)
# Train it on the full data set, or load its saved parameters if they exist
train_cnn(cnn, X_train=data, y_train=label)

In [None]:
# Plot the weights of the network layer registered under the name 'conv2d1'
visualize.plot_conv_weights(cnn.layers_['conv2d1'])

In [None]:
# now transfer to svm
# stratified cross-validation: 5 shuffled splits, 25% held out in each
#
# NOTE(review): `cnn` was trained on (or loaded for) the whole of `data` in an
# earlier cell, so the validation samples of every split may already have
# been seen by the feature extractor; the accuracies below could therefore be
# optimistic -- TODO confirm.
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.25, random_state=0)
cv_scores = []  # validation accuracy of every split, in order
# enumerate(..., 1) replaces the hand-maintained split counter
for n_cv, (train_index, val_index) in enumerate(sss.split(data, label), 1):
    t1 = time.time()
    data_train, data_val = data[train_index], data[val_index]
    label_train, label_val = label[train_index], label[val_index]

    # Column 1 of itemfreq's result holds the count of each distinct label.
    # Single-argument print(...) gives the same output on Python 2 and 3.
    freq_train = itemfreq(label_train)
    print("train freq {}".format(freq_train[:, 1]))
    freq_val = itemfreq(label_val)
    print("val freq {}".format(freq_val[:, 1]))

    # pass through cnn, then fit an SVM on the extracted features
    extract_train = extract_features(cnn, data_train)
    extract_val = extract_features(cnn, data_val)
    clf = SVC(verbose=True, random_state=None)
    print("Training cv {} ...".format(n_cv))
    clf.fit(extract_train, label_train)
    acc = clf.score(extract_val, label_val)
    cv_scores.append(acc)
    t2 = time.time()

    print(acc)
    print("\n")
    print("Time took: {0:.2f} min".format((t2 - t1) / 60))

# Summarise the run instead of leaving only the per-split numbers
print("Mean accuracy over {} splits: {:.4f}".format(len(cv_scores), np.mean(cv_scores)))
    

In [None]:
# save model
# Note: `clf` is whatever the cross-validation loop assigned last, i.e. the
# SVM fitted on the training part of the final split, not on the full data.
# The file is written directly under FILE_PATH (not into the model/ directory).
joblib.dump(clf, FILE_PATH+'svm-final.pkl')