In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import time
from scipy.stats import itemfreq
import random
import os.path

import theano
import theano.tensor as T
import lasagne
from sklearn.model_selection import train_test_split

from nolearn.lasagne import NeuralNet
from nolearn.lasagne import visualize
from nolearn.lasagne import BatchIterator
from nolearn.lasagne import TrainSplit

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from lasagne import layers
from lasagne.updates import nesterov_momentum

# import user defined load_data to build input data
from load_data import Data
from utils import save_network

# Root directory of the project data. It must contain two sub-directories:
# `data` and `word2vec`.
# Set the TWITTER_SENTIMENT_PATH environment variable to point at your own
# copy instead of editing the notebook; the original author's location is
# kept as the fallback.
# os.path.join(..., '') guarantees the trailing separator that the
# `FILE_PATH + <file name>` concatenations in this notebook rely on.
FILE_PATH = os.path.join(
    os.environ.get('TWITTER_SENTIMENT_PATH', '/home/sam/Hhd/twitter_sentiment/'),
    '')
# FILE_PATH = '/home/sam/Data/twitter_sentiment/'

Using cuDNN version 5105 on context None
Mapped name None to device cuda: GeForce GTX 1080 (0000:01:00.0)


### Use Airline data

In [2]:
# Point the project's Data helper at the airline CSV located under FILE_PATH.
airline_data = Data('Airline-Sentiment-2-w-AA.csv', FILE_PATH)
# Keep only the sentiment label and the raw tweet text.
airline_df = airline_data.csv_df(['airline_sentiment', 'text']) # load data
# pre_process reports that it changes the dataframe in place, which is why its
# return value is not assigned; the preview below shows a new 'tokenized' column.
airline_data.pre_process(airline_df) # pre-process data
# drop neutral
# airline_df = airline_data.drop_value(airline_df, 'airline_sentiment', 'neutral')
airline_df.head()

Loading csv: Airline-Sentiment-2-w-AA.csv ...
Note: pre-process changes the dataframe inplace.


Unnamed: 0,airline_sentiment,text,tokenized
0,neutral,What said,[said]
1,positive,plus youve added commercials to the experienc...,"[plus, youve, added, commercials, experience, ..."
2,neutral,I didnt today Must mean I need to take anothe...,"[didnt, today, must, mean, need, take, another..."
3,negative,its really aggressive to blast obnoxious ente...,"[really, aggressive, blast, obnoxious, enterta..."
4,negative,and its a really big bad thing about it,"[really, big, bad, thing]"


In [3]:
# Convert the categorical sentiment value to an integer class id.
# The ids are 1-based here; they are shifted to 0-based later, when the label
# array for the classifier is built.
# Two-class variant (presumably for use together with the "drop neutral" step):
# class_label = {'positive': 1, 'negative': 2}
class_label = {'positive': 1, 'neutral': 2, 'negative': 3}

# NOTE(review): the preview below no longer shows 'airline_sentiment', so
# cat2num appears to replace that column with 'class'. If so, this cell cannot
# be re-run without re-running the loading cell first -- confirm.
airline_df = airline_data.cat2num(airline_df, 'airline_sentiment', class_label, 'class')
airline_df.head()

class
Done converting categorical to numeric, this changes df.


Unnamed: 0,text,tokenized,class
0,What said,[said],2
1,plus youve added commercials to the experienc...,"[plus, youve, added, commercials, experience, ...",1
2,I didnt today Must mean I need to take anothe...,"[didnt, today, must, mean, need, take, another...",2
3,its really aggressive to blast obnoxious ente...,"[really, aggressive, blast, obnoxious, enterta...",3
4,and its a really big bad thing about it,"[really, big, bad, thing]",3


In [4]:
# let's take a look at the class distribution (number of tweets per class id)
airline_df['class'].value_counts()

3    9178
2    3099
1    2363
Name: class, dtype: int64

In [5]:
# Comment out the next line if you do not want class balancing.
# NOTE: this rebinds airline_df, so the unbalanced frame is no longer available
# afterwards. In the recorded run class 3 shrank from 9178 to 3099 rows while
# classes 1 and 2 kept their original counts.
airline_df = airline_data.balance_class(airline_df)
# and check the class distribution again
airline_df['class'].value_counts()

3    3099
2    3099
1    2363
Name: class, dtype: int64

In [6]:
# Preview the first rows of the frame after class balancing.
airline_df.head()

Unnamed: 0,text,tokenized,class
0,What said,[said],2
1,plus youve added commercials to the experienc...,"[plus, youve, added, commercials, experience, ...",1
2,I didnt today Must mean I need to take anothe...,"[didnt, today, must, mean, need, take, another...",2
3,seriously would pay a flight for seats that ...,"[seriously, would, pay, flight, seats, didnt, ...",3
4,yes nearly every time I fly VX this ear worm ...,"[yes, nearly, every, time, fly, vx, ear, worm,...",1


In [7]:
# Train the word-vector model, or load it when a saved one already exists
# (the recorded run loaded tweets600.model.bin).
# size=600 -- presumably the word-vector dimensionality; it matches the last
# dimension (D) of the vectorised data printed further down.
model = airline_data.build_wordvec(size=600, verbose=False)

Loading existing model tweets600.model.bin ...
Done building.


In [8]:
# max_len is the maximum sentence length in our data; it decides the padding
max_len = airline_data.max_len(airline_df)
# convert our airline data to vectors using the word-vector model
# (the recorded run loaded an existing .npy file instead of recomputing)
data = airline_data.convert2vec(airline_df, max_len, model, name='airline-3class-600')
#data = airline_data.standarize(data)
# store the vectors under the same name; the recorded run reported that the
# .npy file already existed
airline_data.save_vec(data, name='airline-3class-600')

max sentence length is:  19
npy already exists, loading ...
Done loading npy file.
npy already exists.


In [9]:
# you can uncomment this to check if the wordvec makes sense
# model.wv.most_similar(positive=['woman', 'king'], negative=['man'])

### Create the data that is fed into the classifier

In [10]:
# `data` arrives as a 3-D array (N, M, D) and leaves this cell as a 4-D array
# (N, 1, M, D). Reading the dimensions from both ends of the shape keeps the
# cell re-runnable: a plain `N, M, D = data.shape` fails on a second run
# because `data` has already been reshaped to four dimensions.
N, M, D = data.shape[0], data.shape[-2], data.shape[-1]
print("N, M, D: {} {} {}".format(N, M, D))
# Insert a singleton second axis so the array matches the network's
# input_shape=(None, 1, M, D), and cast to theano's configured float type.
data = data.reshape(-1, 1, M, D).astype(theano.config.floatX) # theano needs this way
label = airline_df['class']
# Shift the 1-based class ids to 0-based int8 labels.
label = np.int8(label) - 1 # seems like theano also needs this
print(data.shape)
print(label.shape)

N, M, D: 8561 19 600
(8561, 1, 19, 600)
(8561,)


In [11]:
def build_cnn(num_epochs=20):
    """Assemble the untrained convolutional sentiment classifier.

    The stack is: input -> conv 3x3 -> max-pool 2x2 -> conv 3x3 ->
    max-pool 2x2 -> dropout -> dense (5000 units, ReLU) -> dropout ->
    3-unit softmax output. The input shape ``(None, 1, M, D)`` is taken
    from the notebook-level globals ``M`` and ``D``.

    Parameters
    ----------
    num_epochs : int, optional
        Forwarded to ``NeuralNet`` as ``max_epochs`` (default 20).

    Returns
    -------
    NeuralNet
        The configured nolearn network; ``fit`` has not been called on it.
    """
    relu = lasagne.nonlinearities.rectify

    # Layer order; each name is the prefix of that layer's keyword settings.
    architecture = [
        ('input', layers.InputLayer),
        ('conv2d1', layers.Conv2DLayer),
        ('maxpool1', layers.MaxPool2DLayer),
        ('conv2d2', layers.Conv2DLayer),
        ('maxpool2', layers.MaxPool2DLayer),
        ('dropout1', layers.DropoutLayer),
        ('dense', layers.DenseLayer),
        ('dropout2', layers.DropoutLayer),
        ('output', layers.DenseLayer),
    ]

    # Per-layer hyper-parameters.
    layer_settings = {
        'input_shape': (None, 1, M, D),
        # first convolution / pooling stage
        'conv2d1_num_filters': 50,
        'conv2d1_filter_size': (3, 3),
        'conv2d1_nonlinearity': relu,
        'conv2d1_W': lasagne.init.GlorotUniform(),
        'conv2d1_stride': 1,
        'conv2d1_pad': 1,
        'conv2d1_untie_biases': True,
        'maxpool1_pool_size': (2, 2),
        # second convolution / pooling stage
        'conv2d2_num_filters': 50,
        'conv2d2_filter_size': (3, 3),
        'conv2d2_nonlinearity': relu,
        'conv2d2_stride': 1,
        'conv2d2_pad': 1,
        'conv2d2_untie_biases': True,
        'maxpool2_pool_size': (2, 2),
        # classifier head
        'dropout1_p': 0.5,
        'dense_num_units': 5000,
        'dense_nonlinearity': relu,
        'dropout2_p': 0.5,
        'output_nonlinearity': lasagne.nonlinearities.softmax,
        'output_num_units': 3,
    }

    # Optimisation method and its parameters.
    optimiser_settings = {
        'update': nesterov_momentum,
        'update_learning_rate': 0.01,
        'update_momentum': 0.9,
    }

    # Training loop options: validation split, mini-batching, epoch budget.
    training_settings = {
        'train_split': TrainSplit(0.2, stratify=True),
        'batch_iterator_train': BatchIterator(batch_size=50),
        'batch_iterator_test': BatchIterator(batch_size=50),
        'max_epochs': num_epochs,
        'verbose': 1,
    }

    settings = {}
    for group in (layer_settings, optimiser_settings, training_settings):
        settings.update(group)

    return NeuralNet(layers=architecture, **settings)

In [None]:
def train_cnn(net, X_train, y_train, model_name='nn_cnn'):
    """Fit ``net`` on the training data, or restore previously saved weights.

    A parameter file is looked up at ``FILE_PATH + model_name``. When it
    exists, its contents are loaded into ``net`` and no training takes place.
    Otherwise ``net`` is fitted on ``(X_train, y_train)`` and its parameters
    are written to that file.

    Parameters
    ----------
    net : object
        Network exposing ``fit``, ``load_params_from`` and ``save_params_to``
        (e.g. the nolearn ``NeuralNet`` returned by ``build_cnn``).
    X_train, y_train
        Training inputs and labels, passed unchanged to ``net.fit``.
    model_name : str, optional
        File name of the parameter file under ``FILE_PATH``. The default
        ``'nn_cnn'`` is the file name that was previously hard-coded.

    Returns
    -------
    None
        ``net`` itself is loaded or fitted; nothing is returned.
    """
    model_file = FILE_PATH + model_name
    if os.path.isfile(model_file):
        print("Loading existing model ...")
        net.load_params_from(model_file)
    else:
        # Train the network, then save its parameters so later runs can
        # load them instead of retraining.
        net.fit(X_train, y_train)
        net.save_params_to(model_file)
    # Optional evaluation on a held-out set (needs X_test / y_test):
#     preds = net.predict(X_test)
#     cm = confusion_matrix(y_test, preds)
#     plt.matshow(cm)
#     plt.title('Confusion matrix')
#     plt.colorbar()
#     plt.ylabel('True label')
#     plt.xlabel('Predicted label')
#     plt.show()

In [None]:
# initialize: build the untrained network with the default number of epochs
cnn= build_cnn()
# train on the whole vectorised data set, or load saved parameters when
# train_cnn finds them (the validation split is configured inside build_cnn)
train_cnn(cnn, data, label)

# Neural Network with 150747953 learnable parameters

## Layer information

  #  name      size
---  --------  ---------
  0  input     1x19x600
  1  conv2d1   50x19x600
  2  maxpool1  50x9x300
  3  conv2d2   50x9x300
  4  maxpool2  50x4x150
  5  dropout1  50x4x150
  6  dense     5000
  7  dropout2  5000
  8  output    3

  epoch    trn loss    val loss    trn/val    valid acc  dur
-------  ----------  ----------  ---------  -----------  -----
      1     [36m1.05326[0m     [32m1.09896[0m    0.95842      0.42207  8.16s
      2     [36m1.02418[0m     [32m1.09531[0m    0.93506      0.44250  8.01s
      3     [36m1.01622[0m     [32m1.08716[0m    0.93474      0.44250  7.91s
      4     [36m1.01129[0m     [32m1.08490[0m    0.93215      0.44250  7.91s
      5     [36m1.00323[0m     [32m1.07581[0m    0.93253      0.44542  7.91s
      6     [36m0.99060[0m     [32m1.06293[0m    0.93195      0.46118  7.90s
      7     [36m0.96132[0m     [32m1.02863[0m    0.93456      

In [None]:
# Plot the weights of the network's first convolutional layer ('conv2d1').
visualize.plot_conv_weights(cnn.layers_['conv2d1'])

In [None]:
# now transfer to svm
# Symbolic expressions for the activations of the network's 'dense' and
# 'output' layers. deterministic=True asks Lasagne for the non-stochastic
# forward pass (i.e. with the dropout layers disabled).
dense_layer = layers.get_output(cnn.layers_['dense'], deterministic=True)
output_layer = layers.get_output(cnn.layers_['output'], deterministic=True)
# The network's symbolic input; it is the single argument of both compiled
# functions below.
input_var = cnn.layers_['input'].input_var

# Compile callables that map an input batch to the softmax output (f_output)
# and to the dense-layer activations (f_dense, used for feature extraction).
f_output = theano.function([input_var], output_layer)
f_dense = theano.function([input_var], dense_layer)

In [None]:
def extract_features(input_data):
    """Run every sample through ``f_dense`` and collect the flattened outputs.

    Parameters
    ----------
    input_data : numpy.ndarray
        4-D array laid out as ``(n0, 1, n2, n3)``; iterating over it yields
        one sample at a time.

    Returns
    -------
    list of numpy.ndarray
        One flattened ``f_dense`` output per sample, in input order.
    """
    # Single-argument print(...) gives the same output on Python 2 and is
    # valid syntax on Python 3 (the bare print statement is not).
    print("Extracting ... ")
    # input_data: n0, 1, n2, n3
    shape = input_data.shape
    features = []
    for sample in input_data:
        # f_dense is called with a batch of exactly one sample.
        single = sample.reshape(1, 1, shape[2], shape[3])
        features.append(f_dense(single).flatten())
    return features

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.svm import SVC

# Stratified 80/20 hold-out split.
# train_test_split cannot be used here: it needs the arrays as arguments and
# returns the split arrays directly, so the object it returns has no .split().
# StratifiedShuffleSplit yields (train_index, val_index) pairs and keeps the
# class proportions in both parts. Raise n_splits for repeated evaluation.
skf = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
n_cv = 1
for train_index, val_index in skf.split(data, label):

    data_train, data_val = data[train_index], data[val_index]
    label_train, label_val = label[train_index], label[val_index]

    # Per-class sample counts (np.unique replaces the deprecated
    # scipy.stats.itemfreq and returns the counts directly).
    freq_train = np.unique(label_train, return_counts=True)[1]
    print("train freq {}".format(freq_train))
    freq_val = np.unique(label_val, return_counts=True)[1]
    print("val freq {}".format(freq_val))

    # pass through cnn: the dense-layer activations become the SVM features
    extract1 = extract_features(data_train)
    extract2 = extract_features(data_val)
    clf = SVC(verbose=True, random_state=42)
    print("Training cv {} ...".format(n_cv))
    clf.fit(extract1, label_train)
    # mean accuracy on the held-out part
    acc = clf.score(extract2, label_val)
    print(acc)
    print("\n")
    n_cv += 1