In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import time
from scipy.stats import itemfreq
import random

import theano
import theano.tensor as T
import lasagne
from sklearn.model_selection import StratifiedKFold

# import user defined load_data to build input data
from load_data import Data

# Root directory of the project data. It must contain two sub-directories,
# `data` and `word2vec`. Set the TWITTER_SENTIMENT_DIR environment variable to
# point at your own copy; the original author's location is kept as the
# fallback so existing setups keep working unchanged.
# FILE_PATH = '/home/sam/Hhd/twitter_sentiment/'
import os
# os.path.join(path, '') guarantees exactly one trailing separator, matching the
# trailing-slash form of the original hard-coded value.
FILE_PATH = os.path.join(
    os.environ.get('TWITTER_SENTIMENT_DIR', '/home/sam/Data/twitter_sentiment/'), '')

Using cuDNN version 5110 on context None
Mapped name None to device cuda: GeForce GTX 965M (0000:01:00.0)


### Use Airline data

In [2]:
# Load the airline tweets CSV, keeping only the sentiment label and the tweet text.
airline_data = Data('Airline-Sentiment-2-w-AA.csv', FILE_PATH)
airline_df = airline_data.csv_df(['airline_sentiment', 'text']) # load data
# pre_process modifies airline_df in place (it prints a note saying so); judging
# by the frame displayed below it adds a `tokenized` column.
airline_data.pre_process(airline_df) # pre-process data
# Optional: drop the neutral class (disabled, so this stays a 3-class problem).
# airline_df = airline_data.drop_value(airline_df, 'airline_sentiment', 'neutral')
airline_df.head()

Loading csv: Airline-Sentiment-2-w-AA.csv ...
Note: pre_process changes the dataframe inplace.


Unnamed: 0,airline_sentiment,text,tokenized
0,neutral,What said,[said]
1,positive,plus youve added commercials to the experienc...,"[plus, youve, added, commercials, experience, ..."
2,neutral,I didnt today Must mean I need to take anothe...,"[didnt, today, must, mean, need, take, another..."
3,negative,its really aggressive to blast obnoxious ente...,"[really, aggressive, blast, obnoxious, enterta..."
4,negative,and its a really big bad thing about it,"[really, big, bad, thing]"


In [3]:
# Convert the categorical sentiment to an integer class id.
# The ids are 1-based here; they are shifted to 0-based further down, just
# before the arrays are handed to Theano.
class_label = {'positive': 1, 'neutral': 2, 'negative': 3}
# Two-class variant (use together with the drop-neutral step):
# class_label = {'positive': 1, 'negative': 2}
# Manual equivalent of cat2num, kept for reference:
# airline_df['class'] = airline_df['airline_sentiment'].apply(lambda x: class_label[x])
# airline_df.drop('airline_sentiment', inplace=True, axis=1)
# Judging by the frame displayed below, cat2num adds the `class` column and
# removes `airline_sentiment`.
airline_df = airline_data.cat2num(airline_df,'airline_sentiment', 'class', class_label)
airline_df.head()

class
Done converting categorical to numeric, this changes df.


Unnamed: 0,text,tokenized,class
0,What said,[said],2
1,plus youve added commercials to the experienc...,"[plus, youve, added, commercials, experience, ...",1
2,I didnt today Must mean I need to take anothe...,"[didnt, today, must, mean, need, take, another...",2
3,its really aggressive to blast obnoxious ente...,"[really, aggressive, blast, obnoxious, enterta...",3
4,and its a really big bad thing about it,"[really, big, bad, thing]",3


In [4]:
# Let's take a look at the class distribution (number of tweets per class id).
airline_df['class'].value_counts()

3    9178
2    3099
1    2363
Name: class, dtype: int64

In [5]:
# Comment this out if you do not want class balancing.
# NOTE(review): going by the counts printed before and after this cell,
# balance_class shrinks the largest class (3) to the size of the next largest;
# the classes are still not equal afterwards (3099 / 3099 / 2363).
airline_df = airline_data.balance_class(airline_df)
# and check the class counts again
airline_df['class'].value_counts()

3    3099
2    3099
1    2363
Name: class, dtype: int64

In [6]:
# Preview the frame again now that balance_class has been applied.
airline_df.head()

Unnamed: 0,text,tokenized,class
0,What said,[said],2
1,plus youve added commercials to the experienc...,"[plus, youve, added, commercials, experience, ...",1
2,I didnt today Must mean I need to take anothe...,"[didnt, today, must, mean, need, take, another...",2
3,seriously would pay a flight for seats that ...,"[seriously, would, pay, flight, seats, didnt, ...",3
4,yes nearly every time I fly VX this ear worm ...,"[yes, nearly, every, time, fly, vx, ear, worm,...",1


In [7]:
# Train or load the word-vector model.
# size=500 is the embedding dimensionality (the training log below reports
# "500 features").
model = airline_data.build_wordvec(size=500)

2017-03-18 16:39:05,538 : INFO : collecting all words and their counts
2017-03-18 16:39:05,547 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types


Training for tweets500.model.bin ...


2017-03-18 16:39:11,108 : INFO : collected 253854 word types from a corpus of 17005207 raw words and 1701 sentences
2017-03-18 16:39:11,109 : INFO : Loading a fresh vocabulary
2017-03-18 16:39:11,448 : INFO : min_count=5 retains 71290 unique words (28% of original 253854, drops 182564)
2017-03-18 16:39:11,448 : INFO : min_count=5 leaves 16718844 word corpus (98% of original 17005207, drops 286363)
2017-03-18 16:39:11,586 : INFO : deleting the raw counts dictionary of 253854 items
2017-03-18 16:39:11,612 : INFO : sample=0.001 downsamples 38 most-common words
2017-03-18 16:39:11,613 : INFO : downsampling leaves estimated 12506280 word corpus (74.8% of prior 16718844)
2017-03-18 16:39:11,614 : INFO : estimated required memory for 71290 words and 500 dimensions: 320805000 bytes
2017-03-18 16:39:11,860 : INFO : resetting layer weights
2017-03-18 16:39:12,885 : INFO : training model with 4 workers on 71290 vocabulary and 500 features, using sg=1 hs=0 sample=0.001 negative=5 window=5
2017-03-

Done building.


In [9]:
# max_len is the max length of a sentence in our data; it decides the padding.
max_len = airline_data.max_len(airline_df)
# Convert our airline data to word vectors.
# NOTE(review): the output below shows an existing npy file being loaded instead
# of recomputed. Presumably the cache is looked up by `name` only, so a stale
# 'airline500' file would be reused even if airline_df has changed (e.g. after
# toggling class balancing) -- confirm in load_data.
data = airline_data.convert2vec(airline_df, max_len, model, name='airline500')
airline_data.save_vec(data, name='airline500')

max sentence length is:  21
npy already exists, loading ...
Done loading npy file.
npy already exists.


In [10]:
# you can uncomment this to check if the wordvec makes sense
# model.wv.most_similar(positive=['woman', 'king'], negative=['man'])

In [11]:
# N = number of tweets, M = padded sentence length, D = embedding size.
# Reading M and D from the last two axes (instead of unpacking data.shape)
# keeps this cell re-runnable: after the reshape below `data` is 4D, and a
# plain 3-way unpack would fail on a second execution.
N, M, D = data.shape[0], data.shape[-2], data.shape[-1]
# A single formatted string prints the same text under Python 2 and Python 3.
print("N, M, D: {} {} {}".format(N, M, D))
# The network's InputLayer is declared as (batch, 1, M, D), so add a singleton
# channel axis and cast to Theano's configured float type.
data = data.reshape(-1, 1, M, D).astype(theano.config.floatX)
# Class ids in the frame are 1-based; shift them to 0-based. int32 is used
# because the training targets are declared as T.ivector (an int32 vector).
label = airline_df['class'].values.astype(np.int32) - 1
print(data.shape)
print(label.shape)

N, M, D: 8561 21 500
(8561, 1, 21, 500)
(8561,)


In [12]:
# ############################# Batch iterator ###############################
def iterate_minibatches(inputs, targets, batchsize, shuffle=False, drop_last=True):
    """Yield successive ``(inputs, targets)`` mini-batches.

    Parameters
    ----------
    inputs, targets : indexable sequences of equal length (e.g. numpy arrays)
        Sliced in lock-step along their first axis.
    batchsize : int
        Number of samples per batch; must be positive.
    shuffle : bool, optional
        If True, samples are visited in a fresh random order drawn from
        ``np.random`` each time the generator is created.
    drop_last : bool, optional
        If True (the default, and the historical behaviour), a trailing group
        of fewer than ``batchsize`` samples is silently skipped, so every
        yielded batch has exactly ``batchsize`` samples. If False, the
        remainder is yielded as a final, smaller batch so that no sample is
        left out.

    Yields
    ------
    tuple
        ``(inputs[batch], targets[batch])`` for each batch.

    Raises
    ------
    AssertionError
        If ``inputs`` and ``targets`` differ in length.
    ValueError
        If ``batchsize`` is not positive.
    """
    assert len(inputs) == len(targets)
    if batchsize <= 0:
        raise ValueError("batchsize must be a positive integer, got %r" % (batchsize,))

    num_samples = len(inputs)
    if shuffle:
        indices = np.arange(num_samples)
        np.random.shuffle(indices)

    # With drop_last the final start index must leave room for a full batch;
    # otherwise every start index below num_samples is valid and slicing
    # clips the last batch to the samples that remain.
    stop = num_samples - batchsize + 1 if drop_last else num_samples
    for start_idx in range(0, stop, batchsize):
        if shuffle:
            excerpt = indices[start_idx:start_idx + batchsize]
        else:
            excerpt = slice(start_idx, start_idx + batchsize)
        yield inputs[excerpt], targets[excerpt]

In [13]:
# ##################### Build the CNN neural network model #######################
def build_cnn(input_var=None, num_classes=3):
    """Build the Lasagne CNN used for tweet sentiment classification.

    Architecture: one convolution + max-pooling stage, a 1000-unit
    fully-connected hidden layer, and a softmax output layer. Both dense
    layers apply 50% dropout to their inputs.

    The input shape is ``(None, 1, M, D)``, where ``M`` and ``D`` are read from
    the notebook's module-level variables, so they must be defined before this
    function is called.

    Parameters
    ----------
    input_var : Theano tensor variable, optional
        Symbolic input passed to the ``InputLayer``.
    num_classes : int, optional
        Number of units in the softmax output layer (default 3, the value
        previously hard-coded).

    Returns
    -------
    The output layer of the network.
    """
    # Input layer: a batch of single-channel M x D matrices.
    network = lasagne.layers.InputLayer(shape=(None, 1, M, D), input_var=input_var)

    # No dropout on the input: it tends to work less well for convolutional layers.

    # Convolution stage: 40 kernels of size 3x5, ReLU, no padding, stride 1.
    # untie_biases=True gives every output position its own bias rather than
    # one shared bias per kernel.
    network = lasagne.layers.Conv2DLayer(
        network, num_filters=40, filter_size=(3, 5),
        nonlinearity=lasagne.nonlinearities.rectify,
        W=lasagne.init.GlorotUniform(), pad=0, stride=(1, 1),
        untie_biases=True)
    # Max-pooling of factor 2 in both dimensions.
    network = lasagne.layers.MaxPool2DLayer(network, pool_size=(2, 2))

    # Fully-connected hidden layer of 1000 ReLU units with 50% dropout on its inputs.
    network = lasagne.layers.DenseLayer(
        lasagne.layers.dropout(network, p=0.5), num_units=1000,
        nonlinearity=lasagne.nonlinearities.rectify)

    # Softmax output layer (one unit per class) with 50% dropout on its inputs.
    network = lasagne.layers.DenseLayer(
        lasagne.layers.dropout(network, p=0.5), num_units=num_classes,
        nonlinearity=lasagne.nonlinearities.softmax)

    return network

In [14]:
# ############################## Main program ################################
def train_cnn(X_train, y_train, X_val, y_val, X_test, num_epochs=300,
              batch_size=50, learning_rate=0.005, momentum=0.9):
    """Train the CNN from ``build_cnn`` and predict labels for the head of ``X_test``.

    Each epoch makes one shuffled pass over the training data, then one
    deterministic (dropout disabled) pass over the validation data, and prints
    the mean training loss, validation loss and validation accuracy. Means are
    taken over the batches produced by ``iterate_minibatches``, which by
    default skips a trailing partial batch.

    Parameters
    ----------
    X_train, X_val, X_test : 4D arrays
        Inputs fed to the network's ``T.tensor4`` input variable.
    y_train, y_val : integer vectors
        Class ids fed to the ``T.ivector`` target variable.
    num_epochs : int, optional
        Number of passes over the training data.
    batch_size : int, optional
        Mini-batch size for training and validation (default 50, the value
        previously hard-coded).
    learning_rate, momentum : float, optional
        Settings for the Nesterov-momentum SGD updates (defaults are the
        values previously hard-coded).

    Returns
    -------
    Predicted class ids for the first 50 rows of ``X_test`` only.

    Raises
    ------
    ValueError
        If the training or validation set holds fewer samples than one batch.
        No batch would be produced, and the per-epoch averages would otherwise
        fail later with a division by zero.
    """
    # Fail fast, before the expensive Theano compilation.
    if len(X_train) < batch_size or len(X_val) < batch_size:
        raise ValueError(
            "training and validation sets must each contain at least "
            "batch_size=%d samples (got %d and %d)"
            % (batch_size, len(X_train), len(X_val)))

    # The data is passed in by the caller; the message is kept for log continuity.
    print("Loading data...")

    # Symbolic Theano variables for inputs and targets.
    input_var = T.tensor4('inputs')
    target_var = T.ivector('targets')

    print("Building model and compiling functions...")
    network = build_cnn(input_var)

    # Training loss: mean categorical cross-entropy of the stochastic
    # (dropout-enabled) forward pass.
    prediction = lasagne.layers.get_output(network)
    loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
    loss = loss.mean()
    # Weight decay could be added here; see lasagne.regularization.

    # Parameter updates: SGD with Nesterov momentum.
    params = lasagne.layers.get_all_params(network, trainable=True)
    updates = lasagne.updates.nesterov_momentum(
        loss, params, learning_rate=learning_rate, momentum=momentum)

    # Validation/test expressions use a deterministic forward pass, which
    # disables the dropout layers.
    test_prediction = lasagne.layers.get_output(network, deterministic=True)
    test_loss = lasagne.objectives.categorical_crossentropy(test_prediction, target_var)
    test_loss = test_loss.mean()
    predict_label = T.argmax(test_prediction, axis=1)
    # Fraction of samples whose arg-max prediction equals the target.
    test_acc = T.mean(T.eq(predict_label, target_var), dtype=theano.config.floatX)

    # One training step on a mini-batch: applies the updates, returns the loss.
    train_fn = theano.function([input_var, target_var], loss, updates=updates)
    # Validation loss and accuracy on a mini-batch.
    val_fn = theano.function([input_var, target_var], [test_loss, test_acc])
    # Predicted class ids for a batch of inputs.
    test_fn = theano.function([input_var], predict_label)

    print("Starting training...")
    for epoch in range(num_epochs):
        # Full shuffled pass over the training data.
        train_err = 0
        train_batches = 0
        start_time = time.time()
        for inputs, targets in iterate_minibatches(X_train, y_train, batch_size, shuffle=True):
            train_err += train_fn(inputs, targets)
            train_batches += 1

        # Full in-order pass over the validation data.
        val_err = 0
        val_acc = 0
        val_batches = 0
        for inputs, targets in iterate_minibatches(X_val, y_val, batch_size, shuffle=False):
            err, acc = val_fn(inputs, targets)
            val_err += err
            val_acc += acc
            val_batches += 1

        # Per-epoch report (the timing covers both the training and validation pass).
        print("Epoch {} of {} took {:.3f}s".format(epoch + 1, num_epochs, time.time() - start_time))
        print("  training loss:\t\t{:.6f}".format(train_err / train_batches))
        print("  validation loss:\t\t{:.6f}".format(val_err / val_batches))
        print("  validation accuracy:\t\t{:.2f} %".format(val_acc / val_batches * 100))

    # NOTE(review): only the first 50 test rows are predicted and returned, and
    # no test error is computed. This looks like a quick sanity check rather
    # than a full evaluation; it is kept as-is so the returned shape does not
    # change for existing callers.
    test_pred = test_fn(X_test[:50])
    # Show which classes were predicted at all (a single-element set would mean
    # the network has collapsed onto one class).
    print(set(test_pred))

    # To persist the trained weights:
    #     np.savez('model.npz', *lasagne.layers.get_all_param_values(network))
    # and to restore them:
    #     with np.load('model.npz') as f:
    #         param_values = [f['arr_%d' % i] for i in range(len(f.files))]
    #     lasagne.layers.set_all_param_values(network, param_values)
    return test_pred

In [15]:
# Stratified 3-fold cross-validation: every fold keeps the class proportions of `label`.
skf = StratifiedKFold(n_splits=3)
for train_index, val_index in skf.split(data, label):
    # A single formatted string prints as plain text under both Python 2 and 3
    # (a multi-argument print(...) shows up as a tuple under Python 2).
    print("TRAIN: {} TEST: {}".format(train_index, val_index))
    data_train, data_val = data[train_index], data[val_index]
    label_train, label_val = label[train_index], label[val_index]

    # Per-class sample counts, ordered by class id. np.unique(return_counts=True)
    # replaces scipy.stats.itemfreq, which is deprecated and no longer
    # available in recent SciPy releases.
    train_counts = np.unique(label_train, return_counts=True)[1]
    print("train freq {}".format(train_counts))
    val_counts = np.unique(label_val, return_counts=True)[1]
    print("val freq {}".format(val_counts))

    # NOTE: the validation fold is also passed as the test set, so there is no
    # separate held-out test data here, and the returned predictions are discarded.
    train_cnn(data_train, label_train, data_val, label_val, data_val)

('TRAIN:', array([2842, 2844, 2845, ..., 8558, 8559, 8560]), 'TEST:', array([   0,    1,    2, ..., 2872, 2876, 2877]))
train freq [1575 2066 2066]
val freq [ 788 1033 1033]
Loading data...
Building model and compiling functions...
Starting training...
Epoch 1 of 300 took 13.053s
  training loss:		0.361240
  validation loss:		0.354349
  validation accuracy:		16.53 %
Epoch 2 of 300 took 13.013s
  training loss:		0.350905
  validation loss:		0.350523
  validation accuracy:		16.39 %
Epoch 3 of 300 took 13.051s
  training loss:		0.345920
  validation loss:		0.348370
  validation accuracy:		16.35 %
Epoch 4 of 300 took 13.019s
  training loss:		0.340658
  validation loss:		0.343858
  validation accuracy:		16.63 %
Epoch 5 of 300 took 13.119s
  training loss:		0.333680
  validation loss:		0.337235
  validation accuracy:		17.16 %


KeyboardInterrupt: 