In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import time
from scipy.stats import itemfreq

import theano
import theano.tensor as T
import lasagne
from sklearn.model_selection import StratifiedKFold

# import user defined load_data to build input data
from load_data import Data

# Root directory of the twitter_sentiment project on this machine; it is handed
# to Data(...) when the airline CSV is loaded further down.
# Alternative location, kept for reference:
# FILE_PATH = '/home/sam/Hhd/twitter_sentiment/'
FILE_PATH = '/home/sam/Data/twitter_sentiment/'

Using cuDNN version 5110 on context None
Mapped name None to device cuda: GeForce GTX 965M (0000:01:00.0)


In [2]:
# List the files available in the project's data directory.
ls '/home/sam/Data/twitter_sentiment/data'

Airline-Sentiment-2-w-AA.csv  primary-plutchik-wheel-DFE.csv  text8
nba.json                      questions-words.txt             text_emotion.csv
negative.json                 sentiment-words-DFE-785960.csv  trump.json
positive.json                 stream.json


### Use Airline data

In [3]:
# Load the airline tweets through the project's Data helper (from load_data).
airline_data = Data('Airline-Sentiment-2-w-AA.csv', FILE_PATH)
# Presumably restricts the frame to the listed columns (label + raw tweet text)
# -- confirm against Data.csv_df.
airline_df = airline_data.csv_df(['airline_sentiment', 'text']) # load data
# NOTE: pre_process mutates airline_df in place (it prints a reminder saying so);
# the frame displayed afterwards carries an extra `tokenized` column.
airline_data.pre_process(airline_df) # pre-process data
airline_df.head()

Loading csv: Airline-Sentiment-2-w-AA.csv ...
Note: pre_process changes the dataframe inplace.


Unnamed: 0,airline_sentiment,text,tokenized
0,neutral,What said,[said]
1,positive,plus youve added commercials to the experienc...,"[plus, youve, added, commercials, experience, ..."
2,neutral,I didnt today Must mean I need to take anothe...,"[didnt, today, must, mean, need, take, another..."
3,negative,its really aggressive to blast obnoxious ente...,"[really, aggressive, blast, obnoxious, enterta..."
4,negative,and its a really big bad thing about it,"[really, big, bad, thing]"


In [4]:
# Encode the three sentiment strings as integer class ids (1-based at this point).
class_label = {
    'positive': 1,
    'neutral': 2,
    'negative': 3,
}
# Dictionary lookup per row: an unexpected sentiment string raises KeyError.
airline_df['class'] = airline_df['airline_sentiment'].apply(class_label.__getitem__)
# The string column is redundant now; remove it from the frame in place.
del airline_df['airline_sentiment']
airline_df.head()

Unnamed: 0,text,tokenized,class
0,What said,[said],2
1,plus youve added commercials to the experienc...,"[plus, youve, added, commercials, experience, ...",1
2,I didnt today Must mean I need to take anothe...,"[didnt, today, must, mean, need, take, another...",2
3,its really aggressive to blast obnoxious ente...,"[really, aggressive, blast, obnoxious, enterta...",3
4,and its a really big bad thing about it,"[really, big, bad, thing]",3


In [5]:
# Word-vector model requested with size=800 (the log shows an existing
# tweets.model.bin being reused rather than retrained).
model = airline_data.build_wordvec(size=800)
# Longest tokenized tweet (the helper prints it); passed on as the common
# sequence length for the vectorised tweets.
max_len = airline_data.max_len(airline_df)
# Turn every tweet into vectors; the result is unpacked as a 3-D (N, M, D)
# array in a later cell. The log shows a cached npy file being loaded if present.
# NOTE(review): one of these helper calls also adds a `size` column to
# airline_df (visible when the frame is displayed next) -- confirm which one.
data = airline_data.convert2vec(airline_df, max_len, model, name='airline')
airline_data.save_vec(data, name='airline')

Loading existing model tweets.model.bin ...
Done building.
max sentence length is:  21
npy already exists, loading ...
Done loading npy file.
npy already exists.


In [6]:
# Re-display the frame: compared with the earlier display it now has a `size` column.
airline_df.head()

Unnamed: 0,text,tokenized,class,size
0,What said,[said],2,1
1,plus youve added commercials to the experienc...,"[plus, youve, added, commercials, experience, ...",1,6
2,I didnt today Must mean I need to take anothe...,"[didnt, today, must, mean, need, take, another...",2,8
3,its really aggressive to blast obnoxious ente...,"[really, aggressive, blast, obnoxious, enterta...",3,9
4,and its a really big bad thing about it,"[really, big, bad, thing]",3,4


In [7]:
# Shape the embedded tweets and the labels the way the Lasagne network expects.
#
# Read the dimensions so that this cell can be re-run: `data` is rebound to a
# 4-D array a few lines down, and unpacking `data.shape` into three names would
# fail the second time. Indexing from both ends works for (N, M, D) and for
# (N, 1, M, D) alike.
N = data.shape[0]
M, D = data.shape[-2:]
print("N, M, D: {} {} {}".format(N, M, D))

# The 2-D convolution layers take (batch, channels, rows, columns) input, so add
# a single channel axis and cast to Theano's configured float type.
data = data.reshape(-1, 1, M, D).astype(theano.config.floatX)

# Class ids are stored as 1..3 in the frame; the integer-target cross-entropy
# indexes the softmax output, so the ids must be shifted to start at 0.
label = np.int8(airline_df['class']) - 1

# Every sample needs exactly one label before the arrays are split into folds.
assert data.shape[0] == label.shape[0]
print(data.shape)
print(label.shape)

N, M, D: 14640 21 800
(14640, 1, 21, 800)
(14640,)


In [8]:
# ############################# Batch iterator ###############################
def iterate_minibatches(inputs, targets, batchsize, shuffle=False):
    """Yield ``(inputs, targets)`` mini-batches of exactly ``batchsize`` rows.

    Both sequences must have the same length. Only full batches are produced:
    if the length is not a multiple of ``batchsize`` the trailing remainder is
    skipped, and nothing is yielded when there are fewer than ``batchsize``
    rows. With ``shuffle=True`` the rows are visited in a random order drawn
    from NumPy's global random state; otherwise contiguous slices are used.
    """
    assert len(inputs) == len(targets)
    n_samples = len(inputs)

    order = None
    if shuffle:
        order = np.arange(n_samples)
        np.random.shuffle(order)

    # Last admissible start position is the one that still leaves a full batch.
    stop = n_samples - batchsize + 1
    for begin in range(0, stop, batchsize):
        end = begin + batchsize
        picker = order[begin:end] if shuffle else slice(begin, end)
        yield inputs[picker], targets[picker]

In [9]:
# ##################### Build the CNN neural network model #######################
def build_cnn(input_var=None):
    """Assemble the Lasagne CNN used for 3-class tweet sentiment.

    Layer stack, in order:
      * input of shape (batch, 1, M, D), where M and D are module-level globals
      * 3x3 convolution, 50 filters, ReLU, untied biases, no padding, stride 1
      * 2x2 max-pooling
      * a second 3x3 convolution with the same settings (default weight init)
      * 2x2 max-pooling
      * 50% dropout followed by a 2000-unit ReLU dense layer
      * 50% dropout followed by the 3-unit softmax output layer

    Input dropout is deliberately not applied in front of the convolutions.

    :param input_var: optional Theano variable to bind to the input layer.
    :return: the output layer; the whole network is reachable from it.
    """
    layers = lasagne.layers
    relu = lasagne.nonlinearities.rectify

    net = layers.InputLayer(shape=(None, 1, M, D), input_var=input_var)

    # First convolution + pooling stage (explicit Glorot-uniform weights).
    net = layers.Conv2DLayer(
        net,
        num_filters=50,
        filter_size=(3, 3),
        nonlinearity=relu,
        W=lasagne.init.GlorotUniform(),
        pad=0,
        stride=(1, 1),
        untie_biases=True,
    )
    net = layers.MaxPool2DLayer(net, pool_size=(2, 2))

    # Second convolution + pooling stage.
    net = layers.Conv2DLayer(
        net,
        num_filters=50,
        filter_size=(3, 3),
        nonlinearity=relu,
        pad=0,
        stride=(1, 1),
        untie_biases=True,
    )
    net = layers.MaxPool2DLayer(net, pool_size=(2, 2))

    # Fully connected hidden layer, with dropout applied to its inputs.
    hidden_in = layers.dropout(net, p=0.5)
    net = layers.DenseLayer(hidden_in, num_units=2000, nonlinearity=relu)

    # Softmax over the three sentiment classes, again with dropout on its inputs.
    output_in = layers.dropout(net, p=0.5)
    return layers.DenseLayer(output_in, num_units=3,
                             nonlinearity=lasagne.nonlinearities.softmax)

In [10]:
# ############################## Main program ################################
def train_cnn(X_train, y_train, X_val, y_val, X_test, num_epochs=20,
              batch_size=50, learning_rate=0.1, momentum=0.9):
    """Train the CNN from ``build_cnn`` and report per-epoch validation metrics.

    The network is trained with mini-batch SGD using Nesterov momentum on the
    mean categorical cross-entropy. After every epoch the validation set is
    evaluated with a deterministic forward pass (dropout disabled) and the
    mean training loss, validation loss and validation accuracy are printed.
    Batches come from ``iterate_minibatches``, so only full batches are used.

    :param X_train: training inputs, a 4-D array fed to a ``T.tensor4`` input.
    :param y_train: training targets, 0-based integer class ids.
    :param X_val: validation inputs, same layout as ``X_train``.
    :param y_val: validation targets.
    :param X_test: inputs to predict after training; only the first 50 rows
        are used (see the note at the end of the function).
    :param num_epochs: number of full passes over the training data.
    :param batch_size: rows per mini-batch for training and validation.
    :param learning_rate: SGD step size.
    :param momentum: Nesterov momentum coefficient.
    :return: predicted class ids for ``X_test[:50]``.
    :raises ValueError: if ``batch_size`` is not positive, or if the training
        or validation set is smaller than one batch (no batch would be
        produced and the per-batch averages would be undefined).
    """
    # Fail fast, before the expensive Theano compilation, instead of hitting a
    # division by zero when the epoch averages are printed.
    if batch_size <= 0:
        raise ValueError("batch_size must be positive, got {}".format(batch_size))
    if len(X_train) < batch_size or len(X_val) < batch_size:
        raise ValueError(
            "training and validation sets must each contain at least one full "
            "batch of {} samples (got {} and {})".format(
                batch_size, len(X_train), len(X_val)))

    print("Loading data...")
    # Symbolic inputs: 4-D float batch and int32 vector of class ids.
    input_var = T.tensor4('inputs')
    target_var = T.ivector('targets')

    print("Building model and compiling functions...")
    network = build_cnn(input_var)

    # Training objective: stochastic forward pass (dropout active).
    prediction = lasagne.layers.get_output(network)
    loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
    loss = loss.mean()

    params = lasagne.layers.get_all_params(network, trainable=True)
    updates = lasagne.updates.nesterov_momentum(
        loss, params, learning_rate=learning_rate, momentum=momentum)

    # Evaluation expressions: deterministic forward pass, i.e. dropout disabled.
    test_prediction = lasagne.layers.get_output(network, deterministic=True)
    test_loss = lasagne.objectives.categorical_crossentropy(test_prediction, target_var)
    test_loss = test_loss.mean()
    predict_label = T.argmax(test_prediction, axis=1)
    test_acc = T.mean(T.eq(predict_label, target_var), dtype=theano.config.floatX)

    # One training step (applies the updates) and two evaluation functions.
    train_fn = theano.function([input_var, target_var], loss, updates=updates)
    val_fn = theano.function([input_var, target_var], [test_loss, test_acc])
    test_fn = theano.function([input_var], predict_label)

    print("Starting training...")
    for epoch in range(num_epochs):
        # Full pass over the training data, reshuffled every epoch.
        train_err = 0
        train_batches = 0
        start_time = time.time()
        for inputs, targets in iterate_minibatches(X_train, y_train, batch_size, shuffle=True):
            train_err += train_fn(inputs, targets)
            train_batches += 1

        # Full pass over the validation data, in order.
        val_err = 0
        val_acc = 0
        val_batches = 0
        for inputs, targets in iterate_minibatches(X_val, y_val, batch_size, shuffle=False):
            err, acc = val_fn(inputs, targets)
            val_err += err
            val_acc += acc
            val_batches += 1

        # The reported time covers both the training and the validation pass.
        print("Epoch {} of {} took {:.3f}s".format(epoch + 1, num_epochs, time.time() - start_time))
        print("  training loss:\t\t{:.6f}".format(train_err / train_batches))
        print("  validation loss:\t\t{:.6f}".format(val_err / val_batches))
        print("  validation accuracy:\t\t{:.2f} %".format(val_acc / val_batches * 100))

    # NOTE: this is a sanity check rather than a test-set evaluation. Only the
    # first 50 rows of X_test are predicted, and the set of distinct predicted
    # classes is printed to show which classes the network actually emits.
    test_pred = test_fn(X_test[:50, :, :])
    print(set(test_pred))

    # The weights could be saved at this point with
    #   np.savez('model.npz', *lasagne.layers.get_all_param_values(network))
    # and restored later via lasagne.layers.set_all_param_values.
    return test_pred

In [11]:
# Stratified 3-fold cross-validation: every fold keeps the class proportions of
# the full label vector, and a fresh network is trained for each split.
skf = StratifiedKFold(n_splits=3)
for train_index, val_index in skf.split(data, label):
    # Build one string so that Python 2 prints text instead of a tuple repr.
    print("TRAIN: {} TEST: {}".format(train_index, val_index))
    data_train, data_val = data[train_index], data[val_index]
    label_train, label_val = label[train_index], label[val_index]

    # Per-class sample counts, ordered by ascending class id. np.unique is used
    # instead of scipy.stats.itemfreq, which is deprecated and no longer
    # available in newer SciPy releases.
    freq_train = np.unique(label_train, return_counts=True)[1]
    print("train freq {}".format(freq_train))
    freq_val = np.unique(label_val, return_counts=True)[1]
    print("val freq {}".format(freq_val))

    # NOTE: the validation fold is also passed as X_test, so the predictions
    # returned by train_cnn are not an independent test result.
    train_cnn(data_train, label_train, data_val, label_val, data_val)

('TRAIN:', array([ 4840,  4842,  4843, ..., 14637, 14638, 14639]), 'TEST:', array([   0,    1,    2, ..., 4899, 4900, 4902]))
train freq [1575 2066 6118]
val freq [ 788 1033 3060]
Loading data...
Building model and compiling functions...
Starting training...
Epoch 1 of 20 took 28.982s
  training loss:		0.278249
  validation loss:		0.255908
  validation accuracy:		23.03 %
Epoch 2 of 20 took 28.997s
  training loss:		0.241048
  validation loss:		0.240254
  validation accuracy:		24.23 %
Epoch 3 of 20 took 28.902s
  training loss:		0.231170
  validation loss:		0.231298
  validation accuracy:		24.58 %
Epoch 4 of 20 took 28.883s
  training loss:		0.222096
  validation loss:		0.245425
  validation accuracy:		23.84 %
Epoch 5 of 20 took 28.876s
  training loss:		0.212696
  validation loss:		0.241499
  validation accuracy:		23.90 %
Epoch 6 of 20 took 28.920s
  training loss:		0.205273
  validation loss:		0.243483
  validation accuracy:		24.54 %


KeyboardInterrupt: 