In [1]:
from IPython.display import HTML

# Markup injected into the rendered notebook: a small jQuery script that
# hides/shows every input cell (div.input), plus a button wired to it.
# The script runs once on document-ready, so code cells start out hidden.
toggle_markup = '''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>'''

# Last expression of the cell, so the notebook renders it as rich HTML output.
HTML(toggle_markup)

In [2]:
# This notebook makes predictions on incoming data and performs basic analysis.
import os

import lasagne
import nltk
import pandas as pd
import theano
import theano.tensor as T

from load_data import Data
from utils import *

# Directory passed to the project helpers (Data, load_network) to locate the
# input csv and the saved network parameters.
# Set the TWITTER_SENTIMENT_DIR environment variable to point the notebook at
# another location without editing this cell, e.g.
#   TWITTER_SENTIMENT_DIR=/home/sam/Data/twitter_sentiment/
# os.path.join(..., '') guarantees a trailing separator, in case a helper
# builds file names by plain string concatenation.
FILE_PATH = os.path.join(
    os.environ.get('TWITTER_SENTIMENT_DIR', '/home/sam/Hhd/twitter_sentiment/'),
    '')

Using cuDNN version 5105 on context None
Mapped name None to device cuda: GeForce GTX 1080 (0000:01:00.0)


In [3]:
# Name of the csv file with the tweets to score; it is handed to the Data
# helper together with FILE_PATH.
file_name = "air.csv"

# Data is a project helper imported from load_data. csv_df(['text']) loads the
# csv into a DataFrame (it logs "Loading csv: ...").
# NOTE(review): presumably ['text'] selects the columns to keep — confirm in load_data.
test_data = Data(file_name, FILE_PATH)
test_df = test_data.csv_df(['text'])
# make a copy of the original tweets for later use
# (test_df is modified by later cells; original_df keeps the raw text)
original_df = test_df.copy()
# original tweets
test_df.head()

Loading csv: air.csv ...


Unnamed: 0,text
0,RT @dimitrivegas: For more info... please visi...
1,@AmericanAir I don't think you do!! 90% of the...
2,@AmericanAir they said it was due to size and ...
3,"Amazon, SeaWorld, @AmericanAir Downgraded - @S..."
4,"Julie, @AmericanAir manager in Bozeman, MT, th..."


In [4]:
# Pre-process the tweets the same way as was done for training.
# pre_process modifies test_df in place (the helper prints a note saying so);
# afterwards the frame carries an extra 'tokenized' column, shown by head().
test_data.pre_process(test_df) 
test_df.head()

Note: pre-process changes the dataframe inplace.


Unnamed: 0,text,tokenized
0,RT For more info please visit this shortcut l...,"[info, please, visit, shortcut, eduaubdedubu]"
1,I dont think you do of the flights Ive had w...,"[dont, think, flights, ive, bags, delayed, fli..."
2,they said it was due to size and I have never...,"[said, due, size, never, problem, fitting, ame..."
3,Amazon SeaWorld Downgraded blog,"[amazon, seaworld, downgraded, blog]"
4,Julie manager in Bozeman MT thanks much for t...,"[manager, mt, thanks, much, upgrade, yesterday..."


In [5]:
# Convert the tokenized tweets to word vectors with word2vec.
# NOTE(review): size=600 looks like the word-vector dimensionality (the
# prediction cell reports D = 600) — confirm in load_data.
model = test_data.build_wordvec(size=600, verbose=False)
# Report the longest tokenized tweet in the test set. This is informational
# only: max_len_test is not used below, because the network input must have
# the sequence length that was used for training.
max_len_test = test_data.max_len(test_df)
# NOTE(review): presumably the max sentence length of the training set — confirm.
max_len_train = 19
# Build the vector array for the test tweets using the training length, then
# persist it under the name 'test'.
data = test_data.convert2vec(test_df, max_len_train, model, name='test')
test_data.save_vec(data, name='test')

Loading existing model tweets600.model.bin ...
Done building.
max sentence length is:  18
npy already exists, loading ...
Done loading npy file.
npy already exists.


### Load the trained model and make predictions

In [6]:
def cnn(M, D, input_var=None):
    """Assemble the convolutional classifier and return its output layer.

    Layer stack, in order:
      input (None, 1, M, D)
      -> conv 3x3, 40 filters, ReLU, Glorot-uniform weights, untied biases
      -> max-pool 2x2
      -> conv 3x3, 50 filters, ReLU, untied biases
      -> max-pool 2x2
      -> dropout 0.5 -> dense 6000, ReLU
      -> dropout 0.5 -> dense 3, softmax

    Parameters
    ----------
    M, D : int
        Height and width of the single-channel input. In this notebook the
        caller passes the last two dimensions of the word-vector array.
    input_var : theano variable, optional
        Symbolic variable bound to the input layer.

    Returns
    -------
    The final (softmax) Lasagne layer; earlier layers are reachable through it.

    The stack has to stay in step with the saved parameters that
    make_prediction loads into it, so sizes and order are not tunable here.
    """
    layers = lasagne.layers
    relu = lasagne.nonlinearities.rectify

    # Batch dimension is left open (None); one input channel.
    net = layers.InputLayer(shape=(None, 1, M, D), input_var=input_var)

    # First conv/pool stage.
    net = layers.Conv2DLayer(
        net,
        num_filters=40,
        filter_size=(3, 3),
        nonlinearity=relu,
        W=lasagne.init.GlorotUniform(),
        pad=0,
        stride=(1, 1),
        untie_biases=True,
    )
    net = layers.MaxPool2DLayer(net, pool_size=(2, 2))

    # Second conv/pool stage (default weight initialiser).
    net = layers.Conv2DLayer(
        net,
        num_filters=50,
        filter_size=(3, 3),
        nonlinearity=relu,
        pad=0,
        stride=(1, 1),
        untie_biases=True,
    )
    net = layers.MaxPool2DLayer(net, pool_size=(2, 2))

    # Fully connected head; each dense layer sees a 50%-dropout view of its input.
    hidden = layers.DenseLayer(
        layers.dropout(net, p=0.5),
        num_units=6000,
        nonlinearity=relu,
    )
    return layers.DenseLayer(
        layers.dropout(hidden, p=0.5),
        num_units=3,
        nonlinearity=lasagne.nonlinearities.softmax,
    )

def make_prediction(data, model_file="cnn.npz"):
    """Predict a class label for every sample in ``data`` with the saved CNN.

    Parameters
    ----------
    data : array of shape (N, M, D)
        N samples, each an M x D matrix (in this notebook: tweets converted
        to word vectors).
    model_file : str, optional
        Name of the saved-parameter file, passed to ``load_network`` together
        with FILE_PATH. Defaults to "cnn.npz".

    Returns
    -------
    1-D integer array of length N holding labels 1, 2 or 3: the argmax over
    the three softmax outputs, shifted by one.

    Raises
    ------
    ValueError
        If ``data`` is not 3-dimensional.
    """
    if len(data.shape) != 3:
        raise ValueError(
            "data must have shape (N, M, D), got %r" % (tuple(data.shape),))

    N, M, D = data.shape
    # Single formatted string: prints the same text on Python 2 and Python 3.
    print("N, M, D: %s %s %s" % (N, M, D))

    # The input layer built by cnn() is (batch, 1, M, D), so add the channel
    # axis and cast to theano's configured float type.
    data = data.reshape(-1, 1, M, D).astype(theano.config.floatX)

    input_var = T.tensor4('inputs')
    network = cnn(M, D, input_var)

    # Load the trained parameters into the freshly built network.
    saved_params = load_network(FILE_PATH, model_file)
    lasagne.layers.set_all_param_values(network, saved_params)

    # deterministic=True disables dropout for inference.
    test_prediction = lasagne.layers.get_output(network, deterministic=True)
    predict_label = T.argmax(test_prediction, axis=1)
    test_fn = theano.function([input_var], predict_label)

    # argmax is 0-based; labels used by the rest of the notebook start at 1.
    test_pred = test_fn(data) + 1

    return test_pred

In [7]:
# get predictions
# (make_prediction returns one integer label per row of `data`: argmax + 1)
test_predictions = make_prediction(data)

N, M, D: 1000 19 600


### Take a look at the predictions with raw tweets

In [8]:
# Attach the numeric predictions to the pre-processed frame and look at how
# many tweets fall into each class.
test_df['prediction'] = test_predictions
print(test_df['prediction'].value_counts())

# write to original dataframe
original_df['prediction'] = test_predictions
# convert numeric prediction to categorical
class_label = {1: 'positive', 2: 'neutral', 3: 'negative'}
original_df = test_data.num2cat(original_df, 'prediction', class_label)
# Take a quick look at the first tweets with their predicted sentiment.
# Slicing (instead of indexing range(10)) also works when the frame holds
# fewer than 10 rows.
for row in original_df.values[:10]:
    print(row)

3    643
2    224
1    133
Name: prediction, dtype: int64
prediction
Done converting categorical to numeric, this changes df.
[ 'RT @dimitrivegas: For more info... please visit this shortcut link! <ed><U+00A0><U+00BD><ed><U+00B8><U+0085> @AmericanAir https://t.co/M8whrVEKMc'
 'neutral']
[ "@AmericanAir I don't think you do!! 90% of the flights I've had with you my bags delayed, flights cancelled, bookings cancelled etc."
 'negative']
[ '@AmericanAir they said it was due to size and I have never once had a problem fitting it on American flights. New airline here I come'
 'negative']
[ 'Amazon, SeaWorld, @AmericanAir Downgraded - @Schaeffers (blog) : https://t.co/g0QTn38IH4'
 'neutral']
[ 'Julie, @AmericanAir manager in Bozeman, MT, thanks much for the upgrade after yesterday. Glad to have met you and wish you the best!'
 'positive']
[ 'Is San Francisco America\x92s New Art Capital? In Celebrated Living for @AmericanAir #WeAreTravelMedia\x85 https://t.co/RrulHzMw03'
 'negative']
[ "@Amer

In [9]:
# Save the raw tweets with their predicted sentiment to csv.
# The path is relative, so the file lands in the current working directory.
original_df.to_csv("airline_predicted.csv")

### Most frequent words for each sentiment

In [10]:
# Most frequent words among the tweets predicted for each sentiment label.
# Positive and Negative ask for the top 20; Neutral falls back on whatever
# default `top` most_freq defines.
for sentiment_name, label, extra in (
        ("Positive", 1, {'top': 20}),
        ("Neutral", 2, {}),
        ("Negative", 3, {'top': 20})):
    print(sentiment_name)
    print(most_freq(test_df, label, **extra))

Positive
[('great', 49), ('thanks', 38), ('helps', 18), ('thank', 18), ('sharing', 11), ('everyone', 11), ('news', 11), ('experience', 10), ('fly', 10), ('flight', 10), ('big', 10), ('guests', 9), ('smoothly', 9), ('cognos', 9), ('analytics', 9), ('run', 9), ('trip', 9), ('busi', 8), ('amazing', 8), ('making', 7)]
Neutral
[('visit', 53), ('please', 50), ('eduaubdedubu', 48), ('shortcut', 48), ('info', 48), ('w', 17), ('watch', 14), ('award', 11), ('time', 10), ('great', 9)]
Negative
[('flight', 157), ('time', 52), ('get', 50), ('service', 39), ('im', 37), ('us', 37), ('hours', 34), ('delayed', 31), ('plane', 30), ('flights', 29), ('thanks', 28), ('dont', 28), ('please', 28), ('one', 25), ('customer', 25), ('home', 25), ('cant', 24), ('worst', 24), ('help', 24), ('gate', 23)]


### Look up the context of the most frequent words, e.g. *gate*

In [11]:
# Show the tweets in which a frequent word is mentioned, e.g. look up 'gate'
# among the tweets predicted as negative (label 3).
look_up(original_df, test_df, 'gate', 3)

Rebecca: @Delta gave us a coloring book at the gate and the TSA agents gave them stickers after we went through security! @AmericanAir
@AmericanAir If you simcerly want that, please train your gate agents appropriately.
Another travel favorite: land 30 min early. Wait on tarmac for 40 for gate. Then bags not out 40 minutes later. @AmericanAir @PHXSkyHarbor
@Delta : delay @JFK to ATL due to technical issues. Sat inside plane (no drinks) at gate for 2.5 hrs and after night long stay in airport
@DELTA gate agents at B18 must be having a bad day.  Disappointing service.
Rebecca: @Delta gave us a coloring book at the gate and the TSA agents gave them stickers after we went through security! @AmericanAir
Now to hurry up and wait... at least @delta usually gets out of here on time #travel (@ Gate 60 in Kansas City, MO) https://t.co/N3K4mseHNm
@Delta get your act together and staff your damn gate desk. Loved ones want to go home! #LAX
I'd rather walk home than deal with another high strung, cr