In [1]:
import os
import random
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import itemfreq
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC

# import user defined load_data to build input data
from load_data import Data

%matplotlib inline

# Root directory of the project data.
# Override it per machine by exporting TWITTER_SENTIMENT_DIR instead of editing
# the notebook; when the variable is unset the original location is used.
# (Alternative location used previously: '/home/sam/Data/twitter_sentiment/')
# os.path.join(..., '') keeps the trailing separator the original literal had,
# in case the value is concatenated with file names downstream.
FILE_PATH = os.path.join(
    os.environ.get('TWITTER_SENTIMENT_DIR', '/home/sam/Hhd/twitter_sentiment/'),
    '')

In [2]:
# List the data directory to check which input files are available
# (shell-style `ls` line run by IPython, not Python syntax).
ls '/home/sam/Hhd/twitter_sentiment/data'

Airline-Sentiment-2-w-AA.csv  negative.json  questions-words.txt  text8
nba.json                      positive.json  stream.json          trump.json


In [3]:
# ls '/home/sam/Data/twitter_sentiment/data'

### Use Airline data

In [4]:
# Load the airline CSV through the project's Data helper, requesting the
# sentiment label and tweet text columns.
airline_data = Data('Airline-Sentiment-2-w-AA.csv', FILE_PATH)
airline_df = airline_data.csv_df(['airline_sentiment', 'text']) # load data
# pre_process mutates airline_df in place (the helper prints a note saying so);
# the preview below shows a `tokenized` column after this call.
airline_data.pre_process(airline_df) # pre-process data
airline_df.head()

Loading csv: Airline-Sentiment-2-w-AA.csv ...
Note: pre_process changes the dataframe inplace.


Unnamed: 0,airline_sentiment,text,tokenized
0,neutral,What said,[said]
1,positive,plus youve added commercials to the experienc...,"[plus, youve, added, commercials, experience, ..."
2,neutral,I didnt today Must mean I need to take anothe...,"[didnt, today, must, mean, need, take, another..."
3,negative,its really aggressive to blast obnoxious ente...,"[really, aggressive, blast, obnoxious, enterta..."
4,negative,and its a really big bad thing about it,"[really, big, bad, thing]"


In [5]:
# Map each sentiment string to its integer class id, then remove the original
# string column from the frame (in place).
class_label = {'positive': 1, 'neutral': 2, 'negative': 3}
airline_df['class'] = airline_df['airline_sentiment'].apply(class_label.__getitem__)
del airline_df['airline_sentiment']
airline_df.head()

Unnamed: 0,text,tokenized,class
0,What said,[said],2
1,plus youve added commercials to the experienc...,"[plus, youve, added, commercials, experience, ...",1
2,I didnt today Must mean I need to take anothe...,"[didnt, today, must, mean, need, take, another...",2
3,its really aggressive to blast obnoxious ente...,"[really, aggressive, blast, obnoxious, enterta...",3
4,and its a really big bad thing about it,"[really, big, bad, thing]",3


In [6]:
# take a look of the class
class_counts = airline_df['class'].value_counts()
print(class_counts)
# Randomly drop surplus class-3 (negative) rows so that class 3 is on par with
# class 2 (neutral).
# Counts are looked up by class label: value_counts() orders by frequency, so
# positional access (values[0], values[1]) only maps to classes 3 and 2 for
# this particular class distribution.
n3 = class_counts.loc[3]
n2 = class_counts.loc[2]
# Clamp at zero so re-running the cell on already balanced data is a no-op.
n_surplus = max(int(n3 - n2), 0)
# Seeded sampling without replacement keeps the balanced subset, and every
# result derived from it, reproducible across kernel restarts.
surplus_index = (airline_df[airline_df['class'] == 3]
                 .sample(n=n_surplus, random_state=42)
                 .index)
# Dropped in place, as before, then rebound to a frame with a fresh 0..N-1 index.
airline_df.drop(surplus_index, axis=0, inplace=True)
airline_df = airline_df.reset_index(drop=True)

3    9178
2    3099
1    2363
Name: class, dtype: int64


In [7]:
# Re-count the classes to check the effect of the balancing step.
airline_df['class'].value_counts()

3    3099
2    3099
1    2363
Name: class, dtype: int64

In [8]:
# Preview the first rows of the frame as it stands now.
airline_df.head()

Unnamed: 0,text,tokenized,class
0,What said,[said],2
1,plus youve added commercials to the experienc...,"[plus, youve, added, commercials, experience, ...",1
2,I didnt today Must mean I need to take anothe...,"[didnt, today, must, mean, need, take, another...",2
3,and its a really big bad thing about it,"[really, big, bad, thing]",3
4,yes nearly every time I fly VX this ear worm ...,"[yes, nearly, every, time, fly, vx, ear, worm,...",1


In [9]:
# train or load the model
# size=800 presumably sets the word-vector dimensionality: the captured
# training log reports "800 features" and the file name tweets800.model.bin.
model = airline_data.build_wordvec(size=800)

2017-03-16 22:06:03,162 : INFO : collecting all words and their counts
2017-03-16 22:06:03,163 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types


Training for tweets800.model.bin ...


2017-03-16 22:06:06,649 : INFO : collected 253854 word types from a corpus of 17005207 raw words and 1701 sentences
2017-03-16 22:06:06,650 : INFO : Loading a fresh vocabulary
2017-03-16 22:06:06,883 : INFO : min_count=5 retains 71290 unique words (28% of original 253854, drops 182564)
2017-03-16 22:06:06,884 : INFO : min_count=5 leaves 16718844 word corpus (98% of original 17005207, drops 286363)
2017-03-16 22:06:06,996 : INFO : deleting the raw counts dictionary of 253854 items
2017-03-16 22:06:07,020 : INFO : sample=0.001 downsamples 38 most-common words
2017-03-16 22:06:07,020 : INFO : downsampling leaves estimated 12506280 word corpus (74.8% of prior 16718844)
2017-03-16 22:06:07,021 : INFO : estimated required memory for 71290 words and 800 dimensions: 491901000 bytes
2017-03-16 22:06:07,172 : INFO : resetting layer weights
2017-03-16 22:06:08,046 : INFO : training model with 4 workers on 71290 vocabulary and 800 features, using sg=1 hs=0 sample=0.001 negative=5 window=5
2017-03-

Done building.


In [11]:
# Convert the tweets to vectors and save the result, using one shared name for
# both the conversion and the saved artifact.
vec_name = 'airline800'
max_len = airline_data.max_len(airline_df)
data = airline_data.convert2vec(airline_df, max_len, model, name=vec_name)
airline_data.save_vec(data, name=vec_name)

max sentence length is:  21
Total 56 not in vocab.
Done converting tweets to vec!
Saved airline800 to disk.


In [12]:
# model.wv.most_similar(positive=['woman', 'king'], negative=['man'])

In [13]:
# Flatten each sample's (M, D) block into a single feature row of length M*D.
# The rank check makes the cell safe to re-run: `data` is rebound to the 2-D
# result, so unpacking three dimensions a second time would raise ValueError.
if len(data.shape) == 3:
    N, M, D = data.shape
    print("N, M, D: {} {} {}".format(N, M, D))
    data = data.reshape(N, M * D)
label = airline_df['class']
# Author's note kept from the original cell: a Theano model would additionally
# need floatX features and zero-based int8 labels (np.int8(label) - 1).
print(data.shape)
print(label.shape)

N, M, D: 8561 21 800
(8561, 16800)
(8561,)


In [None]:
# stratified k-fold cross-validation
# Labels are converted to a NumPy array so the fold indices produced by split()
# are applied positionally. Indexing the pandas Series with them is label-based
# and is only correct while the Series index is exactly 0..N-1.
label_values = np.asarray(label)
skf = StratifiedKFold(n_splits=3)
for n_cv, (train_index, val_index) in enumerate(skf.split(data, label_values), 1):
    # Single formatted string: print("a", b) shows a tuple under Python 2.
    # The held-out fold is labelled VAL to match the variable names below.
    print("TRAIN: {} VAL: {}".format(train_index, val_index))
    data_train, data_val = data[train_index], data[val_index]
    label_train, label_val = label_values[train_index], label_values[val_index]

    # Per-class sample counts (ordered by ascending class id) to confirm the
    # folds are stratified. np.unique(..., return_counts=True) replaces
    # scipy.stats.itemfreq, which later SciPy releases removed.
    freq_train = np.unique(label_train, return_counts=True)[1]
    print("train freq {}".format(freq_train))
    freq_val = np.unique(label_val, return_counts=True)[1]
    print("val freq {}".format(freq_val))

    print("Training cv {} ...".format(n_cv))
    # NOTE(review): the captured runs show ~0.50 accuracy on both train and
    # validation with default SVC settings -- worth checking feature scaling
    # and kernel choice.
    clf = SVC(random_state=42)
    clf.fit(data_train, label_train)
    train_acc = clf.score(data_train, label_train)
    val_acc = clf.score(data_val, label_val)
    print("Train accuracy is: {}".format(train_acc))
    print("Validation accuracy is: {}".format(val_acc))

('TRAIN:', array([2816, 2818, 2819, ..., 8558, 8559, 8560]), 'TEST:', array([   0,    1,    2, ..., 2939, 2940, 2941]))
train freq [1575 2066 2066]
val freq [ 788 1033 1033]
Training cv 1 ...
Train accuracy is: 0.501664622394
Validation accuracy is: 0.497547302032
('TRAIN:', array([   0,    1,    2, ..., 8558, 8559, 8560]), 'TEST:', array([2816, 2818, 2819, ..., 6366, 6367, 6368]))
train freq [1575 2066 2066]
val freq [ 788 1033 1033]
Training cv 2 ...
Train accuracy is: 0.495356579639
Validation accuracy is: 0.505956552207
('TRAIN:', array([   0,    1,    2, ..., 6366, 6367, 6368]), 'TEST:', array([5201, 5204, 5206, ..., 8558, 8559, 8560]))
train freq [1576 2066 2066]
val freq [ 787 1033 1033]
Training cv 3 ...
