In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import time
from scipy.stats import itemfreq
import random

from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold

# import user defined load_data to build input data
from load_data import Data

import os

# Directory that holds the twitter_sentiment data files.
# Set the TWITTER_SENTIMENT_DIR environment variable to point the notebook at
# another location (e.g. '/home/sam/Data/twitter_sentiment/') without editing
# the code; when it is unset or empty the original default below is used.
# os.path.join(path, '') guarantees a single trailing separator, matching the
# original literal's trailing '/'.
# NOTE(review): presumably Data builds file names by appending to FILE_PATH,
# which is why the trailing separator is preserved - confirm in load_data.
FILE_PATH = os.path.join(
    os.environ.get('TWITTER_SENTIMENT_DIR') or '/home/sam/Hhd/twitter_sentiment/',
    '')

### Use Airline data

In [2]:
# Build the project Data helper for the airline CSV located under FILE_PATH and
# read only the sentiment label and tweet text columns into a DataFrame.
airline_data = Data('Airline-Sentiment-2-w-AA.csv', FILE_PATH)
airline_df = airline_data.csv_df(['airline_sentiment', 'text']) # load data
# pre_process modifies airline_df in place (its own log message says so), which
# is why the return value is not assigned.
# NOTE(review): the preview below shows a 'tokenized' column - presumably added
# by this call; confirm in load_data.
airline_data.pre_process(airline_df) # pre-process data
# Drop the rows whose airline_sentiment is 'neutral' so that only the positive
# and negative tweets remain (two-class problem).
airline_df = airline_data.drop_value(airline_df, 'airline_sentiment', 'neutral')
airline_df.head()

Loading csv: Airline-Sentiment-2-w-AA.csv ...
Note: pre_process changes the dataframe inplace.
Dropped neutral on column airline_sentiment


Unnamed: 0,airline_sentiment,text,tokenized
0,positive,plus youve added commercials to the experienc...,"[plus, youve, added, commercials, experience, ..."
1,negative,its really aggressive to blast obnoxious ente...,"[really, aggressive, blast, obnoxious, enterta..."
2,negative,and its a really big bad thing about it,"[really, big, bad, thing]"
3,negative,seriously would pay a flight for seats that ...,"[seriously, would, pay, flight, seats, didnt, ..."
4,positive,yes nearly every time I fly VX this ear worm ...,"[yes, nearly, every, time, fly, vx, ear, worm,..."


In [3]:
# Map the sentiment strings to integer class ids: positive -> 1, negative -> 2.
class_label = {'positive': 1, 'negative': 2}
# Three-class alternative, for use when the neutral rows are not dropped:
# class_label = {'positive': 1, 'neutral': 2, 'negative': 3}
# cat2num is given the source column 'airline_sentiment', the target column
# 'class' and the mapping above; in the preview below the frame carries 'class'
# and no longer shows 'airline_sentiment'.
airline_df = airline_data.cat2num(airline_df,'airline_sentiment', 'class', class_label)
airline_df.head()

class
Done converting categorical to numeric, this changes df.


Unnamed: 0,text,tokenized,class
0,plus youve added commercials to the experienc...,"[plus, youve, added, commercials, experience, ...",1
1,its really aggressive to blast obnoxious ente...,"[really, aggressive, blast, obnoxious, enterta...",2
2,and its a really big bad thing about it,"[really, big, bad, thing]",2
3,seriously would pay a flight for seats that ...,"[seriously, would, pay, flight, seats, didnt, ...",2
4,yes nearly every time I fly VX this ear worm ...,"[yes, nearly, every, time, fly, vx, ear, worm,...",1


In [4]:
# Look at the class distribution before balancing (1 = positive, 2 = negative).
airline_df['class'].value_counts()

2    9178
1    2363
Name: class, dtype: int64

In [5]:
# Balance the two classes; comment this assignment out to keep the original,
# imbalanced distribution.
# NOTE(review): the recorded counts go from 9178 / 2363 to 2363 / 2363, so this
# appears to downsample the majority class to the minority size - confirm in
# load_data how the rows are chosen and whether the choice is seeded.
# This rebinds airline_df, so the cell is not idempotent with respect to the
# unbalanced frame.
airline_df = airline_data.balance_class(airline_df)
# Check the class counts again after balancing.
airline_df['class'].value_counts()

2    2363
1    2363
Name: class, dtype: int64

In [6]:
# Preview the balanced frame.
# NOTE(review): the row labels in the recorded preview run 0..4 although rows
# were removed, so the index looks to have been renumbered - confirm, because
# later integer indexing of airline_df['class'] depends on it.
airline_df.head()

Unnamed: 0,text,tokenized,class
0,plus youve added commercials to the experienc...,"[plus, youve, added, commercials, experience, ...",1
1,seriously would pay a flight for seats that ...,"[seriously, would, pay, flight, seats, didnt, ...",2
2,yes nearly every time I fly VX this ear worm ...,"[yes, nearly, every, time, fly, vx, ear, worm,...",1
3,Well I didntbut NOW I DO D,"[well, didntbut]",1
4,it was amazing and arrived an hour early Your...,"[amazing, arrived, hour, early, youre, good]",1


In [7]:
# Train the word-vector model, or load an existing one (which of the two
# happens is decided inside build_wordvec). size=800 is the embedding
# dimensionality: the recorded training log reports "800 features".
# NOTE(review): the recorded log reports a corpus of 17,005,207 raw words in
# 1,701 sentences, which does not match the 11,541 airline tweets loaded above -
# the vectors appear to be trained on a separate corpus; confirm in load_data.
model = airline_data.build_wordvec(size=800)

2017-03-18 22:12:39,391 : INFO : collecting all words and their counts
2017-03-18 22:12:39,392 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types


Training for tweets800.model.bin ...


2017-03-18 22:12:42,929 : INFO : collected 253854 word types from a corpus of 17005207 raw words and 1701 sentences
2017-03-18 22:12:42,930 : INFO : Loading a fresh vocabulary
2017-03-18 22:12:43,136 : INFO : min_count=5 retains 71290 unique words (28% of original 253854, drops 182564)
2017-03-18 22:12:43,136 : INFO : min_count=5 leaves 16718844 word corpus (98% of original 17005207, drops 286363)
2017-03-18 22:12:43,248 : INFO : deleting the raw counts dictionary of 253854 items
2017-03-18 22:12:43,271 : INFO : sample=0.001 downsamples 38 most-common words
2017-03-18 22:12:43,271 : INFO : downsampling leaves estimated 12506280 word corpus (74.8% of prior 16718844)
2017-03-18 22:12:43,272 : INFO : estimated required memory for 71290 words and 800 dimensions: 491901000 bytes
2017-03-18 22:12:43,424 : INFO : resetting layer weights
2017-03-18 22:12:44,313 : INFO : training model with 4 workers on 71290 vocabulary and 800 features, using sg=1 hs=0 sample=0.001 negative=5 window=5
2017-03-

Done building.


In [8]:
# Longest tweet length reported by the helper (21 in the recorded run); it is
# passed on as the common sequence length for the conversion.
max_len = airline_data.max_len(airline_df)
# Convert the tweets to word vectors using the model. The result is unpacked
# later as a 3-D array of shape (N, M, D), where M equals max_len in the
# recorded run.
# NOTE(review): the helper reports "Total 17 not in vocab." - how such words are
# represented is not visible here; confirm in load_data.
data = airline_data.convert2vec(airline_df, max_len, model, name='airline800_2class')
# Persist the converted array under the same name.
airline_data.save_vec(data, name='airline800_2class')

max sentence length is:  21
Total 17 not in vocab.
Done converting tweets to vec!
Saved airline800_2class to disk.


In [9]:
# Optional sanity check of the embeddings (left disabled): the classic
# "king - man + woman" analogy query against the trained model.
# model.wv.most_similar(positive=['woman', 'king'], negative=['man'])

In [10]:
# Flatten each tweet's (M words x D dimensions) matrix into one feature vector
# so the estimator used later receives a 2-D (n_samples, n_features) array.
#
# NOTE: this rebinds `data` from 3-D to 2-D, so the cell cannot be re-run on its
# own; re-run the convert2vec cell first.
#
# The print calls use a single pre-formatted argument so the cell produces the
# same text on Python 2 (where the original print statements lived) and on
# Python 3 (where a print statement is a SyntaxError).
N, M, D = data.shape
print("N, M, D: {} {} {}".format(N, M, D))
# For a theano model instead, the flattened array would additionally need
# .astype(theano.config.floatX).
data = data.reshape(N, M * D)
label = airline_df['class']
# For theano the labels would also need to be zero-based int8:
# label = np.int8(label) - 1
# One label per flattened row; a mismatch would silently misalign features and
# targets in the cross-validation below.
assert label.shape[0] == N, "data and label have different numbers of rows"
print(data.shape)
print(label.shape)

N, M, D: 4726 21 800
(4726, 16800)
(4726,)


In [11]:
# Stratified 3-fold cross-validation of an SVC (default hyper-parameters, fixed
# random_state) on the flattened tweet vectors. For every fold this prints the
# per-class sample counts of both splits and the train / validation accuracy.
skf = StratifiedKFold(n_splits=3)

# skf.split yields *positional* row numbers. `label` is a pandas Series, and
# indexing a Series with an integer array looks rows up by index label, which
# only coincides with position when the index is exactly 0..N-1. Working on a
# plain array makes the selection positional regardless of the frame's index.
label_array = np.asarray(label)

for n_cv, (train_index, val_index) in enumerate(skf.split(data, label_array), 1):
    data_train, data_val = data[train_index], data[val_index]
    label_train, label_val = label_array[train_index], label_array[val_index]

    # Per-class counts, ordered by class id. np.unique(..., return_counts=True)
    # is the documented replacement for the deprecated scipy.stats.itemfreq.
    train_counts = np.unique(label_train, return_counts=True)[1]
    print("train freq {}".format(train_counts))
    val_counts = np.unique(label_val, return_counts=True)[1]
    print("val freq {}".format(val_counts))

    print("Training cv {} ...".format(n_cv))
    clf = SVC(random_state=42)
    clf.fit(data_train, label_train)
    train_acc = clf.score(data_train, label_train)
    val_acc = clf.score(data_val, label_val)
    print("Train accuracy is: {}".format(train_acc))
    print("Validation accuracy is: {}".format(val_acc))
    print("\n")

train freq [1575 1575]
val freq [788 788]
Training cv 1 ...
Train accuracy is: 0.7
Validation accuracy is: 0.678934010152


train freq [1575 1575]
val freq [788 788]
Training cv 2 ...
Train accuracy is: 0.688571428571
Validation accuracy is: 0.690355329949


train freq [1576 1576]
val freq [787 787]
Training cv 3 ...
Train accuracy is: 0.690355329949
Validation accuracy is: 0.689326556544


