In [36]:
import os 
import re
import keras

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, SpatialDropout1D
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

#Based on example here: https://devklaus.wordpress.com/2018/03/11/sentiment-analysis-on-us-twitter-airlines-dataset-a-deep-learning-approach/

In [37]:
train = pd.read_csv('../data/train.csv',sep=';',index_col=0)
val = pd.read_csv('../data/val.csv',sep=';',index_col=0)
test = pd.read_csv('../data/test.csv',sep=';',index_col=0)
df = train.append(val).append(test)

In [38]:
from sklearn.preprocessing import MultiLabelBinarizer
onehot_enc = MultiLabelBinarizer()
onehot_enc.fit(df['text'])

MultiLabelBinarizer(classes=None, sparse_output=False)

In [39]:
max_features = 2000
word_tokenizer = Tokenizer(num_words=max_features, split=' ')
word_tokenizer.fit_on_texts(df['text'])


#X = word_tokenizer.texts_to_sequences(train['text'].values.tolist() + val['text'].values.tolist() + test['text'].values.tolist())

Xtrain = word_tokenizer.texts_to_matrix(train['text'])
Xval = word_tokenizer.texts_to_matrix(val['text'])
Xtest = word_tokenizer.texts_to_matrix(test['text'])

In [40]:
ytrain = pd.get_dummies(train['sentiment']).values
yval = pd.get_dummies(val['sentiment']).values
ytest = pd.get_dummies(test['sentiment']).values
print(Xtrain.shape,ytrain.shape)
print(Xval.shape,yval.shape)
print(Xtest.shape,ytest.shape)

(5000, 2000) (5000, 3)
(1000, 2000) (1000, 3)
(1089, 2000) (1089, 3)


In [41]:
from sklearn.naive_bayes import BernoulliNB

bnbc = BernoulliNB(binarize=None)
bnbc.fit(Xtrain, train['sentiment'])
np.mean(bnbc.predict(Xval) == val['sentiment'])

0.737

In [43]:
from sklearn.svm import LinearSVC
lsvm = LinearSVC()
lsvm.fit(Xtrain, train['sentiment'])
np.mean(lsvm.predict(Xval) == val['sentiment'])

0.724

In [44]:
#Uni + Bigram
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(ngram_range=(1,2))
Xtrain = vectorizer.fit_transform(train['text']).toarray()
Xval = vectorizer.transform(val['text']).toarray()
Xtest = vectorizer.transform(test['text']).toarray()

bnbc = BernoulliNB(binarize=None)
bnbc.fit(Xtrain, train['sentiment'])
np.mean(bnbc.predict(Xval) == val['sentiment'])

0.75

In [45]:
lsvm = LinearSVC()
lsvm.fit(Xtrain, train['sentiment'])
np.mean(lsvm.predict(Xval) == val['sentiment'])

0.75

In [46]:
#Just Bigram
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(ngram_range=(2,2))
Xtrain = vectorizer.fit_transform(train['text']).toarray()
Xval = vectorizer.transform(val['text']).toarray()
Xtest = vectorizer.transform(test['text']).toarray()

bnbc = BernoulliNB(binarize=None)
bnbc.fit(Xtrain, train['sentiment'])
np.mean(bnbc.predict(Xval) == val['sentiment'])

0.672

In [47]:
lsvm = LinearSVC()
lsvm.fit(Xtrain, train['sentiment'])
np.mean(lsvm.predict(Xval) == val['sentiment'])

0.658

In [48]:
#Uni + Bigram + Trigram
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(ngram_range=(1,3))
Xtrain = vectorizer.fit_transform(train['text']).toarray()
Xval = vectorizer.transform(val['text']).toarray()
Xtest = vectorizer.transform(test['text']).toarray()

bnbc = BernoulliNB(binarize=None)
bnbc.fit(Xtrain, train['sentiment'])
np.mean(bnbc.predict(Xval) == val['sentiment'])

0.714

In [49]:
lsvm = LinearSVC()
lsvm.fit(Xtrain, train['sentiment'])
np.mean(lsvm.predict(Xval) == val['sentiment'])

0.748

In [50]:
#Just Trigram
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(ngram_range=(3,3))
Xtrain = vectorizer.fit_transform(train['text']).toarray()
Xval = vectorizer.transform(val['text']).toarray()
Xtest = vectorizer.transform(test['text']).toarray()

bnbc = BernoulliNB(binarize=None)
bnbc.fit(Xtrain, train['sentiment'])
np.mean(bnbc.predict(Xval) == val['sentiment'])

0.49

In [51]:
lsvm = LinearSVC()
lsvm.fit(Xtrain, train['sentiment'])
np.mean(lsvm.predict(Xval) == val['sentiment'])

0.525

In [10]:
max_features = 1000
embed_dim = 128
lstm_out = 196
batch_size = 512
nepochs=20
res = []

In [None]:

nepochs=30
from tqdm import tqdm
with tqdm(total=2*3*4*3) as pbar:
    for max_features in [5000,10000]:
        for embed_dim in [64]:
            for lstm_out in [32]:
                for batch_size in [128]:
                    word_tokenizer = Tokenizer(num_words=max_features, split=' ')
                    word_tokenizer.fit_on_texts(train['text'].values.tolist() + val['text'].values.tolist() + test['text'].values.tolist())


                    X = word_tokenizer.texts_to_sequences(train['text'].values.tolist() + val['text'].values.tolist() + test['text'].values.tolist())
                    X = pad_sequences(X)
                    Xtrain = X[:len(train)]
                    Xval = X[len(train):len(train)+len(val)]
                    Xtest = X[len(train)+len(val):]



                    model = Sequential()
                    model.add(Embedding(max_features, embed_dim,input_length = Xtrain.shape[1]))
                    model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
                    model.add(Dense(3,activation='softmax'))
                    model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
                    #print(model.summary())

                    history = model.fit(Xtrain, ytrain, epochs = nepochs, batch_size=batch_size, validation_data=(Xval, yval),verbose=False)
                    res += [[max_features,embed_dim,lstm_out,batch_size] + history.history['acc'] + history.history['val_acc']]
                    print([nepochs, max_features,embed_dim,lstm_out,batch_size] + [np.max(history.history['acc'])] + [np.max(history.history['val_acc'])])
                    pbar.update()

  1%|▏         | 1/72 [01:26<1:42:19, 86.47s/it]

[30, 5000, 128, 64, 256, 0.9867999994277954, 0.7630000066757202]


In [18]:
df = pd.DataFrame(np.array(res)[:,:5],columns = ['nepochs','max_features','embed_dim','lstm_out','batch_size'])
tr_acc = pd.DataFrame(np.array(res)[:,5:15])
va_acc = pd.DataFrame(np.array(res)[:,15:25])
df['train_max'] = tr_acc.max(axis=1)
df['val_max'] = va_acc.max(axis=1)
df['diff'] = df['train_max'] - df['val_max']
df[(df['lstm_out']!=3)].sort_values('val_max',ascending=False)

Unnamed: 0,nepochs,max_features,embed_dim,lstm_out,batch_size,train_max,val_max,diff
18,10.0,5000.0,128.0,64.0,256.0,0.9462,0.768,0.1782
15,10.0,5000.0,64.0,64.0,256.0,0.9322,0.767,0.1652
22,10.0,5000.0,256.0,128.0,256.0,0.9666,0.764,0.2026
21,10.0,5000.0,256.0,64.0,256.0,0.9638,0.763,0.2008
20,10.0,5000.0,128.0,256.0,256.0,0.9552,0.762,0.1932
6,10.0,1000.0,128.0,64.0,256.0,0.8488,0.76,0.0888
3,10.0,1000.0,64.0,64.0,256.0,0.833,0.759,0.074
4,10.0,1000.0,64.0,128.0,256.0,0.8306,0.759,0.0716
17,10.0,5000.0,64.0,256.0,256.0,0.9368,0.758,0.1788
23,10.0,5000.0,256.0,256.0,256.0,0.9704,0.755,0.2154


In [36]:
model.predict(history.validation_data[0]).round(0)

array([[0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       ...,
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 0.]], dtype=float32)

In [42]:
history.history

{'acc': [0.4062000021934509,
  0.5275999977111816,
  0.5857999985694885,
  0.6772000019073486,
  0.7309999984741211,
  0.7759999981880188,
  0.7929999979972839,
  0.8005999985694885,
  0.8136000009536744,
  0.8380000006675721],
 'loss': [1.0746020488739014,
  0.9884465896606446,
  0.8958369854927063,
  0.7637522832870484,
  0.6406505021095276,
  0.5588097978591919,
  0.5105084029197693,
  0.48304083008766174,
  0.46273315505981444,
  0.43337293162345886],
 'val_acc': [0.5220000109672547,
  0.5660000066757203,
  0.6179999914169312,
  0.672000009059906,
  0.7129999985694885,
  0.7319999976158142,
  0.7400000061988831,
  0.7509999952316284,
  0.7349999890327453,
  0.7570000038146972],
 'val_loss': [1.0360965976715089,
  0.9714739093780518,
  0.8775711927413941,
  0.7391939158439637,
  0.678727029800415,
  0.6424490041732788,
  0.6495523729324341,
  0.6502507195472718,
  0.6688855495452881,
  0.7119390687942505]}