In [100]:
import csv
import nltk
import re
import numpy as np
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, train_test_split
from scipy.sparse import hstack
from sklearn.metrics import make_scorer, accuracy_score, f1_score, precision_score, recall_score

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, GRU, Dropout
from keras.layers import Conv1D, GlobalAveragePooling1D, MaxPooling1D
from keras.utils.np_utils import to_categorical

from keras import losses

DATA_PATH="../data/"
DATA = DATA_PATH + "labeled_data.csv"

In [101]:
def readData(path):
    data = []
    with open(path,'r') as file:
        data = [x for x in csv.reader(file, delimiter=',')]
    return data

def getTweets(raw):
    data = [x[6] for x in raw]
    return np.array(data)

def getClass(raw):
    classes = [x[5] for x in raw]
    return np.array(classes)

def removePattern(tweet, pattern):
    r = re.findall(pattern, tweet)
    for x in r:
        tweet = re.sub(x, '', tweet)
    return tweet

def preprocess(data):
    cleanData = []
    for tweet in data:
        tweet = removePattern(tweet, "@[\w]*")
        tweet = tweet.replace("#", "") # Removing '#' from hashtags
        tweet = tweet.replace("[^a-zA-Z#]", " ") # Removing punctuation and special characters
        tweet = re.sub(r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&amp;+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+',"<URL>", tweet)
        tweet = re.sub(" +", " ", tweet)
        tweet = tweet.lower()
        cleanData.append(tweet)
    return cleanData

def tokenize(text):
    return TweetTokenizer.tokenize(text)

def evaluate(target, predicted):
    f1 = f1_score(target, predicted, average='weighted')
    acc = accuracy_score(target, predicted)
    rec = recall_score(target, predicted, average = 'macro')
    print("F1 score:   ", f1)
    print("Avg Recall: ", rec)    
    print("Accuracy:   ", acc)

In [102]:
en_stopwords = set(stopwords.words("english")) 

raw = readData(DATA) 
tweets = getTweets(raw)
classes = getClass(raw)
tweets = preprocess(tweets)

In [103]:
tweets = tweets[1:]
classes = classes[1:]

In [104]:
X = [x for x in tweets]
X = np.delete(np.array(X), [0])
y = np.delete(classes, [0])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

## Word Tokens with Simple CountVectorizer

In [105]:
vectorizer = CountVectorizer(
    analyzer = 'word',
    lowercase = True,
    ngram_range=(1, 1),
    stop_words = en_stopwords)
vectorizer.fit(X_train)
train_features = vectorizer.transform(X_train)
test_features = vectorizer.transform(X_test)

In [106]:
classifier = LogisticRegression(C=0.1, solver='sag')

In [107]:
classifier = SVC(C = 0.1)

In [108]:
classifier.fit(train_features, y_train)
y_predict = classifier.predict(test_features)
evaluate(y_test, y_predict)



F1 score:    0.6779519819556444
Avg Recall:  0.3333333333333333
Accuracy:    0.7758725035303611


  'precision', 'predicted', average, warn_for)


## Char Tokens with Simple CountVectorizer

In [112]:
vectorizer = CountVectorizer(
    analyzer = 'char',
    tokenizer = tokenize,
    lowercase = True,
    ngram_range=(2, 6),
    stop_words = en_stopwords)
vectorizer.fit(X_train)
train_features = vectorizer.transform(X_train)
test_features = vectorizer.transform(X_test)

In [113]:
classifier = LogisticRegression(C=0.1, solver='sag')

In [114]:
classifier = SVC(C = 0.1)

In [115]:
classifier.fit(train_features, y_train)
y_predict = classifier.predict(test_features)
evaluate(y_test, y_predict)

F1 score:    0.6779519819556444
Avg Recall:  0.3333333333333333
Accuracy:    0.7758725035303611


## Word Tokens with TF-IDF Vectorizer

In [116]:
vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 1))
vectorizer.fit(X_train)
train_features = vectorizer.transform(X_train)
test_features = vectorizer.transform(X_test)

In [117]:
classifier = LogisticRegression(C=0.1, solver='sag')

In [118]:
classifier = SVC(C = 0.1)

In [119]:
classifier.fit(train_features, y_train)
y_predict = classifier.predict(test_features)
evaluate(y_test, y_predict)

F1 score:    0.6779519819556444
Avg Recall:  0.3333333333333333
Accuracy:    0.7758725035303611


## Char Tokens with TF-IDF Vectorizer

In [120]:
vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    stop_words='english',
    ngram_range=(2, 6),
    max_features=50000)
vectorizer.fit(X_train)
train_features = vectorizer.transform(X_train)
test_features = vectorizer.transform(X_test)

In [121]:
classifier = LogisticRegression(C=0.1, solver='sag')

In [122]:
classifier = SVC(C = 0.1)

In [123]:
classifier.fit(train_features, y_train)
y_predict = classifier.predict(test_features)
evaluate(y_test, y_predict)

F1 score:    0.6779519819556444
Avg Recall:  0.3333333333333333
Accuracy:    0.7758725035303611


# CNN

In [86]:
txt = ''
maxlen = 128
for doc in tweets:
    for s in doc:
        txt += s
chars = set(txt)
vocab_size = len(chars)
print('total chars:', len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

total chars: 68


In [87]:
def vectorize_sentences(data, char_indices):
    X = []
    for sentences in data:
        x = [char_indices[w] for w in sentences]
        x2 = np.eye(len(char_indices))[x]
        X.append(x2)
    return (pad_sequences(X, maxlen=maxlen))

In [88]:
data = vectorize_sentences(tweets, char_indices)
y = to_categorical(classes)
X_train, X_test, y_train, y_test = train_test_split(data, y, test_size = 0.2)

## 2 Layer

In [89]:
seq_length = 128

nb_filter = 128
model = Sequential()
model.add(Conv1D(nb_filter, 7, activation='relu', input_shape=(seq_length, 68)))
model.add(MaxPooling1D(3))
model.add(Conv1D(nb_filter, 3, activation='relu'))
model.add(GlobalAveragePooling1D())
model.add(Dropout(0.2))
model.add(Dense(3, activation='sigmoid'))

model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_23 (Conv1D)           (None, 122, 128)          61056     
_________________________________________________________________
max_pooling1d_6 (MaxPooling1 (None, 40, 128)           0         
_________________________________________________________________
conv1d_24 (Conv1D)           (None, 38, 128)           49280     
_________________________________________________________________
global_average_pooling1d_6 ( (None, 128)               0         
_________________________________________________________________
dropout_5 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_14 (Dense)             (None, 3)                 387       
Total params: 110,723
Trainable params: 110,723
Non-trainable params: 0
_________________________________________________________________
None

In [90]:
batch_size = 32
model.fit(X_train, y_train, epochs = 10, batch_size = batch_size, verbose = 2)

Epoch 1/10
 - 5s - loss: 0.5800 - acc: 0.7875
Epoch 2/10
 - 3s - loss: 0.4660 - acc: 0.8257
Epoch 3/10
 - 3s - loss: 0.4043 - acc: 0.8526
Epoch 4/10
 - 3s - loss: 0.3661 - acc: 0.8688
Epoch 5/10
 - 3s - loss: 0.3354 - acc: 0.8816
Epoch 6/10
 - 3s - loss: 0.3122 - acc: 0.8898
Epoch 7/10
 - 3s - loss: 0.2934 - acc: 0.8974
Epoch 8/10
 - 3s - loss: 0.2727 - acc: 0.9040
Epoch 9/10
 - 3s - loss: 0.2556 - acc: 0.9116
Epoch 10/10
 - 3s - loss: 0.2436 - acc: 0.9148


<keras.callbacks.History at 0x7f3eec1f9d68>

In [91]:
temp = model.predict(X_test)
preds = []

for x in range(len(temp)):
    result = temp[x]
    if np.argmax(result) == 0:
        preds.append([1, 0, 0])
    elif np.argmax(result) == 1:
        preds.append([0, 1, 0])
    elif np.argmax(result) == 2:
        preds.append([0, 0, 1])
        
preds = np.array(preds)
evaluate(preds, y_test)

F1 score:    0.894722387688959
Avg Recall:  0.7025203893750982
Accuracy:    0.8741174097236232


## 5 Layer

In [92]:
seq_length = 128

nb_filter = 128
model = Sequential()
model.add(Conv1D(nb_filter, 7, activation='relu', input_shape=(seq_length, 68)))
model.add(Conv1D(nb_filter, 7, activation='relu'))
model.add(MaxPooling1D(3))
model.add(Conv1D(nb_filter, 3, activation='relu'))
model.add(Conv1D(nb_filter, 3, activation='relu'))
model.add(Conv1D(nb_filter, 3, activation='relu'))
model.add(GlobalAveragePooling1D())
model.add(Dropout(0.2))
model.add(Dense(3, activation='sigmoid'))

model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_25 (Conv1D)           (None, 122, 128)          61056     
_________________________________________________________________
conv1d_26 (Conv1D)           (None, 116, 128)          114816    
_________________________________________________________________
max_pooling1d_7 (MaxPooling1 (None, 38, 128)           0         
_________________________________________________________________
conv1d_27 (Conv1D)           (None, 36, 128)           49280     
_________________________________________________________________
conv1d_28 (Conv1D)           (None, 34, 128)           49280     
_________________________________________________________________
conv1d_29 (Conv1D)           (None, 32, 128)           49280     
_________________________________________________________________
global_average_pooling1d_7 ( (None, 128)               0         
__________

In [93]:
batch_size = 32
model.fit(X_train, y_train, epochs = 10, batch_size = batch_size)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f3eb1455b70>

In [94]:
temp = model.predict(X_test)
preds = []

for x in range(len(temp)):
    result = temp[x]
    if np.argmax(result) == 0:
        preds.append([1, 0, 0])
    elif np.argmax(result) == 1:
        preds.append([0, 1, 0])
    elif np.argmax(result) == 2:
        preds.append([0, 0, 1])
        
preds = np.array(preds)
evaluate(preds, y_test)

F1 score:    0.8524946799492567
Avg Recall:  0.6583308015538095
Accuracy:    0.8523300383296348


## Text To Sequence

In [45]:
allTweets = [x for x in tweets]
classes = [x for x in classes]

n = int(len(allTweets)*0.8)

trainTweets = allTweets[:n]
testTweets = allTweets[n:]
trainClass = classes[:n]
testClass = classes[n:]

max_features = 2000
tokenizer = Tokenizer(nb_words=max_features, split=' ')
tokenizer.fit_on_texts(trainTweets)

X = tokenizer.texts_to_sequences(trainTweets + testTweets)
X_train = X[:n]
X_test = X[n:]
X_train = pad_sequences(X_train, maxlen = 32)
X_test = pad_sequences(X_test, maxlen = 32)



In [46]:
Y_train = []
Y_test = []
for i in trainClass:
    if i == "0":
        Y_train.append([1, 0, 0])
    elif i == "1":
        Y_train.append([0, 1, 0])
    elif i == "2":
        Y_train.append([0, 0, 1])
Y_train = np.array(Y_train)

for i in testClass:
    if i == "0":
        Y_test.append([1, 0, 0])
    elif i == "1":
        Y_test.append([0, 1, 0])
    elif i == "2":
        Y_test.append([0, 0, 1])
Y_test = np.array(Y_test)

## GRU Embedding Size: 128

In [47]:
embed_dim = 128
lstm_out = 196

model = Sequential()
model.add(Embedding(max_features, embed_dim, input_length = X_train.shape[1], dropout=0.2))
model.add(GRU(lstm_out, activation='tanh', recurrent_activation='hard_sigmoid',  
              use_bias=True, kernel_initializer='glorot_uniform', recurrent_initializer='orthogonal', bias_initializer='zeros', dropout_U=0.2, dropout_W=0.2))
model.add(Dense(3,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

  """
  import sys


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 32, 128)           256000    
_________________________________________________________________
gru_1 (GRU)                  (None, 196)               191100    
_________________________________________________________________
dense_5 (Dense)              (None, 3)                 591       
Total params: 447,691
Trainable params: 447,691
Non-trainable params: 0
_________________________________________________________________
None


In [48]:
batch_size = 32
model.fit(X_train, Y_train, epochs = 14, batch_size = batch_size, verbose = 2)

Epoch 1/14
 - 28s - loss: 0.3642 - acc: 0.8719
Epoch 2/14
 - 27s - loss: 0.2542 - acc: 0.9118
Epoch 3/14
 - 26s - loss: 0.2292 - acc: 0.9181
Epoch 4/14
 - 27s - loss: 0.2085 - acc: 0.9256
Epoch 5/14
 - 27s - loss: 0.1858 - acc: 0.9347
Epoch 6/14
 - 27s - loss: 0.1656 - acc: 0.9399
Epoch 7/14
 - 26s - loss: 0.1463 - acc: 0.9467
Epoch 8/14
 - 27s - loss: 0.1279 - acc: 0.9533
Epoch 9/14
 - 27s - loss: 0.1108 - acc: 0.9587
Epoch 10/14
 - 27s - loss: 0.0996 - acc: 0.9642
Epoch 11/14
 - 26s - loss: 0.0860 - acc: 0.9681
Epoch 12/14
 - 27s - loss: 0.0767 - acc: 0.9722
Epoch 13/14
 - 27s - loss: 0.0710 - acc: 0.9746
Epoch 14/14
 - 27s - loss: 0.0601 - acc: 0.9791


<keras.callbacks.History at 0x7f3f31ae0390>

In [49]:
preds = []

for x in range(len(X_test)):
    result = model.predict(X_test[x].reshape(1,X_test.shape[1]),batch_size=1,verbose = 2)[0]
    if np.argmax(result) == 0:
        preds.append([1, 0, 0])
    elif np.argmax(result) == 1:
        preds.append([0, 1, 0])
    elif np.argmax(result) == 2:
        preds.append([0, 0, 1])
        
preds = np.array(preds)

In [50]:
evaluate(preds, Y_test)

F1 score:    0.8937814866545816
Avg Recall:  0.6907232447257861
Accuracy:    0.8922735525519467


## GRU Embedding Size : 128 RMSProp

In [51]:
embed_dim = 128
lstm_out = 196

model = Sequential()
model.add(Embedding(max_features, embed_dim, input_length = X_train.shape[1], dropout=0.2))
model.add(GRU(lstm_out, activation='tanh', recurrent_activation='hard_sigmoid',  
              use_bias=True, kernel_initializer='glorot_uniform', recurrent_initializer='orthogonal', bias_initializer='zeros', dropout_U=0.2, dropout_W=0.2))
model.add(Dense(3,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='rmsprop',metrics = ['accuracy'])
print(model.summary())

  """
  import sys


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 32, 128)           256000    
_________________________________________________________________
gru_2 (GRU)                  (None, 196)               191100    
_________________________________________________________________
dense_6 (Dense)              (None, 3)                 591       
Total params: 447,691
Trainable params: 447,691
Non-trainable params: 0
_________________________________________________________________
None


In [52]:
batch_size = 32
model.fit(X_train, Y_train, epochs = 14, batch_size = batch_size, verbose = 2)

Epoch 1/14
 - 28s - loss: 0.3711 - acc: 0.8664
Epoch 2/14
 - 26s - loss: 0.2718 - acc: 0.9070
Epoch 3/14
 - 26s - loss: 0.2584 - acc: 0.9097
Epoch 4/14
 - 26s - loss: 0.2488 - acc: 0.9132
Epoch 5/14
 - 26s - loss: 0.2400 - acc: 0.9153
Epoch 6/14
 - 26s - loss: 0.2332 - acc: 0.9194
Epoch 7/14
 - 26s - loss: 0.2260 - acc: 0.9220
Epoch 8/14
 - 27s - loss: 0.2184 - acc: 0.9253
Epoch 9/14
 - 26s - loss: 0.2101 - acc: 0.9280
Epoch 10/14
 - 26s - loss: 0.2053 - acc: 0.9287
Epoch 11/14
 - 26s - loss: 0.1959 - acc: 0.9323
Epoch 12/14
 - 27s - loss: 0.1892 - acc: 0.9361
Epoch 13/14
 - 26s - loss: 0.1851 - acc: 0.9359
Epoch 14/14
 - 27s - loss: 0.1768 - acc: 0.9404


<keras.callbacks.History at 0x7f3f28058128>

In [53]:
preds = []

for x in range(len(X_test)):
    result = model.predict(X_test[x].reshape(1,X_test.shape[1]),batch_size=1,verbose = 2)[0]
    if np.argmax(result) == 0:
        preds.append([1, 0, 0])
    elif np.argmax(result) == 1:
        preds.append([0, 1, 0])
    elif np.argmax(result) == 2:
        preds.append([0, 0, 1])
        
preds = np.array(preds)

In [54]:
evaluate(preds, Y_test)

F1 score:    0.9134313144875803
Avg Recall:  0.7290463026285119
Accuracy:    0.9110349001412145


## GRU Embedding Size: 256

In [55]:
embed_dim = 256
lstm_out = 196

model = Sequential()
model.add(Embedding(max_features, embed_dim, input_length = X_train.shape[1], dropout=0.2))
model.add(GRU(lstm_out, activation='tanh', recurrent_activation='hard_sigmoid',  
              use_bias=True, kernel_initializer='glorot_uniform', recurrent_initializer='orthogonal', bias_initializer='zeros', dropout_U=0.2, dropout_W=0.2))
model.add(Dense(3,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

  """
  import sys


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 32, 256)           512000    
_________________________________________________________________
gru_3 (GRU)                  (None, 196)               266364    
_________________________________________________________________
dense_7 (Dense)              (None, 3)                 591       
Total params: 778,955
Trainable params: 778,955
Non-trainable params: 0
_________________________________________________________________
None


In [56]:
batch_size = 32
model.fit(X_train, Y_train, epochs = 14, batch_size = batch_size, verbose = 2)

Epoch 1/14
 - 29s - loss: 0.3546 - acc: 0.8733
Epoch 2/14
 - 27s - loss: 0.2494 - acc: 0.9118
Epoch 3/14
 - 27s - loss: 0.2214 - acc: 0.9198
Epoch 4/14
 - 27s - loss: 0.1949 - acc: 0.9298
Epoch 5/14
 - 27s - loss: 0.1680 - acc: 0.9408
Epoch 6/14
 - 27s - loss: 0.1429 - acc: 0.9488
Epoch 7/14
 - 27s - loss: 0.1188 - acc: 0.9582
Epoch 8/14
 - 27s - loss: 0.0987 - acc: 0.9636
Epoch 9/14
 - 27s - loss: 0.0834 - acc: 0.9701
Epoch 10/14
 - 26s - loss: 0.0701 - acc: 0.9744
Epoch 11/14
 - 27s - loss: 0.0599 - acc: 0.9797
Epoch 12/14
 - 27s - loss: 0.0526 - acc: 0.9811
Epoch 13/14
 - 26s - loss: 0.0442 - acc: 0.9843
Epoch 14/14
 - 27s - loss: 0.0401 - acc: 0.9854


<keras.callbacks.History at 0x7f3eb93acb38>

In [57]:
preds = []

for x in range(len(X_test)):
    result = model.predict(X_test[x].reshape(1,X_test.shape[1]),batch_size=1,verbose = 2)[0]
    if np.argmax(result) == 0:
        preds.append([1, 0, 0])
    elif np.argmax(result) == 1:
        preds.append([0, 1, 0])
    elif np.argmax(result) == 2:
        preds.append([0, 0, 1])
        
preds = np.array(preds)

In [58]:
evaluate(preds, Y_test)

F1 score:    0.8935686840112977
Avg Recall:  0.694817403171914
Accuracy:    0.8932822271535202


## GRU Embedding Size : 256 RMSProp

In [59]:
embed_dim = 256
lstm_out = 196

model = Sequential()
model.add(Embedding(max_features, embed_dim, input_length = X_train.shape[1], dropout=0.2))
model.add(GRU(lstm_out, activation='tanh', recurrent_activation='hard_sigmoid',  
              use_bias=True, kernel_initializer='glorot_uniform', recurrent_initializer='orthogonal', bias_initializer='zeros', dropout_U=0.2, dropout_W=0.2))
model.add(Dense(3,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='rmsprop',metrics = ['accuracy'])
print(model.summary())

  """
  import sys


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 32, 256)           512000    
_________________________________________________________________
gru_4 (GRU)                  (None, 196)               266364    
_________________________________________________________________
dense_8 (Dense)              (None, 3)                 591       
Total params: 778,955
Trainable params: 778,955
Non-trainable params: 0
_________________________________________________________________
None


In [60]:
batch_size = 32
model.fit(X_train, Y_train, epochs = 14, batch_size = batch_size, verbose = 2)

Epoch 1/14
 - 29s - loss: 0.3589 - acc: 0.8731
Epoch 2/14
 - 26s - loss: 0.2693 - acc: 0.9075
Epoch 3/14
 - 26s - loss: 0.2527 - acc: 0.9104
Epoch 4/14
 - 27s - loss: 0.2429 - acc: 0.9155
Epoch 5/14
 - 26s - loss: 0.2344 - acc: 0.9190
Epoch 6/14
 - 26s - loss: 0.2235 - acc: 0.9237
Epoch 7/14
 - 26s - loss: 0.2146 - acc: 0.9261
Epoch 8/14
 - 26s - loss: 0.2037 - acc: 0.9303
Epoch 9/14
 - 26s - loss: 0.1928 - acc: 0.9340
Epoch 10/14
 - 26s - loss: 0.1856 - acc: 0.9358
Epoch 11/14
 - 26s - loss: 0.1814 - acc: 0.9387
Epoch 12/14
 - 26s - loss: 0.1697 - acc: 0.9434
Epoch 13/14
 - 26s - loss: 0.1600 - acc: 0.9469
Epoch 14/14
 - 26s - loss: 0.1518 - acc: 0.9491


<keras.callbacks.History at 0x7f3eec1f9e80>

In [61]:
preds = []

for x in range(len(X_test)):
    result = model.predict(X_test[x].reshape(1,X_test.shape[1]),batch_size=1,verbose = 2)[0]
    if np.argmax(result) == 0:
        preds.append([1, 0, 0])
    elif np.argmax(result) == 1:
        preds.append([0, 1, 0])
    elif np.argmax(result) == 2:
        preds.append([0, 0, 1])
        
preds = np.array(preds)

In [62]:
evaluate(preds, Y_test)

F1 score:    0.9096601532904508
Avg Recall:  0.7341576765232675
Accuracy:    0.9043776477708292


## LSTM Embedding size : 128

In [63]:
embed_dim = 128
lstm_out = 196

model = Sequential()
model.add(Embedding(max_features, embed_dim, input_length = X_train.shape[1], dropout=0.2))
model.add(LSTM(lstm_out, dropout_U=0.2, dropout_W=0.2))
model.add(Dense(3,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

  """
  


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 32, 128)           256000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 196)               254800    
_________________________________________________________________
dense_9 (Dense)              (None, 3)                 591       
Total params: 511,391
Trainable params: 511,391
Non-trainable params: 0
_________________________________________________________________
None


In [64]:
batch_size = 32
model.fit(X_train, Y_train, epochs = 14, batch_size = batch_size, verbose = 2)

Epoch 1/14
 - 35s - loss: 0.3925 - acc: 0.8600
Epoch 2/14
 - 34s - loss: 0.2549 - acc: 0.9106
Epoch 3/14
 - 33s - loss: 0.2255 - acc: 0.9191
Epoch 4/14
 - 33s - loss: 0.2079 - acc: 0.9254
Epoch 5/14
 - 33s - loss: 0.1893 - acc: 0.9313
Epoch 6/14
 - 33s - loss: 0.1711 - acc: 0.9377
Epoch 7/14
 - 34s - loss: 0.1533 - acc: 0.9428
Epoch 8/14
 - 33s - loss: 0.1360 - acc: 0.9509
Epoch 9/14
 - 34s - loss: 0.1208 - acc: 0.9562
Epoch 10/14
 - 33s - loss: 0.1072 - acc: 0.9599
Epoch 11/14
 - 33s - loss: 0.0937 - acc: 0.9660
Epoch 12/14
 - 33s - loss: 0.0828 - acc: 0.9697
Epoch 13/14
 - 33s - loss: 0.0751 - acc: 0.9736
Epoch 14/14
 - 33s - loss: 0.0631 - acc: 0.9762


<keras.callbacks.History at 0x7f3eb6dee9e8>

In [65]:
preds = []

for x in range(len(X_test)):
    result = model.predict(X_test[x].reshape(1,X_test.shape[1]),batch_size=1,verbose = 2)[0]
    if np.argmax(result) == 0:
        preds.append([1, 0, 0])
    elif np.argmax(result) == 1:
        preds.append([0, 1, 0])
    elif np.argmax(result) == 2:
        preds.append([0, 0, 1])
        
preds = np.array(preds)

In [66]:
evaluate(preds, Y_test)

F1 score:    0.9035124321187848
Avg Recall:  0.7094559678006201
Accuracy:    0.9017550938067379


## Multi layer LSTM rmsprop Embedding size : 128

In [67]:
embed_dim = 128
lstm_out = 196

model = Sequential()
model.add(Embedding(max_features, embed_dim, input_length = X_train.shape[1], dropout=0.2))
model.add(LSTM(lstm_out, dropout_U=0.2, dropout_W=0.2, return_sequences = True))
model.add(LSTM(lstm_out, dropout_U=0.2, dropout_W=0.2))
model.add(Dense(3,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='rmsprop',metrics = ['accuracy'])
print(model.summary())

  """
  
  import sys


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 32, 128)           256000    
_________________________________________________________________
lstm_2 (LSTM)                (None, 32, 196)           254800    
_________________________________________________________________
lstm_3 (LSTM)                (None, 196)               308112    
_________________________________________________________________
dense_10 (Dense)             (None, 3)                 591       
Total params: 819,503
Trainable params: 819,503
Non-trainable params: 0
_________________________________________________________________
None


In [68]:
batch_size = 32
model.fit(X_train, Y_train, epochs = 14, batch_size = batch_size, verbose = 2)

Epoch 1/14
 - 66s - loss: 0.3917 - acc: 0.8610
Epoch 2/14
 - 63s - loss: 0.2795 - acc: 0.9054
Epoch 3/14
 - 63s - loss: 0.2623 - acc: 0.9092
Epoch 4/14
 - 63s - loss: 0.2561 - acc: 0.9126
Epoch 5/14
 - 62s - loss: 0.2502 - acc: 0.9143
Epoch 6/14
 - 63s - loss: 0.2446 - acc: 0.9164
Epoch 7/14
 - 63s - loss: 0.2374 - acc: 0.9176
Epoch 8/14
 - 63s - loss: 0.2322 - acc: 0.9173
Epoch 9/14
 - 62s - loss: 0.2259 - acc: 0.9215
Epoch 10/14
 - 63s - loss: 0.2212 - acc: 0.9229
Epoch 11/14
 - 63s - loss: 0.2151 - acc: 0.9250
Epoch 12/14
 - 63s - loss: 0.2127 - acc: 0.9261
Epoch 13/14
 - 62s - loss: 0.2066 - acc: 0.9295
Epoch 14/14
 - 63s - loss: 0.1979 - acc: 0.9330


<keras.callbacks.History at 0x7f3eb8b60a90>

In [69]:
preds = []

for x in range(len(X_test)):
    result = model.predict(X_test[x].reshape(1,X_test.shape[1]),batch_size=1,verbose = 2)[0]
    if np.argmax(result) == 0:
        preds.append([1, 0, 0])
    elif np.argmax(result) == 1:
        preds.append([0, 1, 0])
    elif np.argmax(result) == 2:
        preds.append([0, 0, 1])
        
preds = np.array(preds)

In [70]:
evaluate(preds, Y_test)

F1 score:    0.9267241247799751
Avg Recall:  0.7566652675875654
Accuracy:    0.918902562033488


## Embedding size : 128 RMSprop

In [71]:
embed_dim = 128
lstm_out = 196

model = Sequential()
model.add(Embedding(max_features, embed_dim, input_length = X_train.shape[1], dropout=0.2))
model.add(LSTM(lstm_out, dropout_U=0.2, dropout_W=0.2))
model.add(Dense(3,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='rmsprop',metrics = ['accuracy'])
print(model.summary())

  """
  


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 32, 128)           256000    
_________________________________________________________________
lstm_4 (LSTM)                (None, 196)               254800    
_________________________________________________________________
dense_11 (Dense)             (None, 3)                 591       
Total params: 511,391
Trainable params: 511,391
Non-trainable params: 0
_________________________________________________________________
None


In [72]:
batch_size = 32
model.fit(X_train, Y_train, epochs = 14, batch_size = batch_size, verbose = 2)

Epoch 1/14
 - 35s - loss: 0.3940 - acc: 0.8601
Epoch 2/14
 - 33s - loss: 0.2767 - acc: 0.9037
Epoch 3/14
 - 33s - loss: 0.2594 - acc: 0.9092
Epoch 4/14
 - 33s - loss: 0.2509 - acc: 0.9126
Epoch 5/14
 - 33s - loss: 0.2424 - acc: 0.9147
Epoch 6/14
 - 33s - loss: 0.2374 - acc: 0.9162
Epoch 7/14
 - 33s - loss: 0.2340 - acc: 0.9179
Epoch 8/14
 - 33s - loss: 0.2269 - acc: 0.9194
Epoch 9/14
 - 33s - loss: 0.2208 - acc: 0.9240
Epoch 10/14
 - 33s - loss: 0.2156 - acc: 0.9249
Epoch 11/14
 - 33s - loss: 0.2098 - acc: 0.9272
Epoch 12/14
 - 32s - loss: 0.2041 - acc: 0.9284
Epoch 13/14
 - 32s - loss: 0.1985 - acc: 0.9295
Epoch 14/14
 - 33s - loss: 0.1918 - acc: 0.9319


<keras.callbacks.History at 0x7f3eb47d3dd8>

In [73]:
preds = []

for x in range(len(X_test)):
    result = model.predict(X_test[x].reshape(1,X_test.shape[1]),batch_size=1,verbose = 2)[0]
    if np.argmax(result) == 0:
        preds.append([1, 0, 0])
    elif np.argmax(result) == 1:
        preds.append([0, 1, 0])
    elif np.argmax(result) == 2:
        preds.append([0, 0, 1])
        
preds = np.array(preds)

In [74]:
evaluate(preds, Y_test)

F1 score:    0.9228497582049308
Avg Recall:  0.756774319062377
Accuracy:    0.9180956223522292


## Embedding size : 256

In [75]:
embed_dim = 256
lstm_out = 196

model = Sequential()
model.add(Embedding(max_features, embed_dim, input_length = X_train.shape[1], dropout=0.2))
model.add(LSTM(lstm_out, dropout_U=0.2, dropout_W=0.2))
model.add(Dense(3,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

  """
  


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 32, 256)           512000    
_________________________________________________________________
lstm_5 (LSTM)                (None, 196)               355152    
_________________________________________________________________
dense_12 (Dense)             (None, 3)                 591       
Total params: 867,743
Trainable params: 867,743
Non-trainable params: 0
_________________________________________________________________
None


In [76]:
batch_size = 32
model.fit(X_train, Y_train, epochs = 14, batch_size = batch_size, verbose = 2)

Epoch 1/14
 - 36s - loss: 0.3708 - acc: 0.8693
Epoch 2/14
 - 32s - loss: 0.2463 - acc: 0.9135
Epoch 3/14
 - 33s - loss: 0.2203 - acc: 0.9209
Epoch 4/14
 - 32s - loss: 0.1997 - acc: 0.9270
Epoch 5/14
 - 32s - loss: 0.1765 - acc: 0.9352
Epoch 6/14
 - 33s - loss: 0.1524 - acc: 0.9448
Epoch 7/14
 - 32s - loss: 0.1293 - acc: 0.9522
Epoch 8/14
 - 32s - loss: 0.1145 - acc: 0.9579
Epoch 9/14
 - 33s - loss: 0.0956 - acc: 0.9636
Epoch 10/14
 - 32s - loss: 0.0798 - acc: 0.9708
Epoch 11/14
 - 33s - loss: 0.0679 - acc: 0.9757
Epoch 12/14
 - 33s - loss: 0.0563 - acc: 0.9787
Epoch 13/14
 - 33s - loss: 0.0480 - acc: 0.9830
Epoch 14/14
 - 33s - loss: 0.0425 - acc: 0.9842


<keras.callbacks.History at 0x7f3eb2c0fc50>

In [77]:
preds = []

for x in range(len(X_test)):
    result = model.predict(X_test[x].reshape(1,X_test.shape[1]),batch_size=1,verbose = 2)[0]
    if np.argmax(result) == 0:
        preds.append([1, 0, 0])
    elif np.argmax(result) == 1:
        preds.append([0, 1, 0])
    elif np.argmax(result) == 2:
        preds.append([0, 0, 1])
        
preds = np.array(preds)

In [78]:
evaluate(preds, Y_test)

F1 score:    0.9057149412855426
Avg Recall:  0.7166753079708252
Accuracy:    0.9027637684083115


## Embedding size : 256 RMSProp

In [79]:
embed_dim = 256
lstm_out = 196

model = Sequential()
model.add(Embedding(max_features, embed_dim, input_length = X_train.shape[1], dropout=0.2))
model.add(LSTM(lstm_out, dropout_U=0.2, dropout_W=0.2))
model.add(Dense(3,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='rmsprop',metrics = ['accuracy'])
print(model.summary())

  """
  


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 32, 256)           512000    
_________________________________________________________________
lstm_6 (LSTM)                (None, 196)               355152    
_________________________________________________________________
dense_13 (Dense)             (None, 3)                 591       
Total params: 867,743
Trainable params: 867,743
Non-trainable params: 0
_________________________________________________________________
None


In [80]:
batch_size = 32
model.fit(X_train, Y_train, epochs = 14, batch_size = batch_size, verbose = 2)

Epoch 1/14
 - 37s - loss: 0.3725 - acc: 0.8642
Epoch 2/14
 - 33s - loss: 0.2718 - acc: 0.9060
Epoch 3/14
 - 33s - loss: 0.2543 - acc: 0.9106
Epoch 4/14
 - 33s - loss: 0.2465 - acc: 0.9136
Epoch 5/14
 - 33s - loss: 0.2373 - acc: 0.9155
Epoch 6/14
 - 34s - loss: 0.2303 - acc: 0.9181
Epoch 7/14
 - 33s - loss: 0.2200 - acc: 0.9220
Epoch 8/14
 - 33s - loss: 0.2139 - acc: 0.9258
Epoch 9/14
 - 33s - loss: 0.2030 - acc: 0.9285
Epoch 10/14
 - 31s - loss: 0.1932 - acc: 0.9318
Epoch 11/14
 - 27s - loss: 0.1827 - acc: 0.9355
Epoch 12/14
 - 27s - loss: 0.1755 - acc: 0.9383
Epoch 13/14
 - 27s - loss: 0.1645 - acc: 0.9437
Epoch 14/14
 - 27s - loss: 0.1524 - acc: 0.9470


<keras.callbacks.History at 0x7f3eb29a5588>

In [81]:
preds = []

for x in range(len(X_test)):
    result = model.predict(X_test[x].reshape(1,X_test.shape[1]),batch_size=1,verbose = 2)[0]
    if np.argmax(result) == 0:
        preds.append([1, 0, 0])
    elif np.argmax(result) == 1:
        preds.append([0, 1, 0])
    elif np.argmax(result) == 2:
        preds.append([0, 0, 1])
        
preds = np.array(preds)

In [82]:
evaluate(preds, Y_test)

F1 score:    0.9054162418522184
Avg Recall:  0.708635810671585
Accuracy:    0.9035707080895703


In [83]:
batch_size = 32
model.fit(X_train, Y_train, epochs = 14, batch_size = batch_size, verbose = 2)

Epoch 1/14
 - 27s - loss: 0.1412 - acc: 0.9497
Epoch 2/14
 - 27s - loss: 0.1309 - acc: 0.9536
Epoch 3/14
 - 27s - loss: 0.1217 - acc: 0.9577
Epoch 4/14
 - 27s - loss: 0.1108 - acc: 0.9621
Epoch 5/14
 - 27s - loss: 0.1005 - acc: 0.9663
Epoch 6/14
 - 27s - loss: 0.0919 - acc: 0.9692
Epoch 7/14
 - 27s - loss: 0.0829 - acc: 0.9710
Epoch 8/14
 - 27s - loss: 0.0749 - acc: 0.9751
Epoch 9/14
 - 27s - loss: 0.0661 - acc: 0.9768
Epoch 10/14
 - 27s - loss: 0.0578 - acc: 0.9810
Epoch 11/14
 - 27s - loss: 0.0525 - acc: 0.9828
Epoch 12/14
 - 27s - loss: 0.0486 - acc: 0.9836
Epoch 13/14
 - 27s - loss: 0.0464 - acc: 0.9840
Epoch 14/14
 - 27s - loss: 0.0400 - acc: 0.9863


<keras.callbacks.History at 0x7f3eb29a5eb8>

In [84]:
preds = []

for x in range(len(X_test)):
    result = model.predict(X_test[x].reshape(1,X_test.shape[1]),batch_size=1,verbose = 2)[0]
    if np.argmax(result) == 0:
        preds.append([1, 0, 0])
    elif np.argmax(result) == 1:
        preds.append([0, 1, 0])
    elif np.argmax(result) == 2:
        preds.append([0, 0, 1])
        
preds = np.array(preds)

In [85]:
evaluate(preds, Y_test)

F1 score:    0.8987994048037962
Avg Recall:  0.7110798365542689
Accuracy:    0.8946943715957232
