In [1]:
import csv
import nltk
import re
import numpy as np
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, train_test_split
from scipy.sparse import hstack
from sklearn.metrics import make_scorer, accuracy_score, f1_score, precision_score, recall_score

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, GRU
from keras import losses

DATA_PATH="../data/"
DATA = DATA_PATH + "labeled_data.csv"

Using TensorFlow backend.


In [3]:
def readData(path):
    data = []
    with open(path,'r') as file:
        data = [x for x in csv.reader(file, delimiter=',')]
    return data

def getTweets(raw):
    data = [x[6] for x in raw]
    return np.array(data)

def getClass(raw):
    classes = [x[5] for x in raw]
    return np.array(classes)

def removePattern(tweet, pattern):
    r = re.findall(pattern, tweet)
    for x in r:
        tweet = re.sub(x, '', tweet)
    return tweet

def preprocess(data):
    cleanData = []
    for tweet in data:
        tweet = removePattern(tweet, "@[\w]*")
        tweet = tweet.replace("#", "") # Removing '#' from hashtags
        tweet = tweet.replace("[^a-zA-Z#]", " ") # Removing punctuation and special characters
        tweet = re.sub(r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&amp;+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+',"<URL>", tweet)
        tweet = re.sub(" +", " ", tweet)
        tweet = tweet.lower()
        cleanData.append(tweet)
    return cleanData

def tokenize(text):
    return TweetTokenizer.tokenize(text)

def evaluate(target, predicted):
    f1 = f1_score(target, predicted, average='weighted')
    acc = accuracy_score(target, predicted)
    rec = recall_score(target, predicted, average = 'macro')
    print("f1 score: ", f1)
    print("avg recall", rec)    
    print("accuracy", acc)

In [4]:
en_stopwords = set(stopwords.words("english")) 

raw = readData(DATA) 
tweets = getTweets(raw)
classes = getClass(raw)
tweets = preprocess(tweets)

In [5]:
one, two, zero = 0, 0, 0
for i in classes:
    if i == '1':
        one+=1
    elif i == '2':
        two+=1
    elif i == '0':
        zero+=1
print(zero, one, two)

1430 19190 4163


In [8]:
X = [x for x in tweets]
X = np.delete(np.array(X), [0])
y = np.delete(classes, [0])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
vectorizer.fit(X)
train_features = vectorizer.transform(X_train)
test_features = vectorizer.transform(X_test)

## Word Tokens with Simple CountVectorizer

In [7]:
vectorizer = CountVectorizer(
    analyzer = 'word',
    lowercase = True,
    ngram_range=(1, 1),
    stop_words = en_stopwords)

In [23]:
classifier = LogisticRegression(C=0.1, solver='sag')

In [24]:
classifier = SVC(C = 0.1)

In [25]:
classifier.fit(train_features, y_train)
y_predict = classifier.predict(test_features)
evaluate(y_test, y_predict)



f1 score:  0.6851256974895188
avg recall 0.3333333333333333
accuracy 0.7811176114585435


  'precision', 'predicted', average, warn_for)


## Char Tokens with Simple CountVectorizer

In [26]:
vectorizer = CountVectorizer(
    analyzer = 'char',
    tokenizer = tokenize,
    lowercase = True,
    ngram_range=(2, 6),
    stop_words = en_stopwords)

In [27]:
classifier = LogisticRegression(C=0.1, solver='sag')

In [28]:
classifier = SVC(C = 0.1)

In [29]:
classifier.fit(train_features, y_train)
y_predict = classifier.predict(test_features)
evaluate(y_test, y_predict)

f1 score:  0.6851256974895188
avg recall 0.3333333333333333
accuracy 0.7811176114585435


## Word Tokens with TF-IDF Vectorizer

In [30]:
vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 1))

In [31]:
classifier = LogisticRegression(C=0.1, solver='sag')

In [32]:
classifier = SVC(C = 0.1)

In [33]:
classifier.fit(train_features, y_train)
y_predict = classifier.predict(test_features)
evaluate(y_test, y_predict)

f1 score:  0.6851256974895188
avg recall 0.3333333333333333
accuracy 0.7811176114585435


## Char Tokens with TF-IDF Vectorizer

In [34]:
vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    stop_words='english',
    ngram_range=(2, 6),
    max_features=50000)

In [35]:
classifier = LogisticRegression(C=0.1, solver='sag')

In [36]:
classifier = SVC(C = 0.1)

In [37]:
classifier.fit(train_features, y_train)
y_predict = classifier.predict(test_features)
evaluate(y_test, y_predict)

f1 score:  0.6851256974895188
avg recall 0.3333333333333333
accuracy 0.7811176114585435


## Text To Sequence

In [9]:
allTweets = [x for x in tweets]
classes = [x for x in classes]

n = int(len(allTweets)*0.8)

trainTweets = allTweets[1:n]
testTweets = allTweets[n:]
trainClass = classes[1:n]
testClass = classes[n:]

max_features = 2000
tokenizer = Tokenizer(nb_words=max_features, split=' ')
tokenizer.fit_on_texts(trainTweets)

X = tokenizer.texts_to_sequences(trainTweets + testTweets)
X_train = X[:n-1]
X_test = X[n-1:]
X_train = pad_sequences(X_train, maxlen = 32)
X_test = pad_sequences(X_test, maxlen = 32)



In [10]:
Y_train = []
Y_test = []
for i in trainClass:
    if i == "0":
        Y_train.append([1, 0, 0])
    elif i == "1":
        Y_train.append([0, 1, 0])
    elif i == "2":
        Y_train.append([0, 0, 1])
Y_train = np.array(Y_train)

for i in testClass:
    if i == "0":
        Y_test.append([1, 0, 0])
    elif i == "1":
        Y_test.append([0, 1, 0])
    elif i == "2":
        Y_test.append([0, 0, 1])
Y_test = np.array(Y_test)

## GRU Embedding Size: 128

In [11]:
embed_dim = 128
lstm_out = 196

model = Sequential()
model.add(Embedding(max_features, embed_dim, input_length = X_train.shape[1], dropout=0.2))
model.add(GRU(lstm_out, activation='tanh', recurrent_activation='hard_sigmoid',  
              use_bias=True, kernel_initializer='glorot_uniform', recurrent_initializer='orthogonal', bias_initializer='zeros', dropout_U=0.2, dropout_W=0.2))
model.add(Dense(3,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

  """
  import sys


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 32, 128)           256000    
_________________________________________________________________
gru_1 (GRU)                  (None, 196)               191100    
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 591       
Total params: 447,691
Trainable params: 447,691
Non-trainable params: 0
_________________________________________________________________
None


## GRU Embedding Size : 128 RMSProp

In [15]:
embed_dim = 128
lstm_out = 196

model = Sequential()
model.add(Embedding(max_features, embed_dim, input_length = X_train.shape[1], dropout=0.2))
model.add(GRU(lstm_out, activation='tanh', recurrent_activation='hard_sigmoid',  
              use_bias=True, kernel_initializer='glorot_uniform', recurrent_initializer='orthogonal', bias_initializer='zeros', dropout_U=0.2, dropout_W=0.2))
model.add(Dense(3,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='rmsprop',metrics = ['accuracy'])
print(model.summary())

  """
  import sys


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 32, 128)           256000    
_________________________________________________________________
gru_2 (GRU)                  (None, 196)               191100    
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 591       
Total params: 447,691
Trainable params: 447,691
Non-trainable params: 0
_________________________________________________________________
None


## LSTM Embedding size : 128

In [41]:
embed_dim = 128
lstm_out = 196

model = Sequential()
model.add(Embedding(max_features, embed_dim, input_length = X_train.shape[1], dropout=0.2))
model.add(LSTM(lstm_out, dropout_U=0.2, dropout_W=0.2))
model.add(Dense(3,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

  """
  


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 32, 128)           256000    
_________________________________________________________________
lstm_3 (LSTM)                (None, 196)               254800    
_________________________________________________________________
dense_3 (Dense)              (None, 3)                 591       
Total params: 511,391
Trainable params: 511,391
Non-trainable params: 0
_________________________________________________________________
None


In [16]:
batch_size = 32
model.fit(X_train, Y_train, epochs = 14, batch_size = batch_size, verbose = 2)

Epoch 1/14
 - 30s - loss: 0.3706 - acc: 0.8655
Epoch 2/14
 - 29s - loss: 0.2748 - acc: 0.9045
Epoch 3/14
 - 29s - loss: 0.2597 - acc: 0.9096
Epoch 4/14
 - 29s - loss: 0.2486 - acc: 0.9126
Epoch 5/14
 - 29s - loss: 0.2418 - acc: 0.9160
Epoch 6/14
 - 29s - loss: 0.2346 - acc: 0.9186
Epoch 7/14
 - 33s - loss: 0.2277 - acc: 0.9217
Epoch 8/14
 - 33s - loss: 0.2240 - acc: 0.9224
Epoch 9/14
 - 32s - loss: 0.2153 - acc: 0.9256
Epoch 10/14
 - 32s - loss: 0.2082 - acc: 0.9281
Epoch 11/14
 - 29s - loss: 0.1999 - acc: 0.9311
Epoch 12/14
 - 31s - loss: 0.1947 - acc: 0.9325
Epoch 13/14
 - 39s - loss: 0.1899 - acc: 0.9347
Epoch 14/14
 - 32s - loss: 0.1837 - acc: 0.9380


<keras.callbacks.History at 0x7faeed598f98>

In [17]:
preds = []

for x in range(len(X_test)):
    result = model.predict(X_test[x].reshape(1,X_test.shape[1]),batch_size=1,verbose = 2)[0]
    if np.argmax(result) == 0:
        preds.append([1, 0, 0])
    elif np.argmax(result) == 1:
        preds.append([0, 1, 0])
    elif np.argmax(result) == 2:
        preds.append([0, 0, 1])
        
preds = np.array(preds)

In [18]:
evaluate(preds, Y_test)

f1 score:  0.9159048268356916
avg recall 0.7493826692062546
accuracy 0.9098244906193262


## Embedding size : 128 RMSprop

In [45]:
embed_dim = 128
lstm_out = 196

model = Sequential()
model.add(Embedding(max_features, embed_dim, input_length = X_train.shape[1], dropout=0.2))
model.add(LSTM(lstm_out, dropout_U=0.2, dropout_W=0.2))
model.add(Dense(3,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='rmsprop',metrics = ['accuracy'])
print(model.summary())

  """
  


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 32, 128)           256000    
_________________________________________________________________
lstm_4 (LSTM)                (None, 196)               254800    
_________________________________________________________________
dense_4 (Dense)              (None, 3)                 591       
Total params: 511,391
Trainable params: 511,391
Non-trainable params: 0
_________________________________________________________________
None


In [46]:
batch_size = 32
model.fit(X_train, Y_train, epochs = 14, batch_size = batch_size, verbose = 2)

Epoch 1/14
 - 27s - loss: 0.3935 - acc: 0.8588
Epoch 2/14
 - 26s - loss: 0.2758 - acc: 0.9043
Epoch 3/14
 - 26s - loss: 0.2581 - acc: 0.9084
Epoch 4/14
 - 25s - loss: 0.2492 - acc: 0.9127
Epoch 5/14
 - 26s - loss: 0.2463 - acc: 0.9142
Epoch 6/14
 - 25s - loss: 0.2385 - acc: 0.9168
Epoch 7/14
 - 25s - loss: 0.2308 - acc: 0.9183
Epoch 8/14
 - 26s - loss: 0.2258 - acc: 0.9208
Epoch 9/14
 - 26s - loss: 0.2210 - acc: 0.9219
Epoch 10/14
 - 26s - loss: 0.2158 - acc: 0.9241
Epoch 11/14
 - 26s - loss: 0.2091 - acc: 0.9251
Epoch 12/14
 - 26s - loss: 0.2026 - acc: 0.9281
Epoch 13/14
 - 26s - loss: 0.1988 - acc: 0.9316
Epoch 14/14
 - 26s - loss: 0.1895 - acc: 0.9348


<keras.callbacks.History at 0x7f29b05fe358>

In [47]:
preds = []

for x in range(len(X_test)):
    result = model.predict(X_test[x].reshape(1,X_test.shape[1]),batch_size=1,verbose = 2)[0]
    if np.argmax(result) == 0:
        preds.append([1, 0, 0])
    elif np.argmax(result) == 1:
        preds.append([0, 1, 0])
    elif np.argmax(result) == 2:
        preds.append([0, 0, 1])
        
preds = np.array(preds)

In [48]:
evaluate(preds, Y_test)

f1 score:  0.9266472489443812
avg recall 0.787312116442089
accuracy 0.915473068388138


## Embedding size : 256

In [49]:
embed_dim = 256
lstm_out = 196

model = Sequential()
model.add(Embedding(max_features, embed_dim, input_length = X_train.shape[1], dropout=0.2))
model.add(LSTM(lstm_out, dropout_U=0.2, dropout_W=0.2))
model.add(Dense(3,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

  """
  


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 32, 256)           512000    
_________________________________________________________________
lstm_5 (LSTM)                (None, 196)               355152    
_________________________________________________________________
dense_5 (Dense)              (None, 3)                 591       
Total params: 867,743
Trainable params: 867,743
Non-trainable params: 0
_________________________________________________________________
None


In [50]:
batch_size = 32
model.fit(X_train, Y_train, epochs = 14, batch_size = batch_size, verbose = 2)

Epoch 1/14
 - 27s - loss: 0.3690 - acc: 0.8684
Epoch 2/14
 - 26s - loss: 0.2465 - acc: 0.9121
Epoch 3/14
 - 26s - loss: 0.2199 - acc: 0.9198
Epoch 4/14
 - 26s - loss: 0.1978 - acc: 0.9287
Epoch 5/14
 - 26s - loss: 0.1735 - acc: 0.9375
Epoch 6/14
 - 26s - loss: 0.1510 - acc: 0.9439
Epoch 7/14
 - 26s - loss: 0.1290 - acc: 0.9520
Epoch 8/14
 - 26s - loss: 0.1113 - acc: 0.9582
Epoch 9/14
 - 26s - loss: 0.0915 - acc: 0.9665
Epoch 10/14
 - 26s - loss: 0.0780 - acc: 0.9722
Epoch 11/14
 - 26s - loss: 0.0684 - acc: 0.9751
Epoch 12/14
 - 26s - loss: 0.0579 - acc: 0.9794
Epoch 13/14
 - 25s - loss: 0.0518 - acc: 0.9811
Epoch 14/14
 - 26s - loss: 0.0429 - acc: 0.9845


<keras.callbacks.History at 0x7f297854d710>

In [51]:
preds = []

for x in range(len(X_test)):
    result = model.predict(X_test[x].reshape(1,X_test.shape[1]),batch_size=1,verbose = 2)[0]
    if np.argmax(result) == 0:
        preds.append([1, 0, 0])
    elif np.argmax(result) == 1:
        preds.append([0, 1, 0])
    elif np.argmax(result) == 2:
        preds.append([0, 0, 1])
        
preds = np.array(preds)

In [52]:
evaluate(preds, Y_test)

f1 score:  0.8968267868127393
avg recall 0.7019873385287991
accuracy 0.8948961065160379


In [53]:
batch_size = 32
model.fit(X_train, Y_train, epochs = 14, batch_size = batch_size, verbose = 2)

Epoch 1/14
 - 26s - loss: 0.0372 - acc: 0.9872
Epoch 2/14
 - 26s - loss: 0.0335 - acc: 0.9886
Epoch 3/14
 - 26s - loss: 0.0287 - acc: 0.9908
Epoch 4/14
 - 26s - loss: 0.0284 - acc: 0.9898
Epoch 5/14
 - 26s - loss: 0.0270 - acc: 0.9903
Epoch 6/14
 - 26s - loss: 0.0267 - acc: 0.9909
Epoch 7/14
 - 26s - loss: 0.0199 - acc: 0.9926
Epoch 8/14
 - 26s - loss: 0.0203 - acc: 0.9928
Epoch 9/14
 - 26s - loss: 0.0178 - acc: 0.9938
Epoch 10/14
 - 25s - loss: 0.0218 - acc: 0.9918
Epoch 11/14
 - 26s - loss: 0.0190 - acc: 0.9932
Epoch 12/14
 - 26s - loss: 0.0150 - acc: 0.9948
Epoch 13/14
 - 26s - loss: 0.0142 - acc: 0.9952
Epoch 14/14
 - 26s - loss: 0.0156 - acc: 0.9949


<keras.callbacks.History at 0x7f293432c400>

In [54]:
preds = []

for x in range(len(X_test)):
    result = model.predict(X_test[x].reshape(1,X_test.shape[1]),batch_size=1,verbose = 2)[0]
    if np.argmax(result) == 0:
        preds.append([1, 0, 0])
    elif np.argmax(result) == 1:
        preds.append([0, 1, 0])
    elif np.argmax(result) == 2:
        preds.append([0, 0, 1])
        
preds = np.array(preds)

In [55]:
evaluate(preds, Y_test)

f1 score:  0.8798922681557714
avg recall 0.6787232414703951
accuracy 0.8823885414565261
