In [74]:
import csv
import re
import codecs

import numpy as np
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score, f1_score, precision_score, recall_score
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
from keras import losses

In [49]:
def readData(path):
    data = []
    with open(path) as file:
        data = file.read()
#         data = codecs.decode(data, 'unicode_escape')
        data = data.split('\n')[:-1]
    return data

def removePattern(tweet, pattern):
    r = re.findall(pattern, tweet)
    for i in r:
        tweet = re.sub(i, '', tweet)
    return tweet

def preprocess(data):
    cleanData = []
    for line in data:
        tId, tAt, tSent, tweet = line.split("\t")[:4] # Splitting by tabspace
        tweet = removePattern(tweet, "@[\w]*") # Removing @user tags
        tweet = tweet.replace("#", "") # Removing '#' from hashtags
        tweet = tweet.replace("[^a-zA-Z#]", " ") # Removing punctuation and special characters
        tweet = re.sub(r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&amp;+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+',"<URL>", tweet)
        tweet = re.sub(" +", " ", tweet)
        tweet = tweet.lower()
        cleanData.append([tId, tAt, tSent, tweet])
    return cleanData

def tokenize(tweet):
    return TweetTokenizer().tokenize(tweet)

In [50]:
def evaluate(target, predicted):
    f1 = f1_score(target, predicted, average='weighted')
    acc = accuracy_score(target, predicted)
    rec = recall_score(target, predicted, average = 'macro')
    print("F1 score : ", f1)
    print("Avg Recall:", rec)    
    print("Accuracy:  ", acc)    

# 5 Class with SVM

In [39]:
TRAIN_DATA = "../data/Subtasks_CE/twitter-2016train-CE.txt"
TEST_DATA = "../data/Subtasks_CE/twitter-2016test-CE.txt"
DEV_DATA = "../data/Subtasks_CE/twitter-2016dev-CE.txt"
DEVTEST_DATA = "../data/Subtasks_CE/twitter-2016devtest-CE.txt"

In [40]:
en_stopwords = set(stopwords.words("english")) 
vectorizer = CountVectorizer(
    analyzer = 'word',
    tokenizer = tokenize,
    lowercase = True,
    ngram_range=(1, 1),
    stop_words = en_stopwords)

In [41]:
trainData   = readData(TRAIN_DATA)
testData    = readData(TEST_DATA)
devData     = readData(DEV_DATA)
devTestData = readData(DEVTEST_DATA)

In [42]:
trainData   = preprocess(trainData)
testData    = preprocess(testData)
devData     = preprocess(devData)
devTestData = preprocess(devTestData)

In [43]:
trainTweets = [x[3] for x in trainData]
X_train = np.array(trainTweets)
trainSents = [x[2] for x in trainData]
y_train = np.array(trainSents)

testTweets = [x[3] for x in testData]
X_test = np.array(testTweets)
y_test = [x[2] for x in testData]

In [44]:
X = np.append(X_train, X_test)
X = vectorizer.fit_transform(X)
n = X_train.shape[0]
X_train = X[:n]
X_test = X[n:]

In [45]:
params = {'kernel':['linear', 'rbf'],'C':[0.001, 0.01, 0.03, 0.05, 0.07 ,0.1,1, 10],'gamma':[0.000000001,0.00001, 0.001,0.1]}
clf = GridSearchCV(SVC(), params, cv =2, n_jobs = -1)
clf.fit(X_train, y_train)
print(clf.best_params_)
print(clf.score(X_test, y_test))

{'C': 0.03, 'gamma': 1e-09, 'kernel': 'linear'}
0.40141527723924003


In [46]:
evaluate(y_test, clf.predict(X_test))

F1 score :  0.26599774601583265
Avg Recall: 0.20832751412550957
Accuracy:   0.40141527723924003


  'precision', 'predicted', average, warn_for)


# 5 Class with LSTM

In [64]:
TRAIN_DATA = "../data/Subtasks_CE/twitter-2016train-CE.txt"
TEST_DATA = "../data/Subtasks_CE/twitter-2016test-CE.txt"
DEV_DATA = "../data/Subtasks_CE/twitter-2016dev-CE.txt"
DEVTEST_DATA = "../data/Subtasks_CE/twitter-2016devtest-CE.txt"

In [65]:
trainData   = readData(TRAIN_DATA)
testData    = readData(TEST_DATA)
devData     = readData(DEV_DATA)
devTestData = readData(DEVTEST_DATA)

In [66]:
trainData   = readData(TRAIN_DATA)
testData    = readData(TEST_DATA)
devData     = readData(DEV_DATA)
devTestData = readData(DEVTEST_DATA)

In [67]:
trainTweets = [x[3] for x in trainData]
X_train = np.array(trainTweets)
trainSents = [x[2] for x in trainData]
y_train = np.array(trainSents)

testTweets = [x[3] for x in testData]
X_test = np.array(testTweets)
testSents = [x[2] for x in testData]
y_test = np.array(testSents)

In [68]:
max_features = 2000
tokenizer = Tokenizer(nb_words=max_features, split=' ')
tokenizer.fit_on_texts(trainTweets)
n = len(X_train)
X = tokenizer.texts_to_sequences(trainTweets + testTweets)
X_train = X[:n]
X_test = X[n:]
X_train = pad_sequences(X_train, maxlen=32)
X_test = pad_sequences(X_test, maxlen=32)



In [86]:
embed_dim = 128
lstm_out = 196

model = Sequential()
model.add(Embedding(max_features, embed_dim,input_length = X_train.shape[1], dropout=0.2))
model.add(LSTM(lstm_out, dropout_U=0.2, dropout_W=0.2))
model.add(Dense(5,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

  """
  


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 32, 128)           256000    
_________________________________________________________________
lstm_7 (LSTM)                (None, 196)               254800    
_________________________________________________________________
dense_7 (Dense)              (None, 5)                 985       
Total params: 511,785
Trainable params: 511,785
Non-trainable params: 0
_________________________________________________________________
None


In [87]:
Y_train = []
Y_test = []
for i in trainSents:
    if i == "-2":
        Y_train.append([1, 0, 0, 0, 0])
    elif i == "-1":
        Y_train.append([0, 1, 0, 0, 0])
    elif i == "0":
        Y_train.append([0, 0, 1, 0, 0])
    elif i == "1":
        Y_train.append([0, 0, 0, 1, 0])
    elif i == "2":
        Y_train.append([0, 0, 0, 0, 1])
    else:
        print(i)
        
Y_train = np.array(Y_train)

for i in testSents:
    if i == "-2":
        Y_test.append([1, 0, 0, 0, 0])
    elif i == "-1":
        Y_test.append([0, 1, 0, 0, 0])
    elif i == "0":
        Y_test.append([0, 0, 1, 0, 0])
    elif i == "1":
        Y_test.append([0, 0, 0, 1, 0])
    elif i == "2":
        Y_test.append([0, 0, 0, 0, 1])
    else:
        print(i)
Y_test = np.array(Y_test)


In [88]:
batch_size = 32
model.fit(X_train, Y_train, nb_epoch = 7, batch_size=batch_size, verbose = 2)

  


Epoch 1/7
 - 18s - loss: 1.1780 - acc: 0.5322
Epoch 2/7
 - 14s - loss: 0.9784 - acc: 0.6028
Epoch 3/7
 - 12s - loss: 0.8640 - acc: 0.6462
Epoch 4/7
 - 14s - loss: 0.7751 - acc: 0.6820
Epoch 5/7
 - 17s - loss: 0.7055 - acc: 0.7160
Epoch 6/7
 - 16s - loss: 0.6263 - acc: 0.7505
Epoch 7/7
 - 20s - loss: 0.5730 - acc: 0.7793


<keras.callbacks.History at 0x7f171a2ebcf8>

In [89]:
preds = []

for x in range(len(X_test)):
    result = model.predict(X_test[x].reshape(1,X_test.shape[1]),batch_size=1,verbose = 2)[0]
    if np.argmax(result) == 0:
        preds.append([1, 0, 0, 0, 0])
    elif np.argmax(result) == 1:
        preds.append([0, 1, 0, 0, 0])
    elif np.argmax(result) == 2:
        preds.append([0, 0, 1, 0, 0])
    elif np.argmax(result) == 3:
        preds.append([0, 0, 0, 1, 0])
    elif np.argmax(result) == 4:
        preds.append([0, 0, 0, 0, 1])
        
preds = np.array(preds)

In [90]:
evaluate(preds, Y_test)


F1 score :  0.4605503031686896
Avg Recall: 0.2757862935409193
Accuracy:   0.4390752229546336


# 2 class LSTM

In [92]:
TRAIN_DATA = "../data/Subtasks_BD/twitter-2016train-BD.txt"
TEST_DATA = "../data/Subtasks_BD/twitter-2016test-BD.txt"
DEV_DATA = "../data/Subtasks_BD/twitter-2016dev-BD.txt"
DEVTEST_DATA = "../data/Subtasks_BD/twitter-2016devtest-BD.txt"

In [93]:
trainData   = readData(TRAIN_DATA)
testData    = readData(TEST_DATA)
devData     = readData(DEV_DATA)
devTestData = readData(DEVTEST_DATA)

In [94]:
trainData   = preprocess(trainData)
testData    = preprocess(testData)
devData     = preprocess(devData)
devTestData = preprocess(devTestData)

In [95]:
trainTweets = [x[3] for x in trainData]
X_train = np.array(trainTweets)
trainSents = [x[2] for x in trainData]
y_train = np.array(trainSents)

testTweets = [x[3] for x in testData]
X_test = np.array(testTweets)
testSents = [x[2] for x in testData]
y_test = np.array(testSents)

In [96]:
max_features = 2000
tokenizer = Tokenizer(nb_words=max_features, split=' ')
tokenizer.fit_on_texts(trainTweets)
n = len(X_train)
X = tokenizer.texts_to_sequences(trainTweets + testTweets)
X_train = X[:n]
X_test = X[n:]
X_train = pad_sequences(X_train, maxlen=32)
X_test = pad_sequences(X_test, maxlen=32)



In [97]:
embed_dim = 128
lstm_out = 196

model = Sequential()
model.add(Embedding(max_features, embed_dim,input_length = X_train.shape[1], dropout=0.2))
model.add(LSTM(lstm_out, dropout_U=0.2, dropout_W=0.2))
model.add(Dense(2,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

  """
  


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 32, 128)           256000    
_________________________________________________________________
lstm_8 (LSTM)                (None, 196)               254800    
_________________________________________________________________
dense_8 (Dense)              (None, 2)                 394       
Total params: 511,194
Trainable params: 511,194
Non-trainable params: 0
_________________________________________________________________
None


In [98]:
Y_train = []
Y_test = []
for i in trainSents:
    if i == "positive":
        Y_train.append([1, 0])
    else:
        Y_train.append([0, 1])
Y_train = np.array(Y_train)

for i in testSents:
    if i == "positive":
        Y_test.append([1, 0])
    else:
        Y_test.append([0, 1])
Y_test = np.array(Y_test)


In [99]:
batch_size = 32
model.fit(X_train, Y_train, nb_epoch = 7, batch_size=batch_size, verbose = 2)

  


Epoch 1/7
 - 11s - loss: 0.4134 - acc: 0.8373
Epoch 2/7
 - 9s - loss: 0.2542 - acc: 0.8974
Epoch 3/7
 - 9s - loss: 0.1896 - acc: 0.9312
Epoch 4/7
 - 9s - loss: 0.1469 - acc: 0.9448
Epoch 5/7
 - 10s - loss: 0.1131 - acc: 0.9609
Epoch 6/7
 - 7s - loss: 0.0787 - acc: 0.9740
Epoch 7/7
 - 7s - loss: 0.0670 - acc: 0.9765


<keras.callbacks.History at 0x7f175d905860>

In [100]:
preds = []
for x in range(len(X_test)):
    result = model.predict(X_test[x].reshape(1,X_test.shape[1]),batch_size=1,verbose = 2)[0]
    if np.argmax(result) == 0:
        preds.append([1, 0])
    else:
        preds.append([0, 1])
preds = np.array(preds)

In [101]:
evaluate(preds, Y_test)

F1 score :  0.7536952763479294
Avg Recall: 0.6261703164164272
Accuracy:   0.7483650838783054
