In [2]:
import csv
import nltk
import re
import numpy as np
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, train_test_split
from scipy.sparse import hstack
from sklearn.metrics import make_scorer, accuracy_score, f1_score, precision_score, recall_score

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
from keras import losses

DATA_PATH="../data/"
DATA = DATA_PATH + "labeled_data.csv"

Using TensorFlow backend.


In [3]:
def readData(path):
    data = []
    with open(path,'r') as file:
        data = [x for x in csv.reader(file, delimiter=',')]
    return data

def getTweets(raw):
    data = [x[6] for x in raw]
    return np.array(data)

def getClass(raw):
    classes = [x[5] for x in raw]
    return np.array(classes)

def removePattern(tweet, pattern):
    r = re.findall(pattern, tweet)
    for x in r:
        tweet = re.sub(x, '', tweet)
    return tweet

def preprocess(data):
    cleanData = []
    for tweet in data:
        tweet = removePattern(tweet, "@[\w]*")
        tweet = tweet.replace("#", "") # Removing '#' from hashtags
        tweet = tweet.replace("[^a-zA-Z#]", " ") # Removing punctuation and special characters
        tweet = re.sub(r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&amp;+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+',"<URL>", tweet)
        tweet = re.sub(" +", " ", tweet)
        tweet = tweet.lower()
        cleanData.append(tweet)
    return cleanData

def tokenize(text):
    return TweetTokenizer.tokenize(text)

def evaluate(target, predicted):
    f1 = f1_score(target, predicted, average='weighted')
    acc = accuracy_score(target, predicted)
    rec = recall_score(target, predicted, average = 'macro')
    print("f1 score: ", f1)
    print("avg recall", rec)    
    print("accuracy", acc)

In [4]:
en_stopwords = set(stopwords.words("english")) 

raw = readData(DATA) 
tweets = getTweets(raw)
classes = getClass(raw)
tweets = preprocess(tweets)



In [10]:
one, two, zero = 0, 0, 0
for i in classes:
    if i == '1':
        one+=1
    elif i == '2':
        two+=1
    elif i == '0':
        zero+=1
print(zero, one, two)

1430 19190 4163


## Word Tokens with Simple CountVectorizer

In [11]:
vectorizer = CountVectorizer(
    analyzer = 'word',
    lowercase = True,
    ngram_range=(1, 1),
    stop_words = en_stopwords)

In [12]:
classifier = LogisticRegression(C=0.1, solver='sag')

In [6]:
classifier = SVC(C = 0.1)

In [13]:
classifier.fit(train_features, y_train)
y_predict = classifier.predict(test_features)
evaluate(y_test, y_predict)

f1 score:  0.8859701610631988
avg recall 0.666017123630816
accuracy 0.8987290700020173


## Char Tokens with Simple CountVectorizer

In [14]:
vectorizer = CountVectorizer(
    analyzer = 'char',
    tokenizer = tokenize,
    lowercase = True,
    ngram_range=(2, 6),
    stop_words = en_stopwords)

In [15]:
classifier = LogisticRegression(C=0.1, solver='sag')

In [6]:
classifier = SVC(C = 0.1)

In [16]:
classifier.fit(train_features, y_train)
y_predict = classifier.predict(test_features)
evaluate(y_test, y_predict)

f1 score:  0.8855780280279991
avg recall 0.6655293753993855
accuracy 0.898325600161388


## Word Tokens with TF-IDF Vectorizer

In [17]:
vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 1))

In [18]:
classifier = LogisticRegression(C=0.1, solver='sag')

In [6]:
classifier = SVC(C = 0.1)

In [19]:
classifier.fit(train_features, y_train)
y_predict = classifier.predict(test_features)
evaluate(y_test, y_predict)

f1 score:  0.886127565034215
avg recall 0.6668150412533967
accuracy 0.8987290700020173


## Char Tokens with TF-IDF Vectorizer

In [20]:
vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    stop_words='english',
    ngram_range=(2, 6),
    max_features=50000)

In [21]:
classifier = LogisticRegression(C=0.1, solver='sag')

In [6]:
classifier = SVC(C = 0.1)

In [22]:
classifier.fit(train_features, y_train)
y_predict = classifier.predict(test_features)
evaluate(y_test, y_predict)

f1 score:  0.8855780280279991
avg recall 0.6655293753993855
accuracy 0.898325600161388


## LSTM

In [23]:
X = [x for x in tweets]
X = np.delete(np.array(X), [0])
y = np.delete(classes, [0])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
vectorizer.fit(X)
train_features = vectorizer.transform(X_train)
test_features = vectorizer.transform(X_test)

In [24]:
allTweets = [x for x in tweets]
classes = [x for x in classes]

n = int(len(allTweets)*0.8)

trainTweets = allTweets[1:n]
testTweets = allTweets[n:]
trainClass = classes[1:n]
testClass = classes[n:]

max_features = 2000
tokenizer = Tokenizer(nb_words=max_features, split=' ')
tokenizer.fit_on_texts(trainTweets)

X = tokenizer.texts_to_sequences(trainTweets + testTweets)
X_train = X[:n-1]
X_test = X[n-1:]
X_train = pad_sequences(X_train, maxlen = 32)
X_test = pad_sequences(X_test, maxlen = 32)



## Embedding size : 128

In [25]:
embed_dim = 128
lstm_out = 196

model = Sequential()
model.add(Embedding(max_features, embed_dim, input_length = X_train.shape[1], dropout=0.2))
model.add(LSTM(lstm_out, dropout_U=0.2, dropout_W=0.2))
model.add(Dense(3,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

Instructions for updating:
Colocations handled automatically by placer.


  """
  


Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 32, 128)           256000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 196)               254800    
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 591       
Total params: 511,391
Trainable params: 511,391
Non-trainable params: 0
_________________________________________________________________
None


## Embedding size : 256

In [30]:
embed_dim = 256
lstm_out = 196

model = Sequential()
model.add(Embedding(max_features, embed_dim, input_length = X_train.shape[1], dropout=0.2))
model.add(LSTM(lstm_out, dropout_U=0.2, dropout_W=0.2))
model.add(Dense(3,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

  """
  


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 32, 256)           512000    
_________________________________________________________________
lstm_2 (LSTM)                (None, 196)               355152    
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 591       
Total params: 867,743
Trainable params: 867,743
Non-trainable params: 0
_________________________________________________________________
None


In [31]:
Y_train = []
Y_test = []
for i in trainClass:
    if i == "0":
        Y_train.append([1, 0, 0])
    elif i == "1":
        Y_train.append([0, 1, 0])
    elif i == "2":
        Y_train.append([0, 0, 1])
Y_train = np.array(Y_train)

for i in testClass:
    if i == "0":
        Y_test.append([1, 0, 0])
    elif i == "1":
        Y_test.append([0, 1, 0])
    elif i == "2":
        Y_test.append([0, 0, 1])
Y_test = np.array(Y_test)

In [32]:
batch_size = 32
model.fit(X_train, Y_train, nb_epoch = 7, batch_size = batch_size, verbose = 2)

  


Epoch 1/7
 - 42s - loss: 0.3689 - acc: 0.8706
Epoch 2/7
 - 46s - loss: 0.2474 - acc: 0.9132
Epoch 3/7
 - 47s - loss: 0.2209 - acc: 0.9214
Epoch 4/7
 - 44s - loss: 0.1985 - acc: 0.9276
Epoch 5/7
 - 47s - loss: 0.1749 - acc: 0.9369
Epoch 6/7
 - 47s - loss: 0.1519 - acc: 0.9429
Epoch 7/7
 - 47s - loss: 0.1323 - acc: 0.9515


<keras.callbacks.History at 0x7f9f68212eb8>

In [33]:
preds = []

for x in range(len(X_test)):
    result = model.predict(X_test[x].reshape(1,X_test.shape[1]),batch_size=1,verbose = 2)[0]
    if np.argmax(result) == 0:
        preds.append([1, 0, 0])
    elif np.argmax(result) == 1:
        preds.append([0, 1, 0])
    elif np.argmax(result) == 2:
        preds.append([0, 0, 1])
        
preds = np.array(preds)

In [34]:
evaluate(preds, Y_test)

f1 score:  0.9076079746029507
avg recall 0.7187095470339532
accuracy 0.9063949969739762
