In [1]:
import os
import sys
# Make the parent directory importable so that project-local packages
# living one level above the notebook's working directory can be imported.
module_path = os.path.abspath(os.pardir)
already_on_path = module_path in sys.path
if not already_on_path:
    sys.path.append(module_path)

In [2]:
import pandas as pd
import numpy as np

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.optimizers import SGD

from keras.models import Sequential
from keras.layers import Input ,Embedding, Bidirectional, LSTM, Dense, Dropout, BatchNormalization
from common.attention import AttentionWithContext

Using TensorFlow backend.


In [3]:
def parse_dataset(fp, encoding='utf-8'):
    '''
    Loads the dataset .txt file with label-tweet on each line and parses the dataset.

    Each data line is expected to hold at least three tab-separated fields;
    the second field is the integer label and the third is the tweet text.
    Lines starting with "tweet index" (case-insensitive) are treated as a
    header and skipped, as are blank / whitespace-only lines.

    :param fp: filepath of dataset
    :param encoding: text encoding used to read the file. Defaults to UTF-8 so
        the result does not depend on the platform's locale encoding.
    :return:
        corpus: list of tweet strings of each tweet.
        y: list of labels
    :raises IndexError: if a non-blank data line has fewer than three
        tab-separated fields.
    :raises ValueError: if the label field is not an integer.
    '''
    y = []
    corpus = []
    with open(fp, 'rt', encoding=encoding) as data_in:
        for line in data_in:
            line = line.rstrip()  # remove trailing whitespace (including the newline)
            if not line:
                # A blank line (e.g. a trailing newline at end of file) carries no sample.
                continue
            if line.lower().startswith("tweet index"):
                # Discard header line(s) containing column metadata.
                continue
            fields = line.split("\t")  # split once and reuse for both columns
            label = int(fields[1])
            tweet = fields[2]
            y.append(label)
            corpus.append(tweet)

    return corpus, y

In [4]:
# Training split of the dataset, resolved relative to the current working directory.
train_path = 'datasets/train/SemEval2018-T3-train-taskA.txt'
tweets, labels = parse_dataset(train_path)

In [5]:
# Tokenizer vocabulary cap and the fixed sequence length used later for padding.
num_words, maxlen = 10000, 32

# Fit the word index on the raw tweet strings.
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(tweets)

In [6]:
# Size of the tokenizer's word index after fitting on the tweets.
len(tokenizer.word_index)

12923

In [7]:
from common.GloveEmbeddings import GloveEmbeddings
# Location of the pre-trained GloVe Twitter vectors. Set the GLOVE_PATH
# environment variable to point at a local copy; when it is unset, the
# original hard-coded location is used so existing setups keep working.
glove_path = os.environ.get(
    'GLOVE_PATH',
    '/media/radoslav/ce763dbf-b2a6-4110-960f-2ef10c8c6bde/MachineLearning/glove.twitter.27B/glove.twitter.27B.200d.txt')

# 200 is passed alongside the path (presumably the vector dimensionality,
# matching the "200d" file -- confirm against GloveEmbeddings).
embeddings = GloveEmbeddings(glove_path, 200).load().get_embedding_matrix_for_tokenizer(tokenizer)

Found 1193514 word vectors.


In [8]:
# Encode every tweet as a list of word indices, then pad/truncate to `maxlen`.
sequences = tokenizer.texts_to_sequences(tweets)
processed = pad_sequences(sequences, maxlen=maxlen)

In [25]:
# Sanity check: shape of the padded sequence matrix.
processed.shape

(3834, 32)

In [26]:
def get_model(embedding_matrix=None, lstm_units=10, lstm_dropout=0.3,
              dropout=0.3, dense_units=32):
    '''
    Builds and compiles the binary tweet classifier.

    Layer stack: frozen pre-trained embedding -> bidirectional LSTM returning
    the full sequence -> AttentionWithContext -> dropout -> dense ReLU layer
    -> batch normalization -> single sigmoid unit. Calling the function with
    no arguments builds exactly the original configuration.

    :param embedding_matrix: 2-D array of pre-trained vectors used to
        initialise the (non-trainable) embedding layer; its shape gives the
        layer's input and output dimensions. Defaults to the module-level
        `embeddings`.
    :param lstm_units: number of units in each direction of the LSTM.
    :param lstm_dropout: `dropout` argument passed to the LSTM layer.
    :param dropout: rate of the Dropout layer placed after the attention layer.
    :param dense_units: width of the hidden dense layer.
    :return: compiled keras Sequential model (adam optimizer,
        binary cross-entropy loss, accuracy metric).
    '''
    if embedding_matrix is None:
        # Fall back to the notebook-level matrix so get_model() keeps working unchanged.
        embedding_matrix = embeddings

    model = Sequential()

    # Embedding weights are loaded from the pre-trained matrix and kept frozen.
    model.add(Embedding(embedding_matrix.shape[0], embedding_matrix.shape[1],
                        weights=[embedding_matrix], trainable=False))
    # return_sequences=True keeps one output per timestep for the attention layer.
    model.add(Bidirectional(LSTM(lstm_units, dropout=lstm_dropout, return_sequences=True)))
    model.add(AttentionWithContext())
    model.add(Dropout(dropout))
    model.add(Dense(dense_units, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dense(units=1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    return model


In [27]:
# Build the model and print its layer-by-layer summary.
model = get_model()
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 200)         2584800   
_________________________________________________________________
bidirectional_2 (Bidirection (None, None, 20)          16880     
_________________________________________________________________
attention_with_context_2 (At (None, 20)                440       
_________________________________________________________________
dropout_2 (Dropout)          (None, 20)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 32)                672       
_________________________________________________________________
batch_normalization_2 (Batch (None, 32)                128       
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 33        
Total para

In [28]:
from sklearn.model_selection import train_test_split

In [29]:
# `labels` is a plain Python list, and train_test_split returns splits of the
# same container type it was given. Converting to a NumPy array first makes
# y_train / y_test arrays, so the Keras targets do not rely on implicit
# list-to-array coercion. The split itself (random_state, default test size)
# is unchanged.
X_train, X_test, y_train, y_test = train_test_split(
    processed, np.asarray(labels), random_state=123)

In [30]:
# Show the train and test feature-matrix shapes, one per line.
print(X_train.shape, X_test.shape, sep='\n')

(2875, 32)
(959, 32)


In [31]:
from keras.callbacks import EarlyStopping
# Stop training once the monitored validation metric has not improved for 5 epochs.
earlyStopping = EarlyStopping(patience=5)

# NOTE(review): the validation data here is (X_test, y_test), the same split
# that the later cells use for the final classification report, so the
# reported scores are not from data unseen during model selection.
model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    batch_size=256,
    epochs=100,
    callbacks=[earlyStopping],
)


Train on 2875 samples, validate on 959 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100


<keras.callbacks.History at 0x7f5f9c572748>

In [32]:
from sklearn.metrics import classification_report

In [33]:
# Turn the sigmoid outputs into boolean class predictions with a 0.5 cut-off.
test_scores = model.predict(X_test)
y_pred = test_scores > 0.5

In [34]:
# Per-class precision / recall / F1 on the held-out split.
report_text = classification_report(y_true=y_test, y_pred=y_pred)
print(report_text)

             precision    recall  f1-score   support

          0       0.64      0.71      0.67       474
          1       0.68      0.60      0.64       485

avg / total       0.66      0.66      0.66       959

