In [1]:
import pandas as pd
import numpy as np

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.optimizers import SGD

from keras.models import Sequential
from keras.layers import Input ,Embedding, Bidirectional, LSTM, Dense, Dropout, BatchNormalization, Flatten, GlobalMaxPooling1D

Using TensorFlow backend.


In [2]:
def parse_dataset(fp):
    '''
    Loads the dataset .txt file with label-tweet on each line and parses the dataset.

    Every data line is split on tabs; the second field is parsed as the integer
    label and the third field is taken as the tweet text. Lines that start with
    "tweet index" (case-insensitive) are treated as the header and skipped, and
    so are lines that are empty after stripping trailing whitespace.

    :param fp: filepath of dataset
    :return:
        corpus: list of tweet strings of each tweet.
        y: list of labels
    :raises ValueError: if the label field of a data line is not an integer.
    :raises IndexError: if a non-empty data line has fewer than three tab-separated fields.
    '''
    y = []
    corpus = []
    # Explicit encoding so decoding does not depend on the platform's locale default.
    with open(fp, 'rt', encoding='utf-8') as data_in:
        for line in data_in:
            line = line.rstrip()  # remove trailing whitespace, including the newline
            if not line:
                continue  # a blank line (e.g. at the end of the file) holds no sample
            if line.lower().startswith("tweet index"):
                continue  # discard the header line containing the column names
            fields = line.split("\t")  # split once and reuse both fields
            label = int(fields[1])
            tweet = fields[2]
            y.append(label)
            corpus.append(tweet)

    return corpus, y

In [3]:
# Load the training set: `tweets` is the list of tweet texts and `labels` the
# matching list of integer labels, as returned by parse_dataset. The path is
# relative, so it resolves against the current working directory.
tweets, labels = parse_dataset('datasets/train/SemEval2018-T3-train-taskA.txt')

In [8]:
# Vocabulary cap handed to the Tokenizer.
num_words = 10000
# Fixed sequence length used when the tokenized tweets are padded in a later cell.
maxlen=32
tokenizer = Tokenizer(num_words=num_words)
# Build the tokenizer's word index from the training tweets.
tokenizer.fit_on_texts(tweets)

In [18]:
# Number of distinct words the tokenizer saw while fitting. As the output shows,
# this can exceed `num_words`: the word index itself is not truncated to the cap.
len(tokenizer.word_index)

12923

In [26]:
import os
import sys

# Put the parent of the current working directory on the import path (only once),
# presumably so the project-local `common` package can be imported later — confirm.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [28]:
import os

from common.GloveEmbeddings import GloveEmbeddings

# Location of the GloVe vectors file. Set the GLOVE_PATH environment variable to
# point at a local copy; when it is unset, the original machine-specific path is
# used so that existing setups keep working.
GLOVE_PATH = os.environ.get(
        'GLOVE_PATH',
        '/media/radoslav/ce763dbf-b2a6-4110-960f-2ef10c8c6bde/MachineLearning/glove.6B/glove.6B.100d.txt')

# The second argument (100) presumably is the embedding dimension of the
# 100d GloVe file — confirm against GloveEmbeddings.
embeddings = GloveEmbeddings(
        GLOVE_PATH,
        100).load().get_embedding_matrix_for_tokenizer(tokenizer)

Found 400000 word vectors.


In [35]:
# Turn every tweet into a sequence of word indices and bring all sequences
# to the common length `maxlen` in a single step.
processed = pad_sequences(tokenizer.texts_to_sequences(tweets), maxlen=maxlen)

In [36]:
# Sanity check: one row per tweet and `maxlen` columns.
processed.shape

(3834, 32)

In [37]:
def get_model():
    '''
    Builds and compiles the binary tweet classifier.

    The network is a stack of three layers:
      1. an Embedding layer sized from the global `embeddings` matrix, initialised
         with that matrix and kept frozen (trainable=False);
      2. a bidirectional LSTM with 40 units per direction and dropout of 0.3;
      3. a single sigmoid unit producing the output.

    :return: the Sequential model, compiled with the adam optimizer,
        binary cross-entropy loss and the accuracy metric.
    '''
    vocab_rows = embeddings.shape[0]
    vector_size = embeddings.shape[1]

    layer_stack = [
        Embedding(vocab_rows, vector_size, weights=[embeddings], trainable=False),
        Bidirectional(LSTM(40, dropout=0.3)),
        Dense(units=1, activation='sigmoid'),
    ]

    model = Sequential(layer_stack)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model


In [38]:
# Build a fresh compiled model and print its layer-by-layer summary
# (output shapes and parameter counts).
model = get_model()
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, None, 100)         1292400   
_________________________________________________________________
bidirectional_5 (Bidirection (None, 80)                45120     
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 81        
Total params: 1,337,601
Trainable params: 45,201
Non-trainable params: 1,292,400
_________________________________________________________________


In [39]:
from keras.callbacks import EarlyStopping

# Stop training once the monitored quantity (Keras default: validation loss)
# has not improved for 5 consecutive epochs.
earlyStopping = EarlyStopping(patience=5)

# `labels` is a plain Python list; convert it to a NumPy array so the targets have
# the same array type as `processed`. Newer Keras/TensorFlow versions reject a
# NumPy `x` combined with a list `y`.
# NOTE: validation_split holds out the *last* 20% of the samples without shuffling,
# so the validation set reflects the ordering of the dataset file.
model.fit(processed, np.asarray(labels), batch_size=256, epochs=100, validation_split=0.2, callbacks=[earlyStopping])


Train on 3067 samples, validate on 767 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100


<keras.callbacks.History at 0x7f1e5cb53cf8>