In [156]:
import pandas as pd
import numpy as np

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.optimizers import SGD

from keras.models import Sequential
from keras.layers import Input ,Embedding, Bidirectional, LSTM, Dense, Dropout, BatchNormalization, Flatten, GlobalMaxPooling1D

In [2]:
def parse_dataset(fp):
    '''
    Loads the dataset .txt file with label-tweet on each line and parses the dataset.
    :param fp: filepath of dataset
    :return:
        corpus: list of tweet strings of each tweet.
        y: list of labels
    '''
    y = []
    corpus = []
    with open(fp, 'rt') as data_in:
        for line in data_in:
            if not line.lower().startswith("tweet index"): # discard first line if it contains metadata
                line = line.rstrip() # remove trailing whitespace
                label = int(line.split("\t")[1])
                tweet = line.split("\t")[2]
                y.append(label)
                corpus.append(tweet)

    return corpus, y

In [85]:
tweets, labels = parse_dataset('datasets/train/SemEval2018-T3-train-taskA.txt')

In [84]:
maxlen=30
data = pd.DataFrame({'Tweet': pad_sequences(tweets, maxlen=maxlen), 'Label':labels})
data[:10]

ValueError: invalid literal for int() with base 10: 'ligion  http://t.co/fej2v3OUBR'

In [88]:
num_words = 10000
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(data['Tweet'])

In [6]:
# tokenizer

<keras.preprocessing.text.Tokenizer at 0x7f68234bd860>

In [89]:
processed = tokenizer.texts_to_sequences(data['Tweet'])
processed = pad_sequences(processed, maxlen=maxlen)

In [90]:
processed.shape

(3834, 30)

In [168]:
max_features=10000

def get_model():
    model = Sequential()

    model.add(Embedding(max_features, 100, input_length=maxlen))
    model.add(Bidirectional(LSTM(50, return_sequences=True)))
    model.add(GlobalMaxPooling1D()),
    
#     model.add(Dropout(0.2))
    model.add(Dense(16, activation='relu'))
#     model.add(Dropout(0.4))
    model.add(Dense(8, activation='relu'))

    
    model.add(Dense(units=1, activation='softmax'))
    
    opt = SGD(lr=0.1)
    model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy'])
    
    return model


In [169]:
model = get_model()
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_35 (Embedding)     (None, 30, 100)           1000000   
_________________________________________________________________
bidirectional_47 (Bidirectio (None, 30, 100)           60400     
_________________________________________________________________
global_max_pooling1d_5 (Glob (None, 100)               0         
_________________________________________________________________
dense_82 (Dense)             (None, 16)                1616      
_________________________________________________________________
dense_83 (Dense)             (None, 8)                 136       
_________________________________________________________________
dense_84 (Dense)             (None, 1)                 9         
Total params: 1,062,161
Trainable params: 1,062,161
Non-trainable params: 0
_________________________________________________________________


In [170]:
from keras.callbacks import EarlyStopping
earlyStopping = EarlyStopping(patience=5)
model.fit(processed, labels, batch_size=32, epochs=100, validation_split=0.1, callbacks=[earlyStopping])


Train on 3450 samples, validate on 384 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100


<keras.callbacks.History at 0x7f67bc19bda0>