In [1]:
import numpy as np
import pandas as pd

In [13]:

from keras.models import Model, load_model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, add, concatenate, Dropout
from keras.layers import Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D, LSTM
from keras.preprocessing import text, sequence

import tensorflow as tf
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from gensim.models import KeyedVectors
from keras.callbacks import EarlyStopping, ModelCheckpoint


Using TensorFlow backend.


In [8]:
""" Title: Scikit-learn: How to obtain True Positive, True Negative, False Positive and False Negative
Author: invoketheshell, & Rasoul
Date: 2015
Availability: https://stackoverflow.com/questions/31324218/scikit-learn-how-to-obtain-true-positive-true-negative-false-positive-and-fal """
def perf_measure(y_actual, y_hat):
    """Count the confusion-matrix cells for binary 0/1 labels.

    Parameters
    ----------
    y_actual : sequence of int
        Ground-truth labels (pandas Series, NumPy array, list, tuple, ...).
    y_hat : sequence of int
        Predicted labels. Compared with ``y_actual`` by position, not by
        index label, and must have the same length.

    Returns
    -------
    tuple of int
        ``(TP, FP, TN, FN)``. Pairs whose predicted label is neither 0 nor 1
        are not counted in any cell.

    Raises
    ------
    ValueError
        If ``y_actual`` and ``y_hat`` differ in length.
    """
    def _as_list(values):
        # pandas objects expose to_list(); everything else is iterated as-is.
        return values.to_list() if hasattr(values, 'to_list') else list(values)

    actual = _as_list(y_actual)
    predicted = _as_list(y_hat)
    if len(actual) != len(predicted):
        raise ValueError(
            'y_actual and y_hat must have the same length, got {} and {}'.format(
                len(actual), len(predicted)
            )
        )

    TP = FP = TN = FN = 0
    for truth, guess in zip(actual, predicted):
        if guess == 1:
            if truth == 1:
                TP += 1
            else:
                FP += 1
        elif guess == 0:
            if truth == 0:
                TN += 1
            else:
                FN += 1
    return (TP, FP, TN, FN)

In [6]:
# Embedding files handed to build_matrix (loaded there with KeyedVectors.load).
# Index 0 is reported as "crawl" and index 1 as "glove" when the matrices are built.
EMBEDDING_FILES = [
    'crawl-300d-2M.gensim',
    'glove.840B.300d.gensim'
]

# Mini-batch size passed to model.fit; this is the value being tuned.
BATCH_SIZE = 512
# BATCH_SIZE = 5000  # alternative value (presumably from another tuning run)

LSTM_UNITS = 128 # units of each LSTM layer (per direction inside Bidirectional)
# Matches the width of the concatenated max+average pooling in build_model
# (2 poolings x 2 directions x LSTM_UNITS), which the residual `add` requires.
DENSE_HIDDEN_UNITS = 4 * LSTM_UNITS
EPOCHS = 20 # upper bound on epochs; the EarlyStopping callback may stop sooner
DROPOUT_RATE = 0.5 # rate of the SpatialDropout1D layer in build_model
TRAIN_PERCENT = 0.8 # leading fraction of the filtered rows used for training
VALID_PERCENT = 0.25 # 25% of the training set, i.e. 20% of the whole dataset
TEXT_COLUMN = 'comment_text' # column holding the comment text
TARGET_COLUMN = 'target' # column holding the label

In [18]:
""" Title: Simple LSTM
Author: thousandvoices
Date: 2019
Code version: 8
Availability: https://www.kaggle.com/thousandvoices/simple-lstm?scriptVersionId=16109977 """
def build_matrix(word_index, path):
    """Build an embedding matrix whose rows line up with tokenizer indices.

    Each word is looked up as-is, then lower-cased, then title-cased; the
    first form found in the embedding file supplies the row. Words found in
    none of the three forms keep an all-zero row.

    Parameters
    ----------
    word_index : dict
        Maps word -> integer index (e.g. ``tokenizer.word_index``).
        Entries whose index exceeds the global ``MAX_FEATURES`` are skipped.
    path : str
        Path of a saved gensim ``KeyedVectors`` file.

    Returns
    -------
    embedding_matrix : numpy.ndarray
        Shape ``(MAX_FEATURES + 1, vector width)``.
    unknown_words : list of str
        Words (original spelling) for which no vector was found.
    """
    embedding_index = KeyedVectors.load(path, mmap='r')
    # Take the width from the loaded vectors instead of assuming 300, so files
    # of another dimensionality work too; 300 remains the fallback.
    embedding_dim = getattr(embedding_index, 'vector_size', 300)
    embedding_matrix = np.zeros((MAX_FEATURES + 1, embedding_dim))
    unknown_words = []

    for word, i in word_index.items():
        if i > MAX_FEATURES:
            continue
        # Try progressively normalised spellings and stop at the first hit.
        for candidate in (word, word.lower(), word.title()):
            try:
                embedding_matrix[i] = embedding_index[candidate]
                break
            except KeyError:
                continue
        else:
            unknown_words.append(word)

    return embedding_matrix, unknown_words

In [64]:
""" Title: Simple LSTM
Author: thousandvoices
Date: 2019
Code version: 8
Availability: https://www.kaggle.com/thousandvoices/simple-lstm?scriptVersionId=16109977 """
def build_model(embedding_matrix):
    """Assemble and compile the two-layer bidirectional LSTM classifier.

    Parameters
    ----------
    embedding_matrix : numpy.ndarray
        2-D array of pretrained vectors; its shape gives the Embedding
        layer's input and output dimensions and its values the frozen weights.

    Returns
    -------
    keras.models.Model
        Model taking ``MAX_LEN`` token ids and emitting one sigmoid output,
        compiled with binary cross-entropy and the ``'adam'`` optimizer.
    """
    vocab_rows, embedding_dim = embedding_matrix.shape

    token_ids = Input(shape=(MAX_LEN,))
    # Pretrained embeddings stay frozen during training.
    features = Embedding(
        vocab_rows,
        embedding_dim,
        weights=[embedding_matrix],
        trainable=False,
    )(token_ids)
    features = SpatialDropout1D(DROPOUT_RATE)(features)

    # Two stacked bidirectional LSTMs, both returning the full sequence.
    for _ in range(2):
        features = Bidirectional(LSTM(LSTM_UNITS, return_sequences=True))(features)

    # Collapse the time axis in two ways and join the results.
    max_pooled = GlobalMaxPooling1D()(features)
    avg_pooled = GlobalAveragePooling1D()(features)
    pooled = concatenate([max_pooled, avg_pooled])

    # Two residual dense blocks: pooled + relu(Dense(pooled)).
    for _ in range(2):
        dense_out = Dense(DENSE_HIDDEN_UNITS, activation='relu')(pooled)
        pooled = add([pooled, dense_out])

    probability = Dense(1, activation='sigmoid')(pooled)
    network = Model(inputs=token_ids, outputs=[probability])
    # Passing the string 'adam' selects the optimizer by name; no learning
    # rate is set explicitly here.
    network.compile(loss='binary_crossentropy', optimizer='adam')

    return network

In [3]:
# Load the preprocessed comments, keyed by their 'id' column.
df = pd.read_csv(
    'comments_preprocessed/comments_preprocessed/comments_preprocessed_1.csv',
    index_col='id',
)
# Reported frame size in MB (bytes / 2**20).
print(df.memory_usage().sum() / 2**20)

40.395973205566406


In [4]:
# (rows, columns) of the loaded frame; the 'id' index is not counted as a column.
df.shape

(1764927, 2)

In [7]:
# Maximum comment length in whitespace-separated tokens; also the padded
# sequence length and the model's input length.
# NOTE(review): 242 is presumably the upper outer fence of the comment-length
# distribution - confirm against the exploratory analysis.
MAX_LEN = 242
comments = df[TEXT_COLUMN].astype('str')
y = df[TARGET_COLUMN].astype(np.int8)

# Drop comments longer than MAX_LEN tokens. The threshold comes from the
# constant so that the filter and the padding length cannot drift apart.
non_long_len = comments.apply(lambda comment: len(comment.split()) <= MAX_LEN)
text_removelong = comments[non_long_len].copy()
non_long_len_indices = text_removelong.index
# Select the labels with the same boolean mask as the text so both stay
# aligned row-for-row even if an id were to occur more than once.
y_removelong = y[non_long_len].copy()

In [10]:
# Number of leading rows that form the training set.
train_size = round(len(y_removelong) * TRAIN_PERCENT)

In [11]:
# Positional split: the first `train_size` rows train, the rest test.
# `.iloc` makes the positional intent explicit; the index holds integer ids,
# so plain `[]` slicing (and the former 1-tuple form `[:n,]`) is ambiguous.
# NOTE(review): rows are not shuffled here - confirm the file order is random.
x_train = text_removelong.iloc[:train_size]
y_train = y_removelong.iloc[:train_size]
x_test = text_removelong.iloc[train_size:]
y_test = y_removelong.iloc[train_size:]

In [14]:
# Case is preserved here; build_matrix falls back to lower/title-cased
# spellings when it looks words up in the embeddings.
tokenizer = text.Tokenizer(lower=False)
# The vocabulary is built from the training and the test comments together.
tokenizer.fit_on_texts([*x_train, *x_test])

In [15]:
# Vocabulary size: number of distinct tokens in the fitted tokenizer.
# build_matrix allocates MAX_FEATURES + 1 rows and skips indices above it,
# so with this setting no word is dropped for being too rare.
MAX_FEATURES=len(tokenizer.word_index)
MAX_FEATURES

242697

In [16]:
# Turn each comment into a list of token ids, then pad to MAX_LEN.
# Note: x_train / x_test are rebound from text to arrays, so this cell must
# not be re-run without first re-running the split above.
x_train = sequence.pad_sequences(tokenizer.texts_to_sequences(x_train), maxlen=MAX_LEN)
x_test = sequence.pad_sequences(tokenizer.texts_to_sequences(x_test), maxlen=MAX_LEN)

In [19]:
""" Title: Simple LSTM
Author: thousandvoices
Date: 2019
Code version: 8
Availability: https://www.kaggle.com/thousandvoices/simple-lstm?scriptVersionId=16109977 """
# One matrix per pretrained embedding file, each reporting how many
# vocabulary words it could not find.
embedding_matrix_cl, unknown_words_cl = build_matrix(tokenizer.word_index, EMBEDDING_FILES[0])
print('n unknown words(crawl): ', len(unknown_words_cl))
embedding_matrix_gl, unknown_words_gl = build_matrix(tokenizer.word_index, EMBEDDING_FILES[1])
print('n unknown words(glove): ', len(unknown_words_gl))

# Place the two vectors of every word side by side (columns are stacked).
embedding_matrix = np.hstack((embedding_matrix_cl, embedding_matrix_gl))

# The per-file matrices are no longer needed; release their memory.
del embedding_matrix_cl, embedding_matrix_gl


n unknown words(crawl):  97776
n unknown words(glove):  98663


In [65]:
""" Title: Simple LSTM
Author: thousandvoices
Date: 2019
Code version: 8
Availability: https://www.kaggle.com/thousandvoices/simple-lstm?scriptVersionId=16109977 """

# dropoutrate 0.5

model = build_model(embedding_matrix)
print('model built')
# Stop once val_loss has gone 2 consecutive epochs without improving.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=2)
print('es is set')
# Persist the model of the epoch with the lowest val_loss seen so far.
mc = ModelCheckpoint(filepath='best_model.h5', monitor='val_loss', save_best_only=True, verbose=1)
print('mc is set')
hist = model.fit(
    x_train,
    # The labels produced by the split are `y_train` (`y_train_new` is not
    # defined anywhere in this notebook); pass them as a plain NumPy array.
    y_train.values,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    verbose=1,
    validation_split = VALID_PERCENT,
    callbacks=[es, mc]
)
print('model fitted')
# After early stopping `model` holds the weights of the last epoch, which is
# `patience` epochs past the best one. Restore the checkpointed best weights
# so the test predictions come from the model that validated best.
model.load_weights('best_model.h5')
predictions = model.predict(x_test, batch_size=BATCH_SIZE).flatten()

model built
es is set
mc is set
Epoch 1/20
Epoch 00001: val_loss improved from inf to 0.13267, saving model to best_model.h5
Epoch 2/20
Epoch 00002: val_loss improved from 0.13267 to 0.12970, saving model to best_model.h5
Epoch 3/20
Epoch 00003: val_loss improved from 0.12970 to 0.12434, saving model to best_model.h5
Epoch 4/20
Epoch 00004: val_loss did not improve from 0.12434
Epoch 5/20
Epoch 00005: val_loss did not improve from 0.12434
Epoch 00005: early stopping
model fitted


In [67]:
# Binarise the sigmoid outputs: scores below 0.5 become 0, everything else 1.
predictions_encoded = pd.Series(
    np.where(pd.Series(predictions) < 0.5, 0, 1),
    dtype='int64',
)

In [69]:
# Predictions are in the same row order as y_test, so the true labels can be
# attached by position. This replaces a merge on 'id', which would have
# duplicated rows had any id occurred more than once.
if len(predictions_encoded) != len(y_test):
    raise ValueError(
        'got {} predictions for {} test rows'.format(len(predictions_encoded), len(y_test))
    )
y_test_pred = pd.DataFrame({'lstm_predict':predictions_encoded, 'id': list(y_test.index)})
y_result = y_test_pred.assign(**{y_test.name: y_test.values}).set_index('id')
# The file name records the run configuration; deriving it from the constants
# keeps it truthful when BATCH_SIZE or EPOCHS are changed.
y_result.to_csv('result_lstm_batch{}_epoch{}es_imbal_nomaxf.csv'.format(BATCH_SIZE, EPOCHS))
#y_result.to_csv('result_lstm_batch5000_epoch20es_imbal_nomaxf.csv')