# Importing Libraries

In [None]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer





# File Paths Used for Loading Data Later

In [None]:
# Input files, given as paths relative to the notebook's working directory.
embedding_file = 'glove.6B.50d.txt'  # GloVe word vectors (read further down)
training_file = 'train.csv'
testing_file = 'test.csv'

In [None]:
# Hyper-parameters shared by the tokenizer, the embedding matrix and the models.

# Vocabulary cap: number of unique words kept, i.e. the number of rows in the
# embedding matrix (value chosen because of the violin plot).
max_features = 10000

# Maximum number of words of a comment that are used (value chosen because of
# the violin plot). TODO: add this to the report.
maxlen = 150

# Size of each word vector; must equal the length of the vectors stored in
# `embedding_file`, because those vectors are copied into rows of this width.
embed_size = 50

# Loading Data

In [None]:
# Read the training and test CSV files into DataFrames (training file first).
train, test = (pd.read_csv(csv_path) for csv_path in (training_file, testing_file))

# Preprocessing the Data

In [None]:
# Remove Punctuation

import string

# Translation table that deletes every character in string.punctuation.
# Built once here instead of on every call: the function is applied to each
# row of the train and test frames, and the table never changes.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(text):
    '''Return `text` with all ASCII punctuation characters removed.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        `text` without any character from ``string.punctuation``; all other
        characters (including whitespace) are left untouched.
    '''
    return text.translate(_PUNCTUATION_TABLE)

# Strip punctuation from the comment column of both frames.
for frame in (train, test):
    frame['comment_text'] = frame['comment_text'].apply(remove_punctuation)

In [None]:
# Remove Stop Words

sw = stopwords.words('english')

# Hashed copy of the stop-word list so each membership test is a constant-time
# lookup instead of a linear scan; `sw` itself is kept as the original list.
_sw_lookup = frozenset(sw)


def removesw(text):
    '''Lowercase `text` and drop every word that is an English stop word.

    Parameters
    ----------
    text : str
        Whitespace-separated words.

    Returns
    -------
    str
        The remaining words, lowercased and joined by single spaces.
    '''
    # Lowercase each word once, then keep only the non-stop-words.
    lowered_words = (word.lower() for word in text.split())
    return " ".join(word for word in lowered_words if word not in _sw_lookup)


train['comment_text'] = train['comment_text'].apply(removesw)
test['comment_text'] = test['comment_text'].apply(removesw)

In [None]:
# Applying Stemming

stemmer = SnowballStemmer("english")


def stemming(text):
    '''Stem every whitespace-separated word of `text` and re-join with spaces.'''
    stems = map(stemmer.stem, text.split())
    return " ".join(stems)


# Stem the comment column of both frames.
for frame in (train, test):
    frame['comment_text'] = frame['comment_text'].apply(stemming)

In [None]:
# Pull the model inputs and targets out of the DataFrames.

def _comment_array(frame):
    '''Return the comment_text column as an array, missing entries replaced by "_na_".'''
    return frame["comment_text"].fillna("_na_").values


# Target columns, one per toxicity label.
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[list_classes].values

# Filling missing values is redundant here, since the EDA showed that the data
# has no missing values.
list_sentences_train = _comment_array(train)
list_sentences_test = _comment_array(test)

In [None]:
# Turn each comment into a list of word indexes with the Keras tokenizer, then
# pad the lists to a common length of `maxlen`.

# The vocabulary is fitted on the training comments only.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))

list_tokenized_train, list_tokenized_test = (
    tokenizer.texts_to_sequences(sentences)
    for sentences in (list_sentences_train, list_sentences_test)
)

X_t, X_te = (
    pad_sequences(sequences, maxlen=maxlen)
    for sequences in (list_tokenized_train, list_tokenized_test)
)

In [None]:
# Read Glove Vectors into a dictionary which maps word to vectors

def get_coefs(word, *arr):
    '''Pair a word with its coefficients packed into a float32 array.

    Parameters
    ----------
    word : str
        The first field of an embedding record.
    *arr
        The remaining fields; each must be convertible to float32.

    Returns
    -------
    tuple
        ``(word, vector)`` where ``vector`` is a 1-D float32 NumPy array.
    '''
    vector = np.asarray(arr, dtype='float32')
    return word, vector
# Map each word to its vector. The file is opened in a `with` block so the
# handle is closed once the dictionary is built, and with an explicit encoding
# so decoding does not depend on the platform default (the file is expected to
# be UTF-8 text). Blank lines are skipped because get_coefs needs a word field.
with open(embedding_file, encoding='utf-8') as glove_file:
    embeddings_index = dict(
        get_coefs(*line.split())
        for line in glove_file
        if line.strip()
    )


In [None]:
# Build the embedding matrix: rows for words found in GloVe hold the GloVe
# vector, and all other rows keep a random initialisation drawn from a normal
# distribution with the same mean and standard deviation as the GloVe values.

# np.stack needs a real sequence; a dict view is not one, so materialise a list.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean = all_embs.mean()
emb_std = all_embs.std()

word_index = tokenizer.word_index
# Keras Tokenizer indexes start at 1 (0 is reserved), so a vocabulary of N
# words needs N + 1 rows. Without the "+ 1" the highest index falls outside the
# matrix whenever the vocabulary is smaller than max_features.
# NOTE: the Embedding layers below are built with max_features rows, so they
# only match this matrix when the vocabulary has at least max_features - 1 words.
nb_words = min(max_features, len(word_index) + 1)
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    # Skip words whose index does not fit in the matrix.
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    # Words missing from GloVe keep their random row.
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Create the bi-directional LSTM Model

## NOTE: 
## We ran these models on Google Colab to save time, so this notebook contains no cell outputs for them. A screenshot of the output is attached below as an example.

<img src = "lstm_output.png">

In [None]:
# Bi-directional LSTM classifier: ReLU hidden layer, Adam optimizer.
inp = Input(shape=(maxlen,))

# Embedding layer initialised from the GloVe-based embedding matrix.
embedded = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
sequence_features = Bidirectional(
    LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1)
)(embedded)

# Max-pool over the sequence axis, then a small dense head.
pooled = GlobalMaxPool1D()(sequence_features)
hidden = Dense(50, activation="relu")(pooled)
hidden = Dropout(0.1)(hidden)

# One sigmoid output per label.
x = Dense(6, activation="sigmoid")(hidden)

model = Model(inputs=inp, outputs=x)
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train the model, holding out 10% of the training data for validation.
model.fit(
    X_t,
    y,
    batch_size=16,
    epochs=2,
    validation_split=0.1,
);

In [None]:
# Score the padded test sequences: six sigmoid outputs per comment.
y_test = model.predict(
    [X_te],
    batch_size=1024,
    verbose=1,
)

# LSTM with SELU activation function

In [None]:
# Bi-directional LSTM classifier: SELU hidden layer, Adam optimizer.
inp = Input(shape=(maxlen,))

# Embedding layer initialised from the GloVe-based embedding matrix.
embedded = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
sequence_features = Bidirectional(
    LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1)
)(embedded)

# Max-pool over the sequence axis, then a small dense head.
pooled = GlobalMaxPool1D()(sequence_features)
hidden = Dense(50, activation="selu")(pooled)
hidden = Dropout(0.1)(hidden)

# One sigmoid output per label.
x = Dense(6, activation="sigmoid")(hidden)

model = Model(inputs=inp, outputs=x)
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)


In [None]:
# Train the model, holding out 10% of the training data for validation.
model.fit(
    X_t,
    y,
    batch_size=16,
    epochs=2,
    validation_split=0.1,
);

In [None]:
# Score the padded test sequences: six sigmoid outputs per comment.
y_test = model.predict(
    [X_te],
    batch_size=1024,
    verbose=1,
)

# ReLU with Nadam

In [None]:
# Bi-directional LSTM classifier: ReLU hidden layer, Nadam optimizer.
inp = Input(shape=(maxlen,))

# Embedding layer initialised from the GloVe-based embedding matrix.
embedded = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
sequence_features = Bidirectional(
    LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1)
)(embedded)

# Max-pool over the sequence axis, then a small dense head.
pooled = GlobalMaxPool1D()(sequence_features)
hidden = Dense(50, activation="relu")(pooled)
hidden = Dropout(0.1)(hidden)

# One sigmoid output per label.
x = Dense(6, activation="sigmoid")(hidden)

model = Model(inputs=inp, outputs=x)
model.compile(
    loss='binary_crossentropy',
    optimizer='Nadam',
    metrics=['accuracy'],
)

In [None]:
# Train the model, holding out 10% of the training data for validation.
model.fit(
    X_t,
    y,
    batch_size=16,
    epochs=2,
    validation_split=0.1,
);

In [None]:
# Score the padded test sequences: six sigmoid outputs per comment.
y_test = model.predict(
    [X_te],
    batch_size=1024,
    verbose=1,
)

# SELU with Nadam

In [None]:
# Bi-directional LSTM classifier: SELU hidden layer, Nadam optimizer.
inp = Input(shape=(maxlen,))

# Embedding layer initialised from the GloVe-based embedding matrix.
embedded = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
sequence_features = Bidirectional(
    LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1)
)(embedded)

# Max-pool over the sequence axis, then a small dense head.
pooled = GlobalMaxPool1D()(sequence_features)
hidden = Dense(50, activation="selu")(pooled)
hidden = Dropout(0.1)(hidden)

# One sigmoid output per label.
x = Dense(6, activation="sigmoid")(hidden)

model = Model(inputs=inp, outputs=x)
model.compile(
    loss='binary_crossentropy',
    optimizer='Nadam',
    metrics=['accuracy'],
)

In [None]:
# Train the model, holding out 10% of the training data for validation.
model.fit(
    X_t,
    y,
    batch_size=16,
    epochs=2,
    validation_split=0.1,
);

# Testing on Kaggle

In [None]:
# y_test = model.predict([X_te], batch_size=1024, verbose=1)
# sample_submission = pd.read_csv(f'sample_submission.csv')
# sample_submission[list_classes] = y_test
# sample_submission.to_csv('submission_LSTM_SELU.csv', index=False)

Credits: 
https://www.kaggle.com/sbongo/for-beginners-tackling-toxic-using-keras
This guide helped us in our implementation of LSTM model.