# Simple LSTM with GloVe Embeddings (using only Targets)

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Show which dataset folders are available under the input directory.
input_entries = os.listdir("../input")
print(input_entries)

# Any results you write to the current directory are saved as output.

In [None]:
import tensorflow as tf
import keras
from matplotlib import pyplot as plt

# Dataset folders under ../input: the competition data and the GloVe vectors.
INPUT_DIR = "../input/jigsaw-unintended-bias-in-toxicity-classification"
GLOVE_DIR = "../input/glove-global-vectors-for-word-representation"

# List the contents of both folders to confirm which files are present.
for dataset_dir in (INPUT_DIR, GLOVE_DIR):
    print(os.listdir(dataset_dir))

## Understanding the Input and Output

### Loading the Training Set

In [None]:
# Read the training CSV into a DataFrame and preview its first rows.
train_csv_path = os.path.join(INPUT_DIR, 'train.csv')
train = pd.read_csv(train_csv_path)
train.head()

### Understanding what the Output should look like

Here we print a sample output and the test set input. The thing to note is that all we have to submit is the expected value of **TARGET** given the **COMMENT TEXT**. Our Classifier can be trained simply on this data and everything else is to aid rejecting false positives.

In [None]:
# Echo the first five raw lines of the sample submission to see the expected format.
sample_submission_path = os.path.join(INPUT_DIR, 'sample_submission.csv')
with open(sample_submission_path) as submission_file:
    for _ in range(5):
        raw_line = next(submission_file)
        # Lines already end with a newline, so suppress print's own.
        print(raw_line, end='')

In [None]:
# Echo the first ten raw lines of the test set to see what the model input looks like.
test_csv_path = os.path.join(INPUT_DIR, 'test.csv')
with open(test_csv_path) as test_file:
    for _ in range(10):
        raw_line = next(test_file)
        # Lines already end with a newline, so suppress print's own.
        print(raw_line, end='')

### Some comments from the Problem Statement

Here are the different types of Toxicity labels to help us fine tune our predictions.
* severe_toxicity
* obscene
* threat
* insult
* identity_attack
* sexual_explicit

There are many more classes storing the severity of attack / count of certain targeted entities. Here are the ones that we will be tested on, which have 500 examples or more in the provided Training Set.
* male
* female
* homosexual_gay_or_lesbian
* christian
* jewish
* muslim
* black
* white
* psychiatric_or_mental_illness

## Preprocessing Text with Embeddings

### Loading the GloVe Embeddings onto Keras

Here we start by reading the GloVe text file. The format is simple: each line holds a **token followed by its 100-D representation, space-separated**. The tokens include words, punctuation marks, contractions such as 's, etc. Next we shall extract data out of this.

In [None]:
# Print the first five raw entries of the 100-dimensional GloVe file.
glove_path = os.path.join(GLOVE_DIR, 'glove.6B.100d.txt')
with open(glove_path) as glove_file:
    for _ in range(5):
        entry = next(glove_file)
        print(entry)

In [None]:
# Parse the GloVe text file into a dict mapping each token to its vector.
# Every line has the form "<token> <v1> <v2> ... <vN>", separated by spaces.
embeddings_index = {}
# The context manager closes the file even if parsing raises part-way through.
# The encoding is given explicitly so the read does not depend on the
# platform's default locale (the GloVe text files are UTF-8).
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'), encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))
print('Embeddings_index is a map of the words to a', len(embeddings_index['the']), 'dimentional vector.')

### Reading through the input and tokenizing the Comments

We take the training data loaded from the CSV file and print a few sample comments to see what the data is like. The **comment_text** column is what we start working on first: we preprocess it into a form we can use.

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Collect every training comment into a plain Python list; this is the corpus
# the tokenizer vocabulary is built from.
corpus = train['comment_text'].tolist()
print("Some sample comments we train the Tokenizer on:\n", corpus[:3])

# Build the vocabulary from the corpus, then map each comment to a list of
# integer word ids using that vocabulary.
tokenizer = Tokenizer(num_words=1000000)
tokenizer.fit_on_texts(corpus)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(corpus)
print('Found %s unique tokens.' % len(word_index))

# The id lists have different lengths; pad_sequences turns them into a single
# rectangular 2-D array so they can be fed to the network as one tensor.
data = pad_sequences(sequences)

In [None]:
# Labels are the 'target' column, one value per training comment.
# Series.as_matrix() is deprecated and was removed in pandas 1.0; .values
# yields the same NumPy array and works on old and new pandas versions alike.
labels = train['target'].values
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# Shuffle comments and labels with one shared permutation so that every
# comment stays paired with its own label.
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]

# Split into Train and Validation Sets
VALIDATION_SPLIT = 0.25  # Fraction of the samples held out for validation
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

# Slice at an explicit index rather than with negative offsets: data[:-n]
# with n == 0 would yield an empty training set instead of the full one.
split_at = data.shape[0] - nb_validation_samples
x_train = data[:split_at]
y_train = labels[:split_at]
x_val = data[split_at:]
y_val = labels[split_at:]

### Generating Embedding Matrix and Feeding to Embedding Layer

We have the **Data Tensor**, which is a 317-Dimensional representation of every sentence, padded in the front by 0s till it fits the Max-length of 317.
Now we use the Embedding matrix to initialise the weights of the Embedding layer and freeze them. `keras.Embedding` takes in the *Data Tensor* and outputs a *2-D array of vectors representing the sentence*.

In [None]:
EMBEDDING_DIM = 100  # width of each row copied from embeddings_index
MAX_SEQUENCE_LENGTH = len(data[0])  # padded length shared by every row of data

# One row per tokenizer id, plus one extra row (presumably for the padding
# id 0 -- confirm against the tokenizer's indexing). Rows start as zeros.
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for token, row in word_index.items():
    vector = embeddings_index.get(token)
    if vector is None:
        # No pretrained vector for this token: its row stays all-zeros.
        continue
    embedding_matrix[row] = vector

In [None]:
# Embedding layer initialised from the pretrained matrix; trainable=False keeps
# those weights fixed during training.
embedding_layer = tf.keras.layers.Embedding(
    input_dim=len(word_index) + 1,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)

## The Neural Network Architecture

Let's wire up a model. This architecture is derived from this Kernel: https://www.kaggle.com/thousandvoices/simple-lstm

TODO: Understand LSTMs and replace with my own architecture.

In [None]:
def build_model():
    """Assemble and compile the comment classifier.

    The network reads a padded sequence of word ids of length
    MAX_SEQUENCE_LENGTH and passes it through the module-level
    ``embedding_layer``, spatial dropout, and two stacked bidirectional
    CuDNN LSTM layers. The LSTM output sequence is summarised by max- and
    average-pooling over time, refined by two residual dense blocks, and
    mapped to a single sigmoid output.

    Returns:
        A tf.keras Model compiled with binary cross-entropy loss and the
        Adam optimizer.
    """
    layers = tf.keras.layers

    token_ids = layers.Input(shape=(MAX_SEQUENCE_LENGTH,))
    sequence = embedding_layer(token_ids)
    sequence = layers.SpatialDropout1D(0.2)(sequence)

    # Two stacked recurrent layers; both return the full sequence so the
    # pooling layers below can reduce over the time axis.
    for _ in range(2):
        recurrent = layers.Bidirectional(layers.CuDNNLSTM(128, return_sequences=True))
        sequence = recurrent(sequence)

    # Summarise the sequence with both its max and its mean over time.
    pooled_max = layers.GlobalMaxPooling1D()(sequence)
    pooled_avg = layers.GlobalAveragePooling1D()(sequence)
    features = layers.concatenate([pooled_max, pooled_avg])

    # Two residual blocks: each adds a ReLU dense transform back onto its input.
    for _ in range(2):
        residual = layers.Dense(512, activation='relu')(features)
        features = layers.add([features, residual])

    prediction = layers.Dense(1, activation='sigmoid')(features)

    model = tf.keras.models.Model(inputs=token_ids, outputs=prediction)
    model.compile(loss='binary_crossentropy', optimizer='adam')
    return model

Now for the Heavy operation, let's fit the model to the DataSet.

In [None]:
# Build the network and train it for two epochs on the training split,
# passing the held-out validation split to fit() as validation_data.
model = build_model()
history = model.fit(
    x=x_train,
    y=y_train,
    validation_data=(x_val, y_val),
    epochs=2,
)

## Analyzing the Model

In [None]:
# Render a diagram of the model's layer graph to model.png (relative path,
# so it lands in the current working directory).
from tensorflow.keras.utils import plot_model
plot_model(model, to_file='model.png')

Let's plot a few graphs, see how our model did, and where we can do better.

In [None]:
print(history.history.keys())

# Draw the per-epoch training loss and validation loss on the same axes.
for curve in ('loss', 'val_loss'):
    plt.plot(history.history[curve])

plt.title('The Loss Function')
plt.xlabel('Epoch')
plt.ylabel('Loss')
# Legend entries follow plotting order; 'test' labels the validation-loss curve.
plt.legend(['train', 'test'], loc='upper right')
plt.show()

## Outputting the Result

In [None]:
# Read the test CSV into a DataFrame and preview its first rows.
test_csv_path = os.path.join(INPUT_DIR, "test.csv")
test = pd.read_csv(test_csv_path)
test.head()

In [None]:
# Encode the test comments with the tokenizer fitted on the training corpus,
# then pad them to the same length the model was built for.
questions = test['comment_text'].tolist()
q_sequences = tokenizer.texts_to_sequences(questions)
q_data = pad_sequences(q_sequences, maxlen=MAX_SEQUENCE_LENGTH)
print(q_data.shape)

In [None]:
# Collect the test ids, then run the model over every padded test comment.
ids = test['id'].tolist()
result = model.predict(q_data)

In [None]:
# Sanity check: there must be exactly one prediction per test id.
assert len(result) == len(ids)

# Write the submission as a two-column CSV: the id, then the first (only used)
# value of the corresponding prediction row.
with open('submission.csv', 'w') as submission_file:
    submission_file.write('id,prediction\n')
    for comment_id, scores in zip(ids, result):
        submission_file.write(str(comment_id) + ',' + str(scores[0]) + '\n')

In [None]:
# Echo the first five lines of the file just written to check its format.
with open('submission.csv') as written_submission:
    for _ in range(5):
        raw_line = next(written_submission)
        # Lines already end with a newline, so suppress print's own.
        print(raw_line, end='')