In [None]:
#import all necessary Libraries
import sys, os, re, csv, codecs, numpy as np, pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation

from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

In [None]:
# Read the competition CSV files from ../input into DataFrames.
# map() is lazy, but the tuple unpacking consumes it immediately, so the four
# files are read right here, in the order listed.
sample_submission, test, test_labels, train = map(
    pd.read_csv,
    (
        "../input/sample_submission.csv",
        "../input/test.csv",
        "../input/test_labels.csv",
        "../input/train.csv",
    ),
)

In [None]:
# Preview the first five rows of the training data.
train.head(n=5)

In [None]:
# Build the model inputs and targets from the train and test DataFrames.
# Names of the label columns the model has to predict.
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
# y holds the label columns of `train` as a 2-D array (one row per training comment).
y = train[list_classes].values
# The model input is the "comment_text" column.
# Missing comments are replaced by an empty string: the Keras Tokenizer
# lower-cases every entry, which fails on a float NaN. When nothing is missing,
# fillna leaves the values unchanged.
list_sentences_train = train["comment_text"].fillna("")
list_sentences_test = test["comment_text"].fillna("")

In [None]:
# Tokenizer vectorizes a text corpus: each text becomes either a sequence of
# integers (each integer being the index of a token in a dictionary) or a
# vector whose per-token coefficient can be binary, based on word count, based
# on tf-idf, ... (from https://keras.io). Only the integer-sequence form is
# used below.

# max_features: the maximum number of words to keep in the dictionary.
max_features = 20000
tokenizer = Tokenizer(num_words=max_features)
# The word index is fitted on the training comments only; the same fitted
# tokenizer then encodes both the training and the test comments.
tokenizer.fit_on_texts(list_sentences_train.tolist())
list_tokenized_train, list_tokenized_test = map(
    tokenizer.texts_to_sequences,
    (list_sentences_train, list_sentences_test),
)

In [None]:
# Comments do not all have the same length, so a maximum length per comment is
# fixed here. pad_sequences then transforms each list of tokenized comments
# into a numpy array, which is what the LSTM model is trained and run on.
maxlen = 200
X_t, X_te = (
    pad_sequences(token_lists, maxlen=maxlen)
    for token_lists in (list_tokenized_train, list_tokenized_test)
)

In [None]:
# The data is ready, so the model can now be created and trained.
# Each part of the model is explained where it is added.
print('Build model...')
# Sequential is a linear stack of layers; layers are appended with add().
model = Sequential()
# Embedding turns positive integers (word indexes below max_features) into
# dense vectors of fixed size (128 here).
model.add(Embedding(max_features, 128))
# LSTM is a recurrent layer: unlike a feed-forward layer it has feedback, which
# suits sequential input such as text. 128 units, with dropout applied both to
# the inputs and to the recurrent state.
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))

# Output layer: a regular densely connected layer with 6 sigmoid units, one per
# entry of list_classes, so every label gets its own independent score.
model.add(Dense(6, activation='sigmoid'))

# Compile the model. binary_crossentropy pairs with the sigmoid outputs:
# each of the 6 labels is scored as a separate yes/no decision.
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

print('Train...')
# Everything is set: fit the model to the data.
# (The call below had been merged into the preceding comment line, leaving its
# keyword arguments orphaned, so the cell could not run and the model was never
# trained.)
# validation_split=0.1 holds out 10% of the training data for validation.
model.fit(X_t, y,
          batch_size=32,
          epochs=2,
          validation_split=0.1)


In [None]:
# Run the trained model on the padded test comments; y_test receives the
# model's output for the test data.
y_test = model.predict(x=X_te)

In [None]:
# Create the submission file: the test ids followed by the model output for
# each label in list_classes.
# Start from an empty frame so the column order is fixed: 'id' first, then the labels.
submission_df = pd.DataFrame(columns=['id'] + list_classes)
# .values drops the index of `test`, so the ids are assigned purely by position.
submission_df['id'] = test['id'].values 
# y_test is the array returned by model.predict(X_te); presumably one column
# per label, in list_classes order, matching the 6-unit output layer -- verify.
submission_df[list_classes] = y_test 
# index=False keeps the DataFrame row index out of the CSV.
submission_df.to_csv("./Prediction_Results.csv", index=False)