In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline 
import math

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the three zipped competition CSVs: the training comments (with label
# columns), the test comments, and the test labels file.
train, X_test, Y_test = (
    pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/train.csv.zip'),
    pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/test.csv.zip'),
    pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/test_labels.csv.zip'),
)

In [None]:
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Download the WordNet data through NLTK's downloader.
nltk.download('wordnet')

# Instantiate the Porter stemmer and the WordNet lemmatiser.
stemmer = PorterStemmer()
lemmatiser = WordNetLemmatizer()

In [None]:
def format(test, max_chars=200):
    """Normalise the ``comment_text`` column of ``test`` in place.

    Clean-up steps, applied in this order:

    1. every character that is neither a word character nor whitespace
       becomes a single space;
    2. newlines become spaces;
    3. the text is lower-cased;
    4. leading and trailing whitespace is stripped.

    The patterns are passed with ``regex=True`` explicitly. Without it,
    pandas >= 2.0 treats them as literal text and the first two steps
    silently do nothing.

    Parameters
    ----------
    test : pandas.DataFrame
        Frame with a string ``comment_text`` column. The column is
        overwritten in place; no rows are removed from ``test`` itself.
    max_chars : int, default 200
        Length limit used for the returned frame only.

    Returns
    -------
    pandas.DataFrame
        The rows of ``test`` whose cleaned text is shorter than
        ``max_chars`` characters. Previously this selection was bound to a
        local name and discarded. Callers that ignore the return value keep
        the old behaviour (in-place clean-up, all rows kept).
    """
    text = test['comment_text']
    text = text.str.replace(r'[^\w\s]', ' ', regex=True)
    text = text.str.replace(r'[\n]', ' ', regex=True)
    text = text.str.lower().str.strip()
    test['comment_text'] = text

    # Optional stemming/lemmatisation step, currently disabled:
    #test['comment_text']=test['comment_text'].apply(lambda x: " ".join(stemmer.stem(lemmatiser.lemmatize(word,pos="v")) for word in x.split()))

    # Missing comments compare False against the limit and are left out here.
    return test[test['comment_text'].str.len().lt(max_chars)]
# Apply the text clean-up to the training and test frames. format() rewrites
# the comment_text column of the frame it is given, so no assignment is needed.
for frame in (train, X_test):
    format(frame)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import tensorflow as tf
import re

# Tokenizer settings: vocabulary cap and the fixed sequence length that every
# comment is padded/truncated to.
vocab_size = 6000
max_length = 61

# Fit the tokenizer on the training comments, with '<OOV>' as the out-of-vocabulary token.
tokenizer = Tokenizer(num_words=vocab_size, split=' ', oov_token='<OOV>')
train_texts = train['comment_text'].values
tokenizer.fit_on_texts(train_texts)

# Integer-encode the training comments and bring them all to max_length.
X_train = pad_sequences(tokenizer.texts_to_sequences(train_texts), maxlen=max_length)

# Target matrix: one column per label, in this fixed order.
label_frame = train[["toxic","severe_toxic","obscene","threat","insult","identity_hate"]]
Y_train = pd.get_dummies(label_frame).values

In [None]:
# NOTE(review): `initializer` and `lstm_out` are not referenced by any layer
# below (the Dense layers use the 'random_normal' string and both LSTMs use
# 128 units). They are kept in case other cells rely on them.
initializer = tf.keras.initializers.RandomNormal(mean=0., stddev=1.)
embed_dim = 196  #128
lstm_out = 196

# Embedding -> dropout -> two tanh Dense layers -> two stacked LSTMs -> six
# sigmoid outputs, one per label column of Y_train.
model = Sequential([
    Embedding(vocab_size, embed_dim, input_length=max_length),
    SpatialDropout1D(0.4),
    Dense(128, activation='tanh', kernel_initializer='random_normal'),
    Dense(128, activation='tanh', kernel_initializer='random_normal'),
    # The first LSTM returns the full sequence so the second LSTM can consume it.
    # This is a real boolean: the former string 'true' only worked because any
    # non-empty string is truthy.
    LSTM(128, activation='tanh', return_sequences=True),
    LSTM(128, activation='tanh'),
    Dense(6, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

In [None]:
# Mini-batch size; also read by testModel() when evaluating.
batch_size = 32

# Train for one pass over the padded training sequences.
model.fit(
    x=X_train,
    y=Y_train,
    batch_size=batch_size,
    epochs=1,
    verbose=1,
)

In [None]:
def testModel(X_test, Y_test):
    """Evaluate the trained model and print its loss ("score") and accuracy.

    Parameters
    ----------
    X_test : frame with a ``comment_text`` column holding the comments to score.
    Y_test : frame holding the six label columns used as targets.

    Relies on the module-level ``tokenizer``, ``max_length``, ``model`` and
    ``batch_size``. Returns nothing.
    """
    label_columns = ["toxic","severe_toxic","obscene","threat","insult","identity_hate"]

    # Encode the comments with the fitted tokenizer and pad them to the
    # sequence length the model was built with.
    token_ids = tokenizer.texts_to_sequences(X_test['comment_text'])
    model_input = pad_sequences(token_ids, maxlen=max_length)

    score, acc = model.evaluate(
        model_input,
        Y_test[label_columns],
        verbose = 1,
        batch_size = batch_size,
    )
    print("score: %.2f" % (score))
    print("acc: %.2f" % (acc))

# Read the test comments and the test labels from the zipped CSVs again, so
# this cell starts from freshly loaded frames.
X_test, Y_test = (
    pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/test.csv.zip'),
    pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/test_labels.csv.zip'),
)
    
def formatTest(test):
    """Normalise the ``comment_text`` column of ``test`` in place.

    Every character that is neither a word character nor whitespace is
    replaced by a space, newlines are replaced by spaces, and the text is
    then lower-cased and stripped of leading/trailing whitespace.

    The patterns are passed with ``regex=True`` explicitly. Without it,
    pandas >= 2.0 treats them as literal text and the replacements silently
    do nothing.

    Parameters
    ----------
    test : pandas.DataFrame
        Frame with a string ``comment_text`` column; the column is overwritten.

    Returns
    -------
    None
    """
    cleaned = test['comment_text'].str.replace(r'[^\w\s]', ' ', regex=True)
    cleaned = cleaned.str.replace(r'[\n]', ' ', regex=True)
    test['comment_text'] = cleaned.str.lower().str.strip()

# Clean the freshly loaded test comments in place.
formatTest(X_test)

# In test_labels.csv a value of -1 marks a row that was not used for scoring.
# Relabelling those rows as 0 would evaluate the model against invented
# "non-toxic" targets, so only rows with real labels are evaluated.
# Comments and labels are joined on `id`, so the pairing does not depend on
# the two files sharing a row order. Y_test itself is left unmodified.
label_columns = ["toxic","severe_toxic","obscene","threat","insult","identity_hate"]
scored = X_test.merge(Y_test, on="id", how="inner")
scored = scored[scored[label_columns].ne(-1).all(axis=1)]

# `scored` holds both the comment text and the label columns, so it is passed
# as both arguments.
testModel(scored, scored)

In [None]:
def generatePredictions(X_test):
    """Return the model's predictions for the comments in ``X_test``.

    ``X_test`` must have a ``comment_text`` column. The text is encoded with
    the module-level ``tokenizer``, padded to ``max_length`` and passed to
    ``model.predict``, whose result is returned unchanged.
    """
    token_ids = tokenizer.texts_to_sequences(X_test['comment_text'])
    model_input = pad_sequences(token_ids, maxlen=max_length)
    return model.predict(model_input, verbose=1)

# Predict for every row of the test frame.
predictedValues = generatePredictions(X_test)
#my_submission = pd.DataFrame({'id': test.id, 'SalePrice': predicted_values})
# you could use any filename. We choose submission here
#my_submission.to_csv('submission.csv', index=False)
#predictedValues.head()

In [None]:
# Print the first prediction column, i.e. the model's scores for "toxic"
# (the first label in the column order used to build Y_train).
print(predictedValues[:,0])

In [None]:
# Build the submission frame: the test ids followed by one prediction column
# per label, then write it to submission.csv.
# Column i of predictedValues belongs to label_columns[i] (the order used for
# Y_train), so identity_hate reads column 5, not column 4 (insult).
label_columns = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
my_submission = pd.DataFrame({
    'id': X_test.id,
    **{label: predictedValues[:, i] for i, label in enumerate(label_columns)},
})
my_submission.to_csv('submission.csv', index=False)