In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Print the full path of every file found anywhere under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np

from sklearn.linear_model import Ridge
from sklearn.feature_extraction.text import TfidfVectorizer

from matplotlib import pyplot as plt 

## Keras
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

## Note:
Please refer to the two notebooks below before getting started; the first half of this code is taken from them.

https://www.kaggle.com/kaushikholla/0-787-regression-baseline-tf-idf

https://www.kaggle.com/kaushikholla/0-752-eda-baseline-model-svm

## Getting the data together

In [None]:
## Load the Jigsaw toxic-comment test comments and, from a separate file, their labels.
df_test = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/test.csv")
df_test_l = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/test_labels.csv")

In [None]:
## The test comments and their labels ship in separate files, so join them on `id`.
df_test = pd.merge(df_test, df_test_l, how = 'left', on = 'id')

## test_labels.csv uses -1 to mark comments that were not used for scoring, i.e. whose
## true labels are unknown. Drop those rows; otherwise the label sum computed later is
## negative and they would silently be treated as non-toxic training examples.
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
df_test = df_test[(df_test[label_cols] != -1).all(axis=1)]
df_test.head()

In [None]:
## Load the labelled training comments.
df_train = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/train.csv")

## Stack the training rows and the merged test rows into a single frame.
df = pd.concat([df_train, df_test])

## Show a few random rows as a sanity check.
df.sample(5)

In [None]:
## Binary target: 1 when the six toxicity label columns sum to more than zero, else 0.
toxicity_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
label_total = df[toxicity_cols].sum(axis=1)
df['y'] = label_total.gt(0).astype(int)

In [None]:
## Spot-check a few random rows, which now include the binary target column `y`.
df.sample(5)

In [None]:
## Class balance of the binary target `y` across the combined frame.
df['y'].value_counts()

In [None]:
## Keep only the comment text and the binary target; the individual label columns are no longer needed.
df_ = df.loc[:, ['comment_text', 'y']]

In [None]:
## Class counts in the two-column frame (comment text + target).
df_['y'].value_counts()

In [None]:
## Split the frame by class: df_one holds rows with y == 1, df_zero rows with y == 0.
is_toxic = df_['y'].eq(1)
is_clean = df_['y'].eq(0)
df_one = df_.loc[is_toxic]
df_zero = df_.loc[is_clean]

In [None]:
## Down-sample the y == 0 rows to the number of y == 1 rows so the classes are balanced.
## The size is derived from the data instead of a hard-coded count (which silently
## unbalances the set, or raises, if the upstream data changes), and the draw is seeded
## so a fresh run reproduces the same sample.
df_sample = df_zero.sample(n = min(len(df_one), len(df_zero)), random_state = 42)

In [None]:
## Build the training frame: every y == 1 row followed by the sampled y == 0 rows.
## NOTE: this rebinds df_sample from its own previous value, so re-running this cell
## without re-running the sampling cell first appends df_one a second time.
df_sample = pd.concat([df_one, df_sample])

In [None]:
## Look at a few random rows of the combined training frame.
df_sample.sample(10)

In [None]:
## Class counts of the training frame after down-sampling.
df_sample['y'].value_counts()

In [None]:
## Plot the distribution of comment lengths (in characters) for the sampled comments.
comment_lengths = df_sample['comment_text'].map(len)

fig, ax = plt.subplots()
ax.hist(comment_lengths, bins=500)
# Label the figure so it can be read on its own when the notebook is skimmed.
ax.set_xlabel("Comment length (characters)")
ax.set_ylabel("Number of comments")
ax.set_title("Length distribution of sampled comments")
plt.show()

In [None]:
## Hyper-parameters used when tokenizing the text and building the model
oov_tok = "<oov>"       # token passed to the Tokenizer as its out-of-vocabulary token
max_length = 1000       # sequence length passed to pad_sequences and the Embedding layer
embedding_dim = 300     # output size of the Embedding layer
# NOTE: padding_type is declared here, but the pad_sequences calls in this notebook
# pass only `maxlen` and `truncating`, so this value is currently not applied.
padding_type = trunc_type = 'post'

In [None]:
## Fit the tokenizer on the sampled training comments and keep its word -> index mapping.
tokenizer = Tokenizer(oov_token = oov_tok)
tokenizer.fit_on_texts(df_sample['comment_text'])
word_index = tokenizer.word_index

In [None]:
## Convert the training comments to integer sequences and force them to length max_length.
## Sequences are truncated according to trunc_type; no `padding` argument is passed, so
## Keras's default padding side applies (padding_type is not used here).
train_seqs = tokenizer.texts_to_sequences(df_sample['comment_text'])
train_padded = pad_sequences(train_seqs,maxlen=max_length, truncating=trunc_type)

In [None]:
## Model architecture: embedding -> two stacked bidirectional LSTMs -> dense head.
vocab_size = len(word_index) + 1  # one more than the number of entries in word_index
keras_layers = tf.keras.layers

model = tf.keras.Sequential([
    keras_layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    # The first BiLSTM returns the full sequence so the next recurrent layer can consume it.
    keras_layers.Bidirectional(keras_layers.LSTM(256, return_sequences=True)),
    # The second BiLSTM returns only its final output.
    keras_layers.Bidirectional(keras_layers.LSTM(128)),
    keras_layers.Dense(128, activation='relu'),
    # Single sigmoid unit: one value in (0, 1) per input for the binary target.
    keras_layers.Dense(1, activation='sigmoid')
])

In [None]:
## Print the layer-by-layer summary of the model.
model.summary()

In [None]:
## Compile the model: binary cross-entropy loss (the target `y` has two classes),
## the Adam optimizer, and accuracy as the reported metric.
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [None]:
## Convert the target column to a NumPy array before passing it to the TensorFlow model.
train_labels = np.array(df_sample["y"])

In [None]:
## Make sure a GPU is enabled before running this cell; without one an epoch takes about 19 minutes.
num_epochs = 10

## Shuffle the rows before fitting. Keras takes `validation_split` from the END of the
## arrays, and df_sample was built as all y == 1 rows followed by all y == 0 rows, so an
## unshuffled split would hold out a single class. The permutation is seeded so the
## split is the same on every run.
shuffle_idx = np.random.default_rng(42).permutation(len(train_labels))

## Early stopping on the held-out loss: monitoring the training loss cannot detect
## over-fitting, because it keeps falling while the model memorises the data.
## restore_best_weights rolls the model back to the epoch with the lowest val_loss.
callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=2,
    restore_best_weights=True
)
history = model.fit(
    train_padded[shuffle_idx],
    train_labels[shuffle_idx],
    epochs=num_epochs,
    batch_size = 256,
    validation_split=0.1,
    callbacks=[callback]
    )

## Comments to Score

In [None]:
## Read the comments that need to be scored.
## NOTE: this rebinds `df_`, which earlier held the comment_text / y training frame.
df_ = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")

In [None]:
## Convert the comments to score with the tokenizer fitted on the training comments,
## using the same max_length and truncation settings as the training sequences.
X_ = tokenizer.texts_to_sequences(df_['text'])
test_padded = pad_sequences(X_, maxlen=max_length, truncating=trunc_type)

In [None]:
## Score every padded comment with the model; the output layer is a single sigmoid unit,
## so each prediction is one value between 0 and 1.
pred = model.predict(test_padded)

In [None]:
## Inspect the raw prediction array.
pred

In [None]:
## Attach the predictions as the `score` column. The model ends in Dense(1), so predict
## returns one column per row; flatten it to 1-D explicitly rather than relying on
## pandas accepting a two-dimensional array for a single column.
df_['score'] = np.ravel(pred)

In [None]:
## Display the comments together with their predicted scores.
df_

In [None]:
## Read the text of the fifth comment (position 4) as a spot check.
df_['text'].iloc[4]

In [None]:
## Write only the comment_id and score columns to submission.csv, without the DataFrame index.
df_[['comment_id', 'score']].to_csv("submission.csv", index = False)