In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input
# directory, then print one path per line.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the labelled training comments from the zipped competition CSV.
train_csv_path = '../input/jigsaw-toxic-comment-classification-challenge/train.csv.zip'
df = pd.read_csv(train_csv_path)

# Checking the data info.

In [None]:
# Print a summary of the training frame (columns, dtypes, non-null counts).
df.info()

# **Checking the number of columns and samples**

In [None]:
# (number of rows, number of columns) of the training frame.
df.shape

# ***Checking for null values, if they exist***

In [None]:
# Per-column flag: True when the column contains at least one missing value.
df.isna().any()

# ***Checking for duplicate rows***

In [None]:
# Count fully duplicated rows. The boolean mask is summed with pandas'
# vectorised ``.sum()`` rather than the builtin ``sum``, which would iterate
# over every row in Python.
df.duplicated().sum()

In [None]:
# Preview the first 30 rows of the training frame.
df.head(30)

In [None]:
# Preview the last 30 rows of the training frame.
df.tail(30)

In [None]:
# Column labels of the training frame.
df.columns

In [None]:
# Print the column names as a 1-based numbered list.
for position, column_name in enumerate(df.columns, start=1):
    print(f'{position}-{column_name}')


In [None]:
# Sizes of a 90% / 10% train / validation split of the training frame.
samples = df.shape[0]

# int() truncates, so any remainder row goes to the validation part.
train = int(samples * 0.9)

val = samples - train

# The third figure is the validation-set size, so it is labelled as such.
print(f'Number of samples:{samples}\nNumber of Training Examples:{train}\nNumber of Validation Examples:{val}')

In [None]:
# Training split: the first `train` rows (positional slice, no shuffling).
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat',
                 'insult', 'identity_hate']

X_train = df['comment_text'].iloc[:train]

# One column per label, returned as a plain NumPy array.
y_train = df[label_columns].iloc[:train].to_numpy()

In [None]:
# Sanity check: inputs and labels of the training split should have the same number of rows.
print(X_train.shape, y_train.shape)

In [None]:
# Validation split: every row from position `train` onwards.
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat',
                 'insult', 'identity_hate']

X_val = df['comment_text'].iloc[train:]
# One column per label, returned as a plain NumPy array.
y_val = df[label_columns].iloc[train:].to_numpy()

In [None]:
# Sanity check: inputs and labels of the validation split should have the same number of rows.
print(X_val.shape, y_val.shape)

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Text-preprocessing and embedding hyper-parameters.
vocab_size = 20000      # tokenizer `num_words` and embedding input dimension
embedding_dim = 16      # size of each embedding vector
max_length = 350        # every sequence is padded/truncated to this many tokens

# Both padding and truncation are applied at the end of the sequence.
trunc_type = padding_type = 'post'

oov_tok = "<UNK>"       # placeholder token for out-of-vocabulary words

In [None]:
def _pad_to_max_length(sequences):
    """Pad/truncate integer sequences to ``max_length`` with the configured modes."""
    return pad_sequences(
        sequences,
        maxlen=max_length,
        padding=padding_type,
        truncating=trunc_type,
    )


# The vocabulary is fitted on the training comments only; the validation text
# is encoded with that same vocabulary.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index

train_sequences = tokenizer.texts_to_sequences(X_train)
train_padded = _pad_to_max_length(train_sequences)

val_sequences = tokenizer.texts_to_sequences(X_val)
val_padded = _pad_to_max_length(val_sequences)

In [None]:
import numpy as np

# pad_sequences already returns NumPy arrays, so np.asarray hands them back
# unchanged; np.array would allocate a second full copy of each matrix.
train_padded = np.asarray(train_padded)
val_padded   = np.asarray(val_padded)

# **Building a recurrent model with stacked bidirectional LSTM layers. Recurrent networks (RNNs, GRUs and LSTMs) are expected to give better results on this task than a simple MLP (multi-layer perceptron), also called a vanilla network.**

## Attention mechanisms and Transformers can give even better results.

In [None]:
# Fix TensorFlow's global random seed before the model is created.
tf.random.set_seed(30)

In [None]:
# Embedding -> two stacked bidirectional LSTMs -> dropout -> dense head with
# one sigmoid unit per label (six independent binary outputs).
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    # return_sequences=True so the second recurrent layer receives the full sequence.
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dropout(0.25),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(6, activation='sigmoid')
])
# The 'accuracy' shortcut is resolved from the output shape: with six output
# units Keras picks categorical (argmax) accuracy, which is misleading for
# multi-label targets. BinaryAccuracy thresholds every label independently.
# It is named 'accuracy' so the history keys ('accuracy', 'val_accuracy')
# stay the same.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy')],
)

# **We use a dropout layer to reduce overfitting to the training data, and we will check for it using the train/validation loss values and graphs**

In [None]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

In [None]:
# Train for a fixed number of epochs, evaluating on the held-out validation
# split after each one; the returned history feeds the plots.
num_epochs = 10
history = model.fit(
    x=train_padded,
    y=y_train,
    validation_data=(val_padded, y_val),
    epochs=num_epochs,
    verbose=1,
)

In [None]:
import matplotlib.pyplot as plt


def plot_graphs(history, string):
    """Plot a training metric and, when present, its validation counterpart.

    Args:
        history: Object exposing a ``history`` mapping from metric name to a
            per-epoch sequence of values (e.g. the return value of
            ``model.fit``).
        string: Name of the metric to plot, such as ``"accuracy"`` or
            ``"loss"``. The validation series is looked up under
            ``"val_" + string``.

    Raises:
        KeyError: If ``string`` is not a key of ``history.history``.
    """
    metrics = history.history
    val_key = 'val_' + string

    plt.plot(metrics[string])
    legend_labels = [string]

    # A history recorded without validation data has no ``val_*`` series;
    # skip that curve instead of failing with a KeyError.
    if val_key in metrics:
        plt.plot(metrics[val_key])
        legend_labels.append(val_key)

    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend(legend_labels)
    plt.show()

In [None]:
# Draw the training/validation curves for each tracked quantity in turn.
for metric_name in ("accuracy", "loss"):
    plot_graphs(history, metric_name)

In [None]:
# Load the unlabelled test comments from the zipped competition CSV.
test_csv_path = '../input/jigsaw-toxic-comment-classification-challenge/test.csv.zip'
df1 = pd.read_csv(test_csv_path)

In [None]:
# Preview the first rows of the test frame.
df1.head()

In [None]:
# Only the raw comment text is taken from the test frame as model input.
X_test = df1['comment_text']
# Displayed as the cell output: number of test comments.
X_test.shape

In [None]:
# Encode the test comments with the already-fitted tokenizer, then pad or
# truncate them to the same fixed length as the training data.
test_sequences = tokenizer.texts_to_sequences(X_test)
test_padded = pad_sequences(
    test_sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

In [None]:
# NOTE: despite its name, y_test holds the model's outputs for the test
# comments (one sigmoid score per label), not ground-truth labels.
y_test = model.predict(test_padded)

In [None]:
# Shape of the prediction array (rows x label scores).
y_test.shape

In [None]:
# Load the sample submission file; it is used as the template for the output.
sample_submission_path = '../input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv.zip'
predictions = pd.read_csv(sample_submission_path)

In [None]:
# Preview the submission template before it is filled in.
predictions.head()

In [None]:
# Write the model outputs into the six label columns of the submission frame;
# the array is assigned row by row in positional order.
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
predictions[label_columns] = y_test

In [None]:
# Preview the submission frame after the label columns were overwritten.
predictions.head()

In [None]:
# Save the submission to the working directory; index=False leaves out the DataFrame index column.
predictions.to_csv('submission1.csv', index = False)