In [None]:
!pip install tensorflow -q --upgrade
!pip install numpy -q --upgrade
!pip install pandas -q --upgrade
!pip install sklearn -q --upgrade

In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip

In [None]:
from zipfile import ZipFile
# Only the 100-dimensional vectors are read later in the notebook, so extract
# just that member, and put it in the directory it is later opened from
# instead of relying on the current working directory.
with ZipFile("/kaggle/working/glove.6B.zip", 'r') as zip_ref:
    zip_ref.extract("glove.6B.100d.txt", "/kaggle/working")

In [None]:
import nltk
import string
import re
import pandas as pd
import numpy as np
import tensorflow as tf

from sklearn.model_selection import train_test_split

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
def _read_zipped_csv(archive_path, member):
    """Read one CSV member out of a zip archive into a DataFrame.

    Both the archive and the member handle are closed before returning, so no
    file descriptors are left open after the data has been loaded.

    Args:
        archive_path: Path of the ``.zip`` file.
        member: Name of the CSV file inside the archive.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    with ZipFile(archive_path) as archive, archive.open(member) as handle:
        return pd.read_csv(handle)


_JIGSAW_DIR = "/kaggle/input/jigsaw-toxic-comment-classification-challenge"

test_data = _read_zipped_csv(f"{_JIGSAW_DIR}/test.csv.zip", "test.csv")
test_labels_data = _read_zipped_csv(f"{_JIGSAW_DIR}/test_labels.csv.zip", "test_labels.csv")
train_data = _read_zipped_csv(f"{_JIGSAW_DIR}/train.csv.zip", "train.csv")
sample_submission_data = _read_zipped_csv(
    f"{_JIGSAW_DIR}/sample_submission.csv.zip", "sample_submission.csv"
)

In [None]:
# Preview the first rows of the test comments frame.
test_data.head()

In [None]:
# Preview the first rows of the frame loaded from test_labels.csv.
test_labels_data.head()

In [None]:
# Preview the first rows of the sample submission frame.
sample_submission_data.head()

In [None]:
# Preview the first rows of the training frame.
train_data.head()

In [None]:
# The six target columns, kept together as the label matrix source.
label_columns = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]
train_data_labels = train_data[label_columns]

train_data_labels.head()

**Preprocessing**

In [None]:
def preprocess_text(sen: str) -> str:
    """Normalise a raw comment to letters separated by single spaces.

    Steps, in order:
      1. Every character that is not an ASCII letter (punctuation, digits,
         newlines, ...) is replaced by a space.
      2. Isolated single letters that have whitespace on both sides are
         removed.
      3. Runs of whitespace are collapsed to one space.

    Args:
        sen: The raw text.

    Returns:
        The cleaned text. Leading/trailing spaces are not stripped, and a
        single letter at the very start or end of the text is kept.
    """
    # Remove punctuations and numbers (this also turns newlines into spaces)
    sentence = re.sub('[^a-zA-Z]', ' ', sen)

    # Single character removal. The trailing whitespace is only looked at, not
    # consumed, so it can serve as the leading whitespace of the next match;
    # otherwise every second letter in a run like "a b c" would survive.
    sentence = re.sub(r"\s+[a-zA-Z](?=\s)", '', sentence)

    # Removing multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)

    return sentence

In [None]:
# Clean every training comment; the label columns become a plain array.
sentences = list(train_data["comment_text"])
X = [preprocess_text(text) for text in sentences]

y = train_data_labels.values

In [None]:
# Hold out 20% of the samples; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

In [None]:
# Fit the vocabulary on the training split only.
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

# NOTE(review): vocab_size is derived from the full word_index while the
# tokenizer is configured with num_words=5000 -- presumably the index holds
# more words than are ever emitted; confirm before shrinking the embedding.
vocab_size = len(tokenizer.word_index) + 1

maxlen = 200

# Convert each split to integer sequences and pad them (at the end) to maxlen.
X_train, X_test = (
    tf.keras.preprocessing.sequence.pad_sequences(
        tokenizer.texts_to_sequences(texts),
        padding='post',
        maxlen=maxlen,
    )
    for texts in (X_train, X_test)
)

**Embeddings**

In [None]:
from numpy import array
from numpy import asarray
from numpy import zeros

# Parse the GloVe text file into a {word: vector} lookup. Each line is a word
# followed by its vector components, separated by whitespace.
embeddings_dictionary = dict()

# The context manager closes the file even if a line fails to parse.
with open("/kaggle/working/glove.6B.100d.txt", encoding="utf8") as glove_file:
    for line in glove_file:
        records = line.split()
        word = records[0]
        vector_dimensions = asarray(records[1:], dtype='float32')
        embeddings_dictionary[word] = vector_dimensions

# Row i holds the GloVe vector of the word whose tokenizer index is i; words
# missing from GloVe (and the unused row 0) stay all-zero.
embedding_matrix = zeros((vocab_size, 100))
for word, index in tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

In [None]:
# Model: frozen pre-trained embeddings -> single LSTM -> six sigmoid outputs
# (one independent probability per label).
deep_inputs = tf.keras.layers.Input(shape=(maxlen,))

# Load the pre-trained vectors through a Constant initializer rather than the
# legacy `weights=` constructor kwarg, which not every Keras release accepts.
embedding_layer = tf.keras.layers.Embedding(
    vocab_size,
    100,
    embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
    trainable=False,
)(deep_inputs)
LSTM_Layer_1 = tf.keras.layers.LSTM(128)(embedding_layer)
dense_layer_1 = tf.keras.layers.Dense(6, activation='sigmoid')(LSTM_Layer_1)
model = tf.keras.models.Model(inputs=deep_inputs, outputs=dense_layer_1)

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

In [None]:
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
model.summary()

In [None]:
# Train for one epoch, reserving 20% of the training data for validation.
history = model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    epochs=1,
    batch_size=128,
    verbose=1,
)

In [None]:
# Evaluate on the held-out split; the first two entries of the result are
# reported as loss and accuracy.
score = model.evaluate(X_test, y_test, verbose=1)
test_loss, test_accuracy = score[0], score[1]

print("Test Score:", test_loss)
print("Test Accuracy:", test_accuracy)

In [None]:
# Apply the same cleaning used for the training comments before tokenising;
# feeding raw text here would give the model differently tokenised input at
# inference time than it saw during training.
X_test_data = tokenizer.texts_to_sequences(
    [preprocess_text(comment) for comment in test_data.comment_text]
)

X_test_data = tf.keras.preprocessing.sequence.pad_sequences(X_test_data, padding='post', maxlen=maxlen)

In [None]:
# Run the trained model on the padded competition test sequences.
preds = model.predict(X_test_data)

In [None]:
# Wrap the predictions in a DataFrame; columns get default integer labels,
# which a later cell renames to the label names.
submission = pd.DataFrame(preds)

In [None]:
# Preview the first rows of the raw prediction frame.
submission.head()

In [None]:
# new_sub = [[submission.id]]
# new_sub

In [None]:
# Attach the comment ids, then replace the positional column labels 0..5 with
# the label names in the same order.
submission["id"] = test_data.id

label_names = (
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
)
submission = submission.rename(columns=dict(enumerate(label_names)))
submission

In [None]:
# Binarise every label column: 1 when the predicted value exceeds 0.35, else 0.
# NOTE(review): if the submission is scored with a ranking metric such as
# ROC AUC, the raw probabilities may be preferable -- confirm.
for label in ("toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"):
    submission[label] = [int(value > 0.35) for value in submission[label]]

In [None]:
# Write the submission to submission.csv, omitting the DataFrame index.
submission.to_csv("submission.csv",index=False)