In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Use the %pip magic (not !pip) so the package is installed into the environment of the
# running kernel; -q keeps the install log out of the notebook output.
%pip install -q beautifulsoup4

In [None]:
# import text_hammer as th
import collections
import random
import re

import matplotlib.image as mpimg
import nltk
import tensorflow as tf
from bs4 import BeautifulSoup
from keras.layers import Dense, LSTM, Dropout, Flatten, GRU, Bidirectional
from keras.layers.embeddings import Embedding
from keras.losses import BinaryCrossentropy
from keras.metrics import AUC
from keras.models import Sequential
from keras.preprocessing import sequence
from  matplotlib import pyplot as plt
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers, Input
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.optimizers import Adam, SGD

%matplotlib inline

In [None]:
# Loading pretrained glove word embeddings
# !wget http://nlp.stanford.edu/data/glove.6B.zip
# !unzip -q glove.6B.zip

In [None]:
# nltk.download()

In [None]:
# Loading the word embeddings
def read_glove_vecs(path_to_glove_file="../input/glove50/glove.6B.50d.txt"):
    """Load GloVe word vectors from a whitespace-separated text file.

    Each non-blank line is expected to hold a word followed by its vector components.

    Args:
        path_to_glove_file: Path of the GloVe text file. Defaults to the 50-d file
            this notebook has always read.

    Returns:
        A tuple ``(word_to_index, index_to_word, embeddings_index)`` where
        ``word_to_index`` maps each word to an integer index, ``index_to_word`` is the
        inverse mapping, and ``embeddings_index`` maps each word to its float32 vector.
        Indices start at 1 (in file order): index 0 is deliberately left unused so it
        can stand for padding / unknown words without aliasing a real word's vector.
    """
    embeddings_index = {}
    # GloVe files are UTF-8; be explicit rather than relying on the platform default.
    with open(path_to_glove_file, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines instead of failing on the unpack below
            word, coefs = line.split(maxsplit=1)
            embeddings_index[word] = np.fromstring(coefs, "f", sep=" ")

    word_to_index = {word: i for i, word in enumerate(embeddings_index, start=1)}
    index_to_word = {i: word for word, i in word_to_index.items()}

    print("Found %s word vectors." % len(embeddings_index))
    return word_to_index, index_to_word, embeddings_index

In [None]:
# Load the GloVe lookups. NOTE: despite its name, `embeddings_matrix` receives the third
# value returned by read_glove_vecs(), which is a dict mapping each word to its vector.
word_to_index, index_to_word, embeddings_matrix = read_glove_vecs()

In [None]:
# Competition files all live in the same input directory.
_JIGSAW_DIR = "../input/jigsaw-toxic-comment-classification-challenge"
TRAIN_PATH = _JIGSAW_DIR + "/train.csv.zip"
TEST_PATH = _JIGSAW_DIR + "/test.csv.zip"
SAMPLE_PATH = _JIGSAW_DIR + "/sample_submission.csv.zip"
# Number of word indices kept per comment when comments are turned into index sequences.
MAX_LENGTH = 1000

In [None]:
# Read the three competition CSVs, then print the first rows of the train and test frames.
train_df, test_df, sample_df = map(pd.read_csv, (TRAIN_PATH, TEST_PATH, SAMPLE_PATH))
print(train_df.head(), test_df.head(), sep="\n")

In [None]:
# Show the rows whose comment text is missing. NaN never compares equal to anything
# (itself included), so a mask built with `== float("NaN")` is always empty; isna() is
# the test that actually finds missing values.
train_df[train_df["comment_text"].isna()]

In [None]:
def review_to_words(raw_review_df, colname):
    """Return a copy of ``raw_review_df`` with the text column ``colname`` cleaned.

    Each text goes through: HTML removal, replacement of every non-letter character
    with a space, lower-casing, removal of English stop words, and re-joining of the
    remaining words with single spaces.

    The input frame is not modified. The previous implementation assigned through
    chained indexing (``df[col][i] = ...``), which silently rewrote the caller's frame
    and only worked when the index labels were exactly 0..n-1.

    Args:
        raw_review_df: DataFrame holding the raw texts.
        colname: Name of the column containing the texts (strings).

    Returns:
        A new DataFrame identical to the input except that ``colname`` holds the
        cleaned strings.
    """
    # Loop-invariant: build the stop-word set once, not once per row.
    stops = set(stopwords.words("english"))
    total = len(raw_review_df[colname])

    cleaned_texts = []
    for i, raw_text in enumerate(raw_review_df[colname]):
        if i % 1000 == 0:
            print("Training {} / {}".format(i, total))
        # 1. Remove HTML (parser named explicitly so the result does not depend on
        #    which optional parsers happen to be installed).
        text = BeautifulSoup(raw_text, "html.parser").get_text()
        # 2. Remove non-letters
        text = re.sub("[^a-zA-Z]", " ", text)
        # 3. Convert to lower case, split into individual words
        words = text.lower().split()
        # 4. Remove stop words and join the rest back into one space-separated string
        cleaned_texts.append(" ".join(w for w in words if w not in stops))

    clean_df = raw_review_df.copy()
    clean_df[colname] = cleaned_texts
    return clean_df

In [None]:
# train_clean_df = review_to_words(train_df.iloc[:3], "comment_text")

In [None]:
# Print the first cleaned training comment and show how many space-separated pieces it has.
# NOTE(review): `train_clean_df` is only assigned in later cells of this notebook (the
# earlier assignment is commented out), so this cell fails on a fresh top-to-bottom run —
# confirm the intended cell order.
print(train_clean_df["comment_text"][0])
len(train_clean_df["comment_text"][0].split(" "))

In [None]:
# Longest raw training comment, measured in whitespace-separated tokens.
print("max len of train comments", max(len(text.split()) for text in train_df.comment_text))

In [None]:
# lengths and their count to set upper limit of the length of comment
def plot_length_dict(df):
    """Plot the distribution of comment lengths found in ``df["comment_text"]``.

    A comment's length is the number of pieces obtained by splitting it on single
    spaces. The histogram is drawn over the length of every comment, so bar heights
    reflect how many comments have a given length. The previous version counted the
    first occurrence of each length as 0 and then histogrammed only the distinct
    lengths, discarding the counts altogether.

    Args:
        df: DataFrame with a string column named "comment_text".
    """
    comment_lengths = [len(comment.split(" ")) for comment in df["comment_text"]]
    plt.hist(comment_lengths, bins=50)
    plt.xlabel("Comment length (space-separated tokens)")
    plt.ylabel("Number of comments")
    plt.show()

In [None]:
# Plot comment lengths for the cleaned training set.
# NOTE(review): `train_clean_df` is first assigned in a later cell — confirm cell order.
plot_length_dict(train_clean_df)

In [None]:
# Plot comment lengths for the cleaned test set.
# NOTE(review): `test_clean_df` is first assigned in a later cell — confirm cell order.
plot_length_dict(test_clean_df)

In [None]:
# Run the text-cleaning routine over every training comment and preview the first rows.
train_clean_df = review_to_words(train_df, "comment_text")
train_clean_df.head()

In [None]:
# Run the same text-cleaning routine over every test comment and preview the first rows.
test_clean_df = review_to_words(test_df, "comment_text")
test_clean_df.head()

In [None]:
# Persist the cleaned frames as CSV files (index column omitted) in the working directory.
train_clean_df.to_csv("train_clean_2.csv", index=False)
test_clean_df.to_csv("test_clean_2.csv", index=False)

In [None]:
# Reload the cleaned data sets from the attached dataset.
# A comment that became empty during cleaning is stored in the CSV as an empty field, and
# read_csv parses empty fields as NaN (a float). Turn those back into "" so that the later
# per-comment .split() / .lower() calls do not fail on a float.
train_clean_df = pd.read_csv("../input/toxiccleanset/train_clean_2.csv").fillna({"comment_text": ""})
test_clean_df = pd.read_csv("../input/toxiccleanset/test_clean_2.csv").fillna({"comment_text": ""})

In [None]:
# Longest cleaned comment (in whitespace-separated tokens) for each data set.
print("max len of train comments", max(len(text.split()) for text in train_clean_df.comment_text))
print("max len of test comment", max(len(text.split()) for text in test_clean_df.comment_text))

In [None]:
# Hold out 30% of the cleaned training data for evaluation.
# random_state is fixed so the shuffle, and therefore every downstream number, is the
# same on each Restart & Run All.
train_set, eval_set = train_test_split(train_clean_df, test_size=0.3, shuffle=True, random_state=42)
print("X_train shape: " + str(train_set.shape))
print("X_test shape: " + str(eval_set.shape))

In [None]:
def sentences_to_indices(X, word_to_index, max_len):
    """Convert an array of sentences into a zero-padded matrix of word indices.

    Each sentence is lower-cased and split on whitespace. Word ``j`` of sentence ``i``
    is written to ``result[i, j]``; words missing from ``word_to_index`` are encoded
    as 0, positions past the end of a sentence stay 0, and words beyond ``max_len``
    are dropped.

    Args:
        X: 1-D array of strings, shape ``(m,)``.
        word_to_index: dict mapping a lower-case word to its integer index.
        max_len: Number of columns of the result (maximum words kept per sentence).

    Returns:
        ``np.ndarray`` of shape ``(m, max_len)`` and dtype ``int64``.
    """
    m = X.shape[0]
    # Allocate as int64 directly instead of filling a float buffer and casting afterwards.
    X_indices = np.zeros((m, max_len), dtype=np.int64)
    for i in range(m):
        # Truncate to max_len (previously a hard-coded 1000, which overran the row
        # whenever max_len was smaller than that).
        sentence_words = X[i].lower().split()[:max_len]
        for j, w in enumerate(sentence_words):
            # Unknown words map to 0; an explicit lookup default replaces the bare except.
            X_indices[i, j] = word_to_index.get(w, 0)
    return X_indices

In [None]:
# Encode the train and eval comments as fixed-width (MAX_LENGTH) rows of word indices,
# then print the first five rows of each matrix as a sanity check.
X_train_ind = sentences_to_indices(np.array(train_set.comment_text), word_to_index, max_len = MAX_LENGTH)
X_eval_ind = sentences_to_indices(np.array(eval_set.comment_text), word_to_index, max_len = MAX_LENGTH)
print(X_train_ind[:5, : ])
print(X_eval_ind[:5, : ])

In [None]:
# np.count_nonzero(X_train_ind[0][120] == 0)

In [None]:
# Since the length of each comment is very large and will require significant resources to train the model, 
# we will compress the input and remove the zeros in between.
# But before that we will attempt to train the large model with a gpu and see the difference in accuracies.

In [None]:
# Shape of the encoded evaluation matrix: (number of eval comments, MAX_LENGTH).
X_eval_ind.shape

In [None]:
# Targets are every column that is neither the row id nor the comment text.
y_train = np.array(train_set.drop(columns=["id", "comment_text"]))
y_eval = np.array(eval_set.drop(columns=["id", "comment_text"]))
print(f"y_train: {y_train.shape}")
print(f"y_test: {y_eval.shape}")

In [None]:
# Display the training target array.
y_train

In [None]:
# Wrap the index matrices as int64 tensors and print their shapes.
X_train_tensor = tf.convert_to_tensor(X_train_ind, dtype=np.int64)
X_eval_tensor = tf.convert_to_tensor(X_eval_ind, dtype=np.int64)
print(X_train_tensor.shape, X_eval_tensor.shape, sep="\n")

In [None]:
def pretrained_embedding_layer(word_to_vec_map, word_to_index):
    """Create a non-trainable Embedding layer whose weights are the pretrained vectors.

    Args:
        word_to_vec_map: dict mapping each word to its vector; must contain "the",
            whose vector length is used as the embedding dimension.
        word_to_index: dict mapping each word to the row it occupies in the weight matrix.

    Returns:
        A built Embedding layer (``trainable=False``) with
        ``len(word_to_index) + 1`` rows.
    """
    # One row more than there are words; rows that no word maps to remain all-zero.
    n_rows = len(word_to_index) + 1
    n_cols = word_to_vec_map["the"].shape[0]

    weights = np.zeros((n_rows, n_cols))
    for token in word_to_index:
        weights[word_to_index[token], :] = word_to_vec_map[token]

    frozen_embedding = Embedding(n_rows, n_cols, trainable=False)
    # build() is called first so the layer has a weight for set_weights() to overwrite.
    frozen_embedding.build((None,))
    frozen_embedding.set_weights([weights])

    return frozen_embedding

In [None]:
def create_model(input_shape, word_to_vec_map, word_to_index):
    """Build the comment classifier: frozen embeddings -> LSTM -> dropout -> 6 sigmoid outputs.

    Args:
        input_shape: Length of each input index sequence, given either as an int or
            as a shape tuple.
        word_to_vec_map: dict mapping each word to its pretrained vector.
        word_to_index: dict mapping each word to its row in the embedding matrix.

    Returns:
        An uncompiled keras ``Sequential`` model that outputs 6 values in [0, 1] per
        input sequence. Because these are probabilities (not logits), a loss used
        with this model should be configured accordingly.
    """
    # Accept a bare sequence length as well as a proper shape tuple.
    if isinstance(input_shape, int):
        input_shape = (input_shape,)

    model = Sequential()

    model.add(Input(shape=input_shape))
    model.add(pretrained_embedding_layer(word_to_vec_map, word_to_index))

    model.add(LSTM(750, return_sequences=False))    # try 700 hidden units
    model.add(Dropout(0.4))
    # sigmoid, not softmax: this notebook trains the model with binary cross-entropy and a
    # multi-label AUC, i.e. each of the 6 outputs is an independent yes/no label. Softmax
    # would force the 6 outputs to sum to 1, so at most one could ever exceed 0.5.
    model.add(Dense(6, activation="sigmoid"))

    return model

In [None]:
# Instantiate the network for MAX_LENGTH-long index sequences. `embeddings_matrix` is passed
# as the word -> vector mapping and `word_to_index` as the word -> row mapping.
model = create_model(MAX_LENGTH, embeddings_matrix, word_to_index)

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
optimizer = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-07, amsgrad=False, name="Adam")
# The model's final Dense layer already applies an activation, so the loss receives
# probabilities rather than raw logits. from_logits must therefore be False; with True the
# loss would squash the already-normalised outputs through a sigmoid a second time.
loss_fn = BinaryCrossentropy(from_logits=False)
# One AUC per label, averaged (multi_label=True), plus plain accuracy.
metrics = AUC(multi_label=True)
model.compile(optimizer=optimizer, loss=loss_fn, metrics=[metrics, "accuracy"])

In [None]:
# Number of training epochs; also used to build the x-axis of the training plots.
EPOCHS = 5

In [None]:
# # detect and init the TPU
# tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect()

# # instantiate a distribution strategy
# tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)

# # instantiating the model in the strategy scope creates the model on the TPU
# with tpu_strategy.scope():
#     model = create_model(MAX_LENGTH, embeddings_matrix, word_to_index)
#     optimizer = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-07, amsgrad=False, name="Adam")
#     loss_fn = BinaryCrossentropy(from_logits=True)
#     model.compile(optimizer=optimizer, loss=loss_fn, metrics=["accuracy"])

# # train model normally
# history = model.fit(X_train_tensor, y_train, epochs=EPOCHS, batch_size=3)

In [None]:
# Train for EPOCHS epochs in batches of 250. No validation data is passed here, so
# `history` only contains training metrics.
history = model.fit(X_train_tensor, y_train, epochs=EPOCHS, batch_size=250)

In [None]:
# Display the metrics recorded during training.
history.history

In [None]:
# Pull the training loss and accuracy series out of the fit history for plotting.
loss = history.history["loss"]
acc = history.history["accuracy"]

In [None]:
# Training loss per epoch.
epoch = np.arange(EPOCHS)
plt.plot(epoch, loss)
# plt.plot(epoch, val_loss)
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Training Loss')
# Only the training curve is drawn (the validation line above is commented out), so the
# legend lists just that one entry instead of a 'val' label with no matching line.
plt.legend(['train']);

In [None]:
# Training accuracy per epoch; the trailing semicolon hides the text repr of the last call.
epoch = np.arange(EPOCHS)
plt.plot(epoch, acc)
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.title('Training Accuracy');

In [None]:
# Save the trained model under "ToxicCommentsv2-BS4-re-nltk" (relative to the working directory).
model.save("ToxicCommentsv2-BS4-re-nltk")

In [None]:
# Bundle the saved model folder into a gzip-compressed tar archive
# (ToxicCommentsv2-BS4-re-nltk.tar.gz, written to the current directory).
!tar -zcvf ToxicCommentsv2-BS4-re-nltk.tar.gz /kaggle/working/ToxicCommentsv2-BS4-re-nltk

In [None]:
# Score the model on the held-out split and print whatever evaluate() returns
# (for a model compiled with metrics this is typically the loss followed by each metric).
eval_score = model.evaluate(X_eval_tensor, y_eval)
print(eval_score)

In [None]:
# The evaluation comments are already encoded as word indices in X_eval_tensor, so use that
# tensor directly. Feeding it through sentences_to_indices again fails, because that
# function expects raw strings it can lower-case and split, not integer rows.
x_pred = X_eval_tensor

In [None]:
# Run the model on the encoded evaluation inputs and display the raw output scores.
predicted = model.predict(x_pred)
predicted

In [None]:
# Binarise the scores: 1 where the predicted value exceeds 0.5, otherwise 0.
y_predicted = np.where(predicted>0.5, 1, 0)
y_predicted

In [None]:
# `y_predicted` has one column per model output. Reshaping it to (1, n_rows) is only valid
# when there is exactly one column and raises ValueError otherwise, so flatten to 1-D only
# in that case and keep the (n_rows, n_outputs) layout for a multi-output model.
if y_predicted.ndim == 2 and y_predicted.shape[1] == 1:
    y_predicted = y_predicted[:, 0]
y_predicted