In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input
# directory (recursively), then print them one per line.
kaggle_input_files = (
    os.path.join(folder, file_name)
    for folder, _, files_here in os.walk('/kaggle/input')
    for file_name in files_here
)
for input_file_path in kaggle_input_files:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os 
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import RNN,LSTM,GRU,SimpleRNN
from tensorflow.keras.layers import Embedding,Dense,GlobalAveragePooling1D

import matplotlib.pyplot as plt
import tqdm.notebook as tqdm

from sklearn.model_selection import train_test_split
from sklearn import metrics

In [None]:
# Single seed shared by every source of randomness in the notebook.
RANDOM_STATE = 12

# Seed NumPy and TensorFlow too: without this only the train/validation split
# is reproducible, while weight initialisation and batch shuffling change on
# every run.
np.random.seed(RANDOM_STATE)
tf.random.set_seed(RANDOM_STATE)

## Data Reading

In [None]:
# Read the three competition CSV files (train / validation / test) in one pass.
train, validation, test = [
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-toxic-comment-train.csv',
        '/kaggle/input/jigsaw-multilingual-toxic-comment-classification/validation.csv',
        '/kaggle/input/jigsaw-multilingual-toxic-comment-classification/test.csv',
    )
]

In [None]:
# Preview the first rows of the freshly loaded training frame.
train.head()

In [None]:
# Report (rows, columns) for the train, validation and test frames, in that order.
for data_split in (train, validation, test):
    print(data_split.shape)

In [None]:
# Preview the first rows of the validation frame.
validation.head()

In [None]:
# First try binary classification: drop `id` and the other label-like columns
# (presumably the finer-grained toxicity labels) so only the columns needed for
# the `toxic` task remain.
# Reassigning instead of using `inplace=True`, together with `errors='ignore'`,
# keeps this cell idempotent: running it a second time no longer raises a
# KeyError for columns that are already gone.
train = train.drop(columns=['severe_toxic', 'obscene',
                            'threat', 'insult',
                            'identity_hate', 'id'],
                   errors='ignore')

In [None]:
# Preview the training frame again, now that the extra columns have been dropped.
train.head()

In [None]:
# Keep only the leading part of the training frame.
# NOTE: `.loc` slices by index label and includes the end label, so with the
# default integer index this keeps labels 0 through 12000 (12001 rows).
train = train.loc[:12000]

In [None]:
# Check the (rows, columns) of the training frame after sub-setting.
train.shape

In [None]:
# Length, in whitespace-separated words, of the longest comment in `train`.
# Each entry is cast to `str` first so non-string values are counted rather
# than raising; the result is later used as the padded sequence length.
padding_len = (
    train['comment_text']
    .apply(lambda comment: len(str(comment).split()))
    .max()
)

In [None]:
# Shuffle and hold out 20% of the rows for validation; the split is seeded
# with RANDOM_STATE so it is repeatable.
X_train, X_valid, Y_train, Y_valid = train_test_split(
    train['comment_text'].values,
    train['toxic'].values,
    test_size=0.2,
    shuffle=True,
    random_state=RANDOM_STATE,
)

In [None]:
# The split arrays are all that is needed from here on; drop the frame names.
del train, validation

**TODO:** Check whether text pre-processing is needed.

## Model Building

In [None]:
def pre_process(train,
                valid,
                number_of_words,
                padding_type,
                max_len):
    """Tokenize two collections of texts and pad them to a fixed length.

    The vocabulary is learned from ``train`` only. Words that appear solely in
    ``valid`` are encoded with the ``"<OOV>"`` token, so no information from
    the validation texts leaks into the tokenizer.

    Parameters
    ----------
    train : iterable of str
        Texts used to fit the tokenizer; also encoded and padded.
    valid : iterable of str
        Texts encoded with the vocabulary learned from ``train`` and padded.
    number_of_words : int or None
        Forwarded to ``Tokenizer(num_words=...)``.
    padding_type : str
        Forwarded to ``pad_sequences(padding=...)`` (e.g. ``"post"``).
    max_len : int
        Forwarded to ``pad_sequences(maxlen=...)``; longer sequences are
        truncated at the end (``truncating="post"``).

    Returns
    -------
    tuple
        ``(word_index, padded_train, padded_valid)``: the tokenizer's
        word-to-id mapping and the two padded id arrays.
    """
    tokenizer = Tokenizer(num_words = number_of_words,
                          filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                          lower=False,
                          split=' ',
                          oov_token="<OOV>")
    # Fit on the training texts only; fitting on train + valid would leak the
    # validation vocabulary and make the OOV token pointless for validation.
    tokenizer.fit_on_texts(list(train))
    print('TOKENIZED')
    train_sequence = tokenizer.texts_to_sequences(train)
    print('SAVED VARIABLE 1')
    valid_sequence = tokenizer.texts_to_sequences(valid)
    print('SAVED VARIABLE 2')

    # Both splits must be padded/truncated with exactly the same settings.
    pad_options = dict(maxlen=max_len,
                       padding=padding_type,
                       truncating="post")
    padded_train = pad_sequences(train_sequence, **pad_options)
    print('PADDED 1')
    padded_valid = pad_sequences(valid_sequence, **pad_options)
    print('PADDED 2')
    word_index = tokenizer.word_index
    return word_index,padded_train,padded_valid

In [None]:
# Inspect the container type of the training texts.
type(X_train)

In [None]:
# Tokenize both splits (cast to str first), with no cap on vocabulary size,
# and post-pad every sequence to `padding_len` tokens.
word_index, padded_train, padded_valid = pre_process(
    train=X_train.astype(str),
    valid=X_valid.astype(str),
    number_of_words=None,
    padding_type="post",
    max_len=padding_len,
)

## Model Building - Simple RNN

In [None]:
# The Keras Tokenizer numbers words from 1 and reserves id 0 (used here for
# padding), so the largest id in the padded sequences is len(word_index).
# Embedding needs input_dim = largest id + 1, hence the "+ 1"; without it the
# highest word id falls outside the embedding table.
vocab_size = len(word_index) + 1
embedding_size = 300
# Number of time steps per sample (second axis of the padded array).
input_length = padded_train.shape[1]

In [None]:
# Embedding -> SimpleRNN -> sigmoid unit for binary (toxic / not toxic) output.
# `mask_zero=True` makes the RNN skip the id-0 padding steps. The inputs are
# post-padded to the longest comment, so without a mask the final RNN state
# would be computed after a long run of padding rather than after the last
# real token.
model = Sequential([
    Embedding(vocab_size,
              embedding_size,
              input_length = input_length,
              mask_zero = True),
    SimpleRNN(100),
    Dense(1,activation='sigmoid')
])

In [None]:
# Print the layer-by-layer summary of the model defined above.
model.summary()

In [None]:
# Use an explicit BinaryAccuracy metric instead of the string 'Accuracy'.
# Depending on the Keras version, the capitalised string can resolve to the
# exact-match `Accuracy` metric, which compares raw sigmoid outputs to the 0/1
# labels and is therefore meaningless here. Naming it 'Accuracy' keeps the
# history keys ('Accuracy' / 'val_Accuracy') unchanged.
model.compile(loss='binary_crossentropy',
              optimizer = 'Adam',
              metrics=[tf.keras.metrics.BinaryAccuracy(name='Accuracy'), 'AUC'])

In [None]:
# Train for 5 epochs in mini-batches of 64, evaluating on the held-out split
# after every epoch; the returned History object feeds the plots below.
history = model.fit(
    padded_train,
    Y_train,
    batch_size=64,
    epochs=5,
    validation_data=(padded_valid, Y_valid),
)

In [None]:
# Learning curves: accuracy in the top panel, cross-entropy loss underneath.
acc = history.history['Accuracy']
val_acc = history.history['val_Accuracy']

loss = history.history['loss']
val_loss = history.history['val_loss']

fig, (acc_ax, loss_ax) = plt.subplots(2, 1, figsize=(8, 8))

# Top panel: per-epoch accuracy on the training and validation data.
acc_ax.plot(acc, label='Training Accuracy')
acc_ax.plot(val_acc, label='Validation Accuracy')
acc_ax.legend(loc='lower right')
acc_ax.set_ylabel('Accuracy')
acc_ax.set_title('Training and Validation Accuracy')

# Bottom panel: per-epoch loss on the training and validation data.
loss_ax.plot(loss, label='Training Loss')
loss_ax.plot(val_loss, label='Validation Loss')
loss_ax.legend(loc='upper right')
loss_ax.set_ylabel('Cross Entropy')
loss_ax.set_title('Training and Validation Loss')
loss_ax.set_xlabel('epoch')
plt.show()

In [None]:
def roc_auc(predictions,target):
    """Return the area under the ROC curve.

    `predictions` are the model scores and `target` the true labels; the ROC
    curve is built with `metrics.roc_curve` and integrated with `metrics.auc`.
    """
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(target, predictions)
    return metrics.auc(false_pos_rate, true_pos_rate)

In [None]:
# Score the held-out split and report its ROC AUC.
scores = model.predict(padded_valid)
# AUC is a fraction in [0, 1], not a percentage, so print it without the
# trailing "%" (the old format rendered e.g. 0.95 as "0.95%"). The predictions
# are flattened to 1-D, which is the shape roc_curve expects.
print("Auc: %.4f" % roc_auc(scores.ravel(), Y_valid))

In [None]:
# Evaluate on the held-out validation split. The values come back in the order
# loss, then the compiled metrics (accuracy, AUC). The label says "validation"
# because no test data is involved here.
results = model.evaluate(padded_valid, Y_valid)
print("validation loss, validation acc, validation auc:", results)

## Embedding Algorithm 