In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, GRU,SimpleRNN
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import BatchNormalization
# from tensorflow.keras.utils import np_utils
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from tensorflow.keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from tensorflow.keras.preprocessing import sequence, text
from tensorflow.keras.callbacks import EarlyStopping
import os

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff

In [2]:
# Pick a tf.distribute strategy: a TPU strategy when a TPU cluster can be
# resolved, otherwise TensorFlow's default strategy.
try:
    # TPU detection. No parameters necessary if TPU_NAME environment variable is
    # set: this is always the case on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # The resolver raises ValueError when no TPU can be found.
    tpu = None

if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # tf.distribute.experimental.TPUStrategy is deprecated; the stable
    # tf.distribute.TPUStrategy takes the same resolver argument.
    strategy = tf.distribute.TPUStrategy(tpu)
else:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

# Later cells multiply their batch size by this replica count.
print("REPLICAS: ", strategy.num_replicas_in_sync)

REPLICAS:  1


In [3]:
# Directory holding the Jigsaw multilingual toxic-comment CSVs (and, later in
# the notebook, the GloVe vectors). Set the JIGSAW_DATA_DIR environment
# variable to run on another machine; the default is the original local path.
data_path = os.environ.get(
    "JIGSAW_DATA_DIR",
    "C:/Users/shaur/Downloads/jigsaw-multilingual-toxic-comment-classification",
)

# Load the three CSV files into separate frames.
train = pd.read_csv(os.path.join(data_path, "jigsaw-toxic-comment-train.csv"))
validation = pd.read_csv(os.path.join(data_path, "validation.csv"))
test = pd.read_csv(os.path.join(data_path, "test.csv"))

In [4]:
# Preview the first rows of the frame read from validation.csv.
validation.head()

Unnamed: 0,id,comment_text,lang,toxic
0,0,Este usuario ni siquiera llega al rango de ...,es,0
1,1,Il testo di questa voce pare esser scopiazzato...,it,0
2,2,Vale. Sólo expongo mi pasado. Todo tiempo pasa...,es,1
3,3,Bu maddenin alt başlığı olarak uluslararası i...,tr,0
4,4,Belçika nın şehirlerinin yanında ilçe ve belde...,tr,0


In [5]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [6]:
# Drop the five fine-grained label columns so only the binary `toxic` label
# remains. Reassigning (instead of inplace=True) with errors='ignore' keeps the
# cell safe to re-run: the original raised KeyError on a second execution
# because the columns were already gone.
train = train.drop(
    columns=['severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],
    errors='ignore',
)
# .loc slicing is label-based and inclusive, so this keeps index labels 0..12000
# (12001 rows with the default integer index).
train = train.loc[:12000, :]
train.shape

(12001, 3)

In [7]:
# Longest comment in the training frame, counted in whitespace-separated words.
# str() guards against non-string cells before splitting.
train['comment_text'].map(lambda text: len(str(text).split())).max()

1403

In [8]:
def roc_auc(predictions, target):
    """Return the area under the ROC curve of `predictions` against `target`.

    Note the argument order: predicted scores come first and the ground-truth
    labels second, while the underlying `metrics.roc_curve` call takes them the
    other way round.
    """
    # The thresholds returned by roc_curve are not needed for the area.
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(target, predictions)
    return metrics.auc(false_pos_rate, true_pos_rate)

In [9]:
# Count the rows per value of the `toxic` label to see the class balance.
train.toxic.value_counts()

toxic
0    10865
1     1136
Name: count, dtype: int64

In [10]:
# Hold out 20% of the comments for validation. The split is shuffled with a
# fixed seed and stratified on the `toxic` label.
xtrain, xvalid, ytrain, yvalid = train_test_split(
    train.comment_text.values,
    train.toxic.values,
    test_size=0.2,
    shuffle=True,
    stratify=train.toxic.values,
    random_state=42,
)

## Simple RNN

In [11]:
# Every sequence is padded/truncated to this many tokens (the longest comment
# measured earlier in the notebook has 1403 words).
max_len = 1500

token = tf.keras.preprocessing.text.Tokenizer(num_words=None)
# NOTE: the vocabulary is fitted on the training AND validation texts together.
token.fit_on_texts([*xtrain, *xvalid])
word_index = token.word_index

# Text -> integer id sequences for both splits.
xtrain_seq, xvalid_seq = (
    token.texts_to_sequences(texts) for texts in (xtrain, xvalid)
)

# Integer id sequences -> fixed-length arrays of width max_len.
xtrain_pad, xvalid_pad = (
    tf.keras.utils.pad_sequences(seqs, maxlen=max_len)
    for seqs in (xtrain_seq, xvalid_seq)
)

In [12]:
%%time

with strategy.scope():
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Embedding(len(word_index)+1, 300, input_length=max_len))
    model.add(tf.keras.layers.SimpleRNN(100))
    model.add(tf.keras.layers.Dense(1, activation="sigmoid"))
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1500, 300)         13049100  
                                                                 
 simple_rnn (SimpleRNN)      (None, 100)               40100     
                                                                 
 dense (Dense)               (None, 1)                 101       
                                                                 
Total params: 13,089,301
Trainable params: 13,089,301
Non-trainable params: 0
_________________________________________________________________
CPU times: total: 266 ms
Wall time: 2.19 s


In [13]:
# Train for 5 epochs; the batch size is 64 per replica of the active strategy.
# No validation data is passed, so only training metrics are reported here.
model.fit(xtrain_pad, ytrain, epochs=5, batch_size=64*strategy.num_replicas_in_sync)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x20b8edb2e20>

In [15]:
# Show which GPU devices TensorFlow can see on this machine.
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [16]:
# Score the held-out split with the current `model` and print its ROC AUC
# (roc_auc takes predictions first, labels second).
scores = model.predict(xvalid_pad)
print(roc_auc(scores, yvalid))

0.7786292953568201


In [17]:
# Start the per-model results list with the SimpleRNN baseline; later cells
# append one {'Model', 'AUC_Score'} record each.
scores_model = [{'Model': 'SimpleRNN', 'AUC_Score': roc_auc(scores, yvalid)}]

## LSTMs

In [18]:
# Load the pretrained GloVe vectors into a {token: float32 vector} lookup.
embedding_dim = 300  # matches the "300d" file name and the 300-wide Embedding layers
embeddings_index = {}
skipped_lines = 0

# The context manager closes the file even if the loop is interrupted.
with open(os.path.join(data_path, "glove.840B.300d.txt"), 'r', encoding='utf-8') as f:
    for line in tqdm(f):
        # Split from the right so that a token which itself contains whitespace
        # stays in one piece; a plain split() would push part of the token into
        # the numeric fields (presumably what the old bare `except` was hiding
        # -- TODO confirm against the file).
        values = line.rstrip().rsplit(' ', embedding_dim)
        try:
            coefs = np.asarray(values[1:], dtype='float32')
        except ValueError:
            # A non-numeric field means the line is malformed: skip it, but count it.
            skipped_lines += 1
            continue
        if coefs.shape[0] != embedding_dim:
            # Blank or truncated line; it would not fit a row of the embedding matrix.
            skipped_lines += 1
            continue
        embeddings_index[values[0]] = coefs

print(len(embeddings_index))
if skipped_lines:
    print("Skipped %d malformed lines" % skipped_lines)

2196018it [01:50, 19917.15it/s]

2195885





In [19]:
# One 300-d row per tokenizer index (plus one extra row for index 0). Words
# with no pretrained vector in embeddings_index keep their all-zero row.
embedding_matrix = np.zeros((len(word_index)+1, 300))
for vocab_word, row in tqdm(word_index.items()):
    pretrained = embeddings_index.get(vocab_word)
    if pretrained is None:
        continue
    embedding_matrix[row] = pretrained

100%|██████████| 43496/43496 [00:00<00:00, 426919.35it/s]


In [20]:
# LSTM on frozen pretrained embeddings: the Embedding layer is initialised from
# embedding_matrix and excluded from training; LSTM(100) feeds one sigmoid unit.
with strategy.scope():
    model = Sequential([
        Embedding(
            len(word_index)+1,
            300,
            weights=[embedding_matrix],
            input_length=max_len,
            trainable=False,
        ),
        # Dropout is applied to both the inputs and the recurrent state.
        LSTM(100, dropout=0.3, recurrent_dropout=.3),
        Dense(1, activation="sigmoid"),
    ])
    model.compile(
        loss="binary_crossentropy",
        optimizer="adam",
        metrics=["accuracy"],
    )

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 1500, 300)         13049100  
                                                                 
 lstm (LSTM)                 (None, 100)               160400    
                                                                 
 dense_1 (Dense)             (None, 1)                 101       
                                                                 
Total params: 13,209,601
Trainable params: 160,501
Non-trainable params: 13,049,100
_________________________________________________________________


In [21]:
# Train the current `model` for 5 epochs; the batch size is 64 per replica.
# No validation data is passed, so only training metrics are reported here.
model.fit(xtrain_pad, ytrain, epochs=5, batch_size=64*strategy.num_replicas_in_sync)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x20e48096d00>

In [32]:
# Score the held-out split with the current `model` and report its ROC AUC.
scores = model.predict(xvalid_pad)
# AUC is a 0-1 score, not a percentage: the old "%.2f%%" format printed
# "0.98%", which reads as under one percent. Print the raw value instead.
print("Auc: %.4f" % roc_auc(scores, yvalid))

Auc: 0.98%


In [33]:
# Record the LSTM result next to the earlier entries.
# Re-running this cell appends a duplicate record.
scores_model.append({'Model': 'LSTM','AUC_Score': roc_auc(scores,yvalid)})

## GRUs

In [34]:
%%time
with strategy.scope():
    # GRU with glove embeddings and two dense layers
     model = Sequential()
     model.add(Embedding(len(word_index) + 1,
                     300,
                     weights=[embedding_matrix],
                     input_length=max_len,
                     trainable=False))
     model.add(SpatialDropout1D(0.3))
     model.add(GRU(300))
     model.add(Dense(1, activation='sigmoid'))

     model.compile(loss='binary_crossentropy', optimizer='adam',metrics=['accuracy'])   
    
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 1500, 300)         13049100  
                                                                 
 spatial_dropout1d (SpatialD  (None, 1500, 300)        0         
 ropout1D)                                                       
                                                                 
 gru (GRU)                   (None, 300)               541800    
                                                                 
 dense_2 (Dense)             (None, 1)                 301       
                                                                 
Total params: 13,591,201
Trainable params: 542,101
Non-trainable params: 13,049,100
_________________________________________________________________
CPU times: total: 188 ms
Wall time: 790 ms


In [36]:
# Train the current `model` for 5 epochs; the batch size is 64 per replica.
# No validation data is passed, so only training metrics are reported here.
model.fit(xtrain_pad, ytrain, epochs=5, batch_size=64*strategy.num_replicas_in_sync)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x2210f5c89d0>

In [37]:
# Score the held-out split with the current `model`, then report and record
# its ROC AUC.
scores = model.predict(xvalid_pad)
# Compute the AUC once and reuse it for both the printout and the record.
gru_auc = roc_auc(scores, yvalid)
# AUC is a 0-1 score, not a percentage: the old "%.2f%%" format printed
# "0.97%", which reads as under one percent. Print the raw value instead.
print("Auc: %.4f" % gru_auc)
scores_model.append({'Model': 'GRU', 'AUC_Score': gru_auc})

Auc: 0.97%


In [38]:
# Display the AUC records collected so far, one dict per model.
scores_model

[{'Model': 'SimpleRNN', 'AUC_Score': 0.8574948632010667},
 {'Model': 'LSTM', 'AUC_Score': 0.9765936234797303},
 {'Model': 'GRU', 'AUC_Score': 0.9743241107360111}]

## Bi-Directional RNNs

In [39]:
# Bidirectional LSTM on frozen pretrained embeddings. The Bidirectional wrapper
# runs the LSTM(300) over the sequence in both directions, which is why the
# summary shows a 600-wide output feeding the single sigmoid unit.
with strategy.scope():
    model = Sequential([
        Embedding(
            len(word_index)+1,
            300,
            weights=[embedding_matrix],
            input_length=max_len,
            trainable=False,
        ),
        Bidirectional(LSTM(300, dropout=0.3, recurrent_dropout=0.3)),
        Dense(1, activation="sigmoid"),
    ])
    model.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )

model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 1500, 300)         13049100  
                                                                 
 bidirectional (Bidirectiona  (None, 600)              1442400   
 l)                                                              
                                                                 
 dense_3 (Dense)             (None, 1)                 601       
                                                                 
Total params: 14,492,101
Trainable params: 1,443,001
Non-trainable params: 13,049,100
_________________________________________________________________


In [40]:
# Train the current `model` for 5 epochs; the batch size is 64 per replica.
# No validation data is passed, so only training metrics are reported here.
model.fit(xtrain_pad, ytrain, epochs=5, batch_size=64*strategy.num_replicas_in_sync)

Epoch 1/5


In [None]:
# Score the held-out split with the current `model` and report its ROC AUC.
scores = model.predict(xvalid_pad)
# AUC is a 0-1 score, not a percentage: the old "%.2f%%" format would print a
# value such as "0.98%". Print the raw value instead.
print("Auc: %.4f" % roc_auc(scores, yvalid))

In [None]:
# Record the bidirectional LSTM result next to the earlier entries.
# Re-running this cell appends a duplicate record.
scores_model.append({'Model': 'Bi-directional LSTM','AUC_Score': roc_auc(scores,yvalid)})