# Inputs

In [42]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras

import matplotlib.pyplot as plt
import seaborn as sns

from keras.layers import TextVectorization
from keras.models import Sequential
from keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding
from keras.metrics import Precision, Recall, CategoricalAccuracy

In [6]:
physical_devices = tf.config.list_physical_devices()
try:
    tf.config.experimental.set_memory_growth(physical_devices[0], True)
except:
    pass

# Dataset

In [5]:
df = pd.read_csv("./datasets/comment_toxicity/train.csv")
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   id             159571 non-null  object
 1   comment_text   159571 non-null  object
 2   toxic          159571 non-null  int64 
 3   severe_toxic   159571 non-null  int64 
 4   obscene        159571 non-null  int64 
 5   threat         159571 non-null  int64 
 6   insult         159571 non-null  int64 
 7   identity_hate  159571 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 9.7+ MB


# Preprocess

In [10]:
X = df['comment_text']
y = df.iloc[:,2:].values

In [13]:
MAX_FEATURE = 200000 # number of words in the vocab

In [14]:
vectorizer = TextVectorization(max_tokens=MAX_FEATURE,
                               output_sequence_length=1800,
                               output_mode='int')

In [16]:
vectorizer.adapt(X.values)

In [19]:
vectorizer("hello world, great life")

<tf.Tensor: shape=(1800,), dtype=int64, numpy=array([286, 261, 275, ...,   0,   0,   0], dtype=int64)>

In [20]:
vectorized_text = vectorizer(X.values)

In [21]:
vectorized_text

<tf.Tensor: shape=(159571, 1800), dtype=int64, numpy=
array([[  643,    76,     2, ...,     0,     0,     0],
       [    1,    54,  2506, ...,     0,     0,     0],
       [  425,   440,    70, ...,     0,     0,     0],
       ...,
       [32141,  7329,   383, ...,     0,     0,     0],
       [    5,    12,   533, ...,     0,     0,     0],
       [    5,     8,   130, ...,     0,     0,     0]], dtype=int64)>

### create tf data pipeline

In [24]:
# map, cache, shuffle, batch, prefetch
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset = dataset.cache()
dataset = dataset.shuffle(160000)
dataset = dataset.batch(16)
dataset = dataset.prefetch(8) # helps prevent bottlenecks

In [25]:
batch_X, batch_y = dataset.as_numpy_iterator().next()

In [26]:
batch_y.shape

(16, 6)

In [27]:
train = dataset.take(int(len(dataset) * 0.7))
val = dataset.skip(int(len(dataset) * 0.7)).take(int(len(dataset) * 0.2))
test = dataset.skip(int(len(dataset) * 0.9)).take(int(len(dataset) * 0.1))

In [28]:
len(train), len(val), len(test) # in batches

(6981, 1994, 997)

In [29]:
train_generator = train.as_numpy_iterator()
val_generator = val.as_numpy_iterator()
test_generator = test.as_numpy_iterator()

# Create sequential model

In [34]:
model = Sequential()

model.add(Embedding(MAX_FEATURE+1, 32)) # +1 is for unkown words
model.add(Bidirectional(LSTM(32, activation='tanh')))
model.add(Dense(128, activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dense(6, activation='sigmoid'))

In [35]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, None, 32)          6400032   
                                                                 
 bidirectional_1 (Bidirectio  (None, 64)               16640     
 nal)                                                            
                                                                 
 dense_4 (Dense)             (None, 128)               8320      
                                                                 
 dense_5 (Dense)             (None, 256)               33024     
                                                                 
 dense_6 (Dense)             (None, 128)               32896     
                                                                 
 dense_7 (Dense)             (None, 6)                 774       
                                                      

In [36]:
model.compile(loss=keras.losses.BinaryCrossentropy(), optimizer='adam')

In [38]:
history = model.fit(train, epochs=1, validation_data=val)



# Evaluate

In [39]:
input_text = vectorizer("you suck. I am gonna kill you")

In [40]:
df.columns[2:]

Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')

In [41]:
batch_X, batch_y = test.as_numpy_iterator().next()

In [44]:
res = model.predict(np.expand_dims(input_text,0))



In [43]:
precision = Precision()
recall = Recall()
accuracy = CategoricalAccuracy()

In [45]:
for batch in test.as_numpy_iterator():
    X_true, y_true = batch
    yhat = model.predict(X_true)

    y_true = y_true.flatten()
    yhat = yhat.flatten()

    precision.update_state(y_true, yhat)
    recall.update_state(y_true, yhat)
    accuracy.update_state(y_true, yhat)



In [46]:
print(f"Precision   : {precision.result().numpy()}")
print(f"Recall      : {recall.result().numpy()}")
print(f"Accuracy    : {accuracy.result().numpy()}")

Precision   : 0.8497388362884521
Recall      : 0.6240779161453247
Accuracy    : 0.48345035314559937


# Saving the model

In [48]:
model.save("comment_toxicity.h5")