In [1]:
# Importing the dependencies and libraries
import os
import pandas as pd
import tensorflow as tf
import numpy as np

In [2]:
df = pd.read_csv(
    os.path.join('Dataset', 'train.csv', 'train.csv')
)

In [3]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


# Preprocessing
---

### Tokenisation and Embedding

In [4]:
from tensorflow.keras.layers import TextVectorization

In [5]:
X = df['comment_text']
y = df[df.columns[2: ]].values #df.columns[2:] = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult' , 'identity_hate']

In [6]:
MAX_WORDS = 200000 # Defining the number of words in the vocab (TextVectorisation)

In [7]:
vectoriser = TextVectorization(
    max_tokens=MAX_WORDS,
    output_sequence_length=1800,
    output_mode='int'
)

In [8]:
vectoriser.adapt(X.values)

In [9]:
vectorised_text = vectoriser(X.values)

In [10]:
vectorised_text

<tf.Tensor: shape=(159571, 1800), dtype=int64, numpy=
array([[  645,    76,     2, ...,     0,     0,     0],
       [    1,    54,  2489, ...,     0,     0,     0],
       [  425,   441,    70, ...,     0,     0,     0],
       ...,
       [32445,  7392,   383, ...,     0,     0,     0],
       [    5,    12,   534, ...,     0,     0,     0],
       [    5,     8,   130, ...,     0,     0,     0]])>

---
### PIPELINES
---

In [11]:
# MCSHBAP - map, cache, shuffle, batch, prefetch -> "from_tensor_slices" or "list_files"
dataset = tf.data.Dataset.from_tensor_slices((vectorised_text, y))

# Pipelining ->
dataset = dataset.cache()
dataset = dataset.shuffle(16000)
dataset = dataset.batch(16)
dataset = dataset.prefetch(8)  ## <- helps prevent bottle-necks

In [14]:
batch_X, batch_y = dataset.as_numpy_iterator().next() # Deconstruction

# Batch_X consists of the vectorised text examples

2024-06-04 14:02:53.228058: W tensorflow/core/kernels/data/cache_dataset_ops.cc:858] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


In [16]:
type(dataset)

tensorflow.python.data.ops.prefetch_op._PrefetchDataset

In [18]:
# Creating the training, CV and testing sets

train = dataset.take(int(len(dataset)*.7)) # Taking only 70% of the data for training
val = dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*.2)) # Skip 70% and take the next 20%
test = dataset.skip(int(len(dataset)*.9)).take(int(len(dataset)*.1)) # Skip 90% and take the remaining 10%

In [20]:
train_generator = train.as_numpy_iterator()

In [24]:
train_generator.next()

(array([[  425,     8,  2808, ...,     0,     0,     0],
        [  213,   231,    22, ...,     0,     0,     0],
        [  428,    21,   621, ...,     0,     0,     0],
        ...,
        [  171, 42008,   124, ...,     0,     0,     0],
        [ 4537,  1084,     2, ...,     0,     0,     0],
        [   73,    35,    14, ...,     0,     0,     0]]),
 array([[0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [1, 0, 0, 1, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0]]))

---

## Neural Network
---

In [38]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding

In [39]:
model = Sequential()
    # Create the Embedding layer
model.add(Embedding(MAX_WORDS+1, 32))  # MAX_WORDS+1 is done for embedding the unknown words
    # Creating the LSTM layer
    # Bidirectional helps parse in both direction in RNNs
model.add(Bidirectional(LSTM(32, activation='tanh'))) # tanh is used as tf dictates the usage of tanh in LSTM models
    
    # Feature extractor fully connected layers
model.add(Dense(128, activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))
    # Final Layer
model.add(Dense(6, activation='sigmoid'))


In [40]:
# We are using Binary cross entropy as we treat the 6 different outputs as separate binary outcomes instead of one categorical outcome
model.compile(optimizer='adam', loss=tf.keras.losses.BinaryCrossentropy())

In [41]:
model.summary()

In [42]:
history = model.fit(train, epochs=1, validation_data=val)

[1m6980/6981[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 494ms/step - loss: 0.0858

2024-06-04 15:29:45.108999: W tensorflow/core/kernels/data/cache_dataset_ops.cc:858] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


[1m6981/6981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3528s[0m 505ms/step - loss: 0.0857 - val_loss: 0.0484


---

## Predictions

---

In [72]:
# We need to first vectorise the input and then predict
input_text = vectoriser('you freaking suck! I am going to hurt you')

# model can't directly predict input_text as it's type is not suitable
res = model.predict(np.expand_dims(input_text, 0)) #can also use 'np.array([input_text])'

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step


In [73]:
res

array([[0.9855361 , 0.1583281 , 0.8629617 , 0.02645349, 0.70426   ,
        0.08770757]], dtype=float32)

---
## Evaluating the model

In [77]:
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy

In [78]:
pre = Precision()
re = Recall()
acc = CategoricalAccuracy()

In [79]:
for batch in test.as_numpy_iterator():
    # Deconstruct the batch
    X_true, y_true = batch
    # Make a prediction
    yhat = model.predict(X_true)
    
    # Flatten the values
    y_true = y_true.flatten()
    yhat = yhat.flatten() # Flatten yhat along the last axis
    
    pre.update_state(y_true, yhat)
    re.update_state(y_true, yhat)
    acc.update_state(y_true, yhat)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47

2024-06-04 16:22:58.656092: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [82]:
print(f"Precision: {pre.result().numpy()}, Recall: {re.result().numpy()}, Accuracy: {acc.result().numpy()}")

Precision: 0.8452665209770203, Recall: 0.6164780855178833, Accuracy: 0.4824473559856415
