### Dataset

## toxic-comment.csv

#### Load the loiberaries

In [41]:
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding
from sklearn.model_selection import train_test_split
from keras.layers import Bidirectional, GRU,  LSTM
import re

##### Read Dataset

In [42]:
df = pd.read_csv('toxic-comments.csv')

In [43]:
df.shape

(159571, 8)

In [44]:
df.columns

Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate'],
      dtype='object')

In [45]:
# Define target variables (Adapt based o your needs)
toxicities = ['toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate']

## Text Cleaning

In [46]:
# Clean text
def clean_text(text):
    text = text.lower()  # Lowercase
    text = re.sub(r"[^a-zA-Z0-9]+", " ", text) # Remove the non-alphanumeric charecter
    return text

In [47]:
df['comment_text'] = df['comment_text'].apply(clean_text)

In [48]:
df

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,explanation why the edits made under my userna...,0,0,0,0,0,0
1,000103f0d9cfb60f,d aww he matches this background colour i m se...,0,0,0,0,0,0
2,000113f07ec002fd,hey man i m really not trying to edit war it s...,0,0,0,0,0,0
3,0001b41b1c6bb37e,more i can t make any real suggestions on imp...,0,0,0,0,0,0
4,0001d958c54c6e35,you sir are my hero any chance you remember wh...,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,and for the second time of asking when your v...,0,0,0,0,0,0
159567,ffea4adeee384e90,you should be ashamed of yourself that is a ho...,0,0,0,0,0,0
159568,ffee36eab5c267c9,spitzer umm theres no actual article for prost...,0,0,0,0,0,0
159569,fff125370e4aaaf3,and it looks like it was actually you who put ...,0,0,0,0,0,0


#### Define input and output variable

In [49]:
# Feature and target preparation
comments = df['comment_text'].tolist()
targets = df[toxicities].values

In [50]:
targets 

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]], dtype=int64)

In [51]:
targets.shape

(159571, 6)

### prepare the Data

In [52]:
# Tokenozer and padding
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(comments)
sequences = tokenizer.texts_to_sequences(comments)
padded_sequence = pad_sequences(sequences, maxlen=200)

In [53]:
padded_sequence.shape

(159571, 200)

#### Cross-validation

In [54]:
X_train, X_test, y_train, y_test = train_test_split(padded_sequence, targets, test_size=0.2, random_state=0)

In [55]:
X_train.shape

(127656, 200)

In [56]:
y_train.shape

(127656, 6)

In [57]:
X_test.shape

(31915, 200)

### Build Model

In [69]:
# Model defination (customize architecture as needed)
model = Sequential()

In [70]:
model.add(Embedding(5000, 128, input_length =200))
model.add(GRU(64))
model.add(Dense(6, activation='softmax'))



In [71]:
model.summary()

In [72]:
from keras.utils import plot_model

In [73]:
# plot_model(model, show_dtype=True, show_layer_activations=True, 
#            show_layer_names= True, show_shapes=True)

### Compile Model

In [74]:
#### Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


In [75]:
y_train.shape, X_train.shape

((127656, 6), (127656, 200))

In [76]:
model.fit(X_train, y_train, epochs=3, batch_size= 32, validation_data = (X_test, y_test))

Epoch 1/3
[1m3990/3990[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m470s[0m 117ms/step - accuracy: 0.9787 - loss: 0.0933 - val_accuracy: 0.9883 - val_loss: 0.0503
Epoch 2/3
[1m3990/3990[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m467s[0m 117ms/step - accuracy: 0.9869 - loss: 0.0465 - val_accuracy: 0.9907 - val_loss: 0.0479
Epoch 3/3
[1m3990/3990[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m464s[0m 116ms/step - accuracy: 0.9801 - loss: 0.0406 - val_accuracy: 0.9611 - val_loss: 0.0485


<keras.src.callbacks.history.History at 0x1f1ee51b290>

In [68]:
X_train

array([[   0,    0,    0, ...,    7,  263,  122],
       [   0,    0,    0, ...,  604,    2,    6],
       [   0,    0,    0, ..., 2981,   47,   66],
       ...,
       [   0,    0,    0, ...,  503,  100,   38],
       [   0,    0,    0, ...,  283,   15,   29],
       [   0,    0,    0, ...,   78,  116,  513]])

### Predict on new Data

In [94]:
new_comment = 'He studied good so he won!'

new_comment = clean_text(new_comment)

# Tokenozer and padding
#tokenizer = Tokenizer(num_words=5000)
#tokenizer.fit_on_texts(comment)
sequences = tokenizer.texts_to_sequences(new_comment)
padded_sequence = pad_sequences(sequences, maxlen=200)

In [95]:
prediction = model.predict(padded_sequence)[0]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step


In [96]:
prediction

array([0.30697224, 0.02439717, 0.4684177 , 0.01184302, 0.16146985,
       0.02690005], dtype=float32)

In [97]:
for toxicity, prob in zip(toxicities, prediction):
    print(f"{toxicity}: {prob:.2f}")

toxic: 0.31
severe_toxic: 0.02
obscene: 0.47
threat: 0.01
insult: 0.16
identity_hate: 0.03
