In [1]:
import numpy as np
import pandas as pd

import os
# Print the full path of every file found beneath the Kaggle input directory
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [2]:
import tensorflow as tf
from keras.preprocessing import text,sequence
from keras.utils import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding,SimpleRNN,LSTM,SpatialDropout1D,GRU,Bidirectional,Input
# from keras.layers.core import Dense#,Activation,Dropout

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
from tqdm import tqdm

In [4]:
# Configure the distribution strategy: use a TPU when one can be resolved,
# otherwise fall back to TensorFlow's default strategy.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU', tpu.master())
except ValueError:
    # A ValueError from the resolver is treated as "no TPU available"
    tpu = None

if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # TPUStrategy graduated from the `experimental` namespace; the old alias is deprecated
    strategy = tf.distribute.TPUStrategy(tpu)
else:
    # Default distribution strategy in TensorFlow: works on CPU and single GPU.
    # The previous OneDeviceStrategy("CPU:0") pinned all work to the CPU even
    # when a GPU was present.
    strategy = tf.distribute.get_strategy()

In [5]:
# Folder that holds the Jigsaw CSV files. Set the TOXIC_DATA_DIR environment
# variable to run the notebook on another machine; when it is unset the
# original location is used, so existing behaviour is unchanged.
DATA_DIR = os.environ.get(
    'TOXIC_DATA_DIR',
    'C:/Users/Richa/OneDrive/Documents/Desktop/TARP PROJECT_TOXIC COMMENT/dataset',
)

train = pd.read_csv(f'{DATA_DIR}/jigsaw-toxic-comment-train.csv')
validation = pd.read_csv(f'{DATA_DIR}/validation.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')

In [6]:
# Dimensions of the training frame as (rows, columns)
train.shape

(223549, 8)

In [7]:
# Preview the first rows of the training frame
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [8]:
# Keep only the binary `toxic` target and discard the other label columns.
# Re-binding the result (rather than inplace=True) together with
# errors='ignore' makes this cell idempotent: running it a second time no
# longer raises KeyError for columns that were already removed.
train = train.drop(
    columns=['severe_toxic','obscene','threat','insult','identity_hate'],
    errors='ignore',
)

In [9]:
# Longest comment measured in whitespace-separated tokens; a reference value
# for choosing the padding length later on
token_counts = train['comment_text'].map(lambda comment: len(str(comment).split()))
pad_len = token_counts.max()
print('max len of comment_text column',pad_len)

max len of comment_text column 2321


DATA PREPARATION

In [10]:
# Hold out 20% of the rows, stratified on the `toxic` label so both
# partitions keep the same class ratio; seeded for reproducibility
xtrain, xvalid, ytrain, yvalid = train_test_split(
    train['comment_text'].values,
    train['toxic'].values,
    test_size=0.2,
    shuffle=True,
    stratify=train['toxic'].values,
    random_state=42,
)

In [11]:
# Number of comments in the training and hold-out partitions
len(xtrain),len(xvalid)

(178839, 44710)

### Tokenisation and Padding with max len of words in corpus

In [12]:
# Preview the test frame (its text lives in the `content` column)
test.head()

Unnamed: 0,id,content,lang
0,0,Doctor Who adlı viki başlığına 12. doctor olar...,tr
1,1,"Вполне возможно, но я пока не вижу необходимо...",ru
2,2,"Quindi tu sei uno di quelli conservativi , ...",it
3,3,Malesef gerçekleştirilmedi ancak şöyle bir şey...,tr
4,4,:Resim:Seldabagcan.jpg resminde kaynak sorunu ...,tr


In [13]:
# Keras tokenizer with no cap on vocabulary size, fitted on every text split
token = text.Tokenizer(num_words = None)
max_len = 2400
xtest = test.content.values
token.fit_on_texts([*xtrain, *xvalid, *xtest])

# Map each split to sequences of integer word ids
x_train_seq, x_valid_seq, x_test_seq = (
    token.texts_to_sequences(split) for split in (xtrain, xvalid, xtest)
)

# Zero-pad every sequence to the common length max_len
x_train_pad, x_valid_pad, x_test_pad = (
    pad_sequences(seq, maxlen = max_len)
    for seq in (x_train_seq, x_valid_seq, x_test_seq)
)

word_index = token.word_index

### Classification based on GRU(Gated Recurrent Unit)

In [19]:
# Load the GloVe vectors into a {word: vector} dictionary.
# The file location can be overridden with the GLOVE_PATH environment
# variable; when unset, the original path is used.
GLOVE_PATH = os.environ.get(
    'GLOVE_PATH',
    'C:/Users/Richa/Desktop/glove.840B.300d.txt/glove.840B.300d.txt',
)

embeddings_index = {}
# `with` guarantees the file handle is released even if a line fails to parse
with open(GLOVE_PATH, 'r', encoding='utf-8') as f:
    for line in tqdm(f):
        values = line.split(' ')
        word = values[0]
        # Parsed exactly as before, then stored as float32: the embedding
        # weights end up as 4-byte floats anyway, so this halves the memory
        # held by the dictionary without changing the values the model sees.
        coefs = np.asarray([float(value) for value in values[1:]], dtype='float32')
        embeddings_index[word] = coefs

print('Found {} word vectors'.format(len(embeddings_index)))


2196018it [03:18, 11082.80it/s]

Found 2196017 word vectors





In [20]:
# Build the embedding matrix: row i holds the GloVe vector of the word whose
# tokenizer index is i; words without a GloVe vector keep an all-zero row
vocab_rows = len(word_index) + 1
embedding_metrics = np.zeros((vocab_rows, 300))
for word, i in tqdm(word_index.items()):
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        continue
    embedding_metrics[i] = embedding_vector

100%|██████████| 583776/583776 [00:00<00:00, 590285.70it/s]


In [25]:
%%time

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SpatialDropout1D, GRU, Dense  # Add Dense to your imports
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy

with strategy.scope():
    model = Sequential()
    model.add(Embedding(len(word_index) + 1,
                        300,
                        weights=[embedding_metrics],
                        input_length=max_len,
                        trainable=False))
    model.add(SpatialDropout1D(0.3))
    model.add(GRU(300))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss=BinaryCrossentropy(), optimizer=Adam(), metrics=['accuracy'])

model.summary()


Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 2400, 300)         175133100 
                                                                 
 spatial_dropout1d_2 (Spati  (None, 2400, 300)         0         
 alDropout1D)                                                    
                                                                 
 gru_2 (GRU)                 (None, 300)               541800    
                                                                 
 dense (Dense)               (None, 1)                 301       
                                                                 
Total params: 175675201 (670.15 MB)
Trainable params: 542101 (2.07 MB)
Non-trainable params: 175133100 (668.08 MB)
_________________________________________________________________
CPU times: total: 6.8 s
Wall time: 2.31 s


In [None]:
# Global batch size: 128 samples per replica of the active strategy
global_batch_size = 128*strategy.num_replicas_in_sync
model.fit(
    x_train_pad,
    ytrain,
    epochs = 1,
    batch_size = global_batch_size,
)

  47/1398 [>.............................] - ETA: 240:07:53 - loss: 0.2762 - accuracy: 0.9016

In [None]:
# Model outputs (sigmoid activations) for the hold-out split
gru_pred = model.predict(x_valid_pad)

In [None]:
# ROC-AUC of the GRU on the hold-out split. The predictions are flattened
# because the model's single output unit yields one column per sample.
model_accuracy = roc_auc_score(yvalid, np.ravel(gru_pred))

# The results list is not created anywhere in the visible notebook, so create
# it on first use; an already existing list is reused as-is.
if 'model_accuracy_ls' not in globals():
    model_accuracy_ls = []

# Replace any earlier GRU record so re-running this cell does not add duplicates
model_accuracy_ls[:] = [rec for rec in model_accuracy_ls if rec.get('model') != 'GRU']
model_accuracy_ls.append({'model':'GRU','AUC_SCORE':model_accuracy})

In [None]:
# Display the collected per-model AUC records
model_accuracy_ls

In [None]:
# Calculate ROC curve and AUC score for the GRU on the hold-out split.
# `pred_val` was not assigned anywhere in the visible notebook (NameError on a
# fresh kernel); the hold-out predictions are stored in `gru_pred`, so derive
# it from there as a flat 1-D array.
pred_val = np.ravel(gru_pred)
fpr, tpr, thresholds = roc_curve(yvalid, pred_val)
roc_auc = auc(fpr, tpr)

In [None]:
# Draw the ROC curve against the chance diagonal using the explicit Axes API
fig, ax = plt.subplots(figsize=(8, 8))
ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance level
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
ax.legend(loc="lower right")
plt.show()

In [None]:
# Threshold predictions to convert probabilities to binary predictions.
# The probabilities are taken from `gru_pred` (flattened to 1-D): the
# `pred_val` name used previously was not assigned anywhere in the visible
# notebook and raised NameError on a fresh kernel.
threshold = 0.5  # You can adjust this threshold as needed
binary_pred_val = (np.ravel(gru_pred) > threshold).astype(int)

# Create confusion matrix (rows: true label, columns: predicted label)
conf_matrix = confusion_matrix(yvalid, binary_pred_val)

# Plot confusion matrix using seaborn
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=['Non-Toxic', 'Toxic'], yticklabels=['Non-Toxic', 'Toxic'])
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()