In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,Bidirectional, LSTM, Dropout, Dense
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy

import pickle

import warnings
warnings.filterwarnings('ignore')





In [None]:
# Make Google Drive visible inside the Colab runtime so files can be read by path.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/gdrive'  # directory under which the Drive contents appear
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/gdrive


In [None]:
# Read the training CSV from the mounted Drive into a DataFrame.
TRAIN_CSV_PATH = "/content/gdrive/MyDrive/database/training/train.csv"
df = pd.read_csv(TRAIN_CSV_PATH)

# Show the first five rows as a quick sanity check of the columns that were loaded.
first_rows = df.head()
print(first_rows)



                 id                                       comment_text  toxic  \
0  0000997932d777bf  Explanation\nWhy the edits made under my usern...      0   
1  000103f0d9cfb60f  D'aww! He matches this background colour I'm s...      0   
2  000113f07ec002fd  Hey man, I'm really not trying to edit war. It...      0   
3  0001b41b1c6bb37e  "\nMore\nI can't make any real suggestions on ...      0   
4  0001d958c54c6e35  You, sir, are my hero. Any chance you remember...      0   

   severe_toxic  obscene  threat  insult  identity_hate  
0             0        0       0       0              0  
1             0        0       0       0              0  
2             0        0       0       0              0  
3             0        0       0       0              0  
4             0        0       0       0              0  


In [None]:
# (rows, columns) of the training frame
print(df.shape)

# Model input: the raw comment strings.
X = df['comment_text']
# Model targets: every column from 'toxic' to the last one, as a 2-D NumPy array
# (one row per comment, one column per label).
y = df.loc[:, 'toxic':].values

# Longest comment in the data. Note: str.len() counts characters, not words.
comment_lengths = df['comment_text'].str.len()
print(comment_lengths.max())




(159571, 8)
5000


In [None]:
MAX_FEATURES = 200000  # upper bound on the vocabulary size (distinct tokens kept)
SHUFFLE_SEED = 42      # fixed seed so the shuffled order is reproducible across runs

# TextVectorization turns each raw string into a fixed-length sequence of integer
# token ids, which is the form the Embedding layer consumes.
vectorizer = TextVectorization(max_tokens=MAX_FEATURES,
                               output_sequence_length=1800,  # each comment is padded/truncated to 1800 tokens
                               output_mode='int')

# Learn the vocabulary from the training comments.
vectorizer.adapt(X.values)

# Tokenize every comment once, up front.
vectorized_text = vectorizer(X.values)

# Build the tf.data pipeline from the in-memory (token ids, labels) pairs.
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
# cache() keeps the elements in memory after the first full pass over them.
dataset = dataset.cache()
# Shuffle with a buffer that covers the whole dataset so the order is fully random.
# reshuffle_each_iteration=False is essential here: the dataset is later split by
# position with take()/skip(), and shuffle() reshuffles on every new iteration by
# default. With reshuffling on, each epoch (and each of train/val/test) would see a
# different permutation, so validation/test comments would leak into training.
dataset = dataset.shuffle(buffer_size=len(X),
                          seed=SHUFFLE_SEED,
                          reshuffle_each_iteration=False)
# Batch after shuffling so every batch holds a random mix of comments.
dataset = dataset.batch(16)
# prefetch overlaps input preparation with model execution while training.
dataset = dataset.prefetch(8)







In [None]:
# Split into train (70%), validation (20%) and test (the remaining ~10%).
# `dataset` is already batched, so all sizes below are counted in batches.
# take()/skip() partition by position, so the three subsets are only disjoint if
# `dataset` yields its elements in the same order every time it is iterated.
n_batches = len(dataset)
n_train = int(n_batches * .7)
n_val = int(n_batches * .2)

train = dataset.take(n_train)
val = dataset.skip(n_train).take(n_val)
# Start the test split exactly where validation ends and keep everything that is left.
# Truncating 0.9*n and 0.1*n independently can leave batches that belong to no split.
test = dataset.skip(n_train + n_val)





In [None]:
# Bidirectional-LSTM text classifier (a recurrent network, not a CNN),
# declared as a single Sequential stack.
model = Sequential([
    # Embedding table with MAX_FEATURES+1 rows; each token id maps to a 32-d vector
    Embedding(MAX_FEATURES+1, 32),
    # Reads the sequence in both directions with 32 LSTM units per direction
    Bidirectional(LSTM(32, activation='tanh')),
    # Fully connected feature-extraction layers
    Dense(128, activation='relu'),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    # Output layer: six independent sigmoid units, one per label
    Dense(6, activation='sigmoid'),
])






In [None]:
# Compile the model: binary cross-entropy treats each sigmoid output as its own
# yes/no decision, which is what a multi-label problem needs.
model.compile(loss='BinaryCrossentropy', optimizer='Adam')

# summary() prints the layer table itself and returns None, so wrapping it in
# print() would only add a stray "None" line to the output.
model.summary()

# Train for EPOCHS passes over the training split, evaluating on the validation
# split after each epoch.
EPOCHS = 20
history = model.fit(train, epochs=EPOCHS, validation_data=val)
# Prints the History object; the per-epoch values are stored in history.history.
print(history)






NameError: ignored

In [None]:
# Plot every series recorded during training (one line per key in history.history)
# against the epoch index.
# figsize is passed to DataFrame.plot() because pandas opens its own figure when no
# axes are supplied; a separate plt.figure(...) call beforehand would only leave an
# extra empty figure behind and the requested size would not apply to the plot.
ax = pd.DataFrame(history.history).plot(figsize=(8, 5))
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
plt.show()


# Save the trained model (HDF5 file) for future use. One call is enough.
model.save('toxic_comments_model.h5')



In [None]:
# Reload the model from disk to confirm that the saved file is usable.
model = tf.keras.models.load_model('toxic_comments_model.h5')

# Vectorize one raw sentence (wrapped into a length-1 array) to smoke-test the model.
input_text = vectorizer(np.expand_dims('I am going to hit you.',0))

# Predicted per-label scores for that sentence
res = model.predict(input_text)
print(res)



# Precision, recall and accuracy on the test split.
# The labels are independent 0/1 flags, so accuracy must be element-wise
# BinaryAccuracy (scores thresholded at 0.5). CategoricalAccuracy compares argmax
# positions, which on flattened vectors yields a single meaningless comparison per batch.
pre = Precision()
re = Recall()
acc = tf.keras.metrics.BinaryAccuracy()

for X_true, y_true in test.as_numpy_iterator():
    # Predict this batch; verbose=0 avoids printing one progress bar per batch
    yhat = model.predict(X_true, verbose=0)

    # Flatten labels and predictions so each (comment, label) pair is one sample
    y_true = y_true.flatten()
    yhat = yhat.flatten()

    # Accumulate the running metric state across batches
    pre.update_state(y_true, yhat)
    re.update_state(y_true, yhat)
    acc.update_state(y_true, yhat)

print(f'Precision: {pre.result().numpy()}, Recall:{re.result().numpy()}, Accuracy:{acc.result().numpy()}')

In [None]:
def score_comment(comment):
    """Classify a single comment and report one True/False flag per label.

    The comment is vectorized with the notebook-level `vectorizer`, scored with
    the notebook-level `model`, and each score is compared against 0.5. Label
    names are taken from the columns of `df` starting at position 2.

    Args:
        comment: The raw comment text to score.

    Returns:
        A string with one "<label>: <True|False>" line per label, each
        terminated by a newline.
    """
    scores = model.predict(vectorizer([comment]))[0]
    label_names = df.columns[2:]

    report_lines = [
        '{}: {}\n'.format(label, scores[position]>0.5)
        for position, label in enumerate(label_names)
    ]
    return ''.join(report_lines)

# print("statement 1")
# print(score_comment('you are stupid.'))
# print("statement 2")
# print(score_comment('This is shit.'))
# print("statement 3")
# print(score_comment('I am going to kill You'))
# print("statement 4")
# print(score_comment('I am a student'))



# Save the label names (in the same order as the model's six outputs) for use in deployment.
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
# The with-block guarantees the file is flushed and closed; a bare open() passed
# straight to pickle.dump leaves the handle open until garbage collection.
with open('labels.pkl', 'wb') as labels_file:
    pickle.dump(labels, labels_file)