In [2]:
import pandas as pd
import numpy as numpy
import tensorflow as tf

In [3]:
train_df = pd.read_csv(r'./Datasets/trainData.csv')
train_df['toxic_comment'] = train_df[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].any(axis=1).astype(int)
train_df.drop(columns=['id', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'], inplace=True)

In [4]:
pd.set_option('display.max_colwidth', 1)
print(train_df.shape)
train_df.head()

(159571, 2)


Unnamed: 0,comment_text,toxic_comment
0,"Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27",0
1,"D'aww! He matches this background colour I'm seemingly stuck with. Thanks. (talk) 21:51, January 11, 2016 (UTC)",0
2,"Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info.",0
3,"""\nMore\nI can't make any real suggestions on improvement - I wondered if the section statistics should be later on, or a subsection of """"types of accidents"""" -I think the references may need tidying so that they are all in the exact same format ie date format etc. I can do that later on, if no-one else does first - if you have any preferences for formatting style on references or want to do it yourself please let me know.\n\nThere appears to be a backlog on articles for review so I guess there may be a delay until a reviewer turns up. It's listed in the relevant form eg Wikipedia:Good_article_nominations#Transport """,0
4,"You, sir, are my hero. Any chance you remember what page that's on?",0


In [5]:
train_df['toxic_comment'].value_counts()

toxic_comment
0    143346
1    16225 
Name: count, dtype: int64

## Text Preprocessing

#### Lower Casing

In [6]:
def lower_casing(text):
    return text.lower()

train_df['comment_text'] = train_df['comment_text'].apply(lower_casing)
train_df.head()

Unnamed: 0,comment_text,toxic_comment
0,"explanation\nwhy the edits made under my username hardcore metallica fan were reverted? they weren't vandalisms, just closure on some gas after i voted at new york dolls fac. and please don't remove the template from the talk page since i'm retired now.89.205.38.27",0
1,"d'aww! he matches this background colour i'm seemingly stuck with. thanks. (talk) 21:51, january 11, 2016 (utc)",0
2,"hey man, i'm really not trying to edit war. it's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. he seems to care more about the formatting than the actual info.",0
3,"""\nmore\ni can't make any real suggestions on improvement - i wondered if the section statistics should be later on, or a subsection of """"types of accidents"""" -i think the references may need tidying so that they are all in the exact same format ie date format etc. i can do that later on, if no-one else does first - if you have any preferences for formatting style on references or want to do it yourself please let me know.\n\nthere appears to be a backlog on articles for review so i guess there may be a delay until a reviewer turns up. it's listed in the relevant form eg wikipedia:good_article_nominations#transport """,0
4,"you, sir, are my hero. any chance you remember what page that's on?",0


#### Contractions

In [7]:
import contractions
def expand_contractions(text):
    return ' '.join([contractions.fix(word) for word in text.split()])

train_df['comment_text'] = train_df['comment_text'].apply(expand_contractions)
train_df.head()

Unnamed: 0,comment_text,toxic_comment
0,"explanation why the edits made under my username hardcore metallica fan were reverted? they were not vandalisms, just closure on some gas after i voted at new york dolls fac. and please do not remove the template from the talk page since i am retired now.89.205.38.27",0
1,"d'aww! he matches this background colour i am seemingly stuck with. thanks. (talk) 21:51, january 11, 2016 (utc)",0
2,"hey man, i am really not trying to edit war. it is just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. he seems to care more about the formatting than the actual info.",0
3,""" more i cannot make any real suggestions on improvement - i wondered if the section statistics should be later on, or a subsection of """"types of accidents"""" -i think the references may need tidying so that they are all in the exact same format ie date format etc. i can do that later on, if no-one else does first - if you have any preferences for formatting style on references or want to do it yourself please let me know. there appears to be a backlog on articles for review so i guess there may be a delay until a reviewer turns up. it is listed in the relevant form eg wikipedia:good_article_nominations#transport """,0
4,"you, sir, are my hero. any chance you remember what page that is on?",0


#### Remove Punctuations

In [8]:
import string
def remove_punctuations(text):
    return text.translate(str.maketrans('', '', string.punctuation))

train_df['comment_text'] = train_df['comment_text'].apply(remove_punctuations)
train_df.head()

Unnamed: 0,comment_text,toxic_comment
0,explanation why the edits made under my username hardcore metallica fan were reverted they were not vandalisms just closure on some gas after i voted at new york dolls fac and please do not remove the template from the talk page since i am retired now892053827,0
1,daww he matches this background colour i am seemingly stuck with thanks talk 2151 january 11 2016 utc,0
2,hey man i am really not trying to edit war it is just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page he seems to care more about the formatting than the actual info,0
3,more i cannot make any real suggestions on improvement i wondered if the section statistics should be later on or a subsection of types of accidents i think the references may need tidying so that they are all in the exact same format ie date format etc i can do that later on if noone else does first if you have any preferences for formatting style on references or want to do it yourself please let me know there appears to be a backlog on articles for review so i guess there may be a delay until a reviewer turns up it is listed in the relevant form eg wikipediagoodarticlenominationstransport,0
4,you sir are my hero any chance you remember what page that is on,0


#### Removing Numbers

In [9]:
import re
def remove_numbers_with_re(text):
    return re.sub(r'\d+', '', text)

train_df['comment_text'] = train_df['comment_text'].apply(remove_numbers_with_re)
train_df.head()

Unnamed: 0,comment_text,toxic_comment
0,explanation why the edits made under my username hardcore metallica fan were reverted they were not vandalisms just closure on some gas after i voted at new york dolls fac and please do not remove the template from the talk page since i am retired now,0
1,daww he matches this background colour i am seemingly stuck with thanks talk january utc,0
2,hey man i am really not trying to edit war it is just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page he seems to care more about the formatting than the actual info,0
3,more i cannot make any real suggestions on improvement i wondered if the section statistics should be later on or a subsection of types of accidents i think the references may need tidying so that they are all in the exact same format ie date format etc i can do that later on if noone else does first if you have any preferences for formatting style on references or want to do it yourself please let me know there appears to be a backlog on articles for review so i guess there may be a delay until a reviewer turns up it is listed in the relevant form eg wikipediagoodarticlenominationstransport,0
4,you sir are my hero any chance you remember what page that is on,0


In [10]:
train_df.head(2)

Unnamed: 0,comment_text,toxic_comment
0,explanation why the edits made under my username hardcore metallica fan were reverted they were not vandalisms just closure on some gas after i voted at new york dolls fac and please do not remove the template from the talk page since i am retired now,0
1,daww he matches this background colour i am seemingly stuck with thanks talk january utc,0


## Tokenization

In [11]:
# train_df['tokens'] = train_df['comment_text'].str.split()
# train_df[['comment_text', 'tokens']].head()

#### Copying dataset

In [12]:
copy_df = train_df
copy_df.head()

Unnamed: 0,comment_text,toxic_comment
0,explanation why the edits made under my username hardcore metallica fan were reverted they were not vandalisms just closure on some gas after i voted at new york dolls fac and please do not remove the template from the talk page since i am retired now,0
1,daww he matches this background colour i am seemingly stuck with thanks talk january utc,0
2,hey man i am really not trying to edit war it is just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page he seems to care more about the formatting than the actual info,0
3,more i cannot make any real suggestions on improvement i wondered if the section statistics should be later on or a subsection of types of accidents i think the references may need tidying so that they are all in the exact same format ie date format etc i can do that later on if noone else does first if you have any preferences for formatting style on references or want to do it yourself please let me know there appears to be a backlog on articles for review so i guess there may be a delay until a reviewer turns up it is listed in the relevant form eg wikipediagoodarticlenominationstransport,0
4,you sir are my hero any chance you remember what page that is on,0


#### Stats

In [13]:
s = 0.0
for i in copy_df['comment_text']:
    word_list = i.split()
    s = s + len(word_list)
print("Average length of each comment : ",s/copy_df.shape[0])
pos = 0
for i in range(copy_df.shape[0]):
    if copy_df.iloc[i]['toxic_comment'] == 0:
        pos = pos + 1
neg = copy_df.shape[0]-pos
print("Percentage of positive comments is "+str(pos/copy_df.shape[0]*100)+"%")
print("Percentage of negative comments is "+str(neg/copy_df.shape[0]*100)+"%")

Average length of each comment :  66.55961922905792
Percentage of positive comments is 89.83211235124176%
Percentage of negative comments is 10.167887648758233%


### train_test_split

In [54]:
from sklearn.model_selection import train_test_split
train_sentences, test_sentences, train_labels, test_labels = train_test_split(copy_df['comment_text'].values, copy_df['toxic_comment'].values, test_size=0.2, random_state=42)

In [55]:
train_sentences[0]

'grandma terri should burn in trash grandma terri is trash i hate grandma terri fk her to hell '

In [56]:
train_labels[0]

np.int64(1)

In [57]:
test_sentences[0:5]

array(['geez are you forgetful we have already discussed why marx was not an anarchist ie he wanted to use a state to mold his socialist man ergo he is a statist  the opposite of an anarchist i know a guy who says that when he gets old and his teeth fall out he will quit eating meat would you call him a vegetarian',
       'carioca rfa thanks for your support on my request for adminship the final outcome was  so i am now an administrator if you have any comments or concerns on my actions as an administrator please let me know thank you',
       ' birthday no worries it is what i do enjoy you are daytalke ',
       'pseudoscience category i am assuming that this article is in the pseudoscience category because of its association with creationism however there are modern scientificallyaccepted variants of catastrophism that have nothing to do with creationism — and they are even mentioned in the article i think the connection to pseudoscience needs to be clarified or the article made mor

In [58]:
test_labels[:5]

array([0, 0, 0, 0, 0])

# Model

## Embeddings

In [73]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pickle

# Functions for preprocessing
def tokenize_text(df, max_num_words=20000):
    comments = df.astype(str).tolist()
    tokenizer = Tokenizer(num_words=max_num_words, oov_token="<OOV>")
    tokenizer.fit_on_texts(comments)
    sequences = tokenizer.texts_to_sequences(comments)
    return tokenizer, sequences


In [74]:
def pad_text_sequences(sequences, max_sequence_length=100):
    return pad_sequences(sequences, maxlen=max_sequence_length, padding='post', truncating='post')

## From Here

In [75]:
type(train_sentences)

numpy.ndarray

In [76]:
# Example Usage
train_tokenizer, train_sequences = tokenize_text(train_sentences)
train_padded_sequences = pad_text_sequences(train_sequences, max_sequence_length=100)

test_tokenizer, test_sequences = tokenize_text(test_sentences)
test_padded_sequences = pad_text_sequences(test_sequences, max_sequence_length=100)


In [77]:
print(len(train_tokenizer.word_index)+1)

200619


#### Getting shape

In [78]:
print("Train Sentences Shape: ", train_sentences.shape)
print("Train Labels Shape: ", train_labels.shape)
print("Test Sentences Shape: ", test_sentences.shape)
print("Test Labels Shape: ", test_labels.shape)

Train Sentences Shape:  (127656,)
Train Labels Shape:  (127656,)
Test Sentences Shape:  (31915,)
Test Labels Shape:  (31915,)


## Model Training

In [80]:
# Build the LSTM model and train the model
vocab_size = len(train_tokenizer.word_index) + 1  # Include OOV token
embedding_dim = 50
max_sequence_length = 100
output_dim = 1

model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_sequence_length),
    Bidirectional(LSTM(64)),
    Dense(24, activation='relu'),
    Dense(1, activation='sigmoid')
])


model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# model.summary()

In [81]:
model.fit(train_padded_sequences, train_labels, epochs=5, batch_size=64, validation_split=0.2)

Epoch 1/5
[1m1596/1596[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m249s[0m 153ms/step - accuracy: 0.9257 - loss: 0.2162 - val_accuracy: 0.9625 - val_loss: 0.1080
Epoch 2/5
[1m1596/1596[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m239s[0m 150ms/step - accuracy: 0.9674 - loss: 0.0903 - val_accuracy: 0.9618 - val_loss: 0.1097
Epoch 3/5
[1m1596/1596[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m226s[0m 141ms/step - accuracy: 0.9728 - loss: 0.0730 - val_accuracy: 0.9599 - val_loss: 0.1145
Epoch 4/5
[1m1596/1596[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m221s[0m 138ms/step - accuracy: 0.9780 - loss: 0.0601 - val_accuracy: 0.9595 - val_loss: 0.1224
Epoch 5/5
[1m1596/1596[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m220s[0m 138ms/step - accuracy: 0.9827 - loss: 0.0465 - val_accuracy: 0.9562 - val_loss: 0.1522


<keras.src.callbacks.history.History at 0x251634bd0d0>

In [85]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

# Evaluate the model
test_pred = model.predict(test_padded_sequences)

# Get labels based on probability 1 if p>= 0.5 else 0
pred_labels = []
for i in test_pred:
    if i >= 0.5:
        pred_labels.append(1)
    else:
        pred_labels.append(0)
print("Accuracy of prediction on test set : ", accuracy_score(test_labels,pred_labels))

[1m998/998[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 12ms/step
Accuracy of prediction on test set :  0.8007519974933417


In [86]:
# Save the model and tokenizer
model.save("lstm_toxicity_model.h5")



In [87]:
with open("tokenizer.pkl", "wb") as file:
    pickle.dump(train_tokenizer, file)

In [88]:
with open("pad_text_sequences.pkl", "wb") as file:
    pickle.dump(train_padded_sequences, file)

In [100]:
# Predict on new comments
user_input1 = ["You are the best person ever!"]
user_input = ["you are black person"]
tokenizer = pickle.load(open("tokenizer.pkl", "rb"))
user_sequences = tokenizer.texts_to_sequences(user_input)
user_padded = pad_text_sequences(user_sequences, max_sequence_length=100)

predictions = model.predict(user_padded)
print("Predictions:", predictions)

threshold = 0.5
classification = "toxic" if predictions[0][0] > threshold else "non_toxic"
print("Classification:", classification)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
Predictions: [[0.96041954]]
Classification: toxic
