In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
train_data = pd.read_csv("train.csv")

In [3]:
test_data = pd.read_csv("test.csv")

In [5]:
train_data.shape

(159571, 8)

In [6]:
test_data.shape

(153164, 2)

In [8]:
train_data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [9]:
test_data.head()

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [11]:
# check missing values
train_data.isnull().sum()

id               0
comment_text     0
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
dtype: int64

In [12]:
test_data.isnull().sum()

id              0
comment_text    0
dtype: int64

In [16]:
print("\nTrain columns:", train_data.columns)
print("\nTest columns:", test_data.columns)


Train columns: Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate'],
      dtype='object')

Test columns: Index(['id', 'comment_text'], dtype='object')


In [14]:
test_data.columns

Index(['id', 'comment_text'], dtype='object')

In [36]:
# define the target column
target_col = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

### Step 1: Text Preprocessing (Cleaning)

#### cleaning function 

In [22]:
import re

def clean_text(text):
    text = str(text).lower()  # covert all words in lower case
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)  #remove links
    text = re.sub(r"[^a-z\s]", "", text)  # keep only letters
    text = re.sub(r"\s+", " ", text).strip()# remove extra space
    return text

In [23]:
train_data["clean_comment"] =train_data["comment_text"].apply(clean_text)
test_data["clean_comment"] = test_data["comment_text"].apply(clean_text)

In [25]:
train_data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,clean_comment
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,explanation why the edits made under my userna...
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,daww he matches this background colour im seem...
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,hey man im really not trying to edit war its j...
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,more i cant make any real suggestions on impro...
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,you sir are my hero any chance you remember wh...


### Stopword Removal

In [26]:
import nltk
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PRIYANKA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [27]:
from nltk.corpus import stopwords

stops_words = set(stopwords.words("english"))

def remove_stopwords(text):
    words = text.split()
    words = [w for w in words if w not in stops_words]
    return " ".join(words)

In [28]:
train_data["final_comment"] = train_data["clean_comment"].apply(remove_stopwords)
test_data["final_comment"] = test_data["clean_comment"].apply(remove_stopwords)

### tokenization 

In [30]:
sample = train_data["final_comment"].iloc[0]
print(sample)
print(sample.split())

explanation edits made username hardcore metallica fan reverted werent vandalisms closure gas voted new york dolls fac please dont remove template talk page since im retired
['explanation', 'edits', 'made', 'username', 'hardcore', 'metallica', 'fan', 'reverted', 'werent', 'vandalisms', 'closure', 'gas', 'voted', 'new', 'york', 'dolls', 'fac', 'please', 'dont', 'remove', 'template', 'talk', 'page', 'since', 'im', 'retired']


### Vectorization (TF-IDF)

In [39]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

x = train_data["final_comment"]
y =train_data[target_col]

# Split train into train + validation
X_train, X_val, y_train, y_val = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=8000)

# Fit on train, transform val + test
X_train_vec = vectorizer.fit_transform(X_train)
X_val_vec   = vectorizer.transform(X_val)
X_test_vec  = vectorizer.transform(test_data["final_comment"])

print("Train Vector shape:", X_train_vec.shape)
print("Val Vector shape:", X_val_vec.shape)
print("Test Vector shape:", X_test_vec.shape)

Train Vector shape: (127656, 8000)
Val Vector shape: (31915, 8000)
Test Vector shape: (153164, 8000)


### LSTM Model

#### Prepare Input (Tokenizer + Padding)

In [40]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [43]:
x = train_data["final_comment"].astype(str)
y =train_data[target_col].values


# Split train into train + validation
X_train, X_val, y_train, y_val = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Tokenizer
max_words = 20000
tokenizer = Tokenizer(num_words = max_words ,oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

# convert text to sequences
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq = tokenizer.texts_to_sequences(X_val)

# padding
max_len = 150
X_train_pad = pad_sequences(X_train_seq , maxlen = max_len , padding = "post" , truncating = "post")
X_val_pad = pad_sequences(X_val_seq , maxlen = max_len , padding = "post" , truncating = "post")

In [44]:
print("Train padded shape:", X_train_pad.shape)
print("Val padded shape:", X_val_pad.shape)

Train padded shape: (127656, 150)
Val padded shape: (31915, 150)


### Build LSTM Deep Learning Model

In [46]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional

model = Sequential([
    Embedding(input_dim = max_words, output_dim = 128 , input_length = max_len),
    Bidirectional(LSTM(64 , return_sequences = False)),
    Dropout(0.3),
    Dense(64, activation = "relu"),
    Dropout(0.3),
    Dense(6, activation = "sigmoid")
])



In [53]:
model.compile(
    loss="binary_crossentropy",
    optimizer = "adam",
    metrics = ["accuracy"]
)
model.build(input_shape=(None, max_len))

In [54]:
model.summary()

In [55]:
history = model.fit(X_train_pad , y_train, 
                    validation_data = (X_val_pad, y_val),
                    epochs = 3,
                    batch_size = 128
                   )

Epoch 1/3
[1m998/998[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m416s[0m 402ms/step - accuracy: 0.7025 - loss: 0.1241 - val_accuracy: 0.9941 - val_loss: 0.0504
Epoch 2/3
[1m998/998[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m789s[0m 791ms/step - accuracy: 0.9830 - loss: 0.0495 - val_accuracy: 0.9941 - val_loss: 0.0500
Epoch 3/3
[1m998/998[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m445s[0m 446ms/step - accuracy: 0.9727 - loss: 0.0424 - val_accuracy: 0.9941 - val_loss: 0.0511


In [58]:
from sklearn.metrics import classification_report

# Predict probabilities
y_pred_prob = model.predict(X_val_pad)

# Convert prob -> 0/1
y_pred = (y_pred_prob > 0.5).astype(int)

print(classification_report(y_val ,y_pred, target_names = target_col))

[1m998/998[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 36ms/step
               precision    recall  f1-score   support

        toxic       0.80      0.76      0.78      3056
 severe_toxic       0.61      0.23      0.33       321
      obscene       0.78      0.81      0.79      1715
       threat       0.00      0.00      0.00        74
       insult       0.72      0.64      0.68      1614
identity_hate       0.00      0.00      0.00       294

    micro avg       0.77      0.68      0.72      7074
    macro avg       0.48      0.41      0.43      7074
 weighted avg       0.73      0.68      0.70      7074
  samples avg       0.07      0.06      0.06      7074



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [59]:
import os
os.makedirs("model", exist_ok=True)

model.save("model/toxicity_model.h5")
print(" Model saved!")



✅ Model saved!


In [60]:
import pickle

with open("model/tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)

print("✅ Tokenizer saved!")

✅ Tokenizer saved!
