In [2]:
import sys
import os
# Put the parent of the current working directory on sys.path (once) so that
# the `src` package can be imported from inside the notebooks folder
project_root = os.path.abspath(os.pardir)
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

In [3]:
import pandas as pd
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from src.preprocess import clean_text


In [4]:
# Read the toxicity training CSV, capped at the first 50k rows so that
# training/testing stays fast
train_csv = "../datasets/toxic/archive (5)/train.csv"
row_cap = 50000
df = pd.read_csv(train_csv, nrows=row_cap)

# Quick sanity report: frame dimensions and the column names that were read
print(f"Dataset shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")

Dataset shape: (50000, 8)
Columns: ['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']


In [5]:
# Preprocess: pass every raw comment through clean_text and store the result
# in a new "cleaned" column
df["cleaned"] = df["comment_text"].apply(clean_text)

# Collapse the six per-category flag columns into one binary label:
# 1 when a row's flags sum to more than zero, 0 otherwise
toxicity_cols = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]
flag_total = df[toxicity_cols].sum(axis=1)
y = flag_total.gt(0).astype(int)
print(f"Toxicity distribution:\n{y.value_counts()}")

Toxicity distribution:
0    44845
1     5155
Name: count, dtype: int64


In [6]:
# Build TF-IDF features from the cleaned comments (vocabulary capped at 5000
# terms), learning the vocabulary and transforming in a single pass
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(df["cleaned"])

# Fit a logistic-regression classifier on the binary labels; fit() returns the
# estimator itself, so construction and training are chained.
# NOTE(review): the model is fit on every loaded row — this cell makes no
# held-out split and reports no evaluation metric.
model = LogisticRegression(max_iter=1000).fit(X, y)
print(f"Model trained. Classes: {model.classes_}")

Model trained. Classes: [0 1]


In [7]:
# Save the fitted classifier and the fitted TF-IDF vectorizer to ../models.
# The output directory is created first: joblib.dump does not create missing
# parent directories, so a fresh checkout without ../models would otherwise
# fail at this step, after training has already been done.
os.makedirs("../models", exist_ok=True)
joblib.dump(model, "../models/toxicity_model.pkl")
joblib.dump(tfidf, "../models/toxicity_vectorizer.pkl")
print("✅ Toxicity model and vectorizer saved.")

✅ Toxicity model and vectorizer saved.
