In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score



In [2]:
# Load dataset
data = pd.read_csv('train.csv')

# Combine toxicity labels
data['is_toxic'] = data[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].max(axis=1)



In [3]:
data.shape

(159571, 9)

In [4]:
data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,is_toxic
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,0


In [3]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\prith\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\prith\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\prith\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [4]:

def preprocess_text(text):
    import re
    from nltk.corpus import stopwords
    from nltk.stem import WordNetLemmatizer
    
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    

    text = re.sub(r'<.*?>', '', text)

    text = text.lower()
  
    text = re.sub(r'[^a-z\s]', '', text)

    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words])
    return text

data['comment_text'] = data['comment_text'].apply(preprocess_text)

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(data['comment_text'], data['is_toxic'], test_size=0.2, random_state=42)


In [11]:
X_train.shape

(127656,)

In [12]:
X_test.shape

(31915,)

In [6]:
X_train[:5]

140030    grandma terri burn trash grandma terri trash h...
159124    may utc would easiest admit member involved po...
60006     objectivity discussion doubtful nonexistent in...
65432                             shelly shock shelly shock
154979    care refer ong teng cheong talk page la goutte...
Name: comment_text, dtype: object

In [18]:
X_test[:5]

119105    geez forgetful weve already discussed marx ana...
131631    carioca rfa thanks support request adminship f...
125326                     birthday worry enjoy ur daytalke
111256    pseudoscience category im assuming article pse...
83590     phrase exists would provided search engine eve...
Name: comment_text, dtype: object

In [5]:
tfidf = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))  
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


In [20]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_tfidf, y_train)
y_pred_rf = rf.predict(X_test_tfidf)

print("Random Forest")
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("F1 Score:", f1_score(y_test, y_pred_rf))
print("ROC AUC:", roc_auc_score(y_test, y_pred_rf))

Random Forest
Accuracy: 0.9573241422528591
F1 Score: 0.7632950990615223
ROC AUC: 0.8329951069966297


In [21]:
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB()
nb.fit(X_train_tfidf, y_train)
y_pred_nb = nb.predict(X_test_tfidf)

print("Naive Bayes")
print("Accuracy:", accuracy_score(y_test, y_pred_nb))
print("F1 Score:", f1_score(y_test, y_pred_nb))
print("ROC AUC:", roc_auc_score(y_test, y_pred_nb))


Naive Bayes
Accuracy: 0.9482688391038696
F1 Score: 0.6770975943672991
ROC AUC: 0.7643936336552688


In [22]:
from sklearn.linear_model import Perceptron

perc = Perceptron(random_state=42)
perc.fit(X_train_tfidf, y_train)
y_pred_perc = perc.predict(X_test_tfidf)

print("Perceptron")
print("Accuracy:", accuracy_score(y_test, y_pred_perc))
print("F1 Score:", f1_score(y_test, y_pred_perc))
print("ROC AUC:", roc_auc_score(y_test, y_pred_perc))


Perceptron
Accuracy: 0.9368322105592981
F1 Score: 0.7047451669595782
ROC AUC: 0.850295064794137


In [23]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(random_state=42)
lr.fit(X_train_tfidf, y_train)
y_pred_lr = lr.predict(X_test_tfidf)

print("Logistic Regression")
print("Accuracy:", accuracy_score(y_test, y_pred_lr))
print("F1 Score:", f1_score(y_test, y_pred_lr))
print("ROC AUC:", roc_auc_score(y_test, y_pred_lr))


Logistic Regression
Accuracy: 0.9574181419395269
F1 Score: 0.751417596488019
ROC AUC: 0.813637234717896


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [8]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train_tfidf, y_train)
y_pred_dt = dt.predict(X_test_tfidf)

print("Decision Tree")
print("Accuracy:", accuracy_score(y_test, y_pred_dt))
print("F1 Score:", f1_score(y_test, y_pred_dt))
print("ROC AUC:", roc_auc_score(y_test, y_pred_dt))


Decision Tree
Accuracy: 0.9426915243615854
F1 Score: 0.7117415287628053
ROC AUC: 0.8333258609160148


In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron

In [10]:
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

rf = RandomForestClassifier(random_state=42)
lr = LogisticRegression(random_state=42)
perc = Perceptron(random_state=42)

voting_clf = VotingClassifier(estimators=[
    ('Random Forest', rf),
    ('Logistic Regression', lr),
    ('Perceptron', perc)
], voting='hard')  

voting_clf.fit(X_train_tfidf, y_train)

y_pred_voting = voting_clf.predict(X_test_tfidf)

print("Voting Classifier (Hard Voting)")
print("Accuracy:", accuracy_score(y_test, y_pred_voting))
print("F1 Score:", f1_score(y_test, y_pred_voting))
print("ROC AUC:", roc_auc_score(y_test, y_pred_voting))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Voting Classifier (Hard Voting)
Accuracy: 0.9598934670217766
F1 Score: 0.775201966982789
ROC AUC: 0.8359287296533604


In [7]:
from sklearn.ensemble import AdaBoostClassifier

In [10]:

from sklearn.tree import DecisionTreeClassifier

adaboost = AdaBoostClassifier(
    base_estimator=DecisionTreeClassifier(max_depth=2),  
    n_estimators=50,  
    learning_rate=1,  
    random_state=42
)
adaboost.fit(X_train_tfidf, y_train)
y_pred_adaboost = adaboost.predict(X_test_tfidf)

print("AdaBoost")
print("Accuracy:", accuracy_score(y_test, y_pred_adaboost))
print("F1 Score:", f1_score(y_test, y_pred_adaboost))
print("ROC AUC:", roc_auc_score(y_test, y_pred_adaboost))




AdaBoost
Accuracy: 0.9504934983550055
F1 Score: 0.7125181950509463
ROC AUC: 0.7966607842077266


In [11]:
from sklearn.model_selection import RandomizedSearchCV

# Define the parameter grid
param_dist = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1],
    'base_estimator__max_depth': [1, 2, 3],
    'base_estimator__min_samples_split': [2, 5, 10]
}

# Reduce to 10 combinations
random_search = RandomizedSearchCV(
    estimator=adaboost,
    param_distributions=param_dist,
    cv=3,
    scoring='f1',
    n_iter=10,  # Reduce to 10 combinations
    n_jobs=-1,
    random_state=42,
    verbose=2  # Enables detailed progress
)

# Fit the random search
random_search.fit(X_train_tfidf, y_train)

# Print the best parameters and score
print("Best Parameters:", random_search.best_params_)
print("Best F1 Score:", random_search.best_score_)

# Get the best model
best_adaboost = random_search.best_estimator_



Fitting 3 folds for each of 10 candidates, totalling 30 fits




Best Parameters: {'n_estimators': 100, 'learning_rate': 1, 'base_estimator__min_samples_split': 5, 'base_estimator__max_depth': 3}
Best F1 Score: 0.7346077465027278


In [12]:
y_pred_best_adaboost = best_adaboost.predict(X_test_tfidf)

print("Tuned AdaBoost")
print("Accuracy:", accuracy_score(y_test, y_pred_best_adaboost))
print("F1 Score:", f1_score(y_test, y_pred_best_adaboost))
print("ROC AUC:", roc_auc_score(y_test, y_pred_best_adaboost))

Tuned AdaBoost
Accuracy: 0.9529061569794768
F1 Score: 0.7400103788271924
ROC AUC: 0.8227447620935


In [13]:
import pandas as pd
import torch
from transformers import BertTokenizer
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset

In [14]:
texts = data['comment_text'].tolist()
labels = data['is_toxic'].values

In [None]:
from transformers import BertTokenizer

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize and encode text
max_seq_length = 100
encodings = tokenizer(
    texts,
    padding="max_length",
    truncation=True,
    max_length=max_seq_length,
    return_tensors="pt"
)

# Features and labels
X = encodings['input_ids']
attention_masks = encodings['attention_mask']
y = torch.tensor(labels)


100%|██████████| 231508/231508 [00:03<00:00, 75102.19B/s]


TypeError: 'BertTokenizer' object is not callable

In [None]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
masks_train, masks_test = train_test_split(attention_masks, test_size=0.2, random_state=42)

# Create DataLoader
batch_size = 32
train_data = DataLoader(TensorDataset(X_train, masks_train, y_train), batch_size=batch_size, shuffle=True)
test_data = DataLoader(TensorDataset(X_test, masks_test, y_test), batch_size=batch_size)


In [None]:
import torch.nn as nn

class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, pad_idx):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        embedded = self.embedding(input_ids)
        lstm_out, (hidden, _) = self.lstm(embedded)
        output = self.fc(hidden[-1])
        return self.sigmoid(output)

# Model Parameters
vocab_size = tokenizer.vocab_size
embed_dim = 128
hidden_dim = 128
output_dim = 1  # Binary classification
pad_idx = tokenizer.pad_token_id

# Initialize the model
model = LSTMClassifier(vocab_size, embed_dim, hidden_dim, output_dim, pad_idx)


In [None]:
import torch.optim as optim

# Move model to device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Loss and optimizer
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
epochs = 5
for epoch in range(epochs):
    model.train()
    total_loss = 0

    for batch in train_data:
        input_ids, masks, labels = [x.to(device) for x in batch]

        optimizer.zero_grad()
        predictions = model(input_ids, masks).squeeze(1)
        loss = criterion(predictions, labels.float())
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_data)}")


In [None]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

# Evaluate the model
model.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for batch in test_data:
        input_ids, masks, labels = [x.to(device) for x in batch]
        predictions = model(input_ids, masks).squeeze(1)
        all_preds.extend(predictions.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Convert predictions to binary
all_preds = [1 if p > 0.5 else 0 for p in all_preds]

# Metrics
accuracy = accuracy_score(all_labels, all_preds)
f1 = f1_score(all_labels, all_preds)
roc_auc = roc_auc_score(all_labels, all_preds)

print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")
print(f"ROC AUC: {roc_auc}")
