In [9]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [11]:
import pandas as pd
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, log_loss, zero_one_loss
from sklearn.metrics import confusion_matrix, roc_curve, auc, precision_score, recall_score
from sklearn.preprocessing import StandardScaler

# Data Loading (replace with your CSV file path)
data = pd.read_csv('/content/drive/MyDrive/train.csv')

# Preprocessing with refined stemming
def preprocess_text(text):
    tokens = nltk.word_tokenize(text)
    tokens = [t.lower() for t in tokens]  # Convert to lowercase
    stop_words = set(stopwords.words('english'))
    tokens = [t for t in tokens if t not in stop_words]  # Remove stop words

    stemmer = SnowballStemmer('english')
    tokens = [stemmer.stem(t) for t in tokens if len(t) < 20]  # Limit word length

    return tokens  # Return the list of tokens

# Preprocessing (updated)
processed_text = data['comment_text'].apply(preprocess_text)
processed_text = processed_text.apply(' '.join)  # Join tokens back into text

# Choose your target column (Focus on 'toxic' for this example)
target_column = 'toxic'

# Feature generation using TF-IDF
vectorizer = TfidfVectorizer()
features = vectorizer.fit_transform(processed_text)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, data[target_column], test_size=0.3, random_state=51)

# Scale the features
scaler = StandardScaler(with_mean=False)  # explicitly specify with_mean=False
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Function to calculate and print metrics
def calculate_metrics(model_name, model, X_test, y_test):
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)
    logloss = log_loss(y_test, model.predict_proba(X_test))
    zero_one = zero_one_loss(y_test, predictions)
    cm = confusion_matrix(y_test, predictions)
    precision = precision_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    fpr, tpr, _ = roc_curve(y_test, model.predict_proba(X_test)[:,1])
    roc_auc = auc(fpr, tpr)

    print(f"\nMetrics for {model_name}:")
    print("Accuracy:", accuracy)
    print("F1-Score:", f1)
    print("Log Loss:", logloss)
    print("Zero-One Loss:", zero_one)
    print("Confusion Matrix:",cm)
    print("Precision Score:", precision)
    print("Recall Score:", recall)
    print("ROC AUC Score:", roc_auc)
    print("\n\n")

# Model 1: Multinomial Naive Bayes
model_nb = MultinomialNB()
model_nb.fit(X_train, y_train)

# Model 2: Logistic Regression with increased max_iter and scaled data
model_lr = LogisticRegression(random_state=51, max_iter=1000) # Increase max_iter
model_lr.fit(X_train_scaled, y_train)

# Model 3: Support Vector Machine (SVM) with probability estimates enabled
model_svm = SVC(random_state=51, probability=True)  # Enable probability estimates
model_svm.fit(X_train, y_train)

calculate_metrics("Multinomial Naive Bayes", model_nb, X_test, y_test)
calculate_metrics("Logistic Regression (Scaled)", model_lr, X_test_scaled, y_test)
calculate_metrics("SVM", model_svm, X_test, y_test)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Metrics for Multinomial Naive Bayes:
Accuracy: 0.9177598596256684
F1-Score: 0.27186979840946923
Log Loss: 0.3152520985207055
Zero-One Loss: 0.08224014037433158
Confusion Matrix: [[43200    14]
 [ 3923   735]]
Precision Score: 0.9813084112149533
Recall Score: 0.15779304422498927
ROC AUC Score: 0.870263676019152




Metrics for Logistic Regression (Scaled):
Accuracy: 0.9279954879679144
F1-Score: 0.6342705570291777
Log Loss: 0.7320115727709224
Zero-One Loss: 0.07200451203208558
Confusion Matrix: [[41436  1778]
 [ 1669  2989]]
Precision Score: 0.6270190895741556
Recall Score: 0.641691713181623
ROC AUC Score: 0.911191897819956




Metrics for SVM:
Accuracy: 0.9572192513368984
F1-Score: 0.7370988446726572
Log Loss: 0.11721292198248864
Zero-One Loss: 0.04278074866310155
Confusion Matrix: [[42953   261]
 [ 1787  2871]]
Precision Score: 0.9166666666666666
Recall Score: 0.6163589523400601
ROC AUC Score: 0.9709648595386461



