In [6]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import joblib


file_path = 'processed_data2.csv'
df_resampled = pd.read_csv(file_path)


df_resampled['cleaned_comment'] = df_resampled['cleaned_comment'].fillna('')

# Separate features and labels
features = df_resampled['cleaned_comment']
labels = df_resampled['labels']

# Vectorize the text data using TF-IDF
tfidf = TfidfVectorizer()
features_vectorized = tfidf.fit_transform(features)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features_vectorized, labels, test_size=0.2, random_state=42)

# Initialize the models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Support Vector Machine": SVC(probability=True, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42)
}

# Train and evaluate each model
for model_name, model in models.items():
    print(f"Training {model_name}...")
    model.fit(X_train, y_train)
    
    print(f"Evaluating {model_name}...")
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]
    
    print(classification_report(y_test, y_pred))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print(f"ROC-AUC Score: {roc_auc_score(y_test, y_prob):.4f}")
    print("\n")

    # Save the model
    joblib.dump(model, f"{model_name.lower().replace(' ', '_')}.joblib")

# Save the TF-IDF vectorizer
joblib.dump(tfidf, "tfidf_vectorizer.joblib")


Training Logistic Regression...
Evaluating Logistic Regression...
              precision    recall  f1-score   support

           0       0.73      0.88      0.80      2182
           1       0.80      0.59      0.68      1711

    accuracy                           0.75      3893
   macro avg       0.77      0.74      0.74      3893
weighted avg       0.76      0.75      0.75      3893

Confusion Matrix:
[[1927  255]
 [ 700 1011]]
ROC-AUC Score: 0.8368


Training Decision Tree...
Evaluating Decision Tree...
              precision    recall  f1-score   support

           0       0.71      0.76      0.73      2182
           1       0.66      0.61      0.63      1711

    accuracy                           0.69      3893
   macro avg       0.69      0.68      0.68      3893
weighted avg       0.69      0.69      0.69      3893

Confusion Matrix:
[[1657  525]
 [ 672 1039]]
ROC-AUC Score: 0.6819


Training Random Forest...
Evaluating Random Forest...
              precision    recall 

['tfidf_vectorizer.joblib']