In [11]:
# 📌 1. IMPORT LIBRARIES
# =======================================
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

from sklearn.metrics import accuracy_score, classification_report
from scipy.sparse import hstack

print("✅ Libraries imported.")


✅ Libraries imported.


In [12]:
# 📌 2. LOAD PREPROCESSED DATA
# =======================================
# Feature frames are kept as DataFrames; label files are flattened to 1-D
# arrays so they can be passed directly as `y` to the estimators below.
X_train, X_test = (
    pd.read_csv(csv_path)
    for csv_path in ('../Data/X_train.csv', '../Data/X_test.csv')
)
y_train, y_test = (
    pd.read_csv(csv_path).values.ravel()
    for csv_path in ('../Data/y_train.csv', '../Data/y_test.csv')
)

print("✅ Data loaded.")
print(f"X_train: {X_train.shape}, X_test: {X_test.shape}")



✅ Data loaded.
X_train: (12, 2), X_test: (4, 2)


In [13]:
# 📌 3. VECTORIZER (TF-IDF)
# =======================================
# An empty comment is written to CSV as an empty field and read back by
# pandas as NaN; TfidfVectorizer raises on NaN documents, so coerce the text
# column back to plain strings before vectorizing.
train_text = X_train['clean_comment'].fillna('').astype(str)
test_text = X_test['clean_comment'].fillna('').astype(str)

# Fit the vocabulary on the training split only, then reuse it for the test
# split so no test-set information leaks into the features.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(train_text)
X_test_vec = vectorizer.transform(test_text)

# Add platform feature as one extra column to the right of the TF-IDF terms.
# format='csr' avoids the default COO result, which does not support row
# indexing and would be converted again by every estimator.
X_train_final = hstack(
    (X_train_vec, np.array(X_train['platform_encoded']).reshape(-1, 1)),
    format='csr',
)
X_test_final = hstack(
    (X_test_vec, np.array(X_test['platform_encoded']).reshape(-1, 1)),
    format='csr',
)

print("✅ Vectorization done.")
print(f"Vectorized shape: {X_train_final.shape}")


✅ Vectorization done.
Vectorized shape: (12, 28)


In [14]:
# 📌 4. TRAIN SUPERVISED MODELS
# =======================================

def fit_and_report(name, model, X_fit, y_fit, X_eval, y_eval):
    """Fit `model`, score it on the evaluation split and print a report.

    Parameters
    ----------
    name : str
        Display name printed above the metrics.
    model : estimator
        Unfitted scikit-learn style classifier; it is fitted in place.
    X_fit, y_fit
        Training features and labels passed to `model.fit`.
    X_eval, y_eval
        Held-out features and labels used for prediction and scoring.

    Returns
    -------
    tuple
        `(predictions, accuracy)` for the evaluation split.
    """
    model.fit(X_fit, y_fit)
    predictions = model.predict(X_eval)
    accuracy = accuracy_score(y_eval, predictions)
    print(f"\n✅ {name}")
    print(f"Accuracy: {accuracy:.4f}")
    # zero_division=0 keeps the reported 0.00 for classes that were never
    # predicted but suppresses the UndefinedMetricWarning noise.
    print(classification_report(y_eval, predictions, zero_division=0))
    return predictions, accuracy


# Same train/test split for every model so the accuracies are comparable.
eval_splits = (X_train_final, y_train, X_test_final, y_test)

# Logistic Regression
lr = LogisticRegression(max_iter=1000, class_weight='balanced')
lr_pred, lr_acc = fit_and_report("Logistic Regression", lr, *eval_splits)

# SVM
# random_state seeds the internal shuffling used for probability estimates,
# making predict_proba reproducible like the other seeded models.
svm = SVC(probability=True, class_weight='balanced', random_state=42)
svm_pred, svm_acc = fit_and_report("SVM", svm, *eval_splits)

# Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_pred, rf_acc = fit_and_report("Random Forest", rf, *eval_splits)

# AdaBoost
ada = AdaBoostClassifier(n_estimators=100, random_state=42)
ada_pred, ada_acc = fit_and_report("AdaBoost", ada, *eval_splits)

# Gradient Boosting
gb = GradientBoostingClassifier(n_estimators=100, random_state=42)
gb_pred, gb_acc = fit_and_report("Gradient Boosting", gb, *eval_splits)




✅ Logistic Regression
Accuracy: 0.2500
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.33      0.50      0.40         2

    accuracy                           0.25         4
   macro avg       0.17      0.25      0.20         4
weighted avg       0.17      0.25      0.20         4


✅ SVM
Accuracy: 0.2500
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.33      0.50      0.40         2

    accuracy                           0.25         4
   macro avg       0.17      0.25      0.20         4
weighted avg       0.17      0.25      0.20         4


✅ Random Forest
Accuracy: 0.5000
              precision    recall  f1-score   support

           0       0.50      1.00      0.67         2
           1       0.00      0.00      0.00         2

    accuracy                           0.50         4
   macro avg       0.25      0.50      0.33         4
weighted avg       0.25      0.50      0.33         4

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])



✅ AdaBoost
Accuracy: 0.2500
              precision    recall  f1-score   support

           0       0.33      0.50      0.40         2
           1       0.00      0.00      0.00         2

    accuracy                           0.25         4
   macro avg       0.17      0.25      0.20         4
weighted avg       0.17      0.25      0.20         4


✅ Gradient Boosting
Accuracy: 0.2500
              precision    recall  f1-score   support

           0       0.33      0.50      0.40         2
           1       0.00      0.00      0.00         2

    accuracy                           0.25         4
   macro avg       0.17      0.25      0.20         4
weighted avg       0.17      0.25      0.20         4



In [15]:
# 📌 5. STORE MODELS WITH SCORES
# =======================================
# Registry mapping each display name to a (fitted estimator, test accuracy)
# pair. NOTE(review): the original header said this cell "fixes the error";
# presumably a later cell looks models up through this dict — confirm.
models = {
    label: (estimator, accuracy)
    for label, estimator, accuracy in [
        ("Logistic Regression", lr, lr_acc),
        ("SVM", svm, svm_acc),
        ("Random Forest", rf, rf_acc),
        ("AdaBoost", ada, ada_acc),
        ("Gradient Boosting", gb, gb_acc),
    ]
}

print("\n✅ All models trained & stored.")
print("Stored models: ", list(models))


✅ All models trained & stored.
Stored models:  ['Logistic Regression', 'SVM', 'Random Forest', 'AdaBoost', 'Gradient Boosting']
