In [10]:
import pandas as pd
import joblib
import os

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score

In [11]:
# ✅ Step 1: Load preprocessed data
# All four CSVs are read from the same directory; keep the location in one place.
DATA_DIR = "../Data/preprocessed"

# Feature tables stay as DataFrames.
X_train = pd.read_csv(f"{DATA_DIR}/X_train.csv")
X_test = pd.read_csv(f"{DATA_DIR}/X_test.csv")

# Label files are squeezed along the column axis only: a one-column frame becomes
# a Series, but a file holding a single row is not collapsed further to a scalar
# (which a bare .squeeze() would do).
y_train = pd.read_csv(f"{DATA_DIR}/y_train.csv").squeeze("columns")
y_test = pd.read_csv(f"{DATA_DIR}/y_test.csv").squeeze("columns")

In [12]:
# ✅ Step 2: Define supervised models
# One shared seed is handed to every estimator so that Restart & Run All yields
# the same fitted models, the same accuracies and the same "best" pick each time.
RANDOM_STATE = 42

# Display name -> unfitted estimator. Apart from the seed (and probability=True
# on the SVM, kept from the original), every estimator uses library defaults.
models = {
    "Logistic Regression": LogisticRegression(random_state=RANDOM_STATE),
    "SVM": SVC(probability=True, random_state=RANDOM_STATE),
    "Random Forest": RandomForestClassifier(random_state=RANDOM_STATE),
    "AdaBoost": AdaBoostClassifier(random_state=RANDOM_STATE),
    "Gradient Boost": GradientBoostingClassifier(random_state=RANDOM_STATE),
}

In [13]:
# ✅ Step 3: Train and evaluate
# Fit every candidate on the training split, score it with accuracy on the test
# split, and remember the highest-scoring estimator.
#
# NOTE(review): the winner is picked using test-set accuracy, so best_score is
# likely an optimistic estimate of how the saved model will generalize —
# consider selecting on a validation split or with cross-validation.
best_model = None
# Start below any achievable accuracy so that a model is always selected when
# `models` is non-empty, even if every candidate scores exactly 0.0.
best_score = float("-inf")
best_model_name = ""

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    score = accuracy_score(y_test, y_pred)

    print(f"🔍 {name} Accuracy: {score:.4f}")

    # Strict comparison: on a tie the earlier entry in `models` is kept.
    if score > best_score:
        best_score = score
        best_model = model
        best_model_name = name

🔍 Logistic Regression Accuracy: 0.5950
🔍 SVM Accuracy: 0.5633
🔍 Random Forest Accuracy: 0.6450
🔍 AdaBoost Accuracy: 0.5883
🔍 Gradient Boost Accuracy: 0.6300


In [14]:
# ✅ Step 4: Save best model
# Refuse to write a pickle of None: that would produce a model.pkl file that
# loads without error but holds no estimator.
if best_model is None:
    raise RuntimeError("No model was selected in Step 3; refusing to save an empty model.pkl")

model_dir = "../App/model"
model_path = os.path.join(model_dir, "model.pkl")

# Create the target directory if it is missing, then serialize the estimator.
os.makedirs(model_dir, exist_ok=True)
joblib.dump(best_model, model_path)

print("\n✅ Best Model Summary:")
print(f"Best Model: {best_model_name}")
print(f"Accuracy: {best_score:.4f}")
print(f"📁 Saved to: {model_dir}/model.pkl")


✅ Best Model Summary:
Best Model: Random Forest
Accuracy: 0.6450
📁 Saved to: ../App/model/model.pkl
