In [1]:
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import pandas as pd


In [2]:
DATA_PATH = "../data/preprocessed"

# Load preprocessed data
X_train = joblib.load(f"{DATA_PATH}/X_train.pkl")
X_test = joblib.load(f"{DATA_PATH}/X_test.pkl")
y_train = joblib.load(f"{DATA_PATH}/y_train.pkl")
y_test = joblib.load(f"{DATA_PATH}/y_test.pkl")

print("✅ Data loaded successfully")
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)


✅ Data loaded successfully
X_train shape: (5634, 19)
X_test shape: (1409, 19)


In [3]:
# Logistic Regression
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)

# Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

print("✅ Models trained successfully")


✅ Models trained successfully


In [4]:
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    return acc, prec, rec, f1, cm

# Logistic Regression Metrics
lr_metrics = evaluate_model(lr_model, X_test, y_test)
print("Logistic Regression Metrics:")
print("Accuracy:", lr_metrics[0])
print("Precision:", lr_metrics[1])
print("Recall:", lr_metrics[2])
print("F1-Score:", lr_metrics[3])
print("Confusion Matrix:\n", lr_metrics[4])

# Random Forest Metrics
rf_metrics = evaluate_model(rf_model, X_test, y_test)
print("\nRandom Forest Metrics:")
print("Accuracy:", rf_metrics[0])
print("Precision:", rf_metrics[1])
print("Recall:", rf_metrics[2])
print("F1-Score:", rf_metrics[3])
print("Confusion Matrix:\n", rf_metrics[4])


Logistic Regression Metrics:
Accuracy: 0.815471965933286
Precision: 0.677115987460815
Recall: 0.579088471849866
F1-Score: 0.6242774566473989
Confusion Matrix:
 [[933 103]
 [157 216]]

Random Forest Metrics:
Accuracy: 0.7963094393186657
Precision: 0.6616541353383458
Recall: 0.4718498659517426
F1-Score: 0.5508607198748043
Confusion Matrix:
 [[946  90]
 [197 176]]


In [8]:
# Select the best model (Random Forest - better performance)
best_model = rf_model  # Use the actual Random Forest model, not metrics
MODEL_PATH = "../data/best_model.pkl"
joblib.dump(best_model, MODEL_PATH)

print(f"✅ Best model saved at {MODEL_PATH}")
print(f"Model type: {type(best_model)}")

✅ Best model saved at ../data/best_model.pkl
Model type: <class 'sklearn.ensemble._forest.RandomForestClassifier'>
