In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
import joblib
import pickle
import os

# ---------------- Load dataset ----------------
df = pd.read_csv(r"/home/cloud/Downloads/ML Assignment 2/Churn_Modelling.csv.csv")

# Drop the three columns that are not used as features. Rebinding `df`
# (instead of inplace=True) leaves `df` in the same final state.
df = df.drop(columns=["RowNumber", "CustomerId", "Surname"])

# Features are every remaining column except the target "Exited".
X = df.drop("Exited", axis=1)
y = df["Exited"]

# Integer-encode the two categorical columns.
# Each column gets its own LabelEncoder: refitting a single shared instance
# on Gender would overwrite the Geography mapping, leaving no fitted object
# that can encode new Geography values or inverse-transform them later.
# The encoded values written into X are the same as before.
# NOTE(review): scikit-learn documents LabelEncoder for target labels; for
# nominal features OneHotEncoder/OrdinalEncoder is the recommended tool.
# Switching would change the metrics printed below, so it is left as-is.
encoders = {}
for column in ("Geography", "Gender"):
    encoders[column] = LabelEncoder()
    X[column] = encoders[column].fit_transform(X[column])

# Backward-compatible alias: previously `le` ended up fitted on Gender.
le = encoders["Gender"]

# 80/20 split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ================= Random Forest =================
# Build and fit the forest in one expression (fit() returns the estimator).
rf = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Hard class predictions, plus column 1 of predict_proba as the AUC score.
y_pred_rf = rf.predict(X_test)
y_proba_rf = rf.predict_proba(X_test)[:, 1]

print("Random Forest")

rf_accuracy = accuracy_score(y_test, y_pred_rf)
print("Accuracy:", rf_accuracy)

rf_auc = roc_auc_score(y_test, y_proba_rf)
print("AUC:", rf_auc)

# Persist the fitted forest next to the notebook.
joblib.dump(rf, "model_random_forest.pkl")

# ================= XGBoost (SAFE TRY) =================
# xgboost is an optional dependency: when the module cannot be found the
# cell prints a notice instead of failing.
try:
    from xgboost import XGBClassifier

    xgb_settings = {
        "n_estimators": 100,
        "learning_rate": 0.1,
        "max_depth": 5,
        "eval_metric": "logloss",
        "random_state": 42,
    }
    xgb = XGBClassifier(**xgb_settings)
    xgb.fit(X_train, y_train)

    # Hard class predictions, plus column 1 of predict_proba as the AUC score.
    y_pred_xgb = xgb.predict(X_test)
    y_proba_xgb = xgb.predict_proba(X_test)[:, 1]

    print("\nXGBoost")

    xgb_accuracy = accuracy_score(y_test, y_pred_xgb)
    print("Accuracy:", xgb_accuracy)

    xgb_auc = roc_auc_score(y_test, y_proba_xgb)
    print("AUC:", xgb_auc)

    # Persist the fitted booster next to the notebook.
    joblib.dump(xgb, "model_xgboost.pkl")

except ModuleNotFoundError:
    print("\nXGBoost NOT available in this environment.")
    print("Model trained locally / to be reported in README.")

Random Forest
Accuracy: 0.864
AUC: 0.846416524382626

XGBoost
Accuracy: 0.8645
AUC: 0.8628320153743883


In [5]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    precision_score,
    recall_score,
    f1_score,
    matthews_corrcoef
)

In [7]:
df = pd.read_csv(r"/home/cloud/Downloads/ML Assignment 2/Churn_Modelling.csv.csv")

# Drop unnecessary columns. Rebinding `df` (instead of inplace=True) leaves
# `df` in the same final state.
df = df.drop(columns=["RowNumber", "CustomerId", "Surname"])

# Features & target
X = df.drop("Exited", axis=1)
y = df["Exited"]

# Encode categorical variables.
# Each column gets its own LabelEncoder: refitting a single shared instance
# on Gender would overwrite the Geography mapping, leaving no fitted object
# that can encode new Geography values or inverse-transform them later.
# The encoded values written into X are the same as before.
# NOTE(review): scikit-learn documents LabelEncoder for target labels; the
# integer codes also impose an order on a nominal feature, which matters for
# the distance/linear models trained on X below. Switching to one-hot
# encoding would change the reported metrics, so it is left as-is.
encoders = {}
for column in ("Geography", "Gender"):
    encoders[column] = LabelEncoder()
    X[column] = encoders[column].fit_transform(X[column])

# Backward-compatible alias: previously `le` ended up fitted on Gender.
le = encoders["Gender"]

# Train-test split: 80/20, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scaling (for LR & KNN): statistics are fitted on the training split only
# and then applied to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [9]:
def evaluate_model(model, X_test, y_test, scaled=False):
    """Score a fitted classifier on a held-out set.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict`` and ``predict_proba``.
    X_test
        Features passed unchanged to the model. No scaling happens here, so
        the caller must supply the same representation (scaled or raw) that
        the model was trained on.
    y_test
        True labels aligned with ``X_test``.
    scaled : bool, default False
        Ignored. The original implementation ran identical code for both
        values; the parameter is kept only so existing calls keep working.

    Returns
    -------
    dict
        Maps ``"Accuracy"``, ``"AUC"``, ``"Precision"``, ``"Recall"``,
        ``"F1"`` and ``"MCC"`` to the corresponding metric value.
    """
    y_pred = model.predict(X_test)
    # Column 1 of predict_proba is the score handed to roc_auc_score.
    y_proba = model.predict_proba(X_test)[:, 1]

    return {
        "Accuracy": accuracy_score(y_test, y_pred),
        "AUC": roc_auc_score(y_test, y_proba),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1": f1_score(y_test, y_pred),
        "MCC": matthews_corrcoef(y_test, y_pred)
    }

In [11]:
# Metrics per model, keyed by display name; each value is the dict returned
# by evaluate_model.
results = {}

# 1. Logistic Regression (trained and evaluated on the standardized features)
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train_scaled, y_train)
results["Logistic Regression"] = evaluate_model(lr, X_test_scaled, y_test)

# 2. Decision Tree (raw features)
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
results["Decision Tree"] = evaluate_model(dt, X_test, y_test)

# 3. KNN (standardized features)
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_scaled, y_train)
results["KNN"] = evaluate_model(knn, X_test_scaled, y_test)

# 4. Naive Bayes (standardized features)
nb = GaussianNB()
nb.fit(X_train_scaled, y_train)
results["Naive Bayes"] = evaluate_model(nb, X_test_scaled, y_test)

# 5. Random Forest (raw features)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
results["Random Forest"] = evaluate_model(rf, X_test, y_test)

# 6. XGBoost (only if available)
# Only the import is guarded. A bare `except:` around the whole block would
# also swallow genuine training/evaluation errors (and KeyboardInterrupt)
# and misreport them as "not available".
try:
    from xgboost import XGBClassifier
except ImportError:
    print("XGBoost not available in this environment")
else:
    xgb = XGBClassifier(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=5,
        eval_metric="logloss",
        random_state=42
    )
    xgb.fit(X_train, y_train)
    results["XGBoost"] = evaluate_model(xgb, X_test, y_test)

In [6]:
# pd.DataFrame(results) has one column per model; transposing gives one row
# per model and one column per metric.
metrics_by_model = pd.DataFrame(results)
comparison_df = metrics_by_model.transpose()
comparison_df

Unnamed: 0,Accuracy,AUC,Precision,Recall,F1,MCC
Logistic Regression,0.805,0.771044,0.585859,0.142506,0.229249,0.216732
Decision Tree,0.7765,0.664883,0.453271,0.476658,0.464671,0.323715
KNN,0.835,0.772447,0.662447,0.385749,0.487578,0.417961
Naive Bayes,0.829,0.814619,0.755906,0.235872,0.359551,0.357286
Random Forest,0.864,0.846417,0.782427,0.459459,0.578947,0.529746
XGBoost,0.8645,0.862832,0.776423,0.469287,0.584992,0.532934


In [13]:
# Make sure the output folder exists before anything is written into it.
os.makedirs("model", exist_ok=True)

# Pickle the fitted KNN model and the StandardScaler it was trained with.
# The scaler is saved alongside the model because KNN was fitted on
# standardized features.
artifacts = (
    ("model/model.pkl", knn),
    ("model/scaler.pkl", scaler),
)
for target_path, fitted_object in artifacts:
    with open(target_path, "wb") as handle:
        pickle.dump(fitted_object, handle)