In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import f1_score, confusion_matrix, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Read the raw dataset; the path is resolved relative to the notebook's working directory.
csv_path = "../data/Churn_Modelling.csv"
df = pd.read_csv(csv_path)

# Sanity check: (rows, columns) together with the first five records.
(df.shape, df.head())

In [None]:
# Number of missing values in each column (summing the null mask down the rows).
df.isnull().sum(axis=0)

In [None]:
# Relative frequency of each target class (fractions that sum to 1).
exited = df["Exited"]
exited.value_counts(normalize=True)

In [None]:
# All encoding happens on a copy so the frame loaded from disk stays untouched.
data = df.copy()

# Integer-encode the categorical columns and keep every fitted encoder, keyed
# by column name, so the original category labels can be recovered.
# NOTE(review): LabelEncoder is documented for target labels; on a feature with
# more than two categories the integer codes imply an order - confirm this is
# acceptable for the linear and distance-based models.
label_encoders = {}
for col in ("Geography", "Gender"):
    label_encoders[col] = LabelEncoder()
    data[col] = label_encoders[col].fit_transform(data[col])

# Target vector, and the feature matrix without identifier columns or the target.
y = data["Exited"]
non_feature_cols = ["RowNumber", "CustomerId", "Surname", "Exited"]
X = data.drop(columns=non_feature_cols)

# Rescale every feature to the [0, 1] range.
# NOTE(review): the scaler is fitted on the full dataset here, i.e. before any
# train/validation/test split, so held-out rows influence the scaling.
scaler = MinMaxScaler().fit(X)
X_scaled = pd.DataFrame(scaler.transform(X), columns=X.columns)

## Train / Validation / Test Split

Stratified splits keep the class proportions of the target (`Exited`) the same in the training (80%), validation (10%), and test (10%) sets.


In [None]:
# Split the *unscaled* features first: 80% train, then the remaining 20% is
# halved into validation and test. Stratifying on the target keeps the class
# proportions the same in every split.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

# Fit the scaler on the training split only and apply it to all three splits.
# Fitting on the full dataset would leak the validation/test minima and maxima
# into training and make the held-out scores optimistic. As a consequence,
# validation/test values may fall slightly outside [0, 1]; that is expected.
scaler = MinMaxScaler().fit(X_train)
X_train, X_val, X_test = (
    # Keep column names and the original row index so features stay aligned with y.
    pd.DataFrame(scaler.transform(part), columns=part.columns, index=part.index)
    for part in (X_train, X_val, X_test)
)

X_train.shape, X_val.shape, X_test.shape

## Model Comparison

Several baseline and tree-based classifiers are trained on the training set and
compared on the validation set. F1-score is used as the metric because the
target classes are imbalanced.


In [None]:
# Candidate classifiers, keyed by the display name used in the results table.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "SVM": SVC(),
    "KNN": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
}

# Fit every candidate on the training split and record its F1-score on the
# validation split as a (name, score) pair.
results = []
for name, model in models.items():
    preds = model.fit(X_train, y_train).predict(X_val)
    score = f1_score(y_val, preds)
    results.append((name, score))

# Tabulate the scores, best model first.
results_df = pd.DataFrame(data=results, columns=["Model", "F1 Score"])
results_df.sort_values(by="F1 Score", ascending=False)

In [None]:
# Select the winner of the validation comparison instead of hard-coding it, so
# the model scored on the test set always matches the results table.
# idxmax returns the first row label with the highest validation F1-score.
best_name = results_df.loc[results_df["F1 Score"].idxmax(), "Model"]

# The estimator was already fitted on the training split in the comparison
# loop, so it is reused as-is rather than refitted.
best_model = models[best_name]

# The test split is used only here, for a single final evaluation.
test_preds = best_model.predict(X_test)

print(f"Selected model: {best_name}")
f1_score(y_test, test_preds)

In [None]:
# Confusion matrix on the test split (rows: true class, columns: predicted class).
confusion_matrix(y_true=y_test, y_pred=test_preds)

In [None]:
# Per-class precision, recall, and F1 on the test split, printed as plain text.
report = classification_report(y_test, test_preds)
print(report)