In [2]:
import pandas as pd
import numpy as np

# Modeling pieces
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [3]:
# 1) Read bank_cleaned.csv into a DataFrame and preview the first five rows
df = pd.read_csv("bank_cleaned.csv")
print("First 5 rows:", df.head(), sep="\n")

First 5 rows:
   Unnamed: 0  age           job  marital  education default  balance housing  \
0           0   58    management  married   tertiary      no     2143     yes   
1           1   44    technician   single  secondary      no       29     yes   
2           2   33  entrepreneur  married  secondary      no        2     yes   
3           5   35    management  married   tertiary      no      231     yes   
4           6   28    management   single   tertiary      no      447     yes   

  loan  day month  duration  campaign  pdays  previous poutcome response  \
0   no    5   may      4.35         1     -1         0  unknown       no   
1   no    5   may      2.52         1     -1         0  unknown       no   
2  yes    5   may      1.27         1     -1         0  unknown       no   
3   no    5   may      2.32         1     -1         0  unknown       no   
4  yes    5   may      3.62         1     -1         0  unknown       no   

   response_binary  
0                0  


In [4]:
# 2) Derive the churn target
# response_binary: 1 = subscribed, 0 = did not subscribe.
# churn is its complement: 1 when the customer did not subscribe, else 0.
# The comparison is vectorized instead of a per-row lambda; any value other
# than 0 (including NaN) still maps to 0, and the result is int64 as before.
df["churn"] = df["response_binary"].eq(0).astype("int64")

In [5]:
# 3) Split the frame into predictors (X) and target (y)
# Removed from X: the leftover "Unnamed: 0" column, both encodings of the
# campaign outcome, and the churn target derived from them. "duration" is
# also removed when present because it is only known after the call.
cols_to_drop = ["Unnamed: 0", "response", "response_binary", "churn"] + (
    ["duration"] if "duration" in df.columns else []
)

# errors="ignore" skips any listed column that is absent from df.
X = df.drop(columns=cols_to_drop, errors="ignore").copy()
y = df["churn"].copy()


In [6]:
# 4) Expand every categorical column into indicator columns.
# drop_first=True keeps k-1 indicators for a column with k levels; the
# dropped first level acts as the baseline.
X = pd.get_dummies(data=X, drop_first=True)

In [7]:
# 5) Work on a random sample of at most 2,000 rows to keep runtimes short
N = min(len(X), 2000)
X_small = X.sample(n=N, random_state=42)  # fixed seed -> same rows on every run
y_small = y.loc[X_small.index]            # pick the matching targets by index label

In [8]:
# 6) Scale the sampled features before clustering
# with_mean=False skips centering, so each column is only divided by its
# standard deviation.
# NOTE(review): pd.get_dummies returns a dense frame unless sparse=True is
# passed, so centering would also be possible here — confirm whether it is wanted.
scaler = StandardScaler(with_mean=False)
scaler.fit(X_small)
X_small_scaled = scaler.transform(X_small)

In [15]:
# 7) K-Means with k=4 on the scaled sample
kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)
clusters = kmeans.fit(X_small_scaled).labels_  # cluster id assigned to each sampled row

# Per-cluster churn rate (mean of the 0/1 target) and cluster size
tmp = pd.DataFrame({"cluster": clusters, "churn": y_small.to_numpy()})
cluster_summary = (
    tmp.groupby("cluster")["churn"]
    .agg(churn_rate="mean", num_customers="count")
    .reset_index()
)

print("\nCluster summary (k=4 on sample):", cluster_summary, sep="\n")


Cluster summary (k=4 on sample):
   cluster  churn_rate  num_customers
0        0    0.895604            546
1        1    0.925725           1104
2        2    0.758621             87
3        3    0.756654            263


In [12]:
# 8) Hold out 30% of the sample for evaluating the logistic regression
X_train, X_test, y_train, y_test = train_test_split(
    X_small,
    y_small,
    test_size=0.3,
    stratify=y_small,  # keep the churn ratio the same in both splits
    random_state=42,
)

# Fit the scaler on the training rows only, then apply it to both splits
scaler_lr = StandardScaler(with_mean=False).fit(X_train)
X_train_scaled = scaler_lr.transform(X_train)
X_test_scaled = scaler_lr.transform(X_test)

In [13]:
# 9) Fit a logistic regression and score it with ROC AUC on the held-out rows
logit = LogisticRegression(solver="lbfgs", max_iter=1000)
logit.fit(X_train_scaled, y_train)

# Column 1 of predict_proba is the probability of the second class in
# logit.classes_, i.e. churn == 1 for this 0/1 target.
y_pred_proba = logit.predict_proba(X_test_scaled)[:, 1]

auc = roc_auc_score(y_test, y_pred_proba)
print(f"\nAUC: {auc:.3f}")


AUC: 0.737


In [14]:
# 10) Rank features by the magnitude of their logistic-regression coefficient
# logit.coef_[0] is paired with the column names of X_small, the frame the
# training split was taken from.
coefs = pd.DataFrame(
    {"Feature": X_small.columns, "Coefficient": logit.coef_[0]}
).assign(AbsCoefficient=lambda frame: frame["Coefficient"].abs())
top10 = coefs.sort_values("AbsCoefficient", ascending=False).head(10)

print("\nTop 10 features by absolute coefficient:")
print(top10)


Top 10 features by absolute coefficient:
                Feature  Coefficient  AbsCoefficient
21          default_yes     0.440630        0.440630
35     poutcome_success    -0.355191        0.355191
7      job_entrepreneur     0.352792        0.352792
3              campaign     0.331374        0.331374
19  education_secondary     0.318862        0.318862
31            month_may     0.276964        0.276964
10            job_other     0.268087        0.268087
32            month_nov     0.267068        0.267068
33            month_oct    -0.255655        0.255655
9        job_management     0.174238        0.174238
