In [2]:
import pandas as pd
import joblib
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_auc_score, matthews_corrcoef
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [3]:
import pandas as pd

df = pd.read_csv("bank-additional-full.csv", sep=';')
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [4]:
df.shape

(41188, 21)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   job             41188 non-null  object 
 2   marital         41188 non-null  object 
 3   education       41188 non-null  object 
 4   default         41188 non-null  object 
 5   housing         41188 non-null  object 
 6   loan            41188 non-null  object 
 7   contact         41188 non-null  object 
 8   month           41188 non-null  object 
 9   day_of_week     41188 non-null  object 
 10  duration        41188 non-null  int64  
 11  campaign        41188 non-null  int64  
 12  pdays           41188 non-null  int64  
 13  previous        41188 non-null  int64  
 14  poutcome        41188 non-null  object 
 15  emp.var.rate    41188 non-null  float64
 16  cons.price.idx  41188 non-null  float64
 17  cons.conf.idx   41188 non-null 

In [6]:
df['y'].value_counts()

y
no     36548
yes     4640
Name: count, dtype: int64

In [7]:
# Convert target variable to binary (yes=1, no=0)
df['y'] = df['y'].map({'yes': 1, 'no': 0})

df['y'].value_counts()

y
0    36548
1     4640
Name: count, dtype: int64

In [8]:
X=df.drop('y', axis=1)
y=df['y']

In [9]:
X=pd.get_dummies(X, drop_first='true')

In [10]:
X.shape

(41188, 53)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [12]:
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [13]:
# Initialize model
log_model = LogisticRegression(max_iter=1000)

# Train on SCALED data
log_model.fit(X_train_scaled, y_train)

# Predictions
y_pred_log = log_model.predict(X_test_scaled)
y_prob_log = log_model.predict_proba(X_test_scaled)[:, 1]


In [14]:
def evaluate_model(y_test, y_pred, y_prob):
    accuracy = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)
    
    return accuracy, auc, precision, recall, f1, mcc


In [15]:
log_results = evaluate_model(y_test, y_pred_log, y_prob_log)

log_results

(0.9163631949502307,
 0.9424203382235011,
 0.7100175746924429,
 0.4353448275862069,
 0.539746158984636,
 0.5146576882433802)

In [16]:
# Initialize model
dt_model = DecisionTreeClassifier(random_state=42)

# Train (NO scaling needed)
dt_model.fit(X_train, y_train)

# Predictions
y_pred_dt = dt_model.predict(X_test)
y_prob_dt = dt_model.predict_proba(X_test)[:, 1]

# Evaluate
dt_results = evaluate_model(y_test, y_pred_dt, y_prob_dt)

dt_results

(0.8928137897547949,
 0.7415675562526534,
 0.5232198142414861,
 0.5463362068965517,
 0.5345282024248814,
 0.4741350433245297)

In [17]:
# Initialize model
knn_model = KNeighborsClassifier(n_neighbors=5)

# Train on scaled data
knn_model.fit(X_train_scaled, y_train)

# Predictions
y_pred_knn = knn_model.predict(X_test_scaled)
y_prob_knn = knn_model.predict_proba(X_test_scaled)[:, 1]

# Evaluate
knn_results = evaluate_model(y_test, y_pred_knn, y_prob_knn)

knn_results


(0.9021607186210245,
 0.8322029488419265,
 0.6177606177606177,
 0.3448275862068966,
 0.4426002766251729,
 0.4138404667896235)

In [18]:
# Initialize model
nb_model = GaussianNB()

# Train on scaled data
nb_model.fit(X_train_scaled, y_train)

# Predictions
y_pred_nb = nb_model.predict(X_test_scaled)
y_prob_nb = nb_model.predict_proba(X_test_scaled)[:, 1]

# Evaluate
nb_results = evaluate_model(y_test, y_pred_nb, y_prob_nb)

nb_results


(0.7527312454479242,
 0.849331778621633,
 0.2902005297010972,
 0.8265086206896551,
 0.4295715485858303,
 0.38597821781514297)

In [19]:
# Initialize model
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42
)

# Train
rf_model.fit(X_train, y_train)

# Predictions
y_pred_rf = rf_model.predict(X_test)
y_prob_rf = rf_model.predict_proba(X_test)[:, 1]

# Evaluate
rf_results = evaluate_model(y_test, y_pred_rf, y_prob_rf)

rf_results


(0.915270696771061,
 0.9453483212651541,
 0.6701183431952663,
 0.48814655172413796,
 0.5648379052369077,
 0.5271877917582793)

In [20]:
xgb_model = XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss'
)

# Train
xgb_model.fit(X_train, y_train)

# Predictions
y_pred_xgb = xgb_model.predict(X_test)
y_prob_xgb = xgb_model.predict_proba(X_test)[:, 1]

# Evaluate
xgb_results = evaluate_model(y_test, y_pred_xgb, y_prob_xgb)

xgb_results


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


(0.9225540179655256,
 0.9545511433322327,
 0.6923076923076923,
 0.5625,
 0.6206896551724138,
 0.5819430303279682)

In [21]:
results_df = pd.DataFrame({
    "Model": [
        "Logistic Regression",
        "Decision Tree",
        "KNN",
        "Naive Bayes",
        "Random Forest",
        "XGBoost"
    ],
    "Accuracy": [
        log_results[0],
        dt_results[0],
        knn_results[0],
        nb_results[0],
        rf_results[0],
        xgb_results[0]
    ],
    "AUC": [
        log_results[1],
        dt_results[1],
        knn_results[1],
        nb_results[1],
        rf_results[1],
        xgb_results[1]
    ],
    "Precision": [
        log_results[2],
        dt_results[2],
        knn_results[2],
        nb_results[2],
        rf_results[2],
        xgb_results[2]
    ],
    "Recall": [
        log_results[3],
        dt_results[3],
        knn_results[3],
        nb_results[3],
        rf_results[3],
        xgb_results[3]
    ],
    "F1 Score": [
        log_results[4],
        dt_results[4],
        knn_results[4],
        nb_results[4],
        rf_results[4],
        xgb_results[4]
    ],
    "MCC": [
        log_results[5],
        dt_results[5],
        knn_results[5],
        nb_results[5],
        rf_results[5],
        xgb_results[5]
    ]
})

results_df


Unnamed: 0,Model,Accuracy,AUC,Precision,Recall,F1 Score,MCC
0,Logistic Regression,0.916363,0.94242,0.710018,0.435345,0.539746,0.514658
1,Decision Tree,0.892814,0.741568,0.52322,0.546336,0.534528,0.474135
2,KNN,0.902161,0.832203,0.617761,0.344828,0.4426,0.41384
3,Naive Bayes,0.752731,0.849332,0.290201,0.826509,0.429572,0.385978
4,Random Forest,0.915271,0.945348,0.670118,0.488147,0.564838,0.527188
5,XGBoost,0.922554,0.954551,0.692308,0.5625,0.62069,0.581943


In [22]:
os.makedirs("model", exist_ok=True)

joblib.dump(log_model, "model/logistic.pkl")
joblib.dump(dt_model, "model/decision_tree.pkl")
joblib.dump(knn_model, "model/knn.pkl")
joblib.dump(nb_model, "model/naive_bayes.pkl")
joblib.dump(rf_model, "model/random_forest.pkl")
joblib.dump(xgb_model, "model/xgboost.pkl")

joblib.dump(X.columns, "model/columns.pkl")


joblib.dump(scaler, "model/scaler.pkl")


['model/scaler.pkl']