In [1]:
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PowerTransformer, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, ConfusionMatrixDisplay, classification_report

import sys
!{sys.executable} -m pip install kagglehub
import kagglehub

path = kagglehub.dataset_download("janiobachmann/bank-marketing-dataset")
print("Path to dataset files:", path)

filepath = Path(path) / Path("bank.csv")
data = pd.read_csv(filepath)
data.head()

Using Colab cache for faster access to the 'bank-marketing-dataset' dataset.
Path to dataset files: /kaggle/input/bank-marketing-dataset


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
0,59,admin.,married,secondary,no,2343,yes,no,unknown,5,may,1042,1,-1,0,unknown,yes
1,56,admin.,married,secondary,no,45,no,no,unknown,5,may,1467,1,-1,0,unknown,yes
2,41,technician,married,secondary,no,1270,yes,no,unknown,5,may,1389,1,-1,0,unknown,yes
3,55,services,married,secondary,no,2476,yes,no,unknown,5,may,579,1,-1,0,unknown,yes
4,54,admin.,married,tertiary,no,184,no,no,unknown,5,may,673,2,-1,0,unknown,yes


In this bank marketing classification problem, each prediction is used to decide whether a client should be contacted for a term deposit campaign. For this activity, **Accuracy** is selected as the objective metric because it measures the overall proportion of correct predictions, including both positive and negative classifications.

Because marketing datasets can be imbalanced, accuracy alone may hide poor performance on one class. The evaluation therefore also includes a confusion matrix and a classification report to inspect class-level behavior (precision and recall) and to check that the model does not disproportionately favor one class.

In [2]:
# Name of the label column; used below for both the feature drop and the target selection
# so the two can never drift apart.
target_col = "deposit"

# Feature columns deliberately left out of the model inputs.
# NOTE(review): "duration" is presumably excluded because it is only known after the call
# has taken place; the reason for dropping "pdays" and "poutcome" is not stated — confirm.
excluded_cols = ["duration", "pdays", "poutcome"]

X = data.drop(columns=[target_col, *excluded_cols])
y = data[target_col]

# Stratified 80/20 split with a fixed seed so class proportions and results are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Column lists are derived from dtypes only (no statistics), then reused by the preprocessing step.
num_cols = X.select_dtypes(include=["number"]).columns.tolist()
cat_cols = [c for c in X.columns if c not in num_cols]

num_cols, cat_cols

(['age', 'balance', 'day', 'campaign', 'previous'],
 ['job',
  'marital',
  'education',
  'default',
  'housing',
  'loan',
  'contact',
  'month'])

## Preprocessing pipeline

Numeric features are imputed using the median and scaled. Categorical features are imputed using the most frequent value and one-hot encoded. This is done inside a pipeline to avoid data leakage.

In [3]:
# Numeric branch: fill gaps with the column median, then standardize.
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical branch: fill gaps with the most frequent level, then one-hot encode.
# Categories unseen during fit are ignored at transform time instead of raising.
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each column list to its branch; any column not listed is dropped.
column_branches = [
    ("num", numeric_transformer, num_cols),
    ("cat", categorical_transformer, cat_cols),
]
preprocess = ColumnTransformer(transformers=column_branches, remainder="drop")

## Model training with GridSearchCV (cv=5, scoring=accuracy)

Three different classification model types are trained and optimized:

1. Logistic Regression  
2. Random Forest  
3. Gradient Boosting  

Each model is tuned using GridSearchCV with 5-fold cross-validation and optimized for Accuracy.

In [None]:
def build_pipeline(estimator):
    """Return a Pipeline that runs the shared `preprocess` step followed by `estimator`.

    The final step is always named "model" so parameter grids can address it
    with the "model__<param>" prefix.
    """
    return Pipeline(steps=[
        ("preprocess", preprocess),
        ("model", estimator)
    ])


def run_grid_search(pipeline, param_grid, scoring="accuracy", cv=5):
    """Fit an exhaustive grid search on the training split and return the fitted search.

    Parameters
    ----------
    pipeline : the Pipeline to tune.
    param_grid : dict mapping "model__<param>" names to the candidate values.
    scoring : metric optimized by the search (default "accuracy").
    cv : number of cross-validation folds (default 5).
    """
    search = GridSearchCV(pipeline, param_grid, scoring=scoring, cv=cv, n_jobs=-1)
    search.fit(X_train, y_train)
    return search


# One pipeline per model family; every estimator is seeded for reproducibility.
pipe_lr = build_pipeline(LogisticRegression(max_iter=5000, random_state=42))
pipe_rf = build_pipeline(RandomForestClassifier(random_state=42))
pipe_gb = build_pipeline(GradientBoostingClassifier(random_state=42))

# Hyper-parameter candidates explored for each family.
param_grid_lr = {
    "model__C": [0.01, 0.1, 1, 10],
    "model__solver": ["liblinear", "lbfgs"],
    "model__penalty": ["l2"]
}

param_grid_rf = {
    "model__n_estimators": [200, 400],
    "model__max_depth": [None, 8, 16],
    "model__min_samples_split": [2, 5],
    "model__min_samples_leaf": [1, 2],
    "model__max_features": ["sqrt", "log2"]
}

param_grid_gb = {
    "model__n_estimators": [100, 200],
    "model__learning_rate": [0.05, 0.1],
    "model__max_depth": [2, 3],
    "model__subsample": [0.8, 1.0]
}

# Searches run in the same order as before: logistic regression, random forest, gradient boosting.
grid_lr = run_grid_search(pipe_lr, param_grid_lr)
grid_rf = run_grid_search(pipe_rf, param_grid_rf)
grid_gb = run_grid_search(pipe_gb, param_grid_gb)

# Best pipeline of each search, as exposed by GridSearchCV.best_estimator_.
best_lr = grid_lr.best_estimator_
best_rf = grid_rf.best_estimator_
best_gb = grid_gb.best_estimator_

# Best cross-validated score reached by each search, as exposed by GridSearchCV.best_score_.
best_lr_cv = grid_lr.best_score_
best_rf_cv = grid_rf.best_score_
best_gb_cv = grid_gb.best_score_

best_lr, best_lr_cv, best_rf, best_rf_cv, best_gb, best_gb_cv

## Compare the best 3 models on the test set

The comparison is performed on the held-out test set. Training accuracy is also reported to evaluate generalization. A large Train-Test gap may indicate overfitting.

In [None]:
# Each entry pairs a tuned pipeline with the cross-validated accuracy it achieved.
models = {
    "LogisticRegression": (best_lr, best_lr_cv),
    "RandomForest": (best_rf, best_rf_cv),
    "GradientBoosting": (best_gb, best_gb_cv),
}

# Build one summary record per model: CV, train and test accuracy plus the train-test gap.
rows = []
for label, (estimator, cv_score) in models.items():
    train_score = accuracy_score(y_train, estimator.predict(X_train))
    test_score = accuracy_score(y_test, estimator.predict(X_test))

    record = {}
    record["Model"] = label
    record["Best_CV_Accuracy"] = cv_score
    record["Train_Accuracy"] = train_score
    record["Test_Accuracy"] = test_score
    record["Train_Test_Gap"] = train_score - test_score
    rows.append(record)

# Rank by test accuracy, best first, and renumber so row 0 is the top model.
ranked = pd.DataFrame(rows).sort_values(by="Test_Accuracy", ascending=False)
results = ranked.reset_index(drop=True)
results

## Best model selection

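The best model is selected based on the highest Test Accuracy while also reviewing the Train-Test gap to reduce the risk of selecting an overfitted model. Because the test set is used for this selection, the reported Test Accuracy of the chosen model is slightly optimistic; the cross-validated accuracy is the less biased estimate.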

In [None]:
# `results` is sorted by Test_Accuracy in descending order, so row 0 holds the top model.
# NOTE(review): selecting on test accuracy makes that score optimistically biased;
# Best_CV_Accuracy would be the unbiased selection criterion — confirm the intended rule.
best_model_name = results.loc[0, "Model"]

# `models` maps name -> (fitted pipeline, CV accuracy); index it directly instead of
# building a temporary name -> pipeline dict just to look up one key.
best_model = models[best_model_name][0]

best_model_name

## Confusion matrix (best model)

In [None]:
# Test-set predictions of the selected model.
y_pred_best = best_model.predict(X_test)

# Draw on an explicit Axes so the figure can carry a title and stand alone when skimmed.
fig, ax = plt.subplots()
ConfusionMatrixDisplay.from_predictions(y_test, y_pred_best, ax=ax)
ax.set_title(f"Confusion matrix on the test set: {best_model_name}")
plt.show()  # render the figure without echoing the display object's repr

## Classification report (best model)

In [None]:
# Class-level metrics for the selected model on the test set.
# classification_report returns plain text, so it is printed rather than displayed.
report_text = classification_report(y_test, y_pred_best)
print(report_text)

## General conclusions

Three different classification algorithms were optimized using GridSearchCV with 5-fold cross-validation, using Accuracy as the objective metric. The final comparison was performed on the held-out test set, reporting both Train and Test accuracy to evaluate generalization. The best model was selected based on Test Accuracy while also considering the Train-Test gap as an indicator of possible overfitting. The confusion matrix and classification report provided additional detail on class-level performance beyond the single objective metric. From a business perspective, improved accuracy supports more reliable client selection for marketing calls, which can reduce unnecessary contacts and improve overall campaign efficiency.