# Bank Marketing Campaign - Model Training
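This notebook trains a scikit-learn pipeline (preprocessing plus a random-forest classifier) on the Bank Marketing dataset, evaluates it on a held-out, stratified 20% test split, and uses joblib to save the fitted pipeline together with its feature lists and target column name to `models/pipeline.joblib`.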

In [1]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
import joblib
import os


In [4]:

# Input CSV that load_data() reads by default.
DATA_PATH = "data/bank-full.csv"   # Update if needed
# Output file that train_and_save() writes with joblib.dump; its parent
# directory is created if it does not exist yet.
MODEL_PATH = "models/pipeline.joblib"

def load_data(path=None):
    """Load the campaign CSV, accepting either ';' or ',' as the delimiter.

    Parameters
    ----------
    path : str or path-like, optional
        CSV file to read. When omitted, the module-level ``DATA_PATH`` is
        looked up at call time, so reassigning ``DATA_PATH`` in a later cell
        takes effect without re-running this definition.

    Returns
    -------
    pandas.DataFrame
        The file parsed with ';' as the separator, or with ',' when the ';'
        parse fails to tokenize or yields only a single column.

    Raises
    ------
    Exception
        Anything ``pd.read_csv`` raises other than a ';' tokenizing error
        (for example ``FileNotFoundError``) propagates unchanged.
    """
    if path is None:
        path = DATA_PATH

    try:
        df = pd.read_csv(path, sep=';')
    except pd.errors.ParserError:
        # Only a tokenizing failure is evidence of a wrong delimiter. Errors
        # such as a missing file do not depend on the separator, so retrying
        # them would only repeat the failure with a chained traceback.
        return pd.read_csv(path, sep=',')

    # A one-column result suggests ';' was not the delimiter; retry with ','.
    if len(df.columns) == 1:
        df = pd.read_csv(path, sep=',')
    return df

def build_pipeline(numeric_features, categorical_features):
    """Assemble the unfitted preprocessing + random-forest pipeline.

    Parameters
    ----------
    numeric_features : list of str
        Columns that are median-imputed and then standardised.
    categorical_features : list of str
        Columns whose missing values are replaced by the literal 'unknown'
        and which are then one-hot encoded into a dense array.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Two steps, 'preprocessor' (a ColumnTransformer over the two column
        groups) and 'classifier' (a 200-tree, class-balanced
        RandomForestClassifier seeded with 42).
    """
    impute_then_scale = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ])

    # handle_unknown='ignore' keeps transform() from failing on categories
    # that were not present when the encoder was fitted.
    impute_then_encode = Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='unknown')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
    ])

    column_routes = [
        ('num', impute_then_scale, numeric_features),
        ('cat', impute_then_encode, categorical_features),
    ]

    forest = RandomForestClassifier(
        n_estimators=200,
        random_state=42,
        class_weight='balanced',
    )

    return Pipeline([
        ('preprocessor', ColumnTransformer(transformers=column_routes)),
        ('classifier', forest),
    ])


def _encode_binary_target(target):
    """Convert yes/no or 1/0 labels (text or numeric) to an integer 0/1 Series."""
    label_map = {'yes': 1, 'no': 0, '1': 1, '0': 0}
    # Going through str makes the '1'/'0' keys match a column that pandas
    # parsed as integers, and strip/lower tolerate 'Yes', ' no', and so on.
    normalized = target.astype(str).str.strip().str.lower()
    encoded = normalized.map(label_map)
    unmapped = encoded.isna()
    if unmapped.any():
        bad_labels = sorted(normalized[unmapped].unique())
        raise ValueError(
            f"Unrecognised target labels {bad_labels}; "
            f"expected one of {sorted(label_map)}"
        )
    return encoded.astype(int)


def train_and_save():
    """Train the pipeline on the campaign data, report test metrics, and save it.

    Steps: load the CSV via ``load_data()``, locate and binarise the target
    column, make a stratified 80/20 split, fit the pipeline returned by
    ``build_pipeline``, print a classification report and ROC AUC for the
    test split, then write a dict with the fitted pipeline, both feature
    lists and the target column name to ``MODEL_PATH`` with joblib.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If none of the known target column names is present, if the target
        holds labels other than yes/no/1/0, or if an expected numeric column
        is missing from the data.
    """
    df = load_data()
    print("Columns:", df.columns.tolist())

    # detect target column
    possible_targets = ['y', 'deposit', 'subscribed', 'Target']
    target_col = next((c for c in possible_targets if c in df.columns), None)
    if target_col is None:
        raise ValueError(f"Target not found. Columns: {df.columns.tolist()}")

    y = _encode_binary_target(df[target_col])
    X = df.drop(columns=[target_col])

    # NOTE(review): 'duration' is presumably the length of the last call and
    # so only known after the outcome; if so it leaks the target and should
    # be dropped for a deployable model. Confirm against the dataset docs.
    numeric_features = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
    # Fail before the split and fit when the file lacks an expected column.
    missing_numeric = [c for c in numeric_features if c not in X.columns]
    if missing_numeric:
        raise ValueError(f"Expected numeric columns missing from data: {missing_numeric}")
    categorical_features = [c for c in X.columns if c not in numeric_features]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, test_size=0.2, random_state=42
    )

    pipeline = build_pipeline(numeric_features, categorical_features)
    pipeline.fit(X_train, y_train)

    preds = pipeline.predict(X_test)
    probs = pipeline.predict_proba(X_test)[:, 1]
    print("Classification report:\n", classification_report(y_test, preds))
    print("ROC AUC:", roc_auc_score(y_test, probs))

    # dirname() is '' for a bare file name, and os.makedirs('') raises, so
    # only create the directory when MODEL_PATH actually has one.
    model_dir = os.path.dirname(MODEL_PATH)
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)
    joblib.dump({
        'pipeline': pipeline,
        'numeric_features': numeric_features,
        'categorical_features': categorical_features,
        'target_col': target_col
    }, MODEL_PATH)
    print(f"Model saved to {MODEL_PATH}")


In [5]:
# Run the whole workflow: load the CSV, fit the pipeline, print the test-split
# metrics, and write the fitted model to MODEL_PATH.
train_and_save()

Columns: ['age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'Target']
Classification report:
               precision    recall  f1-score   support

           0       0.92      0.98      0.95      7985
           1       0.69      0.34      0.45      1058

    accuracy                           0.91      9043
   macro avg       0.81      0.66      0.70      9043
weighted avg       0.89      0.91      0.89      9043

ROC AUC: 0.928626630982241
Model saved to models/pipeline.joblib
