BANK MARKETING CAMPAIGN - PICKLE NOT SAVED YET !!!!

TARGET : Deposit

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Load the raw campaign data from a CSV in the working directory and display it.
df = pd.read_csv("data_bank_marketing_campaign.csv")
df

Unnamed: 0,age,job,balance,housing,loan,contact,month,campaign,pdays,poutcome,deposit
0,55,admin.,1662,no,no,cellular,jun,2,-1,unknown,yes
1,39,self-employed,-3058,yes,yes,cellular,apr,3,-1,unknown,yes
2,51,admin.,3025,no,no,cellular,may,1,352,other,yes
3,38,services,-87,yes,no,cellular,may,1,-1,unknown,no
4,36,housemaid,205,yes,no,telephone,nov,4,-1,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...
7808,62,housemaid,2,no,no,cellular,nov,1,-1,unknown,yes
7809,48,housemaid,5473,no,no,cellular,feb,2,184,success,yes
7810,36,unknown,179,no,no,cellular,aug,8,-1,unknown,no
7811,31,admin.,54,yes,no,cellular,nov,1,-1,unknown,no


EDA

EDA Result :

Missing Values
<br>
    1. Job = unknown (54) -> fill with mode
<br>
<br>
Drop columns 
<br>
    1. Contact 
    <br>
    2. Month 
    <br>
    3. pdays : too many unknown values
    <br>
    4. poutcome : too many unknown values
    <br>
    <br>
Deposit 
<br>
    1. yes = 3732
    <br>
    2. no = 4081
    <br>
    slightly imbalanced

In [3]:
# Work on a copy so the raw frame `df` is not modified by the cleaning steps below.
df_copy = df.copy()

- Dropping Columns

In [4]:
# Remove the four columns listed under "Drop columns" in the EDA notes,
# then display the reduced frame.
dropped_columns = ['contact', 'month', 'pdays', 'poutcome']
df_copy = df_copy.drop(columns=dropped_columns)
df_copy

Unnamed: 0,age,job,balance,housing,loan,campaign,deposit
0,55,admin.,1662,no,no,2,yes
1,39,self-employed,-3058,yes,yes,3,yes
2,51,admin.,3025,no,no,1,yes
3,38,services,-87,yes,no,1,no
4,36,housemaid,205,yes,no,4,no
...,...,...,...,...,...,...,...
7808,62,housemaid,2,no,no,1,yes
7809,48,housemaid,5473,no,no,2,yes
7810,36,unknown,179,no,no,8,no
7811,31,admin.,54,yes,no,1,no


FEATURE ENGINEERING

- Dealing with missing values in 'job'

In [5]:
# Mark the 'unknown' placeholder in `job` as missing (NaN) so it can be imputed.
# A vectorised mask replaces the per-row lambda, and `np.nan` is used because
# the `np.NaN` alias was removed in NumPy 2.0.
df_copy['job'] = df['job'].mask(df['job'].eq('unknown'), np.nan)

In [6]:
# Inspect the distinct job labels; 'unknown' should now appear as nan.
df_copy['job'].unique()

array(['admin.', 'self-employed', 'services', 'housemaid', 'technician',
       'management', 'student', 'blue-collar', 'entrepreneur', 'retired',
       'unemployed', nan], dtype=object)

In [7]:
from sklearn.impute import SimpleImputer

# Fill the missing job entries with the most frequent job label.
# NOTE(review): the imputer is fit on the whole dataset here, before the
# train/test split performed later in the notebook - confirm this is acceptable.
job_impute = SimpleImputer(strategy='most_frequent')
job_filled = job_impute.fit_transform(df_copy[['job']])
df_copy['job'] = job_filled.ravel()

In [8]:
# Confirm that no nan remains among the job labels after imputation.
df_copy['job'].unique()

array(['admin.', 'self-employed', 'services', 'housemaid', 'technician',
       'management', 'student', 'blue-collar', 'entrepreneur', 'retired',
       'unemployed'], dtype=object)

In [9]:
# Encode the target as integers: 'no' -> 0, 'yes' -> 1.
deposit_codes = {'no': 0, 'yes': 1}
df_copy['deposit'] = df_copy['deposit'].map(deposit_codes)

In [10]:
# Check that the target now holds only the two integer codes.
df_copy['deposit'].unique()

array([1, 0])

- Encoding

One Hot Encoding : Housing, Loan
<br>
Binary Encoding : Job
<br>
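Scaling : none in this first transformer; RobustScaler is applied to balance, age and campaign later, for the scale-sensitive models
<br>
No treatment (passed through) : balance, campaign, age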

In [11]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder
from category_encoders import BinaryEncoder
from sklearn.compose import ColumnTransformer

In [12]:
# Columns that are one-hot encoded.
one_hot_columns = ['housing', 'loan']

# Single-step pipeline; sparse_output=False makes the encoder return a dense array.
one_hot_encoder = OneHotEncoder(sparse_output=False)
one_hot_pipeline = Pipeline(steps=[('encode', one_hot_encoder)])

In [13]:
# Column that is binary encoded.
binary_columns = ['job']

# Single-step pipeline wrapping the category_encoders BinaryEncoder with its defaults.
binary_encoder = BinaryEncoder()
binary_pipeline = Pipeline(steps=[('encode', binary_encoder)])

In [14]:
# Apply each encoder pipeline to its own columns; every other column is
# passed through unchanged (remainder='passthrough').
transformer = ColumnTransformer(
    transformers=[
        ('one_hot_preprocessing', one_hot_pipeline, one_hot_columns),
        ('binary_preprocessing', binary_pipeline, binary_columns),
    ],
    remainder='passthrough',
)

- Data Splitting

In [15]:
# Separate the target column from the feature matrix.
y = df_copy['deposit']
X = df_copy.drop(columns=['deposit'])

In [16]:
from sklearn.model_selection import train_test_split

# 80/20 split, stratified on the target, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=2020,
)

In [17]:
# Fit the encoders on the training split only, then apply the fitted
# transformer to the test split.
X_train_transformed = transformer.fit_transform(X_train)
X_test_transformed = transformer.transform(X_test)

In [18]:
# Wrap the transformed arrays back into DataFrames, labelled with the
# transformer's output feature names and the original row indices.
feature_names = transformer.get_feature_names_out()

X_train_transformed = pd.DataFrame(
    data=X_train_transformed,
    index=X_train.index,
    columns=feature_names,
)
X_test_transformed = pd.DataFrame(
    data=X_test_transformed,
    index=X_test.index,
    columns=feature_names,
)

ML MODEL

In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

In [20]:
# Baseline tree-based classifiers with default hyper-parameters and a shared seed.
# Each model is constructed first and then fitted in place on the encoded training data.
DTmodel = DecisionTreeClassifier(random_state=2020)
DTmodel.fit(X_train_transformed, y_train)

RFCmodel = RandomForestClassifier(random_state=2020)
RFCmodel.fit(X_train_transformed, y_train)

GBCmodel = GradientBoostingClassifier(random_state=2020)
GBCmodel.fit(X_train_transformed, y_train)

XGBmodel = XGBClassifier(random_state=2020)
XGBmodel.fit(X_train_transformed, y_train)

In [21]:
from sklearn.metrics import classification_report

# Print each baseline model followed by its classification report on the test split.
estimators = [DTmodel, RFCmodel, GBCmodel, XGBmodel]

for model in estimators:
    y_pred = model.predict(X_test_transformed)
    print(model, classification_report(y_test, y_pred), sep="\n")

DecisionTreeClassifier(random_state=2020)
              precision    recall  f1-score   support

           0       0.61      0.59      0.60       816
           1       0.56      0.58      0.57       747

    accuracy                           0.59      1563
   macro avg       0.59      0.59      0.59      1563
weighted avg       0.59      0.59      0.59      1563

RandomForestClassifier(random_state=2020)
              precision    recall  f1-score   support

           0       0.64      0.66      0.65       816
           1       0.62      0.59      0.61       747

    accuracy                           0.63      1563
   macro avg       0.63      0.63      0.63      1563
weighted avg       0.63      0.63      0.63      1563

GradientBoostingClassifier(random_state=2020)
              precision    recall  f1-score   support

           0       0.65      0.76      0.70       816
           1       0.67      0.55      0.60       747

    accuracy                           0.66      156

MODEL IMPROVEMENT

In [22]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

# Five stratified folds, shared by every search below.
skf = StratifiedKFold(n_splits=5)

# Hyper-parameter grids, keyed by the display names used in `estimators`.
params = {
    'Decision Tree': dict(
        criterion=['gini', 'entropy'],
        max_depth=[5, 10, 15, 20],
        min_samples_split=[2, 5, 10, 20],
        min_samples_leaf=[1, 3, 5, 10],
    ),
    'Random Forest': dict(
        n_estimators=[300, 500, 700],
        max_depth=[5, 7, 10],
        min_samples_split=[2, 5, 10],
        min_samples_leaf=[1, 3, 5],
    ),
    'Gradient Boost': dict(
        learning_rate=[0.01, 0.03, 0.05, 0.1],
        n_estimators=[500, 700, 1000],
        max_depth=[3, 5, 7],
        subsample=[0.7, 0.8, 1.0],
        min_samples_split=[2, 5, 10],
        min_samples_leaf=[1, 3, 5],
    ),
    'XGBoost': dict(
        learning_rate=[0.01, 0.03, 0.05],
        n_estimators=[500, 700, 1000],
        max_depth=[3, 5, 7],
        subsample=[0.8, 1.0],
        colsample_bytree=[0.7, 0.9, 1.0],
        gamma=[0, 0.1, 0.3],
    ),
}

estimators = [
    ('Decision Tree', DTmodel),
    ('Random Forest', RFCmodel),
    ('Gradient Boost', GBCmodel),
    ('XGBoost', XGBmodel),
]

# One (name, pipeline) pair per model. Each pipeline holds a single
# GridSearchCV step, later looked up under the name 'gridsearchcv'.
pipelines = []

for name, model in estimators:
    search = GridSearchCV(
        estimator=model,
        param_grid=params[name],
        scoring='f1',
        cv=skf,
        n_jobs=-1,
        verbose=True,
    )
    pipelines.append((name, make_pipeline(search)))

In [23]:
# Run every grid search on the encoded training data and keep the best
# estimator found for each model name.
best_models = {}

for name, pipe in pipelines:
    print(f"Training {name}...")
    pipe.fit(X_train_transformed, y_train)

    search_step = pipe.named_steps['gridsearchcv']
    best_model = search_step.best_estimator_
    best_models[name] = best_model

    print(f"Best parameters for {name}: {search_step.best_params_}")

Training Decision Tree...
Fitting 5 folds for each of 128 candidates, totalling 640 fits
Best parameters for Decision Tree: {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 10, 'min_samples_split': 2}
Training Random Forest...
Fitting 5 folds for each of 81 candidates, totalling 405 fits
Best parameters for Random Forest: {'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 300}
Training Gradient Boost...
Fitting 5 folds for each of 972 candidates, totalling 4860 fits
Best parameters for Gradient Boost: {'learning_rate': 0.01, 'max_depth': 7, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 700, 'subsample': 0.7}
Training XGBoost...
Fitting 5 folds for each of 486 candidates, totalling 2430 fits
Best parameters for XGBoost: {'colsample_bytree': 1.0, 'gamma': 0.3, 'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 1000, 'subsample': 0.8}


In [22]:
# Tree-based models rebuilt with the hyper-parameters printed by the grid search.
# NOTE(review): class_weight='balanced' is added to the decision tree and the
# random forest although it was not part of the searched grids - confirm intent.
dt_best = dict(criterion='gini', max_depth=10, min_samples_leaf=10, min_samples_split=2)
DTmodel2 = DecisionTreeClassifier(class_weight='balanced', random_state=2020, **dt_best)
DTmodel2.fit(X_train_transformed, y_train)

rf_best = dict(max_depth=5, min_samples_leaf=1, min_samples_split=5, n_estimators=300)
RFCmodel2 = RandomForestClassifier(class_weight='balanced', random_state=2020, **rf_best)
RFCmodel2.fit(X_train_transformed, y_train)

gb_best = dict(
    learning_rate=0.01,
    max_depth=7,
    min_samples_leaf=1,
    min_samples_split=5,
    n_estimators=700,
    subsample=0.7,
)
GBCmodel2 = GradientBoostingClassifier(random_state=2020, **gb_best)
GBCmodel2.fit(X_train_transformed, y_train)

xgb_best = dict(
    colsample_bytree=1.0,
    gamma=0.3,
    learning_rate=0.01,
    max_depth=7,
    n_estimators=1000,
    subsample=0.8,
)
XGBmodel2 = XGBClassifier(random_state=2020, **xgb_best)
XGBmodel2.fit(X_train_transformed, y_train)

In [23]:
# Print each tuned tree-based model followed by its test-split classification report.
estimators = [DTmodel2, RFCmodel2, GBCmodel2, XGBmodel2]

for model in estimators:
    y_pred = model.predict(X_test_transformed)
    print(model, classification_report(y_test, y_pred), sep="\n")

DecisionTreeClassifier(class_weight='balanced', max_depth=10,
                       min_samples_leaf=10, random_state=2020)
              precision    recall  f1-score   support

           0       0.62      0.68      0.65       816
           1       0.61      0.54      0.57       747

    accuracy                           0.61      1563
   macro avg       0.61      0.61      0.61      1563
weighted avg       0.61      0.61      0.61      1563

RandomForestClassifier(class_weight='balanced', max_depth=5,
                       min_samples_split=5, n_estimators=300,
                       random_state=2020)
              precision    recall  f1-score   support

           0       0.64      0.73      0.68       816
           1       0.65      0.56      0.60       747

    accuracy                           0.65      1563
   macro avg       0.65      0.64      0.64      1563
weighted avg       0.65      0.65      0.64      1563

GradientBoostingClassifier(learning_rate=0.01, max_depth

- Trying other Models

In [24]:
from sklearn.preprocessing import RobustScaler

# Numeric columns that are rescaled for the scale-sensitive models.
scale_columns = ['balance', 'age', 'campaign']

# Single-step pipeline wrapping a RobustScaler with its defaults.
robust_scaler = RobustScaler()
scale_pipeline = Pipeline(steps=[('scale', robust_scaler)])

In [25]:
# Same encoders as the first transformer, plus robust scaling of the numeric columns.
transformer2 = ColumnTransformer(
    transformers=[
        ('one_hot_preprocessing', one_hot_pipeline, one_hot_columns),
        ('binary_preprocessing', binary_pipeline, binary_columns),
        ('RobustScaler', scale_pipeline, scale_columns),
    ],
    remainder='passthrough',
)

In [26]:
# Fit the encoding + scaling transformer on the training split only, then
# apply the fitted transformer to the test split.
X_train_scaled = transformer2.fit_transform(X_train)
X_test_scaled = transformer2.transform(X_test)

In [27]:
# Wrap the scaled arrays back into DataFrames, labelled with transformer2's
# output feature names and the original row indices.
feature_names = transformer2.get_feature_names_out()

X_train_scaled = pd.DataFrame(
    data=X_train_scaled,
    index=X_train.index,
    columns=feature_names,
)
X_test_scaled = pd.DataFrame(
    data=X_test_scaled,
    index=X_test.index,
    columns=feature_names,
)

In [28]:
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier

# Baseline models trained on the scaled features, with default hyper-parameters.
LRmodel = LogisticRegression(random_state=2020)
LRmodel.fit(X_train_scaled, y_train)

MLPmodel = MLPClassifier(random_state=2020)
MLPmodel.fit(X_train_scaled, y_train)

KNNmodel = KNeighborsClassifier()
KNNmodel.fit(X_train_scaled, y_train)



In [29]:
# Print each baseline model followed by its classification report on the scaled test split.
estimators = [LRmodel, MLPmodel, KNNmodel]

for model in estimators:
    y_pred = model.predict(X_test_scaled)
    print(model, classification_report(y_test, y_pred), sep="\n")

LogisticRegression(random_state=2020)
              precision    recall  f1-score   support

           0       0.65      0.68      0.66       816
           1       0.63      0.60      0.61       747

    accuracy                           0.64      1563
   macro avg       0.64      0.64      0.64      1563
weighted avg       0.64      0.64      0.64      1563

MLPClassifier(random_state=2020)
              precision    recall  f1-score   support

           0       0.65      0.75      0.70       816
           1       0.67      0.56      0.61       747

    accuracy                           0.66      1563
   macro avg       0.66      0.65      0.65      1563
weighted avg       0.66      0.66      0.65      1563

KNeighborsClassifier()
              precision    recall  f1-score   support

           0       0.60      0.62      0.61       816
           1       0.57      0.55      0.56       747

    accuracy                           0.58      1563
   macro avg       0.58      0.58 

In [29]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

# Five stratified folds, shared by every search below.
skf = StratifiedKFold(n_splits=5)

# Hyper-parameter grids, keyed by the display names used in `estimators`.
params = {
    'KNN': dict(
        n_neighbors=[3, 5, 7, 9, 11],
        weights=['uniform', 'distance'],
        metric=['euclidean', 'manhattan', 'minkowski'],
    ),
    'Logistic Regression': dict(
        C=[0.01, 0.1, 1, 10, 100],
        penalty=['l1', 'l2'],
        solver=['liblinear', 'saga'],
    ),
    'MLPC': dict(
        hidden_layer_sizes=[(50,), (100,), (50, 50), (100, 50)],
        activation=['relu', 'tanh'],
        alpha=[0.0001, 0.001, 0.01],
        solver=['adam', 'sgd'],
        learning_rate=['constant', 'adaptive'],
    ),
}

estimators = [
    ('KNN', KNNmodel),
    ('Logistic Regression', LRmodel),
    ('MLPC', MLPmodel),
]

# One (name, pipeline) pair per model. Each pipeline holds a single
# GridSearchCV step, later looked up under the name 'gridsearchcv'.
pipelines = []

for name, model in estimators:
    search = GridSearchCV(
        estimator=model,
        param_grid=params[name],
        scoring='f1',
        cv=skf,
        n_jobs=-1,
        verbose=True,
    )
    pipelines.append((name, make_pipeline(search)))

In [33]:
# Run every grid search on the scaled training data and keep the best
# estimator found for each model name.
best_models = {}

for name, pipe in pipelines:
    print(f"Training {name}...")
    pipe.fit(X_train_scaled, y_train)

    search_step = pipe.named_steps['gridsearchcv']
    best_model = search_step.best_estimator_
    best_models[name] = best_model

    print(f"Best parameters for {name}: {search_step.best_params_}")

Training KNN...
Fitting 5 folds for each of 30 candidates, totalling 150 fits


Best parameters for KNN: {'metric': 'euclidean', 'n_neighbors': 11, 'weights': 'distance'}
Training Logistic Regression...
Fitting 5 folds for each of 20 candidates, totalling 100 fits




Best parameters for Logistic Regression: {'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'}
Training MLPC...
Fitting 5 folds for each of 96 candidates, totalling 480 fits




Best parameters for MLPC: {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (50, 50), 'learning_rate': 'adaptive', 'solver': 'sgd'}


In [30]:
# Rebuild the three scaled-feature models with the hyper-parameters printed by
# the grid search. This rebinds the baseline names LRmodel, MLPmodel and KNNmodel.
lr_best = dict(C=0.1, penalty='l1', solver='liblinear')
LRmodel = LogisticRegression(random_state=2020, **lr_best)
LRmodel.fit(X_train_scaled, y_train)

mlp_best = dict(
    activation='tanh',
    alpha=0.0001,
    hidden_layer_sizes=(50, 50),
    learning_rate='adaptive',
    solver='sgd',
)
MLPmodel = MLPClassifier(random_state=2020, **mlp_best)
MLPmodel.fit(X_train_scaled, y_train)

knn_best = dict(metric='euclidean', n_neighbors=11, weights='distance')
KNNmodel = KNeighborsClassifier(**knn_best)
KNNmodel.fit(X_train_scaled, y_train)

In [31]:
# Print each tuned model followed by its classification report on the scaled test split.
estimators = [LRmodel, MLPmodel, KNNmodel]

for model in estimators:
    y_pred = model.predict(X_test_scaled)
    print(model, classification_report(y_test, y_pred), sep="\n")

LogisticRegression(C=0.1, penalty='l1', random_state=2020, solver='liblinear')
              precision    recall  f1-score   support

           0       0.65      0.67      0.66       816
           1       0.63      0.60      0.61       747

    accuracy                           0.64      1563
   macro avg       0.64      0.64      0.64      1563
weighted avg       0.64      0.64      0.64      1563

MLPClassifier(activation='tanh', hidden_layer_sizes=(50, 50),
              learning_rate='adaptive', random_state=2020, solver='sgd')
              precision    recall  f1-score   support

           0       0.66      0.67      0.67       816
           1       0.63      0.63      0.63       747

    accuracy                           0.65      1563
   macro avg       0.65      0.65      0.65      1563
weighted avg       0.65      0.65      0.65      1563

KNeighborsClassifier(metric='euclidean', n_neighbors=11, weights='distance')
              precision    recall  f1-score   support



- Trying Voting Classifier 

In [32]:
from sklearn.ensemble import VotingClassifier

# Hard-voting ensemble over all seven tuned models, fitted on the scaled features.
voting_members = [
    ('KNN', KNNmodel),
    ('MLPC', MLPmodel),
    ('LogisticRegression', LRmodel),
    ('DecisionTree', DTmodel2),
    ('RandomForest', RFCmodel2),
    ('Gradient Boosting', GBCmodel2),
    ('XGBoost', XGBmodel2),
]

VCmodel = VotingClassifier(estimators=voting_members, voting='hard', verbose=True)
VCmodel.fit(X_train_scaled, y_train)

[Voting] ...................... (1 of 7) Processing KNN, total=   0.0s
[Voting] ..................... (2 of 7) Processing MLPC, total=   3.3s
[Voting] ....... (3 of 7) Processing LogisticRegression, total=   0.0s
[Voting] ............. (4 of 7) Processing DecisionTree, total=   0.0s
[Voting] ............. (5 of 7) Processing RandomForest, total=   0.6s
[Voting] ........ (6 of 7) Processing Gradient Boosting, total=   4.9s
[Voting] .................. (7 of 7) Processing XGBoost, total=   0.6s


In [33]:
# Score the voting ensemble on the scaled test split.
y_pred = VCmodel.predict(X_test_scaled)
voting_report = classification_report(y_test, y_pred)
print(voting_report)

              precision    recall  f1-score   support

           0       0.65      0.73      0.69       816
           1       0.66      0.57      0.61       747

    accuracy                           0.66      1563
   macro avg       0.66      0.65      0.65      1563
weighted avg       0.66      0.66      0.65      1563



- Trying Stacking Classifier

In [34]:
# Meta-learner passed as `final_estimator` to the stacking classifier below.
# NOTE(review): despite its plural name, `estimators` now holds a single
# classifier rather than the lists it held in earlier cells.
estimators = MLPClassifier(random_state=2020)

In [35]:
# Fit the meta-learner directly on the scaled features.
# NOTE(review): StackingClassifier is documented to fit its final estimator on
# the base models' predictions, so this standalone fit probably has no effect
# on SCmodel - confirm and consider removing it.
estimators.fit(X_train_scaled, y_train)



In [36]:
from sklearn.ensemble import StackingClassifier

# Stacking ensemble: the seven tuned models are the base learners and the
# MLP held in `estimators` is the final estimator.
stacking_members = [
    ('KNN', KNNmodel),
    ('MLPC', MLPmodel),
    ('LogisticRegression', LRmodel),
    ('DecisionTree', DTmodel2),
    ('RandomForest', RFCmodel2),
    ('Gradient Boosting', GBCmodel2),
    ('XGBoost', XGBmodel2),
]

SCmodel = StackingClassifier(
    estimators=stacking_members,
    final_estimator=estimators,
    verbose=True,
)

SCmodel.fit(X_train_scaled, y_train)

In [37]:
# Score the stacking ensemble on the scaled test split.
y_pred = SCmodel.predict(X_test_scaled)
stacking_report = classification_report(y_test, y_pred)
print(stacking_report)

              precision    recall  f1-score   support

           0       0.65      0.78      0.71       816
           1       0.69      0.55      0.61       747

    accuracy                           0.67      1563
   macro avg       0.67      0.66      0.66      1563
weighted avg       0.67      0.67      0.66      1563



In [38]:
# End-to-end pipeline: encoding + scaling followed by the stacking classifier.
# NOTE(review): the 'unknown' -> most-frequent job imputation done earlier in
# the notebook is not part of this pipeline - confirm how raw 'unknown' jobs
# should be handled at prediction time.
pipeline_steps = [
    ("preprocessor", transformer2),
    ("model", SCmodel),
]
stacking_pipeline = Pipeline(steps=pipeline_steps)

In [39]:
# Fit preprocessing and the stacking model together on the untransformed training split.
stacking_pipeline.fit(X_train, y_train)

In [40]:
# Score the full pipeline on the untransformed test split.
y_pred = stacking_pipeline.predict(X_test)
pipeline_report = classification_report(y_test, y_pred)
print(pipeline_report)

              precision    recall  f1-score   support

           0       0.65      0.78      0.71       816
           1       0.69      0.55      0.61       747

    accuracy                           0.67      1563
   macro avg       0.67      0.66      0.66      1563
weighted avg       0.67      0.67      0.66      1563



In [41]:
import pickle

In [42]:
# Persist the fitted end-to-end pipeline to disk.
filename = "SC_FinalModel.pkl"

# The context manager closes the file handle even if pickling fails;
# the previous bare open() call never closed it.
with open(filename, "wb") as model_file:
    pickle.dump(stacking_pipeline, model_file)