## Import modules

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report
import pickle
%matplotlib inline
warnings.filterwarnings('ignore')

## Loading the dataset

In [2]:
df = pd.read_csv('bank-additional.csv', sep=';')
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,30,blue-collar,married,basic.9y,no,yes,no,cellular,may,fri,...,2,999,0,nonexistent,-1.8,92.893,-46.2,1.313,5099.1,no
1,39,services,single,high.school,no,no,no,telephone,may,fri,...,4,999,0,nonexistent,1.1,93.994,-36.4,4.855,5191.0,no
2,25,services,married,high.school,no,yes,no,telephone,jun,wed,...,1,999,0,nonexistent,1.4,94.465,-41.8,4.962,5228.1,no
3,38,services,married,basic.9y,no,unknown,unknown,telephone,jun,fri,...,3,999,0,nonexistent,1.4,94.465,-41.8,4.959,5228.1,no
4,47,admin.,married,university.degree,no,yes,no,cellular,nov,mon,...,1,999,0,nonexistent,-0.1,93.2,-42.0,4.191,5195.8,no


In [3]:
# statistical info
df.describe()

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
count,4119.0,4119.0,4119.0,4119.0,4119.0,4119.0,4119.0,4119.0,4119.0,4119.0
mean,40.11362,256.788055,2.537266,960.42219,0.190337,0.084972,93.579704,-40.499102,3.621356,5166.481695
std,10.313362,254.703736,2.568159,191.922786,0.541788,1.563114,0.579349,4.594578,1.733591,73.667904
min,18.0,0.0,1.0,0.0,0.0,-3.4,92.201,-50.8,0.635,4963.6
25%,32.0,103.0,1.0,999.0,0.0,-1.8,93.075,-42.7,1.334,5099.1
50%,38.0,181.0,2.0,999.0,0.0,1.1,93.749,-41.8,4.857,5191.0
75%,47.0,317.0,3.0,999.0,0.0,1.4,93.994,-36.4,4.961,5228.1
max,88.0,3643.0,35.0,999.0,6.0,1.4,94.767,-26.9,5.045,5228.1


In [4]:
# datatype info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4119 entries, 0 to 4118
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             4119 non-null   int64  
 1   job             4119 non-null   object 
 2   marital         4119 non-null   object 
 3   education       4119 non-null   object 
 4   default         4119 non-null   object 
 5   housing         4119 non-null   object 
 6   loan            4119 non-null   object 
 7   contact         4119 non-null   object 
 8   month           4119 non-null   object 
 9   day_of_week     4119 non-null   object 
 10  duration        4119 non-null   int64  
 11  campaign        4119 non-null   int64  
 12  pdays           4119 non-null   int64  
 13  previous        4119 non-null   int64  
 14  poutcome        4119 non-null   object 
 15  emp.var.rate    4119 non-null   float64
 16  cons.price.idx  4119 non-null   float64
 17  cons.conf.idx   4119 non-null   f

In [5]:
# find unique values
df.apply(lambda x: len(x.unique()))

age                67
job                12
marital             4
education           8
default             3
housing             3
loan                3
contact             2
month              10
day_of_week         5
duration          828
campaign           25
pdays              21
previous            7
poutcome            3
emp.var.rate       10
cons.price.idx     26
cons.conf.idx      26
euribor3m         234
nr.employed        11
y                   2
dtype: int64

# Data Cleaning

In [6]:
unknown_values = ['unknown']
df.replace(unknown_values, np.nan, inplace=True)

In [7]:
# In order to detect the missing values, I do the following operation and determine the total value of the missing values.
# Checking for unknown values and replacing them with NaN

df.isnull().sum()

age                 0
job                39
marital            11
education         167
default           803
housing           105
loan              105
contact             0
month               0
day_of_week         0
duration            0
campaign            0
pdays               0
previous            0
poutcome            0
emp.var.rate        0
cons.price.idx      0
cons.conf.idx       0
euribor3m           0
nr.employed         0
y                   0
dtype: int64

In [8]:
# Handle missing values for categorical features
categorical_features = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']
for feature in categorical_features:
    mode_value = df[feature].mode().values[0]
    df[feature].fillna(mode_value, inplace=True)

# Data Preprocessing

In [9]:
# Improve the metric use one hot encoding, label encoding
cols = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome'] 
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for col in cols:
    df[col] = le.fit_transform(df[col])
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,30,1,1,2,0,1,0,0,6,0,...,2,999,0,1,-1.8,92.893,-46.2,1.313,5099.1,no
1,39,7,2,3,0,0,0,1,6,0,...,4,999,0,1,1.1,93.994,-36.4,4.855,5191.0,no
2,25,7,1,3,0,1,0,1,4,4,...,1,999,0,1,1.4,94.465,-41.8,4.962,5228.1,no
3,38,7,1,2,0,1,0,1,4,0,...,3,999,0,1,1.4,94.465,-41.8,4.959,5228.1,no
4,47,0,1,6,0,1,0,0,7,1,...,1,999,0,1,-0.1,93.2,-42.0,4.191,5195.8,no


In [10]:
columns_with_outliers = ['age', 'duration', 'campaign', 'pdays', 'previous', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']

# Function to handle outliers using IQR
def handle_outliers_iqr(data, column):
    q1 = data[column].quantile(0.25)
    q3 = data[column].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    data[column] = data[column].clip(lower_bound, upper_bound)

# Apply outlier handling using IQR for specified columns
for column in columns_with_outliers:
    handle_outliers_iqr(df, column)

df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,30.0,1,1,2,0,1,0,0,6,0,...,2,999,0,1,-1.8,92.893,-46.2,1.313,5099.1,no
1,39.0,7,2,3,0,0,0,1,6,0,...,4,999,0,1,1.1,93.994,-36.4,4.855,5191.0,no
2,25.0,7,1,3,0,1,0,1,4,4,...,1,999,0,1,1.4,94.465,-41.8,4.962,5228.1,no
3,38.0,7,1,2,0,1,0,1,4,0,...,3,999,0,1,1.4,94.465,-41.8,4.959,5228.1,no
4,47.0,0,1,6,0,1,0,0,7,1,...,1,999,0,1,-0.1,93.2,-42.0,4.191,5195.8,no


In [11]:
# I imported MinMaxScaler to perform feature scaling for numerical columns.

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
numColums = ['age', 'duration', 'campaign', 'pdays', 'previous', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed'] 
df[numColums] = scaler.fit_transform(df[numColums])

# Feature Engineering

In [12]:
# I do feature engineering and add some new features.

df['total_contacts'] = df['campaign'] + df['previous']
df['euribor_emp_var'] = df['euribor3m'] * df['emp.var.rate']

In [13]:
# Encode 'y' for other processes.

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['y'] = le.fit_transform(df['y']) 

print(df['y'].unique())

df.head()

[0 1]


Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y,total_contacts,euribor_emp_var
0,0.23301,1,1,2,0,1,0,0,6,0,...,0.0,1,0.333333,0.26968,0.192872,0.153741,0.512287,0,0.2,0.051247
1,0.407767,7,2,3,0,0,0,1,6,0,...,0.0,1,0.9375,0.698753,0.603774,0.956916,0.859735,0,0.6,0.897109
2,0.135922,7,1,3,0,1,0,1,4,4,...,0.0,1,1.0,0.882307,0.377358,0.981179,1.0,0,0.0,0.981179
3,0.38835,7,1,2,0,1,0,1,4,0,...,0.0,1,1.0,0.882307,0.377358,0.980499,1.0,0,0.4,0.980499
4,0.563107,0,1,6,0,1,0,0,7,1,...,0.0,1,0.6875,0.389322,0.368973,0.806349,0.877883,0,0.0,0.554365


## Input Split

In [14]:
X = df.drop('y', axis=1)
y = df['y']

I have done the desired model selection, hyperparameter tuning, pipelines and evaluation below. Then I created the best_model.pkl file for the deployment process. I will use this file in my streamlit app.

In [15]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
import pickle

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define preprocessing steps
numeric_features = ['age', 'duration', 'campaign', 'pdays', 'previous', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler())])

categorical_features = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# I define classifiers and parameter grid
classifiers = {
    'Logistic Regression': (LogisticRegression(), {
        'classifier__C': [0.1, 1.0, 10.0],
        'classifier__penalty': ['l2']}),
    'Random Forest': (RandomForestClassifier(), {
        'classifier__n_estimators': [100, 200, 300],
        'classifier__max_depth': [2, 4, 6],
        'classifier__min_samples_split': [2, 5, 10]}),
    'Gradient Boosting': (GradientBoostingClassifier(), {
        'classifier__n_estimators': [100, 200, 300],
        'classifier__learning_rate': [0.1, 0.05, 0.01],
        'classifier__max_depth': [3, 4, 5]})
}

# Train and evaluate models
results = {}
for model_name, (model, params) in classifiers.items():
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', model)])
    grid_search = GridSearchCV(pipeline, param_grid=params, cv=3)
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_
    predictions = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    report = classification_report(y_test, predictions)
    results[model_name] = {'best_model': best_model, 'accuracy': accuracy, 'classification_report': report}

# Determine best model
best_model_name = max(results, key=lambda x: results[x]['accuracy'])
best_model = results[best_model_name]['best_model']
best_accuracy = results[best_model_name]['accuracy']
best_classification_report = results[best_model_name]['classification_report']

# Save the best model for deployment
with open('best_model.pkl', 'wb') as file:
    pickle.dump(best_model, file)

# I evaluate the performance of the final model using appropriate evaluation metrics and find best model.
for model_name, result in results.items():
    print(f"{model_name}: Accuracy = {result['accuracy']}")
    
print('\nBest Model:')
print(f"Model: {best_model_name}")
print(f"Accuracy: {best_accuracy}")
print("Classification Report:")
print(best_classification_report)

Logistic Regression: Accuracy = 0.9029126213592233
Random Forest: Accuracy = 0.9016990291262136
Gradient Boosting: Accuracy = 0.8992718446601942

Best Model:
Model: Logistic Regression
Accuracy: 0.9029126213592233
Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.97      0.95       732
           1       0.61      0.37      0.46        92

    accuracy                           0.90       824
   macro avg       0.77      0.67      0.70       824
weighted avg       0.89      0.90      0.89       824



In [17]:
# These for testing

import pandas as pd

# feature_names is a list of the names of the columns of your data
feature_names = ['age', 'job', 'marital', 'education', 'default', 'housing', 
                 'loan', 'contact', 'month', 'day_of_week', 'duration', 'campaign', 
                 'pdays', 'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx', 
                 'cons.conf.idx', 'euribor3m', 'nr.employed']

input_data = (50,'blue-collar','married','basic.4y','no','no','yes','cellular','jul','tue',849,1,999,0,'nonexistent',1.4,93.918,-42.7,4.961,5228.1)
# input_data = (39,'services','single','high.school','no','no','no','telephone','may','fri',346,4,999,0,'nonexistent',1.1,93.994,-36.4,4.855,5191)

# convert input data to a DataFrame
input_data_df = pd.DataFrame([input_data], columns=feature_names)

# Apply preprocessing steps on the input data
processed_input_data = best_model.named_steps['preprocessor'].transform(input_data_df)

# Make the prediction
prediction = best_model.named_steps['classifier'].predict(processed_input_data)

# Print the prediction
if prediction == 0:
    prediction = "no"
else:
    prediction = "yes"

print('Prediction : ',str(prediction))

Prediction :  yes
