# SECTION C – Advance Machine Learning

Bank Marketing:
The dataset is related with direct marketing campaigns of a Portuguese banking institution. The
marketing campaigns were based on phone calls. Often, more than one contact to the same client
was required, in order to access if the product (bank term deposit) would be ('yes') or not ('no')
subscribed.

Objective:
The binary classification goal is to predict if the client will subscribe a bank term deposit or not.
Gradient  boosting techniques learned in the module to solve this business problem.
N.B: The data set may seem highly imbalanced, so please apply required technique to balance
the data then apply classifier.

**Task 1: Bank Marketing** 

In [0]:
#upload data 
url = 'https://raw.githubusercontent.com/shivckr/border/master/bank-additional-full.csv'

In [0]:
# import packages 
import pandas as pd                                                              # package to manipulate dataframe
from sklearn.ensemble import GradientBoostingRegressor                           # gradient boost algorithm 
from sklearn.metrics import mean_squared_error                                   # Metrics to evalaute the model
from sklearn.model_selection import train_test_split                             # splitting the data into train and test  
from sklearn.metrics import mean_absolute_error                                  # Metrics to evalaute the model
from sklearn import metrics
import numpy as np                                                                # scientific computing package



In [0]:
# read the csv file 
    bank_mkt_data = pd.read_csv(url, delimiter=';') 


In [0]:
 #data head
 bank_mkt_data.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,261,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,149,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,226,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,151,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,307,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [0]:
#creating one hot encoding for categorical feature
categorical_features = ['job','marital','education','default', 'housing','loan','contact','month','day_of_week', 'poutcome','y' ]

In [0]:
#creating one hot encoding for categorical feature


job_dummy = pd.get_dummies(bank_mkt_data['job'],drop_first=True, prefix='job')

housing_dummy = pd.get_dummies(bank_mkt_data['housing'],drop_first=True, prefix='housing')

education_dummy = pd.get_dummies(bank_mkt_data['education'],drop_first=True,prefix='education')
marital_dummy = pd.get_dummies(bank_mkt_data['marital'],drop_first=True,prefix='marital')
default_dummy = pd.get_dummies(bank_mkt_data['default'],drop_first=True,prefix='default')

loan_dummy = pd.get_dummies(bank_mkt_data['loan'],drop_first=True,prefix='loan')

contact_dummy = pd.get_dummies(bank_mkt_data['contact'],drop_first=True,prefix='contact')
month_dummy = pd.get_dummies(bank_mkt_data['month'],drop_first=True,prefix='month')
day_of_week_dummy = pd.get_dummies(bank_mkt_data['day_of_week'],drop_first=True,prefix='day_of_week')

poutcome_dummy = pd.get_dummies(bank_mkt_data['poutcome'],drop_first=True,prefix='poutcome')



In [0]:
#target variable
y_dummy = pd.get_dummies(bank_mkt_data['y'],drop_first=True,prefix='y')

In [0]:
# merge all dummified categorical columns
bank_mkt_data= pd.concat([bank_mkt_data,job_dummy,housing_dummy,education_dummy,marital_dummy,default_dummy,loan_dummy,contact_dummy,month_dummy,day_of_week_dummy,poutcome_dummy],1)

In [0]:
# merge target variable
bank_mkt_data = pd.concat([bank_mkt_data, y_dummy],axis=1)

In [0]:
bank_mkt_data.head(5)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y,housing_unknown,housing_yes,education_basic.6y,education_basic.9y,education_high.school,education_illiterate,education_professional.course,education_university.degree,education_unknown,marital_married,marital_single,marital_unknown,default_unknown,default_yes,loan_unknown,loan_yes,contact_telephone,month_aug,month_dec,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_nonexistent,poutcome_success,y_yes
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,261,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,149,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0
2,37,services,married,high.school,no,yes,no,telephone,may,mon,226,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,151,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0
4,56,services,married,high.school,no,no,yes,telephone,may,mon,307,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0


In [0]:
bank_mkt_data.shape

(41188, 54)

In [0]:
#delete all non dummified columns
bank_mkt_data =bank_mkt_data.drop(categorical_features,axis = 1)

In [0]:
bank_mkt_data.shape

(41188, 43)

In [0]:
bank_mkt_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41188 entries, 0 to 41187
Data columns (total 43 columns):
age                              41188 non-null int64
duration                         41188 non-null int64
campaign                         41188 non-null int64
pdays                            41188 non-null int64
previous                         41188 non-null int64
emp.var.rate                     41188 non-null float64
cons.price.idx                   41188 non-null float64
cons.conf.idx                    41188 non-null float64
euribor3m                        41188 non-null float64
nr.employed                      41188 non-null float64
housing_unknown                  41188 non-null uint8
housing_yes                      41188 non-null uint8
education_basic.6y               41188 non-null uint8
education_basic.9y               41188 non-null uint8
education_high.school            41188 non-null uint8
education_illiterate             41188 non-null uint8
education_profess

In [0]:
# no missing value
bank_mkt_data.isnull().sum()

age                              0
duration                         0
campaign                         0
pdays                            0
previous                         0
emp.var.rate                     0
cons.price.idx                   0
cons.conf.idx                    0
euribor3m                        0
nr.employed                      0
housing_unknown                  0
housing_yes                      0
education_basic.6y               0
education_basic.9y               0
education_high.school            0
education_illiterate             0
education_professional.course    0
education_university.degree      0
education_unknown                0
marital_married                  0
marital_single                   0
marital_unknown                  0
default_unknown                  0
default_yes                      0
loan_unknown                     0
loan_yes                         0
contact_telephone                0
month_aug                        0
month_dec           

In [0]:
# prepare data
X = bank_mkt_data.drop(['y_yes'],axis=1)
y = bank_mkt_data.y_yes

In [0]:
# check target  count 
y.value_counts()

0    36548
1     4640
Name: y_yes, dtype: int64

In [0]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(ratio='minority')                  # To resample the minority class
X_sm, y_sm = smote.fit_sample(X, y)              # Fit SMOTE on the data



In [0]:
# split data  
X_train, X_test, y_train, y_test = train_test_split(X_sm, y_sm)

In [0]:
# construct and fit  model
regressor = GradientBoostingRegressor(
    max_depth=2,
    n_estimators=3,
    learning_rate=1.0
)
regressor.fit(X_train, y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=1.0, loss='ls', max_depth=2,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=3,
                          n_iter_no_change=None, presort='auto',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [0]:
#error at each stage of training
errors = [mean_squared_error(y_test, y_pred) for y_pred in regressor.staged_predict(X_test)]
best_n_estimators = np.argmin(errors)

In [0]:
# train model with best suitable parameter 
best_regressor = GradientBoostingRegressor(
    max_depth=2,
    n_estimators=best_n_estimators,
    learning_rate=1.0
)
best_regressor.fit(X_train, y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=1.0, loss='ls', max_depth=2,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=2,
                          n_iter_no_change=None, presort='auto',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [0]:
# mean absolute error
y_pred = best_regressor.predict(X_test)
mean_absolute_error(y_test, y_pred)

0.2614596322926371