In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime as dt
import seaborn as sns
%matplotlib inline

from sklearn import metrics
from sklearn import preprocessing
from sklearn import linear_model
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier
from sklearn import tree
from sklearn.model_selection import cross_val_score

In [2]:
data_1 = pd.read_csv("question_dataset.csv")

In [3]:
data_1.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [4]:
data_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   job             41188 non-null  object 
 2   marital         41188 non-null  object 
 3   education       41188 non-null  object 
 4   default         41188 non-null  object 
 5   housing         41188 non-null  object 
 6   loan            41188 non-null  object 
 7   contact         41188 non-null  object 
 8   month           41188 non-null  object 
 9   day_of_week     41188 non-null  object 
 10  duration        41188 non-null  int64  
 11  campaign        41188 non-null  int64  
 12  pdays           41188 non-null  int64  
 13  previous        41188 non-null  int64  
 14  poutcome        41188 non-null  object 
 15  emp.var.rate    41188 non-null  float64
 16  cons.price.idx  41188 non-null  float64
 17  cons.conf.idx   41188 non-null 

## Now, every column is now being coverted to one-hot encoding, label or any other method.

# Binning & one-hot of age

In [5]:
data_1['age'] = pd.cut(data_1['age'], bins=[20,30,40,50,60,70,80,90])

In [6]:
data_1 = pd.get_dummies(data_1, columns=['age'], drop_first=False)
data_1.head()

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,...,euribor3m,nr.employed,y,"age_(20, 30]","age_(30, 40]","age_(40, 50]","age_(50, 60]","age_(60, 70]","age_(70, 80]","age_(80, 90]"
0,housemaid,married,basic.4y,no,no,no,telephone,may,mon,261,...,4.857,5191.0,no,0,0,0,1,0,0,0
1,services,married,high.school,unknown,no,no,telephone,may,mon,149,...,4.857,5191.0,no,0,0,0,1,0,0,0
2,services,married,high.school,no,yes,no,telephone,may,mon,226,...,4.857,5191.0,no,0,1,0,0,0,0,0
3,admin.,married,basic.6y,no,no,no,telephone,may,mon,151,...,4.857,5191.0,no,0,1,0,0,0,0,0
4,services,married,high.school,no,no,yes,telephone,may,mon,307,...,4.857,5191.0,no,0,0,0,1,0,0,0


# In education column, basic.4y,basic.6y and basic.9y = Basic. 

In [7]:
data_1['education']=np.where(data_1['education'] =='basic.9y', 'Basic', data_1['education'])
data_1['education']=np.where(data_1['education'] =='basic.6y', 'Basic', data_1['education'])
data_1['education']=np.where(data_1['education'] =='basic.4y', 'Basic', data_1['education'])

# One hot encoding of job, marital, education,month, day_of_week

In [8]:
data_1 = pd.get_dummies(data_1, columns=['job', 'marital', 'education', 'month','day_of_week'], drop_first=False)

# Mapping appropriate values to poutcome, default, housing and loan

In [9]:
data_1['poutcome'] = data_1['poutcome'].map({'failure': -1,'nonexistent': 0,'success': 1})
data_1['default'] = data_1['default'].map({'yes': 1,'unknown': 0,'no': -1})
data_1['housing'] = data_1['housing'].map({'yes': 1,'unknown': 0,'no': -1})
data_1['loan'] = data_1['loan'].map({'yes': 1,'unknown': 0,'no': -1})

# If contact type is telephone then 0, cellular is 1. 

In [10]:
data_1['contact']=np.where(data_1['contact'] =='telephone', 0, 1)

In [11]:
data_1.duration.unique()

array([ 261,  149,  226, ..., 1246, 1556, 1868], dtype=int64)

# Dropping duration column because of no pratical use to build model. 

In [12]:
data_1 = data_1.drop(['duration'],axis=1)

# For campaign, if one type performed then 1, else 0. 

In [13]:
data_1['campaign'] = np.where(data_1['campaign']== 1, 1, 0)

# Working on pdays, if 999 = client was not contacted, 0 else 1. ( 1 means sum days are passed)

In [14]:
data_1['pdays'] = np.where(data_1['pdays']==0,0,1)

# In 'previous' column, when it is contacted one or more time, then value is 1 else, 0. 

In [15]:
data_1['previous'] = np.where(data_1['previous']== 0, 0, 1)

# Feature scaling - Standardization – If feature follows Gaussian distribution, Normalization – NO Gaussian distribution 

In [16]:
scaler_1 = StandardScaler()
data_1[['emp.var.rate','euribor3m','nr.employed']] = scaler_1.fit_transform(data_1[['emp.var.rate','euribor3m','nr.employed']].to_numpy())

In [17]:
scaler_2 = MinMaxScaler()
data_1[['cons.price.idx','cons.conf.idx']] = scaler_2.fit_transform(data_1[['cons.price.idx','cons.conf.idx']].to_numpy())

# In target column 'y', yes = 1 and no=0

In [18]:
data_1['y'] = np.where(data_1['y']== 'yes', 1, 0)

In [19]:
data_1.columns

Index(['default', 'housing', 'loan', 'contact', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed', 'y', 'age_(20, 30]',
       'age_(30, 40]', 'age_(40, 50]', 'age_(50, 60]', 'age_(60, 70]',
       'age_(70, 80]', 'age_(80, 90]', 'job_admin.', 'job_blue-collar',
       'job_entrepreneur', 'job_housemaid', 'job_management', 'job_retired',
       'job_self-employed', 'job_services', 'job_student', 'job_technician',
       'job_unemployed', 'job_unknown', 'marital_divorced', 'marital_married',
       'marital_single', 'marital_unknown', 'education_Basic',
       'education_high.school', 'education_illiterate',
       'education_professional.course', 'education_university.degree',
       'education_unknown', 'month_apr', 'month_aug', 'month_dec', 'month_jul',
       'month_jun', 'month_mar', 'month_may', 'month_nov', 'month_oct',
       'month_sep', 'day_of_week_fri', 'day_of_week_mon', 'day_of_week_thu'

# Defining X and y & Train-test split

In [20]:
X = data_1.drop('y', axis=1)
y = data_1.loc[:,'y']

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,random_state=32)

In [22]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(27595, 57)
(27595,)
(13593, 57)
(13593,)


# Logistic Regression

In [23]:
model_lg = linear_model.LogisticRegression(solver='liblinear')
model_lg.fit(X_train,y_train)
model_lg.score(X_train,y_train)

0.9015038956332669

In [24]:
# Model valdiation steps
y_actual = y_test 
y_pred =  model_lg.predict(X_test)


from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score

results = precision_recall_fscore_support(y_actual, y_pred,average='macro')
accuracy = accuracy_score(y_actual, y_pred)

precision = results[0]
recall = results[1]
fscore = results[2]

print(precision)
print(recall)
print(fscore)
print(accuracy)

0.7778800194746089
0.6049570316218651
0.639593193321439
0.8971529463694549


# Decision Tree

In [25]:
model_dt = tree.DecisionTreeClassifier(max_depth=5)
model_dt.fit(X_train,y_train)
model_dt.score(X_train,y_train)

0.9044754484508063

In [26]:
# Model valdiation steps
y_actual = y_test 
y_pred =  model_dt.predict(X_test)


from sklearn.metrics import precision_recall_fscore_support

results2 = precision_recall_fscore_support(y_actual, y_pred, average='macro')
accuracy2 = accuracy_score(y_actual, y_pred)

precision_dt = results2[0]
recall_dt = results2[1]
fscore_dt = results2[2]

print(precision_dt)
print(recall_dt)
print(fscore_dt)
print(accuracy2)

0.7670989117079164
0.6167951228201729
0.6521539987115156
0.8968586772603546


In [27]:
model_dt

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

# Random Forest 

In [67]:
from sklearn.ensemble import RandomForestRegressor
model_rf = RandomForestRegressor(n_estimators=50, random_state=0,max_depth=5)
model_rf.fit(X_train, y_train)
y_pred = regmodel_rf.predict(X_test)

In [68]:
# Model valdiation steps
y_actual = y_test 
y_pred =  regmodel_rf.predict(X_test)
from sklearn import metrics
accuracy3 = metrics.accuracy_score(y_actual, y_pred.round())
print(accuracy3)

0.8981828882513058


# Gradient Boosting

In [72]:
from sklearn.ensemble import GradientBoostingClassifier
gb = GradientBoostingClassifier(n_estimators = 50, learning_rate = 0.1, random_state=22)
gb = gb.fit(X_train, y_train)

In [76]:
y_pred =gb.predict(X_test)
acc_GB = accuracy_score(y_actual, y_pred)
print(acc_GB)

0.8988449937467814


In [77]:
from sklearn.metrics import precision_recall_fscore_support

results4 = precision_recall_fscore_support(y_actual, y_pred, average='macro')
accuracy4 = accuracy_score(y_actual, y_pred)

precision_gb = results4[0]
recall_gb = results4[1]
fscore_gb = results4[2]

print(precision_gb)
print(recall_gb)
print(fscore_gb)
print(accuracy4)

0.7901969970342034
0.6078668436010308
0.6441260489002887
0.8988449937467814


In [79]:
result1 = pd.DataFrame({
    'Metric': ['Accuracy', 'Precision', 'Recall', 'F Score'],
    'Logistic Regression with all features' : [precision, recall, fscore, accuracy],
    'Decision tree with all features' : [precision_dt, recall_dt, fscore_dt, accuracy2],
    'Random forest with all features' : [accuracy3,'-','-','-'],
    'Gradient boosting with all features' : [precision_gb, recall_gb, fscore_gb, accuracy4]})
result1

Unnamed: 0,Metric,Logistic Regression with all features,Decision tree with all features,Random forest with all features,Gradient boosting with all features
0,Accuracy,0.77788,0.767099,0.898183,0.790197
1,Precision,0.604957,0.616795,-,0.607867
2,Recall,0.639593,0.652154,-,0.644126
3,F Score,0.897153,0.896859,-,0.898845
