In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
df = pd.read_csv('clean_data.csv', index_col='Id')

In [4]:
df.head()

Unnamed: 0_level_0,age,job,marital,education,default,balance,housing,loan,contact,month,duration,campaign,pdays,previous,y
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1002,44,technician,single,secondary,no,29.0,yes,no,cellular,may,151,1,5000,0,no
1003,33,entrepreneur,married,secondary,no,2.0,yes,yes,cellular,may,76,1,5000,0,no
1004,47,blue-collar,married,secondary,no,1506.0,yes,no,cellular,may,92,1,5000,0,no
1005,33,blue-collar,single,secondary,no,1.0,no,no,cellular,may,198,1,5000,0,no
1006,35,management,married,tertiary,no,231.0,yes,no,cellular,may,139,1,5000,0,no


## Preprocessing

1. Created dummy variables for categorical features
2. Scaled numeric data

In [5]:
# dummy variables

df_dummies = pd.get_dummies(df[['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month']])

In [34]:
# concatenate dataframes

df = pd.concat([df, df_dummies], axis=1)
df.head()

Unnamed: 0_level_0,age,balance,duration,campaign,pdays,previous,y,job_admin.,job_blue-collar,job_entrepreneur,...,month_dec,month_feb,month_jan,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1002,44,29.0,151,1,5000,0,no,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1003,33,2.0,76,1,5000,0,no,0,0,1,...,0,0,0,0,0,0,1,0,0,0
1004,47,1506.0,92,1,5000,0,no,0,1,0,...,0,0,0,0,0,0,1,0,0,0
1005,33,1.0,198,1,5000,0,no,0,1,0,...,0,0,0,0,0,0,1,0,0,0
1006,35,231.0,139,1,5000,0,no,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [7]:
# drop categorical variables that have dummy variables created

df = df.drop(['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month'], axis=1)
df.head()

Unnamed: 0_level_0,age,balance,duration,campaign,pdays,previous,y,job_admin.,job_blue-collar,job_entrepreneur,...,month_dec,month_feb,month_jan,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1002,44,29.0,151,1,5000,0,no,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1003,33,2.0,76,1,5000,0,no,0,0,1,...,0,0,0,0,0,0,1,0,0,0
1004,47,1506.0,92,1,5000,0,no,0,1,0,...,0,0,0,0,0,0,1,0,0,0
1005,33,1.0,198,1,5000,0,no,0,1,0,...,0,0,0,0,0,0,1,0,0,0
1006,35,231.0,139,1,5000,0,no,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [32]:
# scale data

df_scaled = df.copy()
scaler = StandardScaler()
cols_to_scale = ['age', 'balance', 'duration', 'campaign',
       'pdays', 'previous']
df_scaled[cols_to_scale] = scaler.fit_transform(df_scaled[cols_to_scale])

In [10]:
df_scaled.head()

Unnamed: 0_level_0,age,balance,duration,campaign,pdays,previous,y,job_admin.,job_blue-collar,job_entrepreneur,...,month_dec,month_feb,month_jan,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1002,0.288587,-0.437929,-0.416156,-0.569467,0.472587,-0.251937,no,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1003,-0.747423,-0.446795,-0.707479,-0.569467,0.472587,-0.251937,no,0,0,1,...,0,0,0,0,0,0,1,0,0,0
1004,0.571136,0.047104,-0.64533,-0.569467,0.472587,-0.251937,no,0,1,0,...,0,0,0,0,0,0,1,0,0,0
1005,-0.747423,-0.447124,-0.233593,-0.569467,0.472587,-0.251937,no,0,1,0,...,0,0,0,0,0,0,1,0,0,0
1006,-0.559057,-0.371594,-0.462767,-0.569467,0.472587,-0.251937,no,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [11]:
X = df_scaled.drop(['y'], axis=1)
y = df_scaled['y']

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Baseline Models

Tried the following 3 models with default parameters using 5-fold cross-validation.

a. Gaussian Naive-Bayes (85.9%)

b. Logistic Regression (89.4%)

c. K-Nearest Neighbors (89%)

In [31]:
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

In [14]:
# gaussian naive-bayes classifier

gnb = GaussianNB()
cv = cross_val_score(gnb,X_train,y_train,cv=5)
print(cv)
print(cv.mean())

[0.85481195 0.86073849 0.86018531 0.86129166 0.85880238]
0.8591659573517658


In [15]:
# logistic regression

lr = LogisticRegression(max_iter = 2000)
cv = cross_val_score(lr,X_train,y_train,cv=5)
print(cv)
print(cv.mean())

[0.89408186 0.89406721 0.89323745 0.89392892 0.89296086]
0.8936552598020077


In [16]:
# k-nearest neighbors classifier

knn = KNeighborsClassifier()
cv = cross_val_score(knn,X_train,y_train,cv=5)
print(cv)
print(cv.mean())

[0.88606195 0.88950353 0.89185452 0.88978011 0.89171622]
0.8897832647781246


In [17]:
from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import RandomizedSearchCV

In [18]:
# performance reporting function

def clf_performance(classifier, model_name):
    print(model_name)
    print('Best Score: ' + str(classifier.best_score_))
    print('Best Parameters: ' + str(classifier.best_params_))

## Model Fine-Tuning

Used grid-search to fine-tune the logistic regression and knn models. It produces only neglibible improvements in the scores of both models.

a. Logistic Regression (89.4%)

b. K-Nearest Neighbors (89.3%)

In [19]:
# hyperparameter tuning for logistic regression model

lr = LogisticRegression()
param_grid = {'max_iter' : [2000],
              'penalty' : ['l1', 'l2', 'elasticnet'],
              'C' : np.logspace(-4, 4, 20),
              'solver' : ['liblinear']}

clf_lr = GridSearchCV(lr, param_grid = param_grid, cv = 5, verbose = True, n_jobs = -1)
best_clf_lr = clf_lr.fit(X_train,y_train)
clf_performance(best_clf_lr,'Logistic Regression')

Fitting 5 folds for each of 60 candidates, totalling 300 fits


 0.88602169 0.88917469        nan 0.88953427 0.89119372        nan
 0.89138733 0.89227239        nan 0.89254895 0.89307444        nan
 0.89368286 0.89337867        nan 0.89337867 0.89337869        nan
 0.89351696 0.89340632        nan 0.8934893  0.89357228        nan
 0.89365526 0.89371058        nan 0.89384886 0.89376589        nan
 0.89379355 0.89382121        nan 0.89382121 0.89382121        nan
 0.89382121 0.89382121        nan 0.89382121 0.89382121        nan
 0.89382121 0.89382121        nan 0.89382121 0.89382121        nan
 0.89382121 0.89382121        nan 0.89382121 0.89382121        nan]


Logistic Regression
Best Score: 0.8938488629952406
Best Parameters: {'C': 4.281332398719396, 'max_iter': 2000, 'penalty': 'l1', 'solver': 'liblinear'}


In [108]:
# hyperparameter tuning for k-nearest neighbours classifier model

knn = KNeighborsClassifier()
param_grid = {'n_neighbors' : [3,5,7,9],
              'weights' : ['uniform', 'distance'],
              'algorithm' : ['auto', 'ball_tree','kd_tree'],
              'p' : [1,2]}
clf_knn = GridSearchCV(knn, param_grid = param_grid, cv = 5, verbose = True, n_jobs = -1)
best_clf_knn = clf_knn.fit(X_train,y_train)
clf_performance(best_clf_knn,'KNN')

Fitting 5 folds for each of 48 candidates, totalling 240 fits
KNN
Best Score: 0.8931574798403629
Best Parameters: {'algorithm': 'auto', 'n_neighbors': 9, 'p': 2, 'weights': 'uniform'}


## Model Testing

In [23]:
# predictions

y_pred = best_clf_lr.predict(X_test)

array(['no', 'no', 'yes', ..., 'no', 'yes', 'no'], dtype=object)

In [33]:
# create dataframe and csv file for observations & predictions

final_data = {'Observations': y_test, 'Predictions': y_pred}
df_main = pd.DataFrame(data=final_data)
df_main.to_csv('predictions.csv' ,index=False)
df_main.head()

Unnamed: 0_level_0,Observations,Predictions
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
20905,no,no
1911,no,no
42523,no,yes
7444,no,no
16183,no,yes


In [30]:
# confusion matrix

from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, y_pred))

[[7777  180]
 [ 779  304]]
