In [1]:
# Importing relevant ibraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data_original = pd.read_csv('bank_marketing.csv')
data_original.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no


We'll be modeling KNN classifiers on a revised dataset,
**by Dropping 'default' column from original dataset**

In [3]:
# Revised dataset: Dropping 'default' column from original dataset
data_revised_1 = data_original.copy()
data_revised_1.drop('default', axis=1, inplace=True)
data_revised_1.head()

Unnamed: 0,age,job,marital,education,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no


**This dataset has an Imbalanced Target Class. In the 'y' column, there are 88% observations with label 'no' and ONLY 12% observations with label 'yes'. <br/>
Machine learning algorithms have trouble learning when one class dominates the other in the target variable. And so before we train any ML model on this dataset we need to balance it appropriately.**

## Handling Imbalanced Classes

### Method 1 - SMOTE (Synthetic Minority Over-sampling TEchnique)

SMOTE draws artificial samples by choosing points that lie on the line connecting the rare observation to one of its nearest neighbors in the feature space.

In [4]:
# Importing relevant libraries
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# First, convert all columns into numeric variables
variables_set = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous', 'y']

# One Hot Encoding: All categorical columns/variables in the dataset must be encoded as numeric values.
# For dataset 1
one_hot_encoded_job       = pd.get_dummies(data_revised_1['job'])
one_hot_encoded_marital   = pd.get_dummies(data_revised_1['marital'])
one_hot_encoded_education = pd.get_dummies(data_revised_1['education'])
one_hot_encoded_housing   = pd.get_dummies(data_revised_1['housing'])
one_hot_encoded_loan      = pd.get_dummies(data_revised_1['loan'])
one_hot_encoded_contact   = pd.get_dummies(data_revised_1['contact'])
one_hot_encoded_month     = pd.get_dummies(data_revised_1['month'])
one_hot_encoded_poutcome  = pd.get_dummies(data_revised_1['poutcome'])

data_revised_1_converted = data_revised_1[variables_set]

data_revised_1_converted = pd.concat([data_revised_1_converted,
                                      one_hot_encoded_job, 
                                      one_hot_encoded_marital, 
                                      one_hot_encoded_education,
                                      one_hot_encoded_housing,
                                      one_hot_encoded_loan,
                                      one_hot_encoded_contact,
                                      one_hot_encoded_month,
                                      one_hot_encoded_poutcome], axis = 1)



In [5]:
# From the 1st dataset, create a training set and a test set
data_revised_1_X_train, data_revised_1_X_test, \
data_revised_1_y_train, data_revised_1_y_test = train_test_split(data_revised_1_converted.drop('y', axis=1),
                                                                 data_revised_1_converted.y, 
                                                                 test_size=.1,       # Training set = 90%, Test set = 10%
                                                                 random_state=12)

# Use the training set to create another set of training and validation set
X1_train, X1_val, y1_train, y1_val = train_test_split(data_revised_1_X_train, data_revised_1_y_train,
                                                      test_size = .1,
                                                      random_state=12)

# Use SMOTE to over-sample this training set
sm = SMOTE(random_state=12, ratio = 1.0)
X1_train_res, y1_train_res = sm.fit_sample(X1_train, y1_train)

export to csv

In [6]:
data_revised_1_converted.to_csv("all.csv")
X1_train.to_csv("xBefore.csv")
y1_train.to_csv("yBefroe.csv")
pd.DataFrame(data=X1_train_res).to_csv("xAfter.csv")
pd.DataFrame(data=y1_train_res).to_csv("yAfter.csv")

#### Building KNN Classifier using GridSearchCV

In [7]:
# Importing relevant libraries
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics


# Set a range for hyper-parameters
k_range = list(range(1,19))
weight_options = ['uniform', 'distance']

param_grid = dict(n_neighbors = k_range, weights = weight_options)

knn = KNeighborsClassifier()

grid = GridSearchCV(knn, param_grid, cv=10, scoring='accuracy')
grid.fit(X1_train_res,y1_train_res)

#print(grid.grid_scores_)
'''
print(grid.grid_scores_[0].parameters)
print(grid.grid_scores_[0].cv_validation_scores)
print(grid.grid_scores_[0].mean_validation_score)
'''

print(grid.best_score_)
print(grid.best_params_)
print(grid.best_estimator_)

#Train the best estimator on the train set
knn_clf_1 = grid.best_estimator_.fit(X1_train_res,y1_train_res)

#Predict the response of best estimator for val set
knn_clf_1_pred = knn_clf_1.predict(X1_val)

# Calculate the accuracy score
knn_clf_1_score = metrics.accuracy_score(y1_val, knn_clf_1_pred)

# Model Accuracy, how often is the classifier correct?
print("Accuracy of Classifier 1 is: ",np.round(knn_clf_1_score*100, 2))

0.8855828220858896
{'n_neighbors': 1, 'weights': 'uniform'}
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=1, p=2,
           weights='uniform')
Accuracy of Classifier 1 is:  79.12


#### Testing Classifier 1 on Test set

In [8]:
#Predict the response of best estimator for test set
knn_clf_1_pred_test = knn_clf_1.predict(data_revised_1_X_test)

# Calculate the accuracy score
knn_clf_1_score_test = metrics.accuracy_score(data_revised_1_y_test, knn_clf_1_pred_test)

# Model Accuracy, how often is the classifier correct?
print("Accuracy of Classifier 1 is: ",np.round(knn_clf_1_score_test*100, 2))

Accuracy of Classifier 1 is:  79.91


#### Building Gaussian Naive Bayes Classifier

In [9]:
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
from sklearn import preprocessing
%matplotlib inline

There isn't a hyper-parameter for Gaussian Naive Bayes to tune, so I didn't use grid search

In [10]:
gnb1 = GaussianNB()
#Train the model
x_normalize = X1_train_res.values.astype(float)

# Create a minimum and maximum processor object
min_max_scaler = preprocessing.MinMaxScaler()

# Create an object to transform the data to fit minmax processor
x_scaled = min_max_scaler.fit_transform(x_normalize)

# Run the normalizer on the dataframe
X1_normalized = pd.DataFrame(x_scaled)

gnb1.fit(X1_normalized,y1_train_res)

#Predict the response for val set
gnb1_pred = gnb1.predict(X1_val)

# Calculate the accuracy score
gnb1_score = metrics.accuracy_score(y1_val, gnb1_pred)

# Model Accuracy, how often is the classifier correct?
print("Accuracy of Gaussian Naive Bayes is: ",np.round(gnb1_score*100, 2))

Accuracy of Gaussian Naive Bayes is:  77.15


In [None]:
model_fit(gnb1, print_FeatureImportance=False)

#### Test Gaussian Naive Bayes on Test set

In [11]:
#Predict the response for test set
gnb1_pred_test = gnb1.predict(data_revised_1_X_test)

# Calculate the accuracy score
gnb1_score_test = metrics.accuracy_score(data_revised_1_y_test, gnb1_pred_test)

# Model Accuracy, how often is the classifier correct?
print("Accuracy of Gaussian Naive Bayes is: ",np.round(gnb1_score_test*100, 2))

Accuracy of Gaussian Naive Bayes is:  80.79


In [None]:
model_fit(gnb1, print_FeatureImportance=False)

### Method 2 - ROSE (Random Over-Sampling Examples)

ROSE uses smoothed bootstrapping to draw artificial samples from the feature space neighbourhood around the minority class.

In [12]:
# Import ROS library
from imblearn.over_sampling import RandomOverSampler

# Use Random Over-Sampling to over-sample this training set
sm = RandomOverSampler(random_state=12)
X1_train_res, y1_train_res = sm.fit_sample(X1_train, y1_train)

#### Building KNN Classifier using GridSearchCV

In [13]:
# Set a range for hyper-parameters
k_range = list(range(1,19))
weight_options = ['uniform', 'distance']

param_grid = dict(n_neighbors = k_range, weights = weight_options)

knn = KNeighborsClassifier()

grid = GridSearchCV(knn, param_grid, cv=10, scoring='accuracy')
grid.fit(X1_train_res,y1_train_res)

#print(grid.grid_scores_)
'''
print(grid.grid_scores_[0].parameters)
print(grid.grid_scores_[0].cv_validation_scores)
print(grid.grid_scores_[0].mean_validation_score)
'''

print(grid.best_score_)
print(grid.best_params_)
print(grid.best_estimator_)

#Train the best estimator on the train set
knn_clf_2 = grid.best_estimator_.fit(X1_train_res,y1_train_res)

#Predict the response of best estimator for val set
knn_clf_2_pred = knn_clf_2.predict(X1_val)

# Calculate the accuracy score
knn_clf_2_score = metrics.accuracy_score(y1_val, knn_clf_2_pred)

# Model Accuracy, how often is the classifier correct?
print("Accuracy of Classifier 2 is: ",np.round(knn_clf_2_score*100, 2))

0.9544478527607362
{'n_neighbors': 1, 'weights': 'uniform'}
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=1, p=2,
           weights='uniform')
Accuracy of Classifier 2 is:  84.03


#### Testing Classifier 2 on Test set

In [14]:
#Predict the response of best estimator for test set
knn_clf_2_pred_test = knn_clf_2.predict(data_revised_1_X_test)

# Calculate the accuracy score
knn_clf_2_score_test = metrics.accuracy_score(data_revised_1_y_test, knn_clf_2_pred_test)

# Model Accuracy, how often is the classifier correct?
print("Accuracy of Classifier 2 is: ",np.round(knn_clf_2_score_test*100, 2))

Accuracy of Classifier 2 is:  84.11


**After evaluating the two different methods of handling imbalanced classes SMOTE and ROSE, what we could see is ROSE (Random Over-Sampling Examples) shows better prediction accuracy for KNN Classification Algorithm on this dataset.**

#### Building Gaussian Naive Bayes Classifier

In [15]:
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics

There isn't a hyper-parameter for Gaussian Naive Bayes to tune, so I didn't use grid search

In [18]:
gnb2 = GaussianNB()
#Train the model
x_normalize = X1_train_res.values.astype(float)

# Create a minimum and maximum processor object
min_max_scaler = preprocessing.MinMaxScaler()

# Create an object to transform the data to fit minmax processor
x_scaled = min_max_scaler.fit_transform(x_normalize)

# Run the normalizer on the dataframe
X1_normalized = pd.DataFrame(x_scaled)

gnb2.fit(X1_normalized,y1_train_res)


#Predict the response for val set
gnb2_pred = gnb2.predict(X1_val)

# Calculate the accuracy score
gnb2_score = metrics.accuracy_score(y1_val, gnb2_pred)

# Model Accuracy, how often is the classifier correct?
print("Accuracy of Gaussian Naive Bayes2 is: ",np.round(gnb2_score*100, 2))

Accuracy of Gaussian Naive Bayes2 is:  78.87


In [None]:
model_fit(gnb2, print_FeatureImportance=False)

#### Test Gaussian Naive Bayes on Test set

In [19]:
#Predict the response for test set
gnb2_pred_test = gnb2.predict(data_revised_1_X_test)

# Calculate the accuracy score
gnb2_score_test = metrics.accuracy_score(data_revised_1_y_test, gnb2_pred_test)

# Model Accuracy, how often is the classifier correct?
print("Accuracy of Gaussian Naive Bayes2 is: ",np.round(gnb2_score_test*100, 2))

Accuracy of Gaussian Naive Bayes2 is:  79.69


In [None]:
model_fit(gnb2, print_FeatureImportance=False)