###  Subash Chandra Biswal (U77884251) ###

## Set Up  ##

In [1]:
# import numpy and pandas libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier 
from sklearn.svm import SVC
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
# set random seed to ensure that results are repeatable
np.random.seed(1)

## Data Load ##

In [2]:
# load data
ubank = pd.read_csv("./data/UniversalBank.csv")

ubank.head(3)

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,1,25,1,49,91107,4,1.6,1,0,0,1,0,0,0
1,2,45,19,34,90089,3,1.5,1,0,0,1,0,0,0
2,3,39,15,11,94720,1,1.0,1,0,0,0,0,0,0


## Initial Analysis of Data ##

The dataset has all the numeric variables with Education variable with multi categories. The Experience variable has 52 rows with negative values. Since negative experience does not make sense, we can drop these 52 (1% of the dataset) records from the dataset.

In [3]:
ubank.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ID                  5000 non-null   int64  
 1   Age                 5000 non-null   int64  
 2   Experience          5000 non-null   int64  
 3   Income              5000 non-null   int64  
 4   ZIP Code            5000 non-null   int64  
 5   Family              5000 non-null   int64  
 6   CCAvg               5000 non-null   float64
 7   Education           5000 non-null   int64  
 8   Mortgage            5000 non-null   int64  
 9   Personal Loan       5000 non-null   int64  
 10  Securities Account  5000 non-null   int64  
 11  CD Account          5000 non-null   int64  
 12  Online              5000 non-null   int64  
 13  CreditCard          5000 non-null   int64  
dtypes: float64(1), int64(13)
memory usage: 547.0 KB


In [4]:
# generate a statistical summary of the numeric value in the data
ubank.describe()

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,2500.5,45.3384,20.1046,73.7742,93152.503,2.3964,1.937938,1.881,56.4988,0.096,0.1044,0.0604,0.5968,0.294
std,1443.520003,11.463166,11.467954,46.033729,2121.852197,1.147663,1.747659,0.839869,101.713802,0.294621,0.305809,0.23825,0.490589,0.455637
min,1.0,23.0,-3.0,8.0,9307.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1250.75,35.0,10.0,39.0,91911.0,1.0,0.7,1.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2500.5,45.0,20.0,64.0,93437.0,2.0,1.5,2.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,3750.25,55.0,30.0,98.0,94608.0,3.0,2.5,3.0,101.0,0.0,0.0,0.0,1.0,1.0
max,5000.0,67.0,43.0,224.0,96651.0,4.0,10.0,3.0,635.0,1.0,1.0,1.0,1.0,1.0


In [5]:
# Removing invalid rows (Negative values)
ubank=ubank[ubank.Experience>=0]

In [6]:
ubank.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4948 entries, 0 to 4999
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ID                  4948 non-null   int64  
 1   Age                 4948 non-null   int64  
 2   Experience          4948 non-null   int64  
 3   Income              4948 non-null   int64  
 4   ZIP Code            4948 non-null   int64  
 5   Family              4948 non-null   int64  
 6   CCAvg               4948 non-null   float64
 7   Education           4948 non-null   int64  
 8   Mortgage            4948 non-null   int64  
 9   Personal Loan       4948 non-null   int64  
 10  Securities Account  4948 non-null   int64  
 11  CD Account          4948 non-null   int64  
 12  Online              4948 non-null   int64  
 13  CreditCard          4948 non-null   int64  
dtypes: float64(1), int64(13)
memory usage: 579.8 KB


In [7]:
ubank.describe()

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
count,4948.0,4948.0,4948.0,4948.0,4948.0,4948.0,4948.0,4948.0,4948.0,4948.0,4948.0,4948.0,4948.0,4948.0
mean,2501.268795,45.557195,20.331043,73.81447,93151.573363,2.391471,1.935926,1.878941,56.634398,0.097009,0.104285,0.061035,0.597009,0.294058
std,1443.277676,11.320735,11.311973,46.112596,2126.669017,1.148444,1.747694,0.839745,101.828885,0.296,0.30566,0.239418,0.490549,0.455664
min,1.0,24.0,0.0,8.0,9307.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1254.75,36.0,10.75,39.0,91911.0,1.0,0.7,1.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2497.5,46.0,20.0,64.0,93437.0,2.0,1.5,2.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,3750.25,55.0,30.0,98.0,94608.0,3.0,2.6,3.0,101.0,0.0,0.0,0.0,1.0,1.0
max,5000.0,67.0,43.0,224.0,96651.0,4.0,10.0,3.0,635.0,1.0,1.0,1.0,1.0,1.0


In [6]:
ubank.head(10)

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,1,25,1,49,91107,4,1.6,1,0,0,1,0,0,0
1,2,45,19,34,90089,3,1.5,1,0,0,1,0,0,0
2,3,39,15,11,94720,1,1.0,1,0,0,0,0,0,0
3,4,35,9,100,94112,1,2.7,2,0,0,0,0,0,0
4,5,35,8,45,91330,4,1.0,2,0,0,0,0,0,1
5,6,37,13,29,92121,4,0.4,2,155,0,0,0,1,0
6,7,53,27,72,91711,2,1.5,2,0,0,0,0,1,0
7,8,50,24,22,93943,1,0.3,3,0,0,0,0,0,1
8,9,35,10,81,90089,3,0.6,2,104,0,0,0,1,0
9,10,34,9,180,93023,1,8.9,3,0,1,0,0,0,0


In [7]:
# Check the missing values by summing the total na's for each variable
ubank.isna().sum()

ID                    0
Age                   0
Experience            0
Income                0
ZIP Code              0
Family                0
CCAvg                 0
Education             0
Mortgage              0
Personal Loan         0
Securities Account    0
CD Account            0
Online                0
CreditCard            0
dtype: int64

## Drop Unnecessary Variables and fill missing categorical values ##

The variables ID and ZIP Code do not contribute to the problem and so can be dropped from the dataset.

In [8]:
# Our target is price; but there are three related price variableds - price, price_gte_150, 
# and price_category. We need to drop price_gte_150, and price_category
ubank.drop(['ID', 'ZIP Code'], axis=1, inplace = True) # keep price_gte_150 for our target

## Encode the categorical variables ##

The Education variable has 3 categories of data. 1: Undergrad  2: Graduate  3: Advanced/Professional. We can use get_dummies encoder to create features. The categories are not equally distanced from each other. So we have to encode this. We also have family variable with 4 categories(1,2,3,4). The values are integer and ordinal. Since the values are already in integer form, we dont have to encode this variable. 

In [9]:
ubank = ubank.join(pd.get_dummies(ubank['Education'], prefix='Education', drop_first=True))
ubank.drop('Education', axis=1, inplace = True)

In [10]:
# explore the dataframe columns to verify encoding and dropped columns
ubank.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4948 entries, 0 to 4999
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Age                 4948 non-null   int64  
 1   Experience          4948 non-null   int64  
 2   Income              4948 non-null   int64  
 3   Family              4948 non-null   int64  
 4   CCAvg               4948 non-null   float64
 5   Mortgage            4948 non-null   int64  
 6   Personal Loan       4948 non-null   int64  
 7   Securities Account  4948 non-null   int64  
 8   CD Account          4948 non-null   int64  
 9   Online              4948 non-null   int64  
 10  CreditCard          4948 non-null   int64  
 11  Education_2         4948 non-null   uint8  
 12  Education_3         4948 non-null   uint8  
dtypes: float64(1), int64(10), uint8(2)
memory usage: 602.6 KB


## Train Test Split ##

Since we have a lot of data, we have decided to split the data set to 70% train data and 30% test data. The target variable here is "Securities Account", which has either 0 or 1 as the values. So we dont have to transform the variable further.

In [11]:
# split the data into validation and training set
train_df, test_df = train_test_split(ubank, test_size=0.30)

# to reduce repetition in later code, create variables to represent the columns
# that are our predictors and target
target = 'Securities Account'
predictors = list(ubank.columns)
predictors.remove(target)

## Impute missing values of numerical variables ##

In [12]:
train_X = train_df[predictors]
train_y = train_df[target] # train_target is now a series object
test_X = test_df[predictors]
test_y = test_df[target] # validation_target is now a series object

In [13]:
train_y.head(5)

3395    0
370     0
2337    0
3341    1
3602    0
Name: Securities Account, dtype: int64

## Random search of parameters grid ##

The score measure is precision and we are using 3 classification models here. They are Logistic Regression, Decision Tree, and SVM. Due to huge time consumption by the SVM, the parameters are limited here.

In [13]:
score_measure = "precision"
kfolds = 5

dtree = DecisionTreeClassifier()
svmm = SVC()
logreg = LogisticRegression()


In [14]:
#Grid for Logistic Regression

param_grid_logr = [{
     'penalty': ['l1', 'l2', 'elasticnet', 'none'],
     'solver': ['saga'],
     'max_iter': np.arange(100,900),},
      {
     'penalty': ['l1', 'l2'],
     'solver': ['liblinear'],
     'max_iter': np.arange(100,900),},
    {
     'penalty': ['l2', 'none'],
     'solver': ['lbfgs'],
     'max_iter': np.arange(100,900),}
]    

rand_search_logr = RandomizedSearchCV(estimator = logreg, param_distributions=param_grid_logr, cv=kfolds, n_iter=100,
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

# Logistic Regression model fit for grid search
_ = rand_search_logr.fit(train_X, train_y)

print(f"The best {score_measure} score is {rand_search_logr.best_score_}")
print(f"... with parameters: {rand_search_logr.best_params_}")

bestPrecisionLogr = rand_search_logr.best_estimator_

Fitting 5 folds for each of 100 candidates, totalling 500 fits
The best precision score is 0.6883986928104575
... with parameters: {'solver': 'lbfgs', 'penalty': 'none', 'max_iter': 119}


65 fits failed out of a total of 500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
65 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\scbis\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\scbis\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1471, in fit
    raise ValueError(
ValueError: l1_ratio must be between 0 and 1; got (l1_ratio=None)

 0.6339021  0.6339021  0.63528139 0.63130288 0.6339021  0.63910411
 0.         0.62354312 0.         0.                nan 0.
 0.         0.         0.         0.62909091 0.                nan
 0.61607378 0.         0.640

In [15]:
# Grid for decision tree
param_grid_tree = {
    'min_samples_split': np.arange(1,100),  
    'min_samples_leaf': np.arange(1,100),
    'min_impurity_decrease': np.arange(0.0001, 0.01, 0.0005),
    'max_leaf_nodes': np.arange(5, 50), 
    'max_depth': np.arange(1,50), 
    'criterion': ['entropy', 'gini'],
}

rand_search_tree = RandomizedSearchCV(estimator = dtree, param_distributions=param_grid_tree, cv=kfolds, n_iter=100,
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

# Decision tree model fit for grid search
_ = rand_search_tree.fit(train_X, train_y)

print(f"The best {score_measure} score is {rand_search_tree.best_score_}")
print(f"... with parameters: {rand_search_tree.best_params_}")

bestPrecisionTree = rand_search_tree.best_estimator_

Fitting 5 folds for each of 100 candidates, totalling 500 fits
The best precision score is 0.7276736596736597
... with parameters: {'min_samples_split': 66, 'min_samples_leaf': 22, 'min_impurity_decrease': 0.0006000000000000001, 'max_leaf_nodes': 49, 'max_depth': 11, 'criterion': 'gini'}


5 fits failed out of a total of 500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\scbis\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\scbis\anaconda3\lib\site-packages\sklearn\tree\_classes.py", line 937, in fit
    super().fit(
  File "C:\Users\scbis\anaconda3\lib\site-packages\sklearn\tree\_classes.py", line 250, in fit
    raise ValueError(
ValueError: min_samples_split must be an integer greater than 1 or a float in (0.0, 1.0]; got the integer 1

 0.3875     0.3875     0.3875     0.3875     0.58594508 0.3875
 0.60810082 0.72

In [None]:

# Grid for SVM
param_grid_svm = [{
    'degree': [2,3],
    'C': [1,5,10,15],
    'gamma': [1, 0.1, 0.01],
    'kernel': ['poly'],   
},{
    'C': [1,5,10,15],
    'gamma': [1, 0.1, 0.01],
    'kernel': ['rbf'],   
},
{
    'C': [1,5,10,15],
    'kernel': ['linear'],  
}]

rand_search_svm = RandomizedSearchCV(estimator = svmm, param_distributions=param_grid_svm, cv=kfolds, n_iter=50,
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

# SVM model fit for grid search
_ = rand_search_svm.fit(train_X, train_y)

print(f"The best {score_measure} score is {rand_search_svm.best_score_}")
print(f"... with parameters: {rand_search_svm.best_params_}")

bestPrecisionSvm = rand_search_svm.best_estimator_



Fitting 5 folds for each of 40 candidates, totalling 200 fits


## Confusion matrix of Models ##

In [43]:
## Decision Tree
c_matrix = confusion_matrix(test_y, rand_search_tree.predict(test_X))
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
print(f"Accuracy={(TP+TN)/(TP+TN+FP+FN):.7f} Precision={TP/(TP+FP):.7f} Recall={TP/(TP+FN):.7f} F1={2*TP/(2*TP+FP+FN):.7f}")

Accuracy=0.8472352 Precision=0.8333333 Recall=0.8662900 F1=0.8494922


In [44]:
## SVM
c_matrix = confusion_matrix(test_y, rand_search_svm.predict(test_X))
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
print(f"Accuracy={(TP+TN)/(TP+TN+FP+FN):.7f} Precision={TP/(TP+FP):.7f} Recall={TP/(TP+FN):.7f} F1={2*TP/(2*TP+FP+FN):.7f}")

Accuracy=0.8631678 Precision=0.8493648 Recall=0.8813559 F1=0.8650647


In [None]:
## Logistic Regression
c_matrix = confusion_matrix(test_y, rand_search_logr.predict(test_X))
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
print(f"Accuracy={(TP+TN)/(TP+TN+FP+FN):.7f} Precision={TP/(TP+FP):.7f} Recall={TP/(TP+FN):.7f} F1={2*TP/(2*TP+FP+FN):.7f}")

## Final grid search with smaller range ##

In [51]:
score_measure = "precision"
kfolds = 5

# Logistic Regression grid
param_grid_logr = {
     'penalty': ['none'],
     'solver': ['lbfgs'],
     'max_iter': np.arange(116,122),}

# Decision tree grid
param_grid_tree = {
    'min_samples_split': np.arange(63,69),  
    'min_samples_leaf': np.arange(19,25),
    'min_impurity_decrease': np.arange(0.0003, 0.0009, 0.0001),
    'max_leaf_nodes': np.arange(46,52), 
    'max_depth': np.arange(8,14), 
    'criterion': ['gini'],
}

# SVM grid
param_grid_svm = {
    'degree': np.arange(1,3),
    'coef0': np.arange(1,3),
    'C': np.arange(10,16),
    'kernel': ['poly'],   
}

dtree = DecisionTreeClassifier()
svmm = SVC()
logreg = LogisticRegression()

grid_search_logr = RandomizedSearchCV(estimator = logreg, param_distributions=param_grid_logr, cv=kfolds, n_iter=500,
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

grid_search_tree = GridSearchCV(estimator = dtree, param_grid=param_grid_tree, cv=kfolds, 
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

grid_search_svm = GridSearchCV(estimator = svmm, param_grid=param_grid_svm, cv=kfolds, 
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

# Logistic Regression fit
_ = grid_search_logr.fit(train_X, train_y)

print(f"The best {score_measure} score is {grid_search_logr.best_score_}")
print(f"... with parameters: {grid_search_logr.best_params_}")

bestPrecisionLogr = grid_search_logr.best_estimator_

# Decision tree fit
_ = grid_search_tree.fit(train_X, train_y)

print(f"The best {score_measure} score is {grid_search_tree.best_score_}")
print(f"... with parameters: {grid_search_tree.best_params_}")

bestPrecisionTree = grid_search_tree.best_estimator_


# SVM fit
_ = grid_search_svm.fit(train_X, train_y)

print(f"The best {score_measure} score is {grid_search_svm.best_score_}")
print(f"... with parameters: {grid_search_svm.best_params_}")

bestPrecisionSvm = grid_search_svm.best_estimator_

Fitting 5 folds for each of 7776 candidates, totalling 38880 fits
The best precision score is 0.8584728930092839
... with parameters: {'criterion': 'gini', 'max_depth': 22, 'max_leaf_nodes': 15, 'min_impurity_decrease': 0.0021000000000000003, 'min_samples_leaf': 19, 'min_samples_split': 45}
Fitting 5 folds for each of 24 candidates, totalling 120 fits
The best precision score is 0.8559629990754983
... with parameters: {'C': 13, 'coef0': 2, 'degree': 2, 'kernel': 'poly'}


In [52]:
# Confusion matrix of Decision tree
c_matrix = confusion_matrix(test_y, grid_search_tree.predict(test_X))
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
print(f"Accuracy={(TP+TN)/(TP+TN+FP+FN):.7f} Precision={TP/(TP+FP):.7f} Recall={TP/(TP+FN):.7f} F1={2*TP/(2*TP+FP+FN):.7f}")

Accuracy=0.8472352 Precision=0.8333333 Recall=0.8662900 F1=0.8494922


In [53]:
# Confusion matrix of SVM
c_matrix = confusion_matrix(test_y, grid_search_svm.predict(test_X))
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
print(f"Accuracy={(TP+TN)/(TP+TN+FP+FN):.7f} Precision={TP/(TP+FP):.7f} Recall={TP/(TP+FN):.7f} F1={2*TP/(2*TP+FP+FN):.7f}")

Accuracy=0.8631678 Precision=0.8493648 Recall=0.8813559 F1=0.8650647


In [None]:
# Confusion matrix of Logistic Regression
c_matrix = confusion_matrix(test_y, grid_search_logr.predict(test_X))
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
print(f"Accuracy={(TP+TN)/(TP+TN+FP+FN):.7f} Precision={TP/(TP+FP):.7f} Recall={TP/(TP+FN):.7f} F1={2*TP/(2*TP+FP+FN):.7f}")

## Important features of Decision tree model ##

In [54]:
np.round(grid_search_tree.best_estimator_.feature_importances_,2)

array([0.  , 0.  , 0.03, 0.1 , 0.78, 0.  , 0.  , 0.05, 0.01, 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.03, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.01, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ])

## Analysis of models ## 

The SVM model grid search takes a lot of time. I will update the analysis and resubmit when the model run complets.