## Importing the libraries

In [2]:
import pandas as pd
from sklearn.svm import SVC
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression


np.random.seed(1)

## Loading the dataset

In [3]:
df = pd.read_csv('C:/Users/Srinidhi/Documents/USF/Data_Science_Programming/Week4_Assignments/UniversalBank.csv')
df.head(5)

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,1,25,1,49,91107,4,1.6,1,0,0,1,0,0,0
1,2,45,19,34,90089,3,1.5,1,0,0,1,0,0,0
2,3,39,15,11,94720,1,1.0,1,0,0,0,0,0,0
3,4,35,9,100,94112,1,2.7,2,0,0,0,0,0,0
4,5,35,8,45,91330,4,1.0,2,0,0,0,0,0,1


Figuring out the size of the dataset

In [4]:
df.shape

(5000, 14)

Checking to see if there are any null values

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ID                  5000 non-null   int64  
 1   Age                 5000 non-null   int64  
 2   Experience          5000 non-null   int64  
 3   Income              5000 non-null   int64  
 4   ZIP Code            5000 non-null   int64  
 5   Family              5000 non-null   int64  
 6   CCAvg               5000 non-null   float64
 7   Education           5000 non-null   int64  
 8   Mortgage            5000 non-null   int64  
 9   Personal Loan       5000 non-null   int64  
 10  Securities Account  5000 non-null   int64  
 11  CD Account          5000 non-null   int64  
 12  Online              5000 non-null   int64  
 13  CreditCard          5000 non-null   int64  
dtypes: float64(1), int64(13)
memory usage: 547.0 KB


In [6]:
df.isnull().sum()

ID                    0
Age                   0
Experience            0
Income                0
ZIP Code              0
Family                0
CCAvg                 0
Education             0
Mortgage              0
Personal Loan         0
Securities Account    0
CD Account            0
Online                0
CreditCard            0
dtype: int64

In [6]:
df.corr()['CD Account'].sort_values(ascending=False)

CD Account            1.000000
Securities Account    0.317034
Personal Loan         0.316355
CreditCard            0.278644
Online                0.175880
Income                0.169738
CCAvg                 0.136534
Mortgage              0.089311
ZIP Code              0.019972
Family                0.014110
Education             0.013934
Experience            0.010353
Age                   0.008043
ID                   -0.006909
Name: CD Account, dtype: float64

As per the above output, it can be seen that the dataset has no null values or missing values. Hence the dataset is almost pretty clean. Next step is to drop the columns that are not carrying any weight to determine the target value.

## Cleaning the dataset

In [7]:
#From the dataset, it can be seen that ID column and ZIP Code column doesnt have any information that is useful for our analysis and is the lease correlated. Hence we can drop the columns.

df.drop(columns=["ID"])
df.drop(columns=["ZIP Code"])

Unnamed: 0,ID,Age,Experience,Income,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,1,25,1,49,4,1.6,1,0,0,1,0,0,0
1,2,45,19,34,3,1.5,1,0,0,1,0,0,0
2,3,39,15,11,1,1.0,1,0,0,0,0,0,0
3,4,35,9,100,1,2.7,2,0,0,0,0,0,0
4,5,35,8,45,4,1.0,2,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,4996,29,3,40,1,1.9,3,0,0,0,0,1,0
4996,4997,30,4,15,4,0.4,1,85,0,0,0,1,0
4997,4998,63,39,24,2,0.3,3,0,0,0,0,0,0
4998,4999,65,40,49,3,0.5,2,0,0,0,0,1,0


Above output shows that the dataset is pretty much clean with the required columns. Now we have a categorical ordinal variable, namely: Education. This requires some preprocessing using one-hot encoding.

In [8]:
# One hot encoding:

EncodedData = pd.get_dummies(df, columns = ['Education'])
EncodedData

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard,Education_1,Education_2,Education_3
0,1,25,1,49,91107,4,1.6,0,0,1,0,0,0,1,0,0
1,2,45,19,34,90089,3,1.5,0,0,1,0,0,0,1,0,0
2,3,39,15,11,94720,1,1.0,0,0,0,0,0,0,1,0,0
3,4,35,9,100,94112,1,2.7,0,0,0,0,0,0,0,1,0
4,5,35,8,45,91330,4,1.0,0,0,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,4996,29,3,40,92697,1,1.9,0,0,0,0,1,0,0,0,1
4996,4997,30,4,15,92037,4,0.4,85,0,0,0,1,0,1,0,0
4997,4998,63,39,24,93023,2,0.3,0,0,0,0,0,0,0,0,1
4998,4999,65,40,49,90034,3,0.5,0,0,0,0,1,0,0,1,0


## Scaling the data set

In [9]:
scaler = MinMaxScaler()
model=scaler.fit(EncodedData)
scaled_data=model.transform(EncodedData)

scaled_data = pd.DataFrame(scaled_data, index=EncodedData.index, columns=EncodedData.columns)

## Splitting the dataset into test train sets

In [10]:
X = scaled_data.drop('CD Account',axis=1)
y = scaled_data['CD Account']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 14)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((3500, 15), (1500, 15), (3500,), (1500,))

## Modelling the Data

## Initial random search based on the parameters set for the Random search for the logistic regression

In [1]:
score_measure = "precision"
kfolds = 10

param_grid = [
    { 'penalty': ['l1', 'l2'], 'C': [0.01, 0.1, 1, 10, 100]}
]

Lr = LogisticRegression()
rand_search = RandomizedSearchCV(estimator = Lr, param_distributions=param_grid, cv=kfolds, n_iter=500,
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = rand_search.fit(X_train, y_train)

print(f"The best {score_measure} score is {rand_search.best_score_}")
print(f"... with parameters: {rand_search.best_params_}")

bestRecallTree = rand_search.best_estimator_


NameError: name 'LogisticRegression' is not defined

## Grid search for the logistic regression

In [12]:
score_measure = "recall"
kfolds = 10

param_grid = [
    { 'penalty': ['l1', 'l2'], 'C': [0.01, 0.1, 1, 10, 100]}
]

Lr_grid = LogisticRegression()
grid_search = GridSearchCV(estimator = Lr_grid, param_grid=param_grid, cv=kfolds, 
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = grid_search.fit(X_train, y_train)

print(f"The best {score_measure} score is {grid_search.best_score_}")
print(f"... with parameters: {grid_search.best_params_}")

bestRecallTree = grid_search.best_estimator_


Fitting 10 folds for each of 10 candidates, totalling 100 fits
The best recall score is 0.6688095238095239
... with parameters: {'C': 1, 'penalty': 'l2'}


50 fits failed out of a total of 100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
50 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Srinidhi\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Srinidhi\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\Srinidhi\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 447, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

        nan 0.6688

### Fit a SVM classification model using Random Search for Linear kernel

In [13]:
score_measure = "recall"
kfolds = 10

param_grid = {
     'C': [0.0001, 0.001, 0.1, 1, 10, 100, 1000], 
    'kernel': ['linear']
}

rand_linear_SVC = SVC()
rand_search = RandomizedSearchCV(estimator = rand_linear_SVC, param_distributions=param_grid, cv=kfolds, n_iter=500,
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = rand_search.fit(X_train, y_train)

print(f"The best {score_measure} score is {rand_search.best_score_}")
print(f"... with parameters: {rand_search.best_params_}")

bestPrecisionTree = rand_search.best_estimator_



Fitting 10 folds for each of 7 candidates, totalling 70 fits
The best recall score is 0.6688095238095239
... with parameters: {'kernel': 'linear', 'C': 0.1}


### Fit a SVM classification model using Grid Search for Linear kernel

In [14]:
score_measure = "recall"
kfolds = 10

param_grid = {
     'C': [0.0001, 0.001, 0.1, 1, 10, 100, 1000], 
    'kernel': ['linear']
}

Grid_Linear_SVC = SVC()
grid_search = GridSearchCV(estimator = Grid_Linear_SVC, param_grid=param_grid, cv=kfolds, 
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = grid_search.fit(X_train, y_train)

print(f"The best {score_measure} score is {grid_search.best_score_}")
print(f"... with parameters: {grid_search.best_params_}")

bestPrecisionTree = grid_search.best_estimator_

Fitting 10 folds for each of 7 candidates, totalling 70 fits
The best recall score is 0.6688095238095239
... with parameters: {'C': 0.1, 'kernel': 'linear'}


### Fit a SVM classification model using Random Search for Poly kernel

In [15]:
score_measure = "recall"
kfolds = 10

param_grid = {
     'C': [0.1,1, 10, 100], 
    'gamma': [1,0.1,0.01,0.001],
    'kernel': ['poly']
}

Random_Poly_SVC = SVC()
rand_search = RandomizedSearchCV(estimator = Random_Poly_SVC, param_distributions=param_grid, cv=kfolds, n_iter=500,
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = rand_search.fit(X_train, y_train)

print(f"The best {score_measure} score is {rand_search.best_score_}")
print(f"... with parameters: {rand_search.best_params_}")

bestPrecisionTree = rand_search.best_estimator_

Fitting 10 folds for each of 16 candidates, totalling 160 fits




The best recall score is 0.6928571428571428
... with parameters: {'kernel': 'poly', 'gamma': 1, 'C': 100}


### Fit a SVM classification model using Grid Search for Poly kernel

In [16]:
score_measure = "recall"
kfolds = 10

param_grid = {
     'C': [0.1,1, 10, 100], 
    'gamma': [1,0.1,0.01,0.001],
    'kernel': ['poly']
}


Grid_Poly_SVC = SVC()
grid_search = GridSearchCV(estimator = Grid_Poly_SVC, param_grid=param_grid, cv=kfolds, 
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = grid_search.fit(X_train, y_train)

print(f"The best {score_measure} score is {grid_search.best_score_}")
print(f"... with parameters: {grid_search.best_params_}")

bestPrecisionTree = grid_search.best_estimator_

Fitting 10 folds for each of 16 candidates, totalling 160 fits
The best recall score is 0.6928571428571428
... with parameters: {'C': 100, 'gamma': 1, 'kernel': 'poly'}


### Fit a SVM classification model using Random Search for RBF kernel

In [17]:
score_measure = "recall"
kfolds = 10

param_grid = {
     'C': [0.1,1, 10], 
    'gamma': [1,0.1,0.011],
    'kernel': ['rbf']
}

Random_rbf_SVC = SVC()
rand_search = RandomizedSearchCV(estimator = Random_rbf_SVC, param_distributions=param_grid, cv=kfolds, n_iter=500,
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = rand_search.fit(X_train, y_train)

print(f"The best {score_measure} score is {rand_search.best_score_}")
print(f"... with parameters: {rand_search.best_params_}")

bestPrecisionTree = rand_search.best_estimator_

Fitting 10 folds for each of 9 candidates, totalling 90 fits




The best recall score is 0.6688095238095239
... with parameters: {'kernel': 'rbf', 'gamma': 0.1, 'C': 1}


### Fit a SVM classification model using Grid Search for rbf kernel

In [18]:
score_measure = "recall"
kfolds = 10

param_grid = {
     'C': [0.1,1, 10], 
    'gamma': [1,0.1,0.011],
    'kernel': ['rbf']
}

Grid_rbf_SVC = SVC()
grid_search = GridSearchCV(estimator = Grid_rbf_SVC, param_grid=param_grid, cv=kfolds, 
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = grid_search.fit(X_train, y_train)

print(f"The best {score_measure} score is {grid_search.best_score_}")
print(f"... with parameters: {grid_search.best_params_}")

bestPrecisionTree = grid_search.best_estimator_

Fitting 10 folds for each of 9 candidates, totalling 90 fits
The best recall score is 0.6688095238095239
... with parameters: {'C': 1, 'gamma': 0.1, 'kernel': 'rbf'}


## Initial random search based on the parameters set for the Random search for the Decision Tree Classifier

In [7]:
score_measure = "accuracy"
kfolds = 5

param_grid = {
    'min_samples_split': np.arange(1,100),  
    'min_samples_leaf': np.arange(1,100),
    'min_impurity_decrease': np.arange(0.0001, 0.01, 0.0005),
    'max_leaf_nodes': np.arange(5, 100), 
    'max_depth': np.arange(1,50), 
    'criterion': ['entropy', 'gini'],
}

dtree = DecisionTreeClassifier()
rand_search = RandomizedSearchCV(estimator = dtree, param_distributions=param_grid, cv=kfolds, n_iter=500,
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = rand_search.fit(X_train, y_train)

print(f"The best {score_measure} score is {rand_search.best_score_}")
print(f"... with parameters: {rand_search.best_params_}")

bestRecallTree = rand_search.best_estimator_

NameError: name 'X_train' is not defined

## Initial random search based on the parameters set for the Grid search for the Decision Tree Classifier

In [22]:
score_measure = "recall"
kfolds = 10

param_grid = {
    'min_samples_split': np.arange(95,99),  
    'min_samples_leaf': np.arange(15,19),
    'min_impurity_decrease': np.arange(0.0019, 0.0025, 0.0001),
    'max_leaf_nodes': np.arange(70,75), 
    'max_depth': np.arange(29,33), 
    'criterion': ['entropy'],
}

dtree_grid = DecisionTreeClassifier()
grid_search = GridSearchCV(estimator = dtree_grid, param_grid=param_grid, cv=kfolds, 
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = grid_search.fit(X_train, y_train)

print(f"The best {score_measure} score is {grid_search.best_score_}")
print(f"... with parameters: {grid_search.best_params_}")

bestRecallTree = grid_search.best_estimator_

Fitting 10 folds for each of 1920 candidates, totalling 19200 fits
The best recall score is 0.6930952380952381
... with parameters: {'criterion': 'entropy', 'max_depth': 29, 'max_leaf_nodes': 70, 'min_impurity_decrease': 0.0019, 'min_samples_leaf': 15, 'min_samples_split': 96}


As we see above here are the recall score results to determine the best predictive model:
    
    1. Logistic Regressino for the Random search and Grid search recall value is: 66.8%
    2. SVM Linear kernel has a Random search and the Grid search recall value of: 66.8%
    3. SVM Poly kernel has a Random Search and the Grid Search with recall value of: 69.2%
    4. SVM RBF kernel has a Random Search and the Grid Search with recall value of: 66.8%
    5. Finally the decision tree random search and grid search has a recall value of 69.3%
    
It is noticed that the Random search and the grid search are predicting the similar recall output.

Based on the above values, the decision tree is the best predictive performance model when only considering the recall value. The second best performing model is the SVM poly kernel with a recall value of 69.2%. However the average recall score for each model is considered to be low when we are predicting the marketing campaign to sell the Universal Bank's new CD account offer. But based on the recall value, it can be seen that the Decision tree classifier is the best working model out of the rest.