In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import Imputer, StandardScaler
from sklearn.metrics import accuracy_score, recall_score, precision_score
import sklearn.svm as svm
from sklearn.model_selection import GridSearchCV



In [3]:
#os.chdir("")

In [4]:
bankdata = pd.read_csv('UniversalBank.csv',na_values=['?',','])
print(bankdata.shape)
print(type(bankdata))

(5000, 14)
<class 'pandas.core.frame.DataFrame'>


In [5]:
print(bankdata.columns)
print(bankdata.dtypes)

Index(['ID', 'Age', 'Experience', 'Income', 'ZIP Code', 'Family', 'CCAvg',
       'Education', 'Mortgage', 'Personal Loan', 'Securities Account',
       'CD Account', 'Online', 'CreditCard'],
      dtype='object')
ID                      int64
Age                     int64
Experience              int64
Income                  int64
ZIP Code                int64
Family                  int64
CCAvg                 float64
Education               int64
Mortgage                int64
Personal Loan           int64
Securities Account      int64
CD Account              int64
Online                  int64
CreditCard              int64
dtype: object


In [6]:
bankdata.describe()

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,2500.5,45.3384,20.1046,73.7742,93152.503,2.3964,1.937938,1.881,56.4988,0.096,0.1044,0.0604,0.5968,0.294
std,1443.520003,11.463166,11.467954,46.033729,2121.852197,1.147663,1.747659,0.839869,101.713802,0.294621,0.305809,0.23825,0.490589,0.455637
min,1.0,23.0,-3.0,8.0,9307.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1250.75,35.0,10.0,39.0,91911.0,1.0,0.7,1.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2500.5,45.0,20.0,64.0,93437.0,2.0,1.5,2.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,3750.25,55.0,30.0,98.0,94608.0,3.0,2.5,3.0,101.0,0.0,0.0,0.0,1.0,1.0
max,5000.0,67.0,43.0,224.0,96651.0,4.0,10.0,3.0,635.0,1.0,1.0,1.0,1.0,1.0


#### Perform same EDA and data pre-processing as we did for dataset in random forest implementation (in R)

In [7]:
bankdata.shape[0] == len(bankdata['ID'].unique())

True

In [8]:
len(bankdata['ZIP Code'].unique())

467

In [9]:
#Check for class imbalance
bankdata['Personal Loan'].value_counts()/bankdata.shape[0]

0    0.904
1    0.096
Name: Personal Loan, dtype: float64

In [10]:
#Check for NAs
bankdata.isnull().sum()

ID                    0
Age                   0
Experience            0
Income                0
ZIP Code              0
Family                0
CCAvg                 0
Education             0
Mortgage              0
Personal Loan         0
Securities Account    0
CD Account            0
Online                0
CreditCard            0
dtype: int64

In [11]:
#Drop ID and ZIP Code
bank = bankdata.drop(['ID','ZIP Code'],axis=1)

In [12]:
#List of categorical attributes
cat_vars = ['Family','Education','Personal Loan','Securities Account','CD Account', 'Online', 'CreditCard']

In [13]:
#Type conversion
bank[cat_vars] = bank[cat_vars].astype('category')

In [14]:
#Create dummy variables for categorical attributes
bank = pd.get_dummies(bank[bank[cat_vars].columns.difference(['Personal Loan'])])

In [15]:
bank = pd.concat([bankdata[['Age','Experience','Income','CCAvg','Mortgage']],bank],axis=1)
bank['Personal Loan'] = bankdata['Personal Loan']
bank.head()

Unnamed: 0,Age,Experience,Income,CCAvg,Mortgage,CD Account_0,CD Account_1,CreditCard_0,CreditCard_1,Education_1,...,Education_3,Family_1,Family_2,Family_3,Family_4,Online_0,Online_1,Securities Account_0,Securities Account_1,Personal Loan
0,25,1,49,1.6,0,1,0,1,0,1,...,0,0,0,0,1,1,0,0,1,0
1,45,19,34,1.5,0,1,0,1,0,1,...,0,0,0,1,0,1,0,0,1,0
2,39,15,11,1.0,0,1,0,1,0,1,...,0,1,0,0,0,1,0,1,0,0
3,35,9,100,2.7,0,1,0,1,0,0,...,0,1,0,0,0,1,0,1,0,0
4,35,8,45,1.0,0,1,0,0,1,0,...,0,0,0,0,1,1,0,1,0,0


In [16]:
#Correlation plot
bank.loc[bank['Experience']>0,['Age','Experience','Income','CCAvg','Mortgage']].corr() \
                                                                               .style.background_gradient().set_precision(2)



Unnamed: 0,Age,Experience,Income,CCAvg,Mortgage
Age,1.0,0.99,-0.062,-0.052,-0.019
Experience,0.99,1.0,-0.053,-0.05,-0.018
Income,-0.062,-0.053,1.0,0.64,0.21
CCAvg,-0.052,-0.05,0.64,1.0,0.11
Mortgage,-0.019,-0.018,0.21,0.11,1.0


In [17]:
#Replace negative values with NAs and use imputation technique
bank.loc[bank['Experience']<0,['Experience']] = np.nan

In [18]:
#Divide data into train and validation
y = bank['Personal Loan']
X = bank.drop('Personal Loan', axis=1)
#from sklearn.model_selection import train_test_split
np.random.seed(2000)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.30)  

In [19]:
print(X_train.shape)
print(X_val.shape)
print(y_train.shape)
print(y_val.shape)

(3500, 20)
(1500, 20)
(3500,)
(1500,)


In [20]:
#Imputation (using median values) - you can explore other imputation methods as well
imputer = Imputer(strategy='median')
imputer.fit(X_train.loc[:,['Experience']])
X_train.loc[:,['Experience']] = imputer.transform(X_train.loc[:,['Experience']])
X_val.loc[:,['Experience']] = imputer.transform(X_val.loc[:,['Experience']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [21]:
#Standardization
scaler = StandardScaler()
scaler.fit(X_train.loc[:,['Age','Experience','Income','CCAvg','Mortgage']])
X_train.loc[:,['Age','Experience','Income','CCAvg','Mortgage']] = scaler.transform( \
                                                                  X_train.loc[:,['Age','Experience','Income','CCAvg','Mortgage']])
X_val.loc[:,['Age','Experience','Income','CCAvg','Mortgage']] = scaler.transform( \
                                                                  X_val.loc[:,['Age','Experience','Income','CCAvg','Mortgage']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [22]:
X_train.head()
X_val.head()

Unnamed: 0,Age,Experience,Income,CCAvg,Mortgage,CD Account_0,CD Account_1,CreditCard_0,CreditCard_1,Education_1,Education_2,Education_3,Family_1,Family_2,Family_3,Family_4,Online_0,Online_1,Securities Account_0,Securities Account_1
400,-0.805761,-0.911069,2.308002,2.691834,-0.56222,1,0,1,0,1,0,0,0,0,1,0,0,1,1,0
1400,-1.153051,-1.088424,0.100878,-1.048681,-0.56222,1,0,1,0,0,1,0,0,0,0,1,0,1,1,0
2031,1.277976,1.30586,0.144584,-0.818495,-0.56222,1,0,1,0,1,0,0,0,0,1,0,0,1,1,0
2378,-1.326695,-1.354455,-0.270618,-0.645856,1.945475,0,1,1,0,0,1,0,1,0,0,0,0,1,0,1
1068,-0.979406,-0.999747,0.690901,-0.415671,-0.56222,1,0,1,0,0,0,1,0,0,1,0,1,0,0,1


In [23]:
#Convert y variables to category
y_train = y_train.astype('category')
y_val = y_val.astype('category')

In [24]:
#Define metrics function
def metrics_calculation(preds,trues):
    print('Accuray is: {}'.format(accuracy_score(y_pred=preds,y_true=trues)))
    print('Recall is: {}'.format(recall_score(y_pred=preds,y_true=trues)))
    print('Precision is: {}\n'.format(precision_score(y_pred=preds,y_true=trues)))

#### Build SVM Classifier - with default parameters and linear kernel

In [25]:
svm_obj = svm.SVC(kernel = 'linear')
svm_obj.fit(X_train,y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [26]:
train_preds = svm_obj.predict(X_train)
val_preds = svm_obj.predict(X_val)

In [27]:
metrics_calculation(train_preds,y_train)
metrics_calculation(val_preds,y_val)
#Though there doesn't seem to be over-fitting, the recall values are pretty low. This is because of class imbalance.

Accuray is: 0.9631428571428572
Recall is: 0.6686746987951807
Precision is: 0.921161825726141

Accuray is: 0.962
Recall is: 0.668918918918919
Precision is: 0.9252336448598131



#### Build SVM Classifier with weights and linear kernel

In [28]:
svm_obj_cw = svm.SVC(kernel='linear',class_weight={0:0.5,1:3})
svm_obj_cw.fit(X_train,y_train)

SVC(C=1.0, cache_size=200, class_weight={0: 0.5, 1: 3}, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [29]:
train_preds_cw = svm_obj_cw.predict(X_train)
val_preds_cw = svm_obj_cw.predict(X_val)

In [30]:
metrics_calculation(train_preds_cw,y_train)
metrics_calculation(val_preds_cw,y_val)
#The recall value has shot up considerable by setting class_weights parameter to counter class imbalance - but, precision
#dropped significantly. You can infer that model is now focussing only on making correct prediction for ones, even if it has to
#make incorrect predictions for zeroes

Accuray is: 0.9337142857142857
Recall is: 0.8734939759036144
Precision is: 0.6041666666666666

Accuray is: 0.926
Recall is: 0.8513513513513513
Precision is: 0.586046511627907



# GridSearch Cross Validation

In [31]:
parameter_grid = {'C':[0.001,0.01,0.1,1,5,10,15,20],
                  'gamma':[0.001,0.01,0.1,1,3,5],
                  'kernel':['linear','rbf','poly'],
                  'degree':[1,2,3]}
#'degree' is used for polynomial kernels
zero_proportion, one_proportion = 1/y_train.value_counts()[0],1/y_train.value_counts()[1]
#Usually, class_weights are given as inverse of class proportion
#You can try sigmoid kernel
np.random.seed(1500)
grid_obj = GridSearchCV(svm.SVC(class_weight={0:zero_proportion,1:one_proportion},random_state=3000),
                        param_grid=parameter_grid,scoring='recall',n_jobs=-2,cv=5)

In [32]:
grid_obj.fit(X_train,y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200,
  class_weight={0: 0.0003156565656565657, 1: 0.0030120481927710845},
  coef0=0.0, decision_function_shape='ovr', degree=3, gamma='auto',
  kernel='rbf', max_iter=-1, probability=False, random_state=3000,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=-2,
       param_grid={'C': [0.001, 0.01, 0.1, 1, 5, 10, 15, 20], 'gamma': [0.001, 0.01, 0.1, 1, 3, 5], 'kernel': ['linear', 'rbf', 'poly'], 'degree': [1, 2, 3]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='recall', verbose=0)

In [33]:
print('Best parameters after 5-fold cross-validation are: {}'.format(grid_obj.best_params_))
print('Best {} obtained is: {}'.format(grid_obj.scoring,grid_obj.best_score_))

Best parameters after 5-fold cross-validation are: {'C': 15, 'degree': 2, 'gamma': 1, 'kernel': 'poly'}
Best recall obtained is: 0.9489161336176262


In [34]:
#Build a model based on parameters obtained from cross-validation
train_preds_tuned = grid_obj.predict(X_train)
val_preds_tuned = grid_obj.predict(X_val)

In [35]:
metrics_calculation(train_preds_tuned,y_train)
metrics_calculation(val_preds_tuned,y_val)
#We observe that recall has increased by around 7% on test set (though there seems to be slight overfitting) and precision has
#increased by almost 15%!! 

Accuray is: 0.968
Recall is: 0.9819277108433735
Precision is: 0.7546296296296297

Accuray is: 0.9626666666666667
Recall is: 0.9256756756756757
Precision is: 0.7527472527472527

