In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report

from sklearn.linear_model import LogisticRegression

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, GradientBoostingClassifier

#from xgboost import XGBClassifier

#from vecstack import stacking

from sklearn.model_selection import GridSearchCV

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder

In [12]:
df1=pd.read_csv('UnivBank_1.csv',na_values=['?','nan','#'])

In [13]:
df2=pd.read_csv('UnivBank_2.csv',na_values=['?','nan','#'])

In [14]:
df1.head()

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Personal Loan,Securities Account,Online,CreditCard
0,1,25,1,49,91107,4,1.6,1,0,1.0,0,0
1,2,45,19,34,90089,3,1.5,1,0,1.0,0,0
2,3,39,15,11,94720,1,1.0,1,0,0.0,0,0
3,4,35,9,100,94112,1,2.7,2,0,0.0,0,0
4,5,35,8,45,91330,4,1.0,2,0,0.0,0,1


In [15]:
df1.shape

(5000, 12)

In [16]:
df2.head()

Unnamed: 0,Mortgage,CD Account,ID
0,0.0,0.0,1330
1,0.0,0.0,2102
2,0.0,0.0,2110
3,0.0,0.0,3403
4,0.0,0.0,2453


In [17]:
df2.shape

(5000, 3)

In [18]:
df3=df1.merge(df2, on='ID')

In [19]:
df3.head()

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Personal Loan,Securities Account,Online,CreditCard,Mortgage,CD Account
0,1,25,1,49,91107,4,1.6,1,0,1.0,0,0,0.0,0.0
1,2,45,19,34,90089,3,1.5,1,0,1.0,0,0,0.0,0.0
2,3,39,15,11,94720,1,1.0,1,0,0.0,0,0,0.0,0.0
3,4,35,9,100,94112,1,2.7,2,0,0.0,0,0,0.0,
4,5,35,8,45,91330,4,1.0,2,0,0.0,0,1,0.0,0.0


In [20]:
df3.isnull().sum()

ID                    0
Age                   0
Experience            0
Income                0
ZIP Code              0
Family                0
CCAvg                 0
Education             0
Personal Loan         0
Securities Account    2
Online                0
CreditCard            0
Mortgage              2
CD Account            1
dtype: int64

In [21]:
df3.dtypes

ID                      int64
Age                     int64
Experience              int64
Income                  int64
ZIP Code                int64
Family                  int64
CCAvg                 float64
Education               int64
Personal Loan           int64
Securities Account    float64
Online                  int64
CreditCard              int64
Mortgage              float64
CD Account            float64
dtype: object

In [22]:
df3.Education.unique()

array([1, 2, 3], dtype=int64)

In [23]:
df3.Family.unique()

array([4, 3, 1, 2], dtype=int64)

In [24]:
df3['Securities Account'].unique()

array([ 1.,  0., nan])

In [30]:
df3['ZIP Code'].nunique()

467

In [31]:
df3.nunique()

ID                    5000
Age                     45
Experience              47
Income                 162
ZIP Code               467
Family                   4
CCAvg                  108
Education                3
Personal Loan            2
Securities Account       2
Online                   2
CreditCard               2
Mortgage               347
CD Account               2
dtype: int64

In [26]:
df3.head()

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Personal Loan,Securities Account,Online,CreditCard,Mortgage,CD Account
0,1,25,1,49,91107,4,1.6,1,0,1.0,0,0,0.0,0.0
1,2,45,19,34,90089,3,1.5,1,0,1.0,0,0,0.0,0.0
2,3,39,15,11,94720,1,1.0,1,0,0.0,0,0,0.0,0.0
3,4,35,9,100,94112,1,2.7,2,0,0.0,0,0,0.0,
4,5,35,8,45,91330,4,1.0,2,0,0.0,0,1,0.0,0.0


In [25]:
df3.columns

Index(['ID', 'Age', 'Experience', 'Income', 'ZIP Code', 'Family', 'CCAvg',
       'Education', 'Personal Loan', 'Securities Account', 'Online',
       'CreditCard', 'Mortgage', 'CD Account'],
      dtype='object')

In [None]:
df3.drop(['ID','ZIP Code'], axis=1, inplace=True)

In [36]:
cat_cols=['Family','Education', 'Personal Loan', 'Securities Account', 'Online', 'CD Account']

In [51]:
num_cols=df3.drop(cat_cols, axis=1)

In [52]:
num_cols.drop('CreditCard', axis=1, inplace=True)

In [53]:
num_cols=num_cols.columns

In [54]:
X=df3.drop('CreditCard', axis=1)

In [55]:
y=df3['CreditCard']

In [56]:
X[cat_cols] = X[cat_cols].astype('category')

In [57]:
X.dtypes

Age                      int64
Experience               int64
Income                   int64
Family                category
CCAvg                  float64
Education             category
Personal Loan         category
Securities Account    category
Online                category
Mortgage               float64
CD Account            category
dtype: object

In [58]:
y=y.astype('category')

In [59]:
X_train, X_test, y_train, y_test =  train_test_split(X,y,test_size=0.3,random_state=0)

In [68]:
numeric_transformer = Pipeline(steps=[('missing_value',SimpleImputer(strategy='mean'))])

In [69]:
categorical_transformer = Pipeline(steps=[('missing_value1',SimpleImputer(strategy='most_frequent')),
                                     ('onehotencode',OneHotEncoder(handle_unknown='ignore') )])

In [70]:
preprocessor=ColumnTransformer(transformers=[('num',numeric_transformer, num_cols),('cat',categorical_transformer,cat_cols)])

In [118]:
from sklearn import svm

In [119]:
clf=svm.SVC()

In [130]:
from sklearn.ensemble import AdaBoostClassifier

In [131]:
RF_pipe = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', AdaBoostClassifier())])

In [132]:
RF_pipe.fit(X_train, y_train)

In [133]:
train_pred_RFPipe = RF_pipe.predict(X_train)
test_pred_RFPipe = RF_pipe.predict(X_test)

In [134]:
def evaluate_model(act, pred):
    from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score
    print("Confusion Matrix \n", confusion_matrix(act, pred))
    print("Accurcay : ", accuracy_score(act, pred))
    print("Recall   : ", recall_score(act, pred, average='weighted'))
    print("Precision: ", precision_score(act, pred, average='weighted'))  

In [135]:
print("--Train--")
evaluate_model(y_train, train_pred_RFPipe)
print("--Test--")
evaluate_model(y_test, test_pred_RFPipe)

--Train--
Confusion Matrix 
 [[2426   49]
 [ 848  177]]
Accurcay :  0.7437142857142857
Recall   :  0.7437142857142857
Precision:  0.7533470812060235
--Test--
Confusion Matrix 
 [[1040   15]
 [ 375   70]]
Accurcay :  0.74
Recall   :  0.74
Precision:  0.7612512991062149


In [86]:
param_grid={'classifier__max_depth':[4,5,6,7,8,9]}

grid_search=GridSearchCV(RF_pipe,param_grid,cv=10)
grid_search

In [87]:
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)

{'classifier__max_depth': 4}


In [80]:
y.value_counts()

0    3530
1    1470
Name: CreditCard, dtype: int64