In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.ensemble import RandomForestClassifier

In [3]:
data = pd.read_csv('Banking Dataset.csv')

In [4]:
data.head(10)

Unnamed: 0,Cust_ID,Target,Age,Gender,Balance,Occupation,No_OF_CR_TXNS,AGE_BKT,SCR,Holding_Period
0,C1,0,30,M,160378.6,SAL,2,26-30,826,9
1,C10,1,41,M,84370.59,SELF-EMP,14,41-45,843,9
2,C100,0,49,F,60849.26,PROF,49,46-50,328,26
3,C1000,0,49,M,10558.81,SAL,23,46-50,619,19
4,C10000,0,43,M,97100.48,SENP,3,41-45,397,8
5,C10001,0,30,M,160378.6,SAL,2,26-30,781,11
6,C10002,0,43,M,26275.55,PROF,23,41-45,354,12
7,C10003,0,53,M,33616.47,SAL,45,>50,239,5
8,C10004,0,45,M,1881.37,PROF,3,41-45,339,13
9,C10005,0,37,M,3274.37,PROF,33,36-40,535,9


In [5]:
data.shape

(20000, 10)

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Cust_ID         20000 non-null  object 
 1   Target          20000 non-null  int64  
 2   Age             20000 non-null  int64  
 3   Gender          20000 non-null  object 
 4   Balance         20000 non-null  float64
 5   Occupation      20000 non-null  object 
 6   No_OF_CR_TXNS   20000 non-null  int64  
 7   AGE_BKT         20000 non-null  object 
 8   SCR             20000 non-null  int64  
 9   Holding_Period  20000 non-null  int64  
dtypes: float64(1), int64(5), object(4)
memory usage: 1.5+ MB


In [9]:
for feature in data.columns:
    if data[feature].dtypes=='object':
        data[feature]=pd.Categorical(data[feature]).codes

In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Cust_ID         20000 non-null  int16  
 1   Target          20000 non-null  int64  
 2   Age             20000 non-null  int64  
 3   Gender          20000 non-null  int8   
 4   Balance         20000 non-null  float64
 5   Occupation      20000 non-null  int8   
 6   No_OF_CR_TXNS   20000 non-null  int64  
 7   AGE_BKT         20000 non-null  int8   
 8   SCR             20000 non-null  int64  
 9   Holding_Period  20000 non-null  int64  
dtypes: float64(1), int16(1), int64(5), int8(3)
memory usage: 1.0 MB


In [11]:
X = data.drop(['Target','Cust_ID'],axis=1)
y = data.pop('Target')

In [12]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=1)

In [23]:
rfc1 = RandomForestClassifier(n_estimators=501,oob_score=True,max_depth=10,max_features=5,min_samples_leaf=50,min_samples_split=110)
#'n_estimators': number of the trees to be built using Random Forest
#depth implies level of the tree, which in this case cannot be more 

In [24]:
rfc1 = rfc1.fit(X_train,y_train)

In [25]:
rfc1.oob_score
#out of bag score within random forest

True

In [26]:
rfc1.oob_score_
#ACCURACY

0.9150714285714285

In [27]:
#ERROR
1-rfc1.oob_score_

0.08492857142857146

#### the accuracy more or less remains the same even after prunning

In [28]:
from sklearn.model_selection import GridSearchCV

In [29]:
param_grid = {'max_depth':[7,10], 'max_features':[4,6], 'min_samples_leaf':[50,100], 'min_samples_split':[150,300], 'n_estimators':[301,501]}

In [30]:
rfc1 = RandomForestClassifier()

In [31]:
grid_search = GridSearchCV(estimator=rfc1,param_grid=param_grid,cv=3)
#cv --> cross-validation 

In [32]:
grid_search.fit(X_train,y_train)

GridSearchCV(cv=3, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [7, 10], 'max_features': [4, 6],
                         'min_samples_leaf': [50, 100],
                         'min_samples_split': [150, 300],
                         'n_estimators': [301, 501]})

In [33]:
grid_search.best_params_

{'max_depth': 10,
 'max_features': 6,
 'min_samples_leaf': 50,
 'min_samples_split': 150,
 'n_estimators': 301}

In [35]:
grid_search.best_estimator_

RandomForestClassifier(max_depth=10, max_features=6, min_samples_leaf=50,
                       min_samples_split=150, n_estimators=301)

In [37]:
best_grid = grid_search.best_estimator_

In [39]:
y_train_predict = best_grid.predict(X_train)
y_test_predict = best_grid.predict(X_test)

In [41]:
from sklearn.metrics import confusion_matrix,classification_report

In [42]:
confusion_matrix(y_train,y_train_predict)

array([[12748,    34],
       [ 1145,    73]], dtype=int64)

In [43]:
confusion_matrix(y_test,y_test_predict)

array([[5472,   13],
       [ 487,   28]], dtype=int64)

In [44]:
print(classification_report(y_train,y_train_predict))

              precision    recall  f1-score   support

           0       0.92      1.00      0.96     12782
           1       0.68      0.06      0.11      1218

    accuracy                           0.92     14000
   macro avg       0.80      0.53      0.53     14000
weighted avg       0.90      0.92      0.88     14000



In [45]:
print(classification_report(y_test,y_test_predict))

              precision    recall  f1-score   support

           0       0.92      1.00      0.96      5485
           1       0.68      0.05      0.10       515

    accuracy                           0.92      6000
   macro avg       0.80      0.53      0.53      6000
weighted avg       0.90      0.92      0.88      6000



In [46]:
import matplotlib.pyplot as plt

In [47]:
probs = best_grid.predict_proba(X_train)

In [48]:
probs = probs[:,1]

In [50]:
from sklearn.metrics import roc_auc_score

In [52]:
auc = roc_auc_score(y_train,probs)
auc

0.843854658606276