In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.model_selection import GridSearchCV

In [19]:
from xgboost import XGBClassifier


# Loading Data

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/rajap20/streamlit-example/master/data_final.csv')

In [3]:
df.head(5)

Unnamed: 0,Customer_id,Date_disb,pay_type,area_code,pin_code,state,dealer,product_code,tenure,roi,emi,proc_fee,asset_cost,loan_amt,gender,qualification,employ_type,resid_type,age,cibil_score,net_salary,net_irr,fraud
0,0,2/13/2020,ACH,3075,530041,AP,DEALER,SC,0.470588,-0.793282,0.124252,-0.093758,87000,71000,M,OTHERS,SAL,O,0.306122,-0.906781,0.158169,-1.321153,0
1,1,06-02-2020,ADM,3075,532123,AP,DEALER,SC,0.352941,-0.952045,0.083863,-0.572295,61500,51000,F,PG,SAL,O,0.55102,0.163204,1.161336,-1.308269,0
2,2,10-02-2020,ADM,3075,533218,AP,ASC,SC,0.352941,0.318061,0.210079,0.913555,87000,74038,M,SSC,SEP,O,0.530612,,0.910545,0.605033,0
3,3,10-02-2020,ADM,3075,532484,AP,DEALER,SC,0.352941,-1.309262,0.166511,0.649593,84000,69720,M,OTHERS,AGR,O,0.22449,-0.167982,-0.544048,-1.098901,0
4,4,10-02-2020,ADM,3075,530046,AP,DEALER,SC,0.352941,-0.7139,0.092091,-0.572295,61500,52220,F,HSC,SAL,O,0.22449,,-0.343415,-0.986164,0


# Feature Set Selection




In [4]:
df.columns

Index(['Customer_id', 'Date_disb', 'pay_type', 'area_code', 'pin_code',
       'state', 'dealer', 'product_code', 'tenure', 'roi', 'emi', 'proc_fee',
       'asset_cost', 'loan_amt', 'gender', 'qualification', 'employ_type',
       'resid_type', 'age', 'cibil_score', 'net_salary', 'net_irr', 'fraud'],
      dtype='object')

In [5]:
x_features = [
              'area_code', 
              'state', 'resid_type', 'net_irr', 'proc_fee',
       'asset_cost', 'loan_amt', 'emi', 'net_salary', 'roi', 'tenure',
       'age']

In [6]:
cat_features = [
                'area_code', 
                'state', 'resid_type']

In [7]:
num_features = list(set(x_features) - set(cat_features))
num_features

['asset_cost',
 'net_salary',
 'loan_amt',
 'age',
 'net_irr',
 'emi',
 'roi',
 'proc_fee',
 'tenure']

In [8]:
df[cat_features] = df[cat_features].astype(object)
df[x_features].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11702 entries, 0 to 11701
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   area_code   11702 non-null  object 
 1   state       11702 non-null  object 
 2   resid_type  11698 non-null  object 
 3   net_irr     11702 non-null  float64
 4   proc_fee    11702 non-null  float64
 5   asset_cost  11702 non-null  int64  
 6   loan_amt    11702 non-null  int64  
 7   emi         11702 non-null  float64
 8   net_salary  11702 non-null  float64
 9   roi         11702 non-null  float64
 10  tenure      11702 non-null  float64
 11  age         11698 non-null  float64
dtypes: float64(7), int64(2), object(3)
memory usage: 1.1+ MB


In [9]:
df = df[x_features + ['fraud']].dropna()
df.shape

(11698, 13)

In [10]:
X = df[x_features]
y = df['fraud']

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    train_size = 0.8,
                                                    random_state = 80)

# Pipeline for GridSearch

In [15]:
ohe_encoder = OneHotEncoder(handle_unknown='ignore')

In [31]:
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', ohe_encoder, cat_features)])

In [70]:
params = { 
          "n_estimators": 301,
          "learning_rate": 0.1 ,
          "max_depth": 20,
          "min_child_weight": 7,
          "gamma": 0.1,
          "colsample_bytree" : 0.7
          }


In [71]:
xgb = XGBClassifier(**params)

In [72]:
xgb_v1 = Pipeline(steps=[('preprocessor', preprocessor),
                          ('xgb', xgb)])

In [73]:
xgb_v1.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['area_code', 'state',
                                                   'resid_type'])])),
                ('xgb',
                 XGBClassifier(colsample_bytree=0.7, gamma=0.1, max_depth=20,
                               min_child_weight=7, n_estimators=301))])

In [74]:
temp1 = pd.DataFrame({
                    "y_test":y_test,
                      "y_prob":xgb_v1.predict_proba(X_test)[:,1]
                    })

sum(temp1.y_prob.map(lambda x: 1 if x > 0.25 else 0))

96

In [94]:
params = { 'xgb__max_depth': [15, 20, 25]
          ,'xgb__learning_rate': [0.01, 0.05, 0.1],
           'xgb__n_estimators': [100, 300, 500]
          ,'xgb__colsample_bytree': [0.3, 0.7]
          }

In [101]:
grid_v1 = GridSearchCV(xgb_v1,
                           param_grid= params,
                           cv = 5,
                           scoring = 'accuracy')

In [102]:
grid_v1.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('cat',
                                                                         OneHotEncoder(handle_unknown='ignore'),
                                                                         ['area_code',
                                                                          'state',
                                                                          'resid_type'])])),
                                       ('xgb',
                                        XGBClassifier(colsample_bytree=0.7,
                                                      gamma=0.1, max_depth=20,
                                                      min_child_weight=7,
                                                      n_estimators=301))]),
             param_grid={'xgb__colsample_bytree': [0.3, 0.7],
                         'xgb__learning_rate': [0.01, 0

In [103]:
grid_v1.best_params_

{'xgb__colsample_bytree': 0.3,
 'xgb__learning_rate': 0.01,
 'xgb__max_depth': 15,
 'xgb__n_estimators': 100}

In [104]:
grid_v1.best_score_

0.9262662911647412

In [108]:
grid_results = pd.DataFrame(grid_v1.cv_results_ )
#grid_results[['param_xgb__colsample_bytree', 'param_xgb__learning_rate', 'param_xgb__max_depth','param_xgb__n_estimators','mean_test_score', 'std_test_score']]

### Finding the model using **One Standard Error Rule**

In [109]:
highest_score_std = grid_results[grid_results.rank_test_score == 1].iloc[0]['std_test_score']
highest_score_mean = grid_results[grid_results.rank_test_score == 1].iloc[0]['mean_test_score']

In [110]:
highest_score_mean, highest_score_std

(0.9262662911647412, 1.9302122848106964e-05)

In [111]:
std_error = highest_score_std / np.sqrt(10)
one_standard_error = highest_score_mean - std_error

In [112]:
one_standard_error

0.9262601872975535

In [116]:
grid_results[grid_results.mean_test_score > one_standard_error].head(10)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_xgb__colsample_bytree,param_xgb__learning_rate,param_xgb__max_depth,param_xgb__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.286915,0.006307,0.015036,0.0007,0.3,0.01,15,100,"{'xgb__colsample_bytree': 0.3, 'xgb__learning_...",0.926282,0.926282,0.926282,0.926243,0.926243,0.926266,1.9e-05,1
1,0.980069,0.024228,0.038689,0.001474,0.3,0.01,15,300,"{'xgb__colsample_bytree': 0.3, 'xgb__learning_...",0.926282,0.926282,0.926282,0.926243,0.926243,0.926266,1.9e-05,1
2,1.593831,0.020951,0.060646,0.002008,0.3,0.01,15,500,"{'xgb__colsample_bytree': 0.3, 'xgb__learning_...",0.926282,0.926282,0.926282,0.926243,0.926243,0.926266,1.9e-05,1
3,0.283124,0.005162,0.014939,0.000935,0.3,0.01,20,100,"{'xgb__colsample_bytree': 0.3, 'xgb__learning_...",0.926282,0.926282,0.926282,0.926243,0.926243,0.926266,1.9e-05,1
4,0.976506,0.026202,0.039534,0.001608,0.3,0.01,20,300,"{'xgb__colsample_bytree': 0.3, 'xgb__learning_...",0.926282,0.926282,0.926282,0.926243,0.926243,0.926266,1.9e-05,1
5,1.597871,0.040053,0.062625,0.003352,0.3,0.01,20,500,"{'xgb__colsample_bytree': 0.3, 'xgb__learning_...",0.926282,0.926282,0.926282,0.926243,0.926243,0.926266,1.9e-05,1
6,0.284523,0.007817,0.014606,0.000786,0.3,0.01,25,100,"{'xgb__colsample_bytree': 0.3, 'xgb__learning_...",0.926282,0.926282,0.926282,0.926243,0.926243,0.926266,1.9e-05,1
7,0.987429,0.032797,0.040112,0.002274,0.3,0.01,25,300,"{'xgb__colsample_bytree': 0.3, 'xgb__learning_...",0.926282,0.926282,0.926282,0.926243,0.926243,0.926266,1.9e-05,1
8,1.621042,0.034152,0.063452,0.002879,0.3,0.01,25,500,"{'xgb__colsample_bytree': 0.3, 'xgb__learning_...",0.926282,0.926282,0.926282,0.926243,0.926243,0.926266,1.9e-05,1
9,0.340636,0.005227,0.016741,0.000279,0.3,0.05,15,100,"{'xgb__colsample_bytree': 0.3, 'xgb__learning_...",0.926282,0.926282,0.926282,0.926243,0.926243,0.926266,1.9e-05,1
