# Libraries

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd 
from functools import partial
from collections import Counter
import seaborn as sns

from joblib import load

# modeling
from sklearn import preprocessing, metrics
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, RandomizedSearchCV
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import recall_score, confusion_matrix
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

# ignore warnings
import warnings
warnings.filterwarnings(action="ignore")

pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)

# Get dataset

In [2]:
# get pickle file
providers = load('./data/Providers_Third.pkl')

In [3]:
providers

Unnamed: 0_level_0,PotentialFraud,Perc_Outpatient,DualPatientProvider,Perc_DualPatientType,Ratio_ClaimsPerPatient,Ratio_ClaimsPerAttPhys,Perc_ClaimsPerTopFraudState,Perc_HasTop5AdmtCode,Perc_MultHospAttPhys,Perc_MultHospOperPhys,Perc_MultHospOtherPhys,IP_Perc_Duplicates,IP_Count_UniquePatients,IP_Count_UniqueState,IP_Mean_AgeAtClaim,IP_Perc_HasDied,IP_Perc_GenderZero,IP_Perc_RaceOne,IP_Perc_RaceTwo,IP_Perc_RaceThree,IP_Mean_NumChronicConds,IP_Perc_Alzheimers_Chronic,IP_Perc_Cancer_Chronic,IP_Perc_Depression_Chronic,IP_Perc_Diabetes_Chronic,IP_Perc_HeartFailure_Chronic,IP_Perc_IschemicHeart_Chronic,IP_Perc_KidneyDisease_Chronic,IP_Perc_ObstrPulmonary_Chronic,IP_Perc_Osteoporosis_Chronic,IP_Perc_RheumatoidArthritis_Chronic,IP_Perc_Stroke_Chronic,IP_Perc_HasRenalDisease,IP_Mean_ClaimDuration,IP_Mean_AdmitDuration,IP_Mean_NoOfMonths_PartACov,IP_Mean_NoOfMonths_PartBCov,IP_Mean_ClaimCost,IP_Mean_DailyClaimCost,IP_Mean_DeductibleAmtPaid,IP_Mean_InscClaimAmtReimbursed,IP_Mean_InsReimbursementRatio,IP_Mean_AnnualDeductibleAmt,IP_Mean_AnnualReimbursementAmt,IP_Perc_No_ProcCode,IP_Perc_HasAllPhys,IP_Perc_HasNoPhys,IP_Perc_MultHosp,OP_Perc_Duplicates,OP_Count_UniquePatients,OP_Count_UniqueState,OP_Mean_AgeAtClaim,OP_Perc_HasDied,OP_Perc_GenderZero,OP_Perc_RaceOne,OP_Perc_RaceTwo,OP_Perc_RaceThree,OP_Mean_NumChronicConds,OP_Perc_Alzheimers_Chronic,OP_Perc_Cancer_Chronic,OP_Perc_Depression_Chronic,OP_Perc_Diabetes_Chronic,OP_Perc_HeartFailure_Chronic,OP_Perc_IschemicHeart_Chronic,OP_Perc_KidneyDisease_Chronic,OP_Perc_ObstrPulmonary_Chronic,OP_Perc_Osteoporosis_Chronic,OP_Perc_RheumatoidArthritis_Chronic,OP_Perc_Stroke_Chronic,OP_Perc_HasRenalDisease,OP_Mean_ClaimDuration,OP_Mean_NoOfMonths_PartACov,OP_Mean_NoOfMonths_PartBCov,OP_Mean_ClaimCost,OP_Mean_DailyClaimCost,OP_Mean_DeductibleAmtPaid,OP_Mean_InscClaimAmtReimbursed,OP_Mean_InsReimbursementRatio,OP_Mean_AnnualDeductibleAmt,OP_Mean_AnnualReimbursementAmt,OP_Perc_No_DiagCode,OP_Perc_HasAllPhys,OP_Perc_HasNoPhys,OP_Perc_MultHosp,IP_Mean_PatientsPerAttPhys,IP_Mean_PatientsPerOperPhys,IP_Mean_PatientsPerOtherPhys,IP_AgeRange,OP_Mean_PatientsPerAttPhys,OP_Mean_PatientsPerOperPhys,OP_Mean_PatientsPerOtherPhys,OP_AgeRange
Provider,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1
PRV51001,0,0.800000,1.0,0.400000,1.041667,1.785714,0.0,0.040000,0.240000,0.040000,0.000000,0.000000,5.0,1.0,77.600000,0.000000,0.400000,1.000000,0.000000,0.0,6.000000,0.400000,0.200000,0.800000,0.800000,0.800000,0.800000,0.800000,0.400000,0.000000,0.600000,0.400000,0.400000,6.000000,6.000000,12.000000,12.000000,20468.000000,4077.360000,1068.0,19400.000000,0.878279,897.120000,17606.000000,0.600000,0.2,0.0,0.160000,0.550000,19.0,1.0,77.950000,0.000000,0.350000,0.800000,0.200000,0.000000,5.450000,0.650000,0.200000,0.250000,0.850000,0.750000,0.950000,0.650000,0.400000,0.300000,0.250000,0.200000,0.300000,1.550000,12.000000,12.000000,382.000000,307.000000,0.000000,382.000000,1.000000,463.920000,2615.200000,0.000000,0.050000,0.000000,0.920000,1.25,1.0,1.0,26.0,2.000000,1.000000,1.285714,40.0
PRV51003,1,0.530303,1.0,0.500000,1.128205,3.000000,0.0,0.060606,0.007576,0.000000,0.000000,0.016129,53.0,3.0,69.935484,0.016129,0.338710,0.790323,0.209677,0.0,4.919355,0.516129,0.112903,0.403226,0.790323,0.580645,0.887097,0.629032,0.370968,0.209677,0.306452,0.112903,0.274194,6.161290,6.161290,11.806452,11.806452,10309.935484,2384.941628,1068.0,9241.935484,0.821059,931.424242,7568.181818,0.370968,0.0,0.0,0.045455,0.357143,66.0,3.0,68.371429,0.000000,0.471429,0.828571,0.157143,0.000000,4.214286,0.342857,0.042857,0.414286,0.728571,0.628571,0.814286,0.357143,0.257143,0.285714,0.271429,0.071429,0.171429,3.357143,11.828571,11.928571,467.714286,336.440760,1.000000,466.714286,0.994032,737.121212,2678.181818,0.000000,0.057143,0.000000,0.818182,29.50,20.0,0.0,55.0,1.590909,1.000000,1.136364,59.0
PRV51004,0,1.000000,0.0,0.208054,1.079710,3.921053,0.0,0.013423,0.167785,0.013423,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,434.953020,4351.879195,0.000000,0.0,0.0,0.060403,0.461538,138.0,9.0,71.302013,0.006711,0.308725,0.000000,0.000000,0.000000,4.342282,0.429530,0.107383,0.422819,0.704698,0.590604,0.724832,0.335570,0.275168,0.328859,0.308725,0.114094,0.154362,2.429530,11.865772,11.959732,352.214765,250.363050,2.080537,350.134228,0.978485,622.751678,2194.899329,0.040268,0.080537,0.000000,0.899329,0.00,0.0,0.0,0.0,3.921053,1.421053,2.423077,74.0
PRV51005,1,1.000000,0.0,0.248069,2.353535,194.166667,0.0,0.032618,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,379.162232,3623.991416,0.000000,0.0,0.0,0.048069,0.452257,495.0,4.0,69.567382,0.003433,0.438627,0.805369,0.161074,0.033557,4.335622,0.365665,0.141631,0.416309,0.685837,0.583691,0.768240,0.435193,0.253219,0.295279,0.284120,0.106438,0.222318,2.088412,11.907296,11.939914,244.300429,196.533055,3.175966,241.124464,0.980747,636.328755,2109.733906,0.011159,0.084120,0.001717,0.731330,0.00,0.0,0.0,0.0,147.833333,33.833333,102.750000,72.0
PRV51007,0,0.958333,1.0,0.277778,1.241379,7.200000,0.0,0.027778,0.597222,0.083333,0.027778,0.000000,3.0,1.0,78.000000,0.000000,0.333333,0.000000,0.000000,0.0,5.666667,0.666667,0.000000,0.666667,1.000000,1.000000,1.000000,0.333333,0.000000,0.000000,0.333333,0.666667,0.333333,6.333333,6.333333,12.000000,12.000000,7401.333333,1255.588889,1068.0,6333.333333,0.829955,445.000000,3050.000000,0.666667,0.0,0.0,0.069444,0.420290,56.0,2.0,67.956522,0.014493,0.478261,0.766524,0.224893,0.008584,4.101449,0.347826,0.173913,0.391304,0.666667,0.536232,0.695652,0.304348,0.231884,0.304348,0.304348,0.144928,0.144928,1.768116,11.826087,11.826087,214.057971,199.685990,0.869565,213.188406,0.992157,469.722222,1729.722222,0.000000,0.115942,0.000000,0.902778,1.50,1.0,0.0,10.0,8.000000,2.750000,3.571429,73.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PRV57759,0,1.000000,0.0,0.214286,1.166667,28.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,457.714286,3962.142857,0.000000,0.0,0.0,0.178571,0.518519,24.0,1.0,73.000000,0.000000,0.571429,0.931034,0.034483,0.034483,5.250000,0.500000,0.142857,0.321429,0.714286,0.714286,1.000000,0.535714,0.392857,0.464286,0.321429,0.142857,0.178571,3.142857,12.000000,12.000000,384.642857,200.629252,4.642857,380.000000,0.983401,886.785714,3241.785714,0.035714,0.035714,0.000000,0.928571,0.00,0.0,0.0,0.0,24.000000,1.000000,11.000000,34.0
PRV57760,0,1.000000,0.0,0.363636,2.444444,7.333333,0.0,0.045455,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,436.909091,2785.454545,0.000000,0.0,0.0,0.000000,0.590909,9.0,1.0,60.590909,0.000000,0.772727,0.923077,0.076923,0.000000,3.500000,0.136364,0.000000,0.318182,0.818182,0.500000,1.000000,0.090909,0.090909,0.500000,0.045455,0.000000,0.000000,1.318182,12.000000,11.727273,216.818182,216.022727,0.000000,216.818182,1.000000,805.454545,1492.727273,0.000000,0.136364,0.000000,1.000000,0.00,0.0,0.0,0.0,4.333333,2.500000,2.000000,39.0
PRV57761,0,1.000000,0.0,0.341463,1.223881,41.000000,0.0,0.000000,0.000000,0.060976,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,586.097561,7026.585366,0.000000,0.0,0.0,0.097561,0.390244,67.0,1.0,71.134146,0.012195,0.487805,0.919437,0.080563,0.000000,4.841463,0.439024,0.170732,0.463415,0.670732,0.682927,0.756098,0.487805,0.365854,0.390244,0.292683,0.121951,0.280488,2.390244,12.000000,12.000000,229.756098,157.134674,4.512195,225.243902,0.935979,707.317073,2928.414634,0.000000,0.109756,0.000000,0.951220,0.00,0.0,0.0,0.0,34.500000,2.800000,17.000000,68.0
PRV57762,0,1.000000,0.0,1.000000,1.000000,1.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,1068.000000,15000.000000,0.000000,0.0,0.0,0.000000,1.000000,1.0,1.0,67.000000,0.000000,1.000000,0.600000,0.000000,0.400000,5.000000,0.000000,1.000000,0.000000,1.000000,0.000000,1.000000,1.000000,1.000000,0.000000,0.000000,0.000000,0.000000,1.000000,12.000000,12.000000,1900.000000,1900.000000,0.000000,1900.000000,1.000000,400.000000,2540.000000,0.000000,0.000000,0.000000,1.000000,0.00,0.0,0.0,0.0,1.000000,0.000000,0.000000,0.0


# Train test split

In [4]:
# we want to normalize and down/upsample after splitting.
scores = providers.drop(['PotentialFraud'], axis=1)
scores = pd.DataFrame(scores, columns = scores.columns)
decision = providers["PotentialFraud"]

Xtrain, Xtest, ytrain, ytest = train_test_split(scores, decision, 
                                                test_size = .30, random_state=0, 
                                                stratify = decision)


# get splitted train set to df for upsampling
providers_trainTestSplit_train = pd.concat([ytrain, Xtrain], axis=1)
display(providers_trainTestSplit_train)
display(providers_trainTestSplit_train['PotentialFraud'].value_counts())

Unnamed: 0_level_0,PotentialFraud,Perc_Outpatient,DualPatientProvider,Perc_DualPatientType,Ratio_ClaimsPerPatient,Ratio_ClaimsPerAttPhys,Perc_ClaimsPerTopFraudState,Perc_HasTop5AdmtCode,Perc_MultHospAttPhys,Perc_MultHospOperPhys,Perc_MultHospOtherPhys,IP_Perc_Duplicates,IP_Count_UniquePatients,IP_Count_UniqueState,IP_Mean_AgeAtClaim,IP_Perc_HasDied,IP_Perc_GenderZero,IP_Perc_RaceOne,IP_Perc_RaceTwo,IP_Perc_RaceThree,IP_Mean_NumChronicConds,IP_Perc_Alzheimers_Chronic,IP_Perc_Cancer_Chronic,IP_Perc_Depression_Chronic,IP_Perc_Diabetes_Chronic,IP_Perc_HeartFailure_Chronic,IP_Perc_IschemicHeart_Chronic,IP_Perc_KidneyDisease_Chronic,IP_Perc_ObstrPulmonary_Chronic,IP_Perc_Osteoporosis_Chronic,IP_Perc_RheumatoidArthritis_Chronic,IP_Perc_Stroke_Chronic,IP_Perc_HasRenalDisease,IP_Mean_ClaimDuration,IP_Mean_AdmitDuration,IP_Mean_NoOfMonths_PartACov,IP_Mean_NoOfMonths_PartBCov,IP_Mean_ClaimCost,IP_Mean_DailyClaimCost,IP_Mean_DeductibleAmtPaid,IP_Mean_InscClaimAmtReimbursed,IP_Mean_InsReimbursementRatio,IP_Mean_AnnualDeductibleAmt,IP_Mean_AnnualReimbursementAmt,IP_Perc_No_ProcCode,IP_Perc_HasAllPhys,IP_Perc_HasNoPhys,IP_Perc_MultHosp,OP_Perc_Duplicates,OP_Count_UniquePatients,OP_Count_UniqueState,OP_Mean_AgeAtClaim,OP_Perc_HasDied,OP_Perc_GenderZero,OP_Perc_RaceOne,OP_Perc_RaceTwo,OP_Perc_RaceThree,OP_Mean_NumChronicConds,OP_Perc_Alzheimers_Chronic,OP_Perc_Cancer_Chronic,OP_Perc_Depression_Chronic,OP_Perc_Diabetes_Chronic,OP_Perc_HeartFailure_Chronic,OP_Perc_IschemicHeart_Chronic,OP_Perc_KidneyDisease_Chronic,OP_Perc_ObstrPulmonary_Chronic,OP_Perc_Osteoporosis_Chronic,OP_Perc_RheumatoidArthritis_Chronic,OP_Perc_Stroke_Chronic,OP_Perc_HasRenalDisease,OP_Mean_ClaimDuration,OP_Mean_NoOfMonths_PartACov,OP_Mean_NoOfMonths_PartBCov,OP_Mean_ClaimCost,OP_Mean_DailyClaimCost,OP_Mean_DeductibleAmtPaid,OP_Mean_InscClaimAmtReimbursed,OP_Mean_InsReimbursementRatio,OP_Mean_AnnualDeductibleAmt,OP_Mean_AnnualReimbursementAmt,OP_Perc_No_DiagCode,OP_Perc_HasAllPhys,OP_Perc_HasNoPhys,OP_Perc_MultHosp,IP_Mean_PatientsPerAttPhys,IP_Mean_PatientsPerOperPhys,IP_Mean_PatientsPerOtherPhys,IP_AgeRange,OP_Mean_PatientsPerAttPhys,OP_Mean_PatientsPerOperPhys,OP_Mean_PatientsPerOtherPhys,OP_AgeRange
Provider,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1
PRV52041,1,1.000000,0.0,0.245623,2.053377,235.625000,0.037666,0.021751,0.115119,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,389.901326,3762.005305,0.000000,0.000000,0.0,0.041910,0.430960,918.0,17.0,74.118302,0.004775,0.410610,0.666667,0.000000,0.142857,4.338462,0.363395,0.138462,0.406897,0.663660,0.592573,0.766578,0.372414,0.328912,0.327321,0.283820,0.094430,0.198939,2.445093,11.980371,11.970822,323.856764,242.538986,2.912467,320.944297,0.984813,653.856764,2449.395225,0.016446,0.093369,0.003714,0.846154,0.000000,0.00,0.000000,0.0,203.125000,51.000000,288.000000,74.0
PRV51771,0,1.000000,0.0,0.425000,1.176471,40.000000,1.000000,0.050000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,507.300000,3096.000000,0.000000,0.000000,0.0,0.050000,0.525000,34.0,1.0,73.725000,0.000000,0.325000,1.000000,0.000000,0.000000,3.525000,0.325000,0.100000,0.375000,0.575000,0.475000,0.625000,0.325000,0.325000,0.200000,0.150000,0.050000,0.175000,1.675000,11.700000,12.000000,214.750000,143.321429,1.500000,213.250000,0.974969,632.500000,1709.250000,0.000000,0.100000,0.000000,0.925000,0.000000,0.00,0.000000,0.0,34.000000,6.000000,15.000000,72.0
PRV52079,0,0.787879,1.0,0.272727,1.064516,1.500000,0.000000,0.030303,0.090909,0.030303,0.000000,0.142857,7.0,2.0,68.714286,0.0,0.285714,0.571429,0.071429,0.214286,5.000000,0.571429,0.000000,0.285714,1.000000,0.428571,0.857143,0.571429,0.285714,0.428571,0.428571,0.142857,0.142857,7.571429,7.571429,12.000000,12.0,6568.000000,1817.212600,1068.0,5500.000000,0.737450,824.969697,4883.939394,0.428571,0.142857,0.0,0.121212,0.360000,24.0,5.0,73.653846,0.000000,0.307692,0.000000,0.000000,0.000000,4.923077,0.461538,0.115385,0.576923,0.730769,0.615385,0.653846,0.269231,0.500000,0.346154,0.576923,0.076923,0.153846,3.500000,12.000000,12.000000,253.846154,150.883700,0.000000,253.846154,1.000000,533.636364,1710.303030,0.038462,0.115385,0.000000,0.818182,1.000000,1.25,1.000000,38.0,1.529412,1.000000,1.600000,51.0
PRV55240,0,0.932515,1.0,0.306748,1.124138,3.975610,0.079755,0.049080,0.165644,0.018405,0.061350,0.090909,11.0,1.0,70.818182,0.0,0.545455,0.844262,0.073770,0.032787,5.454545,0.272727,0.181818,0.454545,0.727273,0.727273,0.818182,0.545455,0.454545,0.272727,0.727273,0.272727,0.272727,4.545455,4.545455,12.000000,12.0,8031.636364,2456.613774,1068.0,6963.636364,0.776311,543.828221,5410.920245,0.363636,0.090909,0.0,0.024540,0.513158,136.0,2.0,73.875000,0.000000,0.414474,0.897436,0.061538,0.035897,4.434211,0.335526,0.098684,0.486842,0.730263,0.559211,0.743421,0.434211,0.302632,0.315789,0.348684,0.078947,0.190789,1.835526,11.921053,11.888158,287.565789,248.932487,3.092105,284.473684,0.981783,546.319018,1700.736196,0.000000,0.098684,0.006579,0.895706,1.833333,1.75,1.000000,47.0,3.947368,2.071429,2.541667,69.0
PRV53417,1,0.959064,1.0,0.315789,2.408451,17.100000,0.000000,0.017544,0.105263,0.029240,0.000000,0.000000,7.0,1.0,69.857143,0.0,0.428571,0.911765,0.088235,0.000000,4.428571,0.142857,0.571429,0.428571,0.857143,0.571429,0.714286,0.428571,0.285714,0.142857,0.285714,0.000000,0.285714,8.857143,8.857143,10.285714,12.0,11925.142857,2347.371989,1068.0,10857.142857,0.879256,523.040936,5251.228070,0.428571,0.000000,0.0,0.058480,0.392638,69.0,1.0,69.859756,0.000000,0.371951,0.985714,0.014286,0.000000,4.298780,0.286585,0.121951,0.445122,0.670732,0.554878,0.719512,0.408537,0.359756,0.304878,0.268293,0.158537,0.201220,2.554878,11.195122,11.902439,241.829268,178.150705,3.475610,238.353659,0.975411,861.929825,2659.473684,0.006098,0.109756,0.006098,0.771930,1.750000,2.00,1.000000,44.0,16.750000,3.500000,12.600000,73.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PRV53726,0,1.000000,0.0,0.562500,1.333333,8.000000,0.000000,0.062500,0.937500,0.062500,0.375000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,801.000000,7062.500000,0.000000,0.000000,0.0,0.187500,0.666667,12.0,2.0,74.375000,0.000000,0.437500,0.500000,0.500000,0.000000,5.062500,0.312500,0.000000,0.312500,0.812500,0.875000,0.937500,0.562500,0.437500,0.375000,0.312500,0.125000,0.187500,1.500000,12.000000,12.000000,231.875000,230.078125,0.000000,231.875000,1.000000,1218.750000,4679.375000,0.062500,0.062500,0.000000,1.000000,0.000000,0.00,0.000000,0.0,6.500000,1.000000,5.000000,48.0
PRV54226,0,1.000000,0.0,0.461538,1.181818,1.857143,0.000000,0.076923,0.384615,0.000000,0.153846,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,575.076923,4307.692308,0.000000,0.000000,0.0,0.076923,0.538462,11.0,1.0,80.076923,0.000000,0.615385,1.000000,0.000000,0.000000,4.076923,0.461538,0.230769,0.384615,0.692308,0.461538,0.538462,0.230769,0.230769,0.461538,0.307692,0.076923,0.000000,2.076923,12.000000,12.000000,58.461538,56.153846,0.000000,58.461538,1.000000,1049.230769,3419.230769,0.000000,0.307692,0.000000,1.000000,0.000000,0.00,0.000000,0.0,1.857143,1.000000,1.333333,42.0
PRV56207,0,0.968421,1.0,0.252632,1.032609,2.021277,0.200000,0.042105,0.052632,0.021053,0.000000,0.333333,3.0,1.0,79.333333,0.0,0.333333,1.000000,0.000000,0.000000,6.333333,0.333333,0.333333,0.333333,0.666667,0.666667,1.000000,0.666667,0.666667,0.666667,0.666667,0.333333,0.333333,10.333333,10.333333,12.000000,12.0,24068.000000,1776.913725,1068.0,23000.000000,0.848226,483.747368,5692.000000,0.333333,0.333333,0.0,0.042105,0.444444,89.0,5.0,75.608696,0.021739,0.358696,0.965517,0.034483,0.000000,4.652174,0.423913,0.163043,0.423913,0.717391,0.630435,0.739130,0.434783,0.326087,0.423913,0.304348,0.065217,0.250000,2.195652,12.000000,11.869565,227.173913,194.278255,4.130435,223.043478,0.975490,516.842105,1940.421053,0.021739,0.086957,0.010870,0.926316,1.000000,1.00,1.000000,8.0,2.068182,1.285714,1.320000,68.0
PRV53427,0,1.000000,0.0,0.250000,1.411765,2.666667,0.000000,0.041667,0.541667,0.041667,0.166667,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,267.000000,1757.083333,0.000000,0.000000,0.0,0.000000,0.454545,17.0,1.0,71.250000,0.000000,0.458333,0.834455,0.125168,0.014805,5.375000,0.541667,0.041667,0.708333,0.875000,0.666667,0.833333,0.583333,0.375000,0.541667,0.166667,0.041667,0.500000,4.541667,12.000000,12.000000,372.500000,77.398705,2.500000,370.000000,0.981366,911.250000,3045.416667,0.083333,0.000000,0.000000,0.958333,0.000000,0.00,0.000000,0.0,2.555556,1.250000,1.142857,44.0


0    3433
1     354
Name: PotentialFraud, dtype: int64

# Upsampling

In [5]:
### UPSAMPLING
# Separate majority and minority classes
df_majority = providers_trainTestSplit_train[providers_trainTestSplit_train.PotentialFraud==0]
df_minority = providers_trainTestSplit_train[providers_trainTestSplit_train.PotentialFraud==1]

# Upsample minority class
df_minority_upsampled = resample(df_minority, 
                                 replace=True,     # sample with replacement
                                # to match majority class
                                 n_samples=providers_trainTestSplit_train['PotentialFraud'].value_counts()[0],
                                 random_state=0) # reproducible results

# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])

# Display new class counts
display(df_upsampled.PotentialFraud.value_counts())

# separate into Xtrain and Ytrain
Xtrain_upsampled = df_upsampled.drop(['PotentialFraud'], axis=1)
ytrain_upsampled = df_upsampled['PotentialFraud']

1    3433
0    3433
Name: PotentialFraud, dtype: int64

# Modeling

In [6]:
# Stratified Cross Validation needs to be applied 
skf = StratifiedKFold(n_splits = 3, random_state = 0, shuffle = True)

### 1. Logistic regression

In [7]:
logreg = LogisticRegression(random_state=0, class_weight='balanced', penalty='l1', solver='liblinear')

param_grid = {'C': np.logspace(-3,1,100)}

logregCV = GridSearchCV(logreg, 
                        param_grid = param_grid, 
                        scoring='recall',
                        cv=skf).fit(Xtrain,ytrain)

print("Best parameters: {}".format(logregCV.best_params_))
print("Best cross-validation score: {:.2f}".format(logregCV.best_score_))
print("Best estimator:\n{}".format(logregCV.best_estimator_))

print('--'*30)
print("Train recall score: {}".format(recall_score(ytrain, logregCV.best_estimator_.predict(Xtrain))))
print("Train recall score: {}".format(recall_score(ytest, logregCV.best_estimator_.predict(Xtest))))

print('--'*30)
print('Confusion matrix')
print(confusion_matrix(ytest,logregCV.predict(Xtest)))

Best parameters: {'C': 0.22051307399030456}
Best cross-validation score: 0.87
Best estimator:
LogisticRegression(C=0.22051307399030456, class_weight='balanced', penalty='l1',
                   random_state=0, solver='liblinear')
------------------------------------------------------------
Train recall score: 0.9180790960451978
Train recall score: 0.8881578947368421
------------------------------------------------------------
Confusion matrix
[[1278  193]
 [  17  135]]


### 2. Random forest

In [29]:
rf_param_grid = {
    'max_depth': range(1, 10), 
    'max_features': ['auto', 'log2'], 
    'min_samples_leaf': [4,5,6], 
    'min_samples_split': [1,2,3,4,5], 
    'n_estimators': [300,350,375,400]
}

rf_grid_search = RandomizedSearchCV(RandomForestClassifier(random_state=0, 
                                                  class_weight='balanced'), 
                                    param_distributions=rf_param_grid, cv=skf, scoring='recall', 
                                    random_state=0, return_train_score=True)
rf_grid_search.fit(Xtrain, ytrain)


print("Best parameters: {}".format(rf_grid_search.best_params_))
print("Best cross-validation score: {:.2f}".format(rf_grid_search.best_score_))
print("Best estimator:\n{}".format(rf_grid_search.best_estimator_))

print('--'*30)
print("Train recall score: {}".format(recall_score(ytrain, rf_grid_search.best_estimator_.predict(Xtrain))))
print("Train recall score: {}".format(recall_score(ytest, rf_grid_search.best_estimator_.predict(Xtest))))

print('--'*30)
print('Confusion matrix')
print(confusion_matrix(ytest,rf_grid_search.predict(Xtest)))

Best parameters: {'n_estimators': 350, 'min_samples_split': 5, 'min_samples_leaf': 5, 'max_features': 'auto', 'max_depth': 3}
Best cross-validation score: 0.86
Best estimator:
RandomForestClassifier(class_weight='balanced', max_depth=3, min_samples_leaf=5,
                       min_samples_split=5, n_estimators=350, random_state=0)
------------------------------------------------------------
Train recall score: 0.9096045197740112
Train recall score: 0.9013157894736842
------------------------------------------------------------
Confusion matrix
[[1235  236]
 [  15  137]]


### 3. Support vector machine

In [9]:
svm_param_grid = {'C': np.logspace(-3,1,100)}

svm_rand_search = RandomizedSearchCV(SVC(random_state=0, class_weight='balanced'),
                                     svm_param_grid, 
                                     cv=skf, 
                                     scoring='recall')
svm_rand_search.fit(Xtrain, ytrain)

print("Best parameters: {}".format(svm_rand_search.best_params_))
print("Best cross-validation score: {:.2f}".format(svm_rand_search.best_score_))
print("Best estimator:\n{}".format(svm_rand_search.best_estimator_))

print('--'*30)
print("Train recall score: {}".format(recall_score(ytrain, svm_rand_search.best_estimator_.predict(Xtrain))))
print("Train recall score: {}".format(recall_score(ytest, svm_rand_search.best_estimator_.predict(Xtest))))

print('--'*30)
print('Confusion matrix')
print(confusion_matrix(ytest,svm_rand_search.predict(Xtest)))

Best parameters: {'C': 0.004862601580065354}
Best cross-validation score: 0.87
Best estimator:
SVC(C=0.004862601580065354, class_weight='balanced', random_state=0)
------------------------------------------------------------
Train recall score: 0.867231638418079
Train recall score: 0.8552631578947368
------------------------------------------------------------
Confusion matrix
[[1046  425]
 [  22  130]]


### 4. Gaussian Naive Bayes

In [10]:
gnb_param_grid = {'var_smoothing': np.logspace(-30,1,100)}

gnb_rand_search = RandomizedSearchCV(GaussianNB(),
                                     gnb_param_grid, 
                                     cv=skf, 
                                     scoring='recall')
gnb_rand_search.fit(Xtrain, ytrain)

print("Best parameters: {}".format(gnb_rand_search.best_params_))
print("Best cross-validation score: {:.2f}".format(gnb_rand_search.best_score_))
print("Best estimator:\n{}".format(gnb_rand_search.best_estimator_))

print('--'*30)
print("Train recall score: {}".format(recall_score(ytrain, gnb_rand_search.best_estimator_.predict(Xtrain))))
print("Train recall score: {}".format(recall_score(ytest, gnb_rand_search.best_estimator_.predict(Xtest))))

print('--'*30)
print('Confusion matrix')
print(confusion_matrix(ytest,gnb_rand_search.predict(Xtest)))

Best parameters: {'var_smoothing': 2.8480358684357933e-22}
Best cross-validation score: 0.82
Best estimator:
GaussianNB(var_smoothing=2.8480358684357933e-22)
------------------------------------------------------------
Train recall score: 0.8192090395480226
Train recall score: 0.8092105263157895
------------------------------------------------------------
Confusion matrix
[[1233  238]
 [  29  123]]


In [11]:
gnb_param_grid = {'var_smoothing': np.logspace(-10,1,100)}

gnb_rand_search = RandomizedSearchCV(GaussianNB(),
                                     gnb_param_grid, 
                                     cv=skf, 
                                     scoring='recall')
gnb_rand_search.fit(Xtrain_upsampled, ytrain_upsampled)

print("Best parameters: {}".format(gnb_rand_search.best_params_))
print("Best cross-validation score: {:.2f}".format(gnb_rand_search.best_score_))
print("Best estimator:\n{}".format(gnb_rand_search.best_estimator_))

print('--'*30)
print("Train recall score: {}".format(recall_score(ytrain_upsampled, 
                                                   gnb_rand_search.best_estimator_.predict(Xtrain_upsampled))))
print("Train recall score: {}".format(recall_score(ytest, gnb_rand_search.best_estimator_.predict(Xtest))))

print('--'*30)
print('Confusion matrix')
print(confusion_matrix(ytest,gnb_rand_search.predict(Xtest)))

Best parameters: {'var_smoothing': 2.1544346900318866e-09}
Best cross-validation score: 0.80
Best estimator:
GaussianNB(var_smoothing=2.1544346900318866e-09)
------------------------------------------------------------
Train recall score: 0.8054180017477425
Train recall score: 0.7828947368421053
------------------------------------------------------------
Confusion matrix
[[1326  145]
 [  33  119]]


### 5. KNN

In [12]:
knn_param_grid = {'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
                  'leaf_size': [10,30,50],
                  'n_neighbors': [3,5,7],
                  'weights': ['uniform', 'distance']
                 }

knn_rand_search = RandomizedSearchCV(KNeighborsClassifier(),
                                     knn_param_grid, 
                                     cv=skf, 
                                     scoring='recall')
knn_rand_search.fit(Xtrain, ytrain)

print("Best parameters: {}".format(knn_rand_search.best_params_))
print("Best cross-validation score: {:.2f}".format(knn_rand_search.best_score_))
print("Best estimator:\n{}".format(knn_rand_search.best_estimator_))

print('--'*30)
print("Train recall score: {}".format(recall_score(ytrain, knn_rand_search.best_estimator_.predict(Xtrain))))
print("Train recall score: {}".format(recall_score(ytest, knn_rand_search.best_estimator_.predict(Xtest))))

print('--'*30)
print('Confusion matrix')
print(confusion_matrix(ytest,knn_rand_search.predict(Xtest)))

Best parameters: {'weights': 'uniform', 'n_neighbors': 3, 'leaf_size': 50, 'algorithm': 'brute'}
Best cross-validation score: 0.29
Best estimator:
KNeighborsClassifier(algorithm='brute', leaf_size=50, n_neighbors=3)
------------------------------------------------------------
Train recall score: 0.5988700564971752
Train recall score: 0.4342105263157895
------------------------------------------------------------
Confusion matrix
[[1410   61]
 [  86   66]]


In [13]:
knn_param_grid = {'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
                  'leaf_size': [10,30,50],
                  'n_neighbors': [3,5,7],
                  'weights': ['uniform', 'distance']
                 }

knn_rand_search = RandomizedSearchCV(KNeighborsClassifier(),
                                     knn_param_grid, 
                                     cv=skf, 
                                     scoring='recall')
knn_rand_search.fit(Xtrain_upsampled, ytrain_upsampled)

print("Best parameters: {}".format(knn_rand_search.best_params_))
print("Best cross-validation score: {:.2f}".format(knn_rand_search.best_score_))
print("Best estimator:\n{}".format(knn_rand_search.best_estimator_))

print('--'*30)
print("Train recall score: {}".format(recall_score(ytrain_upsampled,
                                                   knn_rand_search.best_estimator_.predict(Xtrain_upsampled))))
print("Train recall score: {}".format(recall_score(ytest, knn_rand_search.best_estimator_.predict(Xtest))))

print('--'*30)
print('Confusion matrix')
print(confusion_matrix(ytest,knn_rand_search.predict(Xtest)))

Best parameters: {'weights': 'distance', 'n_neighbors': 3, 'leaf_size': 30, 'algorithm': 'auto'}
Best cross-validation score: 1.00
Best estimator:
KNeighborsClassifier(n_neighbors=3, weights='distance')
------------------------------------------------------------
Train recall score: 1.0
Train recall score: 0.6052631578947368
------------------------------------------------------------
Confusion matrix
[[1317  154]
 [  60   92]]


### 6. Gradient boosting

In [15]:
gb = GradientBoostingClassifier(random_state = 0, max_depth = 2)
gb_param_grid = {'n_estimators': [500,1000,2000,2500,3000],
                'max_features': [10,13,15,20],
                'learning_rate': [0.005, 0.01, 0.015, 0.02],
                    'min_samples_split': [17,20,25],
                    'min_samples_leaf' : [17,20,25]}

gb_rand_search = RandomizedSearchCV(gb, 
                                    gb_param_grid, 
                                    cv= skf, 
                                    n_jobs = 5, 
                                    verbose = 1,
                                    return_train_score = True,
                                    scoring = 'recall')

gb_rand_search.fit(Xtrain, ytrain)

print("Best parameters: {}".format(gb_rand_search.best_params_))
print("Best cross-validation score: {:.2f}".format(gb_rand_search.best_score_))
print("Best estimator:\n{}".format(gb_rand_search.best_estimator_))

print('--'*30)
print("Train recall score: {}".format(recall_score(ytrain, gb_rand_search.best_estimator_.predict(Xtrain))))
print("Train recall score: {}".format(recall_score(ytest, gb_rand_search.best_estimator_.predict(Xtest))))

print('--'*30)
print('Confusion matrix')
print(confusion_matrix(ytest,gb_rand_search.predict(Xtest)))

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  30 out of  30 | elapsed:   39.9s finished


Best parameters: {'n_estimators': 3000, 'min_samples_split': 25, 'min_samples_leaf': 25, 'max_features': 20, 'learning_rate': 0.01}
Best cross-validation score: 0.53
Best estimator:
GradientBoostingClassifier(learning_rate=0.01, max_depth=2, max_features=20,
                           min_samples_leaf=25, min_samples_split=25,
                           n_estimators=3000, random_state=0)
------------------------------------------------------------
Train recall score: 0.7655367231638418
Train recall score: 0.5131578947368421
------------------------------------------------------------
Confusion matrix
[[1445   26]
 [  74   78]]


In [19]:
gb = GradientBoostingClassifier(random_state = 0, max_depth = 2)
gb_param_grid = {'n_estimators': [3000,3100,3200,3250],
                'max_features': [19,20,21],
                'learning_rate': [0.03,0.035,0.04],
                    'min_samples_split': [25,26,27,28],
                    'min_samples_leaf' : [14,15,16]}

gb_rand_search = RandomizedSearchCV(gb, 
                                    gb_param_grid, 
                                    cv= skf, 
                                    n_jobs = 5, 
                                    verbose = 1,
                                    return_train_score = True,
                                    scoring = 'recall')

gb_rand_search.fit(Xtrain_upsampled, ytrain_upsampled)

print("Best parameters: {}".format(gb_rand_search.best_params_))
print("Best cross-validation score: {:.2f}".format(gb_rand_search.best_score_))
print("Best estimator:\n{}".format(gb_rand_search.best_estimator_))

print('--'*30)
print("Train recall score: {}".format(recall_score(ytrain_upsampled,
                                                   gb_rand_search.best_estimator_.predict(Xtrain_upsampled))))
print("Train recall score: {}".format(recall_score(ytest, gb_rand_search.best_estimator_.predict(Xtest))))

print('--'*30)
print('Confusion matrix')
print(confusion_matrix(ytest,gb_rand_search.predict(Xtest)))

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  30 out of  30 | elapsed:  2.5min finished


Best parameters: {'n_estimators': 3200, 'min_samples_split': 27, 'min_samples_leaf': 15, 'max_features': 20, 'learning_rate': 0.04}
Best cross-validation score: 1.00
Best estimator:
GradientBoostingClassifier(learning_rate=0.04, max_depth=2, max_features=20,
                           min_samples_leaf=15, min_samples_split=27,
                           n_estimators=3200, random_state=0)
------------------------------------------------------------
Train recall score: 1.0
Train recall score: 0.6710526315789473
------------------------------------------------------------
Confusion matrix
[[1420   51]
 [  50  102]]
