# Libraries

In [28]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd 
from functools import partial
from collections import Counter
import seaborn as sns

from joblib import load

# modeling
from sklearn import preprocessing, metrics
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, RandomizedSearchCV
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import recall_score, confusion_matrix
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

# ignore warnings
import warnings
warnings.filterwarnings(action="ignore")

pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)

# Get dataset

In [2]:
# get pickle file
providers = load('./data/Providers_Final_2iteration.pkl')
providers.set_index('Provider', inplace=True)

In [3]:
providers

Unnamed: 0_level_0,PotentialFraud,Perc_Outpatient,DualPatientProvider,Perc_DualPatientType,Ratio_ClaimsPerAttPhys,Ratio_ClaimsPerPatient,PatientsPerAttPhys,PatientsPerOperPhys,PatientsPerOthPhys,Perc_MultHospAttPhys,Perc_MultHospOperPhys,Perc_MultHospOtherPhys,Perc_HasTop5AdmitCode,Perc_ClaimsPerTopFraudState,Mean_StatePerAttPhys,Mean_StatePerOperPhys,Mean_StatePerOthPhys,IP_Count_UniquePatients,IP_Perc_MultHosp,IP_Perc_Duplicates,IP_Mean_Duplicate_per_AttPhy,IP_Mean_Duplicate_per_Patient,IP_Perc_Dup_Diff_Provider,IP_Perc_Dup_Diff_State,IP_Count_UniqueState,IP_Mean_PatientsPerAttPhys,IP_Mean_PatientsPerOperPhys,IP_Mean_PatientsPerOtherPhys,IP_Perc_HasAttPhys,IP_Perc_HasNoPhys,IP_Perc_AttPhysIsOperPhys,IP_Mean_ClaimCost,IP_Mean_DailyClaimCost,IP_Mean_ClaimCostPerAttPhys,IP_Mean_ClaimCostPerOperPhys,IP_Mean_ClaimCostPerOtherPhys,IP_Mean_ClaimCostPerPatient,IP_Perc_No_ProcCode,IP_Sum_DeductibleAmtPaid,IP_Mean_AnnualDeductibleAmt,IP_Sum_InscClaimAmtReimbursed,IP_Mean_InsReimbursementRatio,IP_Mean_AnnualReimbursementAmt,IP_Mean_NoOfMonths_PartACov,IP_Mean_NoOfMonths_PartBCov,IP_Mean_ClaimDuration,IP_Mean_AdmitDuration,IP_Mean_AdmitDurationPerAttPhys,IP_Mean_AdmitDurationPerPatient,IP_Mean_AgeAtClaim,IP_AgeRange,IP_Perc_HasDied,IP_Perc_GenderZero,IP_Perc_RaceOne,IP_Perc_RaceThree,IP_Perc_RaceTwo,IP_Perc_HasRenalDisease,IP_Mean_NumChronicConds,IP_Perc_Alzheimers_Chronic,IP_Perc_Cancer_Chronic,IP_Perc_Depression_Chronic,IP_Perc_Diabetes_Chronic,IP_Perc_HeartFailure_Chronic,IP_Perc_IschemicHeart_Chronic,IP_Perc_KidneyDisease_Chronic,IP_Perc_ObstrPulmonary_Chronic,IP_Perc_Osteoporosis_Chronic,IP_Perc_RheumatoidArthritis_Chronic,IP_Perc_Stroke_Chronic,OP_Count_UniquePatients,OP_Perc_MultHosp,OP_Perc_Duplicates,OP_Mean_Duplicate_per_AttPhy,OP_Mean_Duplicate_per_Patient,OP_Perc_Dup_Diff_Provider,OP_Perc_Dup_Diff_State,OP_Count_UniqueState,OP_Mean_PatientsPerAttPhys,OP_Mean_PatientsPerOperPhys,OP_Mean_PatientsPerOtherPhys,OP_Perc_HasNoPhys,OP_Perc_AttPhysIsOperPhys,OP_Perc_HasAttPhys,OP_Mean_ClaimCost,OP_Mean_DailyClaimCost,OP_Mean_ClaimCostPerAttPhys,OP_Mean_ClaimCostPerOperPhys,OP_Mean_ClaimCostPerOtherPhys,OP_Mean_ClaimCostPerPatient,OP_Perc_No_DiagCode,OP_Sum_DeductibleAmtPaid,OP_Mean_AnnualDeductibleAmt,OP_Sum_InscClaimAmtReimbursed,OP_Mean_InsReimbursementRatio,OP_Mean_AnnualReimbursementAmt,OP_Mean_NoOfMonths_PartACov,OP_Mean_NoOfMonths_PartBCov,OP_Mean_ClaimDuration,OP_Mean_AgeAtClaim,OP_AgeRange,OP_Perc_HasDied,OP_Perc_GenderZero,OP_Perc_RaceOne,OP_Perc_RaceThree,OP_Perc_RaceTwo,OP_Perc_HasRenalDisease,OP_Mean_NumChronicConds,OP_Perc_Alzheimers_Chronic,OP_Perc_Cancer_Chronic,OP_Perc_Depression_Chronic,OP_Perc_Diabetes_Chronic,OP_Perc_HeartFailure_Chronic,OP_Perc_IschemicHeart_Chronic,OP_Perc_KidneyDisease_Chronic,OP_Perc_ObstrPulmonary_Chronic,OP_Perc_Osteoporosis_Chronic,OP_Perc_RheumatoidArthritis_Chronic,OP_Perc_Stroke_Chronic
Provider,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1
PRV51001,0,0.800000,1,0.400000,1.785714,1.041667,10,19,16,0.240000,0.040000,0.000000,0.040000,0.0,1.000000,1.000000,1.000000,5.0,0.160000,0.000000,0.0,0.0,0.000000,0.000000,1.0,1.25,1.0,1.0,1.0,0.0,0.000000,20468.000000,4077.360000,19693.000000,24068.000000,43068.0,20468.000000,0.600000,5340.0,897.120000,97000.0,0.878279,17606.000000,12.000000,12.000000,6.000000,6.000000,20468.000000,20468.000000,77.600000,26.0,0.000000,0.400000,1.000000,0.0,0.000000,0.400000,6.000000,0.400000,0.200000,0.800000,0.800000,0.800000,0.800000,0.800000,0.400000,0.000000,0.600000,0.400000,19.0,0.920000,0.550000,1.833333,1.000000,0.550000,0.437500,1.0,2.000000,1.000000,1.285714,0.000000,0.000000,1.000000,382.000000,307.000000,322.950000,333.333333,217.857143,373.421053,0.000000,0.0,463.920000,7640.0,1.000000,2615.200000,12.000000,12.000000,1.550000,77.950000,40.0,0.000000,0.350000,0.800000,0.000000,0.200000,0.300000,5.450000,0.650000,0.200000,0.250000,0.850000,0.750000,0.950000,0.650000,0.400000,0.300000,0.250000,0.200000
PRV51003,1,0.530303,1,0.500000,3.000000,1.128205,73,110,95,0.007576,0.000000,0.000000,0.060606,0.0,1.295455,1.571429,1.090909,53.0,0.045455,0.016129,1.0,1.0,0.016129,0.016129,3.0,29.50,20.0,0.0,1.0,0.0,0.532258,10309.935484,2384.941628,10309.935484,12352.461153,0.0,10549.132075,0.370968,66216.0,931.424242,573000.0,0.821059,7568.181818,11.806452,11.806452,6.161290,6.161290,10469.754386,10395.868852,69.935484,55.0,0.016129,0.338710,0.790323,0.0,0.209677,0.274194,4.919355,0.516129,0.112903,0.403226,0.790323,0.580645,0.887097,0.629032,0.370968,0.209677,0.306452,0.112903,66.0,0.818182,0.357143,1.190476,1.000000,0.357143,0.237288,3.0,1.590909,1.000000,1.136364,0.000000,0.042857,1.000000,467.714286,336.440760,451.628788,408.000000,395.000000,462.121212,0.000000,70.0,737.121212,32670.0,0.994032,2678.181818,11.828571,11.928571,3.357143,68.371429,59.0,0.000000,0.471429,0.828571,0.000000,0.157143,0.171429,4.214286,0.342857,0.042857,0.414286,0.728571,0.628571,0.814286,0.357143,0.257143,0.285714,0.271429,0.071429
PRV51004,0,1.000000,0,0.208054,3.921053,1.079710,100,119,112,0.167785,0.013423,0.000000,0.013423,0.0,2.131579,1.263158,1.692308,0.0,0.060403,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.00,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,434.953020,0.0,0.000000,4351.879195,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,138.0,0.899329,0.461538,2.275862,1.031250,0.453901,0.306306,9.0,3.921053,1.421053,2.423077,0.000000,0.080537,1.000000,352.214765,250.363050,435.616855,268.728070,433.933150,352.415459,0.040268,310.0,622.751678,52170.0,0.978485,2194.899329,11.865772,11.959732,2.429530,71.302013,74.0,0.006711,0.308725,0.000000,0.000000,0.000000,0.154362,4.342282,0.429530,0.107383,0.422819,0.704698,0.590604,0.724832,0.335570,0.275168,0.328859,0.308725,0.114094
PRV51005,1,1.000000,0,0.248069,194.166667,2.353535,489,489,491,0.000000,0.000000,0.000000,0.032618,0.0,2.333333,1.500000,2.250000,0.0,0.048069,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.00,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,379.162232,0.0,0.000000,3623.991416,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,495.0,0.731330,0.452257,86.666667,1.713816,0.411929,0.310796,4.0,147.833333,33.833333,102.750000,0.001717,0.112446,0.998283,244.300429,196.533055,250.440584,370.014898,256.092600,256.429705,0.011159,3700.0,636.328755,280910.0,0.980747,2109.733906,11.907296,11.939914,2.088412,69.567382,72.0,0.003433,0.438627,0.805369,0.033557,0.161074,0.222318,4.335622,0.365665,0.141631,0.416309,0.685837,0.583691,0.768240,0.435193,0.253219,0.295279,0.284120,0.106438
PRV51007,0,0.958333,1,0.277778,7.200000,1.241379,48,53,51,0.597222,0.083333,0.027778,0.027778,0.0,1.300000,1.000000,1.000000,3.0,0.069444,0.000000,0.0,0.0,0.000000,0.000000,1.0,1.50,1.0,0.0,1.0,0.0,0.000000,7401.333333,1255.588889,8318.000000,7068.000000,0.0,7401.333333,0.666667,3204.0,445.000000,19000.0,0.829955,3050.000000,12.000000,12.000000,6.333333,6.333333,7401.333333,7401.333333,78.000000,10.0,0.000000,0.333333,0.000000,0.0,0.000000,0.333333,5.666667,0.666667,0.000000,0.666667,1.000000,1.000000,1.000000,0.333333,0.000000,0.000000,0.333333,0.666667,56.0,0.902778,0.420290,4.142857,1.115385,0.420290,0.298246,2.0,8.000000,2.750000,3.571429,0.000000,0.101449,1.000000,214.057971,199.685990,153.922893,488.333333,130.682540,193.482143,0.000000,60.0,469.722222,14710.0,0.992157,1729.722222,11.826087,11.826087,1.768116,67.956522,73.0,0.014493,0.478261,0.766524,0.008584,0.224893,0.144928,4.101449,0.347826,0.173913,0.391304,0.666667,0.536232,0.695652,0.304348,0.231884,0.304348,0.304348,0.144928
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PRV57759,0,1.000000,0,0.214286,28.000000,1.166667,23,23,23,0.000000,0.000000,0.000000,0.000000,0.0,1.000000,1.000000,1.000000,0.0,0.178571,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.00,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,457.714286,0.0,0.000000,3962.142857,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,24.0,0.928571,0.518519,14.000000,1.272727,0.518519,0.235294,1.0,24.000000,1.000000,11.000000,0.000000,0.035714,1.000000,384.642857,200.629252,384.642857,300.000000,240.833333,382.708333,0.035714,130.0,886.785714,10640.0,0.983401,3241.785714,12.000000,12.000000,3.142857,73.000000,34.0,0.000000,0.571429,0.931034,0.034483,0.034483,0.178571,5.250000,0.500000,0.142857,0.321429,0.714286,0.714286,1.000000,0.535714,0.392857,0.464286,0.321429,0.142857
PRV57760,0,1.000000,0,0.363636,7.333333,2.444444,6,7,7,0.000000,0.000000,0.000000,0.045455,0.0,1.000000,1.000000,1.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.00,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,436.909091,0.0,0.000000,2785.454545,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,9.0,1.000000,0.590909,6.500000,1.857143,0.590909,0.470588,1.0,4.333333,2.500000,2.000000,0.000000,0.272727,1.000000,216.818182,216.022727,243.872549,187.000000,146.785714,295.416667,0.000000,0.0,805.454545,4770.0,1.000000,1492.727273,12.000000,11.727273,1.318182,60.590909,39.0,0.000000,0.772727,0.923077,0.000000,0.076923,0.000000,3.500000,0.136364,0.000000,0.318182,0.818182,0.500000,1.000000,0.090909,0.090909,0.500000,0.045455,0.000000
PRV57761,0,1.000000,0,0.341463,41.000000,1.223881,65,62,65,0.000000,0.060976,0.000000,0.000000,0.0,1.000000,1.000000,1.000000,0.0,0.097561,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.00,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,586.097561,0.0,0.000000,7026.585366,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,67.0,0.951220,0.390244,16.000000,1.103448,0.390244,0.166667,1.0,34.500000,2.800000,17.000000,0.000000,0.000000,1.000000,229.756098,157.134674,260.483193,249.766667,282.592593,208.619403,0.000000,370.0,707.317073,18470.0,0.935979,2928.414634,12.000000,12.000000,2.390244,71.134146,68.0,0.012195,0.487805,0.919437,0.000000,0.080563,0.280488,4.841463,0.439024,0.170732,0.463415,0.670732,0.682927,0.756098,0.487805,0.365854,0.390244,0.292683,0.121951
PRV57762,0,1.000000,0,1.000000,1.000000,1.000000,0,1,1,0.000000,0.000000,0.000000,0.000000,0.0,1.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.00,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,1068.000000,0.0,0.000000,15000.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.0,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.0,1.000000,0.000000,0.000000,0.000000,0.000000,1.000000,1900.000000,1900.000000,1900.000000,0.000000,0.000000,1900.000000,0.000000,0.0,400.000000,1900.0,1.000000,2540.000000,12.000000,12.000000,1.000000,67.000000,0.0,0.000000,1.000000,0.600000,0.400000,0.000000,0.000000,5.000000,0.000000,1.000000,0.000000,1.000000,0.000000,1.000000,1.000000,1.000000,0.000000,0.000000,0.000000


# Train test split

In [4]:
# we want to normalize and down/upsample after splitting.
scores = providers.drop(['PotentialFraud'], axis=1)
scores = pd.DataFrame(scores, columns = scores.columns)
decision = providers["PotentialFraud"]

Xtrain, Xtest, ytrain, ytest = train_test_split(scores, decision, 
                                                test_size = .30, random_state=0, 
                                                stratify = decision)

In [7]:
providers_trainTestSplit_train = pd.concat([ytrain, Xtrain], axis=1)
display(providers_trainTestSplit_train)
display(providers_trainTestSplit_train['PotentialFraud'].value_counts())

Unnamed: 0_level_0,PotentialFraud,Perc_Outpatient,DualPatientProvider,Perc_DualPatientType,Ratio_ClaimsPerAttPhys,Ratio_ClaimsPerPatient,PatientsPerAttPhys,PatientsPerOperPhys,PatientsPerOthPhys,Perc_MultHospAttPhys,Perc_MultHospOperPhys,Perc_MultHospOtherPhys,Perc_HasTop5AdmitCode,Perc_ClaimsPerTopFraudState,Mean_StatePerAttPhys,Mean_StatePerOperPhys,Mean_StatePerOthPhys,IP_Count_UniquePatients,IP_Perc_MultHosp,IP_Perc_Duplicates,IP_Mean_Duplicate_per_AttPhy,IP_Mean_Duplicate_per_Patient,IP_Perc_Dup_Diff_Provider,IP_Perc_Dup_Diff_State,IP_Count_UniqueState,IP_Mean_PatientsPerAttPhys,IP_Mean_PatientsPerOperPhys,IP_Mean_PatientsPerOtherPhys,IP_Perc_HasAttPhys,IP_Perc_HasNoPhys,IP_Perc_AttPhysIsOperPhys,IP_Mean_ClaimCost,IP_Mean_DailyClaimCost,IP_Mean_ClaimCostPerAttPhys,IP_Mean_ClaimCostPerOperPhys,IP_Mean_ClaimCostPerOtherPhys,IP_Mean_ClaimCostPerPatient,IP_Perc_No_ProcCode,IP_Sum_DeductibleAmtPaid,IP_Mean_AnnualDeductibleAmt,IP_Sum_InscClaimAmtReimbursed,IP_Mean_InsReimbursementRatio,IP_Mean_AnnualReimbursementAmt,IP_Mean_NoOfMonths_PartACov,IP_Mean_NoOfMonths_PartBCov,IP_Mean_ClaimDuration,IP_Mean_AdmitDuration,IP_Mean_AdmitDurationPerAttPhys,IP_Mean_AdmitDurationPerPatient,IP_Mean_AgeAtClaim,IP_AgeRange,IP_Perc_HasDied,IP_Perc_GenderZero,IP_Perc_RaceOne,IP_Perc_RaceThree,IP_Perc_RaceTwo,IP_Perc_HasRenalDisease,IP_Mean_NumChronicConds,IP_Perc_Alzheimers_Chronic,IP_Perc_Cancer_Chronic,IP_Perc_Depression_Chronic,IP_Perc_Diabetes_Chronic,IP_Perc_HeartFailure_Chronic,IP_Perc_IschemicHeart_Chronic,IP_Perc_KidneyDisease_Chronic,IP_Perc_ObstrPulmonary_Chronic,IP_Perc_Osteoporosis_Chronic,IP_Perc_RheumatoidArthritis_Chronic,IP_Perc_Stroke_Chronic,OP_Count_UniquePatients,OP_Perc_MultHosp,OP_Perc_Duplicates,OP_Mean_Duplicate_per_AttPhy,OP_Mean_Duplicate_per_Patient,OP_Perc_Dup_Diff_Provider,OP_Perc_Dup_Diff_State,OP_Count_UniqueState,OP_Mean_PatientsPerAttPhys,OP_Mean_PatientsPerOperPhys,OP_Mean_PatientsPerOtherPhys,OP_Perc_HasNoPhys,OP_Perc_AttPhysIsOperPhys,OP_Perc_HasAttPhys,OP_Mean_ClaimCost,OP_Mean_DailyClaimCost,OP_Mean_ClaimCostPerAttPhys,OP_Mean_ClaimCostPerOperPhys,OP_Mean_ClaimCostPerOtherPhys,OP_Mean_ClaimCostPerPatient,OP_Perc_No_DiagCode,OP_Sum_DeductibleAmtPaid,OP_Mean_AnnualDeductibleAmt,OP_Sum_InscClaimAmtReimbursed,OP_Mean_InsReimbursementRatio,OP_Mean_AnnualReimbursementAmt,OP_Mean_NoOfMonths_PartACov,OP_Mean_NoOfMonths_PartBCov,OP_Mean_ClaimDuration,OP_Mean_AgeAtClaim,OP_AgeRange,OP_Perc_HasDied,OP_Perc_GenderZero,OP_Perc_RaceOne,OP_Perc_RaceThree,OP_Perc_RaceTwo,OP_Perc_HasRenalDisease,OP_Mean_NumChronicConds,OP_Perc_Alzheimers_Chronic,OP_Perc_Cancer_Chronic,OP_Perc_Depression_Chronic,OP_Perc_Diabetes_Chronic,OP_Perc_HeartFailure_Chronic,OP_Perc_IschemicHeart_Chronic,OP_Perc_KidneyDisease_Chronic,OP_Perc_ObstrPulmonary_Chronic,OP_Perc_Osteoporosis_Chronic,OP_Perc_RheumatoidArthritis_Chronic,OP_Perc_Stroke_Chronic
Provider,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1
PRV52041,1,1.000000,0,0.245623,235.625000,2.053377,910,911,916,0.115119,0.000000,0.000000,0.021751,0.037666,7.875000,4.714286,9.500000,0.0,0.041910,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.00,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,389.901326,0.0,0.000000,3762.005305,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,918.0,0.846154,0.430960,99.375000,1.542471,0.377214,0.190184,17.0,203.125000,51.000000,288.000000,0.003714,0.134218,0.996286,323.856764,242.538986,321.161526,486.867541,338.796715,303.621408,0.016446,5490.0,653.856764,604980.0,0.984813,2449.395225,11.980371,11.970822,2.445093,74.118302,74.0,0.004775,0.410610,0.666667,0.142857,0.000000,0.198939,4.338462,0.363395,0.138462,0.406897,0.663660,0.592573,0.766578,0.372414,0.328912,0.327321,0.283820,0.094430
PRV51771,0,1.000000,0,0.425000,40.000000,1.176471,33,33,33,0.000000,0.000000,0.000000,0.050000,1.000000,1.000000,1.000000,1.000000,0.0,0.050000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.00,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,507.300000,0.0,0.000000,3096.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,34.0,0.925000,0.525000,21.000000,1.000000,0.525000,0.208333,1.0,34.000000,6.000000,15.000000,0.000000,0.150000,1.000000,214.750000,143.321429,214.750000,115.000000,100.625000,237.426471,0.000000,60.0,632.500000,8530.0,0.974969,1709.250000,11.700000,12.000000,1.675000,73.725000,72.0,0.000000,0.325000,1.000000,0.000000,0.000000,0.175000,3.525000,0.325000,0.100000,0.375000,0.575000,0.475000,0.625000,0.325000,0.325000,0.200000,0.150000,0.050000
PRV52079,0,0.787879,1,0.272727,1.500000,1.064516,9,20,26,0.090909,0.030303,0.000000,0.030303,0.000000,1.227273,1.000000,1.400000,7.0,0.121212,0.142857,1.0,1.0,0.142857,0.142857,2.0,1.000000,1.25,1.000000,1.0,0.0,0.000000,6568.000000,1817.212600,6568.000000,8818.0,9068.000000,6568.000000,0.428571,7476.0,824.969697,38500.0,0.737450,4883.939394,12.000000,12.0,7.571429,7.571429,6568.000000,6568.000000,68.714286,38.0,0.0,0.285714,0.571429,0.214286,0.071429,0.142857,5.000000,0.571429,0.000000,0.285714,1.000000,0.428571,0.857143,0.571429,0.285714,0.428571,0.428571,0.142857,24.0,0.818182,0.360000,1.285714,1.000000,0.360000,0.157895,5.0,1.529412,1.000000,1.600000,0.000000,0.038462,1.000000,253.846154,150.883700,355.049020,551.428571,632.333333,271.944444,0.038462,0.0,533.636364,6600.0,1.000000,1710.303030,12.000000,12.000000,3.500000,73.653846,51.0,0.000000,0.307692,0.000000,0.000000,0.000000,0.153846,4.923077,0.461538,0.115385,0.576923,0.730769,0.615385,0.653846,0.269231,0.500000,0.346154,0.576923,0.076923
PRV55240,0,0.932515,1,0.306748,3.975610,1.124138,104,127,120,0.165644,0.018405,0.061350,0.049080,0.079755,1.268293,1.111111,1.080000,11.0,0.024540,0.090909,1.0,1.0,0.090909,0.090909,1.0,1.833333,1.75,1.000000,1.0,0.0,0.454545,8031.636364,2456.613774,7809.666667,7868.0,25068.000000,8031.636364,0.363636,11748.0,543.828221,76600.0,0.776311,5410.920245,12.000000,12.0,4.545455,4.545455,8838.370370,8031.636364,70.818182,47.0,0.0,0.545455,0.844262,0.032787,0.073770,0.272727,5.454545,0.272727,0.181818,0.454545,0.727273,0.727273,0.818182,0.545455,0.454545,0.272727,0.727273,0.272727,136.0,0.895706,0.513158,3.080000,1.054054,0.513158,0.308411,2.0,3.947368,2.071429,2.541667,0.006579,0.138158,0.993421,287.565789,248.932487,347.753389,283.940476,250.682540,303.296569,0.000000,470.0,546.319018,43240.0,0.981783,1700.736196,11.921053,11.888158,1.835526,73.875000,69.0,0.000000,0.414474,0.897436,0.035897,0.061538,0.190789,4.434211,0.335526,0.098684,0.486842,0.730263,0.559211,0.743421,0.434211,0.302632,0.315789,0.348684,0.078947
PRV53417,1,0.959064,1,0.315789,17.100000,2.408451,61,61,65,0.105263,0.029240,0.000000,0.017544,0.000000,1.000000,1.000000,1.000000,7.0,0.058480,0.000000,0.0,0.0,0.000000,0.000000,1.0,1.750000,2.00,1.000000,1.0,0.0,0.142857,11925.142857,2347.371989,11109.666667,15568.0,7068.000000,11925.142857,0.428571,7476.0,523.040936,76000.0,0.879256,5251.228070,10.285714,12.0,8.857143,8.857143,12401.333333,11925.142857,69.857143,44.0,0.0,0.428571,0.911765,0.000000,0.088235,0.285714,4.428571,0.142857,0.571429,0.428571,0.857143,0.571429,0.714286,0.428571,0.285714,0.142857,0.285714,0.000000,69.0,0.771930,0.392638,8.000000,1.560976,0.373418,0.226562,1.0,16.750000,3.500000,12.600000,0.006098,0.115854,0.993902,241.829268,178.150705,222.117211,296.916667,218.383165,214.868185,0.006098,570.0,861.929825,39090.0,0.975411,2659.473684,11.195122,11.902439,2.554878,69.859756,73.0,0.000000,0.371951,0.985714,0.000000,0.014286,0.201220,4.298780,0.286585,0.121951,0.445122,0.670732,0.554878,0.719512,0.408537,0.359756,0.304878,0.268293,0.158537
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PRV53726,0,1.000000,0,0.562500,8.000000,1.333333,10,11,11,0.937500,0.062500,0.375000,0.062500,0.000000,1.500000,1.000000,1.000000,0.0,0.187500,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.00,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,801.000000,0.0,0.000000,7062.500000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,12.0,1.000000,0.666667,10.000000,1.111111,0.666667,0.545455,2.0,6.500000,1.000000,5.000000,0.000000,0.062500,1.000000,231.875000,230.078125,147.000000,200.000000,143.333333,185.000000,0.062500,0.0,1218.750000,3710.0,1.000000,4679.375000,12.000000,12.000000,1.500000,74.375000,48.0,0.000000,0.437500,0.500000,0.000000,0.500000,0.187500,5.062500,0.312500,0.000000,0.312500,0.812500,0.875000,0.937500,0.562500,0.437500,0.375000,0.312500,0.125000
PRV54226,0,1.000000,0,0.461538,1.857143,1.181818,4,7,5,0.384615,0.000000,0.153846,0.076923,0.000000,1.000000,1.000000,1.000000,0.0,0.076923,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.00,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,575.076923,0.0,0.000000,4307.692308,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,11.0,1.000000,0.538462,1.400000,1.400000,0.538462,0.333333,1.0,1.857143,1.000000,1.333333,0.000000,0.230769,1.000000,58.461538,56.153846,64.761905,45.000000,72.500000,48.484848,0.000000,0.0,1049.230769,760.0,1.000000,3419.230769,12.000000,12.000000,2.076923,80.076923,42.0,0.000000,0.615385,1.000000,0.000000,0.000000,0.000000,4.076923,0.461538,0.230769,0.384615,0.692308,0.461538,0.538462,0.230769,0.230769,0.461538,0.307692,0.076923
PRV56207,0,0.968421,1,0.252632,2.021277,1.032609,45,76,66,0.052632,0.021053,0.000000,0.042105,0.200000,1.531915,1.187500,1.230769,3.0,0.042105,0.333333,1.0,1.0,0.333333,0.333333,1.0,1.000000,1.00,1.000000,1.0,0.0,0.000000,24068.000000,1776.913725,24068.000000,7568.0,12068.000000,24068.000000,0.333333,3204.0,483.747368,69000.0,0.848226,5692.000000,12.000000,12.0,10.333333,10.333333,24068.000000,24068.000000,79.333333,8.0,0.0,0.333333,1.000000,0.000000,0.000000,0.333333,6.333333,0.333333,0.333333,0.333333,0.666667,0.666667,1.000000,0.666667,0.666667,0.666667,0.666667,0.333333,89.0,0.926316,0.444444,1.379310,1.025641,0.444444,0.275362,5.0,2.068182,1.285714,1.320000,0.010870,0.076087,0.989130,227.173913,194.278255,252.512987,441.071429,268.633333,230.505618,0.021739,380.0,516.842105,20520.0,0.975490,1940.421053,12.000000,11.869565,2.195652,75.608696,68.0,0.021739,0.358696,0.965517,0.000000,0.034483,0.250000,4.652174,0.423913,0.163043,0.423913,0.717391,0.630435,0.739130,0.434783,0.326087,0.423913,0.304348,0.065217
PRV53427,0,1.000000,0,0.250000,2.666667,1.411765,8,13,10,0.541667,0.041667,0.166667,0.041667,0.000000,1.000000,1.000000,1.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.00,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,267.000000,0.0,0.000000,1757.083333,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,17.0,0.958333,0.454545,1.666667,1.428571,0.454545,0.368421,1.0,2.555556,1.250000,1.142857,0.000000,0.083333,1.000000,372.500000,77.398705,408.333333,667.500000,145.000000,331.323529,0.083333,60.0,911.250000,8880.0,0.981366,3045.416667,12.000000,12.000000,4.541667,71.250000,44.0,0.000000,0.458333,0.834455,0.014805,0.125168,0.500000,5.375000,0.541667,0.041667,0.708333,0.875000,0.666667,0.833333,0.583333,0.375000,0.541667,0.166667,0.041667


0    3433
1     354
Name: PotentialFraud, dtype: int64

In [8]:
providers_trainTestSplit_test = pd.concat([ytest, Xtest], axis=1)
display(providers_trainTestSplit_test)
display(providers_trainTestSplit_test['PotentialFraud'].value_counts())

Unnamed: 0_level_0,PotentialFraud,Perc_Outpatient,DualPatientProvider,Perc_DualPatientType,Ratio_ClaimsPerAttPhys,Ratio_ClaimsPerPatient,PatientsPerAttPhys,PatientsPerOperPhys,PatientsPerOthPhys,Perc_MultHospAttPhys,Perc_MultHospOperPhys,Perc_MultHospOtherPhys,Perc_HasTop5AdmitCode,Perc_ClaimsPerTopFraudState,Mean_StatePerAttPhys,Mean_StatePerOperPhys,Mean_StatePerOthPhys,IP_Count_UniquePatients,IP_Perc_MultHosp,IP_Perc_Duplicates,IP_Mean_Duplicate_per_AttPhy,IP_Mean_Duplicate_per_Patient,IP_Perc_Dup_Diff_Provider,IP_Perc_Dup_Diff_State,IP_Count_UniqueState,IP_Mean_PatientsPerAttPhys,IP_Mean_PatientsPerOperPhys,IP_Mean_PatientsPerOtherPhys,IP_Perc_HasAttPhys,IP_Perc_HasNoPhys,IP_Perc_AttPhysIsOperPhys,IP_Mean_ClaimCost,IP_Mean_DailyClaimCost,IP_Mean_ClaimCostPerAttPhys,IP_Mean_ClaimCostPerOperPhys,IP_Mean_ClaimCostPerOtherPhys,IP_Mean_ClaimCostPerPatient,IP_Perc_No_ProcCode,IP_Sum_DeductibleAmtPaid,IP_Mean_AnnualDeductibleAmt,IP_Sum_InscClaimAmtReimbursed,IP_Mean_InsReimbursementRatio,IP_Mean_AnnualReimbursementAmt,IP_Mean_NoOfMonths_PartACov,IP_Mean_NoOfMonths_PartBCov,IP_Mean_ClaimDuration,IP_Mean_AdmitDuration,IP_Mean_AdmitDurationPerAttPhys,IP_Mean_AdmitDurationPerPatient,IP_Mean_AgeAtClaim,IP_AgeRange,IP_Perc_HasDied,IP_Perc_GenderZero,IP_Perc_RaceOne,IP_Perc_RaceThree,IP_Perc_RaceTwo,IP_Perc_HasRenalDisease,IP_Mean_NumChronicConds,IP_Perc_Alzheimers_Chronic,IP_Perc_Cancer_Chronic,IP_Perc_Depression_Chronic,IP_Perc_Diabetes_Chronic,IP_Perc_HeartFailure_Chronic,IP_Perc_IschemicHeart_Chronic,IP_Perc_KidneyDisease_Chronic,IP_Perc_ObstrPulmonary_Chronic,IP_Perc_Osteoporosis_Chronic,IP_Perc_RheumatoidArthritis_Chronic,IP_Perc_Stroke_Chronic,OP_Count_UniquePatients,OP_Perc_MultHosp,OP_Perc_Duplicates,OP_Mean_Duplicate_per_AttPhy,OP_Mean_Duplicate_per_Patient,OP_Perc_Dup_Diff_Provider,OP_Perc_Dup_Diff_State,OP_Count_UniqueState,OP_Mean_PatientsPerAttPhys,OP_Mean_PatientsPerOperPhys,OP_Mean_PatientsPerOtherPhys,OP_Perc_HasNoPhys,OP_Perc_AttPhysIsOperPhys,OP_Perc_HasAttPhys,OP_Mean_ClaimCost,OP_Mean_DailyClaimCost,OP_Mean_ClaimCostPerAttPhys,OP_Mean_ClaimCostPerOperPhys,OP_Mean_ClaimCostPerOtherPhys,OP_Mean_ClaimCostPerPatient,OP_Perc_No_DiagCode,OP_Sum_DeductibleAmtPaid,OP_Mean_AnnualDeductibleAmt,OP_Sum_InscClaimAmtReimbursed,OP_Mean_InsReimbursementRatio,OP_Mean_AnnualReimbursementAmt,OP_Mean_NoOfMonths_PartACov,OP_Mean_NoOfMonths_PartBCov,OP_Mean_ClaimDuration,OP_Mean_AgeAtClaim,OP_AgeRange,OP_Perc_HasDied,OP_Perc_GenderZero,OP_Perc_RaceOne,OP_Perc_RaceThree,OP_Perc_RaceTwo,OP_Perc_HasRenalDisease,OP_Mean_NumChronicConds,OP_Perc_Alzheimers_Chronic,OP_Perc_Cancer_Chronic,OP_Perc_Depression_Chronic,OP_Perc_Diabetes_Chronic,OP_Perc_HeartFailure_Chronic,OP_Perc_IschemicHeart_Chronic,OP_Perc_KidneyDisease_Chronic,OP_Perc_ObstrPulmonary_Chronic,OP_Perc_Osteoporosis_Chronic,OP_Perc_RheumatoidArthritis_Chronic,OP_Perc_Stroke_Chronic
Provider,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1
PRV53036,0,1.000000,0,0.237705,20.333333,1.768116,63,65,63,0.000000,0.000000,0.000000,0.008197,0.000000,1.000000,1.000000,1.000000,0.0,0.032787,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.00000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.00000,0.000000,0.0,577.213115,0.0,0.000000,6735.655738,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,69.0,0.918033,0.369748,7.333333,1.157895,0.369748,0.257426,1.0,16.666667,6.250000,6.666667,0.000000,0.098361,1.000000,325.491803,241.704565,325.690359,433.995726,242.013889,311.077295,0.024590,180.0,521.885246,39530.0,0.991171,1834.180328,12.000000,11.704918,2.565574,73.803279,73.0,0.008197,0.532787,0.750000,0.000000,0.250000,0.278689,4.803279,0.475410,0.049180,0.516393,0.721311,0.622951,0.827869,0.508197,0.237705,0.409836,0.368852,0.065574
PRV53351,0,1.000000,0,0.388889,18.000000,1.384615,12,12,12,1.000000,0.000000,0.388889,0.111111,0.000000,1.000000,1.000000,1.000000,0.0,0.333333,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.00000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.00000,0.000000,0.0,830.666667,0.0,0.000000,7854.444444,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,13.0,0.888889,0.470588,8.000000,1.142857,0.470588,0.307692,1.0,13.000000,3.000000,6.000000,0.000000,0.166667,1.000000,273.333333,271.216931,273.333333,590.000000,257.142857,310.000000,0.055556,260.0,616.666667,4660.0,0.959402,1519.444444,12.000000,12.000000,2.111111,68.388889,35.0,0.000000,0.166667,0.857143,0.142857,0.000000,0.222222,4.222222,0.555556,0.222222,0.388889,0.777778,0.444444,0.444444,0.444444,0.277778,0.277778,0.388889,0.000000
PRV51600,0,0.972477,1,0.357798,12.111111,1.058252,94,97,96,0.009174,0.000000,0.000000,0.045872,0.981651,1.222222,1.166667,1.000000,3.0,0.073394,0.000000,0.0,0.0,0.000000,0.000000,1.0,1.000000,1.00000,0.0,1.0,0.0,0.000000,4068.000000,933.666667,4068.000000,7068.000000,0.0,4068.00000,0.666667,3204.0,650.862385,9000.0,0.528787,5440.458716,12.000000,12.000000,4.333333,4.333333,4068.0,4068.000000,77.666667,33.0,0.0,0.333333,0.0,0.0,0.0,0.333333,4.000000,0.333333,0.333333,0.000000,0.666667,0.333333,0.666667,0.333333,0.333333,0.333333,0.333333,0.333333,100.0,0.954128,0.442308,7.666667,1.000000,0.442308,0.194444,2.0,17.666667,4.166667,5.285714,0.000000,0.132075,1.000000,260.377358,219.567457,242.482194,252.857143,181.857143,262.500000,0.018868,550.0,646.055046,27050.0,0.974290,2267.339450,12.000000,12.000000,2.415094,72.103774,72.0,0.000000,0.415094,0.617647,0.294118,0.058824,0.179245,4.858491,0.424528,0.198113,0.594340,0.811321,0.603774,0.735849,0.377358,0.301887,0.433962,0.301887,0.075472
PRV54463,0,1.000000,0,0.000000,1.000000,1.000000,0,2,2,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,1.000000,1.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.00000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.00000,0.000000,0.0,1068.000000,0.0,0.000000,2000.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,3.0,0.666667,0.333333,1.000000,1.000000,0.333333,0.000000,2.0,1.000000,1.000000,1.000000,0.000000,0.333333,1.000000,670.000000,670.000000,670.000000,1300.000000,700.000000,670.000000,0.000000,0.0,283.333333,2010.0,1.000000,1780.000000,12.000000,12.000000,1.000000,58.666667,34.0,0.000000,1.000000,0.937500,0.015625,0.046875,0.000000,5.000000,0.333333,0.000000,0.333333,0.666667,0.666667,1.000000,0.666667,0.333333,0.666667,0.000000,0.333333
PRV53182,0,1.000000,0,0.333333,1.000000,1.000000,0,3,3,0.333333,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.0,0.333333,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.00000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.00000,0.000000,0.0,712.000000,0.0,0.000000,4033.333333,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,3.0,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.0,1.000000,0.000000,0.000000,0.000000,0.000000,1.000000,33.333333,33.333333,33.333333,0.000000,0.000000,33.333333,0.000000,0.0,866.666667,100.0,1.000000,1546.666667,12.000000,12.000000,1.000000,75.333333,4.0,0.000000,0.333333,0.558824,0.352941,0.029412,0.000000,3.333333,0.000000,0.000000,0.333333,0.333333,0.666667,1.000000,0.000000,0.000000,0.666667,0.000000,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PRV53999,0,1.000000,0,0.000000,1.000000,1.000000,0,1,1,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.00000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.00000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.0,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.0,1.000000,0.000000,0.000000,0.000000,0.000000,1.000000,60.000000,60.000000,60.000000,0.000000,0.000000,60.000000,0.000000,0.0,2090.000000,60.0,1.000000,1740.000000,12.000000,12.000000,1.000000,65.000000,0.0,0.000000,1.000000,1.000000,0.000000,0.000000,0.000000,3.000000,0.000000,0.000000,0.000000,0.000000,1.000000,1.000000,0.000000,1.000000,0.000000,0.000000,0.000000
PRV55169,0,0.884882,1,0.330097,3.094421,1.232479,352,479,463,0.066574,0.008322,0.015257,0.031900,0.079057,1.562232,1.188679,1.278689,79.0,0.080444,0.024096,1.0,1.0,0.024096,0.024096,4.0,1.276923,1.22449,1.0,1.0,0.0,0.012048,10919.219512,2085.867733,10876.333333,13002.027778,8818.0,10945.21519,0.313253,87576.0,574.718447,807800.0,0.840837,5079.556172,11.710843,11.771084,7.000000,7.000000,10990.5,10935.901235,71.036145,69.0,0.0,0.349398,0.0,0.0,0.0,0.216867,5.277108,0.445783,0.180723,0.433735,0.867470,0.698795,0.819277,0.481928,0.457831,0.313253,0.493976,0.084337,514.0,0.865465,0.417335,2.166667,1.135371,0.371972,0.210870,14.0,3.284974,1.800000,2.093220,0.000000,0.089342,1.000000,321.974922,245.101154,322.429482,502.365812,295.751941,302.150384,0.023511,1460.0,576.893204,203960.0,0.979583,2189.542302,11.868339,11.935737,2.349530,71.677116,75.0,0.010972,0.413793,0.818182,0.000000,0.181818,0.191223,4.340125,0.362069,0.170846,0.410658,0.645768,0.575235,0.755486,0.373041,0.304075,0.313480,0.351097,0.078370
PRV52056,0,1.000000,0,0.250000,4.615385,1.153846,39,44,47,0.050000,0.016667,0.000000,0.016667,0.016667,2.230769,1.000000,2.000000,0.0,0.050000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.00000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.00000,0.000000,0.0,462.800000,0.0,0.000000,3311.666667,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,52.0,0.950000,0.431034,2.272727,1.041667,0.431034,0.195122,7.0,4.538462,1.375000,4.800000,0.000000,0.050000,1.000000,296.166667,222.736947,285.736264,363.125000,236.833333,319.278846,0.033333,490.0,766.000000,17280.0,0.972689,2726.000000,11.766667,12.000000,3.000000,73.616667,59.0,0.000000,0.383333,0.000000,0.000000,0.000000,0.300000,4.416667,0.500000,0.200000,0.516667,0.666667,0.633333,0.766667,0.400000,0.350000,0.166667,0.116667,0.100000
PRV54941,0,1.000000,0,0.388889,1.636364,1.000000,7,17,13,0.055556,0.000000,0.000000,0.000000,0.000000,1.000000,1.000000,1.000000,0.0,0.166667,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.00000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.00000,0.000000,0.0,771.333333,0.0,0.000000,7521.111111,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,18.0,0.944444,0.500000,1.142857,1.000000,0.500000,0.090909,1.0,1.545455,2.000000,1.200000,0.055556,0.111111,0.944444,277.222222,219.920635,267.878788,300.000000,227.000000,277.222222,0.000000,0.0,833.333333,4990.0,1.000000,3066.666667,12.000000,12.000000,2.888889,72.277778,49.0,0.000000,0.277778,1.000000,0.000000,0.000000,0.222222,5.111111,0.500000,0.166667,0.500000,0.944444,0.500000,0.888889,0.277778,0.277778,0.500000,0.444444,0.111111


0    1471
1     152
Name: PotentialFraud, dtype: int64

# Scaling

In [9]:
mm_scaler = preprocessing.MinMaxScaler()
Xtrain = mm_scaler.fit_transform(Xtrain)
Xtest = mm_scaler.transform(Xtest)

# Upsampling

In [11]:
### UPSAMPLING
# Separate majority and minority classes
df_majority = providers_trainTestSplit_train[providers_trainTestSplit_train.PotentialFraud==0]
df_minority = providers_trainTestSplit_train[providers_trainTestSplit_train.PotentialFraud==1]

# Upsample minority class
df_minority_upsampled = resample(df_minority, 
                                 replace=True,     # sample with replacement
                                # to match majority class
                                 n_samples=providers_trainTestSplit_train['PotentialFraud'].value_counts()[0],
                                 random_state=0) # reproducible results

# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])

# Display new class counts
display(df_upsampled.PotentialFraud.value_counts())

# separate into Xtrain and Ytrain
Xtrain_upsampled = df_upsampled.drop(['PotentialFraud'], axis=1)
ytrain_upsampled = df_upsampled['PotentialFraud']

1    3433
0    3433
Name: PotentialFraud, dtype: int64

# Modeling

In [12]:
# some functions that will be used in modeling
def plot_feature_importances(model):
    n_features = scores.shape[1]
    plt.barh(range(n_features), model.feature_importances_, align='center') 
    plt.yticks(np.arange(n_features), scores_cols.to_numpy()) 
    plt.xlabel("Feature importance")
    plt.ylabel("Feature")
    

# Stratified Cross Validation needs to be applied 
skf = StratifiedKFold(n_splits = 3, random_state = 0, shuffle = True)

### 1. Logistic regression

In [13]:
logreg = LogisticRegression(random_state=0, class_weight='balanced', penalty='l1', solver='liblinear')

param_grid = {'C': np.logspace(-3,1,100)}

logregCV = GridSearchCV(logreg, 
                        param_grid = param_grid, 
                        scoring='recall',
                        cv=skf).fit(Xtrain,ytrain)

print("Best parameters: {}".format(logregCV.best_params_))
print("Best cross-validation score: {:.2f}".format(logregCV.best_score_))
print("Best estimator:\n{}".format(logregCV.best_estimator_))

Best parameters: {'C': 0.15199110829529347}
Best cross-validation score: 0.87
Best estimator:
LogisticRegression(C=0.15199110829529347, class_weight='balanced', penalty='l1',
                   random_state=0, solver='liblinear')


In [14]:
display(recall_score(ytrain, logregCV.best_estimator_.predict(Xtrain)))
display(recall_score(ytest, logregCV.best_estimator_.predict(Xtest)))

0.8983050847457628

0.9144736842105263

In [70]:
print(confusion_matrix(ytest,logregCV.predict(Xtest)))

[[1242  229]
 [  13  139]]


### 2. Random Forest

In [16]:
forest = RandomForestClassifier(class_weight='balanced', random_state=0) 
forest.fit(Xtrain, ytrain)

display(recall_score(ytrain, forest.predict(Xtrain)))
display(recall_score(ytest, forest.predict(Xtest)))

0.9971751412429378

0.3881578947368421

In [19]:
print('Parameters currently in use:\n')
print(forest.get_params())

Parameters currently in use:

{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 0, 'verbose': 0, 'warm_start': False}


In [57]:
rf_param_grid = {
    'ccp_alpha': [0,1,2],
    'criterion': ['gini', 'entropy'],
    'max_depth': range(1, 10), 
    'max_features': ['auto', 'log2'], 
    'min_samples_leaf': [1, 2, 4], 
    'min_samples_split': [2, 5, 10], 
    'n_estimators': [100, 200, 300, 1000]
}

rf_grid_search = RandomizedSearchCV(RandomForestClassifier(random_state=0, 
                                                  class_weight='balanced'), 
                                    param_distributions=rf_param_grid, cv=skf, scoring='recall', 
                                    random_state=0, 
                                    verbose=2, return_train_score=True)
rf_grid_search.fit(Xtrain, ytrain)

print("Best parameters: {}".format(rf_grid_search.best_params_))
print("Best cross-validation score: {:.2f}".format(rf_grid_search.best_score_))
print("Best estimator:\n{}".format(rf_grid_search.best_estimator_))

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] n_estimators=100, min_samples_split=10, min_samples_leaf=4, max_features=log2, max_depth=2, criterion=gini, ccp_alpha=2 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  n_estimators=100, min_samples_split=10, min_samples_leaf=4, max_features=log2, max_depth=2, criterion=gini, ccp_alpha=2, total=   0.3s
[CV] n_estimators=100, min_samples_split=10, min_samples_leaf=4, max_features=log2, max_depth=2, criterion=gini, ccp_alpha=2 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s


[CV]  n_estimators=100, min_samples_split=10, min_samples_leaf=4, max_features=log2, max_depth=2, criterion=gini, ccp_alpha=2, total=   0.2s
[CV] n_estimators=100, min_samples_split=10, min_samples_leaf=4, max_features=log2, max_depth=2, criterion=gini, ccp_alpha=2 
[CV]  n_estimators=100, min_samples_split=10, min_samples_leaf=4, max_features=log2, max_depth=2, criterion=gini, ccp_alpha=2, total=   0.3s
[CV] n_estimators=1000, min_samples_split=2, min_samples_leaf=2, max_features=auto, max_depth=1, criterion=gini, ccp_alpha=2 
[CV]  n_estimators=1000, min_samples_split=2, min_samples_leaf=2, max_features=auto, max_depth=1, criterion=gini, ccp_alpha=2, total=   2.1s
[CV] n_estimators=1000, min_samples_split=2, min_samples_leaf=2, max_features=auto, max_depth=1, criterion=gini, ccp_alpha=2 
[CV]  n_estimators=1000, min_samples_split=2, min_samples_leaf=2, max_features=auto, max_depth=1, criterion=gini, ccp_alpha=2, total=   2.4s
[CV] n_estimators=1000, min_samples_split=2, min_samples_l

[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  1.2min finished


Best parameters: {'n_estimators': 200, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'log2', 'max_depth': 5, 'criterion': 'gini', 'ccp_alpha': 1}
Best cross-validation score: 1.00
Best estimator:
RandomForestClassifier(ccp_alpha=1, class_weight='balanced', max_depth=5,
                       max_features='log2', min_samples_leaf=4,
                       min_samples_split=10, n_estimators=200, random_state=0)


In [58]:
display(recall_score(ytrain, rf_grid_search.best_estimator_.predict(Xtrain)))
display(recall_score(ytest, rf_grid_search.best_estimator_.predict(Xtest)))

1.0

1.0

In [61]:
# rf_grid_search.cv_results_['mean_test_score']
rf_grid_search.cv_results_['mean_train_score']

array([0.66666667, 0.33333333, 1.        , 0.66666667, 0.9180791 ,
       0.8700565 , 0.33333333, 0.33333333, 0.96327684, 0.33333333])

In [65]:
rf_param_grid = {
    'max_depth': range(1, 10), 
    'max_features': ['auto', 'log2'], 
    'min_samples_leaf': [4, 5], 
    'min_samples_split': [10, 11], 
    'n_estimators': [150, 200, 250]
}

rf_grid_search = GridSearchCV(RandomForestClassifier(random_state=0, 
                                                  class_weight='balanced'), 
                                    param_grid=rf_param_grid, cv=skf, scoring='recall',
                                    verbose=2, return_train_score=True)
rf_grid_search.fit(Xtrain, ytrain)

print("Best parameters: {}".format(rf_grid_search.best_params_))
print("Best cross-validation score: {:.2f}".format(rf_grid_search.best_score_))
print("Best estimator:\n{}".format(rf_grid_search.best_estimator_))

Fitting 3 folds for each of 216 candidates, totalling 648 fits
[CV] max_depth=1, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=150 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  max_depth=1, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=150, total=   0.4s
[CV] max_depth=1, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=150 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s remaining:    0.0s


[CV]  max_depth=1, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=150, total=   0.3s
[CV] max_depth=1, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=150 
[CV]  max_depth=1, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=150, total=   0.3s
[CV] max_depth=1, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=200 
[CV]  max_depth=1, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=200, total=   0.4s
[CV] max_depth=1, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=200 
[CV]  max_depth=1, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=200, total=   0.5s
[CV] max_depth=1, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=200 
[CV]  max_depth=1, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=200, total=   0.4s
[CV] max_depth=1, max_features=auto, min_samples_lea

[CV]  max_depth=1, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=200, total=   0.4s
[CV] max_depth=1, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=250 
[CV]  max_depth=1, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=250, total=   0.5s
[CV] max_depth=1, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=250 
[CV]  max_depth=1, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=250, total=   0.5s
[CV] max_depth=1, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=250 
[CV]  max_depth=1, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=250, total=   0.4s
[CV] max_depth=1, max_features=log2, min_samples_leaf=4, min_samples_split=11, n_estimators=150 
[CV]  max_depth=1, max_features=log2, min_samples_leaf=4, min_samples_split=11, n_estimators=150, total=   0.3s
[CV] max_depth=1, max_features=log2, min_samples_lea

[CV]  max_depth=2, max_features=auto, min_samples_leaf=4, min_samples_split=11, n_estimators=150, total=   0.4s
[CV] max_depth=2, max_features=auto, min_samples_leaf=4, min_samples_split=11, n_estimators=150 
[CV]  max_depth=2, max_features=auto, min_samples_leaf=4, min_samples_split=11, n_estimators=150, total=   0.4s
[CV] max_depth=2, max_features=auto, min_samples_leaf=4, min_samples_split=11, n_estimators=150 
[CV]  max_depth=2, max_features=auto, min_samples_leaf=4, min_samples_split=11, n_estimators=150, total=   0.4s
[CV] max_depth=2, max_features=auto, min_samples_leaf=4, min_samples_split=11, n_estimators=200 
[CV]  max_depth=2, max_features=auto, min_samples_leaf=4, min_samples_split=11, n_estimators=200, total=   0.5s
[CV] max_depth=2, max_features=auto, min_samples_leaf=4, min_samples_split=11, n_estimators=200 
[CV]  max_depth=2, max_features=auto, min_samples_leaf=4, min_samples_split=11, n_estimators=200, total=   0.5s
[CV] max_depth=2, max_features=auto, min_samples_lea

[CV]  max_depth=2, max_features=log2, min_samples_leaf=4, min_samples_split=11, n_estimators=200, total=   0.4s
[CV] max_depth=2, max_features=log2, min_samples_leaf=4, min_samples_split=11, n_estimators=200 
[CV]  max_depth=2, max_features=log2, min_samples_leaf=4, min_samples_split=11, n_estimators=200, total=   0.5s
[CV] max_depth=2, max_features=log2, min_samples_leaf=4, min_samples_split=11, n_estimators=250 
[CV]  max_depth=2, max_features=log2, min_samples_leaf=4, min_samples_split=11, n_estimators=250, total=   0.8s
[CV] max_depth=2, max_features=log2, min_samples_leaf=4, min_samples_split=11, n_estimators=250 
[CV]  max_depth=2, max_features=log2, min_samples_leaf=4, min_samples_split=11, n_estimators=250, total=   0.8s
[CV] max_depth=2, max_features=log2, min_samples_leaf=4, min_samples_split=11, n_estimators=250 
[CV]  max_depth=2, max_features=log2, min_samples_leaf=4, min_samples_split=11, n_estimators=250, total=   0.7s
[CV] max_depth=2, max_features=log2, min_samples_lea

[CV]  max_depth=3, max_features=auto, min_samples_leaf=4, min_samples_split=11, n_estimators=250, total=   1.3s
[CV] max_depth=3, max_features=auto, min_samples_leaf=5, min_samples_split=10, n_estimators=150 
[CV]  max_depth=3, max_features=auto, min_samples_leaf=5, min_samples_split=10, n_estimators=150, total=   0.6s
[CV] max_depth=3, max_features=auto, min_samples_leaf=5, min_samples_split=10, n_estimators=150 
[CV]  max_depth=3, max_features=auto, min_samples_leaf=5, min_samples_split=10, n_estimators=150, total=   0.6s
[CV] max_depth=3, max_features=auto, min_samples_leaf=5, min_samples_split=10, n_estimators=150 
[CV]  max_depth=3, max_features=auto, min_samples_leaf=5, min_samples_split=10, n_estimators=150, total=   0.5s
[CV] max_depth=3, max_features=auto, min_samples_leaf=5, min_samples_split=10, n_estimators=200 
[CV]  max_depth=3, max_features=auto, min_samples_leaf=5, min_samples_split=10, n_estimators=200, total=   0.8s
[CV] max_depth=3, max_features=auto, min_samples_lea

[CV]  max_depth=3, max_features=log2, min_samples_leaf=5, min_samples_split=10, n_estimators=200, total=   0.5s
[CV] max_depth=3, max_features=log2, min_samples_leaf=5, min_samples_split=10, n_estimators=200 
[CV]  max_depth=3, max_features=log2, min_samples_leaf=5, min_samples_split=10, n_estimators=200, total=   0.5s
[CV] max_depth=3, max_features=log2, min_samples_leaf=5, min_samples_split=10, n_estimators=200 
[CV]  max_depth=3, max_features=log2, min_samples_leaf=5, min_samples_split=10, n_estimators=200, total=   0.5s
[CV] max_depth=3, max_features=log2, min_samples_leaf=5, min_samples_split=10, n_estimators=250 
[CV]  max_depth=3, max_features=log2, min_samples_leaf=5, min_samples_split=10, n_estimators=250, total=   0.6s
[CV] max_depth=3, max_features=log2, min_samples_leaf=5, min_samples_split=10, n_estimators=250 
[CV]  max_depth=3, max_features=log2, min_samples_leaf=5, min_samples_split=10, n_estimators=250, total=   0.6s
[CV] max_depth=3, max_features=log2, min_samples_lea

[CV]  max_depth=4, max_features=auto, min_samples_leaf=5, min_samples_split=10, n_estimators=250, total=   1.0s
[CV] max_depth=4, max_features=auto, min_samples_leaf=5, min_samples_split=10, n_estimators=250 
[CV]  max_depth=4, max_features=auto, min_samples_leaf=5, min_samples_split=10, n_estimators=250, total=   0.9s
[CV] max_depth=4, max_features=auto, min_samples_leaf=5, min_samples_split=11, n_estimators=150 
[CV]  max_depth=4, max_features=auto, min_samples_leaf=5, min_samples_split=11, n_estimators=150, total=   0.6s
[CV] max_depth=4, max_features=auto, min_samples_leaf=5, min_samples_split=11, n_estimators=150 
[CV]  max_depth=4, max_features=auto, min_samples_leaf=5, min_samples_split=11, n_estimators=150, total=   0.6s
[CV] max_depth=4, max_features=auto, min_samples_leaf=5, min_samples_split=11, n_estimators=150 
[CV]  max_depth=4, max_features=auto, min_samples_leaf=5, min_samples_split=11, n_estimators=150, total=   0.6s
[CV] max_depth=4, max_features=auto, min_samples_lea

[CV]  max_depth=4, max_features=log2, min_samples_leaf=5, min_samples_split=11, n_estimators=150, total=   0.4s
[CV] max_depth=4, max_features=log2, min_samples_leaf=5, min_samples_split=11, n_estimators=200 
[CV]  max_depth=4, max_features=log2, min_samples_leaf=5, min_samples_split=11, n_estimators=200, total=   0.6s
[CV] max_depth=4, max_features=log2, min_samples_leaf=5, min_samples_split=11, n_estimators=200 
[CV]  max_depth=4, max_features=log2, min_samples_leaf=5, min_samples_split=11, n_estimators=200, total=   0.6s
[CV] max_depth=4, max_features=log2, min_samples_leaf=5, min_samples_split=11, n_estimators=200 
[CV]  max_depth=4, max_features=log2, min_samples_leaf=5, min_samples_split=11, n_estimators=200, total=   0.6s
[CV] max_depth=4, max_features=log2, min_samples_leaf=5, min_samples_split=11, n_estimators=250 
[CV]  max_depth=4, max_features=log2, min_samples_leaf=5, min_samples_split=11, n_estimators=250, total=   0.7s
[CV] max_depth=4, max_features=log2, min_samples_lea

[CV]  max_depth=5, max_features=auto, min_samples_leaf=5, min_samples_split=11, n_estimators=250, total=   1.2s
[CV] max_depth=5, max_features=auto, min_samples_leaf=5, min_samples_split=11, n_estimators=250 
[CV]  max_depth=5, max_features=auto, min_samples_leaf=5, min_samples_split=11, n_estimators=250, total=   1.4s
[CV] max_depth=5, max_features=auto, min_samples_leaf=5, min_samples_split=11, n_estimators=250 
[CV]  max_depth=5, max_features=auto, min_samples_leaf=5, min_samples_split=11, n_estimators=250, total=   1.3s
[CV] max_depth=5, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=150 
[CV]  max_depth=5, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=150, total=   0.5s
[CV] max_depth=5, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=150 
[CV]  max_depth=5, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=150, total=   0.5s
[CV] max_depth=5, max_features=log2, min_samples_lea

[CV]  max_depth=6, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=150, total=   0.7s
[CV] max_depth=6, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=150 
[CV]  max_depth=6, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=150, total=   0.7s
[CV] max_depth=6, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=200 
[CV]  max_depth=6, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=200, total=   0.9s
[CV] max_depth=6, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=200 
[CV]  max_depth=6, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=200, total=   0.9s
[CV] max_depth=6, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=200 
[CV]  max_depth=6, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=200, total=   0.9s
[CV] max_depth=6, max_features=auto, min_samples_lea

[CV]  max_depth=6, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=200, total=   0.7s
[CV] max_depth=6, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=250 
[CV]  max_depth=6, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=250, total=   0.9s
[CV] max_depth=6, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=250 
[CV]  max_depth=6, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=250, total=   0.9s
[CV] max_depth=6, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=250 
[CV]  max_depth=6, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=250, total=   0.8s
[CV] max_depth=6, max_features=log2, min_samples_leaf=4, min_samples_split=11, n_estimators=150 
[CV]  max_depth=6, max_features=log2, min_samples_leaf=4, min_samples_split=11, n_estimators=150, total=   0.5s
[CV] max_depth=6, max_features=log2, min_samples_lea

[CV]  max_depth=7, max_features=auto, min_samples_leaf=4, min_samples_split=11, n_estimators=150, total=   0.7s
[CV] max_depth=7, max_features=auto, min_samples_leaf=4, min_samples_split=11, n_estimators=150 
[CV]  max_depth=7, max_features=auto, min_samples_leaf=4, min_samples_split=11, n_estimators=150, total=   0.7s
[CV] max_depth=7, max_features=auto, min_samples_leaf=4, min_samples_split=11, n_estimators=150 
[CV]  max_depth=7, max_features=auto, min_samples_leaf=4, min_samples_split=11, n_estimators=150, total=   0.7s
[CV] max_depth=7, max_features=auto, min_samples_leaf=4, min_samples_split=11, n_estimators=200 
[CV]  max_depth=7, max_features=auto, min_samples_leaf=4, min_samples_split=11, n_estimators=200, total=   1.0s
[CV] max_depth=7, max_features=auto, min_samples_leaf=4, min_samples_split=11, n_estimators=200 
[CV]  max_depth=7, max_features=auto, min_samples_leaf=4, min_samples_split=11, n_estimators=200, total=   1.0s
[CV] max_depth=7, max_features=auto, min_samples_lea

[CV]  max_depth=7, max_features=log2, min_samples_leaf=4, min_samples_split=11, n_estimators=200, total=   0.8s
[CV] max_depth=7, max_features=log2, min_samples_leaf=4, min_samples_split=11, n_estimators=200 
[CV]  max_depth=7, max_features=log2, min_samples_leaf=4, min_samples_split=11, n_estimators=200, total=   0.8s
[CV] max_depth=7, max_features=log2, min_samples_leaf=4, min_samples_split=11, n_estimators=250 
[CV]  max_depth=7, max_features=log2, min_samples_leaf=4, min_samples_split=11, n_estimators=250, total=   1.0s
[CV] max_depth=7, max_features=log2, min_samples_leaf=4, min_samples_split=11, n_estimators=250 
[CV]  max_depth=7, max_features=log2, min_samples_leaf=4, min_samples_split=11, n_estimators=250, total=   0.9s
[CV] max_depth=7, max_features=log2, min_samples_leaf=4, min_samples_split=11, n_estimators=250 
[CV]  max_depth=7, max_features=log2, min_samples_leaf=4, min_samples_split=11, n_estimators=250, total=   1.0s
[CV] max_depth=7, max_features=log2, min_samples_lea

[CV]  max_depth=8, max_features=auto, min_samples_leaf=4, min_samples_split=11, n_estimators=250, total=   1.3s
[CV] max_depth=8, max_features=auto, min_samples_leaf=5, min_samples_split=10, n_estimators=150 
[CV]  max_depth=8, max_features=auto, min_samples_leaf=5, min_samples_split=10, n_estimators=150, total=   0.8s
[CV] max_depth=8, max_features=auto, min_samples_leaf=5, min_samples_split=10, n_estimators=150 
[CV]  max_depth=8, max_features=auto, min_samples_leaf=5, min_samples_split=10, n_estimators=150, total=   0.7s
[CV] max_depth=8, max_features=auto, min_samples_leaf=5, min_samples_split=10, n_estimators=150 
[CV]  max_depth=8, max_features=auto, min_samples_leaf=5, min_samples_split=10, n_estimators=150, total=   0.8s
[CV] max_depth=8, max_features=auto, min_samples_leaf=5, min_samples_split=10, n_estimators=200 
[CV]  max_depth=8, max_features=auto, min_samples_leaf=5, min_samples_split=10, n_estimators=200, total=   1.0s
[CV] max_depth=8, max_features=auto, min_samples_lea

[CV]  max_depth=8, max_features=log2, min_samples_leaf=5, min_samples_split=10, n_estimators=200, total=   0.8s
[CV] max_depth=8, max_features=log2, min_samples_leaf=5, min_samples_split=10, n_estimators=200 
[CV]  max_depth=8, max_features=log2, min_samples_leaf=5, min_samples_split=10, n_estimators=200, total=   0.8s
[CV] max_depth=8, max_features=log2, min_samples_leaf=5, min_samples_split=10, n_estimators=200 
[CV]  max_depth=8, max_features=log2, min_samples_leaf=5, min_samples_split=10, n_estimators=200, total=   0.8s
[CV] max_depth=8, max_features=log2, min_samples_leaf=5, min_samples_split=10, n_estimators=250 
[CV]  max_depth=8, max_features=log2, min_samples_leaf=5, min_samples_split=10, n_estimators=250, total=   0.9s
[CV] max_depth=8, max_features=log2, min_samples_leaf=5, min_samples_split=10, n_estimators=250 
[CV]  max_depth=8, max_features=log2, min_samples_leaf=5, min_samples_split=10, n_estimators=250, total=   0.9s
[CV] max_depth=8, max_features=log2, min_samples_lea

[CV]  max_depth=9, max_features=auto, min_samples_leaf=5, min_samples_split=10, n_estimators=250, total=   1.3s
[CV] max_depth=9, max_features=auto, min_samples_leaf=5, min_samples_split=10, n_estimators=250 
[CV]  max_depth=9, max_features=auto, min_samples_leaf=5, min_samples_split=10, n_estimators=250, total=   1.5s
[CV] max_depth=9, max_features=auto, min_samples_leaf=5, min_samples_split=11, n_estimators=150 
[CV]  max_depth=9, max_features=auto, min_samples_leaf=5, min_samples_split=11, n_estimators=150, total=   0.9s
[CV] max_depth=9, max_features=auto, min_samples_leaf=5, min_samples_split=11, n_estimators=150 
[CV]  max_depth=9, max_features=auto, min_samples_leaf=5, min_samples_split=11, n_estimators=150, total=   1.0s
[CV] max_depth=9, max_features=auto, min_samples_leaf=5, min_samples_split=11, n_estimators=150 
[CV]  max_depth=9, max_features=auto, min_samples_leaf=5, min_samples_split=11, n_estimators=150, total=   1.0s
[CV] max_depth=9, max_features=auto, min_samples_lea

[CV]  max_depth=9, max_features=log2, min_samples_leaf=5, min_samples_split=11, n_estimators=150, total=   0.7s
[CV] max_depth=9, max_features=log2, min_samples_leaf=5, min_samples_split=11, n_estimators=200 
[CV]  max_depth=9, max_features=log2, min_samples_leaf=5, min_samples_split=11, n_estimators=200, total=   0.8s
[CV] max_depth=9, max_features=log2, min_samples_leaf=5, min_samples_split=11, n_estimators=200 
[CV]  max_depth=9, max_features=log2, min_samples_leaf=5, min_samples_split=11, n_estimators=200, total=   0.8s
[CV] max_depth=9, max_features=log2, min_samples_leaf=5, min_samples_split=11, n_estimators=200 
[CV]  max_depth=9, max_features=log2, min_samples_leaf=5, min_samples_split=11, n_estimators=200, total=   0.8s
[CV] max_depth=9, max_features=log2, min_samples_leaf=5, min_samples_split=11, n_estimators=250 
[CV]  max_depth=9, max_features=log2, min_samples_leaf=5, min_samples_split=11, n_estimators=250, total=   0.9s
[CV] max_depth=9, max_features=log2, min_samples_lea

[Parallel(n_jobs=1)]: Done 648 out of 648 | elapsed:  8.6min finished


Best parameters: {'max_depth': 2, 'max_features': 'auto', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 150}
Best cross-validation score: 0.87
Best estimator:
RandomForestClassifier(class_weight='balanced', max_depth=2, min_samples_leaf=4,
                       min_samples_split=10, n_estimators=150, random_state=0)


In [66]:
display(recall_score(ytrain, rf_grid_search.best_estimator_.predict(Xtrain)))
display(recall_score(ytest, rf_grid_search.best_estimator_.predict(Xtest)))

0.8926553672316384

0.8881578947368421

### 3. Support vector machine

In [20]:
svm_param_grid = {'C': np.logspace(-3,1,100)}

svm_rand_search = RandomizedSearchCV(SVC(random_state=0, class_weight='balanced'),
                                     svm_param_grid, 
                                     cv=skf, 
                                     scoring='recall')
svm_rand_search.fit(Xtrain, ytrain)

print("Best parameters: {}".format(svm_rand_search.best_params_))
print("Best cross-validation score: {:.2f}".format(svm_rand_search.best_score_))
print("Best estimator:\n{}".format(svm_rand_search.best_estimator_))

Best parameters: {'C': 0.05462277217684343}
Best cross-validation score: 0.89
Best estimator:
SVC(C=0.05462277217684343, class_weight='balanced', random_state=0)


In [21]:
display(recall_score(ytrain, svm_rand_search.best_estimator_.predict(Xtrain)))
display(recall_score(ytest, svm_rand_search.best_estimator_.predict(Xtest)))

0.9067796610169492

0.8947368421052632

### 4. Gaussian Naive Bayes 

In [22]:
clf = GaussianNB().fit(Xtrain, ytrain)

print("Training set score: {:.3f}".format(recall_score(ytrain, clf.predict(Xtrain))))
print("Test set score: {:.3f}".format(recall_score(ytest, clf.predict(Xtest))))

print('\nParameters currently in use:')
print(clf.get_params())

Training set score: 0.743
Test set score: 0.737

Parameters currently in use:
{'priors': None, 'var_smoothing': 1e-09}


In [50]:
gnb_param_grid = {'var_smoothing': np.logspace(-10,1,100)}

gnb_rand_search = RandomizedSearchCV(GaussianNB(),
                                     gnb_param_grid, 
                                     cv=skf, 
                                     scoring='recall')
gnb_rand_search.fit(Xtrain, ytrain)

print("Best parameters: {}".format(gnb_rand_search.best_params_))
print("Best cross-validation score: {:.2f}".format(gnb_rand_search.best_score_))
print("Best estimator:\n{}".format(gnb_rand_search.best_estimator_))

Best parameters: {'var_smoothing': 0.46415888336127725}
Best cross-validation score: 0.87
Best estimator:
GaussianNB(var_smoothing=0.46415888336127725)


In [51]:
display(recall_score(ytrain, gnb_rand_search.best_estimator_.predict(Xtrain)))
display(recall_score(ytest, gnb_rand_search.best_estimator_.predict(Xtest)))

0.8728813559322034

0.8618421052631579

In [52]:
confusion_matrix(ytest,gnb_rand_search.predict(Xtest))

array([[1017,  454],
       [  21,  131]])

In [53]:
clf = GaussianNB().fit(Xtrain_upsampled, ytrain_upsampled)

print("Training set score: {:.3f}".format(recall_score(ytrain_upsampled, clf.predict(Xtrain_upsampled))))
print("Test set score: {:.3f}".format(recall_score(ytest, clf.predict(Xtest))))

print('\nParameters currently in use:')
print(clf.get_params())

Training set score: 0.484
Test set score: 0.000

Parameters currently in use:
{'priors': None, 'var_smoothing': 1e-09}


In [54]:
gnb_param_grid = {'var_smoothing': np.logspace(-10,1,100)}

gnb_rand_search = RandomizedSearchCV(GaussianNB(),
                                     gnb_param_grid, 
                                     cv=skf, 
                                     scoring='recall')
gnb_rand_search.fit(Xtrain_upsampled, ytrain_upsampled)

print("Best parameters: {}".format(gnb_rand_search.best_params_))
print("Best cross-validation score: {:.2f}".format(gnb_rand_search.best_score_))
print("Best estimator:\n{}".format(gnb_rand_search.best_estimator_))

Best parameters: {'var_smoothing': 7.742636826811262e-05}
Best cross-validation score: 0.74
Best estimator:
GaussianNB(var_smoothing=7.742636826811262e-05)


In [55]:
display(recall_score(ytrain_upsampled, gnb_rand_search.best_estimator_.predict(Xtrain_upsampled)))
display(recall_score(ytest, gnb_rand_search.best_estimator_.predict(Xtest)))

0.7343431401106904

0.0

### 5. K nearest neighbour

In [30]:
knn = KNeighborsClassifier().fit(Xtrain, ytrain)

print("Training set score: {:.3f}".format(recall_score(ytrain, knn.predict(Xtrain))))
print("Test set score: {:.3f}".format(recall_score(ytest, knn.predict(Xtest))))

print('\nParameters currently in use:')
print(knn.get_params())

Training set score: 0.531
Test set score: 0.447

Parameters currently in use:
{'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 5, 'p': 2, 'weights': 'uniform'}


In [31]:
knn = KNeighborsClassifier().fit(Xtrain, ytrain)

print("Training set score: {:.3f}".format(recall_score(ytrain_upsampled, knn.predict(Xtrain_upsampled))))
print("Test set score: {:.3f}".format(recall_score(ytest, knn.predict(Xtest))))

print('\nParameters currently in use:')
print(knn.get_params())

Training set score: 0.925
Test set score: 0.447

Parameters currently in use:
{'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 5, 'p': 2, 'weights': 'uniform'}


In [38]:
knn_param_grid = {'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
                  'leaf_size': [10,30,50],
                  'n_neighbors': [3,5,7],
                  'weights': ['uniform', 'distance']
                 }

knn_rand_search = RandomizedSearchCV(KNeighborsClassifier(),
                                     knn_param_grid, 
                                     cv=skf, 
                                     scoring='recall')
knn_rand_search.fit(Xtrain, ytrain)

print("Best parameters: {}".format(knn_rand_search.best_params_))
print("Best cross-validation score: {:.2f}".format(knn_rand_search.best_score_))
print("Best estimator:\n{}".format(knn_rand_search.best_estimator_))

Best parameters: {'weights': 'uniform', 'n_neighbors': 3, 'leaf_size': 10, 'algorithm': 'ball_tree'}
Best cross-validation score: 0.42
Best estimator:
KNeighborsClassifier(algorithm='ball_tree', leaf_size=10, n_neighbors=3)


In [39]:
display(recall_score(ytrain, knn_rand_search.best_estimator_.predict(Xtrain)))
display(recall_score(ytest, knn_rand_search.best_estimator_.predict(Xtest)))

0.6271186440677966

0.45394736842105265

-------

# Drop features