In [1]:
import pandas as pd
import numpy as np
from numpy import mean, std

import warnings
warnings.filterwarnings("ignore")

pd.set_option("display.max_columns", 200)
pd.set_option("display.max_rows", 200)

pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [2]:
import math
import re
import requests
import random
import itertools

In [3]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
import statsmodels.api as sm
from sklearn import tree
from catboost import CatBoostClassifier
from sklearn.datasets import make_classification
import xgboost as xgb

In [4]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import jaccard_score
from sklearn.metrics import log_loss
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import plot_confusion_matrix

In [5]:
df = pd.read_csv('gender_classification.csv')
df.head()

Unnamed: 0,long_hair,forehead_width_cm,forehead_height_cm,nose_wide,nose_long,lips_thin,distance_nose_to_lip_long,gender
0,1,11.8,6.1,1,0,1,1,Male
1,0,14.0,5.4,0,0,1,0,Female
2,0,11.8,6.3,1,1,1,1,Male
3,0,14.4,6.1,0,1,1,1,Male
4,1,13.5,5.9,0,0,0,0,Female


In [6]:
df.shape

(5001, 8)

In [7]:
df.info(verbose = True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5001 entries, 0 to 5000
Data columns (total 8 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   long_hair                  5001 non-null   int64  
 1   forehead_width_cm          5001 non-null   float64
 2   forehead_height_cm         5001 non-null   float64
 3   nose_wide                  5001 non-null   int64  
 4   nose_long                  5001 non-null   int64  
 5   lips_thin                  5001 non-null   int64  
 6   distance_nose_to_lip_long  5001 non-null   int64  
 7   gender                     5001 non-null   object 
dtypes: float64(2), int64(5), object(1)
memory usage: 312.7+ KB


### Making Target into 1 & 0

In [8]:
df['gender'].value_counts()

Female    2501
Male      2500
Name: gender, dtype: int64

In [9]:
df['gender'].replace(to_replace="Male", value= 0, inplace = True)
df['gender'].replace(to_replace="Female", value= 1, inplace = True)

In [10]:
df['gender'].value_counts()

1    2501
0    2500
Name: gender, dtype: int64

### Checking the event rate

In [11]:
df_er = pd.DataFrame(df['gender'].value_counts())
df_er['EVENT_RATE'] = (df_er['gender'] / df_er['gender'].sum()) * 100
df_er

Unnamed: 0,gender,EVENT_RATE
1,2501,50.01
0,2500,49.99


In [12]:
dict_mv_count = {i: df[i].isnull().sum() for i in df.columns}

df_mv_count = pd.DataFrame.from_dict(dict_mv_count, orient = 'index', columns = ['MV'])

df_mv_count = df_mv_count.reset_index()

In [13]:
df_mv_count['mv_percent'] = (df_mv_count['MV'] / len(df)) * 100

df_mv_count.sort_values('mv_percent', ascending = False)

Unnamed: 0,index,MV,mv_percent
0,long_hair,0,0.0
1,forehead_width_cm,0,0.0
2,forehead_height_cm,0,0.0
3,nose_wide,0,0.0
4,nose_long,0,0.0
5,lips_thin,0,0.0
6,distance_nose_to_lip_long,0,0.0
7,gender,0,0.0


In [14]:
df_mv_final_count = df_mv_count[df_mv_count['mv_percent'] <= 5]

df_mv_final_count.reset_index(inplace = True)

mv_cols = df_mv_final_count['index'].to_list()

In [15]:
df = df[mv_cols]

### Split data into train, test

In [16]:
df_train = df.sample(frac = 0.8)
df_test = df.drop(df_train.index)

In [17]:
print('Training set: ', df_train.shape)
print('Testing set: ', df_test.shape)

Training set:  (4001, 8)
Testing set:  (1000, 8)


### Checking the event rate of training set

In [18]:
df_er1 = pd.DataFrame(df_train['gender'].value_counts())
df_er1['EVENT_RATE'] = (df_er1['gender'] / df_er1['gender'].sum()) * 100
df_er1

Unnamed: 0,gender,EVENT_RATE
0,2002,50.037
1,1999,49.963


### Checking the event rate of testing set

In [19]:
df_er2 = pd.DataFrame(df_test['gender'].value_counts())
df_er2['EVENT_RATE'] = (df_er2['gender'] / df_er2['gender'].sum()) * 100
df_er2

Unnamed: 0,gender,EVENT_RATE
1,502,50.2
0,498,49.8


### Dropping unwanted columns

In [20]:
cols_drop1 = []

for column in df_train.columns.to_list():
    if column in cols_drop1:
        df_train.drop(columns = column, axis = 1, inplace = True)
        df_test.drop(columns = column, axis = 1, inplace = True)
        
    else:
        pass

In [21]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4001 entries, 2543 to 3769
Data columns (total 8 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   long_hair                  4001 non-null   int64  
 1   forehead_width_cm          4001 non-null   float64
 2   forehead_height_cm         4001 non-null   float64
 3   nose_wide                  4001 non-null   int64  
 4   nose_long                  4001 non-null   int64  
 5   lips_thin                  4001 non-null   int64  
 6   distance_nose_to_lip_long  4001 non-null   int64  
 7   gender                     4001 non-null   int64  
dtypes: float64(2), int64(6)
memory usage: 281.3 KB


### Missing Value treatment

In [22]:
mv_dict = {}

for i in df_train.columns.to_list():
    if df_train[i].dtype == 'object':
        mv_dict.update({i: df_train[i].mode()[0]})
        
    else:
        mv_dict.update({i: df_train[i].median()})

In [23]:
mv_dict

{'long_hair': 1.0,
 'forehead_width_cm': 13.1,
 'forehead_height_cm': 5.9,
 'nose_wide': 0.0,
 'nose_long': 1.0,
 'lips_thin': 0.0,
 'distance_nose_to_lip_long': 0.0,
 'gender': 0.0}

In [24]:
for i in df_train.columns.to_list():
    df_train[i].fillna(mv_dict[i], inplace = True)
    
for i in df_test.columns.to_list():
    df_test[i].fillna(mv_dict[i], inplace = True)    

In [25]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4001 entries, 2543 to 3769
Data columns (total 8 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   long_hair                  4001 non-null   int64  
 1   forehead_width_cm          4001 non-null   float64
 2   forehead_height_cm         4001 non-null   float64
 3   nose_wide                  4001 non-null   int64  
 4   nose_long                  4001 non-null   int64  
 5   lips_thin                  4001 non-null   int64  
 6   distance_nose_to_lip_long  4001 non-null   int64  
 7   gender                     4001 non-null   int64  
dtypes: float64(2), int64(6)
memory usage: 281.3 KB


### Outlier Treatment

In [26]:
def describe_(df):
    df1 = df.describe().reset_index()
    df1.drop('index', inplace = True, axis = 1)
#     print(df1)
    
    return df1

def out_zscore(data):
    global outliers, zscore
    outliers = []
    zscore = []
    threshold = 3
    mean = np.mean(data)
    std = np.std(data)
    for i in data:
        if std != 0:
            z_score = (i - mean) / std
#             print(z_score)
            zscore.append(z_score)
            
            if np.abs(z_score) > threshold:
                outliers.append(i)
        
        else:
            pass
    
    return outliers

def treat_outlier(df, i):
    # Computing 10th, 90th percentiles and replacing the outliers
    tenth_percentile = np.percentile(df[i], 10)
    ninetieth_percentile = np.percentile(df[i], 90)
    b = np.where(df[i] < tenth_percentile, tenth_percentile, df[i])
    b = np.where(b >  ninetieth_percentile, ninetieth_percentile, b)
    df[i] = b
    return df

In [27]:
for i in describe_(df_train).columns:
    outliers = out_zscore(df_train[i])
    print("Total number of outliers in", i, ":", len(outliers))
    
print("-----------------------------------------------------------------------")
for i in describe_(df_train).columns:
    treat_outlier(df_train, i)
    
for i in describe_(df_train).columns:
    outliers = out_zscore(df_train[i])
    print("Total number of outliers in", i, ":", len(outliers))

Total number of outliers in long_hair : 0
Total number of outliers in forehead_width_cm : 0
Total number of outliers in forehead_height_cm : 0
Total number of outliers in nose_wide : 0
Total number of outliers in nose_long : 0
Total number of outliers in lips_thin : 0
Total number of outliers in distance_nose_to_lip_long : 0
Total number of outliers in gender : 0
-----------------------------------------------------------------------
Total number of outliers in long_hair : 0
Total number of outliers in forehead_width_cm : 0
Total number of outliers in forehead_height_cm : 0
Total number of outliers in nose_wide : 0
Total number of outliers in nose_long : 0
Total number of outliers in lips_thin : 0
Total number of outliers in distance_nose_to_lip_long : 0
Total number of outliers in gender : 0


In [28]:
for i in describe_(df_test).columns:
    outliers = out_zscore(df_test[i])
    print("Total number of outliers in", i, ":", len(outliers))
    
print("-----------------------------------------------------------------------")
for i in describe_(df_test).columns:
    treat_outlier(df_test, i)
    
for i in describe_(df_test).columns:
    outliers = out_zscore(df_test[i])
    print("Total number of outliers in", i, ":", len(outliers))

Total number of outliers in long_hair : 0
Total number of outliers in forehead_width_cm : 0
Total number of outliers in forehead_height_cm : 0
Total number of outliers in nose_wide : 0
Total number of outliers in nose_long : 0
Total number of outliers in lips_thin : 0
Total number of outliers in distance_nose_to_lip_long : 0
Total number of outliers in gender : 0
-----------------------------------------------------------------------
Total number of outliers in long_hair : 0
Total number of outliers in forehead_width_cm : 0
Total number of outliers in forehead_height_cm : 0
Total number of outliers in nose_wide : 0
Total number of outliers in nose_long : 0
Total number of outliers in lips_thin : 0
Total number of outliers in distance_nose_to_lip_long : 0
Total number of outliers in gender : 0


### Information Value

In [29]:
def iv_woe(data, target, bins, show_woe = False):
    
    # Empty Dataframe
    newDF, woeDF = pd.DataFrame(), pd.DataFrame()
    
    # Extract Column Names
    cols = data.columns
    
    # Run WOE and IV on all the indpendent variables
    for ivars in cols[~cols.isin([target])]:
        if (data[ivars].dtype.kind in 'bifc') and (len(np.unique(data[ivars])) > 10):
            binned_x = pd.qcut(data[ivars], bins, duplicates = 'drop')
            d0 = pd.DataFrame({'x': binned_x, 'y': data[target]})
            
        else:
            d0 = pd.DataFrame({'x': data[ivars], 'y': data[target]})
        
        d = d0.groupby("x", as_index = False).agg({"y": ["count", "sum"]})
        d.columns = ['Cutoff', 'N', 'Events']
        d['% of Events'] = np.maximum(d['Events'], 0.5) / d['Events'].sum()
        d['Non-Events'] = d['N'] - d['Events']
        d['% of Non-Events'] = np.maximum(d['Non-Events'], 0.5) / d['Non-Events'].sum()
        d['WoE'] = np.log(d['% of Events'] / d['% of Non-Events'])
        d['IV'] = d['WoE'] * (d['% of Events'] - d['% of Non-Events'])
        d.insert(loc = 0, column = "Variable", value = ivars)
        print("Information Value of " + ivars + " is " + str(round(d['IV'].sum(), 6)))
        temp = pd.DataFrame({"Variable" : [ivars], "IV" : [d['IV'].sum()]}, columns = ["Variable", "IV"])
        newDF = pd.concat([newDF, temp], axis = 0)
        woeDF = pd.concat([woeDF, d], axis = 0)
        
        # Show WOE Table
        if show_woe == True:
            print(d)
    return newDF, woeDF

In [30]:
iv, woe = iv_woe(data = df_train, target = 'gender', bins = 4, show_woe = True)

Information Value of long_hair is 0.000167
    Variable  Cutoff     N   Events  % of Events  Non-Events  % of Non-Events  \
0  long_hair   0.000   509  250.000        0.125     259.000            0.129   
1  long_hair   1.000  3492 1749.000        0.875    1743.000            0.871   

     WoE    IV  
0 -0.034 0.000  
1  0.005 0.000  
Information Value of forehead_width_cm is 0.508292
            Variable          Cutoff     N  Events  % of Events  Non-Events  \
0  forehead_width_cm  (11.699, 12.2]  1034 656.000        0.328     378.000   
1  forehead_width_cm    (12.2, 13.1]   968 555.000        0.278     413.000   
2  forehead_width_cm    (13.1, 14.0]  1014 584.000        0.292     430.000   
3  forehead_width_cm    (14.0, 14.8]   985 204.000        0.102     781.000   

   % of Non-Events    WoE    IV  
0            0.189  0.553 0.077  
1            0.206  0.297 0.021  
2            0.215  0.308 0.024  
3            0.390 -1.341 0.386  
Information Value of forehead_height_cm is 0.

In [31]:
iv.to_excel('IV.xlsx', index = False)

In [32]:
iv.sort_values(by = 'IV', ascending = False)

Unnamed: 0,Variable,IV
0,nose_wide,3.075
0,distance_nose_to_lip_long,2.963
0,lips_thin,2.901
0,nose_long,2.796
0,forehead_height_cm,0.565
0,forehead_width_cm,0.508
0,long_hair,0.0


### Information Gain

In [80]:
def calc_entropy(column):
    """
    Calculate entropy given a series, list, or numpy array.
    """
    # Compute the counts of each given value in the column
    counts = np.bincount(column)
    # Divide by the total column length to get a probability
    probabilities = counts / len(column)
    
    # Initialize the entropy to 0
    entropy = 0
    # Loop through the probabilities, and add each one to the total entropy
    for prob in probabilities:
        if prob > 0:
            # use log from math and set base to 2
            entropy += prob * math.log(prob, 2)
    
    return -entropy

In [81]:
def calc_information_gain(data, split_name, target_name):
    """
    Calculate information gain given a data set, column to split on and target.
    """
    # Calculate the original entropy
    original_entropy = calc_entropy(data[target_name])
    
    # Find the unique values in the column
    values = data[split_name].unique()
    
    # Make two subsets of the data, based on the unique values
#     print(data[data[split_name] == values[0]])
#     print()
#     print(data[data[split_name] == values[1]])

    print(split_name, ':', values)
    left_split = (data[data[split_name] == values[0]])
    right_split = (data[data[split_name] == values[1]])
    
    # Loop through the splits and calculate the subset entropies
    to_subtract = 0
    for subset in [left_split, right_split]:
        prob = (subset.shape[0] / data.shape[0])
        to_subtract += prob * calc_entropy(subset[target_name])
        
    # Return information gain
    return original_entropy - to_subtract

In [82]:
dict_inf_gain = {i: format(calc_information_gain(df_train, i, 'gender'), 'f') for i in df_train.columns}

df_inf_gain = pd.DataFrame.from_dict(dict_inf_gain, orient = 'index', columns = ['Inf Gain'])

df_inf_gain = df_inf_gain.reset_index()

long_hair : [0. 1.]
forehead_width_cm : [14.2 11.7 14.8 12.6 13.  14.1 12.8 12.9 14.4 11.8 12.4 12.1 13.7 13.3
 13.1 12.7 12.3 12.2 11.9 13.8 12.5 12.  14.7 13.6 14.5 14.6 14.  14.3
 13.4 13.9 13.2 13.5]
forehead_height_cm : [6.  5.7 6.7 5.2 5.8 6.4 5.4 6.1 5.3 5.6 6.5 5.9 5.5 6.3 6.6 6.2]
nose_wide : [1. 0.]
nose_long : [1. 0.]
lips_thin : [0. 1.]
distance_nose_to_lip_long : [1. 0.]
gender : [0. 1.]


In [83]:
df_inf_gain1 = df_inf_gain[df_inf_gain['Inf Gain'].astype('float') >= 0.15]
df_inf_gain1.sort_values(by = 'Inf Gain', ascending = False).reset_index()

Unnamed: 0,level_0,index,Inf Gain
0,7,gender,1.0
1,2,forehead_height_cm,0.888118
2,1,forehead_width_cm,0.878515
3,3,nose_wide,0.476554
4,6,distance_nose_to_lip_long,0.462213
5,5,lips_thin,0.453936
6,4,nose_long,0.440128


In [84]:
df_inf_gain.to_excel('Information Gain.xlsx', index = False)

### Variance Inflation Factor

In [33]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

def calc_vif(df1):
    # Calculating VIF
    vif = pd.DataFrame()
    vif["Variables"] = df1.columns
    vif["VIF"] = [float(format(variance_inflation_factor(df1.values, i), 'f')) for i in range(df1.shape[1])]
    
    return vif

In [34]:
df2 = df_train._get_numeric_data()

VIF = calc_vif(df2)

In [35]:
VIF.sort_values('VIF', ascending = False)

Unnamed: 0,Variables,VIF
1,forehead_width_cm,94.005
2,forehead_height_cm,92.192
7,gender,10.505
0,long_hair,7.758
3,nose_wide,4.694
6,distance_nose_to_lip_long,4.526
4,nose_long,4.423
5,lips_thin,4.417


In [36]:
VIF.to_csv('VIF.csv', index = False)

In [37]:
cols2 = VIF[VIF['VIF'] < 15]['Variables'].to_list()

df3 = df2[cols2].copy()
VIF2 = calc_vif(df3)

In [38]:
VIF2.sort_values('VIF', ascending = False)

Unnamed: 0,Variables,VIF
0,long_hair,6.554
1,nose_wide,3.766
2,nose_long,3.744
3,lips_thin,3.713
4,distance_nose_to_lip_long,3.671
5,gender,3.187


### Chi-Square Test

In [39]:
catg = df_train.select_dtypes(include = ['object'])
catg.head()

2543
3855
3853
4484
818


In [40]:
from scipy.stats import chi2_contingency

for col in catg.columns:
    contigency = pd.crosstab('gender', catg[col])
    c, p, dof, expected = chi2_contigency(contigency, correction = False)
    
    print(col)
    if p > 0.05:
        # There is multi-colinearity between variables
        print("p-value: ", round(p, 3))
        print("Accept H0")
    else:
        # There is no multi-colinearity between variables
        print("p-value: ", round(p, 3))
        print("Reject H0")
    
    print()

### Making bins based on the WoE

In [41]:
model_var = [
    'long_hair',
    'forehead_width_cm',
    'forehead_height_cm',
    'nose_wide',
    'nose_long',
    'lips_thin',
    'distance_nose_to_lip_long',
]

In [42]:
# import scorecardpy as sc

# appended_data = []
# for col in model_var:
# #     print(col)
#     Max = df_train[[col, 'gender']]
# #     print(Max)
#     Max_Del = sc.woebin(Max, y = 'gender')
# #     print(Max_Del)
#     new = Max_Del[col]
#     # store dataframe in list
#     appended_data.append(new)
#     print()
    
    
# appended_data = pd.concat(appended_data)

# appended_data.to_excel('WOE_BIN.xlsx', index = False)

In [43]:
df_train

Unnamed: 0,long_hair,forehead_width_cm,forehead_height_cm,nose_wide,nose_long,lips_thin,distance_nose_to_lip_long,gender
2543,0.000,14.200,6.000,1.000,1.000,0.000,1.000,0.000
3855,0.000,11.700,5.700,1.000,0.000,1.000,1.000,0.000
3853,1.000,14.800,6.700,0.000,1.000,1.000,1.000,0.000
4484,1.000,12.600,5.200,0.000,0.000,1.000,0.000,1.000
818,1.000,13.000,5.800,0.000,0.000,0.000,0.000,1.000
...,...,...,...,...,...,...,...,...
2307,1.000,14.700,5.200,1.000,1.000,1.000,0.000,0.000
3516,1.000,12.500,5.200,1.000,1.000,1.000,1.000,0.000
364,1.000,14.300,6.400,1.000,1.000,1.000,1.000,0.000
1995,1.000,13.000,5.200,1.000,1.000,0.000,1.000,0.000


### Binning of continuous variable

In [44]:
# df_train[''] = np.where((df_train[''] <= 12), "LT12",
#                np.where((12 < df_train['']) & (df_train[''] <= 22), "12_22", "GT22"))

# df_test[''] = np.where((df_test[''] <= 12), "LT12",
#                np.where((12 < df_test['']) & (df_test[''] <= 22), "12_22", "GT22"))

### Binning of categorical variable

In [45]:
# # Making values of categorical variable to upper case
# df_train[''] = df_train[''].str.upper()
# df_test[''] = df_test[''].str.upper()

In [46]:
# conditions = [(df_train[''].isin(['', ''])),
#               (df_train[''].isin(['', '']))]
# values = ['Group1', 'Group2']
# df_train['_BIN'] = np.select(conditions, values, default = 'Group3')

# conditions = [(df_test[''].isin(['', ''])),
#               (df_test[''].isin(['', '']))]
# values = ['Group1', 'Group2']
# df_test['_BIN'] = np.select(conditions, values, default = 'Group3')

### Label Encoding

In [47]:
# df_train[''] = df_train[''].replace(to_replace = "", value = )
# df_test[''] = df_test[''].replace(to_replace = "", value = )

# df_train[''] = df_train[''].replace(to_replace = "", value = )
# df_test[''] = df_test[''].replace(to_replace = "", value = )

In [48]:
# Making a copy of train & test dataframe
train = df_train.copy()
test = df_test.copy()


### Model Development

In [49]:
df_train = train.copy()
df_test = test.copy()

In [50]:
model_var = [
    'long_hair',
    'forehead_width_cm',
    'forehead_height_cm',
    'nose_wide',
    'nose_long',
    'lips_thin',
    'distance_nose_to_lip_long',
]

### One Hot Encoding

In [51]:
df_train1 = df_train[model_var]

cat_vars = [i for i in df_train1.columns if df_train[i].dtype == 'object']

for var in cat_vars:
    cat_list = 'var' + '_' + var
    cat_list = pd.get_dummoes(df_train1[var], prefix = var)
    df_train1 = df_train1.join(cat_list)
    
df_train1 = df_train1.drop(cat_vars, axis = 1)

df_test1 = df_test[model_var]

for var in cat_vars:
    cat_list = 'var' + '_' + var
    cat_list = pd.get_dummoes(df_test1[var], prefix = var)
    df_test1 = df_test1.join(cat_list)
    
df_test1 = df_test1.drop(cat_vars, axis = 1)

print("Training set: ", df_train1.shape)
print("Testing set: ", df_test1.shape)

Training set:  (4001, 7)
Testing set:  (1000, 7)


In [52]:
def maximum(a, b):
    if a != b:
        if a[0] >= b[0]:
            largest = a[1]
        else:
            largest = b[1]
            
        return largest
    else:
        print("Equal shapes")
        
tr1 = [len(df_train1.columns), "train"]
te1 = [len(df_test1.columns), "test"]

largest = maximum(tr1, te1)
print("Maximum shape of", largest, "dataframe!!!")

Maximum shape of train dataframe!!!


In [53]:
tr_col = set(df_train1.columns.to_list())
te_col = set(df_test1.columns.to_list())

if largest == 'train':
    x = tr_col.intersection(te_col)
    for i in list(tr_col):
        if i not in list(x):
            print("test:", i)
            df_test1[i] = 0

if largest == 'test':
    x = te_col.intersection(tr_col)
    for i in list(tr_col):
        if i not in list(x):
            print("train:", i)
            df_train1[i] = 0

else:
    pass

print("----------------------------------------")
print("Training set: ", df_train1.shape)
print("Testing set: ", df_test1.shape)

----------------------------------------
Training set:  (4001, 7)
Testing set:  (1000, 7)


In [54]:
# # Adding label encoded column to dataframe
# df_train1[''] = train['']
# df_test1[''] = test['']

# print("Training set: ", df_train1.shape)
# print("Testing set: ", df_test1.shape)

In [55]:
X_train = df_train1.copy()
y_train = df_train['gender']
print("Shape of X_train: ", X_train.shape)
print("Shape of y_train: ", y_train.shape)
print("\n")

X_test = df_test1.copy()
y_test = df_test['gender']
print("Shape of X_test: ", X_test.shape)
print("Shape of y_test: ", y_test.shape)

Shape of X_train:  (4001, 7)
Shape of y_train:  (4001,)


Shape of X_test:  (1000, 7)
Shape of y_test:  (1000,)


### Logistic Regression

In [56]:
lr = LogisticRegression().fit(X_train, y_train)

y_pred_lr1 = lr.predict(X_train)
y_pred_lr = lr.predict(X_test)

print("Accuracy Score for train:", accuracy_score(y_train, y_pred_lr1) * 100)
print("Accuracy Score for test:", accuracy_score(y_test, y_pred_lr) * 100)
print()

print("ROC AUC Score for train:", roc_auc_score(y_train, y_pred_lr1) * 100)
print("ROC AUC Score for test:", roc_auc_score(y_test, y_pred_lr) * 100)
print()

print("Precision for train:", precision_score(y_train, y_pred_lr1) * 100)
print("Precision for test:", precision_score(y_test, y_pred_lr) * 100)
print()

print("Recall for train:", recall_score(y_train, y_pred_lr1) * 100)
print("Recall for test:", recall_score(y_test, y_pred_lr) * 100)

Accuracy Score for train: 96.80079980004999
Accuracy Score for test: 97.0

ROC AUC Score for train: 96.8009479265107
ROC AUC Score for test: 97.0023520376326

Precision for train: 96.61185849526657
Precision for test: 97.58064516129032

Recall for train: 96.99849924962481
Recall for test: 96.41434262948208


In [57]:
print("Training:")
print(confusion_matrix(y_train, y_pred_lr1))
tn, fp, fn, tp = confusion_matrix(y_train, y_pred_lr1).ravel()

accuracy = (tn + tp) / (tp + fp + tn + fn)
print("Accuracy for train:", accuracy)

# Recall
sensitivity = (tp) / (tp + fn)
print("Sensitivity for train:", sensitivity)

specificity = (tn) / (tn + fp)
print("Specificity for train:", specificity)

precision = (tp) / (tp + fp)
print("Precision for train:", specificity)

Training:
[[1934   68]
 [  60 1939]]
Accuracy for train: 0.9680079980004999
Sensitivity for train: 0.9699849924962481
Specificity for train: 0.9660339660339661
Precision for train: 0.9660339660339661


In [58]:
print("Testing:")
print(confusion_matrix(y_test, y_pred_lr))
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_lr).ravel()

accuracy = (tn + tp) / (tp + fp + tn + fn)
print("Accuracy for test:", accuracy)

# Recall
sensitivity = (tp) / (tp + fn)
print("Sensitivity for test:", sensitivity)

specificity = (tn) / (tn + fp)
print("Specificity for test:", specificity)

precision = (tp) / (tp + fp)
print("Precision for test:", specificity)

Testing:
[[486  12]
 [ 18 484]]
Accuracy for test: 0.97
Sensitivity for test: 0.9641434262948207
Specificity for test: 0.9759036144578314
Precision for test: 0.9759036144578314


### K-Fold Cross Validation

In [59]:
def kfold(splits, scoring):
    cv = KFold(n_splits = splits, random_state = 1, shuffle = True)
    #create model
    model = LogisticRegression().fit(X_train, y_train)
    # evaluate model
    scores = cross_val_score(model, X_train, y_train, scoring = scoring, cv = cv, n_jobs = -1)
    return scores

In [60]:
# report performance
print("Accuracy: %.3f (%.3f)" %(mean(kfold(4, 'accuracy')), std(kfold(4, 'accuracy'))))
print('Accuracy', kfold(4, 'accuracy'))
print()

print("ROC-AUC: %.3f (%.3f)" %(mean(kfold(4, 'roc_auc')), std(kfold(4, 'roc_auc'))))
print('ROC-AUC', kfold(4, 'roc_auc'))
print()

print("Precison: %.3f (%.3f)" %(mean(kfold(4, 'precision')), std(kfold(4, 'precision'))))
print('Precison', kfold(4, 'precision'))
print()

print("Recall: %.3f (%.3f)" %(mean(kfold(4, 'recall')), std(kfold(4, 'recall'))))
print('Recall', kfold(4, 'recall'))
print()

Accuracy: 0.966 (0.004)
Accuracy [0.95904096 0.969      0.966      0.97      ]

ROC-AUC: 0.996 (0.001)
ROC-AUC [0.99473333 0.99673077 0.99586974 0.99666735]

Precison: 0.965 (0.007)
Precison [0.95463918 0.97109827 0.97       0.96579477]

Recall: 0.966 (0.005)
Recall [0.96058091 0.96923077 0.96230159 0.97363083]



### KS Table

In [61]:
X_train_prob = list(lr.predict_proba(X_train)[:, 1])
X_test_prob = list(lr.predict_proba(X_test)[:, 1])

train['Prob_score_LR'] = X_train_prob
test['Prob_score_LR'] = X_test_prob

def ks(data=None,target=None, prob=None):
    data['target0'] = 1 - data[target]
    data['bucket'] = pd.qcut(data[prob], 10)
    grouped = data.groupby('bucket', as_index = False)
    kstable = pd.DataFrame()
    kstable['min_prob'] = grouped.min()[prob]
    kstable['max_prob'] = grouped.max()[prob]
    kstable['events']   = grouped.sum()[target]
    kstable['nonevents'] = grouped.sum()['target0']
    kstable = kstable.sort_values(by="min_prob", ascending=False).reset_index(drop = True)
    kstable['event_rate'] = (kstable.events / data[target].sum()).apply('{0:.2%}'.format)
    kstable['nonevent_rate'] = (kstable.nonevents / data['target0'].sum()).apply('{0:.2%}'.format)
    kstable['cum_eventrate']=(kstable.events / data[target].sum()).cumsum()
    kstable['cum_noneventrate']=(kstable.nonevents / data['target0'].sum()).cumsum()
    kstable['KS'] = np.round(kstable['cum_eventrate']-kstable['cum_noneventrate'], 3) * 100

    #Formating
    kstable['cum_eventrate']= kstable['cum_eventrate'].apply('{0:.2%}'.format)
    kstable['cum_noneventrate']= kstable['cum_noneventrate'].apply('{0:.2%}'.format)
    
    kstable['ks_stats'] = np.round(((kstable['events'] / kstable['events'].sum()).cumsum() - \
                                   (kstable['nonevents'] / kstable['nonevents'].sum()).cumsum()), 4) * 100
    kstable['max_ks'] = kstable['ks_stats'].apply(lambda x: '***' if x == kstable['ks_stats'].max() else '')
    
    kstable.index = range(1,11)
    kstable.index.rename('Decile', inplace=True)
    pd.set_option('display.max_columns', 20)
#     print(kstable)
#     print(type(kstable))
    
    #Display KS
    from colorama import Fore
#     print(Fore.RED + "KS is " + str(max(kstable['KS']))+"%"+ " at decile " + str((kstable.index[kstable['KS']==max(kstable['KS'])][0])))
#     print(type(kstable))
    
    return(kstable)

lr_train_ks = ks(train, 'gender', 'Prob_score_LR')
lr_test_ks = ks(test, 'gender', 'Prob_score_LR')

In [62]:
lr_train_ks

Unnamed: 0_level_0,min_prob,max_prob,events,nonevents,event_rate,nonevent_rate,cum_eventrate,cum_noneventrate,KS,ks_stats,max_ks
Decile,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,1.0,1.0,397.0,0.0,19.86%,0.00%,19.86%,0.00%,19.9,19.86,
2,0.999,1.0,403.0,0.0,20.16%,0.00%,40.02%,0.00%,40.0,40.02,
3,0.994,0.999,399.0,1.0,19.96%,0.05%,59.98%,0.05%,59.9,59.93,
4,0.97,0.994,397.0,3.0,19.86%,0.15%,79.84%,0.20%,79.6,79.64,
5,0.51,0.97,338.0,62.0,16.91%,3.10%,96.75%,3.30%,93.5,93.45,***
6,0.025,0.507,62.0,338.0,3.10%,16.88%,99.85%,20.18%,79.7,79.67,
7,0.005,0.025,3.0,397.0,0.15%,19.83%,100.00%,40.01%,60.0,59.99,
8,0.001,0.005,0.0,400.0,0.00%,19.98%,100.00%,59.99%,40.0,40.01,
9,0.0,0.001,0.0,399.0,0.00%,19.93%,100.00%,79.92%,20.1,20.08,
10,0.0,0.0,0.0,402.0,0.00%,20.08%,100.00%,100.00%,0.0,0.0,


In [63]:
lr_test_ks

Unnamed: 0_level_0,min_prob,max_prob,events,nonevents,event_rate,nonevent_rate,cum_eventrate,cum_noneventrate,KS,ks_stats,max_ks
Decile,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0.999,1.0,98.0,0.0,19.52%,0.00%,19.52%,0.00%,19.5,19.52,
2,0.999,0.999,102.0,0.0,20.32%,0.00%,39.84%,0.00%,39.8,39.84,
3,0.992,0.999,100.0,0.0,19.92%,0.00%,59.76%,0.00%,59.8,59.76,
4,0.971,0.992,99.0,1.0,19.72%,0.20%,79.48%,0.20%,79.3,79.28,
5,0.481,0.97,87.0,13.0,17.33%,2.61%,96.81%,2.81%,94.0,94.0,***
6,0.033,0.468,15.0,85.0,2.99%,17.07%,99.80%,19.88%,79.9,79.92,
7,0.006,0.033,1.0,99.0,0.20%,19.88%,100.00%,39.76%,60.2,60.24,
8,0.001,0.006,0.0,99.0,0.00%,19.88%,100.00%,59.64%,40.4,40.36,
9,0.0,0.001,0.0,101.0,0.00%,20.28%,100.00%,79.92%,20.1,20.08,
10,0.0,0.0,0.0,100.0,0.00%,20.08%,100.00%,100.00%,0.0,0.0,


### Statsmodel Logistic Regression

In [64]:
logit_model = sm.Logit(y_train, X_train)
result = logit_model.fit()
print(result.summary())

Optimization terminated successfully.
         Current function value: 0.094215
         Iterations 9
                           Logit Regression Results                           
Dep. Variable:                 gender   No. Observations:                 4001
Model:                          Logit   Df Residuals:                     3994
Method:                           MLE   Df Model:                            6
Date:                Sun, 13 Nov 2022   Pseudo R-squ.:                  0.8641
Time:                        13:04:30   Log-Likelihood:                -376.95
converged:                       True   LL-Null:                       -2773.3
Covariance Type:            nonrobust   LLR p-value:                     0.000
                                coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------
long_hair                     0.9433      0.283      3.328      0.001       0.

In [65]:
y_pred_result1 = result.predict(X_train)
y_pred_result = result.predict(X_test)

print("ROC AUC Score for train: ", roc_auc_score(y_train, y_pred_result1) * 100)
print("ROC AUC Score for test: ", roc_auc_score(y_test, y_pred_result) * 100)

ROC AUC Score for train:  99.24833795519139
ROC AUC Score for test:  99.28718859501751


### KS Table

In [66]:
# X_train_prob = list(result.predict_proba(X_train)[:, 1])
# X_test_prob = list(result.predict_proba(X_test)[:, 1])

train['Prob_score_Stats'] = y_pred_result1
test['Prob_score_Stats'] = y_pred_result

def ks(data=None,target=None, prob=None):
    data['target0'] = 1 - data[target]
    data['bucket'] = pd.qcut(data[prob], 10)
    grouped = data.groupby('bucket', as_index = False)
    kstable = pd.DataFrame()
    kstable['min_prob'] = grouped.min()[prob]
    kstable['max_prob'] = grouped.max()[prob]
    kstable['events']   = grouped.sum()[target]
    kstable['nonevents'] = grouped.sum()['target0']
    kstable = kstable.sort_values(by="min_prob", ascending=False).reset_index(drop = True)
    kstable['event_rate'] = (kstable.events / data[target].sum()).apply('{0:.2%}'.format)
    kstable['nonevent_rate'] = (kstable.nonevents / data['target0'].sum()).apply('{0:.2%}'.format)
    kstable['cum_eventrate']=(kstable.events / data[target].sum()).cumsum()
    kstable['cum_noneventrate']=(kstable.nonevents / data['target0'].sum()).cumsum()
    kstable['KS'] = np.round(kstable['cum_eventrate']-kstable['cum_noneventrate'], 3) * 100

    #Formating
    kstable['cum_eventrate']= kstable['cum_eventrate'].apply('{0:.2%}'.format)
    kstable['cum_noneventrate']= kstable['cum_noneventrate'].apply('{0:.2%}'.format)
    
    kstable['ks_stats'] = np.round(((kstable['events'] / kstable['events'].sum()).cumsum() - \
                                   (kstable['nonevents'] / kstable['nonevents'].sum()).cumsum()), 4) * 100
    kstable['max_ks'] = kstable['ks_stats'].apply(lambda x: '***' if x == kstable['ks_stats'].max() else '')
    
    kstable.index = range(1,11)
    kstable.index.rename('Decile', inplace=True)
    pd.set_option('display.max_columns', 20)
#     print(kstable)
#     print(type(kstable))
    
    #Display KS
    from colorama import Fore
#     print(Fore.RED + "KS is " + str(max(kstable['KS']))+"%"+ " at decile " + str((kstable.index[kstable['KS']==max(kstable['KS'])][0])))
#     print(type(kstable))
    
    return(kstable)

stats_train_ks = ks(train, 'gender', 'Prob_score_Stats')
stats_test_ks = ks(test, 'gender', 'Prob_score_Stats')

In [67]:
stats_train_ks

Unnamed: 0_level_0,min_prob,max_prob,events,nonevents,event_rate,nonevent_rate,cum_eventrate,cum_noneventrate,KS,ks_stats,max_ks
Decile,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0.999,1.0,398.0,1.0,19.91%,0.05%,19.91%,0.05%,19.9,19.86,
2,0.999,0.999,401.0,0.0,20.06%,0.00%,39.97%,0.05%,39.9,39.92,
3,0.988,0.999,397.0,3.0,19.86%,0.15%,59.83%,0.20%,59.6,59.63,
4,0.967,0.988,389.0,11.0,19.46%,0.55%,79.29%,0.75%,78.5,78.54,
5,0.492,0.967,319.0,81.0,15.96%,4.05%,95.25%,4.80%,90.5,90.45,***
6,0.028,0.492,85.0,314.0,4.25%,15.68%,99.50%,20.48%,79.0,79.02,
7,0.008,0.028,10.0,391.0,0.50%,19.53%,100.00%,40.01%,60.0,59.99,
8,0.001,0.008,0.0,397.0,0.00%,19.83%,100.00%,59.84%,40.2,40.16,
9,0.0,0.001,0.0,403.0,0.00%,20.13%,100.00%,79.97%,20.0,20.03,
10,0.0,0.0,0.0,401.0,0.00%,20.03%,100.00%,100.00%,-0.0,-0.0,


In [68]:
stats_test_ks

Unnamed: 0_level_0,min_prob,max_prob,events,nonevents,event_rate,nonevent_rate,cum_eventrate,cum_noneventrate,KS,ks_stats,max_ks
Decile,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0.999,1.0,100.0,0.0,19.92%,0.00%,19.92%,0.00%,19.9,19.92,
2,0.999,0.999,99.0,0.0,19.72%,0.00%,39.64%,0.00%,39.6,39.64,
3,0.987,0.999,101.0,0.0,20.12%,0.00%,59.76%,0.00%,59.8,59.76,
4,0.967,0.987,98.0,2.0,19.52%,0.40%,79.28%,0.40%,78.9,78.88,
5,0.487,0.967,79.0,21.0,15.74%,4.22%,95.02%,4.62%,90.4,90.4,***
6,0.027,0.484,22.0,78.0,4.38%,15.66%,99.40%,20.28%,79.1,79.12,
7,0.009,0.027,2.0,98.0,0.40%,19.68%,99.80%,39.96%,59.8,59.84,
8,0.001,0.008,1.0,99.0,0.20%,19.88%,100.00%,59.84%,40.2,40.16,
9,0.0,0.001,0.0,100.0,0.00%,20.08%,100.00%,79.92%,20.1,20.08,
10,0.0,0.0,0.0,100.0,0.00%,20.08%,100.00%,100.00%,-0.0,-0.0,


### Gaussian NB

In [69]:
gnb = GaussianNB().fit(X_train, y_train)

y_pred_gnb1 = gnb.predict(X_train)
y_pred_gnb = gnb.predict(X_test)

print("Accuracy Score for train:", accuracy_score(y_train, y_pred_gnb1) * 100)
print("Accuracy Score for test:", accuracy_score(y_test, y_pred_gnb) * 100)
print()

print("ROC AUC Score for train:", roc_auc_score(y_train, y_pred_gnb1) * 100)
print("ROC AUC Score for test:", roc_auc_score(y_test, y_pred_gnb) * 100)
print()

print("Precision for train:", precision_score(y_train, y_pred_gnb1) * 100)
print("Precision for test:", precision_score(y_test, y_pred_gnb) * 100)
print()

print("Recall for train:", recall_score(y_train, y_pred_gnb1) * 100)
print("Recall for test:", recall_score(y_test, y_pred_gnb) * 100)

Accuracy Score for train: 97.07573106723319
Accuracy Score for test: 96.7

ROC AUC Score for train: 97.0758980889046
ROC AUC Score for test: 96.7019472311557

Precision for train: 96.86254980079681
Precision for test: 97.1830985915493

Recall for train: 97.29864932466234
Recall for test: 96.21513944223108


### Decision Tree

In [70]:
clf = tree.DecisionTreeClassifier()
dt = clf.fit(X_train, y_train)

y_pred_dt1 = dt.predict(X_train)
y_pred_dt = dt.predict(X_test)

print("Accuracy Score for train:", accuracy_score(y_train, y_pred_dt1) * 100)
print("Accuracy Score for test:", accuracy_score(y_test, y_pred_dt) * 100)
print()

print("ROC AUC Score for train:", roc_auc_score(y_train, y_pred_dt1) * 100)
print("ROC AUC Score for test:", roc_auc_score(y_test, y_pred_dt) * 100)
print()

print("Precision for train:", precision_score(y_train, y_pred_dt1) * 100)
print("Precision for test:", precision_score(y_test, y_pred_dt) * 100)
print()

print("Recall for train:", recall_score(y_train, y_pred_dt1) * 100)
print("Recall for test:", recall_score(y_test, y_pred_dt) * 100)

Accuracy Score for train: 99.75006248437892
Accuracy Score for test: 96.0

ROC AUC Score for train: 99.74991241874683
ROC AUC Score for test: 95.99913598617577

Precision for train: 99.94977398292315
Precision for test: 95.83333333333334

Recall for train: 99.54977488744372
Recall for test: 96.21513944223108


### Cat Boost Classifier

In [71]:
catB = CatBoostClassifier(iterations = 5, learning_rate = 0.1).fit(X_train, y_train)

y_pred_catB1 = catB.predict(X_train)
y_pred_catB = catB.predict(X_test)
print("------------------------------------------------------------------")
print()

print("Accuracy Score for train:", accuracy_score(y_train, y_pred_catB1) * 100)
print("Accuracy Score for test:", accuracy_score(y_test, y_pred_catB) * 100)
print()

print("ROC AUC Score for train:", roc_auc_score(y_train, y_pred_catB1) * 100)
print("ROC AUC Score for test:", roc_auc_score(y_test, y_pred_catB) * 100)
print()

print("Precision for train:", precision_score(y_train, y_pred_catB1) * 100)
print("Precision for test:", precision_score(y_test, y_pred_catB) * 100)
print()

print("Recall for train:", recall_score(y_train, y_pred_catB1) * 100)
print("Recall for test:", recall_score(y_test, y_pred_catB) * 100)

0:	learn: 0.6124840	total: 145ms	remaining: 581ms
1:	learn: 0.5433267	total: 149ms	remaining: 224ms
2:	learn: 0.4891970	total: 153ms	remaining: 102ms
3:	learn: 0.4413603	total: 156ms	remaining: 38.9ms
4:	learn: 0.4021030	total: 159ms	remaining: 0us
------------------------------------------------------------------

Accuracy Score for train: 96.57585603599101
Accuracy Score for test: 97.0

ROC AUC Score for train: 96.57606025790119
ROC AUC Score for test: 96.99835197363159

Precision for train: 96.31840796019901
Precision for test: 96.6403162055336

Recall for train: 96.84842421210605
Recall for test: 97.41035856573706


### Random Forest

In [72]:
rf = RandomForestClassifier().fit(X_train, y_train)

y_pred_rf1 = rf.predict(X_train)
y_pred_rf = rf.predict(X_test)

print("Accuracy Score for train:", accuracy_score(y_train, y_pred_rf1) * 100)
print("Accuracy Score for test:", accuracy_score(y_test, y_pred_rf) * 100)
print()

print("ROC AUC Score for train:", roc_auc_score(y_train, y_pred_rf1) * 100)
print("ROC AUC Score for test:", roc_auc_score(y_test, y_pred_rf) * 100)
print()

print("Precision for train:", precision_score(y_train, y_pred_rf1) * 100)
print("Precision for test:", precision_score(y_test, y_pred_rf) * 100)
print()

print("Recall for train:", recall_score(y_train, y_pred_rf1) * 100)
print("Recall for test:", recall_score(y_test, y_pred_rf) * 100)

Accuracy Score for train: 99.75006248437892
Accuracy Score for test: 96.89999999999999

ROC AUC Score for train: 99.75013730641544
ROC AUC Score for test: 96.89555032880527

Precision for train: 99.65052421367947
Precision for test: 95.90643274853801

Recall for train: 99.84992496248124
Recall for test: 98.00796812749005


### XGBoost

In [73]:
xgb_cl = xgb.XGBClassifier().fit(X_train, y_train)

y_pred_xgb_cl1 = xgb_cl.predict(X_train)
y_pred_xgb_cl = xgb_cl.predict(X_test)

print("Accuracy Score for train:", accuracy_score(y_train, y_pred_xgb_cl1) * 100)
print("Accuracy Score for test:", accuracy_score(y_test, y_pred_xgb_cl) * 100)
print()

print("ROC AUC Score for train:", roc_auc_score(y_train, y_pred_xgb_cl1) * 100)
print("ROC AUC Score for test:", roc_auc_score(y_test, y_pred_xgb_cl) * 100)
print()

print("Precision for train:", precision_score(y_train, y_pred_xgb_cl1) * 100)
print("Precision for test:", precision_score(y_test, y_pred_xgb_cl) * 100)
print()

print("Recall for train:", recall_score(y_train, y_pred_xgb_cl1) * 100)
print("Recall for test:", recall_score(y_test, y_pred_xgb_cl) * 100)

Accuracy Score for train: 99.70007498125469
Accuracy Score for test: 96.3

ROC AUC Score for train: 99.70014977518731
ROC AUC Score for test: 96.29874077985248

Precision for train: 99.60059910134798
Precision for test: 96.03960396039604

Recall for train: 99.79989994997499
Recall for test: 96.61354581673307


### XGBoost with hyperparameters

In [74]:
xgb_cl = xgb.XGBClassifier(objective = 'binary:logistic', booster = 'gbtree', learning_rate = 0.1,
                          max_depth = 5, base_score = 0.5, n_estimators = 200, 
                          nim_parallel_tree = 5, eval_metric = 'auc',
                          tree_method = 'hist', grow_policy = 'lossguide').fit(X_train, y_train)

y_pred_xgb_cl1 = xgb_cl.predict(X_train)
y_pred_xgb_cl = xgb_cl.predict(X_test)

print("Accuracy Score for train:", accuracy_score(y_train, y_pred_xgb_cl1) * 100)
print("Accuracy Score for test:", accuracy_score(y_test, y_pred_xgb_cl) * 100)
print()

print("ROC AUC Score for train:", roc_auc_score(y_train, y_pred_xgb_cl1) * 100)
print("ROC AUC Score for test:", roc_auc_score(y_test, y_pred_xgb_cl) * 100)
print()

print("Precision for train:", precision_score(y_train, y_pred_xgb_cl1) * 100)
print("Precision for test:", precision_score(y_test, y_pred_xgb_cl) * 100)
print()

print("Recall for train:", recall_score(y_train, y_pred_xgb_cl1) * 100)
print("Recall for test:", recall_score(y_test, y_pred_xgb_cl) * 100)

Parameters: { "nim_parallel_tree" } are not used.

Accuracy Score for train: 99.30017495626093
Accuracy Score for test: 96.5

ROC AUC Score for train: 99.30021204408396
ROC AUC Score for test: 96.49794396710347

Precision for train: 99.2503748125937
Precision for test: 96.05522682445759

Recall for train: 99.34967483741872
Recall for test: 97.01195219123507
