In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
from scipy.stats import chi2_contingency
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, precision_recall_fscore_support
import warnings
import os


In [2]:
# Load the dataset: two Excel workbooks, read from paths relative to the
# notebook's working directory. a1/a2 are kept as the untouched originals.
a1 = pd.read_excel("../data/case_study1.xlsx")
a2 = pd.read_excel("../data/case_study2.xlsx")

In [3]:
# Work on copies so the frames loaded from disk (a1, a2) are never modified.
df1, df2 = a1.copy(), a2.copy()

In [4]:
# Remove nulls: -99999 is the placeholder used for a missing value, so keep
# only the rows whose Age_Oldest_TL is not that placeholder.
age_is_known = df1['Age_Oldest_TL'].ne(-99999)
df1 = df1.loc[age_is_known]

In [5]:
# Count the -99999 placeholders per column of df2 and collect every column
# that has more than 10000 of them (in original column order).
placeholder_counts = (df2 == -99999).sum()
columns_to_be_removed = list(placeholder_counts[placeholder_counts > 10000].index)

In [6]:
# Drop the columns listed in columns_to_be_removed, then keep only the rows
# that carry no -99999 placeholder in any of the remaining columns.
df2 = df2.drop(columns=columns_to_be_removed)
df2 = df2.loc[(df2 != -99999).all(axis=1)]

# Checking common column names
shared_columns = [name for name in df1.columns if name in df2.columns]
for name in shared_columns:
    print (name)


PROSPECTID


In [7]:
# Inner-join the two frames on PROSPECTID: only prospects present in both
# survive, so the join itself introduces no nulls.
df = df1.merge(df2, how='inner', on='PROSPECTID')

In [8]:
# Sanity check: total number of missing values across the whole merged frame.
df.isna().sum().sum()

np.int64(0)

In [9]:
# Print the name of every column whose dtype is object (the categorical ones)
for column_name, column_dtype in df.dtypes.items():
    if column_dtype == 'object':
        print(column_name)

MARITALSTATUS
EDUCATION
GENDER
last_prod_enq2
first_prod_enq2
Approved_Flag


In [10]:
# Chi-square test of independence between each categorical feature and the
# target Approved_Flag; prints the p-value per feature.
categorical_features = ['MARITALSTATUS', 'EDUCATION', 'GENDER', 'last_prod_enq2', 'first_prod_enq2']
for feature in categorical_features:
    contingency = pd.crosstab(df[feature], df['Approved_Flag'])
    chi2, pval, _, _ = chi2_contingency(contingency)
    print(feature, '---', pval)

MARITALSTATUS --- 3.578180861038862e-233
EDUCATION --- 2.6942265249737532e-30
GENDER --- 1.907936100186563e-05
last_prod_enq2 --- 0.0
first_prod_enq2 --- 7.84997610555419e-287


In [11]:
# Candidate columns for the VIF screen: every non-object column except the
# identifier and the target.
not_features = ['PROSPECTID','Approved_Flag']
numeric_columns = [
    name
    for name, dtype in df.dtypes.items()
    if dtype != 'object' and name not in not_features
]

In [12]:
# VIF sequentially check -- working state consumed by the screening loop:
#   vif_data           frame that shrinks as collinear columns are dropped
#   total_columns      number of candidates to examine
#   columns_to_be_kept names that pass the screen
#   column_index       position in vif_data of the column under test
vif_data = df[numeric_columns]
total_columns = len(vif_data.columns)
columns_to_be_kept, column_index = [], 0

In [13]:
# Walk the candidates in order. The column under test always sits at position
# column_index of vif_data: kept columns stay in front of it, and a dropped
# column is removed so the next candidate slides into the same position.
for candidate in numeric_columns[:total_columns]:

    vif_value = variance_inflation_factor(vif_data, column_index)
    print (column_index,'---',vif_value)

    if not (vif_value <= 6):
        # VIF above 6 (inf included): discard the column and test the next
        # candidate at the same position.
        vif_data = vif_data.drop(columns=[candidate])
        continue

    columns_to_be_kept.append(candidate)
    column_index += 1

  vif = 1. / (1. - r_squared_i)


0 --- inf


  vif = 1. / (1. - r_squared_i)


0 --- inf
0 --- 11.320180023967996
0 --- 8.363698035000336
0 --- 6.520647877790928
0 --- 5.149501618212625
1 --- 2.611111040579735


  vif = 1. / (1. - r_squared_i)


2 --- inf
2 --- 1788.7926256209232
2 --- 8.601028256477228
2 --- 3.8328007921530785
3 --- 6.099653381646739
3 --- 5.581352009642762
4 --- 1.985584353098778


  vif = 1. / (1. - r_squared_i)


5 --- inf
5 --- 4.809538302819343
6 --- 23.270628983464636
6 --- 30.595522588100053
6 --- 4.3843464059655854
7 --- 3.064658415523423
8 --- 2.898639771299253
9 --- 4.377876915347324
10 --- 2.207853583695844
11 --- 4.916914200506864
12 --- 5.214702030064725
13 --- 3.3861625024231476
14 --- 7.840583309478997
14 --- 5.255034641721438


  vif = 1. / (1. - r_squared_i)


15 --- inf
15 --- 7.380634506427232
15 --- 1.4210050015175733
16 --- 8.083255010190316
16 --- 1.6241227524040112
17 --- 7.257811920140003
17 --- 15.59624383268298
17 --- 1.825857047132431
18 --- 1.5080839450032664
19 --- 2.172088834824577
20 --- 2.62339755352723
21 --- 2.2959970812106176
22 --- 7.360578319196439
22 --- 2.1602387773102554
23 --- 2.8686288267891458
24 --- 6.458218003637277
24 --- 2.8474118865638265
25 --- 4.753198156284083
26 --- 16.22735475594825
26 --- 6.424377256363877
26 --- 8.887080381808687
26 --- 2.3804746142952653
27 --- 8.609513476514548
27 --- 13.06755093547673
27 --- 3.500040056654654
28 --- 1.9087955874813773
29 --- 17.006562234161628
29 --- 10.730485153719197
29 --- 2.3538497522950275
30 --- 22.104855915136433
30 --- 2.7971639638512906
31 --- 3.4241712032176985
32 --- 10.175021454450935
32 --- 6.408710354561301
32 --- 1.001151196262561
33 --- 3.069197305397274
34 --- 2.8091261600643715
35 --- 20.249538381980678
35 --- 15.864576541593774
35 --- 1.833164974053

In [14]:
# One-way ANOVA per VIF-surviving column: compare its values across the four
# Approved_Flag classes and keep the column when p <= 0.05.

from scipy.stats import f_oneway

columns_to_be_kept_numerical = []
approval_classes = ('P1', 'P2', 'P3', 'P4')

for candidate in columns_to_be_kept:
    # One sample of the candidate column per approval class
    samples = [
        df.loc[df['Approved_Flag'] == label, candidate].to_numpy()
        for label in approval_classes
    ]

    f_statistic, p_value = f_oneway(*samples)

    if p_value <= 0.05:
        columns_to_be_kept_numerical.append(candidate)

In [15]:
# F statistic left over from the last column tested in the ANOVA loop
# (the loop rebinds it on every iteration).
f_statistic

np.float64(507.29276705297787)

In [16]:
# p-value left over from the last column tested in the ANOVA loop.
p_value

np.float64(5e-324)

In [17]:
# Numeric columns that passed both the VIF screen and the ANOVA test (p <= 0.05).
columns_to_be_kept_numerical

['pct_tl_open_L6M',
 'pct_tl_closed_L6M',
 'Tot_TL_closed_L12M',
 'pct_tl_closed_L12M',
 'Tot_Missed_Pmnt',
 'CC_TL',
 'Home_TL',
 'PL_TL',
 'Secured_TL',
 'Unsecured_TL',
 'Other_TL',
 'Age_Oldest_TL',
 'Age_Newest_TL',
 'time_since_recent_payment',
 'max_recent_level_of_deliq',
 'num_deliq_6_12mts',
 'num_times_60p_dpd',
 'num_std_12mts',
 'num_sub',
 'num_sub_6mts',
 'num_sub_12mts',
 'num_dbt',
 'num_dbt_12mts',
 'num_lss',
 'recent_level_of_deliq',
 'CC_enq_L12m',
 'PL_enq_L12m',
 'time_since_recent_enq',
 'enq_L3m',
 'NETMONTHLYINCOME',
 'Time_With_Curr_Empr',
 'CC_Flag',
 'PL_Flag',
 'pct_PL_enq_L6m_of_ever',
 'pct_CC_enq_L6m_of_ever',
 'HL_Flag',
 'GL_Flag']

In [18]:

# listing all the final features: the numeric columns that survived the
# VIF + ANOVA screens plus the five categorical columns.
features = columns_to_be_kept_numerical + ['MARITALSTATUS', 'EDUCATION', 'GENDER', 'last_prod_enq2', 'first_prod_enq2']
# Take an explicit copy: this frame is modified in place afterwards (the
# EDUCATION encoding), and writing into a column-subset selection triggers
# pandas' SettingWithCopyWarning.
df = df[features + ['Approved_Flag']].copy()

In [19]:
# Label encoding for the categorical features: first inspect the distinct
# levels of each one.
for categorical_name in ['MARITALSTATUS', 'EDUCATION', 'GENDER', 'last_prod_enq2', 'first_prod_enq2']:
    print(df[categorical_name].unique())

['Married' 'Single']
['12TH' 'GRADUATE' 'SSC' 'POST-GRADUATE' 'UNDER GRADUATE' 'OTHERS'
 'PROFESSIONAL']
['M' 'F']
['PL' 'ConsumerLoan' 'AL' 'CC' 'others' 'HL']
['PL' 'ConsumerLoan' 'others' 'AL' 'HL' 'CC']


In [20]:
# Ordinal feature -- EDUCATION: each label is replaced by its rank.
# The rank given to 'OTHERS' has to be verified by the business end user.
education_rank = {
    'SSC': 1,
    '12TH': 2,
    'GRADUATE': 3,
    'UNDER GRADUATE': 3,
    'POST-GRADUATE': 4,
    'OTHERS': 1,
    'PROFESSIONAL': 3,
}

# Apply one label at a time; rows already replaced hold integers and can no
# longer match a later label.
for label, rank in education_rank.items():
    df.loc[df['EDUCATION'] == label, ['EDUCATION']] = rank

In [21]:
# NOTE(review): value_counts() is not the last expression of the cell, so its
# result is computed but never displayed.
df['EDUCATION'].value_counts()
# The column now holds the integer ranks written by the encoding step; cast it
# to an integer dtype, then print the frame summary.
df['EDUCATION'] = df['EDUCATION'].astype(int)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42064 entries, 0 to 42063
Data columns (total 43 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   pct_tl_open_L6M            42064 non-null  float64
 1   pct_tl_closed_L6M          42064 non-null  float64
 2   Tot_TL_closed_L12M         42064 non-null  int64  
 3   pct_tl_closed_L12M         42064 non-null  float64
 4   Tot_Missed_Pmnt            42064 non-null  int64  
 5   CC_TL                      42064 non-null  int64  
 6   Home_TL                    42064 non-null  int64  
 7   PL_TL                      42064 non-null  int64  
 8   Secured_TL                 42064 non-null  int64  
 9   Unsecured_TL               42064 non-null  int64  
 10  Other_TL                   42064 non-null  int64  
 11  Age_Oldest_TL              42064 non-null  int64  
 12  Age_Newest_TL              42064 non-null  int64  
 13  time_since_recent_payment  42064 non-null  int

In [22]:
# One-hot encode the four nominal categoricals; every other column is carried
# over as is.
nominal_columns = ['MARITALSTATUS','GENDER', 'last_prod_enq2' ,'first_prod_enq2']
df_encoded = pd.get_dummies(df, columns=nominal_columns)

df_encoded.info()
# Summary statistics of the encoded frame, bound to k for later inspection
k = df_encoded.describe()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42064 entries, 0 to 42063
Data columns (total 55 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   pct_tl_open_L6M               42064 non-null  float64
 1   pct_tl_closed_L6M             42064 non-null  float64
 2   Tot_TL_closed_L12M            42064 non-null  int64  
 3   pct_tl_closed_L12M            42064 non-null  float64
 4   Tot_Missed_Pmnt               42064 non-null  int64  
 5   CC_TL                         42064 non-null  int64  
 6   Home_TL                       42064 non-null  int64  
 7   PL_TL                         42064 non-null  int64  
 8   Secured_TL                    42064 non-null  int64  
 9   Unsecured_TL                  42064 non-null  int64  
 10  Other_TL                      42064 non-null  int64  
 11  Age_Oldest_TL                 42064 non-null  int64  
 12  Age_Newest_TL                 42064 non-null  int64  
 13  t

In [25]:
# Preview the first rows of the one-hot encoded frame.
df_encoded.head()

Unnamed: 0,pct_tl_open_L6M,pct_tl_closed_L6M,Tot_TL_closed_L12M,pct_tl_closed_L12M,Tot_Missed_Pmnt,CC_TL,Home_TL,PL_TL,Secured_TL,Unsecured_TL,...,last_prod_enq2_ConsumerLoan,last_prod_enq2_HL,last_prod_enq2_PL,last_prod_enq2_others,first_prod_enq2_AL,first_prod_enq2_CC,first_prod_enq2_ConsumerLoan,first_prod_enq2_HL,first_prod_enq2_PL,first_prod_enq2_others
0,0.0,0.0,0,0.0,0,0,0,4,1,4,...,False,False,True,False,False,False,False,False,True,False
1,0.0,0.0,0,0.0,0,0,0,0,0,1,...,True,False,False,False,False,False,True,False,False,False
2,0.125,0.0,0,0.0,1,0,0,0,2,6,...,True,False,False,False,False,False,False,False,False,True
3,0.0,0.0,0,0.0,0,0,0,0,3,0,...,False,False,False,False,True,False,False,False,False,False
4,0.0,0.0,1,0.167,0,0,0,0,6,0,...,True,False,False,False,False,False,False,False,True,False


In [73]:
# Separate the target (Y) from the predictor columns (X).
Y = df_encoded['Approved_Flag']
X = df_encoded.drop(columns=['Approved_Flag'])

In [None]:
# NOTE(review): the commented-out names below differ from important_features:
# pct_CC_enq_L6m_of_ever, last_prod_enq2_others, pct_tl_open_L6M and Other_TL
# appear only here, while num_times_60p_dpd, num_deliq_6_12mts, Age_Newest_TL
# and PL_Flag appear only in the list. Presumably this is an earlier selection;
# confirm which one is intended.
# enq_L3m
# Age_Oldest_TL
# time_since_recent_enq
# pct_PL_enq_L6m_of_ever
# num_std_12mts
# PL_enq_L12m
# max_recent_level_of_deliq
# recent_level_of_deliq
# GL_Flag
# pct_CC_enq_L6m_of_ever
# last_prod_enq2_ConsumerLoan
# Secured_TL
# last_prod_enq2_others
# pct_tl_open_L6M
# Other_TL

# These are the best features according to the feature importance.
# NOTE(review): the code that produced the ranking is not in this notebook.

important_features = [
    "enq_L3m",
    "Age_Oldest_TL",
    "num_std_12mts",
    "pct_PL_enq_L6m_of_ever",
    "time_since_recent_enq",
    "max_recent_level_of_deliq",
    "recent_level_of_deliq",
    "PL_enq_L12m",
    "Secured_TL",
    "last_prod_enq2_ConsumerLoan",
    "GL_Flag",
    "num_times_60p_dpd",
    "num_deliq_6_12mts",
    "Age_Newest_TL",
    "PL_Flag"
]

# Filter DataFrame to include only important features
X = X[important_features]


In [75]:
# 1. Random Forest: fit on an 80/20 hold-out split and report accuracy plus
# per-class precision / recall / F1.
x, y = X, Y
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

rf_classifier = RandomForestClassifier(n_estimators = 200, random_state=42)
rf_classifier.fit(x_train, y_train)
y_pred = rf_classifier.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)
print()
print(f'Accuracy: {accuracy}')
print()

# The metric arrays hold one entry per class, in sorted label order.
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred)

class_names = ['p1', 'p2', 'p3', 'p4']
for v, class_precision, class_recall, class_f1 in zip(class_names, precision, recall, f1_score):
    print(f"Class {v}:")
    print(f"Precision: {class_precision}")
    print(f"Recall: {class_recall}")
    print(f"F1 Score: {class_f1}")
    print()



Accuracy: 0.75846903601569

Class p1:
Precision: 0.8106637649619152
Recall: 0.7347140039447732
F1 Score: 0.7708225556130367

Class p2:
Precision: 0.808684819362876
Recall: 0.9006937561942517
F1 Score: 0.8522130532633159

Class p3:
Precision: 0.4307116104868914
Recall: 0.26037735849056604
F1 Score: 0.32455315145813735

Class p4:
Precision: 0.6955307262569832
Recall: 0.7259475218658892
F1 Score: 0.7104136947218259



In [76]:

# 2. XGBoost: the target is label-encoded to integers before fitting, then the
# same 80/20 hold-out evaluation is reported.
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

xgb_classifier = xgb.XGBClassifier(objective='multi:softmax',  num_class=4)

x, y = X, Y

# Map the class labels to integer codes for the classifier
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

x_train, x_test, y_train, y_test = train_test_split(x, y_encoded, test_size=0.2, random_state=42)

xgb_classifier.fit(x_train, y_train)
y_pred = xgb_classifier.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)
print()
print(f'Accuracy: {accuracy:.2f}')
print()

# The metric arrays hold one entry per encoded class, in ascending code order.
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred)

class_names = ['p1', 'p2', 'p3', 'p4']
for v, class_precision, class_recall, class_f1 in zip(class_names, precision, recall, f1_score):
    print(f"Class {v}:")
    print(f"Precision: {class_precision}")
    print(f"Recall: {class_recall}")
    print(f"F1 Score: {class_f1}")
    print()


Accuracy: 0.77

Class p1:
Precision: 0.8175965665236051
Recall: 0.7514792899408284
F1 Score: 0.7831449126413155

Class p2:
Precision: 0.8177769861061631
Recall: 0.910009910802775
F1 Score: 0.8614316540013135

Class p3:
Precision: 0.44019138755980863
Recall: 0.27773584905660376
F1 Score: 0.3405830633965757

Class p4:
Precision: 0.7138700290979632
Recall: 0.7152575315840622
F1 Score: 0.7145631067961165



In [77]:
# 3. Decision Tree: same 80/20 hold-out evaluation as the other baselines.
from sklearn.tree import DecisionTreeClassifier


y = Y
x = X

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)


# random_state is fixed, as for the split and the random forest, so the
# reported metrics are reproducible; without it the tree can differ between
# runs.
dt_model = DecisionTreeClassifier(max_depth=20, min_samples_split=10, random_state=42)
dt_model.fit(x_train, y_train)
y_pred = dt_model.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)
print ()
print(f"Accuracy: {accuracy:.2f}")
print ()

# The metric arrays hold one entry per class, in sorted label order.
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred)

for i, v in enumerate(['p1', 'p2', 'p3', 'p4']):
    print(f"Class {v}:")
    print(f"Precision: {precision[i]}")
    print(f"Recall: {recall[i]}")
    print(f"F1 Score: {f1_score[i]}")
    print()



Accuracy: 0.71

Class p1:
Precision: 0.7173699705593719
Recall: 0.7209072978303748
F1 Score: 0.7191342843089031

Class p2:
Precision: 0.806892568348094
Recall: 0.8307234886025768
F1 Score: 0.8186346322883095

Class p3:
Precision: 0.3465674110835401
Recall: 0.31622641509433963
F1 Score: 0.3307024467245462

Class p4:
Precision: 0.6579212916246215
Recall: 0.6336248785228377
F1 Score: 0.6455445544554456



In [78]:
# Show which feature columns X currently holds.
X.columns

Index(['enq_L3m', 'Age_Oldest_TL', 'time_since_recent_enq',
       'pct_PL_enq_L6m_of_ever', 'num_std_12mts', 'PL_enq_L12m',
       'max_recent_level_of_deliq', 'recent_level_of_deliq', 'GL_Flag',
       'pct_CC_enq_L6m_of_ever', 'last_prod_enq2_ConsumerLoan', 'Secured_TL',
       'last_prod_enq2_others', 'pct_tl_open_L6M', 'Other_TL'],
      dtype='object')

In [79]:
# xgboost gave the best results of the three baselines, so it is the model
# that gets tuned further. Before that, standardise the continuous columns.
# NOTE(review): each scaler is fitted on every row of df_encoded, i.e. before
# the data is split into train and test, and only the scaler of the last
# column stays bound to `scaler`.

from sklearn.preprocessing import StandardScaler

columns_to_be_scaled = [
    'Age_Oldest_TL', 'Age_Newest_TL', 'time_since_recent_payment',
    'max_recent_level_of_deliq', 'recent_level_of_deliq',
    'time_since_recent_enq', 'NETMONTHLYINCOME', 'Time_With_Curr_Empr',
]

for column_name in columns_to_be_scaled:
    # A fresh scaler per column, fed the column as a single-feature 2-D array
    scaler = StandardScaler()
    single_feature = df_encoded[column_name].to_numpy().reshape(-1, 1)
    df_encoded[column_name] = scaler.fit_transform(single_feature)

In [91]:
# Rebuild the target and the predictor matrix from the scaled frame.
Y = df_encoded['Approved_Flag']

# These are the best features according to the feature importance .
important_features = [
    "enq_L3m", "Age_Oldest_TL", "num_std_12mts",
    "pct_PL_enq_L6m_of_ever", "time_since_recent_enq",
    "max_recent_level_of_deliq", "recent_level_of_deliq",
    "PL_enq_L12m", "Secured_TL", "last_prod_enq2_ConsumerLoan",
    "GL_Flag", "num_times_60p_dpd", "num_deliq_6_12mts",
    "Age_Newest_TL", "PL_Flag",
]

# Drop the target, then keep only the important features
X = df_encoded.drop(columns=['Approved_Flag'])[important_features]


In [92]:




# XGBoost again, this time on the standardised features: label-encode the
# target, fit on an 80/20 hold-out split, report accuracy and per-class metrics.
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

xgb_classifier = xgb.XGBClassifier(objective='multi:softmax',  num_class=4)

x, y = X, Y

# Map the class labels to integer codes for the classifier
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

x_train, x_test, y_train, y_test = train_test_split(x, y_encoded, test_size=0.2, random_state=42)

xgb_classifier.fit(x_train, y_train)
y_pred = xgb_classifier.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# The metric arrays hold one entry per encoded class, in ascending code order.
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred)

class_names = ['p1', 'p2', 'p3', 'p4']
for v, class_precision, class_recall, class_f1 in zip(class_names, precision, recall, f1_score):
    print(f"Class {v}:")
    print(f"Precision: {class_precision}")
    print(f"Recall: {class_recall}")
    print(f"F1 Score: {class_f1}")
    print()


Accuracy: 0.77
Class p1:
Precision: 0.8294243070362474
Recall: 0.7672583826429981
F1 Score: 0.7971311475409836

Class p2:
Precision: 0.8263851290380798
Recall: 0.9076313181367691
F1 Score: 0.865104855469488

Class p3:
Precision: 0.4522776572668113
Recall: 0.31471698113207547
F1 Score: 0.3711615487316422

Class p4:
Precision: 0.7351778656126482
Recall: 0.7230320699708455
F1 Score: 0.7290543851053405



In [93]:
# Count how many test rows the classifier assigns to each class. The values
# are the integer codes produced by label_encoder, not the original labels.
dx = pd.DataFrame(xgb_classifier.predict(x_test))
dx.value_counts()

0
1    5541
3    1012
0     938
2     922
Name: count, dtype: int64

In [94]:
# # Hyperparameter tuning for xgboost (Used in the session)

# # Define the hyperparameter grid
# param_grid = {
#     'colsample_bytree': [0.1, 0.3, 0.5, 0.7, 0.9],
#     'learning_rate'   : [0.001, 0.01, 0.1, 1],
#     'max_depth'       : [3, 5, 8, 10],
#     'alpha'           : [1, 10, 100],
#     'n_estimators'    : [10,50,100]
# }

# index = 0

# answers_grid = {
#     'combination'       :[],
#     'train_Accuracy'    :[],
#     'test_Accuracy'     :[],
#     'colsample_bytree'  :[],
#     'learning_rate'     :[],
#     'max_depth'         :[],
#     'alpha'             :[],
#     'n_estimators'      :[]

#     }



# # Loop through each combination of hyperparameters
# for colsample_bytree in param_grid['colsample_bytree']:
#     for learning_rate in param_grid['learning_rate']:
#         for max_depth in param_grid['max_depth']:
#             for alpha in param_grid['alpha']:
#                 for n_estimators in param_grid['n_estimators']:
                    
#                     index = index + 1
                    
#                     # Define and train the XGBoost model
#                     model = xgb.XGBClassifier(objective='multi:softmax',  
#                                             num_class=4,
#                                             colsample_bytree = colsample_bytree,
#                                             learning_rate = learning_rate,
#                                             max_depth = max_depth,
#                                             alpha = alpha,
#                                             n_estimators = n_estimators)
                    
            
                            
#                     y = Y
#                     x = X

#                     label_encoder = LabelEncoder()
#                     y_encoded = label_encoder.fit_transform(y)


#                     x_train, x_test, y_train, y_test = train_test_split(x, y_encoded, test_size=0.2, random_state=42)


#                     model.fit(x_train, y_train)


            
#                     # Predict on training and testing sets
#                     y_pred_train = model.predict(x_train)
#                     y_pred_test = model.predict(x_test)
            
            
#                     # Calculate train and test results
                    
#                     train_accuracy =  accuracy_score (y_train, y_pred_train)
#                     test_accuracy  =  accuracy_score (y_test , y_pred_test)
                    
#                     # Include into the lists
#                     answers_grid ['combination']   .append(index)
#                     answers_grid ['train_Accuracy']    .append(train_accuracy)
#                     answers_grid ['test_Accuracy']     .append(test_accuracy)
#                     answers_grid ['colsample_bytree']   .append(colsample_bytree)
#                     answers_grid ['learning_rate']      .append(learning_rate)
#                     answers_grid ['max_depth']          .append(max_depth)
#                     answers_grid ['alpha']              .append(alpha)
#                     answers_grid ['n_estimators']       .append(n_estimators)
            
            
#                     # Print results for this combination
#                     print(f"Combination {index}")
#                     print(f"colsample_bytree: {colsample_bytree}, learning_rate: {learning_rate}, max_depth: {max_depth}, alpha: {alpha}, n_estimators: {n_estimators}")
#                     print(f"Train Accuracy: {train_accuracy:.2f}")
#                     print(f"Test Accuracy : {test_accuracy :.2f}")
#                     print("-" * 30)


# Hyperparameter tuning in xgboost: exhaustive grid search with 3-fold
# cross-validation on the training split, scored by accuracy.
from sklearn.model_selection import GridSearchCV
# Uses x and y_encoded as bound by the preceding xgboost cell.
x_train, x_test, y_train, y_test = train_test_split(x, y_encoded, test_size=0.2, random_state=42)

# Define the XGBClassifier with the initial set of hyperparameters
xgb_model = xgb.XGBClassifier(objective='multi:softmax', num_class=4)

# Define the hyperparameter grid
# 5 * 4 * 4 * 3 * 3 = 720 combinations, each fitted once per fold (cv=3).
param_grid = {
    'colsample_bytree': [0.1, 0.3, 0.5, 0.7, 0.9],
    'learning_rate'   : [0.001, 0.01, 0.1, 1],
    'max_depth'       : [3, 5, 8, 10],
    'alpha'           : [1, 10, 100],
    'n_estimators'    : [10,50,100]
}


# n_jobs=-1 runs the fits in parallel on all available cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=3, scoring='accuracy', n_jobs=-1)
grid_search.fit(x_train, y_train)

# Print the best hyperparameters
print("Best Hyperparameters:", grid_search.best_params_)

# Evaluate the model with the best hyperparameters on the test set
best_model = grid_search.best_estimator_
accuracy = best_model.score(x_test, y_test)
print("Test Accuracy:", accuracy)

# NOTE(review): an earlier note here recorded
# {'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 200}. Neither 0.2 nor
# 200 is in param_grid above, so that note came from a different search.

  _data = np.array(data, dtype=dtype, copy=copy,


Best Hyperparameters: {'alpha': 10, 'colsample_bytree': 0.3, 'learning_rate': 1, 'max_depth': 3, 'n_estimators': 100}
Test Accuracy: 0.7790324497801022


In [95]:
# Final model, built with the combination printed as "Best Hyperparameters" by
# the grid search (alpha=10, colsample_bytree=0.3, learning_rate=1,
# max_depth=3, n_estimators=100).
# NOTE(review): the earlier note kept below lists a different combination than
# the arguments actually used here.
# train , test , colsample_bytree , learning_rate , max_depth , alpha , n_estimators
# 0.88 , 0.8468693352352085 , 0.5 , 0.1 , 10 , 1 , 100 
new_model = xgb.XGBClassifier(objective='multi:softmax',  
                                    num_class=4,
                                    colsample_bytree = 0.3,
                                    learning_rate = 1,
                                    max_depth = 3,
                                    alpha = 10,
                                    n_estimators = 100)

In [96]:
# Fit the final model on the training split (x_train / y_train as bound by the
# most recently executed split).
new_model.fit(x_train, y_train)

In [97]:
import pickle as pkl

# Persist the fitted model to bestmodel.pkl in the working directory. The
# `with` block closes the file handle once the dump completes, so the pickle
# is fully flushed to disk instead of relying on garbage collection.
with open('bestmodel.pkl', 'wb') as model_file:
    pkl.dump(new_model, model_file)