# Import libraries:

In [1]:

!pip install xgboost

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
from scipy.stats import chi2_contingency
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, precision_recall_fscore_support
import warnings
import os
import time 

In [2]:
# Announce the start of the run and record the wall-clock start time;
# the last cell subtracts it from the end time to report the total runtime.
print("Program is running....")
print()
start_time = time.time()

In [3]:
# Load the dataset
# Both workbooks are read with paths relative to the current working directory.
# They are merged on PROSPECTID further down.

a1 = pd.read_excel("internal_product_CRM.xlsx")
a2 = pd.read_excel("Cibil_dataset_CRM.xlsx")

In [4]:
# Work on copies so the frames loaded from Excel (a1, a2) stay untouched.
df1 = a1.copy()
df2 = a2.copy()


In [5]:
# Remove nulls: drop the rows whose Age_Oldest_TL holds the -99999 placeholder

has_valid_age = df1['Age_Oldest_TL'].ne(-99999)
df1 = df1[has_valid_age]

In [6]:
# Columns of df2 in which the -99999 placeholder shows up in more than 10000 rows
columns_to_be_removed = [
    column
    for column in df2.columns
    if (df2[column] == -99999).sum() > 10000
]

In [7]:
# Discard the columns flagged above as mostly placeholder values
df2 = df2.drop(columns=columns_to_be_removed)

In [8]:
# Keep only the rows in which none of the remaining columns holds the
# -99999 placeholder (one combined mask instead of one filter pass per column)
rows_without_placeholder = df2.ne(-99999).all(axis=1)
df2 = df2.loc[rows_without_placeholder]

In [9]:
# Print every column name that df1 shares with df2

df2_column_names = list(df2.columns)
for name in df1.columns:
    if name in df2_column_names:
        print(name)

In [10]:
# Inner join on PROSPECTID: only prospects present in both frames are kept,
# so the merge itself introduces no nulls

df = df1.merge(df2, how='inner', on='PROSPECTID')

In [11]:
# List the categorical columns, i.e. those stored with the 'object' dtype

object_columns = [name for name in df.columns if df[name].dtype == 'object']
for name in object_columns:
    print(name)

In [12]:
# Chi-square test of independence between each categorical feature and Approved_Flag

categorical_features = ['MARITALSTATUS', 'EDUCATION', 'GENDER', 'last_prod_enq2', 'first_prod_enq2']
for feature in categorical_features:
    contingency_table = pd.crosstab(df[feature], df['Approved_Flag'])
    chi2, pval, _, _ = chi2_contingency(contingency_table)
    print(feature, '---', pval)

- Since every categorical feature has a p-value <= 0.05, its association with `Approved_Flag` is statistically significant, so we keep all of them.

In [13]:
# VIF for numerical columns: collect every non-object column except the
# PROSPECTID merge key and the Approved_Flag target

excluded_columns = ['PROSPECTID', 'Approved_Flag']
numeric_columns = [
    name
    for name in df.columns
    if df[name].dtype != 'object' and name not in excluded_columns
]

In [14]:
# VIF sequentially check
# State used by the sequential VIF loop in the next cell:
#   vif_data           - working frame; failing columns are dropped from it
#   total_columns      - number of candidate columns at the start
#   columns_to_be_kept - names of the columns that pass the check
#   column_index       - position inside vif_data of the column under test

vif_data = df[numeric_columns]
total_columns = vif_data.shape[1]
columns_to_be_kept = []
column_index = 0

In [15]:
# Visit the candidate columns in their original order. `i` indexes
# numeric_columns, while `column_index` is the position of that same column
# inside the (possibly already reduced) vif_data frame.
# NOTE(review): vif_data holds only the feature columns, no constant/intercept
# column is added before computing VIF - confirm this is intended.
for i in range (0,total_columns):
    
    vif_value = variance_inflation_factor(vif_data, column_index)
    print (column_index,'---',vif_value)
    
    
    # VIF of at most 6: keep the column and advance to the next position.
    if vif_value <= 6:
        columns_to_be_kept.append( numeric_columns[i] )
        column_index = column_index+1
    
    # Otherwise drop the column; the following column slides into
    # `column_index`, which is why the index is left unchanged here.
    else:
        vif_data = vif_data.drop([ numeric_columns[i] ] , axis=1)

In [16]:
# One-way ANOVA for every column that survived the VIF screen: keep the column
# when its values differ significantly (p <= 0.05) across the four
# Approved_Flag groups P1..P4

from scipy.stats import f_oneway

columns_to_be_kept_numerical = []
flag_values = list(df['Approved_Flag'])

for column in columns_to_be_kept:
    column_values = list(df[column])

    # One sample per Approved_Flag level, in the order P1, P2, P3, P4
    samples = [
        [value for value, flag in zip(column_values, flag_values) if flag == level]
        for level in ('P1', 'P2', 'P3', 'P4')
    ]

    f_statistic, p_value = f_oneway(*samples)

    if p_value <= 0.05:
        columns_to_be_kept_numerical.append(column)

In [17]:
# Show the numerical features that passed both the VIF and the ANOVA screens.
print(columns_to_be_kept_numerical )

- Feature selection is now complete for both the categorical and the numerical features.

In [18]:
# listing all the final features: the numerical columns that passed the
# VIF/ANOVA screens plus the five categorical columns

features = columns_to_be_kept_numerical + ['MARITALSTATUS', 'EDUCATION', 'GENDER', 'last_prod_enq2', 'first_prod_enq2']

# Take an explicit copy. The EDUCATION column of `df` is overwritten in place
# further down; writing into a column selection of the merged frame raises
# pandas' SettingWithCopyWarning and is not guaranteed to update `df`.
df = df[features + ['Approved_Flag']].copy()


In [19]:
# Label encoding for the categorical features
# The bare list below is not assigned to anything; as the last expression of
# the cell it is only displayed, as a reminder of the columns to encode.

['MARITALSTATUS', 'EDUCATION', 'GENDER' , 'last_prod_enq2' ,'first_prod_enq2']

In [20]:
# Inspect the distinct values of each categorical feature before encoding them
# (one cell per feature so that each result is displayed).
df['MARITALSTATUS'].unique()    


In [21]:
df['EDUCATION'].unique()


In [22]:
df['GENDER'].unique()


In [23]:
df['last_prod_enq2'].unique()


In [24]:
df['first_prod_enq2'].unique()

In [25]:
# Ordinal feature -- EDUCATION
# SSC            : 1
# 12TH           : 2
# GRADUATE       : 3
# UNDER GRADUATE : 3
# POST-GRADUATE  : 4
# OTHERS         : 1
# PROFESSIONAL   : 3

In [26]:
# Ordinal codes for EDUCATION, applied one label at a time in the listed order
education_codes = {
    'SSC': 1,
    '12TH': 2,
    'GRADUATE': 3,
    'UNDER GRADUATE': 3,
    'POST-GRADUATE': 4,
    'OTHERS': 1,
    'PROFESSIONAL': 3,
}
for education_label, education_code in education_codes.items():
    df.loc[df['EDUCATION'] == education_label, ['EDUCATION']] = education_code

In [27]:
# Check which codes are now present in EDUCATION and how often each occurs.
df['EDUCATION'].value_counts()

In [28]:

# EDUCATION held string labels before the encoding above; cast the codes to int.
df['EDUCATION'] = df['EDUCATION'].astype(int)
df.info()

In [29]:
# One-hot encode the remaining categorical features into indicator columns.
df_encoded = pd.get_dummies(df, columns=['MARITALSTATUS','GENDER', 'last_prod_enq2' ,'first_prod_enq2'])

In [30]:
df_encoded.info()

In [31]:
# Summary statistics are stored in `k`; nothing is displayed because the
# result is assigned rather than left as the cell's last expression.
k = df_encoded.describe()

### Machine Learning model fitting
- Data processing

### 1. Random Forest

In [32]:


# Split the encoded frame into predictors and target
x = df_encoded.drop(columns=['Approved_Flag'])
y = df_encoded['Approved_Flag']

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Fit a 200-tree random forest and predict the held-out rows
rf_classifier = RandomForestClassifier(n_estimators=200, random_state=42)
rf_classifier.fit(x_train, y_train)
y_pred = rf_classifier.predict(x_test)

# Overall accuracy
accuracy = accuracy_score(y_test, y_pred)
print()
print(f'Accuracy: {accuracy}')
print()

# Per-class precision, recall and F1, reported in the order p1..p4
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred)
class_labels = ['p1', 'p2', 'p3', 'p4']
for i in range(len(class_labels)):
    v = class_labels[i]
    print(f"Class {v}:")
    print(f"Precision: {precision[i]}")
    print(f"Recall: {recall[i]}")
    print(f"F1 Score: {f1_score[i]}")
    print()

## 2. xgboost

In [33]:


import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

# Predictors and target
x = df_encoded.drop(columns=['Approved_Flag'])
y = df_encoded['Approved_Flag']

# Encode the string class labels as integers before handing them to XGBoost
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split)
x_train, x_test, y_train, y_test = train_test_split(x, y_encoded, test_size=0.2, random_state=42)

# Four-class softmax classifier with otherwise default hyperparameters
xgb_classifier = xgb.XGBClassifier(objective='multi:softmax', num_class=4)
xgb_classifier.fit(x_train, y_train)
y_pred = xgb_classifier.predict(x_test)

# Overall accuracy
accuracy = accuracy_score(y_test, y_pred)
print()
print(f'Accuracy: {accuracy:.2f}')
print()

# Per-class precision, recall and F1, reported in the order p1..p4
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred)
class_labels = ['p1', 'p2', 'p3', 'p4']
for i in range(len(class_labels)):
    v = class_labels[i]
    print(f"Class {v}:")
    print(f"Precision: {precision[i]}")
    print(f"Recall: {recall[i]}")
    print(f"F1 Score: {f1_score[i]}")
    print()

## 3. Decision Tree

In [34]:

from sklearn.tree import DecisionTreeClassifier

# Predictors and target
y = df_encoded['Approved_Flag']
x = df_encoded.drop(['Approved_Flag'], axis=1)

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Depth-limited tree. random_state is fixed like for the other models:
# scikit-learn permutes the candidate features at every split, so without a
# seed the fitted tree (and the metrics below) can change from run to run.
dt_model = DecisionTreeClassifier(max_depth=20, min_samples_split=10, random_state=42)
dt_model.fit(x_train, y_train)
y_pred = dt_model.predict(x_test)

# Overall accuracy
accuracy = accuracy_score(y_test, y_pred)
print ()
print(f"Accuracy: {accuracy:.2f}")
print ()

# Per-class precision, recall and F1, reported in the order p1..p4
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred)

for i, v in enumerate(['p1', 'p2', 'p3', 'p4']):
    print(f"Class {v}:")
    print(f"Precision: {precision[i]}")
    print(f"Recall: {recall[i]}")
    print(f"F1 Score: {f1_score[i]}")
    print()

### - XGBoost gives the best results, so we fine-tune it further.

In [35]:
# Applying standard scaler: each listed column of df_encoded is standardized
# in place with its own StandardScaler. Only the scaler of the last column
# remains in `scaler` afterwards; the others are overwritten by the loop.

from sklearn.preprocessing import StandardScaler

columns_to_be_scaled = [
    'Age_Oldest_TL',
    'Age_Newest_TL',
    'time_since_recent_payment',
    'max_recent_level_of_deliq',
    'recent_level_of_deliq',
    'time_since_recent_enq',
    'NETMONTHLYINCOME',
    'Time_With_Curr_Empr',
]

for column in columns_to_be_scaled:
    scaler = StandardScaler()
    # StandardScaler expects a 2-D input, hence the single-column reshape
    df_encoded[column] = scaler.fit_transform(df_encoded[column].values.reshape(-1, 1))

In [36]:
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

# Predictors (now containing the standardized columns) and target
x = df_encoded.drop(columns=['Approved_Flag'])
y = df_encoded['Approved_Flag']

# Encode the string class labels as integers before handing them to XGBoost
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split)
x_train, x_test, y_train, y_test = train_test_split(x, y_encoded, test_size=0.2, random_state=42)

# Same four-class softmax classifier as before, refitted on the scaled data
xgb_classifier = xgb.XGBClassifier(objective='multi:softmax', num_class=4)
xgb_classifier.fit(x_train, y_train)
y_pred = xgb_classifier.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

In [37]:
# Per-class precision, recall and F1 of the XGBoost model fitted on the
# scaled data; the fourth return value is discarded.
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred)

In [38]:
# Report the per-class metrics in the order p1..p4
class_labels = ['p1', 'p2', 'p3', 'p4']
for i in range(len(class_labels)):
    v = class_labels[i]
    print(f"Class {v}:")
    print(f"Precision: {precision[i]}")
    print(f"Recall: {recall[i]}")
    print(f"F1 Score: {f1_score[i]}")
    print()

- No improvement in the metrics after scaling.

In [39]:
# Hyperparameter tuning in xgboost

from sklearn.model_selection import GridSearchCV
# Same split arguments (test_size, random_state) as in the earlier cells;
# x and y_encoded come from the scaled-data XGBoost cell above.
x_train, x_test, y_train, y_test = train_test_split(x, y_encoded, test_size=0.2, random_state=42)

In [40]:
# Define the XGBClassifier with the initial set of hyperparameters

xgb_model = xgb.XGBClassifier(objective='multi:softmax', num_class=4)

In [41]:
# Define the parameter grid for hyperparameter tuning
# 3 x 3 x 3 = 27 parameter combinations are evaluated.

param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
}

In [42]:
# 3-fold cross-validated search over param_grid, scored on accuracy;
# n_jobs=-1 lets scikit-learn use all available CPU cores.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=3, scoring='accuracy', n_jobs=-1)
grid_search.fit(x_train, y_train)

In [43]:
# Print the best hyperparameters

print("Best Hyperparameters:", grid_search.best_params_)

In [44]:
# Evaluate the model with the best hyperparameters on the test set

best_model = grid_search.best_estimator_
accuracy = best_model.score(x_test, y_test)
print("Test Accuracy:", accuracy)

- Best Hyperparameters: {'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 200}

- Checking the model with UNSEEN data

In [45]:
# Load the unseen data from the current working directory.
a3 = pd.read_excel("Unseen_Dataset.xlsx")

In [46]:
# Columns to pull from the unseen data: every column of the training frame
# except the Approved_Flag target. The target is removed by name rather than
# by a fixed position, so this stays correct if the number of selected
# features changes.
cols_in_df = [column for column in df.columns if column != 'Approved_Flag']

In [47]:
# Explicit copy: the EDUCATION column of df_unseen is overwritten further
# down, and writing into a column selection of a3 raises pandas'
# SettingWithCopyWarning and is not guaranteed to update df_unseen.
df_unseen = a3[cols_in_df].copy()

In [48]:
# Check the selected columns, then inspect the distinct values of each
# categorical feature in the unseen data (one cell per feature).
df_unseen.columns

In [49]:
df_unseen['MARITALSTATUS'].unique()    

In [50]:
df_unseen['EDUCATION'].unique()

In [51]:
df_unseen['GENDER'].unique()

In [52]:
df_unseen['last_prod_enq2'].unique()

In [53]:
df_unseen['first_prod_enq2'].unique()

In [54]:
# Ordinal feature -- EDUCATION
# SSC            : 1
# 12TH           : 2
# GRADUATE       : 3
# UNDER GRADUATE : 3
# POST-GRADUATE  : 4
# OTHERS         : 1
# PROFESSIONAL   : 3

In [55]:
df_unseen.loc[df['EDUCATION'] == 'SSC',['EDUCATION']]              = 1
df_unseen.loc[df['EDUCATION'] == '12TH',['EDUCATION']]             = 2
df_unseen.loc[df['EDUCATION'] == 'GRADUATE',['EDUCATION']]         = 3
df_unseen.loc[df['EDUCATION'] == 'UNDER GRADUATE',['EDUCATION']]   = 3
df_unseen.loc[df['EDUCATION'] == 'POST-GRADUATE',['EDUCATION']]    = 4
df_unseen.loc[df['EDUCATION'] == 'OTHERS',['EDUCATION']]           = 1
df_unseen.loc[df['EDUCATION'] == 'PROFESSIONAL',['EDUCATION']]     = 3

In [56]:
df_unseen['EDUCATION'].value_counts()
df_unseen['EDUCATION'] = df['EDUCATION'].astype(int)
df_unseen.info()

In [57]:
# One-hot encode the unseen data the same way as the training data.
from sklearn.preprocessing import StandardScaler

df_encoded_unseen = pd.get_dummies(df_unseen, columns=['MARITALSTATUS','GENDER', 'last_prod_enq2' ,'first_prod_enq2'])

# The final model is fitted on x_train, which comes from df_encoded AFTER the
# columns in columns_to_be_scaled were standardized. The unseen data must get
# the same transformation, otherwise raw values are compared against split
# thresholds learned on standardized values. The per-column scalers were not
# kept, so they are refitted on the unscaled training columns still held in
# `df`, which reproduces the same mean and standard deviation.
for column in columns_to_be_scaled:
    column_scaler = StandardScaler().fit(df[column].values.reshape(-1, 1))
    df_encoded_unseen[column] = column_scaler.transform(
        df_encoded_unseen[column].values.reshape(-1, 1)
    ).ravel()

# Align with the training feature columns: a category missing from the unseen
# file becomes an all-zero indicator column, an indicator never seen in
# training is dropped, and the column order matches what the model was fit on.
df_encoded_unseen = df_encoded_unseen.reindex(columns=x_train.columns, fill_value=0)

In [58]:
# Check the dtypes and non-null counts of the encoded unseen frame.
df_encoded_unseen.info()

In [59]:
# Summary statistics are stored in `k` (overwriting the earlier value);
# nothing is displayed because the result is assigned.
k = df_encoded_unseen.describe()

In [60]:
# Best Hyperparameters: {'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 200}
# Rebuild the classifier with the values reported by the grid search above
# (they are hard-coded here, not read from grid_search.best_params_).

model = xgb.XGBClassifier(objective= 'multi:softmax',
                          num_class = 4,
                          learning_rate= 0.2,
                          max_depth= 3,
                          n_estimators= 200)

In [61]:
# Fit on the training split created in the hyperparameter-tuning cell.
model.fit(x_train, y_train)

In [62]:
# Predict an integer class id for every row of the encoded unseen data.
y_pred_unseen = model.predict(df_encoded_unseen)

In [63]:
print(y_pred_unseen)

In [64]:
# Translate the predicted integer class ids back to the P1..P4 labels.
# NOTE(review): this assumes label_encoder assigned 0..3 to P1..P4 in this
# order - confirm against label_encoder.classes_.
mapping = {0: 'P1', 1: 'P2', 2: 'P3', 3: 'P4'}

In [65]:
y_pred_mapped = [mapping[val] for val in y_pred_unseen]

In [66]:
print(y_pred_mapped)

In [67]:
# Attach the predicted label to the originally loaded unseen frame.
a3['Target_variable'] = y_pred_mapped

In [68]:
a3.sample(10)

In [69]:
# Write the predictions next to the input workbooks (current working
# directory) instead of an absolute path inside one user's profile, which
# does not exist on any other machine.
a3.to_excel("Final_Prediction.xlsx", index=False)

In [70]:
# Report the wall-clock time elapsed since start_time was recorded

end_time = time.time()
elapsed_time = end_time - start_time
print(f"Total run time of the program:{round(elapsed_time, 2)} sec")