In [312]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
from scipy.stats import chi2_contingency
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, precision_recall_fscore_support
import warnings
import os

In [313]:
# Load the unseen dataset from the Excel workbook. The path is relative, so it
# is resolved against the working directory the notebook runs in.
a1 = pd.read_excel("../data/Unseen_Dataset.xlsx")

In [314]:
# Work on a copy so the frame as loaded (a1) is not modified by later cells.
df = a1.copy()

In [315]:
# Preview the first rows of the working frame.
df.head()

Unnamed: 0,pct_tl_open_L6M,pct_tl_closed_L6M,Tot_TL_closed_L12M,pct_tl_closed_L12M,Tot_Missed_Pmnt,CC_TL,Home_TL,PL_TL,Secured_TL,Unsecured_TL,...,PL_Flag,pct_PL_enq_L6m_of_ever,pct_CC_enq_L6m_of_ever,HL_Flag,GL_Flag,MARITALSTATUS,EDUCATION,GENDER,last_prod_enq2,first_prod_enq2
0,0.0,0.0,0,0.0,0,0,0,4,1,4,...,1,0.0,0.0,1,0,Married,12TH,M,PL,PL
1,0.0,0.0,0,0.0,0,0,0,0,0,1,...,0,0.0,0.0,0,0,Single,GRADUATE,F,ConsumerLoan,ConsumerLoan
2,0.125,0.0,0,0.0,1,0,0,0,2,6,...,0,0.0,0.0,1,0,Married,SSC,M,ConsumerLoan,others
3,0.0,0.0,0,0.0,0,0,0,0,3,0,...,0,0.0,0.0,0,0,Married,POST-GRADUATE,M,AL,AL
4,0.0,0.0,1,0.167,0,0,0,0,6,0,...,0,0.429,0.0,1,0,Married,12TH,M,ConsumerLoan,PL


In [316]:
# Collect every column in which the value -99999 occurs in more than 10000 rows.
# NOTE(review): -99999 looks like a missing-value sentinel — confirm.
# NOTE(review): the recorded df.info() output in this notebook reports 100 rows,
# so this absolute threshold cannot be reached for this file — confirm intent.
columns_to_be_removed = []

for col_name in df.columns:
    # Summing the boolean mask counts the rows that hold the sentinel value.
    sentinel_rows = (df[col_name] == -99999).sum()
    if sentinel_rows > 10000:
        columns_to_be_removed.append(col_name)


In [317]:
# Display the list of flagged columns (a bare last expression is rendered by Jupyter).
columns_to_be_removed

[]

In [318]:
# Print the name of every column whose dtype is 'object'.
for col_name, col_dtype in df.dtypes.items():
    if col_dtype == 'object':
        print(col_name)

MARITALSTATUS
EDUCATION
GENDER
last_prod_enq2
first_prod_enq2


In [319]:
# Names of all non-object columns, excluding the 'PROSPECTID' column if present.
numeric_columns = [
    col_name
    for col_name, col_dtype in df.dtypes.items()
    if col_dtype != 'object' and col_name not in ['PROSPECTID']
]

In [320]:
# Display the collected non-object column names.
numeric_columns


['pct_tl_open_L6M',
 'pct_tl_closed_L6M',
 'Tot_TL_closed_L12M',
 'pct_tl_closed_L12M',
 'Tot_Missed_Pmnt',
 'CC_TL',
 'Home_TL',
 'PL_TL',
 'Secured_TL',
 'Unsecured_TL',
 'Other_TL',
 'Age_Oldest_TL',
 'Age_Newest_TL',
 'time_since_recent_payment',
 'max_recent_level_of_deliq',
 'num_deliq_6_12mts',
 'num_times_60p_dpd',
 'num_std_12mts',
 'num_sub',
 'num_sub_6mts',
 'num_sub_12mts',
 'num_dbt',
 'num_dbt_12mts',
 'num_lss',
 'recent_level_of_deliq',
 'CC_enq_L12m',
 'PL_enq_L12m',
 'time_since_recent_enq',
 'enq_L3m',
 'NETMONTHLYINCOME',
 'Time_With_Curr_Empr',
 'CC_Flag',
 'PL_Flag',
 'pct_PL_enq_L6m_of_ever',
 'pct_CC_enq_L6m_of_ever',
 'HL_Flag',
 'GL_Flag']

In [321]:

# Integer code assigned to each education label; several labels share a code.
EDUCATION_CODES = {
    'SSC': 1,
    '12TH': 2,
    'GRADUATE': 3,
    'UNDER GRADUATE': 3,
    'POST-GRADUATE': 4,
    'OTHERS': 1,
    'PROFESSIONAL': 3,
}

# Overwrite each label in place with its code, one label at a time.
# Values that are not keys of EDUCATION_CODES are left unchanged.
for education_label, education_code in EDUCATION_CODES.items():
    df.loc[df['EDUCATION'] == education_label, ['EDUCATION']] = education_code

In [322]:
# Show how many rows carry each EDUCATION value. Jupyter only renders a bare
# expression when it is the last statement of the cell, so the counts are
# printed explicitly; otherwise the result would be discarded silently.
print(df['EDUCATION'].value_counts())

# Store the column with an integer dtype. astype(int) raises if any value
# cannot be converted to int.
df['EDUCATION'] = df['EDUCATION'].astype(int)

# Summary of columns, non-null counts and dtypes after the conversion.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 42 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   pct_tl_open_L6M            100 non-null    float64
 1   pct_tl_closed_L6M          100 non-null    float64
 2   Tot_TL_closed_L12M         100 non-null    int64  
 3   pct_tl_closed_L12M         100 non-null    float64
 4   Tot_Missed_Pmnt            100 non-null    int64  
 5   CC_TL                      100 non-null    int64  
 6   Home_TL                    100 non-null    int64  
 7   PL_TL                      100 non-null    int64  
 8   Secured_TL                 100 non-null    int64  
 9   Unsecured_TL               100 non-null    int64  
 10  Other_TL                   100 non-null    int64  
 11  Age_Oldest_TL              100 non-null    int64  
 12  Age_Newest_TL              100 non-null    int64  
 13  time_since_recent_payment  100 non-null    int64  


In [323]:
# Categorical columns that are expanded into one indicator column per category.
# Only categories that actually occur in df produce a column.
categorical_columns = ['MARITALSTATUS', 'GENDER', 'last_prod_enq2', 'first_prod_enq2']
df_encoded = pd.get_dummies(df, columns=categorical_columns)

# Column overview of the encoded frame.
df_encoded.info()

# Descriptive statistics, kept in `k`.
# NOTE(review): `k` is not used in the cells visible here — confirm it is needed.
k = df_encoded.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 54 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   pct_tl_open_L6M               100 non-null    float64
 1   pct_tl_closed_L6M             100 non-null    float64
 2   Tot_TL_closed_L12M            100 non-null    int64  
 3   pct_tl_closed_L12M            100 non-null    float64
 4   Tot_Missed_Pmnt               100 non-null    int64  
 5   CC_TL                         100 non-null    int64  
 6   Home_TL                       100 non-null    int64  
 7   PL_TL                         100 non-null    int64  
 8   Secured_TL                    100 non-null    int64  
 9   Unsecured_TL                  100 non-null    int64  
 10  Other_TL                      100 non-null    int64  
 11  Age_Oldest_TL                 100 non-null    int64  
 12  Age_Newest_TL                 100 non-null    int64  
 13  time_s

In [324]:
# Preview the first ten rows of the one-hot encoded frame.
df_encoded.head(10)

Unnamed: 0,pct_tl_open_L6M,pct_tl_closed_L6M,Tot_TL_closed_L12M,pct_tl_closed_L12M,Tot_Missed_Pmnt,CC_TL,Home_TL,PL_TL,Secured_TL,Unsecured_TL,...,last_prod_enq2_ConsumerLoan,last_prod_enq2_HL,last_prod_enq2_PL,last_prod_enq2_others,first_prod_enq2_AL,first_prod_enq2_CC,first_prod_enq2_ConsumerLoan,first_prod_enq2_HL,first_prod_enq2_PL,first_prod_enq2_others
0,0.0,0.0,0,0.0,0,0,0,4,1,4,...,False,False,True,False,False,False,False,False,True,False
1,0.0,0.0,0,0.0,0,0,0,0,0,1,...,True,False,False,False,False,False,True,False,False,False
2,0.125,0.0,0,0.0,1,0,0,0,2,6,...,True,False,False,False,False,False,False,False,False,True
3,0.0,0.0,0,0.0,0,0,0,0,3,0,...,False,False,False,False,True,False,False,False,False,False
4,0.0,0.0,1,0.167,0,0,0,0,6,0,...,True,False,False,False,False,False,False,False,True,False
5,0.0,0.0,2,0.333,0,0,0,0,6,0,...,True,False,False,False,False,False,False,False,False,True
6,0.0,0.0,0,0.0,0,0,0,0,1,0,...,True,False,False,False,False,False,False,False,False,True
7,0.143,0.0,1,0.143,0,1,0,2,2,5,...,False,False,False,False,False,False,False,False,False,True
8,0.0,0.0,0,0.0,0,0,0,0,2,0,...,True,False,False,False,False,False,False,False,False,True
9,0.5,0.0,1,0.5,0,0,0,0,0,2,...,True,False,False,False,False,False,True,False,False,False


In [325]:
# import pickle as pkl


# model = pkl.load(open('model.pkl', 'rb'))

In [326]:
# xtrain = df_encoded[50:60]

In [327]:
# df_encoded["Target_result"] = model.predict(df_encoded)

In [328]:
# df_encoded.columns

In [329]:
# df_encoded["propensity_bckt"] = []

# for i in df_encoded["Target_result"]:
#     if i == 0:
#         df_encoded["propensity_bckt"].append("P1")
#     elif i==1:
#         df_encoded["propensity_bckt"].append("P2")
#     elif i==2:
#         df_encoded["propensity_bckt"].append("P3")
#     else:
#         df_encoded["propensity_bckt"].append("P4")

In [330]:
# file_name = "Final_unseen_Result.xlsx"

# df_encoded.to_excel(file_name)

### Best-features model, selected according to feature importances

In [331]:
import pickle as pkl

# Load the fitted model from disk. The context manager closes the file handle
# as soon as unpickling is done; passing a bare open() to pkl.load leaves the
# handle open.
# NOTE(review): unpickling can execute arbitrary code — only load
# 'bestmodel.pkl' when it comes from a trusted source.
with open('bestmodel.pkl', 'rb') as model_file:
    model = pkl.load(model_file)

In [332]:
# Columns selected from df_encoded and handed to model.predict, in this order.
# NOTE(review): presumably the order must match the one used when the pickled
# model was fitted — confirm before reordering or editing this list.
important_features = [
    "enq_L3m", "Age_Oldest_TL", "num_std_12mts",
    "pct_PL_enq_L6m_of_ever", "time_since_recent_enq",
    "max_recent_level_of_deliq", "recent_level_of_deliq",
    "PL_enq_L12m", "Secured_TL", "last_prod_enq2_ConsumerLoan",
    "GL_Flag", "num_times_60p_dpd", "num_deliq_6_12mts",
    "Age_Newest_TL", "PL_Flag",
]

# enq_L3m
# Age_Oldest_TL
# num_std_12mts
# pct_PL_enq_L6m_of_ever
# time_since_recent_enq
# max_recent_level_of_deliq
# recent_level_of_deliq
# PL_enq_L12m
# Secured_TL
# last_prod_enq2_ConsumerLoan
# GL_Flag
# num_times_60p_dpd
# num_deliq_6_12mts
# Age_Newest_TL
# PL_Flag


In [333]:
# Prefixes of the indicator columns that pd.get_dummies creates for the
# categorical columns. A category that never occurs in this batch yields no
# indicator column, and a plain df_encoded[important_features] lookup would then
# fail with a KeyError even though the correct value is simply "not this category".
dummy_prefixes = ("MARITALSTATUS_", "GENDER_", "last_prod_enq2_", "first_prod_enq2_")

absent_features = [name for name in important_features if name not in df_encoded.columns]
absent_non_dummies = [name for name in absent_features if not name.startswith(dummy_prefixes)]
if absent_non_dummies:
    # A missing raw feature cannot be reconstructed, so stop with a clear message.
    raise KeyError(f"Columns required by the model are missing: {absent_non_dummies}")

# reindex returns the requested columns in the requested order; existing columns
# keep their values and each absent indicator column is added filled with False.
model_input = df_encoded.reindex(columns=important_features, fill_value=False)

# Store the model's prediction for every row.
df_encoded["Target_result"] = model.predict(model_input)

In [334]:
# model.predict(df_encoded[important_features][30:31])

In [335]:
# import pandas as pd

# # Important features
# important_features = [
#     "enq_L3m", 
#     "Age_Oldest_TL", 
#     "time_since_recent_enq", 
#     "pct_PL_enq_L6m_of_ever", 
#     "num_std_12mts", 
#     "PL_enq_L12m", 
#     "max_recent_level_of_deliq", 
#     "recent_level_of_deliq", 
#     "GL_Flag", 
#     "pct_CC_enq_L6m_of_ever"
# ]

# # Feature values
# # feature_values = {
# #     "enq_L3m": 0,
# #     "Age_Oldest_TL": -0.344301,
# #     "time_since_recent_enq": 1.191963,
# #     "pct_PL_enq_L6m_of_ever": 0.0,
# #     "num_std_12mts": 0,
# #     "PL_enq_L12m": 0,
# #     "max_recent_level_of_deliq": -0.264815,
# #     "recent_level_of_deliq": -0.254277,
# #     "GL_Flag": 0,
# #     "pct_CC_enq_L6m_of_ever": 0.0
# # }

# feature_values = {
#     "enq_L3m": 0,
#     "Age_Oldest_TL": -0.059324,
#     "time_since_recent_enq": 2.312887,
#     "pct_PL_enq_L6m_of_ever": 0.0,
#     "num_std_12mts": 0,
#     "PL_enq_L12m": 0,
#     "max_recent_level_of_deliq": 0.160672,
#     "recent_level_of_deliq": 0.241183,
#     "GL_Flag": 0,
#     "pct_CC_enq_L6m_of_ever": 0.0
# }



# # Create the DataFrame
# df_test = pd.DataFrame([{feature: value for feature, value in feature_values.items()}])



In [336]:
# model.predict(df_test)

In [337]:
# df_encoded["Target_result"].value_counts()

In [338]:
# Label assigned to each predicted class code.
PROPENSITY_BUCKETS = {0: "P1", 1: "P2", 2: "P3", 3: "P4"}

raw_target = df_encoded["Target_result"]

# Rows that already hold a label (this cell was run before) are kept as they
# are. Mapping them again would turn every value into NaN, because the labels
# are not keys of PROPENSITY_BUCKETS.
already_labelled = raw_target.isin(list(PROPENSITY_BUCKETS.values()))

# Everything else is mapped code -> label; a code without an entry becomes NaN,
# exactly as with a plain .map() call.
df_encoded["Target_result"] = raw_target.where(
    already_labelled, raw_target.map(PROPENSITY_BUCKETS)
)


In [339]:
# Number of rows per Target_result value (value_counts omits NaN by default).
df_encoded["Target_result"].value_counts()

Target_result
P1    81
P2    12
P4     7
Name: count, dtype: int64