## 1. Necessary Libraries Import

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
from scipy.stats import chi2_contingency
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, precision_recall_fscore_support
import warnings
import os

In [2]:
!pip install statsmodels -q

## 2. Reading the data

In [3]:
PROJECT_DIR = r"C:\Users\cocod\OneDrive\Desktop\Credit_Risk_Modelling"
DATA_DIR = "data"

In [4]:
def get_data(name):
    file_name = f"{name}.xlsx"
    file_path = os.path.join(PROJECT_DIR, DATA_DIR, file_name)
    return pd.read_excel(file_path)

In [5]:
# loading the dataset
a1 = get_data("case_study1")
a1.head()

Unnamed: 0,PROSPECTID,Total_TL,Tot_Closed_TL,Tot_Active_TL,Total_TL_opened_L6M,Tot_TL_closed_L6M,pct_tl_open_L6M,pct_tl_closed_L6M,pct_active_tl,pct_closed_tl,...,CC_TL,Consumer_TL,Gold_TL,Home_TL,PL_TL,Secured_TL,Unsecured_TL,Other_TL,Age_Oldest_TL,Age_Newest_TL
0,1,5,4,1,0,0,0.0,0.0,0.2,0.8,...,0,0,1,0,4,1,4,0,72,18
1,2,1,0,1,0,0,0.0,0.0,1.0,0.0,...,0,1,0,0,0,0,1,0,7,7
2,3,8,0,8,1,0,0.125,0.0,1.0,0.0,...,0,6,1,0,0,2,6,0,47,2
3,4,1,0,1,1,0,1.0,0.0,1.0,0.0,...,0,0,0,0,0,0,1,1,5,5
4,5,3,2,1,0,0,0.0,0.0,0.333,0.667,...,0,0,0,0,0,3,0,2,131,32


In [6]:
a2 = get_data("case_study2")
a2.head()

Unnamed: 0,PROSPECTID,time_since_recent_payment,time_since_first_deliquency,time_since_recent_deliquency,num_times_delinquent,max_delinquency_level,max_recent_level_of_deliq,num_deliq_6mts,num_deliq_12mts,num_deliq_6_12mts,...,pct_CC_enq_L6m_of_L12m,pct_PL_enq_L6m_of_ever,pct_CC_enq_L6m_of_ever,max_unsec_exposure_inPct,HL_Flag,GL_Flag,last_prod_enq2,first_prod_enq2,Credit_Score,Approved_Flag
0,1,549,35,15,11,29,29,0,0,0,...,0.0,0.0,0.0,13.333,1,0,PL,PL,696,P2
1,2,47,-99999,-99999,0,-99999,0,0,0,0,...,0.0,0.0,0.0,0.86,0,0,ConsumerLoan,ConsumerLoan,685,P2
2,3,302,11,3,9,25,25,1,9,8,...,0.0,0.0,0.0,5741.667,1,0,ConsumerLoan,others,693,P2
3,4,-99999,-99999,-99999,0,-99999,0,0,0,0,...,0.0,0.0,0.0,9.9,0,0,others,others,673,P2
4,5,583,-99999,-99999,0,-99999,0,0,0,0,...,0.0,0.0,0.0,-99999.0,0,0,AL,AL,753,P1


In [7]:
df1 = a1.copy()
df2 = a2.copy()

In [12]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51336 entries, 0 to 51335
Data columns (total 26 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   PROSPECTID            51336 non-null  int64  
 1   Total_TL              51336 non-null  int64  
 2   Tot_Closed_TL         51336 non-null  int64  
 3   Tot_Active_TL         51336 non-null  int64  
 4   Total_TL_opened_L6M   51336 non-null  int64  
 5   Tot_TL_closed_L6M     51336 non-null  int64  
 6   pct_tl_open_L6M       51336 non-null  float64
 7   pct_tl_closed_L6M     51336 non-null  float64
 8   pct_active_tl         51336 non-null  float64
 9   pct_closed_tl         51336 non-null  float64
 10  Total_TL_opened_L12M  51336 non-null  int64  
 11  Tot_TL_closed_L12M    51336 non-null  int64  
 12  pct_tl_open_L12M      51336 non-null  float64
 13  pct_tl_closed_L12M    51336 non-null  float64
 14  Tot_Missed_Pmnt       51336 non-null  int64  
 15  Auto_TL            

In [13]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51336 entries, 0 to 51335
Data columns (total 62 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   PROSPECTID                    51336 non-null  int64  
 1   time_since_recent_payment     51336 non-null  int64  
 2   time_since_first_deliquency   51336 non-null  int64  
 3   time_since_recent_deliquency  51336 non-null  int64  
 4   num_times_delinquent          51336 non-null  int64  
 5   max_delinquency_level         51336 non-null  int64  
 6   max_recent_level_of_deliq     51336 non-null  int64  
 7   num_deliq_6mts                51336 non-null  int64  
 8   num_deliq_12mts               51336 non-null  int64  
 9   num_deliq_6_12mts             51336 non-null  int64  
 10  max_deliq_6mts                51336 non-null  int64  
 11  max_deliq_12mts               51336 non-null  int64  
 12  num_times_30p_dpd             51336 non-null  int64  
 13  n

## 3. Preliminary analysis

In [8]:
pd.set_option("display.max_columns", None)

In [25]:
df1.sample(10)

Unnamed: 0,PROSPECTID,Total_TL,Tot_Closed_TL,Tot_Active_TL,Total_TL_opened_L6M,Tot_TL_closed_L6M,pct_tl_open_L6M,pct_tl_closed_L6M,pct_active_tl,pct_closed_tl,Total_TL_opened_L12M,Tot_TL_closed_L12M,pct_tl_open_L12M,pct_tl_closed_L12M,Tot_Missed_Pmnt,Auto_TL,CC_TL,Consumer_TL,Gold_TL,Home_TL,PL_TL,Secured_TL,Unsecured_TL,Other_TL,Age_Oldest_TL,Age_Newest_TL
8063,8064,2,2,0,0,1,0.0,0.5,0.0,1.0,0,2,0.0,1.0,0,0,0,1,0,0,0,0,2,1,17,17
28247,28248,3,1,2,2,0,0.667,0.0,0.667,0.333,2,1,0.667,0.333,0,0,0,3,0,0,0,0,3,0,13,3
2326,2327,24,21,3,3,0,0.125,0.0,0.125,0.875,3,0,0.125,0.0,2,2,0,1,20,0,0,22,2,1,63,2
29777,29778,2,0,2,0,0,0.0,0.0,1.0,0.0,0,0,0.0,0.0,1,0,0,1,0,0,0,0,2,1,40,40
2195,2196,2,2,0,0,1,0.0,0.5,0.0,1.0,1,1,0.5,0.5,0,0,0,0,0,0,0,2,0,2,28,8
44877,44878,3,0,3,2,0,0.667,0.0,1.0,0.0,3,0,1.0,0.0,1,0,0,3,0,0,0,0,3,0,7,2
5133,5134,9,6,3,0,0,0.0,0.0,0.333,0.667,0,1,0.0,0.111,0,1,0,0,0,1,3,6,3,4,156,27
33253,33254,4,1,3,1,0,0.25,0.0,0.75,0.25,2,0,0.5,0.0,0,1,0,1,0,0,2,1,3,0,71,5
50809,50810,21,9,12,2,0,0.095,0.0,0.571,0.429,7,4,0.333,0.19,12,0,0,0,10,0,0,21,0,11,43,6
35976,35977,1,0,1,0,0,0.0,0.0,1.0,0.0,0,0,0.0,0.0,0,1,0,0,0,0,0,1,0,0,15,15


In [26]:
df2.sample(10)

Unnamed: 0,PROSPECTID,time_since_recent_payment,time_since_first_deliquency,time_since_recent_deliquency,num_times_delinquent,max_delinquency_level,max_recent_level_of_deliq,num_deliq_6mts,num_deliq_12mts,num_deliq_6_12mts,max_deliq_6mts,max_deliq_12mts,num_times_30p_dpd,num_times_60p_dpd,num_std,num_std_6mts,num_std_12mts,num_sub,num_sub_6mts,num_sub_12mts,num_dbt,num_dbt_6mts,num_dbt_12mts,num_lss,num_lss_6mts,num_lss_12mts,recent_level_of_deliq,tot_enq,CC_enq,CC_enq_L6m,CC_enq_L12m,PL_enq,PL_enq_L6m,PL_enq_L12m,time_since_recent_enq,enq_L12m,enq_L6m,enq_L3m,MARITALSTATUS,EDUCATION,AGE,GENDER,NETMONTHLYINCOME,Time_With_Curr_Empr,pct_of_active_TLs_ever,pct_opened_TLs_L6m_of_L12m,pct_currentBal_all_TL,CC_utilization,CC_Flag,PL_utilization,PL_Flag,pct_PL_enq_L6m_of_L12m,pct_CC_enq_L6m_of_L12m,pct_PL_enq_L6m_of_ever,pct_CC_enq_L6m_of_ever,max_unsec_exposure_inPct,HL_Flag,GL_Flag,last_prod_enq2,first_prod_enq2,Credit_Score,Approved_Flag
16240,16241,33,-99999,-99999,0,-99999,0,0,0,0,0,0,0,0,29,9,15,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,8,2,2,2,Married,SSC,27,F,14500,126,0.625,0.5,0.924,-99999.0,0,-99999.0,0,0.0,0.0,0.0,0.0,1.076,1,0,others,others,682,P2
15194,15195,65,-99999,-99999,0,-99999,0,0,0,0,-99999,-99999,0,0,26,2,5,4,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,261,1,0,0,Married,12TH,33,M,25000,77,0.167,0.0,0.969,-99999.0,0,-99999.0,0,0.0,0.0,0.0,0.0,-99999.0,0,1,HL,AL,704,P1
32324,32325,1601,-99999,-99999,0,-99999,0,0,0,0,-99999,-99999,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,8,1,1,1,Married,GRADUATE,34,M,12000,47,0.667,0.0,1.0,-99999.0,0,-99999.0,0,0.0,0.0,0.0,0.0,-99999.0,1,0,others,others,688,P2
5152,5153,401,-99999,-99999,0,-99999,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1088,0,0,0,Married,12TH,28,M,11500,30,1.0,0.0,0.268,-99999.0,0,-99999.0,0,0.0,0.0,0.0,0.0,3.467,0,0,ConsumerLoan,ConsumerLoan,692,P2
26331,26332,77,7,6,2,28,27,0,2,2,0,28,0,0,84,1,9,0,0,0,0,0,0,0,0,0,27,1,0,0,0,0,0,0,223,1,0,0,Married,12TH,41,M,18000,63,0.333,0.0,0.114,-99999.0,0,-99999.0,0,0.0,0.0,0.0,0.0,0.844,1,0,others,others,686,P2
31662,31663,53,-99999,-99999,0,-99999,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,2,1,2,139,3,2,0,Married,12TH,22,M,22000,38,0.8,0.75,0.488,-99999.0,0,-99999.0,0,0.5,0.0,0.5,0.0,2.993,0,0,PL,others,673,P2
23293,23294,51,-99999,-99999,0,-99999,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,2,0,1,1,0,1,255,2,0,0,Single,12TH,24,M,18900,53,0.667,0.0,0.713,0.159,1,0.861,1,0.0,0.0,0.0,0.0,10.048,0,0,PL,CC,744,P1
26882,26883,90,-99999,-99999,0,-99999,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,149,2,2,0,Married,SSC,29,F,10000,24,1.0,1.0,0.7,-99999.0,0,-99999.0,0,0.0,0.0,0.0,0.0,2.0,0,0,others,others,682,P2
37435,37436,3121,-99999,-99999,0,-99999,0,0,0,0,-99999,-99999,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,3829,0,0,0,Married,SSC,45,M,30000,364,0.0,0.0,0.0,-99999.0,0,-99999.0,0,0.0,0.0,0.0,0.0,-99999.0,0,0,others,others,708,P1
38112,38113,66,-99999,-99999,0,-99999,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,0,0,2,1,1,3,3,3,3,Single,GRADUATE,25,M,20000,62,0.25,0.0,0.358,-99999.0,0,0.358,1,1.0,0.0,0.5,0.0,3.5,1,0,ConsumerLoan,ConsumerLoan,668,P3


In [17]:
df2.MARITALSTATUS.unique()

array(['Married', 'Single'], dtype=object)

In [18]:
df2.EDUCATION.unique()

array(['12TH', 'GRADUATE', 'SSC', 'POST-GRADUATE', 'UNDER GRADUATE',
       'OTHERS', 'PROFESSIONAL'], dtype=object)

In [19]:
df2.GENDER.unique()

array(['M', 'F'], dtype=object)

In [22]:
df2.last_prod_enq2.unique()

array(['PL', 'ConsumerLoan', 'others', 'AL', 'CC', 'HL'], dtype=object)

In [23]:
df2.first_prod_enq2.unique()

array(['PL', 'ConsumerLoan', 'others', 'AL', 'HL', 'CC'], dtype=object)

In [24]:
df2.Approved_Flag.unique()

array(['P2', 'P1', 'P3', 'P4'], dtype=object)

In [20]:
df1.dtypes

PROSPECTID                int64
Total_TL                  int64
Tot_Closed_TL             int64
Tot_Active_TL             int64
Total_TL_opened_L6M       int64
Tot_TL_closed_L6M         int64
pct_tl_open_L6M         float64
pct_tl_closed_L6M       float64
pct_active_tl           float64
pct_closed_tl           float64
Total_TL_opened_L12M      int64
Tot_TL_closed_L12M        int64
pct_tl_open_L12M        float64
pct_tl_closed_L12M      float64
Tot_Missed_Pmnt           int64
Auto_TL                   int64
CC_TL                     int64
Consumer_TL               int64
Gold_TL                   int64
Home_TL                   int64
PL_TL                     int64
Secured_TL                int64
Unsecured_TL              int64
Other_TL                  int64
Age_Oldest_TL             int64
Age_Newest_TL             int64
dtype: object

In [21]:
df2.dtypes

PROSPECTID                       int64
time_since_recent_payment        int64
time_since_first_deliquency      int64
time_since_recent_deliquency     int64
num_times_delinquent             int64
                                 ...  
GL_Flag                          int64
last_prod_enq2                  object
first_prod_enq2                 object
Credit_Score                     int64
Approved_Flag                   object
Length: 62, dtype: object

In [9]:
## checking for missing information
def missing_info(data):
    na_cols = [col for col in data.columns if data[col].isna().any()]
    na_counts = [data[col].isna().sum() for col in na_cols]
    na_pct = [(data[col].isna().mean() * 100) for col in na_cols]
    
    return (
        pd
        .DataFrame(data={
            "variable": na_cols,
            "count": na_counts,
            "percentage": na_pct
        })
        .sort_values(by="count", ascending=False)
        .set_index("variable")
    )
    

In [28]:
missing_info(df1)

Unnamed: 0_level_0,count,percentage
variable,Unnamed: 1_level_1,Unnamed: 2_level_1


In [29]:
missing_info(df2)

Unnamed: 0_level_0,count,percentage
variable,Unnamed: 1_level_1,Unnamed: 2_level_1


In [10]:
## checking for the number of -99999 values in the df2
def check_error(data):
    # columns with -99999 values
    neg99999_cols = [col for col in data.columns if (data[col] == -99999).any()]
    print(neg99999_cols)
    # count occurrences of -99999 in each column
    neg99999_counts = [data[col].eq(-99999).sum() for col in neg99999_cols]
    # percentage
    neg99999_pct = [(data[col].eq(-99999).mean() * 100) for col in neg99999_cols]
    
    return (
        pd
        .DataFrame(data={
            "variable": neg99999_cols,
            "count": neg99999_counts,
            "percentage": neg99999_pct
        })
        .sort_values(by="count", ascending=False)
        .set_index("variable")
    )
    

In [36]:
check_error(df2)

['time_since_recent_payment', 'time_since_first_deliquency', 'time_since_recent_deliquency', 'max_delinquency_level', 'max_deliq_6mts', 'max_deliq_12mts', 'tot_enq', 'CC_enq', 'CC_enq_L6m', 'CC_enq_L12m', 'PL_enq', 'PL_enq_L6m', 'PL_enq_L12m', 'time_since_recent_enq', 'enq_L12m', 'enq_L6m', 'enq_L3m', 'pct_currentBal_all_TL', 'CC_utilization', 'PL_utilization', 'max_unsec_exposure_inPct']


Unnamed: 0_level_0,count,percentage
variable,Unnamed: 1_level_1,Unnamed: 2_level_1
CC_utilization,47636,92.792582
PL_utilization,44435,86.557192
time_since_recent_deliquency,35949,70.026882
max_delinquency_level,35949,70.026882
time_since_first_deliquency,35949,70.026882
max_unsec_exposure_inPct,23178,45.149603
max_deliq_6mts,12890,25.109085
max_deliq_12mts,10832,21.100203
PL_enq_L12m,6321,12.312997
CC_enq_L12m,6321,12.312997


In [37]:
check_error(df1)

['Age_Oldest_TL', 'Age_Newest_TL']


Unnamed: 0_level_0,count,percentage
variable,Unnamed: 1_level_1,Unnamed: 2_level_1
Age_Oldest_TL,40,0.077918
Age_Newest_TL,40,0.077918


In [38]:
df1.shape

(51336, 26)

In [12]:
cleaned_df1 = clean_data(df1)
cleaned_df1.shape

(51296, 26)

* finding columns in df2 to be removed
* we are going to remove those column in df2 in which the frequency of -99999 is more than 10,000

In [13]:
columns_to_be_removed = []

for col in df2.columns:
    if df2.loc[df2[col] == -99999].shape[0] > 10000:
        columns_to_be_removed.append(col)
        
columns_to_be_removed

['time_since_first_deliquency',
 'time_since_recent_deliquency',
 'max_delinquency_level',
 'max_deliq_6mts',
 'max_deliq_12mts',
 'CC_utilization',
 'PL_utilization',
 'max_unsec_exposure_inPct']

In [44]:
df2.shape

(51336, 62)

In [14]:
df2 = df2.drop(columns_to_be_removed, axis=1)

In [46]:
df2.shape

(51336, 54)

In [15]:
cleaned_df2 = clean_data(df2)
cleaned_df2.shape

(42066, 54)

In [11]:
def clean_data(df):
    return (
        df
        .drop_duplicates()
        .assign(**{
            col: df[col].str.strip()
            for col in df.select_dtypes(include="O").columns
        })
        .rename(columns=str.lower)
        .loc[~(df == -99999).any(axis=1)]
    )

## 4. Merging the data

In [16]:
# first we need to check the common columns
for col in list(cleaned_df1.columns):
    if col in list(cleaned_df2.columns):
        print(col)

prospectid


In [17]:
# merge the two dataframes, inner join so that no nulls are present
df = pd.merge(cleaned_df1, cleaned_df2, how="inner", left_on = ['prospectid'], right_on=['prospectid'])
df.shape

(42064, 79)

In [50]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42064 entries, 0 to 42063
Data columns (total 79 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   prospectid                  42064 non-null  int64  
 1   total_tl                    42064 non-null  int64  
 2   tot_closed_tl               42064 non-null  int64  
 3   tot_active_tl               42064 non-null  int64  
 4   total_tl_opened_l6m         42064 non-null  int64  
 5   tot_tl_closed_l6m           42064 non-null  int64  
 6   pct_tl_open_l6m             42064 non-null  float64
 7   pct_tl_closed_l6m           42064 non-null  float64
 8   pct_active_tl               42064 non-null  float64
 9   pct_closed_tl               42064 non-null  float64
 10  total_tl_opened_l12m        42064 non-null  int64  
 11  tot_tl_closed_l12m          42064 non-null  int64  
 12  pct_tl_open_l12m            42064 non-null  float64
 13  pct_tl_closed_l12m          420

## 5. Feature Engineering

## 5.1 Extracting categorical and numerical features

In [18]:
categorical_cols = []
for col in df.columns:
    if df[col].dtype == "object":
        categorical_cols.append(col)
categorical_cols

['maritalstatus',
 'education',
 'gender',
 'last_prod_enq2',
 'first_prod_enq2',
 'approved_flag']

### 5.2 Checking the association between the categorical features

In [19]:
# chi-square test
for col in categorical_cols:
    chi2, pval, _, _ = chi2_contingency(pd.crosstab(df[col], df['approved_flag']))
    print(col, '---', pval)

maritalstatus --- 3.578180861038862e-233
education --- 2.6942265249737532e-30
gender --- 1.907936100186563e-05
last_prod_enq2 --- 0.0
first_prod_enq2 --- 7.84997610555419e-287
approved_flag --- 0.0


* This shows that all the p-values are less than 0.05 which means we reject our Null Hypothesis that the features are not associated.
* Since all the categorical features have pval <= 0.05, we will accept all

### 5.3 VIF sequentially check

In [20]:
numeric_columns = []

for col in df.columns:
    if df[col].dtype != "object" and col not in ["prospectid", "approved_flag"]:
        numeric_columns.append(col)
print(len(numeric_columns))

72


In [25]:
selected_features = vif_data[vif_data["VIF"] <= 6]["Feature"].tolist()
print(len(selected_features))

26


In [21]:
# VIF sequentially check
vif_data = df[numeric_columns]
total_columns = vif_data.shape[1]
columns_to_be_kept = []
column_index = 0


for i in range(0, total_columns):
    vif_value = variance_inflation_factor(vif_data, column_index)
    print(column_index, '   ', '-----', '   ', vif_value)
    
    if vif_value <= 6:
        columns_to_be_kept.append(numeric_columns[i])
        column_index += 1
    
    else:
        vif_data = vif_data.drop([numeric_columns[i]], axis=1)

  vif = 1. / (1. - r_squared_i)


0     -----     inf


  vif = 1. / (1. - r_squared_i)


0     -----     inf
0     -----     11.320180023967996
0     -----     8.363698035000336
0     -----     6.520647877790928
0     -----     5.149501618212625
1     -----     2.611111040579735


  vif = 1. / (1. - r_squared_i)


2     -----     inf
2     -----     1788.7926256209232
2     -----     8.601028256477228
2     -----     3.8328007921530785
3     -----     6.099653381646731
3     -----     5.5813520096427585
4     -----     1.9855843530987785


  vif = 1. / (1. - r_squared_i)


5     -----     inf
5     -----     4.809538302819343
6     -----     23.270628983464636
6     -----     30.595522588100053
6     -----     4.384346405965587
7     -----     3.064658415523423
8     -----     2.898639771299251
9     -----     4.377876915347322
10     -----     2.2078535836958433
11     -----     4.916914200506864
12     -----     5.214702030064725
13     -----     3.3861625024231476
14     -----     7.840583309478997
14     -----     5.255034641721438


  vif = 1. / (1. - r_squared_i)


15     -----     inf
15     -----     7.380634506427238
15     -----     1.4210050015175733
16     -----     8.083255010190316
16     -----     1.6241227524040112
17     -----     7.257811920140003
17     -----     15.59624383268298
17     -----     1.8258570471324314
18     -----     1.5080839450032664
19     -----     2.172088834824577
20     -----     2.623397553527229
21     -----     2.2959970812106176
22     -----     7.360578319196439
22     -----     2.160238777310255
23     -----     2.8686288267891467
24     -----     6.458218003637272
24     -----     2.8474118865638256
25     -----     4.753198156284083
26     -----     16.227354755948223
26     -----     6.424377256363872
26     -----     8.887080381808687
26     -----     2.3804746142952653
27     -----     8.609513476514548
27     -----     13.06755093547673
27     -----     3.500040056654654
28     -----     1.908795587481377
29     -----     17.006562234161628
29     -----     10.730485153719197
29     -----     2.3538

In [62]:
vif_data.head()

Unnamed: 0,pct_tl_open_l6m,pct_tl_closed_l6m,tot_tl_closed_l12m,pct_tl_closed_l12m,tot_missed_pmnt,cc_tl,home_tl,pl_tl,secured_tl,unsecured_tl,other_tl,age_oldest_tl,age_newest_tl,time_since_recent_payment,max_recent_level_of_deliq,num_deliq_6_12mts,num_times_60p_dpd,num_std_12mts,num_sub,num_sub_6mts,num_sub_12mts,num_dbt,num_dbt_12mts,num_lss,num_lss_12mts,recent_level_of_deliq,cc_enq_l12m,pl_enq_l12m,time_since_recent_enq,enq_l3m,netmonthlyincome,time_with_curr_empr,pct_currentbal_all_tl,cc_flag,pl_flag,pct_pl_enq_l6m_of_ever,pct_cc_enq_l6m_of_ever,hl_flag,gl_flag
0,0.0,0.0,0,0.0,0,0,0,4,1,4,0,72,18,549,29,0,0,11,0,0,0,0,0,0,0,29,0,0,566,0,51000,114,0.798,0,1,0.0,0.0,1,0
1,0.0,0.0,0,0.0,0,0,0,0,0,1,0,7,7,47,0,0,0,0,0,0,0,0,0,0,0,0,0,0,209,0,19000,50,0.37,0,0,0.0,0.0,0,0
2,0.125,0.0,0,0.0,1,0,0,0,2,6,0,47,2,302,25,8,0,10,0,0,0,0,0,0,0,25,0,0,587,0,18,191,0.585,0,0,0.0,0.0,1,0
3,0.0,0.0,0,0.0,0,0,0,0,3,0,2,131,32,583,0,0,0,16,0,0,0,0,0,0,0,0,0,0,3951,0,15000,75,0.0,0,0,0.0,0.0,0,0
4,0.0,0.0,1,0.167,0,0,0,0,6,0,0,150,17,245,270,0,11,2,3,0,1,0,0,0,0,26,1,3,7,4,0,154,0.0,0,0,0.429,0.0,1,0


### 5.4 ANOVA Test

In [22]:
# check anova test for columns_to_be_kept
from scipy.stats import f_oneway

columns_to_be_kept_numerical = []

for col in columns_to_be_kept:
    a = list(df[col])
    b = list(df['approved_flag'])
    
    group_P1 = [value for value, group in zip(a, b) if group == 'P1']
    group_P2 = [value for value, group in zip(a, b) if group == 'P2']
    group_P3 = [value for value, group in zip(a, b) if group == 'P3']
    group_P4 = [value for value, group in zip(a, b) if group == 'P4']
    
    f_statistic, p_value = f_oneway(group_P1, group_P2, group_P3, group_P4)
    
    if p_value <= 0.05:
        columns_to_be_kept_numerical.append(col)
    

In [23]:
len(columns_to_be_kept_numerical)

37

In [26]:
df.head()

Unnamed: 0,prospectid,total_tl,tot_closed_tl,tot_active_tl,total_tl_opened_l6m,tot_tl_closed_l6m,pct_tl_open_l6m,pct_tl_closed_l6m,pct_active_tl,pct_closed_tl,total_tl_opened_l12m,tot_tl_closed_l12m,pct_tl_open_l12m,pct_tl_closed_l12m,tot_missed_pmnt,auto_tl,cc_tl,consumer_tl,gold_tl,home_tl,pl_tl,secured_tl,unsecured_tl,other_tl,age_oldest_tl,age_newest_tl,time_since_recent_payment,num_times_delinquent,max_recent_level_of_deliq,num_deliq_6mts,num_deliq_12mts,num_deliq_6_12mts,num_times_30p_dpd,num_times_60p_dpd,num_std,num_std_6mts,num_std_12mts,num_sub,num_sub_6mts,num_sub_12mts,num_dbt,num_dbt_6mts,num_dbt_12mts,num_lss,num_lss_6mts,num_lss_12mts,recent_level_of_deliq,tot_enq,cc_enq,cc_enq_l6m,cc_enq_l12m,pl_enq,pl_enq_l6m,pl_enq_l12m,time_since_recent_enq,enq_l12m,enq_l6m,enq_l3m,maritalstatus,education,age,gender,netmonthlyincome,time_with_curr_empr,pct_of_active_tls_ever,pct_opened_tls_l6m_of_l12m,pct_currentbal_all_tl,cc_flag,pl_flag,pct_pl_enq_l6m_of_l12m,pct_cc_enq_l6m_of_l12m,pct_pl_enq_l6m_of_ever,pct_cc_enq_l6m_of_ever,hl_flag,gl_flag,last_prod_enq2,first_prod_enq2,credit_score,approved_flag
0,1,5,4,1,0,0,0.0,0.0,0.2,0.8,0,0,0.0,0.0,0,0,0,0,1,0,4,1,4,0,72,18,549,11,29,0,0,0,0,0,21,5,11,0,0,0,0,0,0,0,0,0,29,6,0,0,0,6,0,0,566,0,0,0,Married,12TH,48,M,51000,114,0.2,0.0,0.798,0,1,0.0,0.0,0.0,0.0,1,0,PL,PL,696,P2
1,2,1,0,1,0,0,0.0,0.0,1.0,0.0,1,0,1.0,0.0,0,0,0,1,0,0,0,0,1,0,7,7,47,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,209,1,0,0,Single,GRADUATE,23,F,19000,50,1.0,0.0,0.37,0,0,0.0,0.0,0.0,0.0,0,0,ConsumerLoan,ConsumerLoan,685,P2
2,3,8,0,8,1,0,0.125,0.0,1.0,0.0,2,0,0.25,0.0,1,1,0,6,1,0,0,2,6,0,47,2,302,9,25,1,9,8,0,0,10,5,10,0,0,0,0,0,0,0,0,0,25,4,0,0,0,0,0,0,587,0,0,0,Married,SSC,40,M,18,191,1.0,0.5,0.585,0,0,0.0,0.0,0.0,0.0,1,0,ConsumerLoan,others,693,P2
3,5,3,2,1,0,0,0.0,0.0,0.333,0.667,0,0,0.0,0.0,0,1,0,0,0,0,0,3,0,2,131,32,583,0,0,0,0,0,0,0,53,4,16,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,3951,0,0,0,Married,POST-GRADUATE,48,M,15000,75,0.333,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0,0,AL,AL,753,P1
4,6,6,5,1,0,0,0.0,0.0,0.167,0.833,0,1,0.0,0.167,0,4,0,0,2,0,0,6,0,0,150,17,245,14,270,0,0,0,13,11,5,0,2,3,0,1,0,0,0,0,0,0,26,15,2,0,1,7,3,3,7,6,5,4,Married,12TH,35,M,0,154,0.167,0.0,0.0,0,0,1.0,0.0,0.429,0.0,1,0,ConsumerLoan,PL,668,P3


### 5.5 Encoding

In [28]:
categorical_cols[:5]

['maritalstatus', 'education', 'gender', 'last_prod_enq2', 'first_prod_enq2']

In [29]:
df.maritalstatus.unique()

array(['Married', 'Single'], dtype=object)

In [30]:
df.education.unique()

array(['12TH', 'GRADUATE', 'SSC', 'POST-GRADUATE', 'UNDER GRADUATE',
       'OTHERS', 'PROFESSIONAL'], dtype=object)

In [31]:
df.gender.unique()

array(['M', 'F'], dtype=object)

In [32]:
df.last_prod_enq2.unique()

array(['PL', 'ConsumerLoan', 'AL', 'CC', 'others', 'HL'], dtype=object)

In [33]:
df.first_prod_enq2.unique()

array(['PL', 'ConsumerLoan', 'others', 'AL', 'HL', 'CC'], dtype=object)

In [57]:
df.loc[df['education'] == 'SSC', ['education']] = 1
df.loc[df['education'] == '12TH', ['education']] = 2
df.loc[df['education'] == 'GRADUATE', ['education']] = 3
df.loc[df['education'] == 'UNDER GRADUATE', ['education']] = 3
df.loc[df['education'] == 'POST-GRADUATE', ['education']] = 4
df.loc[df['education'] == 'OTHERS', ['education']] = 1
df.loc[df['education'] == 'PROFESSIONAL', ['education']] = 3 


In [58]:
df.education.value_counts()

education
3    18931
2    11703
1     9532
4     1898
Name: count, dtype: int64

In [59]:
df.education = df.education.astype(int)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42064 entries, 0 to 42063
Data columns (total 79 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   prospectid                  42064 non-null  int64  
 1   total_tl                    42064 non-null  int64  
 2   tot_closed_tl               42064 non-null  int64  
 3   tot_active_tl               42064 non-null  int64  
 4   total_tl_opened_l6m         42064 non-null  int64  
 5   tot_tl_closed_l6m           42064 non-null  int64  
 6   pct_tl_open_l6m             42064 non-null  float64
 7   pct_tl_closed_l6m           42064 non-null  float64
 8   pct_active_tl               42064 non-null  float64
 9   pct_closed_tl               42064 non-null  float64
 10  total_tl_opened_l12m        42064 non-null  int64  
 11  tot_tl_closed_l12m          42064 non-null  int64  
 12  pct_tl_open_l12m            42064 non-null  float64
 13  pct_tl_closed_l12m          420

In [61]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42064 entries, 0 to 42063
Data columns (total 79 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   prospectid                  42064 non-null  int64  
 1   total_tl                    42064 non-null  int64  
 2   tot_closed_tl               42064 non-null  int64  
 3   tot_active_tl               42064 non-null  int64  
 4   total_tl_opened_l6m         42064 non-null  int64  
 5   tot_tl_closed_l6m           42064 non-null  int64  
 6   pct_tl_open_l6m             42064 non-null  float64
 7   pct_tl_closed_l6m           42064 non-null  float64
 8   pct_active_tl               42064 non-null  float64
 9   pct_closed_tl               42064 non-null  float64
 10  total_tl_opened_l12m        42064 non-null  int64  
 11  tot_tl_closed_l12m          42064 non-null  int64  
 12  pct_tl_open_l12m            42064 non-null  float64
 13  pct_tl_closed_l12m          420

In [62]:
df_encoded = pd.get_dummies(df, columns=['maritalstatus', 'gender', 'last_prod_enq2', 'first_prod_enq2'])
df_encoded.shape

(42064, 91)

In [63]:
len(columns_to_be_kept_numerical)

37

In [64]:
df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42064 entries, 0 to 42063
Data columns (total 91 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   prospectid                    42064 non-null  int64  
 1   total_tl                      42064 non-null  int64  
 2   tot_closed_tl                 42064 non-null  int64  
 3   tot_active_tl                 42064 non-null  int64  
 4   total_tl_opened_l6m           42064 non-null  int64  
 5   tot_tl_closed_l6m             42064 non-null  int64  
 6   pct_tl_open_l6m               42064 non-null  float64
 7   pct_tl_closed_l6m             42064 non-null  float64
 8   pct_active_tl                 42064 non-null  float64
 9   pct_closed_tl                 42064 non-null  float64
 10  total_tl_opened_l12m          42064 non-null  int64  
 11  tot_tl_closed_l12m            42064 non-null  int64  
 12  pct_tl_open_l12m              42064 non-null  float64
 13  p

### 5.6 Listing all features

In [65]:
bool_cols = []
for col in df_encoded.columns:
    if df_encoded[col].dtype == "bool":
        bool_cols.append(col)
len(bool_cols)

16

In [66]:
features = columns_to_be_kept_numerical + bool_cols + ["approved_flag"]
features

['pct_tl_open_l6m',
 'pct_tl_closed_l6m',
 'tot_tl_closed_l12m',
 'pct_tl_closed_l12m',
 'tot_missed_pmnt',
 'cc_tl',
 'home_tl',
 'pl_tl',
 'secured_tl',
 'unsecured_tl',
 'other_tl',
 'age_oldest_tl',
 'age_newest_tl',
 'time_since_recent_payment',
 'max_recent_level_of_deliq',
 'num_deliq_6_12mts',
 'num_times_60p_dpd',
 'num_std_12mts',
 'num_sub',
 'num_sub_6mts',
 'num_sub_12mts',
 'num_dbt',
 'num_dbt_12mts',
 'num_lss',
 'recent_level_of_deliq',
 'cc_enq_l12m',
 'pl_enq_l12m',
 'time_since_recent_enq',
 'enq_l3m',
 'netmonthlyincome',
 'time_with_curr_empr',
 'cc_flag',
 'pl_flag',
 'pct_pl_enq_l6m_of_ever',
 'pct_cc_enq_l6m_of_ever',
 'hl_flag',
 'gl_flag',
 'maritalstatus_Married',
 'maritalstatus_Single',
 'gender_F',
 'gender_M',
 'last_prod_enq2_AL',
 'last_prod_enq2_CC',
 'last_prod_enq2_ConsumerLoan',
 'last_prod_enq2_HL',
 'last_prod_enq2_PL',
 'last_prod_enq2_others',
 'first_prod_enq2_AL',
 'first_prod_enq2_CC',
 'first_prod_enq2_ConsumerLoan',
 'first_prod_enq2_HL',


In [67]:
len(features)

54

In [68]:
df_encoded = df_encoded[features]
df_encoded

Unnamed: 0,pct_tl_open_l6m,pct_tl_closed_l6m,tot_tl_closed_l12m,pct_tl_closed_l12m,tot_missed_pmnt,cc_tl,home_tl,pl_tl,secured_tl,unsecured_tl,other_tl,age_oldest_tl,age_newest_tl,time_since_recent_payment,max_recent_level_of_deliq,num_deliq_6_12mts,num_times_60p_dpd,num_std_12mts,num_sub,num_sub_6mts,num_sub_12mts,num_dbt,num_dbt_12mts,num_lss,recent_level_of_deliq,cc_enq_l12m,pl_enq_l12m,time_since_recent_enq,enq_l3m,netmonthlyincome,time_with_curr_empr,cc_flag,pl_flag,pct_pl_enq_l6m_of_ever,pct_cc_enq_l6m_of_ever,hl_flag,gl_flag,maritalstatus_Married,maritalstatus_Single,gender_F,gender_M,last_prod_enq2_AL,last_prod_enq2_CC,last_prod_enq2_ConsumerLoan,last_prod_enq2_HL,last_prod_enq2_PL,last_prod_enq2_others,first_prod_enq2_AL,first_prod_enq2_CC,first_prod_enq2_ConsumerLoan,first_prod_enq2_HL,first_prod_enq2_PL,first_prod_enq2_others,approved_flag
0,0.000,0.00,0,0.000,0,0,0,4,1,4,0,72,18,549,29,0,0,11,0,0,0,0,0,0,29,0,0,566,0,51000,114,0,1,0.000,0.0,1,0,True,False,False,True,False,False,False,False,True,False,False,False,False,False,True,False,P2
1,0.000,0.00,0,0.000,0,0,0,0,0,1,0,7,7,47,0,0,0,0,0,0,0,0,0,0,0,0,0,209,0,19000,50,0,0,0.000,0.0,0,0,False,True,True,False,False,False,True,False,False,False,False,False,True,False,False,False,P2
2,0.125,0.00,0,0.000,1,0,0,0,2,6,0,47,2,302,25,8,0,10,0,0,0,0,0,0,25,0,0,587,0,18,191,0,0,0.000,0.0,1,0,True,False,False,True,False,False,True,False,False,False,False,False,False,False,False,True,P2
3,0.000,0.00,0,0.000,0,0,0,0,3,0,2,131,32,583,0,0,0,16,0,0,0,0,0,0,0,0,0,3951,0,15000,75,0,0,0.000,0.0,0,0,True,False,False,True,True,False,False,False,False,False,True,False,False,False,False,False,P1
4,0.000,0.00,1,0.167,0,0,0,0,6,0,0,150,17,245,270,0,11,2,3,0,1,0,0,0,26,1,3,7,4,0,154,0,0,0.429,0.0,1,0,True,False,False,True,False,False,True,False,False,False,False,False,False,False,True,False,P3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42059,0.333,0.00,0,0.000,0,0,0,0,0,3,1,24,5,15,24,0,0,0,0,0,0,0,0,0,24,0,0,0,1,18500,249,0,0,0.000,0.0,0,0,True,False,False,True,False,False,True,False,False,False,False,False,True,False,False,False,P4
42060,0.000,0.25,1,0.250,0,0,0,0,2,2,0,74,7,57,0,0,0,6,0,0,0,0,0,0,0,0,0,203,0,25000,186,0,0,0.000,0.0,0,0,True,False,False,True,False,False,False,False,False,True,False,False,False,False,False,True,P1
42061,0.500,0.50,1,0.500,0,0,0,0,0,2,0,9,5,32,0,0,0,0,0,0,0,0,0,0,0,0,2,1,2,18000,66,0,0,1.000,0.0,0,0,True,False,False,True,False,False,True,False,False,False,False,False,False,False,False,True,P3
42062,0.000,0.00,1,0.500,0,0,0,0,0,2,0,15,8,58,0,0,0,0,0,0,0,0,0,0,0,0,0,242,0,12802,54,0,0,0.000,0.0,0,0,False,True,True,False,False,False,True,False,False,False,False,False,False,False,False,True,P2


## 6. Exporting the dataset

In [69]:
def export_data(X, y, name):
    file_name = f"{name}.csv"
    file_path = os.path.join(PROJECT_DIR, DATA_DIR, file_name)
    
    X.join(y).to_csv(file_path, index=False)
    
    return pd.read_csv(file_path).head()

In [70]:
X = df_encoded.drop(columns="approved_flag")
y = df_encoded.approved_flag.copy()

In [71]:
X_, X_test, y_, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_, y_, test_size=0.2, random_state=42)

print(X_train.shape, y_train.shape)
print(X_val.shape, y_val.shape)
print(X_test.shape, y_test.shape)

(26920, 53) (26920,)
(6731, 53) (6731,)
(8413, 53) (8413,)


In [72]:
export_data(X_train, y_train, "train")

Unnamed: 0,pct_tl_open_l6m,pct_tl_closed_l6m,tot_tl_closed_l12m,pct_tl_closed_l12m,tot_missed_pmnt,cc_tl,home_tl,pl_tl,secured_tl,unsecured_tl,other_tl,age_oldest_tl,age_newest_tl,time_since_recent_payment,max_recent_level_of_deliq,num_deliq_6_12mts,num_times_60p_dpd,num_std_12mts,num_sub,num_sub_6mts,num_sub_12mts,num_dbt,num_dbt_12mts,num_lss,recent_level_of_deliq,cc_enq_l12m,pl_enq_l12m,time_since_recent_enq,enq_l3m,netmonthlyincome,time_with_curr_empr,cc_flag,pl_flag,pct_pl_enq_l6m_of_ever,pct_cc_enq_l6m_of_ever,hl_flag,gl_flag,maritalstatus_Married,maritalstatus_Single,gender_F,gender_M,last_prod_enq2_AL,last_prod_enq2_CC,last_prod_enq2_ConsumerLoan,last_prod_enq2_HL,last_prod_enq2_PL,last_prod_enq2_others,first_prod_enq2_AL,first_prod_enq2_CC,first_prod_enq2_ConsumerLoan,first_prod_enq2_HL,first_prod_enq2_PL,first_prod_enq2_others,approved_flag
0,0.0,0.0,0,0.0,0,0,0,0,1,0,0,37,37,645,0,0,0,0,0,0,0,0,0,0,0,0,0,1100,0,30000,128,0,0,0.0,0.0,0,0,False,True,False,True,False,False,False,False,False,True,False,False,False,False,False,True,P2
1,1.0,0.0,0,0.0,0,0,0,0,1,0,0,4,4,43,0,0,0,0,0,0,0,0,0,0,0,0,0,69,2,50000,38,0,0,0.0,0.0,0,0,False,True,False,True,False,False,True,False,False,False,True,False,False,False,False,False,P2
2,0.0,0.5,2,0.5,0,0,0,0,2,2,1,42,18,73,25,1,0,0,0,0,0,0,0,0,7,0,0,555,0,14500,58,0,0,0.0,0.0,0,0,True,False,False,True,False,False,False,False,False,True,False,False,False,False,False,True,P2
3,0.0,0.0,1,0.333,0,0,0,0,0,3,2,10,8,62,32,3,1,0,0,0,0,0,0,0,1,2,0,1,1,22000,120,0,0,0.0,0.0,0,0,False,True,False,True,False,False,True,False,False,False,False,False,False,False,False,True,P3
4,0.4,0.0,0,0.0,2,0,0,0,4,1,0,38,2,420,0,0,0,0,0,0,0,0,0,0,0,0,0,59,1,15000,191,0,0,0.0,0.0,1,0,True,False,False,True,False,False,False,False,False,True,False,False,False,False,False,True,P2


In [73]:
export_data(X_val, y_val, "val")

Unnamed: 0,pct_tl_open_l6m,pct_tl_closed_l6m,tot_tl_closed_l12m,pct_tl_closed_l12m,tot_missed_pmnt,cc_tl,home_tl,pl_tl,secured_tl,unsecured_tl,other_tl,age_oldest_tl,age_newest_tl,time_since_recent_payment,max_recent_level_of_deliq,num_deliq_6_12mts,num_times_60p_dpd,num_std_12mts,num_sub,num_sub_6mts,num_sub_12mts,num_dbt,num_dbt_12mts,num_lss,recent_level_of_deliq,cc_enq_l12m,pl_enq_l12m,time_since_recent_enq,enq_l3m,netmonthlyincome,time_with_curr_empr,cc_flag,pl_flag,pct_pl_enq_l6m_of_ever,pct_cc_enq_l6m_of_ever,hl_flag,gl_flag,maritalstatus_Married,maritalstatus_Single,gender_F,gender_M,last_prod_enq2_AL,last_prod_enq2_CC,last_prod_enq2_ConsumerLoan,last_prod_enq2_HL,last_prod_enq2_PL,last_prod_enq2_others,first_prod_enq2_AL,first_prod_enq2_CC,first_prod_enq2_ConsumerLoan,first_prod_enq2_HL,first_prod_enq2_PL,first_prod_enq2_others,approved_flag
0,0.0,0.0,0,0.0,0,0,0,1,0,3,1,42,8,106,0,0,0,10,0,0,0,0,0,0,0,0,1,252,0,55000,310,0,1,0.0,0.0,0,0,True,False,False,True,False,False,False,False,True,False,False,False,False,False,True,False,P2
1,0.0,0.0,0,0.0,0,0,0,0,2,0,0,105,50,845,0,0,0,0,0,0,0,0,0,0,0,0,0,1517,0,20000,60,0,0,0.0,0.0,0,0,True,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,P1
2,0.0,0.5,1,0.5,0,0,0,0,0,2,0,21,10,83,0,0,0,0,0,0,0,0,0,0,0,0,0,648,0,14000,71,0,0,0.0,0.0,0,0,True,False,True,False,False,False,True,False,False,False,False,False,True,False,False,False,P2
3,0.0,0.0,0,0.0,1,1,0,0,3,1,0,77,9,35,30,0,0,0,0,0,0,0,0,0,30,1,0,39,1,37500,130,1,0,0.0,1.0,1,0,False,True,False,True,False,True,False,False,False,False,True,False,False,False,False,False,P2
4,0.0,0.0,0,0.0,0,1,0,0,17,1,0,109,17,43,52,0,1,0,0,0,0,0,0,0,3,0,1,52,1,18000,308,1,0,1.0,0.0,1,0,True,False,False,True,False,False,False,False,True,False,False,False,False,False,False,True,P2


In [74]:
export_data(X_test, y_test, "test")

Unnamed: 0,pct_tl_open_l6m,pct_tl_closed_l6m,tot_tl_closed_l12m,pct_tl_closed_l12m,tot_missed_pmnt,cc_tl,home_tl,pl_tl,secured_tl,unsecured_tl,other_tl,age_oldest_tl,age_newest_tl,time_since_recent_payment,max_recent_level_of_deliq,num_deliq_6_12mts,num_times_60p_dpd,num_std_12mts,num_sub,num_sub_6mts,num_sub_12mts,num_dbt,num_dbt_12mts,num_lss,recent_level_of_deliq,cc_enq_l12m,pl_enq_l12m,time_since_recent_enq,enq_l3m,netmonthlyincome,time_with_curr_empr,cc_flag,pl_flag,pct_pl_enq_l6m_of_ever,pct_cc_enq_l6m_of_ever,hl_flag,gl_flag,maritalstatus_Married,maritalstatus_Single,gender_F,gender_M,last_prod_enq2_AL,last_prod_enq2_CC,last_prod_enq2_ConsumerLoan,last_prod_enq2_HL,last_prod_enq2_PL,last_prod_enq2_others,first_prod_enq2_AL,first_prod_enq2_CC,first_prod_enq2_ConsumerLoan,first_prod_enq2_HL,first_prod_enq2_PL,first_prod_enq2_others,approved_flag
0,0.667,0.0,0,0.0,2,0,0,1,2,1,0,18,2,53,0,0,0,2,0,0,0,0,0,0,0,0,1,46,1,20000,154,0,1,1.0,0.0,1,0,True,False,False,True,False,False,False,False,True,False,False,False,False,False,False,True,P3
1,0.0,0.0,0,0.0,0,0,0,0,0,1,0,6,6,55,0,0,0,0,0,0,0,0,0,0,0,0,4,41,3,23000,54,0,0,0.5,0.0,0,0,False,True,False,True,False,False,False,False,True,False,False,False,False,False,True,False,P3
2,0.0,0.0,0,0.0,0,0,0,0,3,0,0,29,20,459,0,0,0,0,0,0,0,0,0,0,0,0,0,8,1,22000,124,0,0,0.0,0.0,1,0,True,False,False,True,False,False,True,False,False,False,False,False,True,False,False,False,P2
3,0.667,0.0,0,0.0,0,0,0,0,0,3,0,7,4,51,0,0,0,0,0,0,0,0,0,0,0,0,3,1,4,30000,66,0,0,1.0,0.0,0,0,True,False,False,True,False,False,False,False,True,False,False,False,False,False,False,True,P4
4,0.0,0.0,0,0.0,0,0,0,0,1,1,0,58,31,715,25,0,0,0,0,0,0,0,0,0,9,0,0,51,1,25000,122,0,0,0.0,0.0,0,0,True,False,False,True,False,False,False,False,False,True,False,False,False,False,False,True,P4
