## Loading and Setup

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2, f_classif
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
# Filter warnings
from warnings import filterwarnings
filterwarnings('ignore')

  from pandas import MultiIndex, Int64Index


In [2]:
# Load the dataset
df = pd.read_parquet('dataset/LoanData_after_EDA.parquet')

## Assessment

In [3]:
# Print the shape of the data
df.shape

(20834, 106)

In [4]:
# # Print the first 5 rows of the dataset
df.head()

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,...,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,disbursement_method,debt_settlement_flag
0,3600.0,3600.0,3600.0,36 months,13.99,123.03,C,C4,leadman,10+ years,...,0.0,0.0,0.0,178050.0,7746.0,2400.0,13734.0,N,Cash,N
1,24700.0,24700.0,24700.0,36 months,11.99,820.28,C,C1,Engineer,10+ years,...,7.7,0.0,0.0,314017.0,39475.0,79300.0,24667.0,N,Cash,N
3,35000.0,35000.0,35000.0,60 months,14.85,829.9,C,C5,Information Systems Officer,10+ years,...,0.0,0.0,0.0,381215.0,52226.0,62500.0,18000.0,N,Cash,N
4,10400.0,10400.0,10400.0,60 months,22.45,289.91,F,F1,Contract Specialist,3 years,...,60.0,0.0,0.0,439570.0,95768.0,20300.0,88097.0,N,Cash,N
5,11950.0,11950.0,11950.0,36 months,13.44,405.18,C,C3,Veterinary Tecnician,4 years,...,100.0,0.0,0.0,16900.0,12798.0,9400.0,4000.0,N,Cash,N


In [5]:
# Display the info of the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20834 entries, 0 to 22509
Columns: 106 entries, loan_amnt to debt_settlement_flag
dtypes: float64(83), object(23)
memory usage: 17.0+ MB


In [6]:
# Get a statistical summary of the dataset
df.describe()

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,int_rate,installment,annual_inc,dti,delinq_2yrs,fico_range_low,fico_range_high,...,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit
count,20834.0,20834.0,20834.0,20834.0,20834.0,20834.0,20834.0,20834.0,20834.0,20834.0,...,20834.0,20834.0,20834.0,20834.0,20834.0,20834.0,20834.0,20834.0,20834.0,20834.0
mean,15208.43333,15208.43333,15202.132332,12.233967,441.169325,79989.85,19.303227,0.338005,694.9988,698.998896,...,0.088125,2.273207,93.970606,44.925623,0.12854,0.068926,182368.6,55289.892771,22926.701546,47316.996208
std,8757.840042,8757.840042,8752.917757,4.205992,250.4378,62420.66,8.761508,0.886107,31.104621,31.105086,...,0.495271,1.915496,8.874502,35.962074,0.375008,0.443356,174956.3,50137.987566,22384.669279,45418.084457
min,1000.0,1000.0,1000.0,5.32,30.54,3800.0,0.0,0.0,660.0,664.0,...,0.0,0.0,12.5,0.0,0.0,0.0,2500.0,0.0,0.0,0.0
25%,8000.0,8000.0,8000.0,9.17,257.39,50000.0,12.69,0.0,670.0,674.0,...,0.0,1.0,90.9,11.1,0.0,0.0,55601.75,24575.75,8200.0,18261.0
50%,14000.0,14000.0,14000.0,11.99,383.83,69000.0,18.785,0.0,690.0,694.0,...,0.0,2.0,97.4,42.9,0.0,0.0,122471.0,42364.0,16100.0,36519.5
75%,20225.0,20225.0,20193.75,14.48,589.2425,96000.0,25.55,0.0,710.0,714.0,...,0.0,3.0,100.0,75.0,0.0,0.0,263193.5,69858.5,29875.0,63325.75
max,35000.0,35000.0,35000.0,28.99,1354.66,3964280.0,136.97,15.0,845.0,850.0,...,13.0,25.0,100.0,100.0,6.0,22.0,2388482.0,878659.0,303000.0,591177.0


## Combination of Features

### Split the data into a training set and a test set

This is done before any preprocessing or feature selection. It's important to split the data first to avoid data leakage.

In [7]:
# Extract the target variable 'y' from the DataFrame 'df'
target_column = 'loan_status'
y = df[target_column]

In [8]:
# Identify the numerical columns
num_cols = df.select_dtypes(include=np.number).columns.tolist()

# Display the list of numerical columns
num_cols

['loan_amnt',
 'funded_amnt',
 'funded_amnt_inv',
 'int_rate',
 'installment',
 'annual_inc',
 'dti',
 'delinq_2yrs',
 'fico_range_low',
 'fico_range_high',
 'inq_last_6mths',
 'open_acc',
 'pub_rec',
 'revol_bal',
 'revol_util',
 'total_acc',
 'out_prncp',
 'out_prncp_inv',
 'total_pymnt',
 'total_pymnt_inv',
 'total_rec_prncp',
 'total_rec_int',
 'total_rec_late_fee',
 'recoveries',
 'collection_recovery_fee',
 'last_pymnt_amnt',
 'last_fico_range_high',
 'last_fico_range_low',
 'collections_12_mths_ex_med',
 'policy_code',
 'acc_now_delinq',
 'tot_coll_amt',
 'tot_cur_bal',
 'open_acc_6m',
 'open_act_il',
 'open_il_12m',
 'open_il_24m',
 'mths_since_rcnt_il',
 'total_bal_il',
 'il_util',
 'open_rv_12m',
 'open_rv_24m',
 'max_bal_bc',
 'all_util',
 'total_rev_hi_lim',
 'inq_fi',
 'total_cu_tl',
 'inq_last_12m',
 'acc_open_past_24mths',
 'avg_cur_bal',
 'bc_open_to_buy',
 'bc_util',
 'chargeoff_within_12_mths',
 'delinq_amnt',
 'mo_sin_old_il_acct',
 'mo_sin_old_rev_tl_op',
 'mo_sin_r

In [9]:
# Identify the categorical columns (excluding the target variable)
cat_cols = df.select_dtypes(include='object').drop(columns=[target_column]).columns.tolist()

# Display the list of categorical columns
cat_cols

['term',
 'grade',
 'sub_grade',
 'emp_title',
 'emp_length',
 'home_ownership',
 'verification_status',
 'issue_d',
 'pymnt_plan',
 'url',
 'purpose',
 'title',
 'zip_code',
 'addr_state',
 'earliest_cr_line',
 'initial_list_status',
 'last_pymnt_d',
 'last_credit_pull_d',
 'application_type',
 'hardship_flag',
 'disbursement_method',
 'debt_settlement_flag']

In [10]:
# Split the data into train and test datasets first (80% train, 20% test)
X_train_num, X_test_num, y_train, y_test = train_test_split(df[num_cols], y, test_size=0.2, random_state=42, stratify=y)
X_train_cat, X_test_cat = train_test_split(df[cat_cols], test_size=0.2, random_state=42)

# Reset the index for the train datasets
X_train_num.reset_index(drop=True, inplace=True)
X_train_cat.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)

# Reset the index for the test datasets
X_test_num.reset_index(drop=True, inplace=True)
X_test_cat.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)

### Numerical Columns

In [11]:
# Count the number of numerical columns
num_numerical_columns = len(num_cols)

# Display the count
print(f'Number of numerical columns: {num_numerical_columns}')

Number of numerical columns: 83


In [12]:
# Initialize SelectKBest with f_classif as the score function to extract top 50 best numerical features
selector_num = SelectKBest(score_func=f_classif, k=50)

# Fit the selector to the training data for numerical columns and transform the training data to get the top 50 features
X_train_num_new = selector_num.fit_transform(X_train_num, y_train)

# Transform the test data using the same selector
X_test_num_new = selector_num.transform(X_test_num)

# Get the indices of the selected features for numerical columns
selected_feature_indices_num = selector_num.get_support(indices=True)

# Get the names of the selected features for numerical columns
selected_features_num = X_train_num.columns[selected_feature_indices_num].tolist()

# Display the selected features in a well-formatted list
print("Selected features:")
for idx, feature in enumerate(selected_features_num, start=1):
    print(f"{idx}. {feature}")

Selected features:
1. loan_amnt
2. funded_amnt
3. funded_amnt_inv
4. int_rate
5. annual_inc
6. dti
7. fico_range_low
8. fico_range_high
9. inq_last_6mths
10. out_prncp
11. out_prncp_inv
12. total_pymnt
13. total_pymnt_inv
14. total_rec_prncp
15. total_rec_late_fee
16. recoveries
17. collection_recovery_fee
18. last_pymnt_amnt
19. last_fico_range_high
20. last_fico_range_low
21. tot_cur_bal
22. open_acc_6m
23. open_il_12m
24. open_il_24m
25. mths_since_rcnt_il
26. il_util
27. open_rv_12m
28. open_rv_24m
29. max_bal_bc
30. all_util
31. total_rev_hi_lim
32. inq_fi
33. inq_last_12m
34. acc_open_past_24mths
35. avg_cur_bal
36. bc_open_to_buy
37. bc_util
38. mo_sin_old_rev_tl_op
39. mo_sin_rcnt_rev_tl_op
40. mo_sin_rcnt_tl
41. mort_acc
42. mths_since_recent_bc
43. mths_since_recent_inq
44. num_actv_rev_tl
45. num_rev_tl_bal_gt_0
46. num_tl_op_past_12m
47. percent_bc_gt_75
48. pub_rec_bankruptcies
49. tot_hi_cred_lim
50. total_bc_limit


- Recoveries: post charge off gross recovery
    * "Recoveries" refer to the post-charge off gross recovery amount. In the context of lending or debt collection, a "charge off" occurs when a lender declares a debt as unlikely to be collected and removes it from their books as a loss. After a charge off, the lender may attempt to recover some or all of the debt through various means, such as selling the debt to a collection agency or negotiating a payment plan with the borrower. The amount recovered from the borrower after the charge off is referred to as "Recoveries."
- collection_recovery_fee: post charge off collection fee
    * "Collection Recovery Fee" is the post-charge off collection fee. When a debt is in collections after a charge off, collection agencies or debt collectors may charge a fee for their efforts to collect the outstanding debt from the borrower. This fee is called the "Collection Recovery Fee" and is typically a percentage of the amount collected.
- last_fico_range_high: The upper boundary range the borrower‚ as last FICO pulled belongs to.
- last_fico_range_low: The lower boundary range the borrower‚ as last FICO pulled belongs to.
    * "Last FICO Range High" and "Last FICO Range Low" refer to the upper and lower boundaries, respectively, of the range of the borrower's FICO (Fair Isaac Corporation) credit score at the time of the last FICO score update. The FICO score is a widely used credit scoring model that assesses an individual's creditworthiness based on their credit history and financial behavior. The credit score typically falls within a range, and the FICO score provided by LendingClub (or any other financial institution) is represented as an interval, with the "Last FICO Range High" denoting the upper end of the range and the "Last FICO Range Low" representing the lower end.
    * For example, if a borrower's last FICO score update provided by LendingClub was within a range of 680 to 700, "Last FICO Range High" would be 700, and "Last FICO Range Low" would be 680. These FICO score ranges are used to provide some level of privacy to borrowers while still giving potential lenders or investors an idea of the borrower's creditworthiness.

### Categorical Columns

In [13]:
# Count the number of numerical columns
num_categorical_columns = len(cat_cols)

# Display the count
print(f'Number of categorical columns: {num_categorical_columns}')

Number of categorical columns: 22


#### Train Data

In [14]:
# check number of unique values in each categorical column in train data
X_train_cat[cat_cols].nunique()

term                        2
grade                       7
sub_grade                  35
emp_title                8995
emp_length                 11
home_ownership              3
verification_status         3
issue_d                     1
pymnt_plan                  1
url                     16667
purpose                    12
title                      12
zip_code                  820
addr_state                 49
earliest_cr_line          509
initial_list_status         2
last_pymnt_d               40
last_credit_pull_d         40
application_type            2
hardship_flag               1
disbursement_method         1
debt_settlement_flag        2
dtype: int64

In [15]:
# drop categorical columns with too many unique values
X_train_cat.drop(columns=['sub_grade', 'emp_title', 'url', 'zip_code', 'addr_state', 'earliest_cr_line', 'last_pymnt_d', 'last_credit_pull_d'], inplace=True)

# display which categorical columns were left
cat_cols_train = X_train_cat.select_dtypes(include='object').columns.tolist()
cat_cols_train

['term',
 'grade',
 'emp_length',
 'home_ownership',
 'verification_status',
 'issue_d',
 'pymnt_plan',
 'purpose',
 'title',
 'initial_list_status',
 'application_type',
 'hardship_flag',
 'disbursement_method',
 'debt_settlement_flag']

In [16]:
# check number of unique values in each categorical column
X_train_cat[cat_cols_train].nunique()

term                     2
grade                    7
emp_length              11
home_ownership           3
verification_status      3
issue_d                  1
pymnt_plan               1
purpose                 12
title                   12
initial_list_status      2
application_type         2
hardship_flag            1
disbursement_method      1
debt_settlement_flag     2
dtype: int64

In [17]:
# Get the value counts in each categorical column
for col in cat_cols_train:
    value_counts = X_train_cat[col].value_counts()
    print(f"Value Counts for {col}:\n{value_counts}\n")

Value Counts for term:
 36 months    11601
 60 months     5066
Name: term, dtype: int64

Value Counts for grade:
B    5092
C    4882
A    2934
D    2260
E    1104
F     337
G      58
Name: grade, dtype: int64

Value Counts for emp_length:
10+ years    5825
< 1 year     1623
2 years      1562
3 years      1447
1 year       1162
5 years      1096
4 years      1026
8 years       896
6 years       724
9 years       657
7 years       649
Name: emp_length, dtype: int64

Value Counts for home_ownership:
MORTGAGE    8209
RENT        6629
OWN         1829
Name: home_ownership, dtype: int64

Value Counts for verification_status:
Source Verified    6875
Not Verified       5066
Verified           4726
Name: verification_status, dtype: int64

Value Counts for issue_d:
Dec-2015    16667
Name: issue_d, dtype: int64

Value Counts for pymnt_plan:
n    16667
Name: pymnt_plan, dtype: int64

Value Counts for purpose:
debt_consolidation    9498
credit_card           4171
home_improvement       931
other   

In [18]:
# Shows the distribution of the features in the dataset
X_train_cat.select_dtypes('object')

Unnamed: 0,term,grade,emp_length,home_ownership,verification_status,issue_d,pymnt_plan,purpose,title,initial_list_status,application_type,hardship_flag,disbursement_method,debt_settlement_flag
0,36 months,C,1 year,MORTGAGE,Verified,Dec-2015,n,debt_consolidation,Debt consolidation,w,Individual,N,Cash,N
1,36 months,D,< 1 year,OWN,Verified,Dec-2015,n,other,Other,w,Individual,N,Cash,N
2,36 months,B,10+ years,OWN,Not Verified,Dec-2015,n,other,Other,w,Individual,N,Cash,N
3,36 months,C,9 years,OWN,Source Verified,Dec-2015,n,debt_consolidation,Debt consolidation,w,Individual,N,Cash,N
4,60 months,E,8 years,MORTGAGE,Source Verified,Dec-2015,n,debt_consolidation,Debt consolidation,w,Individual,N,Cash,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16662,60 months,D,10+ years,MORTGAGE,Source Verified,Dec-2015,n,debt_consolidation,Debt consolidation,w,Individual,N,Cash,N
16663,36 months,C,2 years,RENT,Source Verified,Dec-2015,n,debt_consolidation,Debt consolidation,w,Individual,N,Cash,N
16664,36 months,D,3 years,RENT,Source Verified,Dec-2015,n,small_business,Business,w,Individual,N,Cash,N
16665,36 months,B,10+ years,RENT,Not Verified,Dec-2015,n,debt_consolidation,Debt consolidation,w,Individual,N,Cash,N


#### Test Data

In [19]:
# check number of unique values in each categorical column in test data
X_test_cat[cat_cols].nunique()

term                       2
grade                      7
sub_grade                 35
emp_title               2793
emp_length                11
home_ownership             3
verification_status        3
issue_d                    1
pymnt_plan                 1
url                     4167
purpose                   12
title                     12
zip_code                 696
addr_state                49
earliest_cr_line         435
initial_list_status        2
last_pymnt_d              40
last_credit_pull_d        40
application_type           2
hardship_flag              1
disbursement_method        1
debt_settlement_flag       2
dtype: int64

In [20]:
# drop categorical columns with too many unique values
X_test_cat.drop(columns=['sub_grade', 'emp_title', 'url', 'zip_code', 'addr_state', 'earliest_cr_line', 'last_pymnt_d', 'last_credit_pull_d'], inplace=True)

# display which categorical columns were left
cat_cols_test = X_test_cat.select_dtypes(include='object').columns.tolist()
cat_cols_test

['term',
 'grade',
 'emp_length',
 'home_ownership',
 'verification_status',
 'issue_d',
 'pymnt_plan',
 'purpose',
 'title',
 'initial_list_status',
 'application_type',
 'hardship_flag',
 'disbursement_method',
 'debt_settlement_flag']

- term: The number of payments on the loan. Values are in months and can be either 36 or 60.
    * The "term" refers to the number of payments on the loan. It is expressed in months and can have two possible values: 36 or 60. This indicates the duration over which the borrower is expected to repay the loan in fixed installments.
- grade: LC assigned loan grade
    * "Grade" refers to the loan grade assigned by LC (LendingClub) or the lending institution. It is used to represent the credit quality of the borrower. Generally, grades are represented by letters (e.g., A, B, C) with A being the highest credit quality and representing lower risk borrowers, while lower grades represent higher risk borrowers.
- emp_length: Employment length in years. Possible values are between 0 and 10 where 0 means less than one year and 10 means ten or more years. 
    * "Employment length" indicates the number of years of employment of the borrower. Possible values range from 0 to 10, where 0 means less than one year of employment, and 10 means ten or more years of employment.
- home_ownership: The home ownership status provided by the borrower during registration. The values are: RENT, OWN, MORTGAGE, OTHER.
    * "Home ownership" represents the home ownership status provided by the borrower during registration. The possible values are RENT, OWN, MORTGAGE, or OTHER, indicating whether the borrower is renting, owning, mortgaging, or has other arrangements related to their residence.
- verification_status: Indicates if income was verified by LC, not verified, or if the income source was verified
    * "Verification status" indicates whether the borrower's income was verified by LC or the lending institution. The possible values are Verified, Not Verified, or Source Verified.
- issue_d: The month which the loan was funded
    * "Issue date" refers to the month in which the loan was funded or issued to the borrower.
- loan_status (target variable): Current status of the loan: Paid, Default.
- pymnt_plan: Indicates if a payment plan has been put in place for the loan
- purpose: A category provided by the borrower for the loan request. 
    * It indicates the reason or intended use of the loan amount.
- title: The loan title provided by the borrower
    * It provides additional information about the purpose of the loan.
- initial_list_status: The initial listing status of the loan. Possible values are: W, F
    * The "initial listing status" indicates whether the loan was initially listed as Whole Loan (W) or Fractional Loan (F) on the lending platform.
- application_type: Indicates whether the loan is an individual application or a joint application with two co-borrowers
- hardship_flag: Indicates whether the borrower is under a hardship plan.
    * A hardship plan, also known as a financial hardship plan, is an arrangement or agreement made between a borrower and a lender to temporarily modify the terms of a loan or debt repayment due to the borrower's financial difficulties. The purpose of a hardship plan is to provide relief to borrowers who are facing financial hardship and are struggling to meet their loan or debt obligations.
- disbursement_method: Represents the method used to disburse the loan amount to the borrower.
- debt_settlement_flag: Indicates whether a debt settlement plan has been put in place for the loan.
    * A debt settlement plan, also known as debt settlement or debt negotiation, is a strategy used by individuals or businesses to settle their outstanding debts with creditors for less than the full amount owed. 
    * In a debt settlement plan, the debtor (the individual or business owing the debt) negotiates with their creditors to reach an agreement on a reduced lump-sum payment or a structured payment plan to settle the debt. This negotiated settlement amount is typically less than the total outstanding debt owed.

In [21]:
# check number of unique values in each categorical column
X_test_cat[cat_cols_test].nunique()

term                     2
grade                    7
emp_length              11
home_ownership           3
verification_status      3
issue_d                  1
pymnt_plan               1
purpose                 12
title                   12
initial_list_status      2
application_type         2
hardship_flag            1
disbursement_method      1
debt_settlement_flag     2
dtype: int64

In [22]:
# Get the value counts in each categorical column
for col in cat_cols_test:
    value_counts = X_test_cat[col].value_counts()
    print(f"Value Counts for {col}:\n{value_counts}\n")

Value Counts for term:
 36 months    2815
 60 months    1352
Name: term, dtype: int64

Value Counts for grade:
B    1246
C    1214
A     725
D     567
E     315
F      72
G      28
Name: grade, dtype: int64

Value Counts for emp_length:
10+ years    1503
< 1 year      426
2 years       382
3 years       348
5 years       287
1 year        272
8 years       247
4 years       237
6 years       181
9 years       146
7 years       138
Name: emp_length, dtype: int64

Value Counts for home_ownership:
MORTGAGE    2080
RENT        1631
OWN          456
Name: home_ownership, dtype: int64

Value Counts for verification_status:
Source Verified    1725
Not Verified       1257
Verified           1185
Name: verification_status, dtype: int64

Value Counts for issue_d:
Dec-2015    4167
Name: issue_d, dtype: int64

Value Counts for pymnt_plan:
n    4167
Name: pymnt_plan, dtype: int64

Value Counts for purpose:
debt_consolidation    2386
credit_card           1022
home_improvement       238
other       

In [23]:
# Shows the distribution of the features in the dataset
X_test_cat.select_dtypes('object')

Unnamed: 0,term,grade,emp_length,home_ownership,verification_status,issue_d,pymnt_plan,purpose,title,initial_list_status,application_type,hardship_flag,disbursement_method,debt_settlement_flag
0,60 months,B,3 years,RENT,Source Verified,Dec-2015,n,debt_consolidation,Credit card refinancing,w,Individual,N,Cash,N
1,60 months,E,7 years,RENT,Source Verified,Dec-2015,n,debt_consolidation,Debt consolidation,w,Individual,N,Cash,N
2,36 months,B,10+ years,RENT,Verified,Dec-2015,n,debt_consolidation,Debt consolidation,w,Individual,N,Cash,N
3,60 months,D,2 years,RENT,Source Verified,Dec-2015,n,debt_consolidation,Debt consolidation,f,Individual,N,Cash,Y
4,36 months,B,9 years,RENT,Not Verified,Dec-2015,n,debt_consolidation,Debt consolidation,w,Individual,N,Cash,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4162,36 months,C,8 years,RENT,Source Verified,Dec-2015,n,debt_consolidation,Debt consolidation,w,Individual,N,Cash,N
4163,60 months,E,10+ years,OWN,Verified,Dec-2015,n,debt_consolidation,Debt consolidation,w,Individual,N,Cash,N
4164,36 months,A,2 years,MORTGAGE,Source Verified,Dec-2015,n,home_improvement,Home improvement,w,Individual,N,Cash,N
4165,36 months,A,10+ years,MORTGAGE,Not Verified,Dec-2015,n,debt_consolidation,Debt consolidation,w,Individual,N,Cash,N


#### Preprocessing Step

In [24]:
# Instantiate the OneHotEncoder
ohe = OneHotEncoder()

# Perform one-hot encoding using OneHotEncoder on the training data
encoded_features_train = ohe.fit_transform(X_train_cat)

# Transform the test data using the same encoder
encoded_features_test = ohe.transform(X_test_cat)

# Create new dataframes with the encoded features
X_train_encoded = pd.DataFrame(encoded_features_train.toarray(), columns=ohe.get_feature_names_out(), dtype=int)
X_test_encoded = pd.DataFrame(encoded_features_test.toarray(), columns=ohe.get_feature_names_out(), dtype=int)

In [25]:
# Inspect the encoding results on the training set
X_train_encoded.isna().sum()

term_ 36 months                        0
term_ 60 months                        0
grade_A                                0
grade_B                                0
grade_C                                0
grade_D                                0
grade_E                                0
grade_F                                0
grade_G                                0
emp_length_1 year                      0
emp_length_10+ years                   0
emp_length_2 years                     0
emp_length_3 years                     0
emp_length_4 years                     0
emp_length_5 years                     0
emp_length_6 years                     0
emp_length_7 years                     0
emp_length_8 years                     0
emp_length_9 years                     0
emp_length_< 1 year                    0
home_ownership_MORTGAGE                0
home_ownership_OWN                     0
home_ownership_RENT                    0
verification_status_Not Verified       0
verification_sta

In [26]:
# Inspect the encoding results on the testing set
X_test_encoded.isna().sum()

term_ 36 months                        0
term_ 60 months                        0
grade_A                                0
grade_B                                0
grade_C                                0
grade_D                                0
grade_E                                0
grade_F                                0
grade_G                                0
emp_length_1 year                      0
emp_length_10+ years                   0
emp_length_2 years                     0
emp_length_3 years                     0
emp_length_4 years                     0
emp_length_5 years                     0
emp_length_6 years                     0
emp_length_7 years                     0
emp_length_8 years                     0
emp_length_9 years                     0
emp_length_< 1 year                    0
home_ownership_MORTGAGE                0
home_ownership_OWN                     0
home_ownership_RENT                    0
verification_status_Not Verified       0
verification_sta

In [27]:
# Initialize SelectKBest with chi2 as the score function to extract top 4 best categorical features
selector_cat = SelectKBest(score_func=chi2, k=4)

# Fit the selector to the training data for categorical columns and transform the training data to get the top 4 features
X_train_cat_new = selector_cat.fit_transform(X_train_encoded, y_train)

# Transform the test data using the same selector
X_test_cat_new = selector_cat.transform(X_test_encoded)

# Get the indices of the selected features for categorical columns
selected_feature_indices_cat = selector_cat.get_support(indices=True)

# Get the names of the selected features for categorical columns
selected_features_cat = X_train_encoded.columns[selected_feature_indices_cat].tolist()

# Display the selected categorical features
print("Selected categorical features:")
for idx, feature in enumerate(selected_features_cat, start=1):
    print(f"{idx}. {feature}")

Selected categorical features:
1. grade_A
2. purpose_vacation
3. title_Vacation
4. application_type_Joint App


## Concatenation for the final traning and testing datasets

### Train Data

In [28]:
# Check the distribution of the numerical features
X_train_num[selected_features_num].shape

(16667, 50)

In [29]:
# Check the distribution of the categorical features
X_train_encoded[selected_features_cat].shape

(16667, 4)

In [30]:
# Check if the indices of X_train_num and X_train_cat are identical
indices_equal = (X_train_num.index == X_train_encoded.index).all()

print("Are the indices identical?", indices_equal)

Are the indices identical? True


In [31]:
# Concatenate the DataFrames
X_train_final = pd.concat([X_train_num[selected_features_num], X_train_encoded[selected_features_cat], y_train], axis=1)

# Display the final train dataset
X_train_final.head()

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,int_rate,annual_inc,dti,fico_range_low,fico_range_high,inq_last_6mths,out_prncp,...,num_tl_op_past_12m,percent_bc_gt_75,pub_rec_bankruptcies,tot_hi_cred_lim,total_bc_limit,grade_A,purpose_vacation,title_Vacation,application_type_Joint App,loan_status
0,15000.0,15000.0,15000.0,9.17,48000.0,12.2,680.0,684.0,0.0,0.0,...,0.0,25.0,1.0,22300.0,6200.0,0,0,0,0,Paid
1,24000.0,24000.0,24000.0,18.99,75000.0,34.77,665.0,669.0,0.0,0.0,...,1.0,75.0,0.0,195224.0,24600.0,0,0,0,0,Paid
2,15000.0,15000.0,15000.0,9.8,125000.0,17.62,710.0,714.0,3.0,0.0,...,2.0,62.5,0.0,409207.0,44800.0,0,0,0,0,Paid
3,11750.0,11750.0,11750.0,11.99,28000.0,29.86,670.0,674.0,0.0,0.0,...,0.0,33.3,0.0,16900.0,8700.0,0,0,0,0,Default
4,10000.0,10000.0,10000.0,10.64,38000.0,11.97,680.0,684.0,1.0,0.0,...,1.0,100.0,0.0,16200.0,4000.0,0,0,0,0,Paid


In [32]:
# Display the shape of the training data
X_train_final.shape

(16667, 55)

In [33]:
# Check for missing values in the dataframe
X_train_final.isna().sum()

loan_amnt                     0
funded_amnt                   0
funded_amnt_inv               0
int_rate                      0
annual_inc                    0
dti                           0
fico_range_low                0
fico_range_high               0
inq_last_6mths                0
out_prncp                     0
out_prncp_inv                 0
total_pymnt                   0
total_pymnt_inv               0
total_rec_prncp               0
total_rec_late_fee            0
recoveries                    0
collection_recovery_fee       0
last_pymnt_amnt               0
last_fico_range_high          0
last_fico_range_low           0
tot_cur_bal                   0
open_acc_6m                   0
open_il_12m                   0
open_il_24m                   0
mths_since_rcnt_il            0
il_util                       0
open_rv_12m                   0
open_rv_24m                   0
max_bal_bc                    0
all_util                      0
total_rev_hi_lim              0
inq_fi  

### Test Data

In [34]:
# Check the distribution of the numerical features
X_test_num[selected_features_num].shape

(4167, 50)

In [35]:
# Check the distribution of the categorical features
X_test_encoded[selected_features_cat].shape

(4167, 4)

In [36]:
# Check if the indices of X_test_num and X_test_cat are identical
indices_equal = (X_test_num.index == X_test_encoded.index).all()

print("Are the indices identical?", indices_equal)

Are the indices identical? True


In [37]:
# Concatenate the DataFrames
X_test_final = pd.concat([X_test_num[selected_features_num], X_test_encoded[selected_features_cat], y_test], axis=1)


# Display the final test dataset
X_test_final.head()

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,int_rate,annual_inc,dti,fico_range_low,fico_range_high,inq_last_6mths,out_prncp,...,num_tl_op_past_12m,percent_bc_gt_75,pub_rec_bankruptcies,tot_hi_cred_lim,total_bc_limit,grade_A,purpose_vacation,title_Vacation,application_type_Joint App,loan_status
0,28000.0,28000.0,28000.0,7.89,700000.0,3.23,705.0,709.0,0.0,0.0,...,1.0,75.0,0.0,2030198.0,80400.0,0,0,0,0,Paid
1,16800.0,16800.0,16800.0,11.99,43000.0,22.88,690.0,694.0,0.0,0.0,...,3.0,50.0,0.0,49676.0,3700.0,0,0,0,0,Paid
2,24000.0,24000.0,24000.0,8.49,150000.0,23.7,685.0,689.0,0.0,9996.38,...,0.0,100.0,0.0,1161055.0,33900.0,0,0,0,0,Paid
3,24000.0,24000.0,24000.0,5.32,125000.0,24.54,745.0,749.0,0.0,0.0,...,1.0,50.0,0.0,547059.0,13800.0,0,0,0,0,Paid
4,24000.0,24000.0,23950.0,11.22,65000.0,18.96,715.0,719.0,0.0,9957.44,...,2.0,33.3,0.0,132613.0,14500.0,0,0,0,0,Paid


In [38]:
# Display the shape of the testing data
X_test_final.shape

(4167, 55)

In [39]:
# Check for missing values in the dataframe
X_test_final.isna().sum()

loan_amnt                     0
funded_amnt                   0
funded_amnt_inv               0
int_rate                      0
annual_inc                    0
dti                           0
fico_range_low                0
fico_range_high               0
inq_last_6mths                0
out_prncp                     0
out_prncp_inv                 0
total_pymnt                   0
total_pymnt_inv               0
total_rec_prncp               0
total_rec_late_fee            0
recoveries                    0
collection_recovery_fee       0
last_pymnt_amnt               0
last_fico_range_high          0
last_fico_range_low           0
tot_cur_bal                   0
open_acc_6m                   0
open_il_12m                   0
open_il_24m                   0
mths_since_rcnt_il            0
il_util                       0
open_rv_12m                   0
open_rv_24m                   0
max_bal_bc                    0
all_util                      0
total_rev_hi_lim              0
inq_fi  

### Save the Datasets

In [40]:
# Save the final train and test datasets to Parquet format
X_train_final.to_parquet('dataset/train_dataset.parquet', index=False)
X_test_final.to_parquet('dataset/test_dataset.parquet', index=False)