# Część pierwsza

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import (StandardScaler, OneHotEncoder)
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import seaborn as sns

In [4]:
df = pd.read_csv('Loan_Default.csv')

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148670 entries, 0 to 148669
Data columns (total 34 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   ID                         148670 non-null  int64  
 1   year                       148670 non-null  int64  
 2   loan_limit                 145326 non-null  object 
 3   Gender                     148670 non-null  object 
 4   approv_in_adv              147762 non-null  object 
 5   loan_type                  148670 non-null  object 
 6   loan_purpose               148536 non-null  object 
 7   Credit_Worthiness          148670 non-null  object 
 8   open_credit                148670 non-null  object 
 9   business_or_commercial     148670 non-null  object 
 10  loan_amount                148670 non-null  int64  
 11  rate_of_interest           112231 non-null  float64
 12  Interest_rate_spread       112031 non-null  float64
 13  Upfront_charges            10

In [6]:
df.head()

Unnamed: 0,ID,year,loan_limit,Gender,approv_in_adv,loan_type,loan_purpose,Credit_Worthiness,open_credit,business_or_commercial,...,credit_type,Credit_Score,co-applicant_credit_type,age,submission_of_application,LTV,Region,Security_Type,Status,dtir1
0,24890,2019,cf,Sex Not Available,nopre,type1,p1,l1,nopc,nob/c,...,EXP,758,CIB,25-34,to_inst,98.728814,south,direct,1,45.0
1,24891,2019,cf,Male,nopre,type2,p1,l1,nopc,b/c,...,EQUI,552,EXP,55-64,to_inst,,North,direct,1,
2,24892,2019,cf,Male,pre,type1,p1,l1,nopc,nob/c,...,EXP,834,CIB,35-44,to_inst,80.019685,south,direct,0,46.0
3,24893,2019,cf,Male,nopre,type1,p4,l1,nopc,nob/c,...,EXP,587,CIB,45-54,not_inst,69.3769,North,direct,0,42.0
4,24894,2019,cf,Joint,pre,type1,p1,l1,nopc,nob/c,...,CRIF,602,EXP,25-34,not_inst,91.886544,North,direct,0,39.0


In [7]:
df.isnull().sum()

ID                               0
year                             0
loan_limit                    3344
Gender                           0
approv_in_adv                  908
loan_type                        0
loan_purpose                   134
Credit_Worthiness                0
open_credit                      0
business_or_commercial           0
loan_amount                      0
rate_of_interest             36439
Interest_rate_spread         36639
Upfront_charges              39642
term                            41
Neg_ammortization              121
interest_only                    0
lump_sum_payment                 0
property_value               15098
construction_type                0
occupancy_type                   0
Secured_by                       0
total_units                      0
income                        9150
credit_type                      0
Credit_Score                     0
co-applicant_credit_type         0
age                            200
submission_of_applic

In [8]:
df.describe(include='all')

Unnamed: 0,ID,year,loan_limit,Gender,approv_in_adv,loan_type,loan_purpose,Credit_Worthiness,open_credit,business_or_commercial,...,credit_type,Credit_Score,co-applicant_credit_type,age,submission_of_application,LTV,Region,Security_Type,Status,dtir1
count,148670.0,148670.0,145326,148670,147762,148670,148536,148670,148670,148670,...,148670,148670.0,148670,148470,148470,133572.0,148670,148670,148670.0,124549.0
unique,,,2,4,2,3,4,2,2,2,...,4,,2,7,2,,4,2,,
top,,,cf,Male,nopre,type1,p3,l1,nopc,nob/c,...,CIB,,CIB,45-54,to_inst,,North,direct,,
freq,,,135348,42346,124621,113173,55934,142344,148114,127908,...,48152,,74392,34720,95814,,74722,148637,,
mean,99224.5,2019.0,,,,,,,,,...,,699.789103,,,,72.746457,,,0.246445,37.732932
std,42917.476598,0.0,,,,,,,,,...,,115.875857,,,,39.967603,,,0.430942,10.545435
min,24890.0,2019.0,,,,,,,,,...,,500.0,,,,0.967478,,,0.0,5.0
25%,62057.25,2019.0,,,,,,,,,...,,599.0,,,,60.47486,,,0.0,31.0
50%,99224.5,2019.0,,,,,,,,,...,,699.0,,,,75.13587,,,0.0,39.0
75%,136391.75,2019.0,,,,,,,,,...,,800.0,,,,86.184211,,,0.0,45.0


# Train test split

In [9]:
y = df['Status']
X = df.drop(['Status','ID','year','rate_of_interest','Interest_rate_spread','Upfront_charges','property_value','income'], axis=1)


In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
X_train.head()

Unnamed: 0,loan_limit,Gender,approv_in_adv,loan_type,loan_purpose,Credit_Worthiness,open_credit,business_or_commercial,loan_amount,term,...,total_units,credit_type,Credit_Score,co-applicant_credit_type,age,submission_of_application,LTV,Region,Security_Type,dtir1
141245,cf,Sex Not Available,nopre,type3,p3,l1,nopc,nob/c,76500,360.0,...,1U,CIB,605,EXP,>74,to_inst,70.833333,south,direct,12.0
3507,,Male,nopre,type1,p4,l1,nopc,nob/c,556500,360.0,...,1U,EXP,729,CIB,45-54,not_inst,59.967672,south,direct,43.0
53688,cf,Female,pre,type2,p3,l1,nopc,b/c,126500,180.0,...,1U,CIB,609,CIB,65-74,to_inst,85.472973,North,direct,42.0
46491,cf,Male,nopre,type1,p3,l1,nopc,nob/c,246500,360.0,...,1U,EXP,600,CIB,45-54,not_inst,85.590278,central,direct,41.0
54671,cf,Female,nopre,type1,p4,l1,nopc,nob/c,486500,360.0,...,1U,CRIF,701,EXP,55-64,not_inst,58.756039,south,direct,35.0


In [12]:
X_test.head()

Unnamed: 0,loan_limit,Gender,approv_in_adv,loan_type,loan_purpose,Credit_Worthiness,open_credit,business_or_commercial,loan_amount,term,...,total_units,credit_type,Credit_Score,co-applicant_credit_type,age,submission_of_application,LTV,Region,Security_Type,dtir1
24912,cf,Female,nopre,type2,p3,l1,nopc,b/c,116500,360.0,...,1U,EXP,679,CIB,45-54,to_inst,84.42029,North,direct,41.0
147068,cf,Sex Not Available,pre,type1,p4,l1,nopc,nob/c,236500,360.0,...,1U,EXP,853,EXP,25-34,to_inst,88.246269,south,direct,44.0
123284,cf,Sex Not Available,nopre,type1,p3,l1,nopc,nob/c,136500,348.0,...,1U,CRIF,598,EXP,65-74,to_inst,19.279661,south,direct,41.0
53610,cf,Female,nopre,type1,p3,l1,nopc,nob/c,256500,180.0,...,1U,EXP,776,EXP,65-74,to_inst,36.228814,North,direct,32.0
39672,cf,Female,nopre,type1,p1,l1,nopc,nob/c,686500,360.0,...,1U,EXP,653,CIB,35-44,not_inst,81.921241,North,direct,48.0


In [13]:
y_train.head()

141245    0
3507      0
53688     0
46491     1
54671     0
Name: Status, dtype: int64

# Steps of Data Preprocessing Pipeline

In [14]:
categorical_columns = [col for col in X_train.columns if X_train[col].dtype == 'object']
numerical_columns = [col for col in X_train.columns if X_train[col].dtype in ['int64', 'float64']]
print(categorical_columns)
print(numerical_columns)

['loan_limit', 'Gender', 'approv_in_adv', 'loan_type', 'loan_purpose', 'Credit_Worthiness', 'open_credit', 'business_or_commercial', 'Neg_ammortization', 'interest_only', 'lump_sum_payment', 'construction_type', 'occupancy_type', 'Secured_by', 'total_units', 'credit_type', 'co-applicant_credit_type', 'age', 'submission_of_application', 'Region', 'Security_Type']
['loan_amount', 'term', 'Credit_Score', 'LTV', 'dtir1']


In [15]:
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Create pipeline
pipeline = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_columns),
        ('cat', categorical_transformer, categorical_columns)
    ])

In [16]:
X_train_transformed = pipeline.fit_transform(X_train)
X_test_transformed = pipeline.transform(X_test)

# Convert to DataFrame
X_train_transformed = pd.DataFrame(X_train_transformed)
X_test_transformed = pd.DataFrame(X_test_transformed)

# Add column names
X_train_transformed.columns = numerical_columns + list(pipeline.named_transformers_['cat']['onehot'].get_feature_names_out())
X_test_transformed.columns = numerical_columns + list(pipeline.named_transformers_['cat']['onehot'].get_feature_names_out())

In [17]:
pd.set_option('display.max_columns', None)
print(f"Sample of transformed data:\n {X_train_transformed.head()}")

Sample of transformed data:
    loan_amount      term  Credit_Score       LTV     dtir1  x0_cf  x0_ncf  \
0    -1.382543  0.425386     -0.818669 -0.053303 -2.684156    1.0     0.0   
1     1.220632  0.425386      0.250870 -0.315776  0.524081    1.0     0.0   
2    -1.111379 -2.658811     -0.784168  0.300334  0.420590    1.0     0.0   
3    -0.460585  0.425386     -0.861796  0.303168  0.317098    1.0     0.0   
4     0.841002  0.425386      0.009361 -0.345044 -0.303851    1.0     0.0   

   x1_Female  x1_Joint  x1_Male  x1_Sex Not Available  x2_nopre  x2_pre  \
0        0.0       0.0      0.0                   1.0       1.0     0.0   
1        0.0       0.0      1.0                   0.0       1.0     0.0   
2        1.0       0.0      0.0                   0.0       0.0     1.0   
3        0.0       0.0      1.0                   0.0       1.0     0.0   
4        1.0       0.0      0.0                   0.0       1.0     0.0   

   x3_type1  x3_type2  x3_type3  x4_p1  x4_p2  x4_p3  x4_

In [18]:
X_train_transformed.describe(include='all')

Unnamed: 0,loan_amount,term,Credit_Score,LTV,dtir1,x0_cf,x0_ncf,x1_Female,x1_Joint,x1_Male,x1_Sex Not Available,x2_nopre,x2_pre,x3_type1,x3_type2,x3_type3,x4_p1,x4_p2,x4_p3,x4_p4,x5_l1,x5_l2,x6_nopc,x6_opc,x7_b/c,x7_nob/c,x8_neg_amm,x8_not_neg,x9_int_only,x9_not_int,x10_lpsm,x10_not_lpsm,x11_mh,x11_sb,x12_ir,x12_pr,x12_sr,x13_home,x13_land,x14_1U,x14_2U,x14_3U,x14_4U,x15_CIB,x15_CRIF,x15_EQUI,x15_EXP,x16_CIB,x16_EXP,x17_25-34,x17_35-44,x17_45-54,x17_55-64,x17_65-74,x17_<25,x17_>74,x18_not_inst,x18_to_inst,x19_North,x19_North-East,x19_central,x19_south,x20_Indriect,x20_direct
count,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0,118936.0
mean,5.705323e-18,-4.571428e-16,3.718317e-16,1.65305e-16,-2.56172e-16,0.932762,0.067238,0.184351,0.278259,0.284001,0.253388,0.84432,0.15568,0.76246,0.138604,0.098936,0.233251,0.022298,0.375118,0.369333,0.957086,0.042914,0.996267,0.003733,0.138604,0.861396,0.101777,0.898223,0.047673,0.952327,0.023063,0.976937,0.00021,0.99979,0.049674,0.929029,0.021297,0.99979,0.00021,0.985387,0.009904,0.002632,0.002077,0.323989,0.294764,0.10324,0.278007,0.499882,0.500118,0.129364,0.220387,0.234353,0.218664,0.139689,0.008954,0.048589,0.353736,0.646264,0.50211,0.008399,0.058452,0.431039,0.00021,0.99979
std,1.000004,1.000004,1.000004,1.000004,1.000004,0.250435,0.250435,0.387772,0.448143,0.450939,0.434953,0.362554,0.362554,0.425577,0.345534,0.298577,0.422903,0.147651,0.484155,0.482626,0.202664,0.202664,0.060985,0.060985,0.345534,0.345534,0.302357,0.302357,0.213074,0.213074,0.150104,0.150104,0.014497,0.014497,0.217271,0.256777,0.144374,0.014497,0.014497,0.119998,0.099028,0.051232,0.045524,0.467998,0.455938,0.304274,0.448019,0.500002,0.500002,0.335603,0.41451,0.423595,0.413342,0.346665,0.094203,0.215008,0.478131,0.478131,0.499998,0.091263,0.234596,0.495224,0.014497,0.014497
min,-1.70794,-4.098103,-1.724327,-1.740993,-3.408597,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-0.7317496,0.425386,-0.870421,-0.2360485,-0.5108341,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,-0.1894214,0.425386,-0.00788952,0.05062967,0.1101151,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
75%,0.5698381,0.425386,0.8632672,0.2831916,0.6275727,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0
max,17.59894,0.425386,1.725799,187.4085,2.386929,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [19]:
for column in X_train_transformed.columns:
    print(f'{column}: {X_train_transformed[column].isnull().sum()}')

loan_amount: 0
term: 0
Credit_Score: 0
LTV: 0
dtir1: 0
x0_cf: 0
x0_ncf: 0
x1_Female: 0
x1_Joint: 0
x1_Male: 0
x1_Sex Not Available: 0
x2_nopre: 0
x2_pre: 0
x3_type1: 0
x3_type2: 0
x3_type3: 0
x4_p1: 0
x4_p2: 0
x4_p3: 0
x4_p4: 0
x5_l1: 0
x5_l2: 0
x6_nopc: 0
x6_opc: 0
x7_b/c: 0
x7_nob/c: 0
x8_neg_amm: 0
x8_not_neg: 0
x9_int_only: 0
x9_not_int: 0
x10_lpsm: 0
x10_not_lpsm: 0
x11_mh: 0
x11_sb: 0
x12_ir: 0
x12_pr: 0
x12_sr: 0
x13_home: 0
x13_land: 0
x14_1U: 0
x14_2U: 0
x14_3U: 0
x14_4U: 0
x15_CIB: 0
x15_CRIF: 0
x15_EQUI: 0
x15_EXP: 0
x16_CIB: 0
x16_EXP: 0
x17_25-34: 0
x17_35-44: 0
x17_45-54: 0
x17_55-64: 0
x17_65-74: 0
x17_<25: 0
x17_>74: 0
x18_not_inst: 0
x18_to_inst: 0
x19_North: 0
x19_North-East: 0
x19_central: 0
x19_south: 0
x20_Indriect: 0
x20_direct: 0
