### Importing the Libraries


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import os
import warnings
warnings.filterwarnings('ignore')

### Read the Dataset

In [2]:
# Build the CSV locations with os.path.join so the notebook runs on any OS;
# backslash-separated literals such as "..\\Dataset\\file.csv" only resolve on Windows.
DATA_DIR = os.path.join("..", "Dataset")
app_df = pd.read_csv(os.path.join(DATA_DIR, "application_record.csv"))
credit_df = pd.read_csv(os.path.join(DATA_DIR, "credit_record.csv"))

# Show dataset info: shape first, then column dtypes and non-null counts
print("Application Dataset:")
print(app_df.shape)
app_df.info()

print("\nCredit Record Dataset:")
print(credit_df.shape)
credit_df.info()

# Preview the first rows of each table with the notebook's rich display
print("\nApplication Dataset Preview:")
display(app_df.head())

print("\nCredit Record Dataset Preview:")
display(credit_df.head())


Application Dataset:
(438557, 18)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 438557 entries, 0 to 438556
Data columns (total 18 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   ID                   438557 non-null  int64  
 1   CODE_GENDER          438557 non-null  object 
 2   FLAG_OWN_CAR         438557 non-null  object 
 3   FLAG_OWN_REALTY      438557 non-null  object 
 4   CNT_CHILDREN         438557 non-null  int64  
 5   AMT_INCOME_TOTAL     438557 non-null  float64
 6   NAME_INCOME_TYPE     438557 non-null  object 
 7   NAME_EDUCATION_TYPE  438557 non-null  object 
 8   NAME_FAMILY_STATUS   438557 non-null  object 
 9   NAME_HOUSING_TYPE    438557 non-null  object 
 10  DAYS_BIRTH           438557 non-null  int64  
 11  DAYS_EMPLOYED        438557 non-null  int64  
 12  FLAG_MOBIL           438557 non-null  int64  
 13  FLAG_WORK_PHONE      438557 non-null  int64  
 14  FLAG_PHONE           438557 non-nu

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS
0,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0
1,5008805,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0
2,5008806,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,-21474,-1134,1,0,0,0,Security staff,2.0
3,5008808,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0
4,5008809,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0



Credit Record Dataset Preview:


Unnamed: 0,ID,MONTHS_BALANCE,STATUS
0,5001711,0,X
1,5001711,-1,0
2,5001711,-2,0
3,5001711,-3,0
4,5001712,0,C


### Drop Unwanted Features

In [3]:
# FLAG_MOBIL is not wanted as a feature; remove it only when it is still present
# so the cell can be re-run without raising.
unwanted_cols = [name for name in ['FLAG_MOBIL'] if name in app_df.columns]
if unwanted_cols:
    app_df.drop(columns=unwanted_cols, inplace=True)


### Handling Missing Values

In [4]:
# Handle missing values in OCCUPATION_TYPE column by using an explicit
# 'Unknown' category.
# The filled Series is assigned back to the frame: calling
# fillna(..., inplace=True) on app_df['OCCUPATION_TYPE'] works on a column
# selection, which is deprecated in pandas 2.x and leaves app_df unchanged
# when Copy-on-Write is enabled.
if 'OCCUPATION_TYPE' in app_df.columns:
    app_df['OCCUPATION_TYPE'] = app_df['OCCUPATION_TYPE'].fillna('Unknown')


### Data Cleaning and Merging

In [5]:
# Mark customers who defaulted (STATUS 2, 3, 4, 5): collect the unique IDs that
# have at least one credit record with one of those status codes.
defaulters = credit_df.loc[credit_df['STATUS'].isin(['2', '3', '4', '5']), 'ID'].unique()

# Create DEFAULT column: 1 = defaulter, 0 = non-defaulter.
# Series.isin tests membership for the whole column at once, instead of running
# a Python-level `x in defaulters` array scan for every applicant row.
app_df['DEFAULT'] = app_df['ID'].isin(defaulters).astype('int64')

# NOTE(review): applicants whose ID never appears in credit_df also get
# DEFAULT = 0, i.e. "no credit history" is treated as "non-defaulter".
# Confirm this is intended before modelling.

# Drop the 'ID' column after using it
app_df = app_df.drop(columns='ID')


### Feature Engineering

In [6]:
# Per-person income: total income divided by the number of family members
income_total = app_df['AMT_INCOME_TOTAL']
app_df['INCOME_PER_PERSON'] = income_total.div(app_df['CNT_FAM_MEMBERS'])

# Quartile-based income bands: four quantile bins, labelled from lowest to highest
income_band_labels = ['Low', 'Medium', 'High', 'Very High']
app_df['INCOME_BIN'] = pd.qcut(income_total, q=4, labels=income_band_labels)


### Handling Categorical Values

In [7]:
# Encode object (categorical) columns using Label Encoding.
# Each column gets its own fitted encoder, stored in `label_encoders`, so the
# integer codes can be mapped back (inverse_transform) or applied to new data.
# Refitting one shared encoder per column would keep only the last column's mapping.
cat_cols = app_df.select_dtypes(include='object').columns
le = LabelEncoder()  # keeps `le` defined even if there are no object columns
label_encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    app_df[col] = le.fit_transform(app_df[col])
    label_encoders[col] = le

# NOTE(review): INCOME_BIN is built with pd.qcut(..., labels=...) and is
# therefore categorical rather than object, so it is not selected above and
# keeps its text labels. Confirm a later step encodes it before model fitting.



In [8]:
# Preview the first five rows of the frame after encoding
app_df.head(n=5)

Unnamed: 0,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,DEFAULT,INCOME_PER_PERSON,INCOME_BIN
0,1,1,1,0,427500.0,4,1,0,4,-12005,-4542,1,0,0,17,2.0,0,213750.0,Very High
1,1,1,1,0,427500.0,4,1,0,4,-12005,-4542,1,0,0,17,2.0,0,213750.0,Very High
2,1,1,1,0,112500.0,4,4,1,1,-21474,-1134,0,0,0,16,2.0,0,56250.0,Low
3,0,0,1,0,270000.0,0,4,3,1,-19110,-3051,0,1,1,14,1.0,0,270000.0,Very High
4,0,0,1,0,270000.0,0,4,3,1,-19110,-3051,0,1,1,14,1.0,0,270000.0,Very High


### Splitting Data Into Train And Test

In [9]:
# Target is the DEFAULT flag; every other column is used as a feature
y = app_df['DEFAULT']
X = app_df.drop(columns='DEFAULT')

# Hold out 20% of the rows for testing. The seed is fixed for reproducibility
# and the split is stratified on y so both parts keep the same class ratio.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Output shapes for verification
shape_report = (
    ("Training Features Shape:", X_train),
    ("Test Features Shape:", X_test),
    ("Training Labels Shape:", y_train),
    ("Test Labels Shape:", y_test),
)
for caption, part in shape_report:
    print(caption, part.shape)

Training Features Shape: (350845, 18)
Test Features Shape: (87712, 18)
Training Labels Shape: (350845,)
Test Labels Shape: (87712,)


In [11]:
# Class balance of the target, most frequent class first.
# The captured output shows a heavy imbalance (437941 vs 616), which any
# downstream model and metric choice needs to account for.
y.value_counts(sort=True, ascending=False)

DEFAULT
0    437941
1       616
Name: count, dtype: int64