# IMPORTING DEPENDENCIES

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.impute import KNNImputer
import plotly.express as px
%matplotlib inline

In [2]:
# Read the raw train and test CSV files into DataFrames
# (paths are relative to the notebook's working directory).
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('data/raw/train_ctrUa4K.csv', 'data/raw/test_lAUu6dG.csv')
)

In [3]:
# Preview the raw training data; the rendered table is abbreviated to the
# leading and trailing rows.
train_df

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


## Filling Null Values

In [4]:
# Report how many null entries each column of the training set contains
train_null_counts = train_df.isna().sum()
print("\nMissing values:", train_null_counts, sep="\n")


Missing values:
Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64


In [5]:
# Report how many null entries each column of the test set contains
test_null_counts = test_df.isna().sum()
print("\nMissing values:", test_null_counts, sep="\n")


Missing values:
Loan_ID               0
Gender               11
Married               0
Dependents           10
Education             0
Self_Employed        23
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            5
Loan_Amount_Term      6
Credit_History       29
Property_Area         0
dtype: int64


### 1. Gender

In [6]:
# Impute missing Gender entries in the training set with the most frequent category
gender_train = train_df['Gender']
train_df['Gender'] = gender_train.fillna(gender_train.mode()[0])

In [7]:
# Impute missing Gender entries in the test set.
# The fill value is the most frequent category of the *training* set, so both
# splits receive identical preprocessing and no statistic is learned from the
# test data.
train_gender_mode = train_df['Gender'].mode()[0]
test_df['Gender'] = test_df['Gender'].fillna(train_gender_mode)

### 2. Married

In [8]:
# Fill the missing Married values of the training set with the column's most
# common value (the test-set summary printed earlier reports no missing
# Married values, so only the training set is imputed here).
train_df['Married'] = train_df['Married'].fillna(train_df['Married'].mode()[0])

### 3. Dependents

In [9]:
# Print the dtype of the Dependents column for each category being probed.
# NOTE(review): the column is reported as object dtype, so the integer probes
# (0, 1, 2) presumably match no rows if the values are stored as strings; the
# dtype of the (possibly empty) selection is printed either way - confirm.
for dependents_value in (0, 1, 2, '3+'):
    matching_rows = train_df[train_df['Dependents'] == dependents_value]
    print(matching_rows['Dependents'].dtype)

object
object
object
object


In [10]:
# Impute missing Dependents entries in the training set with the most frequent value
dependents_train = train_df['Dependents']
train_df['Dependents'] = dependents_train.fillna(dependents_train.mode()[0])

In [11]:
# Impute missing Dependents entries in the test set.
# The fill value is the most frequent value of the *training* set, so both
# splits receive identical preprocessing and no statistic is learned from the
# test data.
train_dependents_mode = train_df['Dependents'].mode()[0]
test_df['Dependents'] = test_df['Dependents'].fillna(train_dependents_mode)

In [12]:
# Collapse the open-ended '3+' bucket to '3', then store Dependents as
# integers (training set). After this step "3" stands for "3 or more".
train_df['Dependents'] = train_df['Dependents'].replace('3+', '3').astype(int)

In [13]:
# Collapse the open-ended '3+' bucket to '3', then store Dependents as
# integers (test set). After this step "3" stands for "3 or more".
test_df['Dependents'] = test_df['Dependents'].replace('3+', '3').astype(int)

### 4. Self_Employed

In [14]:
# Frequency of each Self_Employed category in the training set
# (value_counts leaves null entries out by default)
self_employed_counts = train_df['Self_Employed'].value_counts()
self_employed_counts

Self_Employed
No     500
Yes     82
Name: count, dtype: int64

In [15]:
# Impute missing Self_Employed entries in the training set with the most frequent category
self_employed_train = train_df['Self_Employed']
train_df['Self_Employed'] = self_employed_train.fillna(self_employed_train.mode()[0])

In [16]:
# Impute missing Self_Employed entries in the test set.
# The fill value is the most frequent category of the *training* set, so both
# splits receive identical preprocessing and no statistic is learned from the
# test data.
train_self_employed_mode = train_df['Self_Employed'].mode()[0]
test_df['Self_Employed'] = test_df['Self_Employed'].fillna(train_self_employed_mode)

### 5. LoanAmount

In [17]:
# Show the training rows whose LoanAmount is missing
loan_amount_missing = train_df['LoanAmount'].isna()
train_df.loc[loan_amount_missing]

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
35,LP001106,Male,Yes,0,Graduate,No,2275,2067.0,,360.0,1.0,Urban,Y
63,LP001213,Male,Yes,1,Graduate,No,4945,0.0,,360.0,0.0,Rural,N
81,LP001266,Male,Yes,1,Graduate,Yes,2395,0.0,,360.0,1.0,Semiurban,Y
95,LP001326,Male,No,0,Graduate,No,6782,0.0,,360.0,,Urban,N
102,LP001350,Male,Yes,0,Graduate,No,13650,0.0,,360.0,1.0,Urban,Y
103,LP001356,Male,Yes,0,Graduate,No,4652,3583.0,,360.0,1.0,Semiurban,Y
113,LP001392,Female,No,1,Graduate,Yes,7451,0.0,,360.0,1.0,Semiurban,Y
127,LP001449,Male,No,0,Graduate,No,3865,1640.0,,360.0,1.0,Rural,Y
202,LP001682,Male,Yes,3,Not Graduate,No,3992,0.0,,180.0,1.0,Urban,N


In [18]:
# Impute the missing LoanAmount values of the training set with KNNImputer.
# NOTE(review): the imputer only sees this single column, so a row with a
# missing LoanAmount has no defined distance to any other row; per the
# scikit-learn documentation KNNImputer then uses the training-set average of
# the feature. This is therefore effectively mean imputation and n_neighbors
# has no influence - confirm that this is intended.
loan_amount_imputer = KNNImputer(n_neighbors=5)
loan_amount_filled = loan_amount_imputer.fit_transform(train_df[['LoanAmount']])
# The imputer returns an (n_samples, 1) array; flatten it before writing it
# back into the single column.
train_df['LoanAmount'] = loan_amount_filled.ravel()

In [19]:
# Impute the missing LoanAmount values of the test set with KNNImputer.
# The imputer is fitted on the training column and only applied to the test
# column, so the fill values come from training data alone.
# NOTE(review): with a single feature, rows missing LoanAmount have no defined
# distance to any training row; per the scikit-learn documentation KNNImputer
# then uses the training-set average of the feature, so n_neighbors has no
# influence - confirm that mean imputation is intended.
loan_amount_imputer = KNNImputer(n_neighbors=5)
loan_amount_imputer.fit(train_df[['LoanAmount']])
loan_amount_filled = loan_amount_imputer.transform(test_df[['LoanAmount']])
# The imputer returns an (n_samples, 1) array; flatten it before writing it
# back into the single column.
test_df['LoanAmount'] = loan_amount_filled.ravel()

### 6. Loan_Amount_Term

In [20]:
# Frequency of each Loan_Amount_Term value in the training set
# (value_counts leaves null entries out by default)
loan_term_counts = train_df['Loan_Amount_Term'].value_counts()
loan_term_counts

Loan_Amount_Term
360.0    512
180.0     44
480.0     15
300.0     13
240.0      4
84.0       4
120.0      3
60.0       2
36.0       2
12.0       1
Name: count, dtype: int64

In [21]:
# Impute the missing Loan_Amount_Term values of the training set with KNNImputer.
# NOTE(review): the imputer only sees this single column, so a row with a
# missing term has no defined distance to any other row; per the scikit-learn
# documentation KNNImputer then uses the training-set average of the feature.
# This is effectively mean imputation (n_neighbors has no influence), and the
# average is unlikely to coincide with one of the discrete term values listed
# by value_counts - confirm that this is intended.
loan_term_imputer = KNNImputer(n_neighbors=7)
loan_term_filled = loan_term_imputer.fit_transform(train_df[['Loan_Amount_Term']])
# The imputer returns an (n_samples, 1) array; flatten it before writing it
# back into the single column.
train_df['Loan_Amount_Term'] = loan_term_filled.ravel()

In [22]:
# Impute the missing Loan_Amount_Term values of the test set with KNNImputer.
# The imputer is fitted on the training column and only applied to the test
# column, so the fill values come from training data alone.
# NOTE(review): with a single feature, rows missing the term have no defined
# distance to any training row; per the scikit-learn documentation KNNImputer
# then uses the training-set average of the feature, so n_neighbors has no
# influence - confirm that mean imputation is intended.
loan_term_imputer = KNNImputer(n_neighbors=7)
loan_term_imputer.fit(train_df[['Loan_Amount_Term']])
loan_term_filled = loan_term_imputer.transform(test_df[['Loan_Amount_Term']])
# The imputer returns an (n_samples, 1) array; flatten it before writing it
# back into the single column.
test_df['Loan_Amount_Term'] = loan_term_filled.ravel()

### 7. Credit_History

In [23]:
# Frequency of each Credit_History value in the training set
# (value_counts leaves null entries out by default)
credit_history_counts = train_df['Credit_History'].value_counts()
credit_history_counts

Credit_History
1.0    475
0.0     89
Name: count, dtype: int64

In [24]:
# Impute missing Credit_History entries in the training set with the most frequent value
credit_history_train = train_df['Credit_History']
train_df['Credit_History'] = credit_history_train.fillna(credit_history_train.mode()[0])

In [25]:
# Impute missing Credit_History entries in the test set.
# The fill value is the most frequent value of the *training* set, so both
# splits receive identical preprocessing and no statistic is learned from the
# test data.
train_credit_history_mode = train_df['Credit_History'].mode()[0]
test_df['Credit_History'] = test_df['Credit_History'].fillna(train_credit_history_mode)

In [27]:
# Write the imputed train and test frames to the preprocessed data folder
# (index=False keeps the row index out of the CSV files).
for frame, csv_path in (
    (train_df, 'data/preprocessed/train_df_without_null.csv'),
    (test_df, 'data/preprocessed/test_df_without_null.csv'),
):
    frame.to_csv(csv_path, index=False)