In [1]:
import pandas as pd;
import numpy as np;
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the raw loan data; the path is resolved relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='training_set.csv')

In [3]:
# Number of missing values in each column of the raw frame.
df.isna().sum()

Loan_ID               0
Gender               15
Married               3
Dependents           15
Education             1
Self_Employed        32
ApplicantIncome       2
CoapplicantIncome     1
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
property_Area         0
Loan_Status           0
dtype: int64

In [4]:
# Option 1: discard every row that contains at least one missing value,
# then show how many rows survive.
df1 = df.dropna(axis=0, how='any')
df1.shape

(476, 13)

In [5]:
# Option 2: discard every column that contains at least one missing value,
# then show how many columns survive.
df2 = df.dropna(axis='columns', how='any')
df2.shape

(614, 3)

In [6]:
# Option 3: drop only the rows in which more than 50% of the attributes are missing.
# `threshold` is the largest number of missing values a row may have (at most half
# of the columns), so a row is kept when it has at least `n_columns - threshold`
# non-null values, which is what `thresh` expects.
# (`1 + threshold` matches this rule only for an odd column count; with an even
# count it would also drop rows that are exactly 50% missing.)
threshold = df.shape[1] // 2
df_new = df.dropna(thresh=df.shape[1] - threshold)

In [7]:
# Shape of the threshold-filtered frame; compare with the raw frame to see how many rows were dropped.
df_new.shape

(614, 13)

In [8]:
# Columns grouped by the imputation strategy applied to them.
categorical_cols = ['Gender', 'Married', 'Education', 'Self_Employed', 'Dependents']
numerical_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History']

# One replacement value per column: the most frequent value for categorical
# columns, the median for numerical columns.
fill_values = {name: df[name].mode()[0] for name in categorical_cols}
fill_values.update({name: df[name].median() for name in numerical_cols})

# Fill every gap in a single pass; columns not listed in `fill_values` are left untouched.
df = df.fillna(value=fill_values)

In [9]:
# Re-count missing values per column to confirm the imputation left no gaps.
df.isna().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
property_Area        0
Loan_Status          0
dtype: int64

In [10]:
categorical_cols = ['Gender', 'Married', 'Education', 'Self_Employed', 'Dependents', 'property_Area', 'Loan_Status']

# Gender is label-encoded; LabelEncoder assigns integer codes in sorted label order.
encoder = LabelEncoder()
df['Gender'] = encoder.fit_transform(df['Gender'])

# Explicit 0/1 encodings for the remaining two-valued columns.
binary_maps = {
    'Married': {'Yes': 1, 'No': 0},
    'Education': {'Graduate': 1, 'Not Graduate': 0},
    'Self_Employed': {'Yes': 1, 'No': 0},
    'Loan_Status': {'Y': 1, 'N': 0},
}

# Encode into temporaries first so `df` is left untouched if validation fails.
encoded = {name: df[name].map(mapping) for name, mapping in binary_maps.items()}

# Series.map turns every value that is not a key of the mapping into NaN. Without this
# check an unexpected label (or re-running this cell on already-encoded columns) would
# silently wipe the column instead of failing.
unmapped = {
    name: sorted(df.loc[codes.isna(), name].astype(str).unique())
    for name, codes in encoded.items()
    if codes.isna().any()
}
if unmapped:
    raise ValueError(
        f"Values without a binary encoding (missing, unexpected, or already encoded): {unmapped}"
    )

df = df.assign(**encoded)

In [11]:
# The open-ended '3+' bucket is collapsed to 3 so the whole column can be cast to integers.
df['Dependents'] = (
    df['Dependents']
    .replace({'3+': 3})
    .astype(int)
)

In [12]:
# One-hot encode property_Area; drop_first removes the first category's indicator,
# which is implied when all the remaining indicators are 0.
df = pd.get_dummies(
    data=df,
    columns=['property_Area'],
    drop_first=True,
)

In [13]:
# Cast the one-hot indicator columns to plain 0/1 integers.
# A direct cast replaces LabelEncoder here: LabelEncoder is meant for target labels and
# numbers the distinct values it sees, so a column containing only True would come out
# as all 0 instead of all 1.
df = df.astype({'property_Area_Urban': int, 'property_Area_Semiurban': int})

In [14]:
# Display the frame to inspect the result of the encoding steps.
df

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status,property_Area_Semiurban,property_Area_Urban
0,LP001002,1,0,0,1,0,5849.0,0.0,128.0,360.0,1.0,1,0,1
1,LP001003,1,1,1,1,0,3806.0,1508.0,128.0,360.0,1.0,0,0,0
2,LP001005,1,1,0,1,1,3000.0,0.0,66.0,360.0,1.0,1,0,1
3,LP001006,1,1,0,0,0,2583.0,2358.0,120.0,360.0,1.0,1,0,1
4,LP001008,1,0,0,1,0,6000.0,0.0,141.0,360.0,1.0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,0,0,0,1,0,2900.0,0.0,71.0,360.0,1.0,1,0,0
610,LP002979,1,1,3,1,0,4106.0,0.0,40.0,180.0,1.0,1,0,0
611,LP002983,1,1,1,1,0,8072.0,240.0,253.0,360.0,1.0,1,0,1
612,LP002984,1,1,2,1,0,7583.0,0.0,187.0,360.0,1.0,1,0,1


In [15]:
def detect_outliers(df, column):
    """Return the rows of ``df`` whose ``column`` value is an outlier by the IQR rule.

    A value is treated as an outlier when it lies more than 1.5 * IQR below the
    first quartile or more than 1.5 * IQR above the third quartile of the column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.
    column : str
        Name of the numeric column to test.

    Returns
    -------
    pandas.DataFrame
        The subset of rows flagged as outliers (empty when there are none).

    Example
    -------
    outliers = detect_outliers(df, 'LoanAmount')
    print(outliers)
    """
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    upperbound = Q3 + 1.5 * IQR
    lowerbound = Q1 - 1.5 * IQR
    outliers = df[(df[column] < lowerbound) | (df[column] > upperbound)]
    return outliers

In [16]:
# Standardise the continuous features with StandardScaler. Credit_History is not
# in this list, so it is left unscaled.
# NOTE(review): the scaler is fitted on every row of df; if a train/test split is
# made later, fit on the training rows only to avoid leaking test statistics.
numerical_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']
scaler = StandardScaler().fit(df[numerical_cols])
df[numerical_cols] = scaler.transform(df[numerical_cols])

In [17]:
# Display the frame again to inspect the scaled numeric columns.
df

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status,property_Area_Semiurban,property_Area_Urban
0,LP001002,1,0,0,1,0,0.073497,-0.554104,-0.211241,0.273231,1.0,1,0,1
1,LP001003,1,1,1,1,0,-0.261168,-0.038356,-0.211241,0.273231,1.0,0,0,0
2,LP001005,1,1,0,1,1,-0.393199,-0.554104,-0.948996,0.273231,1.0,1,0,1
3,LP001006,1,1,0,0,0,-0.461507,0.252351,-0.306435,0.273231,1.0,1,0,1
4,LP001008,1,0,0,1,0,0.098232,-0.554104,-0.056551,0.273231,1.0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,0,0,0,1,0,-0.409580,-0.554104,-0.889500,0.273231,1.0,1,0,0
610,LP002979,1,1,3,1,0,-0.212024,-0.554104,-1.258378,-2.522836,1.0,1,0,0
611,LP002983,1,1,1,1,0,0.437647,-0.472022,1.276168,0.273231,1.0,1,0,1
612,LP002984,1,1,2,1,0,0.357543,-0.554104,0.490816,0.273231,1.0,1,0,1
