In [87]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

---

### **Importing Dataset**

In [88]:
# Read datasetEBB.csv (path is relative to the notebook's working directory)
# into the DataFrame that every later cell cleans and transforms.
dataset = pd.read_csv("datasetEBB.csv")

In [89]:
# Preview the first rows to check that the file parsed as expected.
dataset.head()

Unnamed: 0.1,Unnamed: 0,LoanID,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner,Default
0,13315,1L8Z5JJWAN,67.0,34282.0,245603.0,682.0,10.0,3.0,18.15,12.0,0.42,PhD,Full-time,Divorced,Yes,No,Home,Yes,0.0
1,120592,4LB8XH8ZWQ,27.0,56462.0,30961.0,337.0,32.0,3.0,21.13,60.0,0.87,High School,Self-employed,Married,Yes,Yes,Business,Yes,0.0
2,250310,SIPPK8S6R6,62.0,88670.0,93859.0,560.0,52.0,3.0,12.81,48.0,0.17,PhD,Unemployed,Divorced,Yes,Yes,Home,No,0.0
3,135948,9DZTNM8405,58.0,46301.0,246283.0,606.0,66.0,4.0,19.91,12.0,0.7,High School,Unemployed,Divorced,No,No,Education,Yes,0.0
4,203344,9Q9D0HUBEH,34.0,33342.0,108163.0,684.0,97.0,4.0,19.86,48.0,0.74,Master's,Part-time,Married,Yes,Yes,Business,No,0.0


---

## **Data Preprocessing**

In [90]:
# List each column's dtype to tell numeric columns from text (object) columns.
dataset.dtypes

Unnamed: 0          int64
LoanID             object
Age               float64
Income            float64
LoanAmount        float64
CreditScore       float64
MonthsEmployed    float64
NumCreditLines    float64
InterestRate      float64
LoanTerm          float64
DTIRatio          float64
Education          object
EmploymentType     object
MaritalStatus      object
HasMortgage        object
HasDependents      object
LoanPurpose        object
HasCoSigner        object
Default           float64
dtype: object

---

### **Drop Unnecessary Columns**

In [None]:
# Drop the two non-feature columns: 'Unnamed: 0' (presumably a row index that
# was saved into the CSV -- confirm against the data source) and the per-row
# identifier 'LoanID'.
# errors='ignore' keeps this cell re-runnable: once the columns are gone a
# second execution is a no-op instead of raising KeyError.
dataset.drop(columns=['Unnamed: 0', 'LoanID'], inplace=True, errors='ignore')

---

### **String Manipulation**

In [92]:
# Inspect the distinct values of every categorical column so that placeholder
# or malformed entries can be spotted before cleaning.
categorical_cols = dataset[['Education', 'EmploymentType', 'MaritalStatus', 'HasMortgage','HasDependents', 'LoanPurpose', 'HasCoSigner']]

rule = '-' * 50
for column_name in categorical_cols.columns:
    distinct_values = dataset[column_name].unique()
    print(distinct_values)
    print(rule)

['PhD' 'High School' "Master's" nan "Bachelor's"]
--------------------------------------------------
['Full-time' 'Self-employed' 'Unemployed' 'Part-time' nan]
--------------------------------------------------
['Divorced' 'Married' 'Single' nan '?']
--------------------------------------------------
['Yes' 'No' nan]
--------------------------------------------------
['No' 'Yes' nan]
--------------------------------------------------
['Home' 'Business' 'Education' 'Auto' 'Other' nan]
--------------------------------------------------
['Yes' 'No' nan]
--------------------------------------------------


In [93]:
# Treat the '?' placeholder seen in MaritalStatus as a missing value so it is
# handled together with the other NaNs during imputation.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the selection: the in-place form is chained
# assignment, which pandas deprecates and which does not modify `dataset`
# when Copy-on-Write is enabled.
dataset['MaritalStatus'] = dataset['MaritalStatus'].replace('?', np.nan)

In [94]:
# Repeat the distinct-value inspection to confirm the '?' placeholder no
# longer appears in MaritalStatus.
categorical_cols = dataset[['Education', 'EmploymentType', 'MaritalStatus', 'HasMortgage','HasDependents', 'LoanPurpose', 'HasCoSigner']]

divider = '-' * 50
for feature in categorical_cols:
    # Iterating a DataFrame yields its column labels.
    print(dataset[feature].unique(), divider, sep='\n')

['PhD' 'High School' "Master's" nan "Bachelor's"]
--------------------------------------------------
['Full-time' 'Self-employed' 'Unemployed' 'Part-time' nan]
--------------------------------------------------
['Divorced' 'Married' 'Single' nan]
--------------------------------------------------
['Yes' 'No' nan]
--------------------------------------------------
['No' 'Yes' nan]
--------------------------------------------------
['Home' 'Business' 'Education' 'Auto' 'Other' nan]
--------------------------------------------------
['Yes' 'No' nan]
--------------------------------------------------


---

### **Handle Missing Values**

In [95]:
# Per-column count of missing entries, to see how much imputation is needed.
dataset.isna().sum()

Age                16
Income            569
LoanAmount        637
CreditScore       667
MonthsEmployed    686
NumCreditLines    699
InterestRate      702
LoanTerm          710
DTIRatio          710
Education         710
EmploymentType    710
MaritalStatus     187
HasMortgage       178
HasDependents     165
LoanPurpose       136
HasCoSigner       106
Default           300
dtype: int64

In [96]:
# Numeric columns: fill every missing value with that column's median.
# The filled frame is assigned back rather than calling
# dataset[col].fillna(..., inplace=True), because the in-place call on a
# column selection is chained assignment: deprecated in pandas and a no-op on
# `dataset` under Copy-on-Write.
# NOTE(review): 'Default' looks like the prediction target; median-imputing it
# invents labels for rows where it was missing. Consider dropping those rows
# instead -- confirm with the modelling plan before changing.
num_cols = ['Age', 'Income', 'LoanAmount', 'CreditScore', 'MonthsEmployed', 'NumCreditLines', 'InterestRate', 'LoanTerm','DTIRatio', 'Default']
# median() returns a Series indexed by column name; fillna matches it per column.
dataset[num_cols] = dataset[num_cols].fillna(dataset[num_cols].median())

In [97]:
# Show the current column names.
dataset.columns

Index(['Age', 'Income', 'LoanAmount', 'CreditScore', 'MonthsEmployed',
       'NumCreditLines', 'InterestRate', 'LoanTerm', 'DTIRatio', 'Education',
       'EmploymentType', 'MaritalStatus', 'HasMortgage', 'HasDependents',
       'LoanPurpose', 'HasCoSigner', 'Default'],
      dtype='object')

In [98]:
# Categorical columns: fill every missing value with that column's most
# frequent value (the first entry of mode()).
# The filled Series is assigned back to the column rather than using
# fillna(..., inplace=True) on the selection, which is chained assignment:
# deprecated in pandas and a no-op on `dataset` under Copy-on-Write.
cat_cols = ['Education', 'EmploymentType', 'MaritalStatus','HasMortgage', 'HasDependents', 'LoanPurpose', 'HasCoSigner']

for col in cat_cols:
    dataset[col] = dataset[col].fillna(dataset[col].mode()[0])

In [99]:
# Re-count missing entries per column; every count should now be zero.
dataset.isna().sum()

Age               0
Income            0
LoanAmount        0
CreditScore       0
MonthsEmployed    0
NumCreditLines    0
InterestRate      0
LoanTerm          0
DTIRatio          0
Education         0
EmploymentType    0
MaritalStatus     0
HasMortgage       0
HasDependents     0
LoanPurpose       0
HasCoSigner       0
Default           0
dtype: int64

---

### **Treating Duplicates**

In [100]:
# Count rows that exactly repeat an earlier row.
dataset.duplicated().sum()

np.int64(34)

In [101]:
# Remove the repeated rows in place. No ignore_index is passed, so the
# surviving rows keep their original index labels.
dataset.drop_duplicates(inplace=True)

In [102]:
# Re-check: the duplicate count should now be zero.
dataset.duplicated().sum()

np.int64(0)

---

### **Encode Categorical Variables**

In [103]:
# Preview the cleaned rows before the text columns are encoded.
dataset.head()

Unnamed: 0,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner,Default
0,67.0,34282.0,245603.0,682.0,10.0,3.0,18.15,12.0,0.42,PhD,Full-time,Divorced,Yes,No,Home,Yes,0.0
1,27.0,56462.0,30961.0,337.0,32.0,3.0,21.13,60.0,0.87,High School,Self-employed,Married,Yes,Yes,Business,Yes,0.0
2,62.0,88670.0,93859.0,560.0,52.0,3.0,12.81,48.0,0.17,PhD,Unemployed,Divorced,Yes,Yes,Home,No,0.0
3,58.0,46301.0,246283.0,606.0,66.0,4.0,19.91,12.0,0.7,High School,Unemployed,Divorced,No,No,Education,Yes,0.0
4,34.0,33342.0,108163.0,684.0,97.0,4.0,19.86,48.0,0.74,Master's,Part-time,Married,Yes,Yes,Business,No,0.0


In [104]:
from sklearn.preprocessing import LabelEncoder

In [105]:
# Replace each categorical column's text values with integer codes, using a
# separate LabelEncoder per column.
# Each fitted encoder is kept in `label_encoders` (keyed by column name) so the
# code -> category mapping stays available: `label_encoders[col].classes_`
# lists the categories, `.inverse_transform` decodes them, and `.transform`
# applies the same mapping to new data. Previously each encoder was discarded
# at the end of its iteration.
# NOTE(review): scikit-learn documents LabelEncoder as intended for target
# labels; it assigns codes in sorted order, so e.g. Education codes do not
# follow attainment order. Consider OrdinalEncoder with explicit categories.
label_encoders = {}
for i in categorical_cols.columns:
    label = LabelEncoder()
    dataset[i] = label.fit_transform(dataset[i])
    label_encoders[i] = label

In [106]:
# Preview the rows again to confirm the categorical columns now hold integer codes.
dataset.head()

Unnamed: 0,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner,Default
0,67.0,34282.0,245603.0,682.0,10.0,3.0,18.15,12.0,0.42,3,0,0,1,0,3,1,0.0
1,27.0,56462.0,30961.0,337.0,32.0,3.0,21.13,60.0,0.87,1,2,1,1,1,1,1,0.0
2,62.0,88670.0,93859.0,560.0,52.0,3.0,12.81,48.0,0.17,3,3,0,1,1,3,0,0.0
3,58.0,46301.0,246283.0,606.0,66.0,4.0,19.91,12.0,0.7,1,3,0,0,0,2,1,0.0
4,34.0,33342.0,108163.0,684.0,97.0,4.0,19.86,48.0,0.74,2,1,1,1,1,1,0,0.0


---