# Loan Data Cleaning 

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the raw loan records from the CSV in the working directory.
loan_csv_path = 'loan.csv'
df = pd.read_csv(loan_csv_path)

In [None]:
# Preview the first three rows of the loaded data.
df.head(n=3)

In [None]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

In [None]:
# Number of missing entries in each column.
df.isna().sum()

### % of missing values for each column

In [None]:
# Missing entries per column, expressed as a percentage of the row count.
missing_per_column = df.isnull().sum()
missing_per_column / len(df) * 100

### Total % of missing values

In [None]:
# Missing cells as a percentage of every cell in the frame (rows * columns).
total_missing = df.isnull().sum().sum()
total_missing / df.size * 100

### Heatmap showing where values are missing in each column


In [None]:
# Plot the boolean missing-value mask: one cell per (row, column) entry.
missing_mask = df.isna()
sns.heatmap(missing_mask)
plt.show()

In [None]:
# Print each column's name, non-null count and dtype.
df.info()

In [None]:
# Replace missing Gender entries with the most frequent category.
most_common_gender = df['Gender'].mode()[0]
df['Gender'] = df['Gender'].fillna(most_common_gender)

In [None]:
# Missing Gender entries remaining after the fill.
df['Gender'].isna().sum()

### Filling missing values in all categorical columns with the column mode

In [None]:
# Fill every object-dtype column's gaps with that column's most frequent value.
categorical_columns = df.select_dtypes(include=object).columns
for column in categorical_columns:
    most_frequent = df[column].mode()[0]
    df[column] = df[column].fillna(most_frequent)

In [None]:
# Missing entries remaining in the object-dtype columns.
object_columns = df.select_dtypes(include=object)
object_columns.isna().sum()

In [None]:
# Names of the float64/int64 columns; these are the ones imputed numerically below.
numeric_dtypes = ['float64', 'int64']
numerical_columns = df.select_dtypes(include=numeric_dtypes).columns

### Filling missing values in all numerical columns with sklearn's SimpleImputer (mean)

In [None]:
from sklearn.impute import SimpleImputer

In [None]:
# Learn each numeric column's mean, then substitute it for the missing entries.
si = SimpleImputer(strategy='mean')
numeric_frame = df[numerical_columns]
si.fit(numeric_frame)
imputed_arr = si.transform(numeric_frame)
imputed_arr

In [None]:
# Rebuild a labelled frame from the imputer's plain array. Reusing df's index
# keeps the rows aligned with df when the columns are written back, even if
# df does not carry the default 0..n-1 index.
new_df = pd.DataFrame(imputed_arr, columns=numerical_columns, index=df.index)

In [None]:
# Missing entries per column in the imputed frame.
new_df.isna().sum()

In [None]:
# Overwrite the numeric columns of the original frame with their imputed versions,
# then list the per-column missing counts that remain.
df[numerical_columns] = new_df
df.isna().sum()

### One Hot Encoding nominal data

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# One-hot encoder with default settings; it is fitted on the nominal columns in a later cell.
ohe = OneHotEncoder()

In [None]:
# Fit the encoder on the nominal columns and densify its output into a plain array.
nominal_columns = ['Gender', 'Married', 'Education', 'Self_Employed', 'Loan_Status']
encoded_matrix = ohe.fit_transform(df[nominal_columns])
arr_enc = encoded_matrix.toarray()

In [None]:
# Label the encoded array. The ten names assume two categories per encoded
# column, in the order the encoder emits them (sorted within each column) --
# TODO confirm against ohe.categories_.
encoded_names = [
    'Gender_Female', 'Gender_Male',
    'Married_No', 'Married_Yes',
    'Graduate_Yes', 'Graduate_No',
    'Self_Employed_No', 'Self_Employed_Yes',
    'Loan_Status_No', 'Loan_Status_Yes',
]
# Reusing df's index keeps the rows aligned with df for the later column-wise
# concat, even if df does not carry the default 0..n-1 index.
df_enc = pd.DataFrame(arr_enc, columns=encoded_names, index=df.index)

In [None]:
# Keep a single indicator per encoded feature: drop the other column of each pair.
redundant_indicators = ['Gender_Female', 'Married_No', 'Graduate_No', 'Self_Employed_No', 'Loan_Status_No']
df_enc = df_enc.drop(columns=redundant_indicators)

# The raw text columns are superseded by their encoded counterparts.
encoded_sources = ['Gender', 'Married', 'Education', 'Self_Employed', 'Loan_Status']
df = df.drop(columns=encoded_sources)

In [None]:
# Append the indicator columns to the right of the remaining columns.
frames_side_by_side = (df, df_enc)
df = pd.concat(frames_side_by_side, axis='columns')

### Ordinal Encoding ordinal data

In [None]:
from sklearn.preprocessing import OrdinalEncoder

In [None]:
# Category order for Property_Area, wrapped in an outer list because the
# encoder takes one category list per encoded column.
location = [
    ['Urban', 'Semiurban', 'Rural'],
]

In [None]:
# Ordinal encoder that uses the explicit category order held in `location`.
oe = OrdinalEncoder(categories=location)

In [None]:
# Replace the Property_Area labels with their ordinal codes.
property_area = df[['Property_Area']]  # double brackets: the encoder is given a 2-D frame
df['Property_Area'] = oe.fit_transform(property_area)

### Outlier Detection with BoxPlot

In [None]:
# Box plot of ApplicantIncome to inspect its spread and outlying points.
sns.boxplot(data=df, x='ApplicantIncome')
plt.show()

In [None]:
# Box plot of CoapplicantIncome to inspect its spread and outlying points.
sns.boxplot(data=df, x='CoapplicantIncome')
plt.show()

In [None]:
# Box plot of LoanAmount to inspect its spread and outlying points.
sns.boxplot(data=df, x='LoanAmount')
plt.show()

In [None]:
# Display the frame as it stands after imputation and encoding.
df

In [None]:
# Distribution plot of Loan_Amount_Term.
sns.displot(data=df, x='Loan_Amount_Term')
plt.show()

### Outlier removal using IQR

In [None]:
# Descriptive statistics, for reference before any outliers are removed.
df.describe()

In [None]:
# (rows, columns) before outlier removal, for comparison with the shape after filtering.
df.shape

In [None]:
# First and third quartiles of ApplicantIncome and the interquartile range between them.
applicant_income = df['ApplicantIncome']
Q1 = applicant_income.quantile(0.25)
Q3 = applicant_income.quantile(0.75)
IQR = Q3 - Q1

In [None]:
# Fences sit 1.5 * IQR outside the quartiles; values beyond them are treated as outliers.
fence_offset = 1.5 * IQR
min_val = Q1 - fence_offset
max_val = Q3 + fence_offset
min_val, max_val

In [None]:
# Keep only the rows whose ApplicantIncome lies inside both IQR fences.
# The lower fence (min_val) is applied as well as the upper one, so unusually
# low values are removed too; `between` is inclusive at both ends.
within_fences = df['ApplicantIncome'].between(min_val, max_val)
df = df[within_fences]

In [None]:
# (rows, columns) after the ApplicantIncome filter; compare with the shape shown before it.
df.shape

In [None]:
# Re-draw the ApplicantIncome box plot on the filtered rows.
sns.boxplot(data=df, x='ApplicantIncome')
plt.show()

### Outlier removal using the mean ± 3 standard deviations rule

In [None]:
# Descriptive statistics after the ApplicantIncome filter.
df.describe()

In [None]:
# Bounds at three standard deviations either side of the CoapplicantIncome mean.
coapplicant_mean = df['CoapplicantIncome'].mean()
three_sigma = 3 * df['CoapplicantIncome'].std()
min_val = coapplicant_mean - three_sigma
max_val = coapplicant_mean + three_sigma
min_val, max_val

In [None]:
# Keep only the rows whose CoapplicantIncome lies within both computed bounds.
# The lower bound (min_val) is applied as well as the upper one; `between`
# is inclusive at both ends.
within_bounds = df['CoapplicantIncome'].between(min_val, max_val)
df = df[within_bounds]

In [None]:
# Re-draw the CoapplicantIncome box plot on the filtered rows.
sns.boxplot(data=df, x='CoapplicantIncome')
plt.show()


### Outlier removal using Z-Score

In [None]:
# Standardise LoanAmount: distance from the mean in units of standard deviation.
loan_amount = df['LoanAmount']
centered = loan_amount - loan_amount.mean()
z_score = centered / loan_amount.std()
z_score

In [None]:
# Attach the z-scores as a column. df is the result of earlier boolean
# filtering, so item assignment on it can raise SettingWithCopyWarning;
# assign() returns a new frame with the column added instead.
df = df.assign(z_score=z_score)

In [None]:
# Keep rows whose LoanAmount is within three standard deviations of the mean.
# The absolute value is used so extreme values on either side are removed,
# not only the high ones.
df = df[df['z_score'].abs() < 3]