# Import Necessary Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as skl

# Load and Read the dataset

In [None]:
# Location of the loan-prediction training CSV inside the Kaggle input directory.
TRAIN_CSV_PATH = "../input/loan-prediction-problem-dataset/train_u6lujuX_CVtuZ9i.csv"

# Read the raw training data into a DataFrame and preview its first rows.
df = pd.read_csv(TRAIN_CSV_PATH)
df.head()

In [None]:
# Show the column labels of the loaded dataset (bare expression, rendered as the cell output).
df.columns

In [None]:
# Show the dataset dimensions as a (number_of_rows, number_of_columns) tuple.
df.shape

In [None]:
# Display pandas' default summary statistics for the dataset.
df.describe()

In [None]:
# Print a concise summary of the DataFrame: columns, non-null counts and dtypes.
df.info()

The dataset has 13 attributes in total: 12 are independent variables and 1 (Loan_Status) is the dependent variable.
We can also see the data type of each variable.

# Data Preprocessing

In [None]:
# Count the missing (null) values in each column of the dataset.
df.isnull().sum()

In [None]:
# Impute the missing values of the numerical columns.
# NOTE(review): these statistics are computed on the full dataset, i.e. before the
# train/test split performed later in the notebook, so held-out rows influence them.

# LoanAmount and Loan_Amount_Term: replace missing entries with the column mean.
LoanAmountMean = df["LoanAmount"].mean()
LoanAmountTermMean = df["Loan_Amount_Term"].mean()
df["LoanAmount"] = df["LoanAmount"].fillna(LoanAmountMean)
df["Loan_Amount_Term"] = df["Loan_Amount_Term"].fillna(LoanAmountTermMean)

# Credit_History: replace missing entries with the most frequent value. The value is
# derived from the data (the notebook previously hard-coded 1.0) so the cell stays
# correct if the data changes, matching how the categorical columns are imputed.
CreditHistoryMode = df["Credit_History"].mode()[0]
df["Credit_History"] = df["Credit_History"].fillna(CreditHistoryMode)

In [None]:
# Impute each categorical column with its own most frequent value (mode).
for column in ("Gender", "Married", "Dependents", "Self_Employed"):
    most_common = df[column].mode()[0]
    df[column] = df[column].fillna(most_common)

In [None]:
# Re-count the nulls per column to verify that there are no missing values left in the dataset.
df.isnull().sum()

# Creating New Features

In [None]:
# Combine the two income columns into a single TotalIncome feature, then discard
# the two source columns together with the Loan_ID column.
df["TotalIncome"] = df["ApplicantIncome"].add(df["CoapplicantIncome"])
cols = ["ApplicantIncome", "CoapplicantIncome", "Loan_ID"]
df.drop(columns=cols, inplace=True)


In [None]:
# Preview the first rows to check the new TotalIncome column and the dropped columns.
df.head()

# Exploratory Data Analysis

In [None]:
# Categorical attributes visualization: number of applicants per Gender value.
# The column is passed by keyword (x=...) together with data=df: recent seaborn
# releases accept only `data` positionally, so a positional Series is not read as x.
sns.countplot(x="Gender", data=df)

This plot shows that the majority of the applicants are male.

In [None]:
# Number of applicants per Married value. The column is passed as x= with data=df
# because recent seaborn releases accept only `data` as a positional argument.
sns.countplot(x="Married", data=df)

The majority of the applicants are married.

In [None]:
# Number of applicants per Dependents value. The column is passed as x= with data=df
# because recent seaborn releases accept only `data` as a positional argument.
sns.countplot(x="Dependents", data=df)

Most of the applicants have 0 dependents, and very few have 3+ dependents.

In [None]:
# Number of applicants per Education value. The column is passed as x= with data=df
# because recent seaborn releases accept only `data` as a positional argument.
sns.countplot(x="Education", data=df)

We can see from the above graph that most of the applicants are Graduate.

In [None]:
# Number of applicants per Self_Employed value. The column is passed as x= with data=df
# because recent seaborn releases accept only `data` as a positional argument.
sns.countplot(x="Self_Employed", data=df)

Most of the applicants are not self employed.

In [None]:
# Numerical attributes visualization: distribution of TotalIncome.
# sns.distplot is deprecated; histplot with a KDE overlay and density scaling
# is the documented replacement and draws the equivalent figure.
sns.histplot(data=df, x="TotalIncome", kde=True, stat="density")

The majority of the applicants have a TotalIncome between 0 and 10,000; only a few are at 20,000 or above.
The graph is right skewed (positively skewed), i.e. most of the applicants are on the left side with a long tail to the right, which is not a good distribution for training the model. So we will apply the log function to the column to normalize the attribute and bring it closer to a bell curve.

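If the graph is left skewed or right skewed, you can apply
1. Log transformation
2. Min-Max Normalization
3. Standardization
These are common techniques for preparing a numerical attribute so that the model trains better. Note that only the log transformation changes the shape of the distribution (it reduces the skew); Min-Max normalization and standardization only rescale the values.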

In [None]:
# Apply a natural-log transformation to TotalIncome, overwriting the column in place.
# NOTE(review): re-running this cell applies the log a second time, and np.log returns
# -inf for 0 -- this assumes every TotalIncome value is strictly positive; TODO confirm.
df["TotalIncome"]= np.log(df["TotalIncome"])

In [None]:
# Re-plot the distribution of TotalIncome to inspect the effect of the log transform.
# sns.distplot is deprecated; histplot with a KDE overlay and density scaling
# is the documented replacement and draws the equivalent figure.
sns.histplot(data=df, x="TotalIncome", kde=True, stat="density")

In [None]:
# Distribution of LoanAmount.
# sns.distplot is deprecated; histplot with a KDE overlay and density scaling
# is the documented replacement and draws the equivalent figure.
sns.histplot(data=df, x="LoanAmount", kde=True, stat="density")

The distribution of LoanAmount is also right skewed (most values on the left with a long tail to the right), so we will apply the log transformation here as well.

In [None]:
# Apply a natural-log transformation to LoanAmount (overwriting the column) and
# plot the resulting distribution.
# NOTE(review): re-running this cell applies the log a second time.
df["LoanAmount"] = np.log(df["LoanAmount"])
# sns.distplot is deprecated; histplot with a KDE overlay and density scaling
# is the documented replacement and draws the equivalent figure.
sns.histplot(data=df, x="LoanAmount", kde=True, stat="density")

so now the distribution of LoanAmount is better than before.

In [None]:
# Distribution of Loan_Amount_Term.
# sns.distplot is deprecated; histplot with a KDE overlay and density scaling
# is the documented replacement and draws the equivalent figure.
sns.histplot(data=df, x="Loan_Amount_Term", kde=True, stat="density")

In [None]:
# Distribution of Credit_History.
# sns.distplot is deprecated; histplot with a KDE overlay and density scaling
# is the documented replacement and draws the equivalent figure.
sns.histplot(data=df, x="Credit_History", kde=True, stat="density")

No need to apply transformation here because values are already in the range of 0-1

# Correlation Matrix

A correlation matrix is used to see the relationship between variables. If the correlation between two variables is high, drop one of them (this is the usual best practice).

In [None]:
# Pairwise correlations between the numeric columns, drawn as an annotated heatmap.
# The categorical columns are still strings at this point (they are label-encoded
# later in the notebook) and pandas >= 2.0 raises when DataFrame.corr() meets
# non-numeric columns, so the numeric columns are selected explicitly. Older pandas
# silently ignored the non-numeric columns, so the resulting matrix is the same.
corr = df.select_dtypes(include="number").corr()
plt.figure(figsize=(12,9))
sns.heatmap(corr, annot = True)

In [None]:
# Preview the first rows of the DataFrame in its current state.
df.head()

# Label Encoding 

## Converting Categorical variables into numerical using label encoder

In [None]:
from sklearn.preprocessing import LabelEncoder

# Columns to be replaced by integer codes (this list includes the Loan_Status target).
cols = ["Gender", "Married", "Education", "Self_Employed", "Property_Area", "Loan_Status", "Dependents"]
le = LabelEncoder()
# The same encoder object is re-fitted on every column in turn, so after the loop
# it only remembers the classes of the last column processed.
for column_name in cols:
    le.fit(df[column_name])
    df[column_name] = le.transform(df[column_name])

In [None]:
# Preview the first rows to check the label-encoded columns.
df.head()

Now we can see all the columns are converted into numerical columns and we can now easily train our model.

# Train-Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Target vector is Loan_Status; the feature matrix is every remaining column.
Y = df["Loan_Status"]
X = df.drop(columns="Loan_Status")

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

We have split the data: 20% for testing and 80% for training.

# Model Training (Logistic Regression, Decision Tree, Random Forest)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Logistic regression baseline.
# max_iter is raised from the default of 100: the features are not scaled, so the
# default solver is likely to stop at the iteration cap (ConvergenceWarning) before
# the coefficients have converged.
model = LogisticRegression(max_iter=1000)
model.fit(x_train, y_train)
# Accuracy (in percent) on the held-out test split.
print("Accuracy of model is", model.score(x_test, y_test) * 100)
# Mean 5-fold cross-validated accuracy (in percent) over the full X, Y;
# cross_val_score fits clones of the estimator, so `model` itself is unchanged.
score = cross_val_score(model, X, Y, cv=5)
print("Cross Validation is", np.mean(score) * 100)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree baseline. random_state is fixed (same seed as the train/test split)
# so the reported scores are reproducible from run to run.
model = DecisionTreeClassifier(random_state=42)
model.fit(x_train, y_train)
# Accuracy (in percent) on the held-out test split.
print("Accuracy of model is", model.score(x_test, y_test) * 100)
# Mean 5-fold cross-validated accuracy (in percent) over the full X, Y.
score = cross_val_score(model, X, Y, cv=5)
print("Cross Validation is", np.mean(score) * 100)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default hyper-parameters. random_state is fixed (same seed as
# the train/test split) so the reported scores are reproducible from run to run.
model = RandomForestClassifier(random_state=42)
model.fit(x_train, y_train)
# Accuracy (in percent) on the held-out test split.
print("Accuracy of model is", model.score(x_test, y_test) * 100)
# Mean 5-fold cross-validated accuracy (in percent) over the full X, Y.
score = cross_val_score(model, X, Y, cv=5)
print("Cross Validation is", np.mean(score) * 100)

# Fine Tuning

In [None]:
# Random forest with hand-picked hyper-parameters. random_state is fixed (same seed
# as the train/test split) so that a change in score can be attributed to the
# hyper-parameters rather than to run-to-run randomness.
model = RandomForestClassifier(
    n_estimators=100,
    min_samples_split=25,
    max_depth=7,
    max_features=1,
    random_state=42,
)
model.fit(x_train, y_train)
# Accuracy (in percent) on the held-out test split.
print("Accuracy of model is", model.score(x_test, y_test) * 100)
# Mean 5-fold cross-validated accuracy (in percent) over the full X, Y.
score = cross_val_score(model, X, Y, cv=5)
print("Cross Validation is", np.mean(score) * 100)

# Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the most recently fitted `model` on the held-out test split.
y_predicted = model.predict(x_test)
cm = confusion_matrix(y_test, y_predicted)
# fmt="d" prints the cell counts as plain integers; the default annotation format
# switches to scientific notation for counts of 100 or more.
ax = sns.heatmap(cm, annot=True, fmt="d")
# confusion_matrix puts the true labels on the rows and the predictions on the columns.
ax.set_xlabel("Predicted label")
ax.set_ylabel("True label")
