<a href="https://colab.research.google.com/github/sanskriti-iyer/loan-eligibility/blob/main/Loan_Eligibility.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**LOAN ELIGIBILITY**

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import warnings
import math
import pickle
warnings.filterwarnings('ignore')
%matplotlib inline

In [None]:
# Read the testing and training CSV files into DataFrames.
# Both paths are relative, so the files must be in the kernel's working
# directory; otherwise pd.read_csv raises FileNotFoundError.
test_file = pd.read_csv('Testing Data.csv')
train_file = pd.read_csv('Training Data.csv')
# Preview the first rows of the training data.
train_file.head()

FileNotFoundError: [Errno 2] No such file or directory: 'Testing Data.csv'

In [None]:
# Preview the first five rows of the testing data.
test_file.head(n=5)

In [None]:
# Keep independent snapshots of both frames so the data as loaded stays
# available while train_file is modified by the cleaning steps.
train_set_og, test_set_og = train_file.copy(), test_file.copy()

In [None]:
# Summary statistics of the training data.
train_file.describe()

# 1. Filling in the null values

In [None]:
# Count the missing values in each column of the training data.
train_file.isna().sum()

In [None]:
# Missing values are filled with a typical value for the column.
# Numerical columns: replace each null with that column's mean.
for numeric_col in ('LoanAmount', 'Loan_Amount_Term', 'Credit_History'):
    column_mean = train_file[numeric_col].mean()
    train_file[numeric_col] = train_file[numeric_col].fillna(column_mean)

In [None]:
# Re-check the null counts: the three numerical columns filled above
# should now report zero missing values.
train_file.isna().sum()

In [None]:
# Categorical columns: replace each null with the column's most frequent
# value (the first entry returned by mode()).
for categorical_col in ('Gender', 'Dependents', 'Self_Employed', 'Married'):
    most_common = train_file[categorical_col].mode()[0]
    train_file[categorical_col] = train_file[categorical_col].fillna(most_common)

In [None]:
# Final check of the remaining missing values per column.
train_file.isna().sum()

# 2. Data Visualization

In [None]:
#we'll be checking the categorical columns first

In [None]:
# Bar chart of the number of applicants per gender.
# The Series is passed as `x=`: in seaborn >= 0.12 the only positional
# argument of countplot is `data`, so a positional Series is no longer
# interpreted as the variable to count.
sns.countplot(x=train_file['Gender'])

_To check the male-to-female ratio_

In [None]:
# Number of rows for each Gender value.
train_file['Gender'].value_counts()

In [None]:
# Bar chart of the number of applicants per marital status.
# The Series is passed as `x=` because seaborn >= 0.12 only accepts `data`
# as a positional argument.
sns.countplot(x=train_file['Married'])

_To check married and unmarried men and women_

In [None]:
# Count married vs. unmarried male applicants.
filt1 = train_file['Gender'].eq('Male')
men_marital_status = train_file.loc[filt1, 'Married']
# True = 'Married' contains 'Yes'; missing values are counted as False.
men_marital_status.str.contains('Yes', na=False).value_counts()
# The author's run reported: married men = 369 ; unmarried men = 133

In [None]:
# Count married vs. unmarried female applicants.
filt2 = train_file['Gender'].eq('Female')
women_marital_status = train_file.loc[filt2, 'Married']
# True = 'Married' contains 'Yes'; missing values are counted as False.
women_marital_status.str.contains('Yes', na=False).value_counts()
# The author's run reported: married women = 32 ; unmarried women = 80

_To check the graduation status of men and women with regard to marital status_

In [None]:
# Group the training data by gender and pull out the male applicants.
# The key is the scalar 'Gender' rather than the one-element list ['Gender']:
# with a length-1 list key, newer pandas versions expect get_group to receive
# a 1-tuple, so get_group('Male') would warn or fail there.
grad_stat_grp = train_file.groupby('Gender')
grad_men = grad_stat_grp.get_group('Male')
grad_men

In [None]:
# Graduation status of men broken down by marital status:
# row counts for every (Married, Education) combination.
grad_men.value_counts(subset=['Married', 'Education'])

In [None]:
# Pull the rows whose Gender is 'Female' out of the gender grouping and display them.
grad_women = grad_stat_grp.get_group('Female')
grad_women

In [None]:
# Graduation status of women broken down by marital status:
# row counts for every (Married, Education) combination.
grad_women.value_counts(subset=['Married', 'Education'])

In [None]:
# Bar chart of the number of applicants per education level.
# Passed as `x=` because seaborn >= 0.12 only accepts `data` positionally.
sns.countplot(x=train_file['Education'])

In [None]:
# Bar chart of the number of applicants per self-employment status.
# Passed as `x=` because seaborn >= 0.12 only accepts `data` positionally.
sns.countplot(x=train_file['Self_Employed'])

In [None]:
# Bar chart of the number of applicants per property area.
# Passed as `x=` because seaborn >= 0.12 only accepts `data` positionally.
sns.countplot(x=train_file['Property_Area'])

In [None]:
# Bar chart of the number of rows per loan status (the target column).
# Passed as `x=` because seaborn >= 0.12 only accepts `data` positionally.
sns.countplot(x=train_file['Loan_Status'])

In [None]:
# Bar chart of the number of applicants per dependents value.
# Passed as `x=` because seaborn >= 0.12 only accepts `data` positionally.
sns.countplot(x=train_file['Dependents'])

In [None]:
# Analysing the numerical columns, starting with applicant income.
# sns.distplot is deprecated; histplot with a KDE overlay on a density
# scale is its documented replacement.
sns.histplot(x=train_file['ApplicantIncome'], kde=True, stat='density')

In [None]:
# Distribution of co-applicant income (histogram + KDE on a density scale;
# replaces the deprecated sns.distplot).
sns.histplot(x=train_file['CoapplicantIncome'], kde=True, stat='density')

In [None]:
# Distribution of the loan amount (histogram + KDE on a density scale;
# replaces the deprecated sns.distplot).
sns.histplot(x=train_file['LoanAmount'], kde=True, stat='density')

In [None]:
# The five largest loan amounts, with their row labels.
train_file['LoanAmount'].nlargest(n=5)

In [None]:
# Show the applicant with the highest loan amount. The row is looked up with
# idxmax() instead of a hard-coded position, so the cell stays correct if the
# data or its row order changes.
# (In the author's run this was applicant no. 171, a male applicant.)
train_file.loc[train_file['LoanAmount'].idxmax()]

In [None]:
# Distribution of the loan term (histogram + KDE on a density scale;
# replaces the deprecated sns.distplot).
sns.histplot(x=train_file['Loan_Amount_Term'], kde=True, stat='density')

In [None]:
# Distribution of the credit history column (histogram + KDE on a density
# scale; replaces the deprecated sns.distplot).
sns.histplot(x=train_file['Credit_History'], kde=True, stat='density')

# 3. Creating a new attribute and applying log transformation for better distribution

In [None]:
# New feature: total income = applicant income + co-applicant income.
applicant_income = train_file['ApplicantIncome']
train_file['Total_Income'] = applicant_income.add(train_file['CoapplicantIncome'])
train_file

In [None]:
# Replace any nulls in the new "Total_Income" column with the column mean.
total_income_mean = train_file['Total_Income'].mean()
train_file['Total_Income'] = train_file['Total_Income'].fillna(total_income_mean)

In [None]:
# Display train_file to inspect it after the Total_Income step.
train_file

In [None]:
# Log-transform applicant income (+1 so that a zero income maps to 0) and
# plot the transformed distribution. histplot with a KDE overlay replaces the
# deprecated sns.distplot.
train_file['ApplicantIncomeLog'] = np.log(train_file['ApplicantIncome']+1)
sns.histplot(x=train_file["ApplicantIncomeLog"], kde=True, stat='density')

In [None]:
# Log-transform co-applicant income (+1 so that a zero income maps to 0) and
# plot the transformed distribution. histplot with a KDE overlay replaces the
# deprecated sns.distplot.
train_file['CoapplicantIncomeLog'] = np.log(train_file['CoapplicantIncome']+1)
sns.histplot(x=train_file["CoapplicantIncomeLog"], kde=True, stat='density')

In [None]:
# Log-transform the loan amount (log(x + 1)) and plot the transformed
# distribution. histplot with a KDE overlay replaces the deprecated
# sns.distplot.
train_file['LoanAmountLog'] = np.log(train_file['LoanAmount']+1)
sns.histplot(x=train_file["LoanAmountLog"], kde=True, stat='density')

In [None]:
# Log-transform the loan term (log(x + 1)) and plot the transformed
# distribution. histplot with a KDE overlay replaces the deprecated
# sns.distplot.
train_file['Loan_Amount_Term_Log'] = np.log(train_file['Loan_Amount_Term']+1)
sns.histplot(x=train_file["Loan_Amount_Term_Log"], kde=True, stat='density')

In [None]:
# Log-transform the total income (log(x + 1)) and plot the transformed
# distribution. histplot with a KDE overlay replaces the deprecated
# sns.distplot.
train_file['Total_Income_Log'] = np.log(train_file['Total_Income']+1)
sns.histplot(x=train_file["Total_Income_Log"], kde=True, stat='density')

In [None]:
# Correlation matrix of the numeric columns, shown as an annotated heatmap.
# train_file still holds string columns at this point (they are label-encoded
# later), and since pandas 2.0 DataFrame.corr() raises on non-numeric data
# unless numeric_only=True is passed (parameter available from pandas 1.5).
corr = train_file.corr(numeric_only=True)
plt.figure(figsize=(15,10))
sns.heatmap(corr, annot = True, cmap="BuGn")

In [None]:
# Columns removed before training: the raw income / loan amount / loan term
# columns, Total_Income, the Loan_ID identifier and the co-applicant log column.
# Author's rationale: the model only needs basic applicant information
# (gender, education, dependents, ...) together with income and employment
# stability to judge loan eligibility, and the remaining log columns are kept
# in place of the raw values because they are better distributed for training.
cols = [
    'ApplicantIncome',
    'CoapplicantIncome',
    "LoanAmount",
    "Loan_Amount_Term",
    "Total_Income",
    'Loan_ID',
    'CoapplicantIncomeLog',
]
train_file = train_file.drop(columns=cols)
train_file.head()

# 4. Encoding

In [None]:
# Convert the text values of the categorical columns to integer codes so the
# models can be trained on them.
from sklearn.preprocessing import LabelEncoder
cols = ['Gender','Married','Education','Self_Employed','Property_Area','Loan_Status','Dependents']
# One fitted encoder is kept per column. Refitting a single shared encoder
# would discard every mapping except the last one, making it impossible to
# decode predictions or to encode new data consistently.
label_encoders = {}
for col in cols:
    le = LabelEncoder()
    train_file[col] = le.fit_transform(train_file[col])
    label_encoders[col] = le

In [None]:
# Preview the encoded frame.
# Author's note on Loan_Status: Y = 1, N = 0
# (assumes the raw labels are exactly 'Y' and 'N' - TODO confirm).
train_file.head(n=5)

In [None]:
# Separate the target (Loan_Status) from the feature columns.
y = train_file['Loan_Status']
X = train_file.drop(columns='Loan_Status')

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

# 5. Training the model

In [None]:
from sklearn.model_selection import cross_val_score, train_test_split


def classify(model, x, y):
    """Fit ``model`` on a hold-out split of ``x``/``y`` and print its scores.

    Prints the accuracy on a 25% hold-out split (``random_state=42``) and the
    mean 5-fold cross-validation score, both as percentages.

    Parameters
    ----------
    model : estimator providing ``fit`` and ``score``; it is fitted in place.
    x : feature data.
    y : target labels aligned with ``x``.

    Returns
    -------
    None
    """
    # Split the data passed in as arguments, not the notebook-level ``X``, so
    # the function honours whatever features the caller supplies.
    x_fit, x_holdout, y_fit, y_holdout = train_test_split(
        x, y, test_size=0.25, random_state=42
    )
    model.fit(x_fit, y_fit)
    print("Accuracy is", model.score(x_holdout, y_holdout)*100)
    # Cross validation on the full x/y gives a more stable estimate than the
    # single hold-out split above.
    score = cross_val_score(model, x, y, cv=5)
    print("Cross validation is",np.mean(score)*100)

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with scikit-learn's default settings,
# scored with the classify() helper.
model = LogisticRegression()
classify(model, X, y)

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Decision tree scored with the classify() helper. The seed is fixed so the
# reported scores are reproducible between runs (42 matches the seed already
# used for the train/test split).
model = DecisionTreeClassifier(random_state=42)
classify(model, X, y)

In [None]:
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier
# Random forest scored with the classify() helper. The seed is fixed so the
# reported scores are reproducible between runs (42 matches the seed already
# used for the train/test split).
model = RandomForestClassifier(random_state=42)
classify(model, X, y)

In [None]:
# Extra-trees ensemble scored with the classify() helper. The seed is fixed
# so the reported scores are reproducible between runs.
model = ExtraTreesClassifier(random_state=42)
classify(model, X, y)

# 6. Hyperparameter Tuning for choosing the ideal model

In [None]:
# Random forest with hand-picked hyperparameters, scored with classify().
# The seed is fixed so that a change in score reflects the hyperparameters
# rather than run-to-run randomness.
model = RandomForestClassifier(
    n_estimators=100,
    min_samples_split=25,
    max_depth=7,
    max_features=1,
    random_state=42,
)
classify(model, X, y)

# 7. Prediction summary and errors

In [None]:
# Fit the model that is evaluated and pickled in the following cells.
# The seed is fixed so the confusion matrix and the saved model are
# reproducible.
# NOTE(review): this uses the default hyperparameters, not the tuned values
# from the hyperparameter-tuning section - confirm that is intended.
model = RandomForestClassifier(random_state=42)
model.fit(x_train, y_train)

In [None]:
from sklearn.metrics import confusion_matrix
# Predict on the hold-out split and tabulate true labels against predictions.
y_pred = model.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
print(y_pred)
# Only the last expression of a cell is displayed, so the matrix goes last;
# a bare `cm` in the middle of the cell shows nothing.
cm

In [None]:
# Confusion matrix heatmap. fmt='d' prints the integer counts as-is (the
# default '.2g' format renders counts of 100 or more in scientific notation).
# confusion_matrix puts the true labels on rows and predictions on columns.
ax = sns.heatmap(cm, annot=True, fmt='d')
ax.set(xlabel='Predicted label', ylabel='True label')

In [None]:
# Serialize the fitted model to P1.pkl (relative to the working directory).
with open('P1.pkl', mode='wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# Reload the pickled model and keep a reference to it; calling pickle.load
# without assigning the result throws the loaded object away.
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
with open('P1.pkl','rb') as f:
    loaded_model = pickle.load(f)