In [None]:
## Importing Important Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
from matplotlib import pyplot as plt
from IPython.display import Image

In [None]:
## Loading the Dataset
# Read 'train_dataset.csv' (resolved relative to the working directory) into
# the DataFrame `df` that every later cell transforms and models.
df = pd.read_csv('train_dataset.csv')

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for a first look at the data.
df.describe()

In [None]:
# Column names, non-null counts and dtypes -- shows which columns are text and which have gaps.
df.info()

In [None]:
## Count the null values in each column
df.isnull().sum()

In [None]:
## Fill missing values in the NUMERICAL columns with the column mean
# NOTE(review): Credit_History is entered as a 0/1 flag in the prediction cells
# further down, so a mean fill leaves fractional values in the imputed rows --
# confirm that this is intended rather than a mode fill.
for numeric_col in ('LoanAmount', 'Loan_Amount_Term', 'Credit_History'):
    df[numeric_col] = df[numeric_col].fillna(df[numeric_col].mean())

In [None]:
## Fill missing values in the CATEGORICAL columns with the most frequent value (mode)
for categorical_col in ('Gender', 'Married', 'Dependents', 'Self_Employed'):
    # mode() returns a Series; its first entry is the most frequent value
    most_common = df[categorical_col].mode()[0]
    df[categorical_col] = df[categorical_col].fillna(most_common)

In [None]:
## Re-count nulls per column; the seven columns filled above should now show 0
df.isnull().sum()

In [None]:
## Total income is likely a better predictor than applicant and co-applicant income separately
# Element-wise sum of the two income columns, stored as a new feature.
df['Total_Income'] = df['ApplicantIncome'].add(df['CoapplicantIncome'])
df.head()

In [None]:
# Add natural-log versions of the monetary / term columns.
# np.log gives -inf for 0 and NaN for negative inputs, so these source columns
# are expected to be strictly positive.
log_features = {
    'ApplicantIncomeLog': 'ApplicantIncome',
    'Loan_Amount_Term_Log': 'Loan_Amount_Term',
    'Total_Income_Log': 'Total_Income',
    'LoanAmountLog': 'LoanAmount',
}
for log_col, source_col in log_features.items():
    df[log_col] = np.log(df[source_col])

In [None]:
## Correlation between the numeric predictors
# df still contains text columns here (e.g. Gender holds 'Male'/'Female' until
# the encoding cell runs). pandas >= 2.0 raises on such columns instead of
# silently skipping them, so select the numeric columns explicitly.
corr = df.select_dtypes(include='number').corr()
plt.figure(figsize=(15,10))
sns.heatmap(corr, annot = True)

In [None]:
## Drop the identifier and the raw columns that now have log-transformed counterparts
cols = [
    'CoapplicantIncome',
    'Loan_ID',
    'Total_Income',
    'ApplicantIncome',
    'LoanAmount',
    'Loan_Amount_Term',
]
# `columns=` already names the axis, so no separate axis argument is needed.
df = df.drop(columns=cols)
df.head()

In [None]:
## Label encoding: turn each categorical column into integer codes
# One table of column -> {label: code}. The same codes are what the prediction
# prompts at the end of the notebook ask the user to type.
label_maps = {
    'Gender': {'Male': 1, 'Female': 0},
    'Married': {'Yes': 1, 'No': 0},
    'Dependents': {'0': 0, '1': 1, '2': 2, '3+': 3},
    'Education': {'Graduate': 1, 'Not Graduate': 0},
    'Property_Area': {'Urban': 2, 'Semiurban': 1, 'Rural': 0},
    'Self_Employed': {'Yes': 1, 'No': 0},
    'Loan_Status': {'Y': 1, 'N': 0},
}
for column, codes in label_maps.items():
    # Translate known labels and leave anything else untouched, exactly as the
    # dict form of replace() did. An explicit map avoids relying on replace()'s
    # silent object -> int downcast, which newer pandas releases warn about, and
    # re-running the cell stays a no-op because the integer codes are not keys.
    df[column] = df[column].map(lambda label, codes=codes: codes.get(label, label))

In [None]:
# Preview the first rows to check the encoded columns.
df.head()

In [None]:
## Separate the target (y) from the predictor columns (x)
y = df['Loan_Status']
x = df.drop(columns=['Loan_Status'])

In [None]:
##        MODEL TRAINING          ##
## Classify function
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
def classify(model, x, y, test_size=0.2, random_state=42, cv=5):
    """Fit `model` on a train split and print three accuracy figures.

    Parameters
    ----------
    model : estimator with ``fit`` and ``score`` methods
        Fitted in place on the training split.
    x, y : predictors and target
        Passed to ``train_test_split`` and ``cross_val_score``.
    test_size : float, default 0.2
        Fraction of rows held out for the test split.
    random_state : int, default 42
        Seed for the train/test split so the printed numbers are repeatable.
    cv : int, default 5
        Number of cross-validation folds.

    Prints the hold-out accuracy, the training accuracy and the mean
    cross-validation score, each as a percentage. Returns None.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, random_state=random_state
    )
    model.fit(x_train, y_train)
    print("Accuracy on test set is", model.score(x_test, y_test)*100)
    print("Accuracy on train set is", model.score(x_train,y_train)*100)
    # Cross-validation over the full x, y gives a less split-dependent estimate
    # than the single hold-out figure above (e.g. cv=5: train on 4 folds, test on 1).
    score = cross_val_score(model, x, y, cv=cv)
    print("Cross validation is",np.mean(score)*100)

In [None]:
## Logistic Regression
from sklearn.linear_model import LogisticRegression
# Baseline model: logistic regression with scikit-learn's default settings.
model = LogisticRegression()
# classify() prints hold-out, training and cross-validated accuracy.
classify(model, x , y)

In [None]:
## Decision Tree
from sklearn.tree import DecisionTreeClassifier
# Seed the tree: without random_state the fitted tree, and therefore the
# printed accuracies, can differ from one run of the notebook to the next.
model = DecisionTreeClassifier(random_state=42)
classify(model, x, y)

In [None]:
## Random Forest
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier
# Seed the forest: bootstrap sampling and feature selection are random, so an
# unseeded model prints different accuracies on every run.
model = RandomForestClassifier(random_state=42)
classify(model, x, y)

In [None]:
## Gradient Boosting Classifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
#---
# Hyperparameter grid: 3 * 3 * 3 * 2 * 3 = 162 combinations, each fitted on
# 10 folds (1620 fits plus the final refit), so this cell is slow to run.
num_estimators = [250, 500, 750]
learn_rates = [0.05, 0.075,  0.1]
max_depths = [3, 4, 5]
min_samples_leaf = [2, 3]
min_samples_split = [2, 5, 7]
#---
param_grid = {
    'n_estimators': num_estimators,
    'learning_rate': learn_rates,
    'max_depth': max_depths,
    'min_samples_leaf': min_samples_leaf,
    'min_samples_split': min_samples_split,
}
# Seed the estimator so the search picks the same winner on every run.
model = GradientBoostingClassifier(random_state=42)
# Plain (unshuffled) 10-fold split over the rows in their file order.
kfold = KFold(n_splits=10)
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=kfold, n_jobs=2)
# With the default refit=True the best parameter set is refitted on all of x, y,
# which is what grid.predict() uses further down.
grid_result = grid.fit(x,y)

In [None]:
# grid.score(x, y) evaluates the refitted best model on the same rows it was
# trained on, so it is an optimistic training accuracy. Print the
# cross-validated score from the search as the realistic figure.
print("Best parameters:", grid.best_params_)
print("Mean cross-validated accuracy of best parameters:", grid.best_score_)
grid.score(x,y)

In [None]:
# Horizontal bar chart of the tuned model's feature importances.
raw_importance = grid.best_estimator_.feature_importances_
# Rescale so the most important feature reads 100.
feature_importance = 100.0 * (raw_importance / raw_importance.max())
# Ascending order puts the most important feature at the top of the chart.
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + .5
fig, ax = plt.subplots(figsize=(8, 18))
ax.barh(pos, feature_importance[sorted_idx], align='center')
ax.set_yticks(pos)
ax.set_yticklabels(x.columns[sorted_idx])
ax.set_xlabel('Relative Importance')
ax.set_title('Variable Importance')
plt.show()

In [None]:
# Applicant income as an integer (int() raises ValueError on non-integer text).
# np.log is applied to it when the prediction row is built, so enter a positive value.
i1 = int(input("Applicant Income = "))

In [None]:
# Co-applicant income as an integer; added to the applicant income before the log is taken.
i2 = int(input("Co-Applicant Income = "))

In [None]:
# Loan amount as a positive integer (np.log is applied to it later).
# NOTE(review): presumably must use the same unit as the training LoanAmount column -- confirm.
i3 = int(input("Loan Amount = "))

In [None]:
# Loan term as a positive integer (np.log is applied to it later).
# NOTE(review): presumably must use the same unit as the training Loan_Amount_Term column -- confirm.
i4 = int(input("Loan Term = "))

In [None]:
# Property area code; the 2/1/0 values match the Property_Area encoding used for training.
i5 = int(input("Property Area (Enter 2 if 'Urban', 1 if 'Semi-Urban', 0 if 'Rural') = "))

In [None]:
# Marital status flag; 1/0 matches the 'Yes'/'No' encoding of the Married column.
i6 = int(input("Married (Enter 1 if 'Yes', 0 if 'No')  = "))

In [None]:
# Gender flag; 1/0 matches the 'Male'/'Female' encoding of the Gender column.
i7 = int(input("Gender (Enter 1 if 'Male', 0 if 'Female') = "))

In [None]:
# Dependents code 0-3; 3 stands for the '3+' category of the Dependents column.
i8 = int(input("Number of Dependents (Enter 0 if 0 Dependents, 1 if 1 Dependent, 2 if 2 Dependents, 3 if 3 or more Dependents) = "))

In [None]:
# Self-employment flag; 1/0 matches the 'Yes'/'No' encoding of the Self_Employed column.
# Prompt typo fixed and the " = " suffix added to match the other prompts.
i9 = int(input("Whether you are self-employed (Enter 1 if 'Yes', 0 if 'No') = "))

In [None]:
# Education flag; 1/0 matches the 'Graduate'/'Not Graduate' encoding of the Education column.
i10 = int(input("Education (Enter 1 if 'Graduate', 0 otherwise) = "))

In [None]:
# Credit history flag (1 or 0), used as the Credit_History feature of the prediction row.
i11 = int(input("Credit History (Enter 1 if no pending loans, 0 otherwise) = "))

In [None]:
# Build the single-row frame to score, using the same column names and log
# transforms as the training features. The credit-history key must be
# 'Credit_History' (with underscore) to match the column the model was fitted on.
df_test = pd.DataFrame([{
    'Gender': i7,
    'Married': i6,
    'Dependents': i8,
    'Education': i10,
    'Self_Employed': i9,
    'Credit_History': i11,
    'Property_Area': i5,
    'ApplicantIncomeLog': np.log(i1),
    'Loan_Amount_Term_Log': np.log(i4),
    'Total_Income_Log': np.log(i1+i2),
    'LoanAmountLog': np.log(i3),
}])
# Put the columns in the training order; a missing or misspelled feature fails
# here with a KeyError rather than at predict time.
df_test = df_test[x.columns]

In [None]:
# Score the entered applicant with the tuned model and show the matching picture:
# class 1 -> "Yes.jpg", anything else -> "No.jfif".
result = grid.predict(df_test)
verdict_image = "Yes.jpg" if result[0] == 1 else "No.jfif"
display(Image(url=verdict_image, width=400, height=400))