In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Importing Important Libraries 

import numpy as np 
import pandas as pd
import pprint
import matplotlib.pyplot as plt 
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Suppress warnings
import warnings
# NOTE(review): 'ignore' with no category silences every warning for the rest
# of the session, including deprecation notices from the libraries used below.
warnings.filterwarnings('ignore')

In [None]:
# Read the training CSV into `df` and preview the first rows
train_csv_path = "/kaggle/input/analytics-vidhya-loan-prediction/train.csv"
df = pd.read_csv(train_csv_path)
df.head()

In [None]:
# Check whether the dataset is imbalanced: share of each Loan_Status class, in percent

status_pct = df.Loan_Status.value_counts() / df.shape[0] * 100
print(status_pct)

plt.bar(status_pct.keys(), status_pct)
plt.title("Check for Imbalanced Dataset")
plt.xlabel("Loan Given (YES/NO)")
plt.ylabel("% of People")
plt.tight_layout()

In [None]:
# Count the missing values in each column of the dataframe
df.isna().sum()

In [None]:
# Define a new binary feature: 1 when CoapplicantIncome is non-zero, else 0.
# A vectorised comparison replaces the row-by-row Python loop (which also
# zipped in ApplicantIncome without ever reading it).  NaN != 0 evaluates to
# True, so a missing income is flagged as 1, exactly as the loop did.
df["HasCoapplicant"] = (df["CoapplicantIncome"] != 0).astype("int64")
       

In [None]:
# Report the number of distinct values and the dtype of every column, skipping the first one
for column in df.columns[1:]:
    series = df[column]
    print(f"Unique labels for {column} = ({series.nunique()},  {series.dtype}) ")

In [None]:
# Handling the categorical variables and missing values.
#
# Every column is reassigned through ``df[col] = ...`` rather than
# ``df.col.replace(..., inplace=True)``: the latter modifies the Series
# returned by the column access, which pandas documents as a chained in-place
# operation that does not update ``df`` when Copy-on-Write is active.

# Label -> integer code for each categorical column (the target included).
label_codes = {
    "Gender": {"Male": 1, "Female": 0},
    "Married": {"Yes": 1, "No": 0},
    "Self_Employed": {"Yes": 1, "No": 0},
    "Education": {"Graduate": 1, "Not Graduate": 0},
    "Property_Area": {"Urban": 1, "Rural": 0, "Semiurban": 2},
    "Loan_Status": {"Y": 1, "N": 0},
    "Dependents": {"0": 0, "1": 1, "2": 2, "3+": 3},
}
for column, codes in label_codes.items():
    df[column] = df[column].replace(codes)

# Discrete features: replace NaN with the most frequent label.
# value_counts() ignores NaN and idxmax() returns the *label* with the highest
# count.  The earlier Dependents fill used value_counts()[0], which is the
# *count* of label 0, so missing rows were filled with a row count.
for column in ["Gender", "Married", "Self_Employed", "Credit_History", "Dependents"]:
    df[column] = df[column].fillna(df[column].value_counts().idxmax())

# Continuous features: replace NaN with the median of the feature.
for column in ["LoanAmount", "Loan_Amount_Term"]:
    df[column] = df[column].fillna(df[column].median())

# Make sure the encoded columns end up with a numeric dtype whatever dtype
# replace() left behind; this raises if an unmapped label is still present.
for column in label_codes:
    df[column] = pd.to_numeric(df[column])

df.head()

In [None]:
# Total number of missing cells across the whole frame; 0 means missing-value handling is done
df.isnull().sum().sum()

In [None]:
# Separate the target from the features
Y = df.pop("Loan_Status")                       # Target (pop also removes the column from df)
df.drop(columns=["Loan_ID"], inplace=True)      # Drop the identifier; df now holds the features only
X = df

In [None]:
# Standardise the feature matrix with StandardScaler
# NOTE(review): the scaler is fitted on every row before the train/test split
# performed in a later cell, so held-out rows contribute to the scaling statistics.

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(X)
X = sc.transform(X)
X

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing, stratified on Y, with a fixed seed
split_parts = train_test_split(X, Y, test_size=0.30, random_state=42, stratify=Y)
X_train, X_test, Y_train, Y_test = split_parts

###  Hyperparameter Tuning <br>

- n_estimators = number of trees in the forest
- max_features = max number of features considered for splitting a node
- max_depth = max number of levels in each decision tree
- min_samples_split = min number of data points placed in a node before the node is split
- min_samples_leaf = min number of data points allowed in a leaf node
- bootstrap = method for sampling data points (with or without replacement)

In [None]:
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV


In [None]:
# Search space for the random-forest hyper-parameters.

# Number of trees in the forest: 200, 400, ..., 2000
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]

# Number of features to consider at every split.
# 'auto' is left out: for classifiers it was an alias of 'sqrt', and newer
# scikit-learn releases reject it, so sampling it only wastes search iterations.
max_features = ['sqrt', 'log2']

# Maximum number of levels in a tree: 10, 20, ..., 110, plus None (no limit)
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)

# Minimum number of samples required to split a node.
# randint(low, high) draws integers from [low, high).  The lower bound is 2
# because scikit-learn requires an integer min_samples_split to be >= 2; a draw
# of 1 makes every fit for that candidate fail.
min_samples_split = randint(2, 10)

# Minimum number of samples required at each leaf node (1..9)
min_samples_leaf = randint(1, 10)

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Method to select the tree building criterion
criterion = ["gini", "entropy"]

random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap,
    'criterion': criterion,
}

print(random_grid)

In [None]:
# Caution: this step takes time to complete, depending on the machine.
# Use the random grid to search for the best hyper-parameters.

# Seed the forest itself so that the search is repeatable; the random_state
# passed to RandomizedSearchCV only controls the sampling of candidates.
rf = RandomForestClassifier(random_state=42)

# 50 sampled parameter combinations, each scored with 10-fold cross-validation
# (50 * 10 = 500 fits in total).
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=50,
    cv=10,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)

# Fit the random search model to get the best parameters
rf_random.fit(X_train, Y_train)

In [None]:
# Show the best-performing parameter combination for the RandomForest classifier
# and its cross-validated score, one per line

print(rf_random.best_params_, rf_random.best_score_, sep="\n")

In [None]:
# Build the model with the tuned hyper-parameters.
# NOTE(review): the values are hard-coded (presumably copied from an earlier
# run of the search) -- confirm they still match rf_random.best_params_.
# random_state is fixed so the fitted forest is reproducible across runs.
classifier = RandomForestClassifier(
    bootstrap=True,
    criterion='entropy',
    max_depth=None,
    max_features='log2',
    min_samples_leaf=6,
    min_samples_split=8,
    n_estimators=1800,
    n_jobs=-1,
    random_state=42,
)

In [None]:
# Train the classifier on the training split
classifier.fit(X=X_train, y=Y_train)

### Feature Selection

In [None]:
# Horizontal bar chart of the forest's feature importances, sorted in ascending order
forest_importances = classifier.feature_importances_
order = forest_importances.argsort()

plt.barh(df.columns[order], forest_importances[order])
plt.title("Feature Importance for Random Forest classifier")
plt.xlabel("Random Forest Feature Importance")
plt.tight_layout()


In [None]:
# Get numerical feature importances
importances = list(classifier.feature_importances_)

# List of (feature name, importance rounded to two decimals) tuples
feature_importances = [
    (feature, round(importance, 2))
    for feature, importance in zip(list(df.columns), importances)
]

# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key=lambda pair: pair[1], reverse=True)

# Print the features and their importances.  A plain loop is used instead of a
# list comprehension so the cell does not build, and display, a list of None.
for pair in feature_importances:
    print('Variable: {:20} Importance: {}'.format(*pair))


In [None]:
# Keep the features whose rounded importance is at least 0.1
ImpFeatures = [name for name, score in feature_importances if score >= 0.1]
print("Most Important Features are :", ImpFeatures)

In [None]:
# Rebuild the feature dataframe from the important columns only
features = pd.DataFrame(
    {name: df[name] for name in ImpFeatures if name in df.columns}
)

features.head()

In [None]:
# Standardise the reduced feature frame
# NOTE(review): as with the full feature set, the scaler is fitted on all rows
# before they are split into train and test parts.

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(features)
Scaled_features = sc.transform(features)
Scaled_features

In [None]:
# Split the scaled, reduced features: 30% held out, stratified on Y, fixed seed

from sklearn.model_selection import train_test_split
split_parts = train_test_split(Scaled_features, Y, test_size=0.30, random_state=42, stratify=Y)
X_train, X_test, Y_train, Y_test = split_parts

In [None]:
# Search space for the random-forest hyper-parameters (reduced feature set).

# Number of trees in the forest: 200, 400, ..., 2000
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]

# Number of features to consider at every split.
# 'auto' is left out: for classifiers it was an alias of 'sqrt', and newer
# scikit-learn releases reject it, so sampling it only wastes search iterations.
max_features = ['sqrt', 'log2']

# Maximum number of levels in a tree: 10, 20, ..., 110, plus None (no limit)
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)

# Minimum number of samples required to split a node.
# randint(low, high) draws integers from [low, high).  The lower bound is 2
# because scikit-learn requires an integer min_samples_split to be >= 2; a draw
# of 1 makes every fit for that candidate fail.
min_samples_split = randint(2, 10)

# Minimum number of samples required at each leaf node (1..9)
min_samples_leaf = randint(1, 10)

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Method to select the tree building criterion
criterion = ["gini", "entropy"]

random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap,
    'criterion': criterion,
}

print(random_grid)

In [None]:
# Randomized hyper-parameter search on the reduced feature set.
# The forest is seeded so that the search is repeatable; the random_state
# passed to RandomizedSearchCV only controls the sampling of candidates.
rf = RandomForestClassifier(random_state=42)

# 50 sampled parameter combinations, each scored with 10-fold cross-validation
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=50,
    cv=10,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)
rf_random.fit(X_train, Y_train)

In [None]:
# Best hyper-parameter combination found by the randomized search fitted above
print(rf_random.best_params_)

In [None]:
# Build the model with the tuned hyper-parameters.
# NOTE(review): the values are hard-coded (presumably copied from an earlier
# run of the search) -- confirm they still match rf_random.best_params_.
# random_state is fixed so the fitted forest and its predictions are
# reproducible across runs.
classifier = RandomForestClassifier(
    bootstrap=True,
    criterion='gini',
    max_depth=110,
    max_features='log2',
    min_samples_leaf=8,
    min_samples_split=5,
    n_estimators=800,
    n_jobs=-1,
    random_state=42,
)

In [None]:
# Train the classifier on the training split
classifier.fit(X=X_train, y=Y_train)

In [None]:
# Predict the class labels for the held-out rows and display them
y_pred = classifier.predict(X=X_test)
y_pred

In [None]:
# Side-by-side frame of the true test labels and the predicted labels

comparison_df = pd.DataFrame(
    {"Test_Data": Y_test, "Predicted_Data": y_pred}
).reset_index(drop=True)
comparison_df.head()

In [None]:
# Confusion matrix of true vs. predicted labels
confusion = metrics.confusion_matrix(y_true=Y_test, y_pred=y_pred)
confusion

In [None]:
import seaborn as sns

# Draw the confusion matrix as an annotated heatmap.
# (The `labels` list that used to be defined here was never passed to the plot
# and has been removed.)
sns.heatmap(confusion, annot=True, cmap='Blues', fmt='.0f')
plt.title("Heatmap confusion matrix")
plt.xlabel("Predicted Values")
plt.ylabel("True Values")
plt.tight_layout()

In [None]:
# Unpack the 2x2 confusion matrix in row-major order:
# [0,0] true negatives, [0,1] false positives, [1,0] false negatives, [1,1] true positives
TN, FP, FN, TP = confusion.ravel()

In [None]:
# Accuracy: share of correctly classified test rows, in percent
accuracy_pct = metrics.accuracy_score(Y_test, y_pred) * 100
print(f"Accuracy : {accuracy_pct:.2f}", "%")

# Sensitivity: TP / (TP + FN), in percent
sensitivity_pct = TP / float(TP + FN) * 100
print(f"Sensitivity : {sensitivity_pct:.2f} %")

# Specificity: TN / (TN + FP), in percent
specificity_pct = TN / float(TN + FP) * 100
print(f"Specificity : {specificity_pct:.2f} %")

In [None]:
# Classification report: text summary from sklearn.metrics comparing the true
# test labels with the predicted labels
print(metrics.classification_report(Y_test,y_pred))

#### Note:
Although this dataset comes with a separate test dataset for predicting the output value, here the training set is split into train and test subsets so that the evaluation metrics can be computed.
However, when the classification model is applied to the actual test data, an accuracy of 0.778 is achieved.

<h3 align="center"> _____Thank You____ </h3>
<h3 align="center"> Constructive criticism is appreciated </h3>