In [None]:
import pandas as pd 
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt 
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Read the train and test splits of the loan-eligibility dataset.
csv_paths = {
    'train': '../input/loan-eligible-dataset/loan-train.csv',
    'test': '../input/loan-eligible-dataset/loan-test.csv',
}
train_df = pd.read_csv(csv_paths['train'])
test_df = pd.read_csv(csv_paths['test'])

In [None]:
# Preview the first three training rows.
train_df.head(3)

In [None]:
# Preview the first two test rows.
test_df.head(2)

In [None]:
# Keep untouched copies of both frames before any imputation or encoding.
train_original, test_original = train_df.copy(), test_df.copy()

In [None]:
# Number of missing values per training column, before imputation.
train_df.isnull().sum()

In [None]:
# Column dtypes of the training frame.
train_df.dtypes

# Filling Missing Values with Visualization and EDA

In [None]:
# Relative frequency of each gender, shown separately for train and test.
train_gender_share = train_df['Gender'].value_counts(normalize=True)
ax1 = train_gender_share.plot.bar(title='Train Dataset')
plt.show()

test_gender_share = test_df['Gender'].value_counts(normalize=True)
ax2 = test_gender_share.plot.bar(title='Test Dataset')
plt.show()

In [None]:
# Most applicants are male, so missing genders are imputed as "Male" in both frames.
for frame in (train_df, test_df):
    frame['Gender'] = frame['Gender'].fillna("Male")

In [None]:
# Count of applicants per marital status.
# The column is passed by keyword: seaborn >= 0.12 only accepts `data` positionally,
# so a positional Series is no longer interpreted as `x`.
sns.countplot(x='Married', data=train_df);

In [None]:
# Same approach as for Gender: impute missing marital status as "Yes" in both frames.
for frame in (train_df, test_df):
    frame['Married'] = frame['Married'].fillna("Yes")

In [None]:
# Count of applicants per number of dependents.
# Keyword form is required by seaborn >= 0.12 (only `data` may be positional).
sns.countplot(x='Dependents', data=train_df);

In [None]:
# Dependents is a string column at this point ('0', '1', '2', '3+'), so the fill
# value must be the string '0'. Filling with the integer 0 would create a separate
# mixed-type category until the later integer encoding is applied.
train_df['Dependents'] = train_df['Dependents'].fillna('0')
test_df['Dependents'] = test_df['Dependents'].fillna('0')

In [None]:
# Count of applicants per self-employment flag.
# Keyword form is required by seaborn >= 0.12 (only `data` may be positional).
sns.countplot(x='Self_Employed', data=train_df);

In [None]:
# Impute missing self-employment flags as 'No' in both frames.
for frame in (train_df, test_df):
    frame['Self_Employed'] = frame['Self_Employed'].fillna('No')

In [None]:
plt.figure(figsize=(12,8))
# Distribution of the training loan amounts. The series is passed as `x=` because
# seaborn >= 0.12 only accepts `data` positionally.
sns.boxplot(x=train_df['LoanAmount']);
# Impute missing training loan amounts with a fixed value of 150.
# NOTE(review): 150 is a hand-picked constant, presumably read off the boxplot — confirm.
train_df['LoanAmount']= train_df['LoanAmount'].fillna(150)

In [None]:
# Impute missing test loan amounts with the mean of the observed test loan amounts.
test_loan_mean = test_df['LoanAmount'].mean()
test_df['LoanAmount'] = test_df['LoanAmount'].fillna(test_loan_mean)

In [None]:
plt.figure(figsize=(12,8))
# Count of training loans per term.
# Keyword form is required by seaborn >= 0.12 (only `data` may be positional).
sns.countplot(x='Loan_Amount_Term', data=train_df);

In [None]:
# Impute missing training loan terms with 360.
train_df.loc[train_df['Loan_Amount_Term'].isna(), 'Loan_Amount_Term'] = 360

In [None]:
# Relative frequency of each loan term in the test set.
test_df['Loan_Amount_Term'].value_counts(normalize=True)

In [None]:
# Impute missing test loan terms with 360.
test_df.loc[test_df['Loan_Amount_Term'].isna(), 'Loan_Amount_Term'] = 360

In [None]:
# Count of training applicants per credit-history flag.
# Keyword form is required by seaborn >= 0.12 (only `data` may be positional).
sns.countplot(x='Credit_History', data=train_df);

In [None]:
# Impute missing training credit-history flags with 1.0.
train_df.loc[train_df['Credit_History'].isna(), 'Credit_History'] = 1.0

In [None]:
# Relative frequency of each credit-history flag in the test set.
test_df['Credit_History'].value_counts(normalize=True)

In [None]:
# Impute missing test credit-history flags with 1.0.
test_df.loc[test_df['Credit_History'].isna(), 'Credit_History'] = 1.0

In [None]:
# Re-check the per-column missing counts of the training frame after the fills above.
train_df.isnull().sum()

In [None]:
# Heatmap of the remaining missing cells in the test frame (rows x columns).
_, missing_ax = plt.subplots(figsize=(10, 6))
sns.heatmap(test_df.isnull(), ax=missing_ax);

In [None]:
# Loan-status counts broken down by credit history and by property area.
Credit_History = pd.crosstab(train_df['Credit_History'], train_df['Loan_Status'])
Property_Area = pd.crosstab(train_df['Property_Area'], train_df['Loan_Status'])

# Normalise each row to proportions and draw one grouped bar chart per table.
for status_counts in (Credit_History, Property_Area):
    row_totals = status_counts.sum(1).astype(float)
    status_counts.div(row_totals, axis=0).plot(kind='bar', stacked=False)
    plt.show()

In [None]:
# Loan-status counts broken down by each demographic attribute.
Married = pd.crosstab(train_df['Married'], train_df['Loan_Status'])
Dependents = pd.crosstab(train_df['Dependents'], train_df['Loan_Status'])
Education = pd.crosstab(train_df['Education'], train_df['Loan_Status'])
Self_Employed = pd.crosstab(train_df['Self_Employed'], train_df['Loan_Status'])

# Normalise each row to proportions and draw one grouped bar chart per table.
for status_counts in (Married, Dependents, Education, Self_Employed):
    row_totals = status_counts.sum(1).astype(float)
    status_counts.div(row_totals, axis=0).plot(kind='bar', stacked=False)
    plt.show()

# Converting Categorical Variables/Strings to Integer

In [None]:
# Number of distinct (non-missing) values per training column; used below to
# separate binary columns from multi-level categorical ones.
df_unique = train_df.nunique()
df_unique

In [None]:
# Columns that take exactly two distinct values.
has_two_levels = df_unique == 2
binary_variable = df_unique[has_two_levels].index.tolist()
binary_variable

In [None]:
# Categorical columns with more than two but at most six distinct values.
has_few_levels = df_unique.between(3, 6)
categorical_variables = df_unique[has_few_levels].index.tolist()
train_df[categorical_variables].nunique()

In [None]:
# Integer code assigned to every category of each training column (target included).
train_codes = {
    'Gender': {'Male': 1, 'Female': 0},
    'Married': {'Yes': 1, 'No': 0},
    'Dependents': {'0': 0, '1': 1, '2': 2, '3+': 3},
    'Education': {'Graduate': 1, 'Not Graduate': 0},
    'Self_Employed': {'Yes': 1, 'No': 0},
    'Property_Area': {'Urban': 0, 'Rural': 1, 'Semiurban': 2},
    'Loan_Status': {'Y': 1, 'N': 0},
}
for column, codes in train_codes.items():
    train_df[column] = train_df[column].replace(codes)

In [None]:
# Integer code assigned to every category of each test column (no target in the test set).
test_codes = {
    'Gender': {'Male': 1, 'Female': 0},
    'Married': {'Yes': 1, 'No': 0},
    'Dependents': {'0': 0, '1': 1, '2': 2, '3+': 3},
    'Education': {'Graduate': 1, 'Not Graduate': 0},
    'Self_Employed': {'Yes': 1, 'No': 0},
    'Property_Area': {'Urban': 0, 'Rural': 1, 'Semiurban': 2},
}
for column, codes in test_codes.items():
    test_df[column] = test_df[column].replace(codes)

# Building Models

In [None]:
# Target is the encoded loan status; features are every other column except the row id.
y = train_df['Loan_Status']
X = train_df.drop(columns=['Loan_Status', 'Loan_ID'])

In [None]:
from sklearn.preprocessing import StandardScaler
# Scaler instance. In the visible notebook it is only referenced by the
# commented-out scaling cell further down, so no scaling is applied by it.
scaler = StandardScaler()

In [None]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.ensemble import RandomForestClassifier

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the labelled rows for evaluation; the seed keeps the split fixed.
holdout_split = train_test_split(X, y, test_size=0.33, random_state=42)
X_train, X_test, y_train, y_test = holdout_split

In [None]:
# (rows, features) of the training split.
X_train.shape

In [None]:
#X_train = scaler.fit_transform(X_train)
#X_test = scaler.transform(X_test)

In [None]:
#Feature selection through sequential backward elimination (forward=False):
#the selector starts from all features and removes them until k_features=10 remain,
#scoring each candidate subset by 4-fold cross-validated accuracy of a random forest.
sfs = SFS(RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
          k_features=10, 
          forward=False,
          floating= False,
          verbose = 2,
          scoring='accuracy',
          cv=4,
          n_jobs=-1
         
         ).fit(X_train, y_train)

In [None]:
#These features give about 82% accuracy, but leaving Loan Amount out of a loan-eligibility model would not give correct predictions
sfs.k_feature_names_

# RandomForest 

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [None]:
# Hyper-parameter grid for the random forest search.
# Number of trees in the forest
n_estimators = [15, 20, 30, 50, 75, 100, 200, 300, 400]
# Number of features to consider at each split.
# 'auto' is not used: for classifiers it was only an alias of 'sqrt' and it has
# been removed from recent scikit-learn releases, where it makes every fit fail.
max_features = ['sqrt', 'log2']
# Maximum depth of each tree
max_depth = [1, 2, 4]
# Whether each tree is trained on a bootstrap sample or on the full training set
bootstrap = [True, False]
# Minimum number of samples required at a leaf node
min_samples_leaf = [1, 2, 3]
# Split quality criterion
criterion = ['gini', 'entropy']

param_grid = {'n_estimators': n_estimators,
              'max_features': max_features,
              'max_depth': max_depth,
              'min_samples_leaf': min_samples_leaf,
              'bootstrap': bootstrap,
              'criterion': criterion}

# No oob_score here: out-of-bag estimation requires bootstrap=True, so it would
# make every bootstrap=False candidate raise during fitting. warm_start is also
# dropped because GridSearchCV fits a fresh clone for every candidate.
# The seed makes the search reproducible.
RF = RandomForestClassifier(random_state=42, n_jobs=-1)

# Exhaustive search with 4-fold cross-validation; scoring=None uses the
# estimator's default score (accuracy for classifiers).
GRF = GridSearchCV(estimator=RF,
                   param_grid=param_grid,
                   scoring=None,
                   n_jobs=-1,
                   cv=4)
GRF.fit(X_train, y_train)
# Predictions of the refitted best model on the held-out split
predGRF = GRF.predict(X_test)

In [None]:
# Parameter combination selected by the random forest grid search.
GRF.best_params_

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Per-class precision/recall/F1 of the tuned random forest on the held-out split.
grf_report = classification_report(y_test, predGRF)
print(grf_report)

# Decision Tree and Feature Importance

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fully grown reference tree. Its depth and feature count bound the search grid
# below. It is seeded (same seed as the searched trees) so that the grid is
# identical from run to run.
DT = DecisionTreeClassifier(random_state=101).fit(X_train,y_train)

# Odd depths from 1 up to the depth of the reference tree, and every possible
# number of features considered per split.
params_grid = {'max_depth':range(1, DT.tree_.max_depth+1, 2),
               'max_features':range(1,len(DT.feature_importances_)+1)
              }

# Cross-validated accuracy search over the grid (GridSearchCV's default folds).
DT_GV = GridSearchCV(DecisionTreeClassifier(random_state=101),
                   param_grid=params_grid,
                   scoring='accuracy',
                   n_jobs=-1)

DT_GV = DT_GV.fit(X_train,y_train)
# Predictions of the refitted best tree on the held-out split
y_predDT = DT_GV.predict(X_test)
DT_GV.best_estimator_

In [None]:
# Importances of the reference tree `DT`, labelled by feature name and sorted descending.
raw_importances = pd.Series(DT.feature_importances_, index=list(X.columns))
Feature_importance = raw_importances.sort_values(ascending=False)

bar = Feature_importance.plot.bar(figsize=(18,10))
bar.set(xlabel='Features')
bar.set(ylabel='Relative_Importance')

#  Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# The features live on very different scales (income amounts next to 0/1 flags).
# Fitting the default solver on raw values tends to stop at the iteration limit
# and makes the penalty weigh features unevenly, and the resulting warnings are
# hidden by the notebook's global warning filter. Standardising inside a
# pipeline fixes that and learns the scaling from the training split only.
L2 = make_pipeline(StandardScaler(), LogisticRegression())
L2 = L2.fit(X_train,y_train)

# Predictions on the held-out split
y_predl2 = L2.predict(X_test)

print(classification_report(y_test,y_predl2))

# Adaboost

In [None]:
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Search space: shrinkage applied to each weak learner and number of boosting rounds.
parameters = dict(
    learning_rate=[2, 1, 0.5, 0.2, 0.1, 0.01, 0.001],
    n_estimators=[15, 20, 30, 40, 80, 100, 200, 400, 500],
)

# Cross-validated accuracy search; fit() returns the search object itself.
ada_search = GridSearchCV(AdaBoostClassifier(), param_grid=parameters, scoring='accuracy', n_jobs=-1)
AB = ada_search.fit(X_train, y_train)
y_predAB = AB.predict(X_test)

AB.best_params_

# 

# Voting Classifier/Stacking 

In [None]:
from sklearn.ensemble import VotingClassifier

# Vote with the tuned models themselves. VotingClassifier clones and refits every
# estimator it is given, so passing the GridSearchCV wrappers would re-run each
# complete grid search inside VC.fit. `best_estimator_` carries the selected
# hyper-parameters and is refitted once.
classifiers = [('GRF', GRF.best_estimator_),
               ('AB', AB.best_estimator_),
               ('L2', L2),
               ('DT_GV', DT_GV.best_estimator_)]

# Soft voting averages the predicted class probabilities of the members.
VC = VotingClassifier(classifiers,voting='soft')

VC = VC.fit(X_train,y_train)
y_predvc = VC.predict(X_test)

print(classification_report(y_test,y_predvc))
# Class probabilities on the held-out split, used for the ROC / PR curves
y_prob = VC.predict_proba(X_test)

In [None]:
from sklearn.metrics import roc_curve, precision_recall_curve, confusion_matrix

sns.set_context()
# Rows are the true class and columns the predicted class. The order is pinned
# explicitly: index 0 is class 0 (encoded from 'N'), index 1 is class 1 ('Y').
cm = confusion_matrix(y_test, y_predvc, labels=[0, 1])
_,ax = plt.subplots(figsize=(18,8))
ax = sns.heatmap(cm, annot=True, fmt='d', cmap='coolwarm', annot_kws={"size":40, "weight":"bold"})

# Tick labels must follow the matrix order (0 -> "No", 1 -> "Yes"); listing
# "Yes" first would swap the meaning of every cell.
labels = ['No', 'Yes']
ax.set_xticklabels(labels, fontsize=20);
ax.set_yticklabels(labels, fontsize=20);
ax.set_ylabel("Ground Truth", fontsize=20);
ax.set_xlabel("prediction", fontsize=20)
# Explicit y-limits covering both rows, with the first class at the top.
plt.ylim(2,0)

In [None]:
sns.set_context("talk")

# Two side-by-side panels: ROC curve on the left, precision-recall on the right.
fig, axList = plt.subplots(ncols=2, figsize=(16, 8))
roc_ax, pr_ax = axList

# Voting-classifier probability of the positive class (column 1).
positive_scores = y_prob[:, 1]
unit_limits = [-.01, 1.01]

# ROC curve with the chance diagonal for reference
fpr, tpr, thresholds = roc_curve(y_test, positive_scores)
roc_ax.plot(fpr, tpr, color="green", linewidth=5)
roc_ax.plot([0, 1], [0, 1], ls='--', color="black", lw=3)
roc_ax.set(xlabel='False Positive Rate',
           ylabel="True Positive Rate",
           xlim=unit_limits, ylim=unit_limits,
           title="ROC Curve")
roc_ax.grid(True)

# Precision-recall curve
precision, recall, _ = precision_recall_curve(y_test, positive_scores)
pr_ax.plot(recall, precision, color='blue', lw=5)
pr_ax.set(xlabel='Recall',
          ylabel="Precision",
          xlim=unit_limits, ylim=unit_limits,
          title="Precision Recall Curve")
pr_ax.grid(True)