In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score,f1_score
from sklearn.model_selection import StratifiedKFold
from statistics import mean, stdev



def preprocessData(data,name):
    """Clean a loan DataFrame and add the derived income features.

    Steps, in order:
      1. Drop the ``Loan_ID`` column.
      2. Fill missing values in the categorical/discrete columns with each
         column's own mode, and ``LoanAmount`` with its median.
      3. Add ``Total_Income`` (applicant + co-applicant income) and
         ``Income_Loan_Amount_Ratio`` (``Total_Income / LoanAmount``).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``Loan_ID`` and every column referenced below.
    name : str
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is left untouched because
        ``drop`` returns a copy and all later writes go to that copy.

    Notes
    -----
    The fill statistics are computed from ``data`` itself, so calling this
    separately on train and test frames imputes each with its own values.
    """
    # Columns whose missing entries are replaced by the most frequent value.
    mode_imputed_columns = [
        "Gender",
        "Married",
        "Dependents",
        "Self_Employed",
        "Credit_History",
        "Loan_Amount_Term",
    ]

    # dropping unnecessary columns (returns a new frame)
    data = data.drop(columns=['Loan_ID'])

    # Assign the filled column back instead of `data[col].fillna(..., inplace=True)`:
    # the chained in-place form operates on a temporary selection, which pandas
    # warns about and which has no effect once Copy-on-Write is enabled.
    for column in mode_imputed_columns:
        data[column] = data[column].fillna(data[column].mode()[0])
    data["LoanAmount"] = data["LoanAmount"].fillna(data["LoanAmount"].median())

    # New feature derivation
    data["Total_Income"] = data["ApplicantIncome"] + data["CoapplicantIncome"]
    data["Income_Loan_Amount_Ratio"] = data["Total_Income"] / data["LoanAmount"]
    return data

def selectFeatures(df,features):
    """Return the part of ``df`` addressed by ``features``.

    ``features`` is passed straight to ``df[...]``; with a list of column
    labels this yields a DataFrame holding those columns in the given order.
    """
    selected = df[features]
    return selected

def encodeFeatures(data):
    """One-hot encode ``data`` via ``pd.get_dummies``.

    ``drop_first=True`` drops the first level of every encoded column, so a
    column with k categories produces k-1 indicator columns.
    """
    encoded = pd.get_dummies(data, drop_first=True)
    return encoded

def splitTrainData(df_name, target='Loan_Status_Y', test_size=0.2, random_state=42):
    """Split an encoded frame into stratified train/test features and target.

    Parameters
    ----------
    df_name : pandas.DataFrame
        Encoded data containing the ``target`` column.
    target : str, default 'Loan_Status_Y'
        Name of the target column to separate from the features.
    test_size : float, default 0.2
        Passed to ``train_test_split`` as ``test_size``.
    random_state : int, default 42
        Passed to ``train_test_split`` as ``random_state``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``. The ``y`` parts are
        single-column DataFrames (selected with ``df_name[[target]]``),
        matching what existing callers receive.
    """
    # Split features and target variable
    X = df_name.drop(columns=target)
    y = df_name[[target]]

    # Stratify on the target so both parts keep its class proportions
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )
    return X_train, X_test, y_train, y_test

def runModel(model,X_train,X_test,y_train,y_test):
    """Fit ``model`` on the training data and report hold-out accuracy.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, y_train
        Data the model is fitted on.
    X_test, y_test
        Data the fitted model is evaluated on.

    Returns
    -------
    float
        Accuracy on the test data as a percentage (0-100). The same value is
        printed, prefixed by the model's string representation.
    """
    model.fit(X_train,y_train)
    prediction_result = model.predict(X_test)
    # accuracy_score is documented as (y_true, y_pred); pass them in that order.
    score = accuracy_score(y_test, prediction_result)*100
    print(str(model)+" Accuracy: ", score)
    # Return the score so callers can collect and compare results.
    return score

# Load the raw train and test CSV files
train = pd.read_csv("../input/loan-prediction-data/train.csv")
test = pd.read_csv("../input/loan-prediction-data/test.csv")

# Preprocess both frames with the same routine
train_preprocessed = preprocessData(train, "train")
test_preprocessed = preprocessData(test, "test")

# Predictor columns used by the models
features = [
    'Gender',
    'Married',
    'Dependents',
    'Education',
    'Self_Employed',
    'ApplicantIncome',
    'CoapplicantIncome',
    'LoanAmount',
    'Loan_Amount_Term',
    'Credit_History',
    'Property_Area',
    'Income_Loan_Amount_Ratio',
    'Total_Income',
]

# Columns kept for training: the target first, followed by every predictor
target_variable = ['Loan_Status', *features]
train_selected = selectFeatures(train_preprocessed, target_variable)
test_selected = selectFeatures(test_preprocessed, features)

# Encode the selected columns of each frame
train_encoded = encodeFeatures(train_selected)
test_encoded = encodeFeatures(test_selected)


# Running models without k-fold cross validation

In [None]:
# A. Evaluate every model on a single hold-out split (no k-fold validation)

# Split the encoded training data into train and test parts
X_train, X_test, y_train, y_test = splitTrainData(train_encoded)

# Candidate classifiers
LR_model = LogisticRegression(random_state=1)
DT_model = DecisionTreeClassifier(random_state=1)
RF_model = RandomForestClassifier(
    random_state=1,
    max_depth=10,
    n_estimators=50,
)
knn_model = KNeighborsClassifier(n_neighbors=3)
NB_model = GaussianNB()
model_list = [LR_model, DT_model, RF_model, knn_model, NB_model]

# Fit each candidate and print its accuracy on the test part
for model in model_list:
    runModel(model, X_train, X_test, y_train, y_test)

# Running models with k-fold cross validation

In [None]:
#B. Run models with stratified k-fold validation
# Create StratifiedKFold object.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Separate the predictors from the target variable
X = train_encoded.drop(columns='Loan_Status_Y')
y = train_encoded[['Loan_Status_Y']]

LR_model = LogisticRegression(random_state=1)
DT_model = DecisionTreeClassifier(random_state=1)
RF_model = RandomForestClassifier(random_state=1,max_depth=10,n_estimators=50)
knn_model = KNeighborsClassifier(n_neighbors=3)
NB_model = GaussianNB()
model_list = [LR_model,DT_model,RF_model,knn_model,NB_model]

for model in model_list:
    # Start a fresh list for every model. With a single shared list the mean
    # printed for each later model would also include the fold scores of all
    # models evaluated before it.
    lst_accu_stratified = []
    for train_index, test_index in skf.split(X, y):
        # split() yields positional indices, so select rows with iloc;
        # loc would only be equivalent for a default 0..n-1 index.
        X_train_fold, X_test_fold = X.iloc[train_index], X.iloc[test_index]
        y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]
        model.fit(X_train_fold, y_train_fold)
        prediction_result = model.predict(X_test_fold)
        # accuracy_score is documented as (y_true, y_pred)
        lst_accu_stratified.append(accuracy_score(y_test_fold, prediction_result)*100)
    print(str(model)+" Overall Accuracy:", mean(lst_accu_stratified), '%')