In [None]:
import pandas as pd 
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline 

import xgboost as xgb

from sklearn.model_selection import KFold

from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings('ignore')

# Use fully-qualified option names. The short patterns 'max_rows' and
# 'max_columns' are matched as regular expressions against every registered
# option; on pandas versions that also define 'styler.render.max_rows' /
# 'styler.render.max_columns' they are ambiguous and set_option raises
# OptionError ("Pattern matched multiple keys").
pd.set_option('display.max_colwidth', 1000)
pd.set_option('display.max_rows', 40)
pd.set_option('display.max_columns', 1000)

import seaborn as sns 
sns.set_style('darkgrid')

In [None]:
# Read the labelled training set (`data`) and the unlabelled test set (`test`).
data, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/loan-prediction-analytics-vidhya/train_ctrUa4K.csv",
        "../input/loan-prediction-analytics-vidhya/test_lAUu6dG.csv",
    )
)

# Target

In [None]:
# Class balance of the target column.
sns.countplot(data=data, x="Loan_Status")

**Numeric Columns**<br>
**Categorical Columns**<br>

In [None]:
# Split the columns by dtype: everything that is not `object` is treated as
# numeric, the `object` columns (minus the Loan_ID identifier) as categorical.
num_cols = data.select_dtypes(exclude='object').copy()
cat_cols = data.select_dtypes(include='object').copy().drop(columns='Loan_ID')

In [None]:
# Preview the first five rows of the numeric columns.
num_cols.head(n=5)

In [None]:
# Preview the first five rows of the categorical columns.
cat_cols.head(n=5)

# Univariate Analysis

In [None]:
# One 30-bin histogram per numeric column, laid out on a 3x2 grid.
fig = plt.figure(figsize=(12, 16))
for index, col in enumerate(num_cols.columns):
    ax = fig.add_subplot(3, 2, index + 1)
    # Missing values are removed per column so each histogram uses all known values.
    sns.distplot(num_cols[col].dropna(), kde=False, bins=30, ax=ax)
fig.tight_layout(pad=1.0)

In [None]:
# Frequency of each distinct loan term.
data.loc[:, 'Loan_Amount_Term'].value_counts()

## Outliers

In [None]:
# One box plot per numeric column on a 3x2 grid, using only rows in which
# every numeric column is present.
fig = plt.figure(figsize=(12, 16))
complete_rows = num_cols.dropna()
for index, col in enumerate(num_cols.columns):
    ax = fig.add_subplot(3, 2, index + 1)
    sns.boxplot(data=complete_rows, y=col, ax=ax)
fig.tight_layout(pad=1.0)

## Categorical Columns

In [None]:
# Count plot of every categorical column on a 2x4 grid.
fig = plt.figure(figsize=(18, 20))
for index, name in enumerate(cat_cols.columns):
    plt.subplot(2, 4, index + 1)
    sns.countplot(x=cat_cols[name], data=cat_cols.dropna())
fig.tight_layout(pad=1.0)

## Missing Values

In [None]:
# Visualise where the null values sit: one heatmap cell per (row, column),
# coloured by whether the value is missing.
plt.figure(figsize=(10, 8))
missing_mask = data.isnull()
sns.heatmap(missing_mask, cmap='plasma')

def percent_missing(d):
    """Return the ten columns of ``d`` with the largest share of missing values.

    Parameters
    ----------
    d : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        Indexed by column name, with a single ``'Missing_%'`` column holding
        the percentage of null entries, sorted in descending order and
        truncated to the first ten rows.
    """
    missing_share = 100 * (d.isnull().sum() / len(d))
    ranked = missing_share.to_frame('Missing_%').sort_values('Missing_%', ascending=False)
    return ranked.head(10)

In [None]:
# Columns of the training data with the highest share of missing values.
percent_missing(d=data)

In [None]:
# Encode the target as 1 (Y) / 0 (N) so it can be used in the correlation
# analysis below. Note: values other than 'Y'/'N' become NaN, so this cell
# must not be run twice on the same frame.
mapp = dict(Y=1, N=0)
data["Loan_Status"] = data["Loan_Status"].map(mapp)

## Correlations

In [None]:
# Pairwise correlations of the numeric columns. The numeric columns are
# selected explicitly: at this point the frame still holds string columns
# (Loan_ID, Gender, ...), and DataFrame.corr() on recent pandas raises on them
# instead of silently ignoring them.
corr = data.select_dtypes(include='number').corr()
sns.heatmap(data=corr.dropna(), cmap='Blues', linewidth=0.5)

In [None]:
# Loan outcome counts, split by credit history.
sns.countplot(data=data, x='Loan_Status', hue='Credit_History')

In [None]:
# Correlation of every numeric column with the encoded target.
corr.loc[:, "Loan_Status"]

In [None]:
# Count plot of every categorical column on a 2x4 grid, with the bars split by
# the encoded loan status.
fig = plt.figure(figsize=(18, 20))
for index, name in enumerate(cat_cols.columns):
    plt.subplot(2, 4, index + 1)
    sns.countplot(x=cat_cols[name], data=cat_cols.dropna(), hue=data['Loan_Status'])
fig.tight_layout(pad=1.0)

# Data Processing 

**Outliers**

In [None]:
# Drop extreme applicants: income of 50000 or more, or a loan amount of 500 or
# more. Rows whose LoanAmount is missing are kept on purpose: a plain
# `LoanAmount < 500` comparison is False for NaN and would silently discard
# them, leaving nothing for the LoanAmount imputation step further down.
data = data[
    (data['ApplicantIncome'] < 50000)
    & (data['LoanAmount'].isna() | (data['LoanAmount'] < 500))
]

**Mapping**

In [None]:
# Integer encodings for the categorical features.
mapp_gender = {'Male': 0, 'Female': 1}
mapp_married = {'No': 0, 'Yes': 1}
# Dependents is collapsed to a binary flag: none vs. at least one.
mapp_dep = {'0': 0, '1': 1, '2': 1, '3+': 1}
mapp_edu = {'Not Graduate': 0, 'Graduate': 1}
mapp_se = {'No': 0, 'Yes': 1}
mapp_pa = {'Semiurban': 1, 'Urban': 2, 'Rural': 3}

# Loan term is collapsed to a binary flag: 360 -> 1, every other listed term
# -> 0. Terms that are not listed here become NaN.
mapp_lt = dict.fromkeys([180.0, 480.0, 300.0, 84.0, 240.0, 120.0, 36.0, 60.0, 12.0], 0)
mapp_lt[360.0] = 1

column_encodings = {
    'Gender': mapp_gender,
    'Married': mapp_married,
    'Dependents': mapp_dep,
    'Education': mapp_edu,
    'Self_Employed': mapp_se,
    'Property_Area': mapp_pa,
    'Loan_Amount_Term': mapp_lt,
}

# Apply the same encodings to the training and the test frame.
for frame in (data, test):
    for column, encoding in column_encodings.items():
        frame[column] = frame[column].map(encoding)

In [None]:
# Correlations between the numeric columns only; the string Loan_ID column is
# still in the frame and DataFrame.corr() on recent pandas raises on it.
corr = data.select_dtypes(include='number').corr()
# `mask=True` hides a cell, so only strongly correlated pairs stay visible.
# The absolute value is used so strong negative correlations are shown too,
# and the mask is built from the same frame that is plotted so their shapes
# always agree.
sns.heatmap(data=corr, mask=corr.abs() < 0.8, cmap='Blues', linewidth=0.5)

No strong collinearity: no pair of distinct features has a correlation above 0.8.

# Filling Missing Values

In [None]:
# Mean LoanAmount for each distinct ApplicantIncome value in the test set.
# The column is selected before aggregating: averaging the whole frame would
# also try to average the string Loan_ID column, which raises a TypeError on
# recent pandas.
loan_amt_avg = test.groupby('ApplicantIncome')['LoanAmount'].mean()


def fill_loan_amt(LoanAmount, ApplicantIncome):
    """Return ``LoanAmount``, or a group mean when it is missing.

    Parameters
    ----------
    LoanAmount : float
        Loan amount of one row; may be NaN.
    ApplicantIncome : number
        Applicant income of the same row, used as the lookup key.

    Returns
    -------
    float
        ``LoanAmount`` unchanged when it is not NaN; otherwise the entry of
        the module-level ``loan_amt_avg`` series for ``ApplicantIncome``
        (which is itself NaN when no row with that income has a known amount).
    """
    if np.isnan(LoanAmount):
        return loan_amt_avg[ApplicantIncome]
    return LoanAmount


test['LoanAmount'] = test.apply(lambda row: fill_loan_amt(row['LoanAmount'], row['ApplicantIncome']), axis=1)
# Rows that are still NaN get a constant fallback
# (presumably the overall mean loan amount of the test set -- TODO confirm).
test['LoanAmount'] = test['LoanAmount'].fillna(136.9586777)

In [None]:
# Mean LoanAmount for each distinct ApplicantIncome value in the training set.
# The column is selected before aggregating: averaging the whole frame would
# also try to average the string Loan_ID column, which raises a TypeError on
# recent pandas.
loan_amt_avg = data.groupby('ApplicantIncome')['LoanAmount'].mean()


def fill_loan_amt(LoanAmount, ApplicantIncome):
    """Return ``LoanAmount``, or a group mean when it is missing.

    Parameters
    ----------
    LoanAmount : float
        Loan amount of one row; may be NaN.
    ApplicantIncome : number
        Applicant income of the same row, used as the lookup key.

    Returns
    -------
    float
        ``LoanAmount`` unchanged when it is not NaN; otherwise the entry of
        the module-level ``loan_amt_avg`` series for ``ApplicantIncome``
        (which is itself NaN when no row with that income has a known amount).
    """
    if np.isnan(LoanAmount):
        return loan_amt_avg[ApplicantIncome]
    return LoanAmount


data['LoanAmount'] = data.apply(lambda row: fill_loan_amt(row['LoanAmount'], row['ApplicantIncome']), axis=1)
# Rows that are still NaN get a constant fallback
# (presumably the overall mean loan amount of the training set -- TODO confirm).
data['LoanAmount'] = data['LoanAmount'].fillna(146.4121622)

In [None]:
# Missing Dependents are filled with 1 in both frames.
for frame in (test, data):
    frame['Dependents'] = frame['Dependents'].fillna(1)

In [None]:
def fill_gender(Gender, Married):
    """Return ``Gender``, imputing it from ``Married`` when it is missing.

    Parameters
    ----------
    Gender : float
        Encoded gender of one row; may be NaN.
    Married : float
        Encoded marital status of the same row.

    Returns
    -------
    float or int
        ``Gender`` unchanged when it is not NaN; otherwise 1 when ``Married``
        equals 1 and 0 in every other case (including a missing ``Married``).
    """
    if not np.isnan(Gender):
        return Gender
    return 1 if Married == 1 else 0
# Impute the missing Gender values row by row in both frames.
for frame in (test, data):
    frame['Gender'] = frame.apply(lambda row: fill_gender(row['Gender'], row['Married']), axis=1)

In [None]:
def fill_self_emp(Self_Employed, ApplicantIncome):
    """Return ``Self_Employed``, imputing it from the income when it is missing.

    Parameters
    ----------
    Self_Employed : float
        Encoded self-employment flag of one row; may be NaN.
    ApplicantIncome : number
        Applicant income of the same row.

    Returns
    -------
    float or int
        ``Self_Employed`` unchanged when it is not NaN; otherwise 1 when
        ``ApplicantIncome`` is at least 7380 and 0 otherwise.
    """
    if not np.isnan(Self_Employed):
        return Self_Employed
    return 1 if ApplicantIncome >= 7380 else 0
# Impute the missing Self_Employed values row by row in both frames.
for frame in (test, data):
    frame['Self_Employed'] = frame.apply(
        lambda row: fill_self_emp(row['Self_Employed'], row['ApplicantIncome']), axis=1
    )

In [None]:
# Missing Loan_Amount_Term values are filled with 0 in both frames.
for frame in (test, data):
    frame['Loan_Amount_Term'] = frame['Loan_Amount_Term'].fillna(0)

In [None]:
# Test rows cannot be dropped, so a missing Credit_History is replaced by the
# -999 sentinel there; training rows that still contain any NaN are removed.
test['Credit_History'] = test['Credit_History'].fillna(value=-999)
data = data.dropna(axis=0, how='any')

In [None]:
def fill_CoapplicantIncome(CoapplicantIncome):
    """Turn a co-applicant income into a binary flag.

    Parameters
    ----------
    CoapplicantIncome : number
        Co-applicant income of one row.

    Returns
    -------
    int
        1 when ``CoapplicantIncome`` equals 0, otherwise 0.
    """
    return 1 if CoapplicantIncome == 0 else 0
# Replace the co-applicant income by the binary flag in both frames.
# Running this cell a second time would flip the flag, so run it only once.
for frame in (data, test):
    frame['CoapplicantIncome'] = frame.apply(
        lambda row: fill_CoapplicantIncome(row['CoapplicantIncome']), axis=1
    )

In [None]:
# Turn the target back into its original 'Y' / 'N' labels.
mapp = {1: 'Y', 0: 'N'}
data["Loan_Status"] = data["Loan_Status"].map(mapp)

# Target vector and feature matrix (identifier and target removed).
y_train = data['Loan_Status']
X_train = data.drop(columns=['Loan_ID', 'Loan_Status'])

# HyperParameter Tuning

In [None]:
import optuna

def objective(trial):
    """Optuna objective: mean 3-fold cross-validated accuracy of an XGBClassifier.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.

    Returns
    -------
    float
        Average accuracy over the three validation folds, computed on the
        module-level ``X_train`` / ``y_train``.
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 350, 1000),
        'max_depth': trial.suggest_int('max_depth', 6, 13),
        'learning_rate': trial.suggest_uniform('learning_rate', 0.009, 0.10),
        'subsample': trial.suggest_uniform('subsample', 0.50, 1),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.50, 1),
        # gamma is sampled as a float: an integer sampler over [0, 0.05]
        # can only ever return 0, so gamma would never be tuned.
        'gamma': trial.suggest_uniform('gamma', 0, 0.05),
        # -999 is the sentinel used for missing Credit_History values.
        'missing': -999,
    }

    clf = xgb.XGBClassifier(**params)
    X_train_k = X_train.values
    y_train_k = y_train.values
    kf = KFold(n_splits=3, random_state=2000, shuffle=True)

    accuracies = []
    for train_idx, valid_idx in kf.split(X_train_k, y_train_k):
        clf.fit(X_train_k[train_idx, :], y_train_k[train_idx])
        pred = clf.predict(X_train_k[valid_idx, :])
        accuracies.append(accuracy_score(y_train_k[valid_idx], pred))
    print(f'Trial done: Accuracy values on folds: {accuracies}')
    return np.average(accuracies)

In [None]:
n_trials = 100

# Set to False to skip the (slow) hyper-parameter search. The cells below read
# `study`, so it must already exist in that case.
FIT_XGB = True

if FIT_XGB:
    # Maximise the cross-validated accuracy returned by `objective`.
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=n_trials)

    trial = study.best_trial
    report = [
        "Number of finished trials: {}".format(len(study.trials)),
        "Best trial:",
        "  Value: {}".format(trial.value),
        "  Params: ",
    ]
    report.extend("    {}: {}".format(key, value) for key, value in trial.params.items())
    print("\n".join(report))

In [None]:
# Parameters of the best trial plus two fixed settings: a class weight for
# the positive class and the -999 missing-value sentinel.
best_param = dict(study.best_params, scale_pos_weight=0.48, missing=-999)

In [None]:
import plotly

In [None]:
# Objective value against each sampled hyper-parameter.
optuna.visualization.plot_slice(study=study)

In [None]:
# Objective value over the course of the trials.
optuna.visualization.plot_optimization_history(study=study)

In [None]:
# Train the final classifier on the full training set with the tuned parameters.
model = xgb.XGBClassifier(**best_param)
model.fit(X=X_train, y=y_train)

In [None]:
# Predict on the test features (everything except the Loan_ID identifier).
test_features = test.drop(columns='Loan_ID')
predictions_final = model.predict(test_features)

In [None]:
# Display the labels predicted for the test set.
predictions_final