This notebook is for the Loan Prediction practice problem organized by Analytics Vidhya. The competition link is:
https://datahack.analyticsvidhya.com/contest/practice-problem-loan-prediction-iii/#ProblemStatement

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

We have 3 files. Let's get them using pandas!

In [None]:
# Load the training and test CSV files into DataFrames.
train_path = '/kaggle/input/loan-prediction-practice-av-competition/train_csv.csv'
test_path = '/kaggle/input/loan-prediction-practice-av-competition/test.csv.csv'
train_df, test_df = (pd.read_csv(path) for path in (train_path, test_path))

Let's check their shapes.

In [None]:
# Report (rows, columns) for the training and test frames, in that order.
for frame in (train_df, test_df):
    print(frame.shape)

We can see that there are 13 features, with 614 training examples and 614 test examples. Let's analyze them.

<h1>Data Preprocessing and EDA</h1>

In [None]:
# Show the first five training rows transposed, so each column reads as a row.
train_df.head(5).transpose()

In [None]:
# Summary statistics of the numeric columns, one row per column.
train_df.describe().transpose()

In [None]:
# Print column dtypes and non-null counts for the training set.
train_df.info()

For non-numerical features,


In [None]:
# Summary (count / unique / top / freq) of the object-typed columns only.
train_df.describe(include=[object])

Let's understand what we got from here one by one. <br>
**Features** Gender(1),Married(2), Dependents(3), Education(4), Self_Employed(5), Property_Area(11), and Loan_Status(12) are object data type. <br>
**Features** Gender(1), Married(2), Dependents(3), Self_Employed(5), LoanAmount(8), Loan_Amount_Term(9), Credit_History(10) have some of their data missing.<br>
We can also see that some of the features may be categorical.

Let's check the number of features containing NaN and number of unique values in each feature.

In [None]:
# Count the missing values in each training column.
train_df.isna().sum()

In [None]:
# Count the missing values in each test column.
test_df.isna().sum()

We can see that data is missing not only in the training set but also in the test set.

In [None]:
# Number of distinct non-null values in each training column.
train_df.nunique(axis=0, dropna=True)

In [None]:
# Number of distinct non-null values in each test column.
test_df.nunique(axis=0, dropna=True)

In [None]:
# Frequency of each Property_Area category in the training set.
train_df.loc[:, 'Property_Area'].value_counts()

<h2>Categorical Data</h2>

We can see that the  following features are categorical:
* Gender
* Married
* Dependents
* Education
* Self_Employed
* Credit_History
* Property_Area
<br>and, <br>
* Loan_Status

The main thing we can understand, even from the data description, is that our target variable (Loan_Status) has 2 possible values. Thus, it is a binary classification problem.

From the data description, as well as from inspecting some of the data, we can see which categories each feature is divided into. Let's first map the target variable, which is either 'Y' (yes) or 'N' (no), to 1 or 0.

In [None]:
# Encode the target labels numerically: "Y" becomes 1 and "N" becomes 0.
# Only the training frame carries Loan_Status, so it is mapped directly.
target_map = {"Y": 1, "N": 0}
train_df['Loan_Status'] = train_df['Loan_Status'].map(target_map)

We need to map the rest of the categorical variables but this time to both training and test dataset. But before 
doing it, we should fill the missing values with the mode of the corresponding features.

In [None]:
# Fill missing categorical values with each column's most frequent value (mode),
# computed separately for the training and the test frame.
# The filled column is assigned back to the frame: calling
# fillna(inplace=True) on a column selection is a chained in-place operation
# that does not update the parent DataFrame under pandas Copy-on-Write.
cat_cols = ['Gender', 'Married', 'Dependents', 'Self_Employed', 'Credit_History']
for col in cat_cols:
    train_df[col] = train_df[col].fillna(train_df[col].mode()[0])
    test_df[col] = test_df[col].fillna(test_df[col].mode()[0])

Let's check if there are any missing values left for categorical features

Now, all that is left is to convert the object categorical data to numeric form

In [None]:
# Keep the encoded target aside and remove it from the feature frame.
# `columns=` is used because passing the axis positionally (drop(label, 1))
# is no longer accepted by pandas 2.x.
target = train_df['Loan_Status']
train_df = train_df.drop(columns='Loan_Status')

In [None]:
# Encode the string-valued categorical columns of the training set as integers.
gender_map = {"Male": 1, "Female": 0}
marry_map = {"Yes": 1, "No": 0}  # generic Yes/No map, also applied to Self_Employed
education_map = {"Graduate": 1, "Not Graduate": 0}
property_map = {"Semiurban": 2, "Urban": 1, "Rural": 0}

column_maps = {
    'Gender': gender_map,
    'Married': marry_map,
    'Self_Employed': marry_map,
    'Education': education_map,
    'Property_Area': property_map,
}
for col, mapping in column_maps.items():
    train_df[col] = train_df[col].map(mapping)

# Dependents holds numeric strings except for '3+'. Replace that exact label
# with '3' (a literal match, not the regex `3+`, which means "one or more 3s"),
# then convert the column to float.
train_df['Dependents'] = train_df['Dependents'].replace('3+', '3').astype('float64')
train_df.info()

In [None]:
# Encode the string-valued categorical columns of the test set as integers,
# using the same label-to-number maps as for the training set.
gender_map = {"Male": 1, "Female": 0}
marry_map = {"Yes": 1, "No": 0}  # generic Yes/No map, also applied to Self_Employed
education_map = {"Graduate": 1, "Not Graduate": 0}
property_map = {"Semiurban": 2, "Urban": 1, "Rural": 0}

column_maps = {
    'Gender': gender_map,
    'Married': marry_map,
    'Self_Employed': marry_map,
    'Education': education_map,
    'Property_Area': property_map,
}
for col, mapping in column_maps.items():
    test_df[col] = test_df[col].map(mapping)

# Dependents holds numeric strings except for '3+'. Replace that exact label
# with '3' (a literal match, not the regex `3+`, which means "one or more 3s"),
# then convert the column to float.
test_df['Dependents'] = test_df['Dependents'].replace('3+', '3').astype('float64')
test_df.info()

In [None]:
# Missing values remaining in the test set after categorical encoding.
test_df.isna().sum()

Now, we have worked with our categorical data for both training and our test datasets

Let's check if there are any missing data that is left

In [None]:
# Missing values remaining in the training set after categorical encoding.
train_df.isna().sum()

Only LoanAmount and Loan_Amount_Term are left; let's look at them.

In [None]:
# Frequency of each non-null Loan_Amount_Term value in the training set.
train_df['Loan_Amount_Term'].value_counts(dropna=True)

We can see that Loan_Amount_Term has majority of data containing value 360, so it would be better if we replace the missing values with 360(most frequent value).

In [None]:
# Frequency of each non-null LoanAmount value in the training set.
train_df['LoanAmount'].value_counts(dropna=True)

But we cannot say the same for the LoanAmount data, so it's better to replace its missing values with its median.

In [None]:
# Fill missing loan terms with 360 and missing loan amounts with the column median.
# The filled columns are assigned back to the frame: fillna(inplace=True) on a
# column selection is a chained in-place operation that does not update the
# parent DataFrame under pandas Copy-on-Write.
train_df['Loan_Amount_Term'] = train_df['Loan_Amount_Term'].fillna(360)
train_df['LoanAmount'] = train_df['LoanAmount'].fillna(train_df['LoanAmount'].median())
train_df.isnull().sum()

In [None]:
# Re-check dtypes and non-null counts of the training set after imputation.
train_df.info()

We can see that training set has now no missing values and that all features are numeric type( except Loan_ID, which I think, for now, provides less information)<br>
Let's complete our test data too.

In [None]:
# Missing values still present in the test set.
test_df.isna().sum()

In [None]:
# Frequency of each non-null Loan_Amount_Term value in the test set.
test_df['Loan_Amount_Term'].value_counts(dropna=True)

In [None]:
# Frequency of each non-null LoanAmount value in the test set.
test_df['LoanAmount'].value_counts(dropna=True)

Well, like training set, we need to do the same in case of test set.

In [None]:
# Fill missing loan terms with 360 and missing loan amounts with the column median.
# The filled columns are assigned back to the frame: fillna(inplace=True) on a
# column selection is a chained in-place operation that does not update the
# parent DataFrame under pandas Copy-on-Write.
test_df['Loan_Amount_Term'] = test_df['Loan_Amount_Term'].fillna(360)
test_df['LoanAmount'] = test_df['LoanAmount'].fillna(test_df['LoanAmount'].median())
test_df.isnull().sum()

In [None]:
# Re-check dtypes and non-null counts of the test set after imputation.
test_df.info()

Now, our test set is ready

Let's see a correlation heatmap to visualize how are features correlated.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inLine

In [None]:
# Correlation heatmap of the numeric features.
# train_df still contains the string Loan_ID column at this point, and
# DataFrame.corr() raises on non-numeric columns in pandas >= 2.0, so the
# frame is restricted to numeric columns first.
corr = train_df.select_dtypes(include='number').corr()
colormap = sns.diverging_palette(220, 10, as_cmap=True)
sns.heatmap(corr, cmap=colormap, xticklabels=corr.columns, yticklabels=corr.columns, annot=True)
plt.show()

We can see that features are not correlated to each other as much.

Before fitting the model, let's normalize the features of LoanAmount and Loan_Amount_term

In [None]:
# Every model input column (everything except Loan_ID).
features = [
    'Gender',
    'Married',
    'Dependents',
    'Education',
    'Self_Employed',
    'ApplicantIncome',
    'CoapplicantIncome',
    'LoanAmount',
    'Loan_Amount_Term',
    'Credit_History',
    'Property_Area',
]

In [None]:
# Print the value distribution of every feature, with a separator line after each.
for column in features:
    counts = train_df[column].value_counts(sort=True)
    print(counts, '---------------------------', sep='\n')

The features whose values are to be normalized are--

In [None]:
# Columns whose values are not already limited to 0/1 and therefore get rescaled.
fea_normalize = [
    'Dependents',
    'ApplicantIncome',
    'CoapplicantIncome',
    'LoanAmount',
    'Loan_Amount_Term',
    'Property_Area',
]

In [None]:
# Scale the selected columns by dividing by the training-set maximum.
# The same training maxima are applied to the test set: the model is fitted on
# scaled features, so it must also predict on features scaled the same way.
# (Previously only train_df was scaled and test_df was left in raw units.)
train_max = train_df[fea_normalize].max()
train_df[fea_normalize] = train_df[fea_normalize] / train_max
test_df[fea_normalize] = test_df[fea_normalize] / train_max

<h1>Base Model</h1>

Since it is a binary classification problem, we will solve it using a Decision Tree Classifier and Logistic Regression. Let's look at them one by one. But first, let's split our training data into training and validation sets.

In [None]:
# Loan_ID is an identifier, not a feature; remove it before modelling.
# `columns=` is used because passing the axis positionally (drop(label, 1))
# is no longer accepted by pandas 2.x.
train_df = train_df.drop(columns='Loan_ID')

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the training data for validation.
# A fixed random_state keeps the split, and therefore every accuracy reported
# below, reproducible across "Restart & Run All".
X_train, X_val, y_train, y_val = train_test_split(train_df, target, test_size=0.30, random_state=42)

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
# Baseline logistic-regression classifier with scikit-learn's default settings.
lr=LogisticRegression()

In [None]:
# Fit the baseline model on the training split.
lr.fit(X_train,y_train)

In [None]:
from sklearn import metrics

In [None]:
# Validation accuracy of the baseline logistic regression.
y_pred = lr.predict(X_val)
acc = metrics.accuracy_score(y_true=y_val, y_pred=y_pred)
print(acc)

With Logistic Regression, the accuracy came out to be 0.8108.

In [None]:
# Candidate values of C for the logistic-regression grid search.
param_grid = dict(C=[0.001, 0.01, 0.1, 1, 10, 100, 1000])

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Cross-validated search over C for an L2-penalised logistic regression.
clf = GridSearchCV(estimator=LogisticRegression(penalty='l2'), param_grid=param_grid)
clf.fit(X_train, y_train)

In [None]:
# Validation accuracy of the grid-searched logistic regression.
y_pred_cv = clf.predict(X_val)
acc = metrics.accuracy_score(y_true=y_val, y_pred=y_pred_cv)
print(acc)

<h2>Decision Tree Classifier</h2>

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

In [None]:
# Baseline decision tree: depth capped at 10, fixed seed.
dt_base = DecisionTreeClassifier(random_state=4, max_depth=10)
dt_base.fit(X_train, y_train)


The evaluation metric here is accuracy. So, let's check its accuracy.

In [None]:
from sklearn import metrics

In [None]:
# Validation accuracy of the baseline decision tree.
y_pred = dt_base.predict(X_val)
acc = metrics.accuracy_score(y_true=y_val, y_pred=y_pred)
print(acc)

We can see that our base model is 72% accurate. 

<h3>HyperParameter Tuning</h3>

In [None]:
# Number of nodes in the fitted baseline tree.
dt_base.tree_.node_count

In [None]:
# Search space for the decision-tree grid search and the number of CV folds.
n_folds = 5
param_grid = dict(
    max_depth=range(4, 25),
    min_samples_leaf=range(20, 200, 10),
    min_samples_split=range(20, 200, 10),
    criterion=['gini', 'entropy'],
)

We are using GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV


In [None]:
# Grid search over the decision-tree hyperparameters.
# A fixed random_state keeps the search reproducible; a seed drawn from
# np.random.randint changes the tree on every run.
dt = DecisionTreeClassifier(random_state=42)
grid = GridSearchCV(dt, param_grid, cv=n_folds, return_train_score=True, verbose=3)
# The fit itself is left disabled in this notebook; uncomment to run the search.
#grid.fit(X_train,y_train)

In [None]:
# Only available after grid.fit(...) has been run; uncomment to inspect the best parameters.
#grid.best_params_

Let's train our best model and find its accuracy.

In [None]:
# Train the tuned decision tree and predict on the validation split.
# A fixed random_state keeps the fitted tree, and its reported accuracy,
# reproducible across runs.
best_tree = DecisionTreeClassifier(
    criterion='gini',
    max_depth=4,
    min_samples_leaf=20,
    min_samples_split=80,
    random_state=42,
)
best_tree.fit(X_train, y_train)
y_pred_best = best_tree.predict(X_val)

In [None]:
# Validation accuracy of the tuned tree.
# Score y_pred_best (the tuned tree's predictions); y_pred holds predictions
# from an earlier model and would just repeat that model's accuracy.
acc = metrics.accuracy_score(y_val, y_pred_best)
print(acc)

Our accuracy score is increased to 78.9%

Let's predict the results for test file and store in csv file

In [None]:
# Confirm the test set's dtypes and non-null counts before predicting.
test_df.info()

In [None]:
# Keep the loan identifiers for the submission file, then remove them from the features.
# `columns=` is used because passing the axis positionally (drop(label, 1))
# is no longer accepted by pandas 2.x.
loanID = test_df['Loan_ID']
test_df = test_df.drop(columns='Loan_ID')

In [None]:
# Predict on the test set with the logistic regression and translate the
# numeric classes back to "Y"/"N"; any other value is skipped.
label_for = {1: "Y", 0: "N"}
y_pred_t = lr.predict(test_df)
y_final = [label_for[y] for y in y_pred_t if y in label_for]
y_best = np.array(y_final)
type(y_best)
      

In [None]:
# Assemble the submission table: one row per loan with its predicted status.
submission_columns = {"Loan_ID": loanID, "Loan_Status": y_best}
submission = pd.DataFrame(submission_columns)
submission.head(10)

In [None]:
# Write the submission to disk without the DataFrame index column.
submission.to_csv(path_or_buf='submission_lr.csv', index=False)