In [116]:
import pandas as pd #imported pandas library
import numpy   #imported numpy library
y = pd.read_json('loan_data.json')   # stored load_data.json in y using pandas
y.to_csv('lon.csv', index = False)  #created lon.csv file
k = pd.read_csv('lon.csv')
l = pd.DataFrame(k)      #created dataframe l

# (1) Find % of total applicants for each unique value of dependents

In [117]:
l['Dependents'].value_counts(normalize=True) * 100

0     57.534247
2     17.221135
1     16.634051
3+     8.610568
Name: Dependents, dtype: float64

# (2) Find the average number of dependents per applicant

In [118]:
l['Dependents'] = l['Dependents'].map({'3+': 5,'1' : 1 , '2' : 2 , '0': 0})
#replaced 3+ by 5

In [119]:
l.groupby('Application_ID').agg({ 'Dependents': 'mean'})

Unnamed: 0_level_0,Dependents
Application_ID,Unnamed: 1_level_1
LP001002,0
LP001003,1
LP001005,0
LP001006,0
LP001008,0
...,...
LP002978,0
LP002979,5
LP002983,1
LP002984,2


# (3) Find the %of applications approved for self employed applicants

In [5]:
(l[l['Self_Employed'] == 'Yes']['Self_Employed'].count()/len(l))*100

13.698630136986301

# (4) What is the % of rejections for married male applicants

In [6]:
l[(l['Married'] == 'Yes') & (l['Gender'] == 'Male') & (l['Application_Status'] == 'N')]['Application_Status'].count()/len(l)*100

17.025440313111545

# (5) Which property area has the maximum approval ratio

In [7]:
l[l['Application_Status'] == 'Y']['Property_Area'].value_counts(normalize=True) * 100

Semiurban    44.092219
Urban        29.971182
Rural        25.936599
Name: Property_Area, dtype: float64

# (6) Find average dependents per income group

In [8]:
l.groupby('Income').agg({ 'Dependents': 'mean'})

Unnamed: 0_level_0,Dependents
Income,Unnamed: 1_level_1
high,1.111111
low,0.765568
medium,1.150259


#  (7) Create a simple predictive model to assess whether a loan application will be approved or rejected and provide the accuracy score

imported Necessary libraries

In [64]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
df = pd.read_json('loan_data.json')
df.to_csv('loan.csv', index = False)   #loan.csv file contains all data which is present in loan_data.json 
train = pd.read_csv('loan.csv')   

In [65]:
from sklearn.model_selection import learning_curve, GridSearchCV
from sklearn.linear_model import LogisticRegression
#import itertools
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import GradientBoostingClassifier


In [66]:
from sklearn.model_selection import train_test_split # we split data in train and test
X_train,X_test=train_test_split(df,test_size=0.2) # training set is 80% of data

In [67]:
X_train.to_csv('train_loanprediction.csv', index = False)  #X_train is stored in train_loanprediction.csv
X_test.to_csv('test_loanprediction.csv', index = False)  #X_test is stored in test_loanprediction.csv


In [68]:
train = pd.read_csv('train_loanprediction.csv') #uesd pandas library to read data
test = pd.read_csv('test_loanprediction.csv')

 we have to check whether the data is cleaned or not. And after cleaning part, we have to structure the Data. For cleaning part, First I have to check whether there exists any missing values. For that I am using the code snippet isnull()

In [113]:
train.isnull().sum()

Application_ID        0
Gender                0
Married               0
Dependents            0
Education             0
Self_Employed         0
Credit_History        0
Property_Area         0
Income                0
Application_Status    0
dtype: int64

Loan_ID should be unique. So if there n number of rows, there should be n number of unique Loan_ID’s. Let us check for that. If there are any duplicate values we can remove that.

In [69]:
train.apply(lambda x: len(x.unique()))

Application_ID        383
Gender                  2
Married                 2
Dependents              4
Education               2
Self_Employed           2
Credit_History          2
Property_Area           3
Income                  3
Application_Status      2
dtype: int64

there are 383 rows in our train data set, there should be 383 unique Loan_ID’s. Fortunately there are no duplicate values. We can also see that for Gender, Married, Education and Self_Employed columns, the values are only 2 which is evident after cleaning the data-set

As our target variable is Loan_Status. We are storing it in a variable called y. But before doing all these we are dropping Application_ID column in both the data sets. Here it goes.

In [70]:
train2 = train.drop('Application_ID', axis = 1)
test2 = test.drop('Application_ID', axis = 1)

In [71]:
X = train2.drop('Application_Status',1) #dropped Application_Status column
y = train2.Application_Status       #stored Application_status in y

In [72]:
X.shape

(383, 8)

In [73]:
y.head(5)

0    N
1    N
2    Y
3    Y
4    Y
Name: Application_Status, dtype: object

I need to convert every categorical variable in to numerical, I have used get_dummies method.

In [74]:
#converted categorical variables into 0 and 1
X = pd.get_dummies(X)
train2 = pd.get_dummies(train2)
test2 = pd.get_dummies(test2)


In [76]:
#X contains all columns except Application_ID and Application_Status column
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.8)

In [103]:
#Create a function to run a gradient boosted classifier over our data. 
#Note numerous different values were used in the param_grid to hone in on the best paramater
#combinations. The param grid below is the final one I used
def model(X_train, X_val, y_train, y_val):
    if __name__ == '__main__':
    
        param_grid = {'learning_rate': [0.03, 0.035],
                      'max_depth': [3, 4, 5],
                      'min_samples_leaf': [17, 18],
                      'max_features': [1.0, 0.95, 0.9],
                      'n_estimators': [100, 300, 500]
                      }

        estimator = GridSearchCV(estimator=GradientBoostingClassifier(),
                                 param_grid=param_grid,
                                 n_jobs=-1)

        estimator.fit(X_train, y_train)

        best_params = estimator.best_params_
                                 
        validation_accuracy = estimator.score(X_val, y_val)
        print('Validation accuracy: ', validation_accuracy)
        #print(validation_accuracy*100)
        return best_params

In [104]:
params = {'min_samples_leaf': 17, 'max_features': 0.95, 'max_depth': 3,
          'learning_rate': 0.03, 'n_estimators': 500}


In [105]:
model = GradientBoostingClassifier(**params)
model.fit(X, y)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.03, loss='deviance', max_depth=3,
                           max_features=0.95, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=17, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=500,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [80]:
test2 = test2.drop('Application_Status_N', axis = 1)
test2 = test2.drop('Application_Status_Y', axis = 1)

In [101]:
preds = model.predict(test2)

In [82]:
submit = test['Application_ID']

In [83]:
preds = pd.Series(preds)
submit = pd.concat([submit, preds], names=['Application_ID', 'Application_Status'], axis=1)
submit.columns = ['Application_ID', 'Application_Status']

Result is storesd in result1.csv

In [85]:
submit.to_csv('result1.csv', index=False)

In [114]:
#x = pd.read_csv('result.csv')

In [107]:
parameters = [{'C': [0,1,5], 'kernel':['linear']},
             {'C': [0,1,5], 'kernel':['rbf'], 'gamma':[0.01, 0.05]}]


param_grid = {'learning_rate': [0.03, 0.035],
                      'max_depth': [3, 4, 5],
                      'min_samples_leaf': [17, 18],
                      'max_features': [1.0, 0.95, 0.9],
                      'n_estimators': [100, 300, 500]
                      }


estimator = GridSearchCV(estimator=GradientBoostingClassifier(),
                                 param_grid=param_grid,
                                 n_jobs=-1)

In [108]:
estimator.fit(X_train, y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=GradientBoostingClassifier(ccp_alpha=0.0,
                                                  criterion='friedman_mse',
                                                  init=None, learning_rate=0.1,
                                                  loss='deviance', max_depth=3,
                                                  max_features=None,
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_samples_split=2,
                                                  min_weight_fraction_leaf=0.0,
                                                  n_estimators=100,
                                                  n_iter_n...
                 

In [111]:
best_params = estimator.best_params_
                                 
validation_accuracy = estimator.score(X_val, y_val)*100

# We Got Accuracy of 81.81%

In [112]:
validation_accuracy

81.81818181818183