In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# List every file found under the read-only Kaggle input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Import Libraries

**Import the usual libraries for pandas and plotting. You can import sklearn later on.**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## Get the Data

**Use pandas to read loan_data.csv as a dataframe called loans.**

In [None]:
# Load the loan dataset from the Kaggle input mount into a DataFrame.
loans = pd.read_csv('/kaggle/input/loan-data/loan_data.csv')

**Check out the info(), head(), and describe() methods on loans.**

In [None]:
# Column dtypes, non-null counts and memory usage.
loans.info()

In [None]:
# Summary statistics (count, mean, std, quartiles) of the numeric columns.
loans.describe()

In [None]:
# Preview the first five rows.
loans.head()

Recheck for nulls

In [None]:
# Total number of missing values across the whole frame (0 means no nulls).
print(loans.isna().sum().sum())

**Count values**

In [None]:
# Class frequencies of the policy flag, the loan purpose and the target.
for column in ('credit.policy', 'purpose', 'not.fully.paid'):
    print(loans[column].value_counts())

# Exploratory Data Analysis

In this part, we perform the data visualization.

**Create a histogram of two FICO distributions on top of each other, one for each credit.policy outcome.**


In [None]:
# Overlay the FICO distributions of the two credit.policy groups.
plt.figure(figsize=(10,6))
fico_groups = (
    (1, 'blue', 'Credit.Policy=1'),
    (0, 'red', 'Credit.Policy=0'),
)
for flag, colour, legend_label in fico_groups:
    loans.loc[loans['credit.policy'] == flag, 'fico'].hist(
        alpha=0.5, color=colour, bins=30, label=legend_label)
plt.legend()
plt.xlabel('FICO')

**Create a similar figure, except this time select by the not.fully.paid column.**

In [None]:
# Overlay the FICO distributions of loans that were / were not fully paid.
plt.figure(figsize=(10,6))
fico_groups = (
    (1, 'blue', 'not.fully.paid=1'),
    (0, 'red', 'not.fully.paid=0'),
)
for flag, colour, legend_label in fico_groups:
    loans.loc[loans['not.fully.paid'] == flag, 'fico'].hist(
        alpha=0.5, color=colour, bins=30, label=legend_label)
plt.legend()
plt.xlabel('FICO')

**Create a countplot using seaborn showing the counts of loans by purpose, with the color hue defined by not.fully.paid.**

In [None]:
# Loan counts per purpose, split by repayment outcome, on an explicit axes.
fig, ax = plt.subplots(figsize=(11, 7))
sns.countplot(data=loans, x='purpose', hue='not.fully.paid', palette='Set1', ax=ax)

**Let's see the trend between FICO score and interest rate. Recreate the following jointplot.**

In [None]:
# Joint distribution of FICO score and interest rate (scatter plus marginal histograms).
sns.jointplot(x='fico',y='int.rate',data=loans,color='purple')

**Create the following lmplots to see if the trend differed between not.fully.paid and credit.policy. Check the documentation for lmplot() if you can't figure out how to separate it into columns.**

In [None]:
# lmplot is a figure-level function: it creates its own FacetGrid and figure,
# so a preceding plt.figure(...) only leaves an empty extra figure in the
# output. Size the facets through height/aspect instead; two facets of
# height 7 and aspect 11/14 give roughly the intended 11x7 inch figure.
sns.lmplot(y='int.rate',x='fico',data=loans,hue='credit.policy',
           col='not.fully.paid',palette='Set1',
           height=7,aspect=11/14)

**Plots for categorical variables**

In [None]:
# Column groups for the exploratory plots; numerical_columns is consumed by
# the box-plot cell further down.
categorical_columns = ['credit.policy', 'purpose', 'fico', 'inq.last.6mths', 'delinq.2yrs','pub.rec']
print(categorical_columns)

numerical_columns = ['int.rate', 'installment', 'log.annual.inc', 'dti', 'days.with.cr.line', 'revol.util']
print(numerical_columns)

# One count plot per categorical column on a 3x2 grid (filled row by row),
# each split by the repayment outcome.
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(35, 35))
for panel, column in zip(axes.ravel(), categorical_columns):
    sns.countplot(data=loans, x=column, hue='not.fully.paid', ax=panel)

plt.subplots_adjust(hspace=1)


The plots show that, in every category of every plot, loans that were not fully paid make up a small portion compared with loans that were fully paid.

Now, let's observe the numerical columns:


**Plots for numerical variables**

In [None]:
# Box plot of each numerical column against the repayment outcome, side by side.
fig, axes = plt.subplots(nrows=1, ncols=6, figsize=(40, 5))
for panel, column in zip(axes, numerical_columns):
    sns.boxplot(data=loans, x='not.fully.paid', y=column, ax=panel)

print(loans[numerical_columns].describe())
plt.subplots_adjust(hspace=1)



For the numerical columns, there is a significant relation between unpaid loans and days.with.cr.line, the number of days the borrower has had a credit line, as well as revol.util, the borrower's revolving line utilization rate (the amount of the credit line used relative to total credit available). We can also observe similar behaviors both among the features and between the features and the label.


***Correlation Pearson and Spearman***

In [None]:
# Numeric features to correlate.
features_num = [
    'int.rate', 'installment', 'log.annual.inc', 'dti', 'fico',
    'days.with.cr.line', 'revol.bal', 'revol.util',
    'inq.last.6mths', 'delinq.2yrs',
]
numeric_frame = loans[features_num]
corr_pearson = numeric_frame.corr(method='pearson')
corr_spearman = numeric_frame.corr(method='spearman')

# One annotated heatmap per correlation method, on a fixed [-1, +1] colour scale.
heatmaps = (
    (corr_pearson, 'Pearson Correlation'),
    (corr_spearman, 'Spearman Correlation'),
)
for corr_matrix, heading in heatmaps:
    fig = plt.figure(figsize = (9,7))
    sns.heatmap(corr_matrix, annot=True, cmap='RdYlGn', vmin=-1, vmax=+1)
    plt.title(heading)
    plt.show()

## Categorical Features

Notice that the **purpose** column is categorical.

That means we need to transform them using dummy variables so sklearn will be able to understand them. Let's do this in one clean step using pd.get_dummies.

Let's show you a way of dealing with these columns that can be expanded to multiple categorical features if necessary.

**Create a list of 1 element containing the string 'purpose'. Call this list cat_feats.**

In [None]:
# Categorical columns to one-hot encode; kept as a list so more can be added.
cat_feats = ['purpose']

**Histograms**

In [None]:
# Histogram of every numeric column, drawn on one 15x15 inch grid.
loans.hist(figsize=(15, 15))
plt.show()

**Now use pd.get_dummies(loans,columns=cat_feats,drop_first=True) to create a fixed larger dataframe that has new feature columns with dummy variables. Set this dataframe as final_data.**

In [None]:
# One-hot encode the categorical columns; drop_first drops one dummy level per
# column so the remaining dummies are not perfectly collinear.
final_data = pd.get_dummies(loans,columns=cat_feats,drop_first=True)

In [None]:
# Inspect the encoded frame: the purpose column is replaced by dummy columns.
final_data.info()

## Train Test Split

Now it's time to split our data into a training set and a testing set!

**Use sklearn to split your data into a training set and a testing set as we've done in the past.**

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score,f1_score, roc_auc_score, roc_curve

In [None]:
# Separate the target column from the features, then hold out 30% for testing.
target_column = 'not.fully.paid'
y = final_data[target_column]
X = final_data.drop(columns=target_column)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=101
)

## Training + Test Decision Tree, Random Forest, GradientBoostingClassifier & Logistic Regression Models

Let's start by training our models!

**Import DecisionTreeClassifier, RandomForestClassifier, LogisticRegression, GradientBoostingClassifier**

# **Model 1: Decision Tree Classifier**

In [None]:
from sklearn.tree import DecisionTreeClassifier


In [None]:
# Baseline decision tree with scikit-learn's default hyper-parameters.
tree_clf = DecisionTreeClassifier()

In [None]:
# Fit the baseline tree on the training split.
tree_clf.fit(X_train,y_train)

Check for overfitting

In [None]:
# Compare in-sample scores with 5-fold cross-validated scores on the training
# split; a large gap between the two indicates overfitting.
y_pred = tree_clf.predict(X_train)
print("Training Data Set Accuracy: ", accuracy_score(y_train,y_pred))
# Macro-average the training F1 so that it is comparable with the 'f1_macro'
# validation score below (f1_score defaults to the positive-class-only F1).
print("Training Data F1 Score ", f1_score(y_train,y_pred,average='macro'))

print("Validation Mean F1 Score: ",cross_val_score(tree_clf,X_train,y_train,cv=5,scoring='f1_macro').mean())
print("Validation Mean Accuracy: ",cross_val_score(tree_clf,X_train,y_train,cv=5,scoring='accuracy').mean())

Overfitting Problem

We can see from the above metrics that the training accuracy is higher than the cross-validated accuracy with the default settings of the Decision Tree classifier. Hence, the model is overfit. We will try some hyper-parameter tuning and see if it helps.

# **Tuning 'Max_Depth' and 'Min_Samples_leaf' of tree**

In [None]:
# Local import: cross_validate scores several metrics from a single set of CV
# fits, so each depth needs 5 validation fits instead of the 10 that two
# separate cross_val_score calls would run.
from sklearn.model_selection import cross_validate

training_accuracy = []
val_accuracy = []
training_f1 = []
val_f1 = []
tree_depths = []

# Sweep max_depth from 1 to 19 and record training vs. cross-validated scores.
for depth in range(1,20):
    tree_clf = DecisionTreeClassifier(max_depth=depth)
    tree_clf.fit(X_train,y_train)
    y_training_pred = tree_clf.predict(X_train)

    # cross_validate clones the estimator, so the fitted tree_clf is untouched.
    cv_scores = cross_validate(tree_clf,X_train,y_train,cv=5,
                               scoring=('accuracy','f1_macro'))

    training_accuracy.append(accuracy_score(y_train,y_training_pred))
    # Macro-averaged so the training curve is comparable with the 'f1_macro'
    # validation curve (f1_score defaults to the positive-class-only F1).
    training_f1.append(f1_score(y_train,y_training_pred,average='macro'))
    val_accuracy.append(cv_scores['test_accuracy'].mean())
    val_f1.append(cv_scores['test_f1_macro'].mean())
    tree_depths.append(depth)


Tuning_Max_depth = {"Training Accuracy": training_accuracy, "Validation Accuracy": val_accuracy, "Training F1": training_f1, "Validation F1":val_f1, "Max_Depth": tree_depths }
Tuning_Max_depth_df = pd.DataFrame.from_dict(Tuning_Max_depth)

# Long format: one row per (depth, metric) so seaborn can colour by metric.
plot_df = Tuning_Max_depth_df.melt('Max_Depth',var_name='Metrics',value_name="Values")
fig,ax = plt.subplots(figsize=(15,5))
sns.pointplot(x="Max_Depth", y="Values",hue="Metrics", data=plot_df,ax=ax)

From the above graph, we can conclude that keeping 'Max_Depth' = 6 yields the best validation accuracy and F1 score. The best validation accuracy is roughly 0.8 and the best validation F1 score is around 0.5.

# **Visualizing Decision Tree with Max Depth = 6**

Visualizing Decision Tree with Max Depth = 6

In [None]:
import graphviz
from sklearn import tree

# Fit a depth-6 tree and render it through Graphviz, labelling split nodes
# with the feature column names.
tree_clf = tree.DecisionTreeClassifier(max_depth=6).fit(X_train, y_train)
graph = graphviz.Source(
    tree.export_graphviz(tree_clf, feature_names=list(X.columns))
)
graph

From the above tree, we can see that some of the leaves have fewer than 5 samples, hence our classifier might overfit. We can sweep the hyper-parameter 'min_samples_leaf' to further improve validation accuracy while keeping max_depth at 6.

In [None]:
import numpy as np
# Local import: cross_validate scores several metrics from a single set of CV
# fits, so each setting needs 5 validation fits instead of the 10 that two
# separate cross_val_score calls would run.
from sklearn.model_selection import cross_validate

training_accuracy = []
val_accuracy = []
training_f1 = []
val_f1 = []
min_samples_leaf = []

# Sweep absolute leaf sizes 1, 4, 7, ..., 79 (sample counts, not percentages)
# at the fixed max_depth of 6.
for samples_leaf in range(1,80,3):
    tree_clf = DecisionTreeClassifier(max_depth=6,min_samples_leaf = samples_leaf)
    tree_clf.fit(X_train,y_train)
    y_training_pred = tree_clf.predict(X_train)

    # cross_validate clones the estimator, so the fitted tree_clf is untouched.
    cv_scores = cross_validate(tree_clf,X_train,y_train,cv=5,
                               scoring=('accuracy','f1_macro'))

    training_accuracy.append(accuracy_score(y_train,y_training_pred))
    # Macro-averaged so the training curve is comparable with the 'f1_macro'
    # validation curve (f1_score defaults to the positive-class-only F1).
    training_f1.append(f1_score(y_train,y_training_pred,average='macro'))
    val_accuracy.append(cv_scores['test_accuracy'].mean())
    val_f1.append(cv_scores['test_f1_macro'].mean())
    min_samples_leaf.append(samples_leaf)


Tuning_min_samples_leaf = {"Training Accuracy": training_accuracy, "Validation Accuracy": val_accuracy, "Training F1": training_f1, "Validation F1":val_f1, "Min_Samples_leaf": min_samples_leaf }
Tuning_min_samples_leaf_df = pd.DataFrame.from_dict(Tuning_min_samples_leaf)

# Long format: one row per (leaf size, metric) so seaborn can colour by metric.
plot_df = Tuning_min_samples_leaf_df.melt('Min_Samples_leaf',var_name='Metrics',value_name="Values")
fig,ax = plt.subplots(figsize=(15,5))
sns.pointplot(x="Min_Samples_leaf", y="Values",hue="Metrics", data=plot_df,ax=ax)

From the above plot, we will set Min_Samples_leaf to 32 to improve validation accuracy and F1-score.

## Predictions and Evaluation of Decision Tree
**Create predictions from the test set, a classification report and a confusion matrix.**

Let's predict off the y_test values and evaluate our model.

Predict the class of not.fully.paid for the X_test data.

**Feature importance**

In [None]:
from matplotlib import pyplot

In [None]:
# Refit the tree with the tuned hyper-parameters.
tree_clf = DecisionTreeClassifier(max_depth=6,min_samples_leaf = 32)
tree_clf.fit(X_train,y_train)
# get importance
importance = tree_clf.feature_importances_
# Take the labels from the training frame instead of a hard-coded '1'..'18'
# tuple: the plot then stays valid if the feature set changes and each bar is
# attributable to a named column.
feature_names = list(X_train.columns)
# summarize feature importance
for name, score in zip(feature_names, importance):
    print('Feature: %s, Score: %.5f' % (name, score))
# plot feature importance
ind = list(range(len(importance)))
pyplot.bar(ind, importance)
pyplot.title('Decision Tree')
pyplot.xticks(ind, feature_names, rotation=90)
pyplot.xlabel('Features')
pyplot.ylabel('Importance')
pyplot.show()

**Confusion Matrix**

In [None]:
# Score the tuned tree on the held-out test split.
y_pred = tree_clf.predict(X_test)
for caption, metric in (("Test Accuracy: ", accuracy_score),
                        ("Test F1 Score: ", f1_score)):
    print(caption, metric(y_test, y_pred))
print("Confusion Matrix on Test Data")
# Rows are the true labels, columns the predicted ones; margins adds the totals.
pd.crosstab(y_test, y_pred, rownames=['True'], colnames=['Predicted'], margins=True)

In [None]:
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,f1_score

In [None]:
# Start the per-model score lists with the decision tree's test results;
# later cells append the other models' scores in order.
acc = [accuracy_score(y_test, y_pred)]
f1 = [f1_score(y_test, y_pred)]

**Cross-validation**

In [None]:
# 5-fold cross-validated scores of the tuned tree on the training split.
for caption, scorer in (("Validation Mean F1 Score: ", 'f1_macro'),
                        ("Validation Mean Accuracy: ", 'accuracy')):
    fold_scores = cross_val_score(tree_clf, X_train, y_train, cv=5, scoring=scorer)
    print(caption, fold_scores.mean())

**Reports Precision, Recall, F1-score**

In [None]:
# Per-class precision, recall and F1 of the decision tree on the test split.
print(classification_report(y_test,y_pred))

**ROC CURVE**

In [None]:
# ROC analysis needs continuous scores. With hard 0/1 predictions the curve
# collapses to a single operating point and the AUC reduces to the balanced
# accuracy at that one threshold, so use the predicted probability of class 1.
y_score = tree_clf.predict_proba(X_test)[:, 1]
print(roc_auc_score(y_test, y_score))

fpr, tpr, thresholds = roc_curve(y_test, y_score)
plt.plot(fpr, tpr, label = "Decision Tree")
# Diagonal reference line: the ROC of a random classifier.
plt.plot([0, 1], [0, 1], linestyle='--', color='grey', label="Chance")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Tree ROC Curve")
plt.legend()  # the label= arguments are only shown once a legend is drawn
plt.show()

**Mis-classifications**

It can be seen that the majority of the misclassifications are not-fully-paid loan applicants being classified as fully paid.

Let's look at the Random Forest Classifier (and later the other 2 methods) to see if it can reduce misclassifications.


# **Model 2: Random Forest Classifier**

Create the RandomForestClassifier() called rf_clf and fit it to the training data.

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline random forest with default hyper-parameters: compare in-sample
# scores with 5-fold cross-validated scores to check for overfitting.
rf_clf = RandomForestClassifier()
rf_clf.fit(X_train,y_train)
y_pred = rf_clf.predict(X_train)
# Macro-average the training F1 so that it is comparable with the 'f1_macro'
# validation score below (f1_score defaults to the positive-class-only F1).
print("Train F1 Score ", f1_score(y_train,y_pred,average='macro'))
print("Train Accuracy ", accuracy_score(y_train,y_pred))

print("Validation Mean F1 Score: ",cross_val_score(rf_clf,X_train,y_train,cv=5,scoring='f1_macro').mean())
print("Validation Mean Accuracy: ",cross_val_score(rf_clf,X_train,y_train,cv=5,scoring='accuracy').mean())

# **Tuning 'Max_Depth' of RF**

**Tuning Max_Depth**

In [None]:
# Local import: cross_validate scores several metrics from a single set of CV
# fits, so each depth needs 5 forest fits instead of the 10 that two separate
# cross_val_score calls would run.
from sklearn.model_selection import cross_validate

training_accuracy = []
val_accuracy = []
training_f1 = []
val_f1 = []
tree_depths = []

# Sweep max_depth from 1 to 19 and record training vs. cross-validated scores.
for depth in range(1,20):
    rf_clf = RandomForestClassifier(max_depth=depth)
    rf_clf.fit(X_train,y_train)
    y_training_pred = rf_clf.predict(X_train)

    # cross_validate clones the estimator, so the fitted rf_clf is untouched.
    cv_scores = cross_validate(rf_clf,X_train,y_train,cv=5,
                               scoring=('accuracy','f1_macro'))

    training_accuracy.append(accuracy_score(y_train,y_training_pred))
    # Macro-averaged so the training curve is comparable with the 'f1_macro'
    # validation curve (f1_score defaults to the positive-class-only F1).
    training_f1.append(f1_score(y_train,y_training_pred,average='macro'))
    val_accuracy.append(cv_scores['test_accuracy'].mean())
    val_f1.append(cv_scores['test_f1_macro'].mean())
    tree_depths.append(depth)


Tuning_Max_depth = {"Training Accuracy": training_accuracy, "Validation Accuracy": val_accuracy, "Training F1": training_f1, "Validation F1":val_f1, "Max_Depth": tree_depths }
Tuning_Max_depth_df = pd.DataFrame.from_dict(Tuning_Max_depth)

# Long format: one row per (depth, metric) so seaborn can colour by metric.
plot_df = Tuning_Max_depth_df.melt('Max_Depth',var_name='Metrics',value_name="Values")
fig,ax = plt.subplots(figsize=(15,5))
sns.pointplot(x="Max_Depth", y="Values",hue="Metrics", data=plot_df,ax=ax)

Following the same approach as for the decision tree, we choose 'Max_Depth' = 10.

## Random Forest: Predictions and Evaluation
**Create predictions from the test set, a classification report and a confusion matrix.**

Let's predict off the y_test values and evaluate our model.

Predict the class of not.fully.paid for the X_test data.

**Feature importance**

In [None]:
# Refit the forest with the tuned depth.
rf_clf = RandomForestClassifier(max_depth=10)
rf_clf.fit(X_train,y_train)
# get importance
importance = rf_clf.feature_importances_
# Take the labels from the training frame instead of a hard-coded '1'..'18'
# tuple: the plot then stays valid if the feature set changes and each bar is
# attributable to a named column.
feature_names = list(X_train.columns)
# summarize feature importance
for name, score in zip(feature_names, importance):
    print('Feature: %s, Score: %.5f' % (name, score))
# plot feature importance
ind = list(range(len(importance)))
pyplot.bar(ind, importance)
pyplot.title('Random Forest')
pyplot.xticks(ind, feature_names, rotation=90)
pyplot.xlabel('Features')
pyplot.ylabel('Importance')
pyplot.show()

**Confusion Matrix**

In [None]:
# Score the tuned random forest on the held-out test split.
y_pred = rf_clf.predict(X_test)
for caption, metric in (("Test Accuracy: ", accuracy_score),
                        ("Test F1 Score: ", f1_score)):
    print(caption, metric(y_test, y_pred))
print("Confusion Matrix on Test Data")
# Rows are the true labels, columns the predicted ones; margins adds the totals.
pd.crosstab(y_test, y_pred, rownames=['True'], colnames=['Predicted'], margins=True)

In [None]:
# Record the random forest's test scores. The lists are positional, so
# re-running this cell appends a duplicate entry.
acc.append(accuracy_score(y_test,y_pred))
f1.append(f1_score(y_test,y_pred))

**Cross-Validation**

In [None]:
# 5-fold cross-validated scores of the tuned forest on the training split.
for caption, scorer in (("Validation Mean F1 Score: ", 'f1_macro'),
                        ("Validation Mean Accuracy: ", 'accuracy')):
    fold_scores = cross_val_score(rf_clf, X_train, y_train, cv=5, scoring=scorer)
    print(caption, fold_scores.mean())

**Reports Precision, Recall, F1-score**

In [None]:
# Per-class precision, recall and F1 of the random forest on the test split.
print(classification_report(y_test,y_pred))

**ROC CURVE**

In [None]:
# ROC analysis needs continuous scores. With hard 0/1 predictions the curve
# collapses to a single operating point and the AUC reduces to the balanced
# accuracy at that one threshold, so use the predicted probability of class 1.
y_score = rf_clf.predict_proba(X_test)[:, 1]
print(roc_auc_score(y_test, y_score))
fpr, tpr, thresholds = roc_curve(y_test, y_score)
plt.plot(fpr, tpr, label = "Random Forest")
# Diagonal reference line: the ROC of a random classifier.
plt.plot([0, 1], [0, 1], linestyle='--', color='grey', label="Chance")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("RF ROC Curve")
plt.legend()  # the label= arguments are only shown once a legend is drawn
plt.show()

**Mis-classifications**

It can be seen that the majority of the misclassifications in our predictions are not-fully-paid loan applicants being classified as fully paid.

Let's look at Logistic Regression to see if it can reduce misclassifications.



# **Model 3: Logistic Regression**

Create the LogisticRegression() called logreg_clf and fit it to the training data.

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_predict

# **Tuning based on threshold**

In the next part, we tune the logistic regression by changing the decision threshold.

In [None]:
train_accuracies = []
train_f1_scores = []
test_accuracies = []
test_f1_scores = []
thresholds = []

# The fitted model does not depend on the decision threshold, so fit once and
# reuse the predicted class-1 probabilities for every threshold in the sweep.
logreg_clf = LogisticRegression(solver='liblinear')
logreg_clf.fit(X_train,y_train)
y_pred_train_thresh = logreg_clf.predict_proba(X_train)[:,1]
y_pred_test_thresh = logreg_clf.predict_proba(X_test)[:,1]

# NOTE(review): the threshold is chosen by looking at test-set scores, which
# makes the later test metrics optimistic. Consider selecting it from
# out-of-fold training predictions (cross_val_predict is already imported).

# Thresholds 0.1, 0.2, ..., 0.8 (arange excludes the stop value 0.9). Rounded
# so that float artefacts such as 0.30000000000000004 do not become axis labels.
for thresh in np.round(np.arange(0.1,0.9,0.1), 1):
    y_pred_train = (y_pred_train_thresh > thresh).astype(int)
    y_pred_test = (y_pred_test_thresh > thresh).astype(int)

    train_accuracies.append(accuracy_score(y_train,y_pred_train))
    train_f1_scores.append(f1_score(y_train,y_pred_train))
    test_accuracies.append(accuracy_score(y_test,y_pred_test))
    test_f1_scores.append(f1_score(y_test,y_pred_test))
    thresholds.append(thresh)


Threshold_logreg = {"Training Accuracy": train_accuracies, "Test Accuracy": test_accuracies, "Training F1": train_f1_scores, "Test F1":test_f1_scores, "Decision Threshold": thresholds }
Threshold_logreg_df = pd.DataFrame.from_dict(Threshold_logreg)

# Long format: one row per (threshold, metric) so seaborn can colour by metric.
plot_df = Threshold_logreg_df.melt('Decision Threshold',var_name='Metrics',value_name="Values")
fig,ax = plt.subplots(figsize=(15,5))
sns.pointplot(x="Decision Threshold", y="Values",hue="Metrics", data=plot_df,ax=ax)

Based on the above Test/Train curves, we set the threshold to 0.2.

## Logistic Regression: Predictions and Evaluation
**Create predictions from the test set, a classification report and a confusion matrix.**

Let's predict off the y_test values and evaluate our model.

Predict the class of not.fully.paid for the X_test data.

**Feature importance**

In [None]:
# define the model
# Use the same solver as the threshold sweep: the 0.2 decision threshold was
# chosen for the liblinear fit, so evaluating a model fitted with the default
# solver would pair the threshold with a different set of coefficients.
logreg_clf = LogisticRegression(solver='liblinear')
# fit the model
logreg_clf.fit(X_train, y_train)
# get importance
# NOTE: the features are not standardised, so coefficient magnitudes reflect
# each feature's scale as well as its influence.
importance = logreg_clf.coef_[0]
# Take the labels from the training frame instead of a hard-coded '1'..'18'
# tuple: the plot then stays valid if the feature set changes and each bar is
# attributable to a named column.
feature_names = list(X_train.columns)
# summarize feature importance
for name, score in zip(feature_names, importance):
    print('Feature: %s, Score: %.5f' % (name, score))
# plot feature importance
ind = list(range(len(importance)))
pyplot.bar(ind, importance)
pyplot.title('Logistic Regression')
pyplot.xticks(ind, feature_names, rotation=90)
pyplot.xlabel('Features')
pyplot.ylabel('Importance')
pyplot.show()

**Confusion Matrix**

In [None]:
thresh = 0.2 ### Threshold chosen from above Curves
# Class-1 probabilities on the test split, binarised at the chosen threshold.
y_pred_test_thresh = logreg_clf.predict_proba(X_test)[:,1]
y_pred = np.where(y_pred_test_thresh > thresh, 1, 0)
for caption, metric in (("Test Accuracy: ", accuracy_score),
                        ("Test F1 Score: ", f1_score)):
    print(caption, metric(y_test, y_pred))
print("Confusion Matrix on Test Data")
# Rows are the true labels, columns the predicted ones; margins adds the totals.
pd.crosstab(y_test, y_pred, rownames=['True'], colnames=['Predicted'], margins=True)

In [None]:
# Record the logistic regression's test scores (at the 0.2 threshold). The
# lists are positional, so re-running this cell appends a duplicate entry.
acc.append(accuracy_score(y_test,y_pred))
f1.append(f1_score(y_test,y_pred))

**Cross-Validation**

In [None]:
# 5-fold cross-validated scores of the logistic regression on the training
# split. These use the estimator's own predict(), not the custom 0.2 threshold.
for caption, scorer in (("Validation Mean F1 Score: ", 'f1_macro'),
                        ("Validation Mean Accuracy: ", 'accuracy')):
    fold_scores = cross_val_score(logreg_clf, X_train, y_train, cv=5, scoring=scorer)
    print(caption, fold_scores.mean())

**Reports Precision, Recall, F1-score**

In [None]:
# Per-class precision, recall and F1 of the thresholded logistic regression on the test split.
print(classification_report(y_test,y_pred))

**ROC CURVE**

In [None]:
# ROC analysis needs continuous scores. With hard 0/1 predictions the curve
# collapses to a single operating point and the AUC reduces to the balanced
# accuracy at that one threshold, so use the predicted probability of class 1.
y_score = logreg_clf.predict_proba(X_test)[:, 1]
print(roc_auc_score(y_test, y_score))
fpr, tpr, thresholds = roc_curve(y_test, y_score)
plt.plot(fpr, tpr, label = "Logistic Regression")
# Diagonal reference line: the ROC of a random classifier.
plt.plot([0, 1], [0, 1], linestyle='--', color='grey', label="Chance")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("LogReg ROC Curve")
plt.legend()  # the label= arguments are only shown once a legend is drawn
plt.show()

**Mis-classifications**

This method improves the prediction of not-fully-paid loans but worsens the prediction of fully paid loans.
Let's look at the last model, the Gradient Boosting Classifier, to see if it can reduce the misclassifications.

# **Model 4: Gradient Boosting Classifier**

Create the GradientBoostingClassifier() called regressor and fit it to the training data.

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
import numpy
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from matplotlib import pyplot
import matplotlib
# NOTE: the matplotlib.use('Agg') call was removed. Switching to the
# non-interactive Agg backend here replaces the inline backend selected by
# `%matplotlib inline`, so the show() calls in the following cells would no
# longer render figures in the notebook. pyplot.savefig() works without it.

In [None]:
# Baseline gradient boosting classifier with default hyper-parameters.
# (Despite its name, `regressor` is a classifier.)
regressor = GradientBoostingClassifier()
regressor.fit(X_train, y_train)

# **Tuning 'n_estimators' and 'learning_rate' in Gradient Boosting**

Here, we are tuning our Gradient boosting model which involves creating and adding trees to the model sequentially.

New trees are created to correct the residual errors in the predictions from the existing sequence of trees.

The effect is that the model can quickly fit, then overfit the training dataset.

A technique to slow down the learning in the gradient boosting model is to apply a weighting factor for the corrections by new trees when added to the model.

This weighting is called the shrinkage factor or the learning rate, depending on the literature or the tool.

Setting the learning rate to values less than 1.0 has the effect of making smaller corrections for each tree added to the model, which creates a trade-off with the n_estimators parameter: more trees must then be added to the model.

For this reason, our tuning uses small learning rates in the range 0.001 to 0.1.

In [None]:
# grid search
# 3 x 3 grid over the number of trees and the learning rate, scored by
# negative log loss (higher is better) with shuffled 10-fold stratified CV.
n_estimators = [300, 400, 500]
learning_rate = [0.001, 0.01, 0.1]
param_grid = dict(learning_rate=learning_rate, n_estimators=n_estimators)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
grid_search = GridSearchCV(regressor, param_grid, scoring="neg_log_loss", n_jobs=-1, cv=kfold)
grid_result = grid_search.fit(X_train, y_train)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
# plot results
# The grid is enumerated with parameter names sorted alphabetically and the
# last name varying fastest, so rows are learning rates and columns are
# n_estimators values.
scores = numpy.array(means).reshape(len(learning_rate), len(n_estimators))
# Draw on an explicit figure so the saved image contains only this plot.
fig, ax = pyplot.subplots()
for rate, rate_scores in zip(learning_rate, scores):
    ax.plot(n_estimators, rate_scores, label='learning_rate: ' + str(rate))
ax.legend()
ax.set_xlabel('n_estimators')
# The plotted values are neg_log_loss scores, i.e. the negated log loss.
ax.set_ylabel('Negative Log Loss')
fig.savefig('n_estimators_vs_learning_rate.png')

Our results show that the optimal learning rate is 0.01 with n_estimators=500.

In [None]:
# Refit gradient boosting with the hyper-parameters selected by the grid search.
regressor = GradientBoostingClassifier(n_estimators=500,learning_rate=0.01)
regressor.fit(X_train, y_train)

## Gradient Boosting Classifier: Predictions and Evaluation
**Create predictions from the test set, a classification report and a confusion matrix.**

Let's predict off the y_test values and evaluate our model.

Predict the class of not.fully.paid for the X_test data.

**Feature importance**

In [None]:
# get importance
importance = regressor.feature_importances_
# Take the labels from the training frame instead of a hard-coded '1'..'18'
# tuple: the plot then stays valid if the feature set changes and each bar is
# attributable to a named column.
feature_names = list(X_train.columns)
# summarize feature importance
for name, score in zip(feature_names, importance):
    print('Feature: %s, Score: %.5f' % (name, score))
# plot feature importance
ind = list(range(len(importance)))
pyplot.bar(ind, importance)
pyplot.title('Gradient Boosting')
pyplot.xticks(ind, feature_names, rotation=90)
pyplot.xlabel('Features')
pyplot.ylabel('Importance')
pyplot.show()

**Confusion Matrix**

In [None]:
# Score the tuned gradient boosting model on the held-out test split.
y_pred = regressor.predict(X_test)
for caption, metric in (("Test Accuracy: ", accuracy_score),
                        ("Test F1 Score: ", f1_score)):
    print(caption, metric(y_test, y_pred))
print("Confusion Matrix on Test Data")
# Rows are the true labels, columns the predicted ones; margins adds the totals.
pd.crosstab(y_test, y_pred, rownames=['True'], colnames=['Predicted'], margins=True)

In [None]:
# Record the gradient boosting model's test scores. The lists are positional,
# so re-running this cell appends a duplicate entry.
acc.append(accuracy_score(y_test,y_pred))
f1.append(f1_score(y_test,y_pred))

**Cross-Validation**

In [None]:
# 5-fold cross-validated scores of the tuned gradient boosting model on the training split.
for caption, scorer in (("Validation Mean F1 Score: ", 'f1_macro'),
                        ("Validation Mean Accuracy: ", 'accuracy')):
    fold_scores = cross_val_score(regressor, X_train, y_train, cv=5, scoring=scorer)
    print(caption, fold_scores.mean())

**Reports Precision, Recall, F1-score**

In [None]:
# Per-class precision, recall and F1 of the gradient boosting model on the test split.
print(classification_report(y_test,y_pred))

**ROC CURVE**

In [None]:
# ROC analysis needs continuous scores. With hard 0/1 predictions the curve
# collapses to a single operating point and the AUC reduces to the balanced
# accuracy at that one threshold, so use the predicted probability of class 1.
y_score = regressor.predict_proba(X_test)[:, 1]
print(roc_auc_score(y_test, y_score))
fpr, tpr, thresholds = roc_curve(y_test, y_score)
plt.plot(fpr, tpr, label = "Gradient Boosting Classifier")
# Diagonal reference line: the ROC of a random classifier.
plt.plot([0, 1], [0, 1], linestyle='--', color='grey', label="Chance")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("GB ROC Curve")
plt.legend()  # the label= arguments are only shown once a legend is drawn
plt.show()

# **Conclusion**

Tuning the best model is challenging because of the large and complex space of hyperparameter values you can pick within each model. In our analysis, the results currently rely on a couple of parameters and a specific range of values for tuning the models. Nevertheless, the techniques were implemented to predict which customers will fully pay their loan. We applied several different ML models and tried to tune each of them with a small number of parameters (at most 2). Our models perform poorly, but if we compare them based on our metrics, they rank as follows:

**1st Rank Accuracy          Model**
*    85%      Random Forest Classifier

**1st Rank F1-score          Model**
*    30%     Logistic Regression

Based on our results, we can conclude that Logistic Regression performs better at this classification task. That model has lower accuracy than the other 3 models, but it gives better results for predicting the not-fully-paid loans, which is the most important class.




In [None]:
# Model labels, in the order the scores were appended to acc / f1.
langs = ['TREE', 'RF', 'Logistic Reg', 'GB']

# Bar chart of test accuracy per model on axes that fill the whole figure.
fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])
ax.bar(langs, acc)
ax.set_title('ACCURACY')
plt.show()

In [None]:
# Bar chart of test F1 per model; `langs` comes from the accuracy chart cell.
fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])
ax.bar(langs, f1)
ax.set_title('F1-Score')
plt.show()