In [None]:
%matplotlib inline
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn import tree
from sklearn import metrics

**Load data**

In [None]:
# Read the loan CSV from the relative input directory and preview the first rows.
csv_path = os.path.join("../input", "loan_sub.csv")
data = pd.read_csv(csv_path, sep=',')
data.head()

In [None]:
# Show the column labels of the loaded frame.
data.columns

In [None]:
# Show the dtype pandas inferred for each column.
data.dtypes

**Preprocess the data**

The target is the column labeled 'bad_loans', where 1 marks a bad loan and 0 marks a good loan.

Replace it with a 'good_loans' column that encodes good loans as +1 and bad loans as -1, which is more intuitive.

In [None]:
# Build the +1/-1 target: a 'bad_loans' value of 0 becomes +1, anything else -1.
good_loan_labels = data['bad_loans'].apply(lambda flag: -1 if flag != 0 else +1)
data['good_loans'] = good_loan_labels
# The original indicator is no longer needed once the new target exists.
data = data.drop(columns='bad_loans')

In [None]:
# Share of each class in the new target (normalize=True returns fractions, not counts).
data['good_loans'].value_counts(normalize=True)

**Feature extraction**

In [None]:
# Feature columns used by the model, plus the name of the target column.
cols = ['grade', 'term','home_ownership', 'emp_length']
target = 'good_loans'

# Keep only the selected features followed by the target.
selected_columns = cols + [target]
data = data[selected_columns]
data.head()

**Implement downsampling on unbalanced dataset**

In [None]:
# Absolute number of rows per class, to see how unbalanced the data is before downsampling.
data['good_loans'].value_counts()

In [None]:
# Split the rows by class label (-1 = bad loan, +1 = good loan).
target_values = data[target]
bad_ones = data.loc[target_values == -1]
good_ones = data.loc[target_values == 1]

# Fraction of good loans to keep so that both classes end up about the same size.
percentage = len(bad_ones) / float(len(good_ones))

# Keep every bad loan; draw a seeded random subset of the good loans.
risky_loans = bad_ones
safe_loans = good_ones.sample(frac=percentage, random_state=33)

# Stack the two classes row-wise into the balanced data set.
data_set = pd.concat([risky_loans, safe_loans], axis=0)

In [None]:
# Check the class proportions of the downsampled data set.
data_set[target].value_counts(normalize=True)

**Preprocessing your features**

In [None]:
def dummies(data, columns=['col_1','col_2','col_3', 'col_4']):
    """One-hot encode the given columns and return the encoded frame.

    Each column in ``columns`` is converted to strings, expanded into one
    indicator column per distinct value (named ``<col>_<value>``, ordered by
    first appearance of the value) and removed from the result. The indicator
    columns are appended after the columns that remain.

    The frame passed in is not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing every column named in ``columns``.
    columns : list of str
        Names of the columns to encode. The default is only a placeholder;
        callers are expected to pass their own list.

    Returns
    -------
    pandas.DataFrame
        The encoded frame (``data`` itself when ``columns`` is empty).
    """
    encoded = data
    for col in columns:
        # Stringify into a separate Series (not back into the frame) so the
        # caller's DataFrame keeps its original values and dtypes.
        as_text = encoded[col].apply(str)
        # get_dummies sorts its output columns; re-select them so they follow
        # the order in which the values first appear.
        ordered_names = [col + '_' + value for value in as_text.unique()]
        indicators = pd.get_dummies(as_text, prefix=col)[ordered_names]
        # drop() returns a new frame, so the input is never altered.
        encoded = pd.concat([encoded.drop(col, axis=1), indicators], axis=1)
    return encoded

In [None]:
# One-hot encode the categorical feature columns: each listed column is
# replaced by one indicator column per distinct value.
# NOTE: this cell is not re-runnable as-is. data_set is overwritten with the
# encoded frame, so the listed columns no longer exist on a second run.
cols = ['grade', 'term','home_ownership', 'emp_length']
data_set = dummies(data_set, columns=cols)
data_set.head()

**Implement a train_test_split**

In [None]:
# Hold out 20% of the balanced data for testing; the seed keeps the split reproducible.
train_data, test_data = train_test_split(data_set, test_size=0.2, random_state=33)

# Select the features by dropping the target by name instead of slicing
# columns[1:], which is only correct while the target happens to be the first
# column. The target is kept as a one-column DataFrame, as before.
trainX, trainY = train_data.drop(target, axis=1), train_data[[target]]
testX, testY = test_data.drop(target, axis=1), test_data[[target]]

**Model training**

In [None]:
# Fit a shallow (depth-3) entropy tree on the training split.
# random_state is pinned because scikit-learn randomly permutes the features at
# each split; without a fixed seed, ties between equally good splits can make
# the fitted tree differ from run to run.
model = tree.DecisionTreeClassifier(criterion='entropy', max_depth=3, min_samples_leaf=5, random_state=33)
model.fit(trainX, trainY)

In [None]:
def measure_performance(X, y, clf, show_accuracy=True, show_classification_report=True, show_confussion_matrix=True):
    """Print evaluation metrics for a fitted classifier.

    Parameters
    ----------
    X : features handed to ``clf.predict``.
    y : true labels the predictions are compared against.
    clf : object exposing a ``predict`` method.
    show_accuracy : bool
        Print the accuracy score, formatted to three decimals.
    show_classification_report : bool
        Print the classification report.
    show_confussion_matrix : bool
        Print the confusion matrix.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    # Predict once and reuse the result for every enabled section.
    predicted = clf.predict(X)

    if show_accuracy:
        accuracy = metrics.accuracy_score(y, predicted)
        print("Accuracy:{0:.3f}".format(accuracy),"\n")

    if show_classification_report:
        print("Classification report")
        report = metrics.classification_report(y, predicted)
        print(report, "\n")

    if show_confussion_matrix:
        print("Confusion matrix")
        matrix = metrics.confusion_matrix(y, predicted)
        print(matrix, "\n")

In [None]:
# Report accuracy, classification report and confusion matrix for the depth-3 tree on the test split.
measure_performance(testX, testY, model)

**Tree Visualization**

In [None]:
import graphviz

# Export the depth-3 tree as DOT text (out_file=None, so the result is kept in
# a variable rather than written to a file) and wrap it in a graphviz Source.
dot_data = tree.export_graphviz(
    model,
    out_file=None,
    feature_names=trainX.columns,
)
graph = graphviz.Source(dot_data)
# Optional: graph.render("loan") and graph.view() were left disabled here.

In [None]:
# Bare expression so the notebook shows `graph` as this cell's output.
graph

**Increase tree depth**

In [None]:
# Fit a deeper (depth-5) entropy tree on the same training split.
# random_state is pinned because scikit-learn randomly permutes the features at
# each split; without a fixed seed, ties between equally good splits can make
# the fitted tree differ from run to run.
model_5 = tree.DecisionTreeClassifier(criterion='entropy', max_depth=5, min_samples_leaf=5, random_state=33)
model_5.fit(trainX, trainY)

In [None]:
# Report the same metrics for the depth-5 tree on the test split.
measure_performance(testX, testY, model_5)