Kaggle: https://www.kaggle.com/jiuzhang/lending-club-subset

In [None]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [None]:
TRAIN_DIR = '../input/lending-club-subset/loan_sub.csv'

# Exploratory Data Analysis - Take a Glance at the Data

In [None]:
train = pd.read_csv(TRAIN_DIR, sep=',', header=0)

In [None]:
train.shape

In [None]:
train.head()

In [None]:
train.columns

In [None]:
train.describe()

# Exploratory Data Analysis - Look at the Label

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

Translate the label column 'bad_loans' (1 is bad, 0 is not bad) into 'safe_loans' (1 is good, -1 is bad).

In [None]:
# Re-encode the target: bad_loans == 1 becomes -1 (risky); every other value becomes +1 (safe).
train['safe_loans'] = train['bad_loans'].map(lambda flag: 1 if flag != 1 else -1)

In [None]:
train['safe_loans'].value_counts(normalize=True)

Observation: It's an imbalanced dataset, with far fewer bad loans than safe loans.

# Exploratory Data Analysis - Look at the Features

For simplicity, we only look at a few features: 'grade', 'term', 'home_ownership', and 'emp_length'.

In [None]:
sns.countplot(data=train, x='grade', hue='safe_loans',
              order=['A', 'B', 'C', 'D', 'E', 'F', 'G'])

Observation: Column "grade" is useful. Lower-grade loans are more likely to default.

In [None]:
sns.countplot(data=train, x='term', hue='safe_loans')

Observation: Column "term" is useful. Longer-term loans are more likely to default.

In [None]:
sns.countplot(data=train, x='home_ownership', hue='safe_loans')

Observation: Column "home_ownership" is useful. Loans to borrowers who own their home are less likely to default.

In [None]:
# Draw the plot first, then rotate the x tick labels on the Axes that seaborn
# returns. Setting the rotation before the plot exists relies on pyplot's
# implicit "current axes" state and on tick properties surviving the redraw.
ax = sns.countplot(data=train, x='emp_length', hue='safe_loans',
                   order=['< 1 year', '1 year', '2 years', '3 years', '4 years',
                          '5 years', '6 years', '7 years', '8 years', '9 years', '10+ years'])
ax.tick_params(axis='x', labelrotation=45)

Observation: Column "emp_length" might be useful.

# Prepare Data

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
def down_sampling(data, label_col, random_state=None):
    """Balance a two-class frame by randomly down-sampling the majority class.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``label_col``; it is not modified.
    label_col : str
        Name of the label column. It must hold exactly two distinct values.
    random_state : int or numpy.random.RandomState, optional
        Forwarded to ``DataFrame.sample`` so the sampling can be reproduced.
        The default (``None``) draws a different sample on every call.

    Returns
    -------
    pandas.DataFrame
        The sampled majority-class rows followed by all minority-class rows,
        with a fresh 0..n-1 index. Both classes have the same number of rows.

    Raises
    ------
    AssertionError
        If ``label_col`` does not contain exactly two distinct values.
    """
    labels = data[label_col].unique()
    assert len(labels) == 2

    label1_data = data[data[label_col] == labels[0]]
    label2_data = data[data[label_col] == labels[1]]
    # Ensure label1_data is the majority class, whichever label was seen first.
    if len(label1_data) < len(label2_data):
        label1_data, label2_data = label2_data, label1_data

    # Sample an exact row count rather than a fraction, so float rounding can
    # never leave the two classes one row apart.
    label1_data = label1_data.sample(n=len(label2_data), random_state=random_state)
    return pd.concat([label1_data, label2_data], axis=0).reset_index(drop=True)

In [None]:
def get_dummies(data, col):
    """Replace column ``col`` of ``data`` with its one-hot indicator columns.

    The indicator columns are named ``<col>_<value>`` and appended after the
    existing columns; the original ``col`` is removed. ``data`` itself is not
    modified -- a new frame is returned.
    """
    indicator_cols = pd.get_dummies(data[col], prefix=col)
    expanded = pd.concat([data, indicator_cols], axis=1)
    return expanded.drop(columns=[col])

In [None]:
Y_COLUMN, X_COLUMNS = ['safe_loans'], ['grade', 'term','home_ownership', 'emp_length']

1.Down-sampling

In [None]:
balanced_train = down_sampling(train, 'safe_loans')

In [None]:
balanced_train['safe_loans'].value_counts(normalize=True)

2.Keep Useful Columns

In [None]:
balanced_train = balanced_train[Y_COLUMN + X_COLUMNS]

In [None]:
balanced_train.head()

3.Get Dummy Variables

In [None]:
# One-hot encode each categorical feature in turn; every pass swaps the raw
# column for its indicator columns, appended at the end of the frame.
for categorical_col in ['grade', 'term', 'home_ownership', 'emp_length']:
    balanced_train = get_dummies(balanced_train, categorical_col)

In [None]:
balanced_train.head()

In [None]:
Y_COLUMN, X_COLUMNS = [balanced_train.columns[0]], balanced_train.columns[1:]

4. Training Set and Validation Set

In [None]:
X, y = balanced_train.loc[:,X_COLUMNS], balanced_train.loc[:,Y_COLUMN]

In [None]:
# Hold out 20% of the balanced data for validation; the fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=0, test_size=0.2)
print(f'Training Set - X train shape: {X_train.shape}, y train shape: {y_train.shape}')
print(f'Validation Set - X val shape: {X_val.shape}, y val shape: {y_val.shape}')

# Model: Decision Tree (No Sklearn)

In [None]:
from sklearn.base import BaseEstimator
from sklearn.metrics import accuracy_score

1.Model Definition

In [None]:
class TreeNode:
    """One node of a binary decision tree.

    Attributes are plain data: ``is_leaf`` flags a terminal node,
    ``prediction`` holds the value a leaf returns, and ``split_feature`` names
    the feature an internal node tests. ``left`` and ``right`` are the child
    nodes and are attached by the caller after construction.
    """

    def __init__(self, is_leaf, prediction, split_feature):
        self.is_leaf, self.prediction, self.split_feature = (
            is_leaf, prediction, split_feature)
        # Children start unset; whoever builds the tree wires them up.
        self.left = self.right = None

In [None]:
class DecisionTree(BaseEstimator):
    """Binary decision tree classifier over 0/1 indicator features.

    Each internal node tests one feature: rows where it equals 0 go left and
    rows where it equals 1 go right. The feature is chosen greedily by
    information gain and is not reused further down the same path. Leaves
    predict the majority label among +1 / -1, with ties going to -1.

    Parameters
    ----------
    max_depth : int
        Maximum number of splits on any root-to-leaf path.
    min_error : float or None
        Stop splitting a node once the fraction of its samples that a
        majority-vote leaf would misclassify is at or below this value.
        ``None`` disables the check.
    verbose : bool, default False
        Print a message for every split and stopping decision while fitting.

    Attributes set by ``fit``
    -------------------------
    label_col : name of the label column taken from ``y``.
    root_node : root ``TreeNode`` of the fitted tree.
    score : accuracy of the fitted tree on the training data.
    """

    def __init__(self, max_depth, min_error, verbose=False):
        self.max_depth = max_depth
        self.min_error = min_error
        self.verbose = verbose

    def fit(self, X, y):
        """Build the tree from features ``X`` and labels ``y``; return ``self``.

        ``X`` is a DataFrame of 0/1 feature columns. ``y`` is a single-column
        DataFrame (or a Series) of +1 / -1 labels sharing ``X``'s index.
        """
        if isinstance(y, pd.Series):
            # Normalise to a one-column frame so the label name can be read
            # the same way in both cases.
            y = y.to_frame(name=y.name if y.name is not None else 'label')

        features = X.columns
        data = pd.concat([y, X], axis=1)
        self.label_col = y.columns[0]
        self.root_node = self.__create_tree(data, features, curr_depth=0,
                                            max_depth=self.max_depth, min_error=self.min_error)
        self.score = self.__calculate_score(X, y)
        return self

    def predict(self, X):
        """Return a Series with one predicted label (+1 / -1) per row of ``X``."""
        return X.apply(lambda row: self.__predict_single_data(self.root_node, row), axis=1)

    def __create_tree(self, data, features, curr_depth, max_depth, min_error):
        """Recursively grow the subtree for ``data`` and return its root node."""
        # Exit Rule 0: the node is already pure enough.
        if min_error is not None and self.__node_error(data) <= min_error:
            if self.verbose:
                print('Reached min error.')
            return self.__create_leaf(data)
        # Exit Rule 1: No remaining features
        if len(features) == 0:
            if self.verbose:
                print('No remaining features.')
            return self.__create_leaf(data)
        # Exit Rule 2: Reached max depth.
        if curr_depth >= max_depth:
            if self.verbose:
                print('Reached max depth.')
            return self.__create_leaf(data)

        split_feature = self.__find_best_feature(data, features)
        # A feature is used at most once on any path.
        features = features.drop(split_feature)
        if self.verbose:
            print(f'Split on feature: {split_feature}')

        left_split = data[data[split_feature] == 0]
        right_split = data[data[split_feature] == 1]

        # Exit Rule 3: one side is empty, i.e. the chosen feature does not
        # separate this node's data at all, so stop and emit a leaf.
        if len(left_split) == 0:
            if self.verbose:
                print('Perfect Split.')
            return self.__create_leaf(right_split)
        if len(right_split) == 0:
            if self.verbose:
                print('Perfect Split.')
            return self.__create_leaf(left_split)

        left_tree = self.__create_tree(left_split, features, curr_depth+1, max_depth, min_error)
        right_tree = self.__create_tree(right_split, features, curr_depth+1, max_depth, min_error)

        curr_node = TreeNode(is_leaf=False, prediction=None, split_feature=split_feature)
        curr_node.left = left_tree
        curr_node.right = right_tree

        return curr_node

    def __node_error(self, data):
        """Fraction of ``data`` that a majority-vote leaf would misclassify."""
        num_samples = len(data)
        if num_samples == 0:
            return 0.0
        labels = data[self.label_col]
        num_pos = int((labels == 1).sum())
        num_neg = int((labels == -1).sum())
        # Integer numerator, so a pure node yields exactly 0.0.
        return (num_samples - max(num_pos, num_neg)) / float(num_samples)

    def __create_leaf(self, data):
        """Return a leaf predicting the majority label of ``data`` (ties -> -1)."""
        leaf = TreeNode(True, None, None)
        labels = data[self.label_col]
        num_pos = int((labels == 1).sum())
        num_neg = int((labels == -1).sum())
        leaf.prediction = 1 if num_pos > num_neg else -1
        return leaf

    def __find_best_feature(self, data, features):
        """Return the feature in ``features`` with the highest information gain.

        Ties keep the earliest feature in iteration order.
        """
        labels = data[self.label_col]
        original_entropy = self.__entropy(labels)
        num_samples = float(len(data))

        best_feature, best_info_gain = None, float('-inf')
        for feature in features:
            # Only the labels are needed to score a split, so filter the label
            # column rather than the whole frame.
            left_labels = labels[data[feature] == 0]
            right_labels = labels[data[feature] == 1]
            new_entropy = len(left_labels) / num_samples * self.__entropy(left_labels) + \
                          len(right_labels) / num_samples * self.__entropy(right_labels)
            info_gain = original_entropy - new_entropy
            if info_gain > best_info_gain:
                best_feature, best_info_gain = feature, info_gain

        return best_feature

    def __entropy(self, labels):
        """Binary entropy (in bits) of the share of +1 labels; 0 for empty input."""
        if len(labels) == 0:
            return 0

        p = float((labels == 1).sum()) / len(labels)
        # log2(0) is undefined; a pure set has zero entropy by definition.
        if p == 0 or p == 1:
            return 0

        return - p*np.log2(p) - (1-p)*np.log2(1-p)

    def __predict_single_data(self, tree_node, x):
        """Walk from ``tree_node`` to a leaf for the single row ``x``."""
        # Exit Rule
        if tree_node.is_leaf:
            return tree_node.prediction

        if x[tree_node.split_feature] == 0:
            return self.__predict_single_data(tree_node.left, x)
        else:
            return self.__predict_single_data(tree_node.right, x)

    def __calculate_score(self, X, y):
        """Print and return the accuracy of the fitted tree on ``(X, y)``."""
        y_pred = self.predict(X)
        accuracy = accuracy_score(y, y_pred)
        print(f'Accuracy of Training Set: {accuracy}')
        return accuracy

2.Fit with Training Set

In [None]:
dt = DecisionTree(max_depth=10, min_error=1e-15)
dt.fit(X_train, y_train)

3.Predict for Validation Set

In [None]:
y_pred = dt.predict(X_val[X_COLUMNS])
# accuracy_score takes (y_true, y_pred); keep that order so the call stays
# correct if it is ever swapped for a non-symmetric metric.
accuracy_score(y_val, y_pred)