# Homework 3

In [159]:
import pandas as pd
import numpy as np
import json

In [160]:
from sklearn import tree

### 1. Data preprocessing

Read datasets

In [161]:
loans = pd.read_csv('./lending-club-data.csv')

In [162]:
loans.columns

Index([u'id', u'member_id', u'loan_amnt', u'funded_amnt', u'funded_amnt_inv',
       u'term', u'int_rate', u'installment', u'grade', u'sub_grade',
       u'emp_title', u'emp_length', u'home_ownership', u'annual_inc',
       u'is_inc_v', u'issue_d', u'loan_status', u'pymnt_plan', u'url', u'desc',
       u'purpose', u'title', u'zip_code', u'addr_state', u'dti',
       u'delinq_2yrs', u'earliest_cr_line', u'inq_last_6mths',
       u'mths_since_last_delinq', u'mths_since_last_record', u'open_acc',
       u'pub_rec', u'revol_bal', u'revol_util', u'total_acc',
       u'initial_list_status', u'out_prncp', u'out_prncp_inv', u'total_pymnt',
       u'total_pymnt_inv', u'total_rec_prncp', u'total_rec_int',
       u'total_rec_late_fee', u'recoveries', u'collection_recovery_fee',
       u'last_pymnt_d', u'last_pymnt_amnt', u'next_pymnt_d',
       u'last_credit_pull_d', u'collections_12_mths_ex_med',
       u'mths_since_last_major_derog', u'policy_code', u'not_compliant',
       u'status', u'inactiv

Reassign labels into +1 and -1

In [163]:
# safe_loans =  1 => safe
# safe_loans = -1 => risky
loans['safe_loans'] = loans['bad_loans'].apply(lambda x : +1 if x == 0 else -1)
loans = loans.drop('bad_loans', axis = 1)

In [164]:
loans.shape

(122607, 68)

In [165]:
loans['safe_loans'].value_counts()

 1    99457
-1    23150
Name: safe_loans, dtype: int64

Select a subset of features for models

In [166]:
features = ['grade',                     # grade of the loan
            'sub_grade',                 # sub-grade of the loan
            'short_emp',                 # one year or less of employment
            'emp_length_num',            # number of years of employment
            'home_ownership',            # home_ownership status: own, mortgage or rent
            'dti',                       # debt to income ratio
            'purpose',                   # the purpose of the loan
            'term',                      # the term of the loan
            'last_delinq_none',          # has borrower had a delinquincy
            'last_major_derog_none',     # has borrower had 90 day or worse rating
            'revol_util',                # percent of available credit being used
            'total_rec_late_fee',        # total late fees received to day
           ]

target = 'safe_loans'                    # prediction target (y) (+1 means safe, -1 is risky)

# Extract the feature columns and target column
loans = loans[features + [target]]

#### Notice: we use splitting datasets offered by the lecture, so we actually skip the following steps. We continue from the section '2. Split data into training and validation' .

Sample data to balance classes： undersampling

In [17]:
safe_loans_raw = loans[loans[target] == +1]
risky_loans_raw = loans[loans[target] == -1]
print "Number of safe loans  : %s" % len(safe_loans_raw)
print "Number of risky loans : %s" % len(risky_loans_raw)

Number of safe loans  : 99457
Number of risky loans : 23150


In [18]:
# Since there are fewer risky loans than safe loans, use the number of risky loans to undersample the safe loans.

risky_loans = risky_loans_raw
safe_loans = safe_loans_raw.sample(len(risky_loans), random_state = 1)

# Append the risky_loans with the downsampled version of safe_loans
loans_data = risky_loans.append(safe_loans)

In [19]:
loans_data.shape

(46300, 13)

In [20]:
loans_data['safe_loans'].value_counts()

-1    23150
 1    23150
Name: safe_loans, dtype: int64

In [36]:
loans_data.dtypes

grade                     object
sub_grade                 object
short_emp                  int64
emp_length_num             int64
home_ownership            object
dti                      float64
purpose                   object
term                      object
last_delinq_none           int64
last_major_derog_none      int64
revol_util               float64
total_rec_late_fee       float64
safe_loans                 int64
dtype: object

In [40]:
pd.get_dummies(loans_data).columns

Index([u'short_emp', u'emp_length_num', u'dti', u'last_delinq_none',
       u'last_major_derog_none', u'revol_util', u'total_rec_late_fee',
       u'safe_loans', u'grade_A', u'grade_B', u'grade_C', u'grade_D',
       u'grade_E', u'grade_F', u'grade_G', u'sub_grade_A1', u'sub_grade_A2',
       u'sub_grade_A3', u'sub_grade_A4', u'sub_grade_A5', u'sub_grade_B1',
       u'sub_grade_B2', u'sub_grade_B3', u'sub_grade_B4', u'sub_grade_B5',
       u'sub_grade_C1', u'sub_grade_C2', u'sub_grade_C3', u'sub_grade_C4',
       u'sub_grade_C5', u'sub_grade_D1', u'sub_grade_D2', u'sub_grade_D3',
       u'sub_grade_D4', u'sub_grade_D5', u'sub_grade_E1', u'sub_grade_E2',
       u'sub_grade_E3', u'sub_grade_E4', u'sub_grade_E5', u'sub_grade_F1',
       u'sub_grade_F2', u'sub_grade_F3', u'sub_grade_F4', u'sub_grade_F5',
       u'sub_grade_G1', u'sub_grade_G2', u'sub_grade_G3', u'sub_grade_G4',
       u'sub_grade_G5', u'home_ownership_MORTGAGE', u'home_ownership_OTHER',
       u'home_ownership_OWN', u'home

One-hot encoding for multi categorical variables.

In [41]:
loans_data = pd.get_dummies(loans_data)

In [43]:
loans_data.shape

(46300, 68)

### 2. Split data into training and validation

Convert categorical variables to dummies first

In [None]:
loans = pd.get_dummies(loans)

Indices of train and validation splitting

In [191]:
with open('./module-5-assignment-1-train-idx.json') as f:
    train_idx = json.load(f)
    
with open('./module-5-assignment-1-validation-idx.json') as f:
    validation_idx = json.load(f)

In [192]:
# 80-20 split: train_data, validation_data = loans_data.random_split(.8, seed=1)
# However, we use offerd indices to complete split (to keep datasets consistent with those in lectures)
train_data = loans.iloc[train_idx]
validation_data = loans.iloc[validation_idx]

In [193]:
loans.columns

Index([u'short_emp', u'emp_length_num', u'dti', u'last_delinq_none',
       u'last_major_derog_none', u'revol_util', u'total_rec_late_fee',
       u'safe_loans', u'grade_A', u'grade_B', u'grade_C', u'grade_D',
       u'grade_E', u'grade_F', u'grade_G', u'sub_grade_A1', u'sub_grade_A2',
       u'sub_grade_A3', u'sub_grade_A4', u'sub_grade_A5', u'sub_grade_B1',
       u'sub_grade_B2', u'sub_grade_B3', u'sub_grade_B4', u'sub_grade_B5',
       u'sub_grade_C1', u'sub_grade_C2', u'sub_grade_C3', u'sub_grade_C4',
       u'sub_grade_C5', u'sub_grade_D1', u'sub_grade_D2', u'sub_grade_D3',
       u'sub_grade_D4', u'sub_grade_D5', u'sub_grade_E1', u'sub_grade_E2',
       u'sub_grade_E3', u'sub_grade_E4', u'sub_grade_E5', u'sub_grade_F1',
       u'sub_grade_F2', u'sub_grade_F3', u'sub_grade_F4', u'sub_grade_F5',
       u'sub_grade_G1', u'sub_grade_G2', u'sub_grade_G3', u'sub_grade_G4',
       u'sub_grade_G5', u'home_ownership_MORTGAGE', u'home_ownership_OTHER',
       u'home_ownership_OWN', u'home

### 3. Build a decision tree classifier

Build the decision tree classifier now

In [202]:
decision_tree_model = tree.DecisionTreeClassifier(max_depth = 6)
small_model = tree.DecisionTreeClassifier(max_depth = 2)

In [203]:
x_train = train_data.drop('safe_loans', axis = 1)
y_train = train_data['safe_loans']
x_validation = validation_data.drop('safe_loans', axis = 1)
y_validation = validation_data['safe_loans']

In [204]:
decision_tree_model.fit(x_train, y_train)
small_model.fit(x_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

Sample data

In [205]:
validation_safe_loans = validation_data[validation_data[target] == 1]
validation_risky_loans = validation_data[validation_data[target] == -1]

sample_validation_data_risky = validation_risky_loans[0:2]
sample_validation_data_safe = validation_safe_loans[0:2]

sample_validation_data = sample_validation_data_safe.append(sample_validation_data_risky)
sample_validation_data

Unnamed: 0,short_emp,emp_length_num,dti,last_delinq_none,last_major_derog_none,revol_util,total_rec_late_fee,safe_loans,grade_A,grade_B,...,purpose_house,purpose_major_purchase,purpose_medical,purpose_moving,purpose_other,purpose_small_business,purpose_vacation,purpose_wedding,term_ 36 months,term_ 60 months
19,0,11,11.18,1,1,82.4,0.0,1,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
79,0,10,16.85,1,1,96.4,0.0,1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
24,0,3,13.97,0,1,59.5,0.0,-1,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
41,0,11,16.33,1,1,62.1,0.0,-1,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [206]:
x_sample = sample_validation_data.drop('safe_loans', axis = 1)
y_sample = sample_validation_data['safe_loans']

In [209]:
x_sample

Unnamed: 0,short_emp,emp_length_num,dti,last_delinq_none,last_major_derog_none,revol_util,total_rec_late_fee,grade_A,grade_B,grade_C,...,purpose_house,purpose_major_purchase,purpose_medical,purpose_moving,purpose_other,purpose_small_business,purpose_vacation,purpose_wedding,term_ 36 months,term_ 60 months
19,0,11,11.18,1,1,82.4,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
79,0,10,16.85,1,1,96.4,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
24,0,3,13.97,0,1,59.5,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
41,0,11,16.33,1,1,62.1,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [212]:
small_model_result = small_model.predict(x_sample)
small_model_result

array([ 1, -1, -1,  1])

In [213]:
decision_tree_result = decision_tree_model.predict(x_sample)
decision_tree_result

array([ 1, -1, -1,  1])

In [214]:
# Precentage of predictions correctly predicted
sum(decision_tree_result == y_sample) / float(len(y_sample))
# or we can use .score(). Same

0.5

Explore probability predictions

In [218]:
y_sample.unique()

array([ 1, -1])

In [216]:
decision_tree_model.predict_proba(x_sample)

array([[ 0.34156543,  0.65843457],
       [ 0.53630646,  0.46369354],
       [ 0.64750958,  0.35249042],
       [ 0.20789474,  0.79210526]])

In [219]:
small_model.predict_proba(x_sample)

array([[ 0.41896585,  0.58103415],
       [ 0.59255339,  0.40744661],
       [ 0.59255339,  0.40744661],
       [ 0.23120112,  0.76879888]])

Here, the probability of predictions are the exact same for the 2nd and 3rd loans

Visualize the prediction on a tree

In [223]:
print 'Accuracy of training decision_tree_model: '
print decision_tree_model.score(x_train, y_train)
print 'Accuracy of training small_model: '
print small_model.score(x_train, y_train)

Accuracy of training decision_tree_model: 
0.640527616591
Accuracy of training small_model: 
0.613502041694


In [245]:
print 'Accuracy of decision_tree_model in validation data: '
print round(decision_tree_model.score(x_validation, y_validation), 2)
print 'Accuracy of small_model in validation data: '
print round(small_model.score(x_validation, y_validation), 2)

Accuracy of decision_tree_model in validation data: 
0.64
Accuracy of small_model in validation data: 
0.62


### 4. Evaluating accuracy of a complex decision tree model

In [230]:
big_model = tree.DecisionTreeClassifier(max_depth = 10)
big_model.fit(x_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=10,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [232]:
print 'Accuracy of training big_model: '
print big_model.score(x_train, y_train)
print 'Accuracy of validating big_model: '
print big_model.score(x_validation, y_validation)

Accuracy of training big_model: 
0.66379217709
Accuracy of validating big_model: 
0.626454114606


The accuracy of big_model in validation set is a little bit lower than that in training set. That is not a severe overfitting. 

### 5.Quantifying the cost of mistakes

In [238]:
predictions = decision_tree_model.predict(x_validation)

In [240]:
len(predictions)

9284

The number of False Positive

In [239]:
sum(y_validation[y_validation != predictions] == -1)

1661

The number of False Negative

In [241]:
sum(y_validation[y_validation != predictions] == 1)

1717

The number of True Positive + False Positive

In [243]:
sum(predictions == 1)

4554

Total cost: A False Negative - \$10000, A False Positive - $20000

In [244]:
sum(y_validation[y_validation != predictions] == -1) * 20000 + sum(y_validation[y_validation != predictions] == 1)*10000

50390000