In [48]:
import graphlab
loans = graphlab.SFrame('~/ml_coursera/data/lending-club-data.gl')

In [49]:
loans.column_names

<bound method SFrame.column_names of Columns:
	id	int
	member_id	int
	loan_amnt	int
	funded_amnt	int
	funded_amnt_inv	int
	term	str
	int_rate	float
	installment	float
	grade	str
	sub_grade	str
	emp_title	str
	emp_length	str
	home_ownership	str
	annual_inc	int
	is_inc_v	str
	issue_d	str
	loan_status	str
	pymnt_plan	str
	url	str
	desc	str
	purpose	str
	title	str
	zip_code	str
	addr_state	str
	dti	float
	delinq_2yrs	int
	earliest_cr_line	str
	inq_last_6mths	int
	mths_since_last_delinq	int
	mths_since_last_record	int
	open_acc	int
	pub_rec	int
	revol_bal	int
	revol_util	float
	total_acc	int
	initial_list_status	str
	out_prncp	float
	out_prncp_inv	float
	total_pymnt	float
	total_pymnt_inv	float
	total_rec_prncp	float
	total_rec_int	float
	total_rec_late_fee	float
	recoveries	float
	collection_recovery_fee	float
	last_pymnt_d	str
	last_pymnt_amnt	float
	next_pymnt_d	str
	last_credit_pull_d	str
	collections_12_mths_ex_med	int
	mths_since_last_major_derog	str
	policy_code	int
	not_compliant	in

In [50]:
loans['safe_loans'] = loans['bad_loans'].apply(lambda x : +1 if x==0 else -1)

In [51]:
loans = loans.remove_column('bad_loans')

In [52]:
safe_loans_len = loans[loans['safe_loans'] == +1].num_rows()
print safe_loans_len

99457


In [37]:
safe_loans

id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade

sub_grade,emp_title,emp_length,home_ownership,annual_inc,is_inc_v,issue_d,loan_status

pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line

inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal

revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,...


In [53]:
risky_loans_len = loans[loans['safe_loans'] == -1].num_rows()
print risky_loans_len

23150


In [55]:
print safe_loans_len*100/loans.num_rows()
print risky_loans_len*100/loans.num_rows()

81
18


In [56]:
features = ['grade',                     # grade of the loan
            'sub_grade',                 # sub-grade of the loan
            'short_emp',                 # one year or less of employment
            'emp_length_num',            # number of years of employment
            'home_ownership',            # home_ownership status: own, mortgage or rent
            'dti',                       # debt to income ratio
            'purpose',                   # the purpose of the loan
            'term',                      # the term of the loan
            'last_delinq_none',          # has borrower had a delinquincy
            'last_major_derog_none',     # has borrower had 90 day or worse rating
            'revol_util',                # percent of available credit being used
            'total_rec_late_fee',        # total late fees received to day
           ]

In [57]:
target = 'safe_loans' 

In [59]:
loans = loans[features + [target]]
safe_loans_raw = loans[loans[target] == +1]
risky_loans_raw = loans[loans[target] == -1]
print "Number of safe loans  : %s" % len(safe_loans_raw)
print "Number of risky loans : %s" % len(risky_loans_raw)

Number of safe loans  : 99457
Number of risky loans : 23150


In [61]:
percentage = len(risky_loans_raw)/float(len(safe_loans_raw))
print percentage

0.232763908021


In [69]:
risky_loans = risky_loans_raw
safe_loans = safe_loans_raw.sample(percentage, seed=1)
loans_data = risky_loans.append(safe_loans)
len(loans_data[loans_data['safe_loans'] == +1])/len(loans_data[loans_data['safe_loans'] == -1])

In [86]:
categorical_variables = []
for feat_name, feat_type in zip(loans_data.column_names(), loans_data.column_types()):
    if feat_type == str:
        categorical_variables.append(feat_name)
print categorical_variables
for feature in categorical_variables:
    loans_data_one_hot_encoded = loans_data[feature].apply(lambda x: {x: 1})
    loans_data_unpacked = loans_data_one_hot_encoded.unpack(column_name_prefix=feature)
    
    for column in loans_data_unpacked.column_names():
        loans_data_unpacked[column] = loans_data_unpacked[column].fillna(0)

    loans_data.remove_column(feature)
    loans_data.add_columns(loans_data_unpacked)
print loans_data
train_data, validation_data = loans_data.random_split(.8, seed=1)


[]
+-----------+----------------+-------+------------------+-----------------------+
| short_emp | emp_length_num |  dti  | last_delinq_none | last_major_derog_none |
+-----------+----------------+-------+------------------+-----------------------+
|     1     |       1        |  1.0  |        1         |           1           |
|     0     |       5        |  5.55 |        1         |           1           |
|     1     |       1        | 18.08 |        1         |           1           |
|     1     |       1        | 10.08 |        1         |           1           |
|     0     |       4        |  7.06 |        1         |           1           |
|     0     |       11       | 13.22 |        1         |           1           |
|     0     |       2        |  2.4  |        1         |           1           |
|     0     |       10       | 15.22 |        1         |           1           |
|     0     |       3        | 13.97 |        0         |           1           |
|     0     |

In [87]:
train_data

short_emp,emp_length_num,dti,last_delinq_none,last_major_derog_none,revol_util,total_rec_late_fee,safe_loans
1,1,1.0,1,1,9.4,0.0,-1
0,5,5.55,1,1,32.6,0.0,-1
1,1,18.08,1,1,36.5,0.0,-1
1,1,10.08,1,1,91.7,0.0,-1
0,4,7.06,1,1,55.5,0.0,-1
0,11,13.22,1,1,90.3,0.0,-1
0,2,2.4,1,1,29.7,0.0,-1
0,10,15.22,1,1,57.6,0.0,-1
0,9,9.12,1,1,63.7,24.17,-1
0,5,20.88,1,1,90.8,0.0,-1

grade.A,grade.B,grade.C,grade.D,grade.E,grade.F,grade.G,sub_grade.A1,sub_grade.A2,sub_grade.A3,sub_grade.A4
0,0,1,0,0,0,0,0,0,0,0
0,0,0,0,0,1,0,0,0,0,0
0,1,0,0,0,0,0,0,0,0,0
0,0,1,0,0,0,0,0,0,0,0
0,1,0,0,0,0,0,0,0,0,0
0,1,0,0,0,0,0,0,0,0,0
0,1,0,0,0,0,0,0,0,0,0
0,0,1,0,0,0,0,0,0,0,0
0,1,0,0,0,0,0,0,0,0,0
0,0,1,0,0,0,0,0,0,0,0

sub_grade.A5,sub_grade.B1,sub_grade.B2,sub_grade.B3,sub_grade.B4,sub_grade.B5,sub_grade.C1,sub_grade.C2
0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0
0,0,0,0,0,1,0,0
0,0,0,0,0,0,1,0
0,0,1,0,0,0,0,0
0,0,0,0,1,0,0,0
0,0,0,1,0,0,0,0
0,0,0,0,0,0,0,1
0,1,0,0,0,0,0,0
0,0,0,0,0,0,0,0

sub_grade.C3,sub_grade.C4,sub_grade.C5,sub_grade.D1,sub_grade.D2,sub_grade.D3,sub_grade.D4,sub_grade.D5
0,1,0,0,0,0,0,0
0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0
0,0,1,0,0,0,0,0

sub_grade.E1,sub_grade.E2,sub_grade.E3,sub_grade.E4,sub_grade.E5,...
0,0,0,0,0,...
0,0,0,0,0,...
0,0,0,0,0,...
0,0,0,0,0,...
0,0,0,0,0,...
0,0,0,0,0,...
0,0,0,0,0,...
0,0,0,0,0,...
0,0,0,0,0,...
0,0,0,0,0,...


In [88]:
validation_data

short_emp,emp_length_num,dti,last_delinq_none,last_major_derog_none,revol_util,total_rec_late_fee,safe_loans
0,3,13.97,0,1,59.5,0.0,-1
0,11,16.33,1,1,62.1,0.0,-1
0,5,12.48,1,1,73.9,0.0,-1
0,11,23.18,1,1,79.7,0.0,-1
0,3,7.83,1,1,65.4,0.0,-1
0,7,16.63,1,1,79.9,0.0,-1
0,4,6.62,1,1,79.9,0.0,-1
0,9,8.4,1,1,86.2,0.0,-1
0,4,15.0,1,1,55.5,0.0,-1
0,6,13.22,1,1,72.9,0.0,-1

grade.A,grade.B,grade.C,grade.D,grade.E,grade.F,grade.G,sub_grade.A1,sub_grade.A2,sub_grade.A3,sub_grade.A4
0,0,0,1,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,1,0,0,0,0,0
0,0,0,1,0,0,0,0,0,0,0
0,1,0,0,0,0,0,0,0,0,0
0,0,0,0,0,1,0,0,0,0,0
0,1,0,0,0,0,0,0,0,0,0
0,0,1,0,0,0,0,0,0,0,0
0,0,0,1,0,0,0,0,0,0,0
0,0,0,1,0,0,0,0,0,0,0

sub_grade.A5,sub_grade.B1,sub_grade.B2,sub_grade.B3,sub_grade.B4,sub_grade.B5,sub_grade.C1,sub_grade.C2
0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0
0,1,0,0,0,0,0,0
0,0,0,0,0,0,0,0
0,0,0,1,0,0,0,0
0,0,0,0,0,0,0,1
0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0

sub_grade.C3,sub_grade.C4,sub_grade.C5,sub_grade.D1,sub_grade.D2,sub_grade.D3,sub_grade.D4,sub_grade.D5
0,0,0,0,1,0,0,0
0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0
0,0,0,0,1,0,0,0
0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0
0,0,0,1,0,0,0,0
0,0,0,1,0,0,0,0

sub_grade.E1,sub_grade.E2,sub_grade.E3,sub_grade.E4,sub_grade.E5,...
0,0,0,0,0,...
0,0,0,0,0,...
0,0,0,0,0,...
0,0,0,0,0,...
0,0,0,0,0,...
0,0,0,0,0,...
0,0,0,0,0,...
0,0,0,0,0,...
0,0,0,0,0,...
0,0,0,0,0,...


In [103]:
from sklearn.tree import DecisionTreeClassifier
import numpy
train_data_np = train_data.to_numpy()
clf_depth_6 = DecisionTreeClassifier(random_state=0,  max_depth=6)
clf_depth_2 = DecisionTreeClassifier(random_state=0,  max_depth=2)
clf_depth_10 = DecisionTreeClassifier(random_state=0,  max_depth=10)
target_labels = train_data['safe_loans']
clf_depth_6.fit(train_data_np, target_labels, sample_weight=None)
clf_depth_2.fit(train_data_np, target_labels, sample_weight=None)
clf_depth_10.fit(train_data_np, target_labels, sample_weight=None)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=10,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=0, splitter='best')

In [102]:
target_labels

dtype: int
Rows: 37224
[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, ... ]

In [104]:
validation_safe_loans = validation_data[validation_data[target] == 1]
validation_risky_loans = validation_data[validation_data[target] == -1]

sample_validation_data_risky = validation_risky_loans[0:2]
sample_validation_data_safe = validation_safe_loans[0:2]

sample_validation_data = sample_validation_data_safe.append(sample_validation_data_risky)
sample_validation_data

short_emp,emp_length_num,dti,last_delinq_none,last_major_derog_none,revol_util,total_rec_late_fee,safe_loans
0,11,11.18,1,1,82.4,0.0,1
0,10,16.85,1,1,96.4,0.0,1
0,3,13.97,0,1,59.5,0.0,-1
0,11,16.33,1,1,62.1,0.0,-1

grade.A,grade.B,grade.C,grade.D,grade.E,grade.F,grade.G,sub_grade.A1,sub_grade.A2,sub_grade.A3,sub_grade.A4
0,1,0,0,0,0,0,0,0,0,0
0,0,0,1,0,0,0,0,0,0,0
0,0,0,1,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0

sub_grade.A5,sub_grade.B1,sub_grade.B2,sub_grade.B3,sub_grade.B4,sub_grade.B5,sub_grade.C1,sub_grade.C2
0,0,0,1,0,0,0,0
0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0

sub_grade.C3,sub_grade.C4,sub_grade.C5,sub_grade.D1,sub_grade.D2,sub_grade.D3,sub_grade.D4,sub_grade.D5
0,0,0,0,0,0,0,0
0,0,0,1,0,0,0,0
0,0,0,0,1,0,0,0
0,0,0,0,0,0,0,0

sub_grade.E1,sub_grade.E2,sub_grade.E3,sub_grade.E4,sub_grade.E5,...
0,0,0,0,0,...
0,0,0,0,0,...
0,0,0,0,0,...
0,0,0,0,0,...
