In [29]:
import pandas as pd
import numpy as np
from sklearn import linear_model as lm
from sklearn.model_selection import train_test_split
from statsmodels.regression.linear_model import OLS
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline


In [30]:
#TODO: loading and preprocessing
df = pd.read_csv("loan_data.csv")
df.head()

Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
0,1,debt_consolidation,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0
1,1,credit_card,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0,0
2,1,debt_consolidation,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0,0
3,1,debt_consolidation,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0,0
4,1,credit_card,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0,0


In [31]:
df.size

134092

In [32]:
#rows
len(df)

9578

In [33]:
cols = df.columns.tolist()
cols

['credit.policy',
 'purpose',
 'int.rate',
 'installment',
 'log.annual.inc',
 'dti',
 'fico',
 'days.with.cr.line',
 'revol.bal',
 'revol.util',
 'inq.last.6mths',
 'delinq.2yrs',
 'pub.rec',
 'not.fully.paid']

In [34]:
for col in cols:
    if df[col].isna().sum() > 0:
        print('NaN values found in column : ', col)
    else:
        print('No NaN values found in column : ', col)
        

#TODO: Handle NA Values

No NaN values found in column :  credit.policy
No NaN values found in column :  purpose
No NaN values found in column :  int.rate
No NaN values found in column :  installment
No NaN values found in column :  log.annual.inc
No NaN values found in column :  dti
No NaN values found in column :  fico
No NaN values found in column :  days.with.cr.line
No NaN values found in column :  revol.bal
No NaN values found in column :  revol.util
No NaN values found in column :  inq.last.6mths
No NaN values found in column :  delinq.2yrs
No NaN values found in column :  pub.rec
No NaN values found in column :  not.fully.paid


In [35]:
dtypes = dict(df.dtypes)
print(dtypes)

for col,dtype in dtypes.items():
    if dtype == np.dtype('O'):
        print('Handling object dtype column: \"{}" in design matrix with One Hot Encoding'.format(col))
        # variable to represent the collection of one hot encoded columns
        ohe = pd.get_dummies(df, drop_first=True)
        df = df.drop(col,axis=1)
        df = pd.concat([df,ohe],axis=1)

{'credit.policy': dtype('int64'), 'purpose': dtype('O'), 'int.rate': dtype('float64'), 'installment': dtype('float64'), 'log.annual.inc': dtype('float64'), 'dti': dtype('float64'), 'fico': dtype('int64'), 'days.with.cr.line': dtype('float64'), 'revol.bal': dtype('int64'), 'revol.util': dtype('float64'), 'inq.last.6mths': dtype('int64'), 'delinq.2yrs': dtype('int64'), 'pub.rec': dtype('int64'), 'not.fully.paid': dtype('int64')}
Handling object dtype column: "purpose" in design matrix with One Hot Encoding


In [36]:
df.head()

Unnamed: 0,credit.policy,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,...,inq.last.6mths.1,delinq.2yrs,pub.rec,not.fully.paid,purpose_credit_card,purpose_debt_consolidation,purpose_educational,purpose_home_improvement,purpose_major_purchase,purpose_small_business
0,1,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,...,0,0,0,0,0,1,0,0,0,0
1,1,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,...,0,0,0,0,1,0,0,0,0,0
2,1,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,...,1,0,0,0,0,1,0,0,0,0
3,1,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,...,1,0,0,0,0,1,0,0,0,0
4,1,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,...,0,1,0,0,1,0,0,0,0,0


In [37]:
df.columns
# so get_dummies has turned the categorical variable purpose into many one hot encoded vectors

Index(['credit.policy', 'int.rate', 'installment', 'log.annual.inc', 'dti',
       'fico', 'days.with.cr.line', 'revol.bal', 'revol.util',
       'inq.last.6mths', 'delinq.2yrs', 'pub.rec', 'not.fully.paid',
       'credit.policy', 'int.rate', 'installment', 'log.annual.inc', 'dti',
       'fico', 'days.with.cr.line', 'revol.bal', 'revol.util',
       'inq.last.6mths', 'delinq.2yrs', 'pub.rec', 'not.fully.paid',
       'purpose_credit_card', 'purpose_debt_consolidation',
       'purpose_educational', 'purpose_home_improvement',
       'purpose_major_purchase', 'purpose_small_business'],
      dtype='object')

In [38]:
#TODO: split this dataset into train and test
y = df['int.rate'] 
X = df.drop('int.rate',axis=1)

if (X.size+y.size) != df.size:
    print('Matrix sizes do not match.')

In [64]:
#TODO: figure out why there's two columns for int.rate
y

Unnamed: 0,int.rate,int.rate.1
0,0.1189,0.1189
1,0.1071,0.1071
2,0.1357,0.1357
3,0.1008,0.1008
4,0.1426,0.1426
...,...,...
9573,0.1461,0.1461
9574,0.1253,0.1253
9575,0.1071,0.1071
9576,0.1600,0.1600


In [39]:
X

Unnamed: 0,credit.policy,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,...,inq.last.6mths.1,delinq.2yrs.1,pub.rec,not.fully.paid,purpose_credit_card,purpose_debt_consolidation,purpose_educational,purpose_home_improvement,purpose_major_purchase,purpose_small_business
0,1,829.10,11.350407,19.48,737,5639.958333,28854,52.1,0,0,...,0,0,0,0,0,1,0,0,0,0
1,1,228.22,11.082143,14.29,707,2760.000000,33623,76.7,0,0,...,0,0,0,0,1,0,0,0,0,0
2,1,366.86,10.373491,11.63,682,4710.000000,3511,25.6,1,0,...,1,0,0,0,0,1,0,0,0,0
3,1,162.34,11.350407,8.10,712,2699.958333,33667,73.2,1,0,...,1,0,0,0,0,1,0,0,0,0
4,1,102.92,11.299732,14.97,667,4066.000000,4740,39.5,0,1,...,0,1,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9573,0,344.76,12.180755,10.39,672,10474.000000,215372,82.1,2,0,...,2,0,0,1,0,0,0,0,0,0
9574,0,257.70,11.141862,0.21,722,4380.000000,184,1.1,5,0,...,5,0,0,1,0,0,0,0,0,0
9575,0,97.81,10.596635,13.09,687,3450.041667,10036,82.9,8,0,...,8,0,0,1,0,1,0,0,0,0
9576,0,351.58,10.819778,19.18,692,1800.000000,0,3.2,5,0,...,5,0,0,1,0,0,0,1,0,0


In [40]:
#TODO: write a script or set of functions that handles all the NA values of this dataset
x_train, x_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

print(x_train.size)
print(x_test.size)
print(y_train.size)
print(y_test.size)

229860
57480
15324
3832


In [41]:
y_train

Unnamed: 0,int.rate,int.rate.1
7317,0.1459,0.1459
350,0.0838,0.0838
5206,0.1148,0.1148
2673,0.0932,0.0932
3528,0.1126,0.1126
...,...,...
5734,0.1357,0.1357
5191,0.1426,0.1426
5390,0.1426,0.1426
860,0.1166,0.1166


In [42]:
#TODO: make the linear regression model before looking for outliers

In [43]:
regression = lm.LinearRegression()

In [44]:
regression.fit(x_train, y_train)

LinearRegression()

In [45]:
#coefficients
regression.coef_

array([[-1.67403246e-03,  2.18520922e-05, -4.52276242e-04,
         6.64895401e-05, -2.41875994e-04,  8.07622522e-08,
        -1.36738358e-08,  3.62987721e-05,  2.85773236e-04,
         2.02161741e-04, -9.95940684e-05,  1.17084321e-04,
        -1.67403246e-03,  2.18520922e-05, -4.52276242e-04,
         6.64895401e-05, -2.41875994e-04,  8.07622523e-08,
        -1.36738291e-08,  3.62987721e-05,  2.85773236e-04,
         2.02161741e-04, -9.95940684e-05,  1.17084321e-04,
        -3.98668917e-03, -1.92320350e-03,  5.42052109e-04,
         1.95766099e-03,  2.04589909e-03,  1.63007572e-02],
       [-1.67403246e-03,  2.18520922e-05, -4.52276242e-04,
         6.64895401e-05, -2.41875994e-04,  8.07622522e-08,
        -1.36738358e-08,  3.62987721e-05,  2.85773236e-04,
         2.02161741e-04, -9.95940684e-05,  1.17084321e-04,
        -1.67403246e-03,  2.18520922e-05, -4.52276242e-04,
         6.64895401e-05, -2.41875994e-04,  8.07622523e-08,
        -1.36738291e-08,  3.62987721e-05,  2.85773236e-

In [46]:
#intercept
regression.intercept_

array([0.458831, 0.458831])

In [47]:
y_pred = regression.predict(x_test)
y_pred

array([[0.15931299, 0.15931299],
       [0.11895383, 0.11895383],
       [0.12425802, 0.12425802],
       ...,
       [0.12741529, 0.12741529],
       [0.12664884, 0.12664884],
       [0.10743029, 0.10743029]])

In [48]:
#TODO: write functions to remove outliers with high leverage and influential points
#STRETCH: include print statements that determine why these outliers were removed


In [13]:
#TODO: write basic plots to show the remove outliers with regards to the plots mentioned on 11/17 notes

In [14]:
#TODO: cross validation metrics? 