In [28]:
import pandas as pd
import numpy as np 
import statsmodels.api as sm
# Suppress scientific notation in NumPy printed output for easier reading
np.set_printoptions(suppress=True)

In [2]:
##### Load the loans data set, the same way as in the earlier linear-regression exercise
loans_csv_url = 'https://spark-public.s3.amazonaws.com/dataanalysis/loansData.csv'
loansData = pd.read_csv(loans_csv_url)
# Preview the first five rows
loansData.head()

Unnamed: 0,Amount.Requested,Amount.Funded.By.Investors,Interest.Rate,Loan.Length,Loan.Purpose,Debt.To.Income.Ratio,State,Home.Ownership,Monthly.Income,FICO.Range,Open.CREDIT.Lines,Revolving.CREDIT.Balance,Inquiries.in.the.Last.6.Months,Employment.Length
81174,20000,20000,8.90%,36 months,debt_consolidation,14.90%,SC,MORTGAGE,6541.67,735-739,14,14272,2,< 1 year
99592,19200,19200,12.12%,36 months,debt_consolidation,28.36%,TX,MORTGAGE,4583.33,715-719,12,11140,1,2 years
80059,35000,35000,21.98%,60 months,debt_consolidation,23.81%,CA,MORTGAGE,11500.0,690-694,14,21977,1,2 years
15825,10000,9975,9.99%,36 months,debt_consolidation,14.30%,KS,MORTGAGE,3833.33,695-699,10,9346,0,5 years
33182,12000,12000,11.71%,36 months,credit_card,18.78%,NJ,RENT,3195.0,695-699,11,14469,0,9 years


In [3]:
# Column totals of two raw numeric columns, printed as a reference point
# for the checks made after cleaning
for column in ('Amount.Requested', 'Monthly.Income'):
    print(loansData[column].sum())

31016250
14216639.37


In [4]:
# Keep only the columns used in this analysis: the interest rate (the value
# being modelled), monthly income, home ownership, FICO range and the amount
# requested.
# A list is the documented way to select several column labels with .loc
# (a tuple is reserved for MultiIndex keys), and .copy() makes the result an
# independent frame so the column assignments in the cleaning cell below do
# not write into a slice of the full data set.
columns_of_interest = ['Interest.Rate', 'Monthly.Income', 'Home.Ownership',
                       'FICO.Range', 'Amount.Requested']
loansData = loansData.loc[:, columns_of_interest].copy()
loansData.head()

Unnamed: 0,Interest.Rate,Monthly.Income,Home.Ownership,FICO.Range,Amount.Requested
81174,8.90%,6541.67,MORTGAGE,735-739,20000
99592,12.12%,4583.33,MORTGAGE,715-719,19200
80059,21.98%,11500.0,MORTGAGE,690-694,35000
15825,9.99%,3833.33,MORTGAGE,695-699,10000
33182,11.71%,3195.0,RENT,695-699,12000


In [5]:
# Clean up the columns for modelling.

def _percent_text_to_fraction(text):
    """Turn a percentage string such as '12.12%' into a fraction rounded to 3 decimals (0.121)."""
    # NOTE(review): rounding to 3 decimals discards part of the quoted rate - confirm this is intended.
    return round(float(text.rstrip('%')) / 100, 3)


def _fico_range_lower_bound(fico_range):
    """Return the first three characters of a FICO range such as '735-739' as an int (735)."""
    return int(fico_range[:3])


loansData['Interest.Rate'] = loansData['Interest.Rate'].map(_percent_text_to_fraction)
loansData['FICO.Score'] = loansData['FICO.Range'].map(_fico_range_lower_bound)
# The textual range is no longer needed once the numeric score exists
loansData = loansData.drop('FICO.Range', axis=1)
loansData.head()

Unnamed: 0,Interest.Rate,Monthly.Income,Home.Ownership,Amount.Requested,FICO.Score
81174,0.089,6541.67,MORTGAGE,20000,735
99592,0.121,4583.33,MORTGAGE,19200,715
80059,0.22,11500.0,MORTGAGE,35000,690
15825,0.1,3833.33,MORTGAGE,10000,695
33182,0.117,3195.0,RENT,12000,695


In [6]:
# List each column's dtype to confirm which columns are numeric after cleaning
loansData.dtypes

Interest.Rate       float64
Monthly.Income      float64
Home.Ownership       object
Amount.Requested      int64
FICO.Score            int64
dtype: object

In [7]:
# Total of the cleaned Interest.Rate column, printed as a reference value
print loansData['Interest.Rate'].sum()

326.629


In [8]:
# Count missing values per column, drop every row that has one, then count again.
missing_before = loansData.isnull().sum()
print('Before')
print(missing_before)

# Remove all rows containing at least one NaN
loansData = loansData.dropna()

# Confirm that nothing is missing any more
missing_after = loansData.isnull().sum()
print('')
print('After')
print(missing_after)

Before
Interest.Rate       0
Monthly.Income      1
Home.Ownership      0
Amount.Requested    0
FICO.Score          0
dtype: int64

After
Interest.Rate       0
Monthly.Income      0
Home.Ownership      0
Amount.Requested    0
FICO.Score          0
dtype: int64


In [11]:
##### Regression inputs: pull each model variable out as its own Series.
# (The model fitted further down is an ordinary least-squares regression.)
intrate = loansData['Interest.Rate']
loanamt = loansData['Amount.Requested']
fico = loansData['FICO.Score']
monthly_income = loansData['Monthly.Income']

# Preview the first five values and the container type of every Series,
# with a blank line between consecutive previews.
for position, series in enumerate((intrate, loanamt, fico, monthly_income)):
    if position:
        print('')
    print(series[:5])
    print(type(series))


81174    0.089
99592    0.121
80059    0.220
15825    0.100
33182    0.117
Name: Interest.Rate, dtype: float64
<class 'pandas.core.series.Series'>

81174    20000
99592    19200
80059    35000
15825    10000
33182    12000
Name: Amount.Requested, dtype: int64
<class 'pandas.core.series.Series'>

81174    735
99592    715
80059    690
15825    695
33182    695
Name: FICO.Score, dtype: int64
<class 'pandas.core.series.Series'>

81174     6541.67
99592     4583.33
80059    11500.00
15825     3833.33
33182     3195.00
Name: Monthly.Income, dtype: float64
<class 'pandas.core.series.Series'>


In [12]:
## Home ownership is text, so encode it as an integer rank:
## OWN=4, MORTGAGE=3, RENT=2, OTHER=1 and anything else=0.
## NOTE(review): the ranking is a hand-picked weighting - confirm it is appropriate before modelling with it.
ownership_rank = {'OWN': 4, 'MORTGAGE': 3, 'RENT': 2, 'OTHER': 1}
house_ownership = [ownership_rank.get(status, 0) for status in loansData['Home.Ownership']]

print(house_ownership[:5])
print(type(house_ownership))

[3, 3, 3, 3, 2]
<type 'list'>


In [15]:
# The dependent variable (interest rate), reshaped into a single-column matrix
IntRate = np.matrix(intrate).T
print(IntRate[:5])
print(type(IntRate))

[[ 0.089]
 [ 0.121]
 [ 0.22 ]
 [ 0.1  ]
 [ 0.117]]
<class 'numpy.matrixlib.defmatrix.matrix'>


In [16]:
# The built-in sum adds the matrix rows together, giving a 1x1 matrix holding the column total
sum(IntRate)

matrix([[ 326.555]])

In [17]:
# The independent variables, each reshaped into a single-column matrix
FICO = np.matrix(fico).T
LoanAmount = np.matrix(loanamt).T
MonInc = np.matrix(monthly_income).T
HomeStatus = np.matrix(house_ownership).T

# For every column: preview the first five rows, then the column total, then a blank line
for column in (FICO, LoanAmount, MonInc, HomeStatus):
    print(column[:5])
    print(sum(column))
    print('')

[[735]
 [715]
 [690]
 [695]
 [695]]
[[1763920]]

[[20000]
 [19200]
 [35000]
 [10000]
 [12000]]
[[31011250]]

[[  6541.67]
 [  4583.33]
 [ 11500.  ]
 [  3833.33]
 [  3195.  ]]
[[ 14216639.37]]

[[3]
 [3]
 [3]
 [3]
 [2]]
[[6541]]



In [20]:
# Place the predictors for the first model run side by side, one column each.
# Home status is deliberately left out of this first run.
run1_predictors = (FICO, LoanAmount, MonInc)
stacked_run1 = np.column_stack(run1_predictors)

# Preview the first five rows, then the per-column totals
print(stacked_run1[:5])
print('')
print(sum(stacked_run1))

[[   735.    20000.     6541.67]
 [   715.    19200.     4583.33]
 [   690.    35000.    11500.  ]
 [   695.    10000.     3833.33]
 [   695.    12000.     3195.  ]]
[[  1763920.    31011250.    14216639.37]]


In [21]:
# Build the design matrix by adding a constant column to the stacked predictors
run1 = sm.add_constant(stacked_run1)

# Ordinary least squares with the interest rate as the response
model_run1 = sm.OLS(endog=IntRate, exog=run1)
run1_fitted = model_run1.fit()

# Render the regression summary as the cell output
result_run1 = run1_fitted.summary()
result_run1

0,1,2,3
Dep. Variable:,y,R-squared:,0.66
Model:,OLS,Adj. R-squared:,0.659
Method:,Least Squares,F-statistic:,1613.0
Date:,"Tue, 20 Oct 2015",Prob (F-statistic):,0.0
Time:,15:24:08,Log-Likelihood:,5734.3
No. Observations:,2499,AIC:,-11460.0
Df Residuals:,2495,BIC:,-11440.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,0.7279,0.010,73.688,0.000,0.709 0.747
x1,-0.0009,1.41e-05,-62.549,0.000,-0.001 -0.001
x2,2.236e-06,6.8e-08,32.869,0.000,2.1e-06 2.37e-06
x3,-6.374e-07,1.34e-07,-4.741,0.000,-9.01e-07 -3.74e-07

0,1,2,3
Omnibus:,70.934,Durbin-Watson:,1.976
Prob(Omnibus):,0.0,Jarque-Bera (JB):,79.339
Skew:,0.385,Prob(JB):,5.91e-18
Kurtosis:,3.41,Cond. No.,319000.0


In [24]:
# Keep the fitted parameter vector from the results for the hand calculations below
coeff_run1 = run1_fitted.params
print(coeff_run1)

[ 0.72792692 -0.00088032  0.00000224 -0.00000064]


In [26]:
print 'Coefficients: ', run1_fitted.params[0:2]
print 'Intercept: ', run1_fitted.params[2]
print 'P-Values: ', run1_fitted.pvalues
print 'R-Squared: ', run1_fitted.rsquared

Coefficients:  [ 0.72792692 -0.00088032]
Intercept:  2.23582087715e-06
P-Values:  [ 0.          0.          0.          0.00000224]
R-Squared:  0.659807170928


## Using the Fitted Equation to Predict Interest Rates (Model Testing Phase)

In [25]:
# Recall the cleaned data (first five rows) before applying the fitted equation to it:
loansData.head()

Unnamed: 0,Interest.Rate,Monthly.Income,Home.Ownership,Amount.Requested,FICO.Score
81174,0.089,6541.67,MORTGAGE,20000,735
99592,0.121,4583.33,MORTGAGE,19200,715
80059,0.22,11500.0,MORTGAGE,35000,690
15825,0.1,3833.33,MORTGAGE,10000,695
33182,0.117,3195.0,RENT,12000,695


In [29]:
# Show the four fitted parameters one per line, in the order they are stored
for position in range(4):
    print(coeff_run1[position])

0.727926922642
-0.000880316336608
2.23582087715e-06
-6.37449340405e-07


In [30]:
# Hand-compute the prediction for the first loan in the preview above:
# FICO score 735, amount requested 20000, monthly income 6541.67
sample_fico, sample_amount, sample_income = 735, 20000, 6541.67
test_case = (coeff_run1[0]
             + (coeff_run1[1] * sample_fico)
             + (coeff_run1[2] * sample_amount)
             + (coeff_run1[3] * sample_income))
test_case

0.12144084955145877

In [31]:
# Work on an independent copy (copy() is deep by default) so that columns
# added to test_copy do not appear in loansData
test_copy = loansData.copy()
test_copy.head()

Unnamed: 0,Interest.Rate,Monthly.Income,Home.Ownership,Amount.Requested,FICO.Score
81174,0.089,6541.67,MORTGAGE,20000,735
99592,0.121,4583.33,MORTGAGE,19200,715
80059,0.22,11500.0,MORTGAGE,35000,690
15825,0.1,3833.33,MORTGAGE,10000,695
33182,0.117,3195.0,RENT,12000,695


In [37]:
# Apply the fitted equation to every row: constant + slope * value for each predictor
fico_part = coeff_run1[1] * test_copy['FICO.Score']
amount_part = coeff_run1[2] * test_copy['Amount.Requested']
income_part = coeff_run1[3] * test_copy['Monthly.Income']
test_copy['predicted_int_rate'] = coeff_run1[0] + fico_part + amount_part + income_part

In [38]:
# Show the first ten loans so Interest.Rate can be compared with predicted_int_rate
test_copy.head(10)

Unnamed: 0,Interest.Rate,Monthly.Income,Home.Ownership,Amount.Requested,FICO.Score,predicted_int_rate
81174,0.089,6541.67,MORTGAGE,20000,735,0.121441
99592,0.121,4583.33,MORTGAGE,19200,715,0.138507
80059,0.22,11500.0,MORTGAGE,35000,690,0.191432
15825,0.1,3833.33,MORTGAGE,10000,695,0.136022
33182,0.117,3195.0,RENT,12000,695,0.1409
62403,0.153,4891.67,OWN,6000,670,0.148412
48808,0.079,2916.67,RENT,10000,720,0.114598
22090,0.171,13863.42,MORTGAGE,33500,705,0.173367
76404,0.143,3150.0,RENT,14675,685,0.155713
15867,0.069,5000.0,RENT,7000,715,0.110964
