In [1]:
#Import modules
import pandas as pd
import statsmodels.formula.api as smf

from sklearn.cross_validation import KFold

In [2]:
#Load raw data into data frame
raw_data = pd.read_csv('loansData.csv')

In [3]:
#Convert interest rate from string to float
raw_data['IntRate'] = [float(val.strip('%')) 
                       for val in raw_data['Interest.Rate'].values]

In [4]:
#Code home ownership status
raw_data['Ownership'] = raw_data['Home.Ownership'].astype('category')
raw_data['Ownership'] = raw_data['Ownership'].cat.codes

In [5]:
#Calculate yearly income from monthly income
raw_data['YearlyIncome'] = raw_data['Monthly.Income'] * 12

In [6]:
#Build regression model
model = smf.ols(formula = 'raw_data["IntRate"] \
                            ~ raw_data["Ownership"] \
                            + raw_data["YearlyIncome"] \
                            + raw_data["Ownership"] * raw_data["YearlyIncome"]', 
                            data = raw_data).fit()

model.summary()

0,1,2,3
Dep. Variable:,"raw_data[""IntRate""]",R-squared:,0.008
Model:,OLS,Adj. R-squared:,0.006
Method:,Least Squares,F-statistic:,6.327
Date:,"Sat, 08 Oct 2016",Prob (F-statistic):,0.000285
Time:,09:00:30,Log-Likelihood:,-7108.8
No. Observations:,2499,AIC:,14230.0
Df Residuals:,2495,BIC:,14250.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,12.6691,0.206,61.512,0.000,12.265,13.073
"raw_data[""Ownership""]",0.0614,0.081,0.759,0.448,-0.097,0.220
"raw_data[""YearlyIncome""]",8.219e-07,2.09e-06,0.393,0.694,-3.28e-06,4.92e-06
"raw_data[""Ownership""]:raw_data[""YearlyIncome""]",1.773e-06,1.06e-06,1.676,0.094,-3.02e-07,3.85e-06

0,1,2,3
Omnibus:,66.89,Durbin-Watson:,1.996
Prob(Omnibus):,0.0,Jarque-Bera (JB):,53.547
Skew:,0.279,Prob(JB):,2.36e-12
Kurtosis:,2.549,Cond. No.,506000.0


In [7]:
#Build cross validation iterator
cval_itr = KFold(len(raw_data), n_folds = 10)

perf_metrics = pd.DataFrame(index = range(10), columns = ['MSE', 'MAE', 'R2'])
i = 0

for train, test in cval_itr:
    #Load training data into a data frame
    train_data = pd.DataFrame()
    train_data['IntRate'] = raw_data['IntRate'].values[train]
    train_data['Ownership'] = raw_data['Ownership'].values[train]
    train_data['YearlyIncome'] = raw_data['YearlyIncome'].values[train]
    
    #Load test data into a data frame
    test_data = pd.DataFrame()
    test_data['IntRate'] = raw_data['IntRate'].values[test]
    test_data['Ownership'] = raw_data['Ownership'].values[test]
    test_data['YearlyIncome'] = raw_data['YearlyIncome'].values[test]
    
    #Model training data
    model = smf.ols(formula = 'train_data["IntRate"] \
                                ~ train_data["Ownership"] \
                                + train_data["YearlyIncome"] \
                                + train_data["Ownership"] * train_data["YearlyIncome"]', 
                                    data = train_data).fit()
    
    #Evaluate model against test data
    test_data['Pred_IntRate'] = (model.params[0] 
                                 + model.params[1] * test_data['Ownership']
                                 + model.params[2] * test_data['YearlyIncome']
                                 + model.params[3] * test_data['Ownership'] * test_data['Ownership'])
    
    #Calculate performance metrics
    SSres = sum( (test_data['Pred_IntRate'] - test_data['IntRate'])**2)
    
    SStot = sum( (test_data['IntRate'] - test_data['IntRate'].mean())**2)
    
    MSE = SSres / len(test_data)
    
    MAE = sum( abs(test_data['Pred_IntRate'] - test_data['IntRate'])) / len(test_data)
    
    R2 = 1 - (SSres / SStot)
    
    perf_metrics.loc[i] = [MSE, MAE, R2]
    
    i += 1

In [None]:
perf_metrics

In [8]:
print('Average mean squared error for entire cross validation : %0.2f\n' % perf_metrics['MSE'].mean())
print('Average mean absolute error for entire cross validation : %0.2f\n' % perf_metrics['MAE'].mean())
print('Average R squared for entire cross validation : %0.2f\n' % perf_metrics['R2'].mean())

Average mean squared error for entire cross validation : 17.51

Average mean absolute error for entire cross validation : 3.38

Average R squared for entire cross validation : -0.00



In [None]:
#The performance metrics indicate similar conclusion to the previous analysis
#on the loan data.  The model does not predict the variation in the data
#very well (R2 = 0.0).  MSE and MAE support the conclusion drawn from
#R2.  MAE states that the model either predicts the interest rate to be 
#on average +/- 3.38 % of the true interest rate.  MSE indicated how closely
#the model matches the available data, therefore a small MSE is better.
#The large MSE shown above suggests the model does not match the data
#very well.