### Lesson 7 - Model Fit Evaluation

Let's go through the usual library imports

In [1]:
import numpy as np
import pandas as pd
from sklearn import linear_model, metrics


## Cross validation
#### Intro to cross validation with bike share data from last time. We will be modeling casual ridership. 

In [5]:
from sklearn import cross_validation

# Load the bike share data used in the previous lesson and list its columns,
# so we can see which fields are available as predictors and outcomes.
bikeshare = pd.read_csv('bikeshare.csv')
print(bikeshare.columns)

Index([u'datetime', u'season', u'holiday', u'workingday', u'weather', u'temp',
       u'atemp', u'humidity', u'windspeed', u'casual', u'registered',
       u'count'],
      dtype='object')


#### Create dummy variables and set outcome (dependent) variable

In [9]:
#make a dummy variable for weather
weather = pd.get_dummies(bikeshare.weather, prefix='weather')

#make a data of just the predictors
#because the dummies are a separate dataframe, we can join it to the main bikeframe dataframe
modeldata = bikeshare[['temp', 'humidity']].join(weather[['weather_1', 'weather_2', 'weather_3']])

#our outcome will be the number of casual (non-registered) users who checked out a bike for a given hour
y = bikeshare['casual'] 

#Let's look at the predictors and outcome
print modeldata.head(5)
print y.head(5)

   temp  humidity  weather_1  weather_2  weather_3
0  9.84        81        1.0        0.0        0.0
1  9.02        80        1.0        0.0        0.0
2  9.02        80        1.0        0.0        0.0
3  9.84        75        1.0        0.0        0.0
4  9.84        75        1.0        0.0        0.0
0    3
1    8
2    5
3    3
4    0
Name: casual, dtype: int64


#### Create a cross-validation iterator with 10 folds

In [25]:
# KFold arguments:
#   1st positional: the total number of data points to split
#   n_folds:        the number of folds (K=10)
#   shuffle:        shuffle the rows before cutting them into folds. The folds
#                   still do not overlap: each row is in exactly one test fold.
#   random_state:   fixed seed, so the shuffled folds (and the scores computed
#                   from them) are the same every time the notebook is run.
kf = cross_validation.KFold(len(modeldata), n_folds=10, shuffle=True, random_state=42)

Let's train and test the model on each fold, saving the RMSE and R2 values for each iteration.

In [26]:
rmse_values = []
scores = []
n= 0
print "#### CROSS VALIDATION each fold ####"
for train_index, test_index in kf:
    lm = linear_model.LinearRegression().fit(modeldata.iloc[train_index], y.iloc[train_index])
    mse = metrics.mean_squared_error(y.iloc[test_index], lm.predict(modeldata.iloc[test_index]))
    rmse_values.append(mse**.5)
    scores.append(lm.score(modeldata.iloc[test_index], y.iloc[test_index]))
    n+=1
    print 'Model', n
    print 'RMSE:', rmse_values[n-1]
    print 'R2:', scores[n-1]


print "####  SUMMARY OF CROSS VALIDATION #####"
print 'Mean of RMSE for all folds:', np.mean(rmse_values)
print 'Mean of R2 for all folds:', np.mean(scores)

#### CROSS VALIDATION each fold ####
Model 1
RMSE: 41.062533158
R2: 0.324327989739
Model 2
RMSE: 43.2653673197
R2: 0.298698060829
Model 3
RMSE: 40.9712544946
R2: 0.320685718491
Model 4
RMSE: 40.4331182486
R2: 0.314459145793
Model 5
RMSE: 39.8868064094
R2: 0.31886926231
Model 6
RMSE: 40.8035644371
R2: 0.337992422391
Model 7
RMSE: 40.931741824
R2: 0.326330539969
Model 8
RMSE: 41.7799125661
R2: 0.313206981932
Model 9
RMSE: 41.2591462257
R2: 0.337130446154
Model 10
RMSE: 42.8307950271
R2: 0.268925957268
####  SUMMARY OF CROSS VALIDATION #####
Mean of RMSE for all folds: 41.322423971
Mean of R2 for all folds: 0.316062652488


In [15]:
lm = linear_model.LinearRegression().fit(modeldata, y)
print "### Single Model ###"
print 'RMSE of single model:', metrics.mean_squared_error(y, lm.predict(modeldata))**.5
print 'R2: ', lm.score(modeldata, y)

### Single Model ###
RMSE of single model: 41.3217895435
R2:  0.316589157827


### Check
Which of the two approaches would predict new data more accurately: the single model or the cross validated, averaged one?


### Advanced: There are ways to improve our model with regularization. 
Let's check out the effects on RMSE and R2

In [41]:
print "### OLS ###"
lm = linear_model.LinearRegression().fit(modeldata, y)
print "### OLS ###"
print 'OLS RMSE: ', metrics.mean_squared_error(y, lm.predict(modeldata))**.5
print 'OLS R2:', lm.score(modeldata, y)

print "~~~ Ridge ~~~"
ridge = linear_model.RidgeCV(alphas=[0.1, 2.0, 20.0])
ridge.fit(modeldata,y)       
print 'Ridge RMSE: ', metrics.mean_squared_error(y, ridge.predict(modeldata))**.5
print 'Ridge R2:', ridge.score(modeldata, y)

### OLS ###
### OLS ###
OLS RMSE:  41.3217895435
OLS R2: 0.316589157827
~~~ Ridge ~~~
Ridge RMSE:  41.3224536218
Ridge R2: 0.316567191599


#### Examine cross-validation performance

In [47]:
kf = cross_validation.KFold(len(modeldata), n_folds=8, shuffle=True)
OLS_rmse_values = []
Ridge_rmse_values = []
n= 0
print "#### CROSS VALIDATION each fold ####"
for train_index, test_index in kf:
    lm = linear_model.LinearRegression().fit(modeldata.iloc[train_index], y.iloc[train_index])
    mse = metrics.mean_squared_error(y.iloc[test_index], lm.predict(modeldata.iloc[test_index]))
    OLS_rmse_values.append(mse**.5)

    ridge = linear_model.RidgeCV(alphas=[0.1, 1.0, 10.0]).fit(modeldata.iloc[train_index], y.iloc[train_index])
    mse = metrics.mean_squared_error(y.iloc[test_index], ridge.predict(modeldata.iloc[test_index]))
    Ridge_rmse_values.append(mse**.5)

    n+=1


print "####  SUMMARY OF CROSS VALIDATION #####"
print 'Mean of RMSE for OLS:', np.mean(OLS_rmse_values)
print 'Mean of RMSE for Ridge:', np.mean(Ridge_rmse_values)

if np.mean(OLS_rmse_values) < np.mean(Ridge_rmse_values):print "OLS Performed Better"
if np.mean(OLS_rmse_values) > np.mean(Ridge_rmse_values):print "Ridge Performed Better"

#### CROSS VALIDATION each fold ####
####  SUMMARY OF CROSS VALIDATION #####
Mean of RMSE for OLS: 41.2522043799
Mean of RMSE for Ridge: 41.2521689869
Ridge Performed Better


## Example Application of Gradient Descent 

In [65]:


# SGD is very senstitive to varying-sized feature values. 
# So, first we need to do feature scaling (makes all features have a mean of 0 and Std. Dev of 1)

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_data= scaler.fit_transform(modeldata)
lm = linear_model.SGDRegressor()
lm.fit(scaled_data, y)
print "Gradient Descent RMSE:", metrics.mean_squared_error(y, lm.predict(scaled_data))**.5
print "Gradient Descent R2:", lm.score(scaled_data, y)

lm = linear_model.LinearRegression()
lm.fit(modeldata, y)
print ""
print 'OLS RMSE: ', metrics.mean_squared_error(y, lm.predict(modeldata))**.5
print 'OLS R2:', lm.score(modeldata, y)

lm = linear_model.SGDRegressor()

Gradient Descent RMSE: 41.3406146261
Gradient Descent R2: 0.315966329231

OLS RMSE:  41.3217895435
OLS R2: 0.316589157827


Check: How well did gradient descent perform compared to OLS?

# Independent Practice: 

In the code folder, you'll find datasets and data dictionaries for various domains.

Choose one to examine, think of an outcome you would like to predict and build a regression model to explain that outcome.

Feel free to choose any type of regression method, and apply any dummy coding or transformations to your variables.

How well does your model fit cross-validated data? How much cross-validation error did your model observe on average?

In the final 20 minutes of class, we'll take time to share our results with the group.


### Starter Code

In [None]:
#Read in your dataset
df = pd.read_csv("")
#display the possible predictors and outcomes
print df.columns

#enter in what you want your predictors to be
predictors = ['','','']

#enter what you want your outcome to be
outcome =''


X=df[predictors]
y=df[outcome]

#the lists will the R2 and RMSE from the cross validation iterations
R2_scores = []
RMSE_scores =[]

#create a cross-validation iterator
kf = cross_validation.KFold(len(modeldata), n_folds=10, shuffle=True)

for train_index, test_index in kf:
####finish the code

