In [36]:
#import packages
import numpy as py
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LassoLars
from sklearn.linear_model import LassoCV
%matplotlib inline 

In [3]:
prostate = pd.read_csv('prostate.csv')

In [4]:
prostate.head()

Unnamed: 0,Obs,lcavol,lweight,age,lbph,svi,lcp,gleason,pgg45,lpsa
0,1,-0.579818,2.769459,50,-1.386294,0,-1.386294,6,0,-0.430783
1,2,-0.994252,3.319626,58,-1.386294,0,-1.386294,6,0,-0.162519
2,3,-0.510826,2.691243,74,-1.386294,0,-1.386294,7,20,-0.162519
3,4,-1.203973,3.282789,58,-1.386294,0,-1.386294,6,0,-0.162519
4,5,0.751416,3.432373,62,-1.386294,0,-1.386294,6,0,0.371564


In [5]:
prostate.shape

(97, 10)

In [8]:
# check count of nulls in columns

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
print(prostate.isnull().sum())

Obs        0
lcavol     0
lweight    0
age        0
lbph       0
svi        0
lcp        0
gleason    0
pgg45      0
lpsa       0
dtype: int64


In [10]:
# drop Obs, irrelevant
prostate = prostate.drop(['Obs'], axis=1)

# move lpsa to first as target
targetName = 'lpsa'
targetSeries = prostate[targetName]

#remove target from current location and insert in collum 0
del prostate[targetName]
prostate.insert(0, targetName, targetSeries)

prostate.head()

Unnamed: 0,lpsa,lcavol,lweight,age,lbph,svi,lcp,gleason,pgg45
0,-0.430783,-0.579818,2.769459,50,-1.386294,0,-1.386294,6,0
1,-0.162519,-0.994252,3.319626,58,-1.386294,0,-1.386294,6,0
2,-0.162519,-0.510826,2.691243,74,-1.386294,0,-1.386294,7,20
3,-0.162519,-1.203973,3.282789,58,-1.386294,0,-1.386294,6,0
4,0.371564,0.751416,3.432373,62,-1.386294,0,-1.386294,6,0


In [11]:
# breaking our dataset into the target and the predictors
prostate.target=prostate['lpsa'] 
prostate.features=prostate.drop(['lpsa'], axis=1)                         
print(prostate.target.shape)
print(prostate.features.shape)

(97,)
(97, 8)


In [23]:
## source Dr. Gartland - Linear Regression- More Advanced Techniques

# Helper function to fit, cross validate, and assess
# slight automation, this is one way of not having to repeaatedly fit and print your measures
# 

from sklearn.model_selection import *
def train_and_evaluate(model, features, target):
    
    model.fit(features, target)
    predicted_LR = model.predict(features);
    
    print("Coefficient of determination on training set:",model.score(features, target))
    
    # create a k-fold croos validation iterator of k=5 folds
    scores = cross_val_score(model, features, target, cv=5)
    print("Average coefficient of determination using 5-fold crossvalidation:",py.mean(scores))
    
    print("Coef", model.intercept_, model.coef_)
    print("MSE", mean_squared_error(target, predicted_LR))

## Linear Regression

In [24]:
LRModel = LinearRegression(normalize=True)
train_and_evaluate(LRModel, prostate.features, prostate.target)

Coefficient of determination on training set: 0.663389565499
Average coefficient of determination using 5-fold crossvalidation: -9.73426138478
Coef 0.181560845469 [ 0.56434128  0.62201979 -0.02124819  0.09671252  0.7616734  -0.10605094
  0.04922793  0.00445751]
MSE 0.4439012241


## Ridge Regression

In [26]:
RGModel = Ridge(alpha=20)
train_and_evaluate(RGModel, prostate.features, prostate.target)

Coefficient of determination on training set: 0.623580506206
Average coefficient of determination using 5-fold crossvalidation: -9.32114285063
Coef 1.34977433997 [ 0.4907651   0.28356959 -0.0123061   0.10538182  0.25801323  0.04535242
  0.0058513   0.00513288]
MSE 0.496398973246


In [37]:
# grid search on ridge


# use a full grid over several parameters and cross validate 5 times
from sklearn.model_selection import GridSearchCV
param_grid = {"alpha": [.01,.1, .5, 1, 2]}
#param_grid={"alpha": [1,10,1]} this does a range 1 through 10 changes by a factor of 1. 
#param_grid={"alpha": [.01,1,.05]} this does a range 1 through 1 changes by a factor of .05

# run grid search
grid_search = GridSearchCV(RGModel, param_grid=param_grid,n_jobs=-1,cv=5)
grid_search.fit(prostate.features, prostate.target)
print("Grid Scores", grid_search.cv_results_)
print("Best", grid_search.best_params_)

Grid Scores {'mean_fit_time': array([ 0.02317567,  0.00625   ,  0.        ,  0.00624943,  0.        ]), 'std_fit_time': array([ 0.0168841 ,  0.00765466,  0.        ,  0.00765395,  0.        ]), 'mean_score_time': array([ 0.,  0.,  0.,  0.,  0.]), 'std_score_time': array([ 0.,  0.,  0.,  0.,  0.]), 'param_alpha': masked_array(data = [0.01 0.1 0.5 1 2],
             mask = [False False False False False],
       fill_value = ?)
, 'params': [{'alpha': 0.01}, {'alpha': 0.1}, {'alpha': 0.5}, {'alpha': 1}, {'alpha': 2}], 'split0_test_score': array([-2.72356144, -2.72951823, -2.75584924, -2.78838061, -2.8519637 ]), 'split1_test_score': array([-15.81573693, -15.77216208, -15.59939222, -15.42277382, -15.1610955 ]), 'split2_test_score': array([-13.35088349, -13.3049951 , -13.11851307, -12.92000036, -12.61126985]), 'split3_test_score': array([-13.93868432, -13.87590764, -13.64042357, -13.42470001, -13.16353786]), 'split4_test_score': array([-2.82705217, -2.83887254, -2.88832586, -2.94413912, -3.0

## Lasso Regression with Cross Validation

In [35]:
LASModel = LassoCV(alphas=[.01,.1, .5, 1, 2], cv=5)
train_and_evaluate(LASModel, prostate.features, prostate.target)

Coefficient of determination on training set: 0.661215173583
Average coefficient of determination using 5-fold crossvalidation: -9.65656022076
Coef 0.621962584419 [ 0.55854576  0.55859432 -0.01884546  0.09417079  0.63020145 -0.06349113
  0.          0.0049868 ]
MSE 0.44676867898


## ElasticNet Regression

In [32]:
ENModel = ElasticNet(alpha=.5)
train_and_evaluate(ENModel, prostate.features, prostate.target)

Coefficient of determination on training set: 0.475792806864
Average coefficient of determination using 5-fold crossvalidation: -9.18036541122
Coef 1.73448461185 [ 0.36370419  0.          0.          0.          0.          0.          0.
  0.01037257]
MSE 0.691292339348


## Lasso Lars Regression

In [34]:
LLModel = LassoLars(alpha=1)
train_and_evaluate(LLModel, prostate.features, prostate.target)

Coefficient of determination on training set: 0.0
Average coefficient of determination using 5-fold crossvalidation: -8.97676974076
Coef 2.47838687835 [ 0.  0.  0.  0.  0.  0.  0.  0.]
MSE 1.31873875139
