In [1]:
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, scale
from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from matplotlib import pyplot as plt
from operator import itemgetter

%matplotlib inline

In [2]:
# Columns holding categorical labels that get replaced by indicator variables.
categorical_cols = ['League', 'Division', 'NewLeague']

# Read the data (first CSV column becomes the index) and discard incomplete rows.
hitters = pd.read_csv('./data/Hitters.csv', index_col=0).dropna()

# Indicator (dummy) columns for every level of each categorical.
dummies = pd.get_dummies(hitters[categorical_cols])

# Swap the raw categoricals for one indicator column each.
hitters = pd.concat(
    [
        hitters.drop(columns=categorical_cols),
        dummies[['League_N', 'Division_W', 'NewLeague_N']],
    ],
    axis=1,
)

hitters.head(2)

Unnamed: 0_level_0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,PutOuts,Assists,Errors,Salary,League_N,Division_W,NewLeague_N
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
-Alan Ashby,315,81,7,24,38,39,14,3449,835,69,321,414,375,632,43,10,475.0,1,1,1
-Alvin Davis,479,130,18,66,72,76,3,1624,457,63,224,266,263,880,82,14,480.0,0,1,0


In [3]:
# Response is Salary; the predictors are every other column of `hitters`.
y = hitters['Salary']
X = hitters.drop(columns='Salary')

In [93]:
# Penalty grid: 100 values spaced evenly on a log10 scale from 1e-4 up to 1e2.
alphas = np.logspace(-4, 2, num=100)

In [94]:
# Ridge path: one fitted Ridge model per penalty in `alphas`, collected in `regr`.
#
# NOTE: Ridge's `normalize=True` option (deprecated in scikit-learn 1.0, removed in 1.2)
# did not standardize the regressors by their standard deviation: it centred each column
# and divided it by the l2 norm of the centred column. That scaling is reproduced
# explicitly here so the cell runs on current scikit-learn with the same penalty scale.
# It does not depend on alpha, so it is computed once outside the loop.
X_float = X.astype(float)
col_means = X_float.mean(axis=0)
X_centered = X_float - col_means
col_norms = np.sqrt((X_centered ** 2).sum(axis=0))
col_norms = col_norms.replace(0, 1.0)  # constant columns: avoid dividing by zero
X_normed = X_centered / col_norms

regr = []
for alp in alphas:
    model = Ridge(alpha=alp, fit_intercept=True)
    model.fit(X_normed, y)

    # Express the fit in the original units of X, so that coef_ / intercept_ mean the
    # same thing as they did with normalize=True (and predict() accepts raw X).
    model.coef_ = model.coef_ / col_norms.to_numpy()
    model.intercept_ = model.intercept_ - col_means.to_numpy() @ model.coef_

    regr.append(model)
    
    
    

In [95]:
# Gather every fitted model into one table: one column per position in `alphas`,
# with rows 'alpha', 'Intercept', then one coefficient per predictor in X's column order.
# The frame is built in a single constructor call: Series.append no longer exists in
# pandas >= 2.0, and inserting columns one at a time fragments the DataFrame.
row_labels = ['alpha', 'Intercept'] + X.columns.tolist()
df = pd.DataFrame(
    {
        i: np.hstack([regr[i].alpha, regr[i].intercept_, regr[i].coef_])
        for i in range(len(alphas))
    },
    index=row_labels,
)

    

In [96]:
def l2_norms(i):
    """Print column ``i`` of the global table ``df`` and the l2 norm of its betas.

    The first two entries of the column (rows 'alpha' and 'Intercept' in the table
    built by this notebook) are left out of the norm; only entries from position 2
    onward are squared and summed.

    Parameters
    ----------
    i : column label of ``df`` to report on.

    Returns
    -------
    None; the column and the norm are written to standard output.
    """
    column = df[i]
    print(column, '\n')

    squared_betas = column.iloc[2:].apply(lambda beta: beta**2)
    print('l2 norm of Betas=', np.sqrt(sum(squared_betas)))

In [97]:
# Coefficients and their l2 norm for grid position 89 (alpha is about 24.77 in the output below).
l2_norms(89)

alpha           24.770764
Intercept      404.264758
AtBat            0.037839
Hits             0.141609
HmRun            0.536690
Runs             0.236334
RBI              0.245584
Walks            0.296780
Years            1.132897
CAtBat           0.003206
CHits            0.011932
CHmRun           0.089629
CRuns            0.023939
CRBI             0.024716
CWalks           0.025598
PutOuts          0.016928
Assists          0.002681
Errors          -0.021315
League_N         0.101226
Division_W      -6.391921
NewLeague_N      0.319640
Name: 89, dtype: float64 

l2 norm of Betas= 6.540417563468825


In [92]:
# NOTE(review): the output recorded below is stale. This cell's execution count (In [92])
# is earlier than the cell that defines `alphas` (In [93]), and the alpha it shows (533.67)
# is larger than the biggest value of the current grid (10**2 = 100). With the current grid,
# position 60 is alpha of about 0.43. Re-run with Restart Kernel -> Run All to refresh it.
l2_norms(60)

alpha          533.669923
Intercept      528.179165
AtBat            0.002236
Hits             0.008121
HmRun            0.032628
Runs             0.013725
RBI              0.014486
Walks            0.017071
Years            0.069574
CAtBat           0.000192
CHits            0.000706
CHmRun           0.005325
CRuns            0.001417
CRBI             0.001462
CWalks           0.001545
PutOuts          0.000900
Assists          0.000147
Errors          -0.000706
League_N        -0.022345
Division_W      -0.323351
NewLeague_N     -0.003542
Name: 60, dtype: float64 

l2 norm of Betas= 0.33432066444400144


In [5]:
# StandardScaler rescales each feature (column of X) on its own, so that every
# column ends up with mean 0 and standard deviation 1.

scaler = StandardScaler().fit(X)
scaler.transform(X)

array([[-0.6029005 , -0.59567545, -0.5285512 , ...,  1.05875764,
         0.98116592,  1.07505242],
       [ 0.51254171,  0.49225957,  0.72996619, ..., -0.94450322,
         0.98116592, -0.9301872 ],
       [ 0.62816682,  0.73648988,  0.95878753, ...,  1.05875764,
        -1.01919561,  1.07505242],
       ...,
       [ 0.48533581,  0.40344855, -0.98619389, ..., -0.94450322,
         0.98116592, -0.9301872 ],
       [ 1.15188054,  0.80309815, -0.29972986, ..., -0.94450322,
        -1.01919561, -0.9301872 ],
       [ 1.54636621,  1.38036979, -0.29972986, ..., -0.94450322,
         0.98116592, -0.9301872 ]])