In [10]:
import pandas as pd
from sklearn.datasets import fetch_california_housing

cal_housing = fetch_california_housing()
x = pd.DataFrame(cal_housing.data, columns=cal_housing.feature_names)
y = pd.DataFrame(cal_housing.target)

In [11]:
x.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [12]:
y.head()

Unnamed: 0,0
0,4.526
1,3.585
2,3.521
3,3.413
4,3.422


In [3]:
from sklearn.linear_model import LinearRegression
my_model = LinearRegression()

In [4]:
my_model.fit(x,y)

LinearRegression()

#### The below cell shows the corresponding values for the coefficients in the DF below - 4.3 corresponds to MedInc, and so on.

In [5]:
my_model.coef_

array([ 4.36693293e-01,  9.43577803e-03, -1.07322041e-01,  6.45065694e-01,
       -3.97638942e-06, -3.78654265e-03, -4.21314378e-01, -4.34513755e-01])

In [9]:
x

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32


#### calculate all the means and variances of each column. And then we're going to apply them to that original data frame.

In [10]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)

In [11]:
x_scaled

array([[ 2.34476576,  0.98214266,  0.62855945, ..., -0.04959654,
         1.05254828, -1.32783522],
       [ 2.33223796, -0.60701891,  0.32704136, ..., -0.09251223,
         1.04318455, -1.32284391],
       [ 1.7826994 ,  1.85618152,  1.15562047, ..., -0.02584253,
         1.03850269, -1.33282653],
       ...,
       [-1.14259331, -0.92485123, -0.09031802, ..., -0.0717345 ,
         1.77823747, -0.8237132 ],
       [-1.05458292, -0.84539315, -0.04021111, ..., -0.09122515,
         1.77823747, -0.87362627],
       [-0.78012947, -1.00430931, -0.07044252, ..., -0.04368215,
         1.75014627, -0.83369581]])

#### Adding names back to the array which was above, just for visuals sake

In [13]:
x_scaled = pd.DataFrame(data=x_scaled, columns=cal_housing.feature_names)
x_scaled

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,2.344766,0.982143,0.628559,-0.153758,-0.974429,-0.049597,1.052548,-1.327835
1,2.332238,-0.607019,0.327041,-0.263336,0.861439,-0.092512,1.043185,-1.322844
2,1.782699,1.856182,1.155620,-0.049016,-0.820777,-0.025843,1.038503,-1.332827
3,0.932968,1.856182,0.156966,-0.049833,-0.766028,-0.050329,1.038503,-1.337818
4,-0.012881,1.856182,0.344711,-0.032906,-0.759847,-0.085616,1.038503,-1.337818
...,...,...,...,...,...,...,...,...
20635,-1.216128,-0.289187,-0.155023,0.077354,-0.512592,-0.049110,1.801647,-0.758826
20636,-0.691593,-0.845393,0.276881,0.462365,-0.944405,0.005021,1.806329,-0.818722
20637,-1.142593,-0.924851,-0.090318,0.049414,-0.369537,-0.071735,1.778237,-0.823713
20638,-1.054583,-0.845393,-0.040211,0.158778,-0.604429,-0.091225,1.778237,-0.873626


#### create a new model so that we can keep the results of the old model. So we'll just call it the model or scaled model.

In [15]:
scaled_model = LinearRegression()
scaled_model.fit(x_scaled, y)

LinearRegression()

#### Variable importances below, for example: We see a big drop in importance from the original values whereas the fifth variable dropped from 0.6 to 0.004.

In [16]:
scaled_model.coef_

array([ 0.8296193 ,  0.11875165, -0.26552688,  0.30569623, -0.004503  ,
       -0.03932627, -0.89988565, -0.870541  ])

#### Variable importances listed below. AvgRooms is suppressing the price of the house, AvgBedRooms is increasing the price of the house. Population column is close to zero and has next to no impact.

In [18]:
for i in range(len(x_scaled.columns)):
    print(f"{x_scaled.columns[i]}: {scaled_model.coef_[i]}")

MedInc: 0.8296193042804504
HouseAge: 0.11875165121214162
AveRooms: -0.26552687950662046
AveBedrms: 0.30569622980430894
Population: -0.004503001312614049
AveOccup: -0.03932626697814864
Latitude: -0.8998856544145073
Longitude: -0.8705410023357312


# REGULARIZATION - preventing overfitting using L1

In [19]:
from sklearn.linear_model import Lasso
l1_model = Lasso(alpha=1)

In [20]:
l1_model.fit(x_scaled,y)

Lasso(alpha=1)

#### Seeing as all of our coefficients are 0, we have underfit this model quite severly. The penalty was so large that it did not allow the model to actually fit. 

#### The next step is to find a different value of alpha. Alpha is a multiple of the penalty, so decreasing alpha will decrease the strength.

In [21]:
l1_model.coef_

array([ 0.,  0.,  0., -0., -0., -0., -0., -0.])

In [26]:
l1_model.aplha=0.1
l1_model.aplha

0.1

#### As we decrease alpha in the loop, we should expect the coefficients to grow to nearly the same values as our original linear model.

#### L1 is displaying the behavior claimed that weak interactions are suppressed, while the strongest interactions are allowed to grow. Now, you'll see that the strong interactions are still not allowed to grow to their full strength. Their largest values are still when alpha is the smallest, and in fact, we could compare with how this happens that alpha equals 0

In [28]:
alpha = 1
for i in range(5):
    l1_model.alpha=alpha
    l1_model.fit(x_scaled,y)
    print(alpha, l1_model.coef_)
    alpha= alpha/10

1 [ 0.  0.  0. -0. -0. -0. -0. -0.]
0.1 [ 0.70571337  0.10601099 -0.         -0.         -0.         -0.
 -0.01121267 -0.        ]
0.01 [ 0.77722333  0.12486709 -0.12940585  0.16912537 -0.         -0.02944551
 -0.79543737 -0.75899738]
0.001 [ 0.8244684   0.11967446 -0.25193711  0.29211349 -0.00315059 -0.03840844
 -0.88903624 -0.85906895]
0.0001 [ 0.82910679  0.11884435 -0.26417261  0.30434164 -0.00436762 -0.03923458
 -0.89879503 -0.86938865]


#### The largest values are when alpha was it's smallest, and we can compare how this happens that alpha equals 0. We are still using linear regression, and adding in alphas to the first array below, the second array shows the original coefs. The values between the two line up pretty close.

In [29]:
l1_model.alpha = 0
l1_model.fit(x_scaled,y)
print(alpha, l1_model.coef_)

1e-05 [ 0.8296193   0.11875165 -0.26552688  0.30569623 -0.004503   -0.03932627
 -0.89988565 -0.870541  ]


  l1_model.fit(x_scaled,y)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [30]:
scaled_model.coef_

array([ 0.8296193 ,  0.11875165, -0.26552688,  0.30569623, -0.004503  ,
       -0.03932627, -0.89988565, -0.870541  ])

#### How to determine what is our best fit? To do that, we are going to need to use a loss - get unbiased estimate by using cross validation. 

In [36]:
from sklearn.model_selection import cross_val_score
cross_val_score(scaled_model, x_scaled, y)

array([0.54866323, 0.46820691, 0.55078434, 0.53698703, 0.66051406])

#### Using the same loop from above, but instead we are now using our cross val score as opposed to just the alpha. When using a loss, we want the lowest score possible, like golf. The first line outputted is negative, which is not good when working with a loss. It means that something is wrong.

#### Another key feature is you'll notice these scores vary quite a bit.

#### This could be a sign that our data set is not being adequately shuffled. Then there might be some order present in our data set. Although it's not entirely unheard of to see variation from run to run. What you want to see is the actual mean or average of these runs, but it's pretty clear that this one's the smallest, so we would start with our alpha of 0.1

In [38]:
alpha = 1
for i in range(5):
    l1_model.alpha=alpha
    print(alpha, cross_val_score(l1_model, x_scaled, y))
    print("-" * 10)
    alpha= alpha/10

1 [-0.21613668 -0.02423671 -0.09329273 -0.07285357 -0.0393445 ]
----------
0.1 [0.45536374 0.4023948  0.47036257 0.30458657 0.52233135]
----------
0.01 [0.56682644 0.45494291 0.55254378 0.50602901 0.66847995]
----------
0.001 [0.55132766 0.46710065 0.55132522 0.53439773 0.66213492]
----------
0.0001 [0.54893833 0.46809859 0.55084209 0.53673338 0.66068491]
----------


# REGULARIZATIN - L2 Ridge

#### We are again using alpha, as according to the documentation on sklearn. Alpha is a substitute term for what we would call lambda. 

In [40]:
from sklearn.linear_model import Ridge
l2_model = Ridge(alpha=1)
l2_model.fit(x_scaled,y)

Ridge(alpha=1)

In [41]:
l2_model.score(x_scaled,y)

0.6062326414700456

#### Looking at the variable importances below, we can see they're non-zero which means they are not terribly suppressed. We may need to look in both directions this time, we will start with low value of alpha and go high.

In [42]:
l2_model.coef_

array([ 0.82959256,  0.11881684, -0.26539682,  0.30552458, -0.00448006,
       -0.03932976, -0.89926646, -0.86991606])

#### Looking at the outputs below, we see a minimal change from the numbers above, but now we have a starting point and can increase our alpha in a loop to see how it changes the results.

In [43]:
l2_model = Ridge(alpha=1E-5)
l2_model.fit(x_scaled,y)
l2_model.coef_

array([ 0.8296193 ,  0.11875165, -0.26552688,  0.30569623, -0.004503  ,
       -0.03932627, -0.89988565, -0.870541  ])

#### As we go through, we see that as alpha is being increased, the coefficients start to decrease. In other words, alpha is starting to suppress the growth of the coefficients. The first one, which we're fairly familiar with has a fairly strong variable importance. And by the time we get to the end, you can see it's almost, magnitude is cut in half

#### But you'll notice, none of these coefficients have yet been zeroed out. And this, of course, is that difference between L1 and L2 regularization. In that L1 causes certain weak interactions to become zero. You'll notice this weak interaction, while still being suppressed, has not been fully suppressed. And it's still non-zero. It's small, but still there. So L2 tends to keep all your variables, while of course, L1 will suppress certain variables.

In [46]:
alpha = 1E-5
for i in range(10):
    l2_model.alpha=alpha
    l2_model.fit(x_scaled,y)
    print(alpha, l2_model.coef_)
    print("-" * 10)
    alpha = alpha * 10

1e-05 [ 0.8296193   0.11875165 -0.26552688  0.30569623 -0.004503   -0.03932627
 -0.89988565 -0.870541  ]
----------
0.0001 [ 0.8296193   0.11875166 -0.26552687  0.30569621 -0.004503   -0.03932627
 -0.89988559 -0.87054094]
----------
0.001 [ 0.82961928  0.11875172 -0.26552675  0.30569606 -0.00450298 -0.03932627
 -0.89988503 -0.87054038]
----------
0.01 [ 0.82961904  0.1187523  -0.26552558  0.30569451 -0.00450277 -0.0393263
 -0.89987946 -0.87053475]
----------
0.1 [ 0.82961664  0.11875818 -0.26551388  0.30567906 -0.00450071 -0.03932662
 -0.89982369 -0.87047846]
----------
1.0 [ 0.82959256  0.11881684 -0.26539682  0.30552458 -0.00448006 -0.03932976
 -0.89926646 -0.86991606]
----------
10.0 [ 0.8293461   0.11939823 -0.26422311  0.30398067 -0.00427544 -0.03936068
 -0.8937389  -0.86433656]
----------
100.0 [ 0.82639107  0.12472471 -0.25228369  0.28870558 -0.00239758 -0.03961935
 -0.84257736 -0.8126335 ]
----------
1000.0 [ 0.78246743  0.15068383 -0.15023828  0.17109626  0.00686803 -0.0397000

#### Now we still don't know, of course, which one's the best fit because we haven't done our scoring, so let's do that now. 

#### So we see that we're actually getting a slightly better score as we've increased alpha. So when I say increased, our loss is starting to decrease. So that means that we actually have a little bit of work to do to tune the strength up even larger.

#### Now, for general overfitting or preventing overfitting, we like to use L2. So in this case, we'd probably be using the ridge regression to find out our best fit

In [49]:
alpha = 1E-5
for i in range(10):
    l2_model.alpha=alpha
    print(alpha, cross_val_score(l2_model,x_scaled,y))
    print("-" * 10)
    alpha = alpha * 10

1e-05 [0.54866323 0.46820691 0.55078434 0.53698703 0.66051406]
----------
0.0001 [0.54866324 0.46820691 0.55078434 0.53698702 0.66051406]
----------
0.001 [0.54866334 0.46820689 0.55078436 0.53698696 0.66051407]
----------
0.01 [0.54866426 0.46820668 0.55078452 0.53698632 0.66051415]
----------
0.1 [0.54867348 0.46820467 0.55078609 0.53697995 0.66051499]
----------
1.0 [0.54876544 0.4681845  0.55080175 0.53691622 0.66052317]
----------
10.0 [0.54966271 0.46798279 0.55095161 0.53627682 0.66058868]
----------
100.0 [0.55669091 0.46597036 0.55186109 0.52974249 0.65993628]
----------
1000.0 [0.55440687 0.44998446 0.53991768 0.47189205 0.62504649]
----------
10000.0 [0.37180782 0.38729253 0.40830662 0.2927889  0.45139779]
----------
