Linear Regression, Lasso and Ridge regression

In [96]:
#  House pricing dataset 
from sklearn.datasets import fetch_california_housing


In [97]:
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt 
%matplotlib inline 

In [98]:
df = fetch_california_housing()

In [99]:
type(df)

sklearn.utils.Bunch

In [100]:
df

{'data': array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
           37.88      , -122.23      ],
        [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
           37.86      , -122.22      ],
        [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
           37.85      , -122.24      ],
        ...,
        [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
           39.43      , -121.22      ],
        [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
           39.43      , -121.32      ],
        [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
           39.37      , -121.24      ]]),
 'target': array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894]),
 'frame': None,
 'target_names': ['MedHouseVal'],
 'feature_names': ['MedInc',
  'HouseAge',
  'AveRooms',
  'AveBedrms',
  'Population',
  'AveOccup',
  'Latitude',
  'Longitude'],
 'DESCR': '.. _california_housing_dataset:\n

In [101]:
# so this data is in the form of key value pairs we have to convert it into the dataframe 
dataset =  pd.DataFrame(df.data)

In [102]:
# we dont see column name there so we will set up column name by 
dataset.columns = df.feature_names
dataset.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [103]:
dataset['Price']= df.target

In [104]:
dataset.head(10)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Price
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
5,4.0368,52.0,4.761658,1.103627,413.0,2.139896,37.85,-122.25,2.697
6,3.6591,52.0,4.931907,0.951362,1094.0,2.128405,37.84,-122.25,2.992
7,3.12,52.0,4.797527,1.061824,1157.0,1.788253,37.84,-122.25,2.414
8,2.0804,42.0,4.294118,1.117647,1206.0,2.026891,37.84,-122.26,2.267
9,3.6912,52.0,4.970588,0.990196,1551.0,2.172269,37.84,-122.25,2.611


In [105]:
# dividing the data set into dependant and independant features 
x = dataset.iloc[:, :-1] # means we are skipping the last feature that is our price -- Independant features 
y = dataset.iloc[:, -1] # dependant feature ,, here inside the bracket : suggest all data and -1 suggest the last column only 


In [106]:
x.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [107]:
y.head()

0    4.526
1    3.585
2    3.521
3    3.413
4    3.422
Name: Price, dtype: float64

In [108]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
x, y, test_size=0.33, random_state=42)

In [109]:
# Linear regression 
from sklearn.linear_model import LinearRegression 
from sklearn.model_selection import cross_val_score
lin_reg = LinearRegression()
mse = cross_val_score(lin_reg, X_train, y_train, scoring = 'neg_mean_squared_error', cv = 5 )
mse_mean = np.mean(mse) # taking mean of mse values 
print(mse)
mse_mean

[-0.54099828 -0.49871687 -0.50480739 -0.5200183  -0.55070903]


-0.523049976338392

In [110]:
lin_reg.fit(X_train,y_train)

LinearRegression()

In [111]:
# Ridge regression 
from sklearn.linear_model import Ridge 
from sklearn.model_selection import GridSearchCV # will help with hyperparameter tunning that is lambda or we can call it alpha 
ridge = Ridge()

params = [{'alpha': [1e-15, 1e-12, 1e-9, 1e-6, 1e-3, 1e-2, 1e-1, 1, 2, 5, 10, 20, 30, 40, 100, 200, 400, 800 ]}]

ridge_regressor = GridSearchCV(ridge, params, scoring = 'neg_mean_squared_error', cv = 10)
ridge_regressor.fit(X_train, y_train)

GridSearchCV(cv=10, estimator=Ridge(),
             param_grid=[{'alpha': [1e-15, 1e-12, 1e-09, 1e-06, 0.001, 0.01,
                                    0.1, 1, 2, 5, 10, 20, 30, 40, 100, 200, 400,
                                    800]}],
             scoring='neg_mean_squared_error')

In [112]:
print(ridge_regressor.best_params_)
print(ridge_regressor.best_score_)


{'alpha': 10}
-0.5221728352781309


In [113]:
# with the ridge  and lasso there not much improvement in performance with respect to the score
# lets try Lasso regression 
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV # will help with hyperparameter tunning that is lambda or we can call it alpha 
lasso = Lasso()

params = [{'alpha': [1e-15, 1e-12, 1e-9, 1e-6, 1e-3, 1e-2, 1e-1, 1, 2, 5, 10, 20, 40, 100, 200, 400, 1000]}]

lasso_regressor = GridSearchCV(lasso, params, scoring = 'neg_mean_squared_error', cv = 10)
lasso_regressor.fit(X_train, y_train)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


GridSearchCV(cv=10, estimator=Lasso(),
             param_grid=[{'alpha': [1e-15, 1e-12, 1e-09, 1e-06, 0.001, 0.01,
                                    0.1, 1, 2, 5, 10, 20, 40, 100, 200, 400,
                                    1000]}],
             scoring='neg_mean_squared_error')

In [114]:
print(lasso_regressor.best_params_)
print(lasso_regressor.best_score_)

{'alpha': 1e-06}
-0.5221889361172731


In [115]:
# The value should go towards the zero 
# lets check R2 score 

y_pred = lasso_regressor.predict(X_test)
from sklearn.metrics import r2_score
r2_score1 = r2_score(y_pred, y_test)



In [116]:
print(r2_score1)

0.3395715278615712


In [117]:
# its  really poor 
y_pred = lin_reg.predict(X_test)
from sklearn.metrics import r2_score
r2_score2 = r2_score(y_pred, y_test)

In [118]:
print(r2_score2)

0.3395742960386784


In [119]:
y_pred = ridge_regressor.predict(X_test)
from sklearn.metrics import r2_score
r2_score3 = r2_score(y_pred, y_test)

In [120]:
print(r2_score3)

0.338595032973087


In [121]:
# Logistic regression 
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_breast_cancer

In [122]:
df = load_breast_cancer()
# Independant features 
X = pd.DataFrame(df['data'], columns = df['feature_names'])

In [123]:
X.head(10)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678
5,12.45,15.7,82.57,477.1,0.1278,0.17,0.1578,0.08089,0.2087,0.07613,...,15.47,23.75,103.4,741.6,0.1791,0.5249,0.5355,0.1741,0.3985,0.1244
6,18.25,19.98,119.6,1040.0,0.09463,0.109,0.1127,0.074,0.1794,0.05742,...,22.88,27.66,153.2,1606.0,0.1442,0.2576,0.3784,0.1932,0.3063,0.08368
7,13.71,20.83,90.2,577.9,0.1189,0.1645,0.09366,0.05985,0.2196,0.07451,...,17.06,28.14,110.6,897.0,0.1654,0.3682,0.2678,0.1556,0.3196,0.1151
8,13.0,21.82,87.5,519.8,0.1273,0.1932,0.1859,0.09353,0.235,0.07389,...,15.49,30.73,106.2,739.3,0.1703,0.5401,0.539,0.206,0.4378,0.1072
9,12.46,24.04,83.97,475.9,0.1186,0.2396,0.2273,0.08543,0.203,0.08243,...,15.09,40.68,97.65,711.4,0.1853,1.058,1.105,0.221,0.4366,0.2075


In [124]:
# dependant feature
y = pd.DataFrame(df['target'], columns = ['Target'])
y

Unnamed: 0,Target
0,0
1,0
2,0
3,0
4,0
...,...
564,0
565,0
566,0
567,0


In [125]:
y['Target'].value_counts()

1    357
0    212
Name: Target, dtype: int64

In [126]:
# this is a balance data set 

params = [{'C': [1, 5, 10]}, {'max_iter': [100, 150]}]

In [127]:
model1 = LogisticRegression(C = 100, max_iter = 100)


In [128]:
model= GridSearchCV(model1, param_grid = params, scoring = 'f1', cv = 5)

In [129]:
model.fit(X_train,y_train)

25 fits failed out of a total of 25.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Asus\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Asus\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1516, in fit
    check_classification_targets(y)
  File "C:\Users\Asus\anaconda3\lib\site-packages\sklearn\utils\multiclass.py", line 197, in check_classification_targets
    raise ValueError("Unknown label type: %r" % y_type)
ValueError: Unknown label type: 'continuous'



ValueError: Unknown label type: 'continuous'