In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Importing Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.datasets import fetch_california_housing
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

Ans 1. Generating a dataset with seven highly correlated columns and a target variable

In [None]:
x = np.array([i * np.pi / 180 for i in range(60, 300, 4)])
np.random.seed(10)
y = np.sin(x) + np.random.normal(0, 0.15, len(x))
df= pd.DataFrame(np.column_stack([x, y]), columns = ['x', 'y'])
for i in range(2,8):
    colname = 'x_%d' % i
    df[colname] = df['x'] ** i
print(df.head())

Unnamed: 0,x,y,x_2,x_3,x_4,x_5,x_6,x_7
0,1.047198,1.065763,1.096623,1.148381,1.202581,1.25934,1.318778,1.381021
1,1.117011,1.006086,1.247713,1.393709,1.556788,1.738948,1.942424,2.169709
2,1.186824,0.695374,1.408551,1.671702,1.984016,2.354677,2.794587,3.316683
3,1.256637,0.949799,1.579137,1.984402,2.493673,3.133642,3.93785,4.948448
4,1.32645,1.063496,1.75947,2.33385,3.095735,4.106339,5.446854,7.224981


Defining input and output variables

In [None]:
X = df.loc[:, df.columns != 'y']
y = df.loc[:, df.columns == 'y']
print(X[:5])
print(y[:5])

          x       x_2       x_3       x_4       x_5       x_6       x_7
0  1.047198  1.096623  1.148381  1.202581  1.259340  1.318778  1.381021
1  1.117011  1.247713  1.393709  1.556788  1.738948  1.942424  2.169709
2  1.186824  1.408551  1.671702  1.984016  2.354677  2.794587  3.316683
3  1.256637  1.579137  1.984402  2.493673  3.133642  3.937850  4.948448
4  1.326450  1.759470  2.333850  3.095735  4.106339  5.446854  7.224981
          y
0  1.065763
1  1.006086
2  0.695374
3  0.949799
4  1.063496


Standard Scaler to scale input data

In [None]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = np.insert(X_scaled, 0, values = 1, axis = 1)
print(X_scaled[:5])

array([[ 1.        , -1.7034199 , -1.31365751, -1.05867178, -0.89285793,
        -0.78079937, -0.70090067, -0.64095326],
       [ 1.        , -1.64567685, -1.29384239, -1.05255484, -0.89110526,
        -0.78032303, -0.70077609, -0.64092161],
       [ 1.        , -1.5879338 , -1.27274887, -1.04562343, -0.88899127,
        -0.77971149, -0.70060587, -0.64087558],
       [ 1.        , -1.53019075, -1.25037695, -1.03782667, -0.88646941,
        -0.77893783, -0.7003775 , -0.64081009],
       [ 1.        , -1.47244771, -1.22672664, -1.02911364, -0.8834903 ,
        -0.77797175, -0.70007607, -0.64071873]])

Performing ridge regression using gradient descent optimization (performed 100 instead of 1000 iterations)

In [None]:
X_train = X_scaled
y_train = y.iloc[:, 0]
number_of_iterations = 100
n = len(X_train)
learning_rate = [0.0001, 0.001, 0.01, 0.1, 1, 10]
lambdas = [10 ** -15, 10 ** -10, 0.00001, 0.001, 0, 1, 10, 20]
beta_list = []
r2_list = []
for r in lambdas:
    for l in learning_rate:
        beta = np.zeros(8)
        for i in range(number_of_iterations):
            x0_gradient = 0
            x1_gradient = 0
            x2_gradient = 0
            x3_gradient = 0
            x4_gradient = 0
            x5_gradient = 0
            x6_gradient = 0
            x7_gradient = 0
            for j in range(n):
                a = X_train[j, 1]
                b = X_train[j, 2]
                c = X_train[j, 3]
                d = X_train[j, 4]
                e = X_train[j, 5]
                f = X_train[j, 6]
                g = X_train[j, 7]
                h = y_train[j]
                x0_gradient += (beta[0] + (beta[1] * a) + (beta[2] * b) + (beta[3] * c) + (beta[4] * d) + (beta[5] * e) + (beta[6] * f) + (beta[7] * g) - h) + (r * beta[0])
                x1_gradient += ((beta[0] + (beta[1] * a) + (beta[2] * b) + (beta[3] * c) + (beta[4] * d) + (beta[5] * e) + (beta[6] * f) + (beta[7] * g) - h)) * a + (r * beta[1])
                x2_gradient += ((beta[0] + (beta[1] * a) + (beta[2] * b) + (beta[3] * c) + (beta[4] * d) + (beta[5] * e) + (beta[6] * f) + (beta[7] * g) - h)) * b + (r * beta[2])
                x3_gradient += ((beta[0] + (beta[1] * a) + (beta[2] * b) + (beta[3] * c) + (beta[4] * d) + (beta[5] * e) + (beta[6] * f) + (beta[7] * g) - h)) * c + (r * beta[3])
                x4_gradient += ((beta[0] + (beta[1] * a) + (beta[2] * b) + (beta[3] * c) + (beta[4] * d) + (beta[5] * e) + (beta[6] * f) + (beta[7] * g) - h)) * d + (r * beta[4])
                x5_gradient += ((beta[0] + (beta[1] * a) + (beta[2] * b) + (beta[3] * c) + (beta[4] * d) + (beta[5] * e) + (beta[6] * f) + (beta[7] * g) - h)) * e + (r * beta[5])
                x6_gradient += ((beta[0] + (beta[1] * a) + (beta[2] * b) + (beta[3] * c) + (beta[4] * d) + (beta[5] * e) + (beta[6] * f) + (beta[7] * g) - h)) * f + (r * beta[6])
                x7_gradient += ((beta[0] + (beta[1] * a) + (beta[2] * b) + (beta[3] * c) + (beta[4] * d) + (beta[5] * e) + (beta[6] * f) + (beta[7] * g) - h)) * g + (r * beta[7])
            beta[0] = beta[0] - l / n * x0_gradient
            beta[1] = beta[1] - l / n * x1_gradient
            beta[2] = beta[2] - l / n * x2_gradient
            beta[3] = beta[3] - l / n * x3_gradient
            beta[4] = beta[4] - l / n * x4_gradient
            beta[5] = beta[5] - l / n * x5_gradient
            beta[6] = beta[6] - l / n * x6_gradient
            beta[7] = beta[7] - l / n * x7_gradient
        beta_list.append(beta)
        y_predict = X_train.dot(beta)
        sum_square_error = np.sum((y_train - y_predict) ** 2)
        sum_square_variance = np.sum((y_train - np.mean(y_train)) ** 2)
        r2 = 1 - sum_square_error / sum_square_variance
        r2_list.append(r2)
print(f"Maximum R-squared:{max(r2_list)}")

Maximum R-squared:0.9430149427851566


Ans 2. Reading Hitters dataset

In [None]:
df = pd.read_csv("/content/drive/My Drive/ColabFiles/Hitters.csv")
df = df[df['Salary'].notnull()]
df = df.reset_index()
print(df.head())

Unnamed: 0,index,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,...,CRuns,CRBI,CWalks,League,Division,PutOuts,Assists,Errors,Salary,NewLeague
0,1,315,81,7,24,38,39,14,3449,835,...,321,414,375,N,W,632,43,10,475.0,N
1,2,479,130,18,66,72,76,3,1624,457,...,224,266,263,A,W,880,82,14,480.0,A
2,3,496,141,20,65,78,37,11,5628,1575,...,828,838,354,N,E,200,11,3,500.0,N
3,4,321,87,10,39,42,30,2,396,101,...,48,46,33,N,E,805,40,4,91.5,N
4,5,594,169,4,74,51,35,11,4408,1133,...,501,336,194,A,W,282,421,25,750.0,A


Performing data preprocessing

In [None]:
league = {'N' : 0, 'A' : 1}
df["League"] = df["League"].map(league)
df["NewLeague"] = df["NewLeague"].map(league)
division = {'W' : 0, 'E' : 1}
df["Division"] = df["Division"].map(division)
print(df.head())

Unnamed: 0,index,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,...,CRuns,CRBI,CWalks,League,Division,PutOuts,Assists,Errors,Salary,NewLeague
0,1,315,81,7,24,38,39,14,3449,835,...,321,414,375,0,0,632,43,10,475.0,0
1,2,479,130,18,66,72,76,3,1624,457,...,224,266,263,1,0,880,82,14,480.0,1
2,3,496,141,20,65,78,37,11,5628,1575,...,828,838,354,0,1,200,11,3,500.0,0
3,4,321,87,10,39,42,30,2,396,101,...,48,46,33,0,1,805,40,4,91.5,0
4,5,594,169,4,74,51,35,11,4408,1133,...,501,336,194,1,0,282,421,25,750.0,1


Defining input and output variables

In [None]:
X = df.loc[:, df.columns != 'Salary']
y = df.loc[:, df.columns == 'Salary']
print(X[:5])
print(y[:5])

   index  AtBat  Hits  HmRun  Runs  RBI  Walks  Years  CAtBat  CHits  CHmRun  \
0      1    315    81      7    24   38     39     14    3449    835      69   
1      2    479   130     18    66   72     76      3    1624    457      63   
2      3    496   141     20    65   78     37     11    5628   1575     225   
3      4    321    87     10    39   42     30      2     396    101      12   
4      5    594   169      4    74   51     35     11    4408   1133      19   

   CRuns  CRBI  CWalks  League  Division  PutOuts  Assists  Errors  NewLeague  
0    321   414     375       0         0      632       43      10          0  
1    224   266     263       1         0      880       82      14          1  
2    828   838     354       0         1      200       11       3          0  
3     48    46      33       0         1      805       40       4          0  
4    501   336     194       1         0      282      421      25          1  
   Salary
0   475.0
1   480.0
2   500.0

Standard Scaler to scale input data

In [None]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = np.insert(X_scaled, 0, values = 1, axis = 1)
print(X_scaled[:5])

array([[ 1.        , -1.77182646, -0.6029005 , -0.59567545, -0.5285512 ,
        -1.2061115 , -0.52206292, -0.0975271 ,  1.3978935 ,  0.3467905 ,
         0.1743734 , -0.0029198 , -0.12167086,  0.25896561,  0.43533435,
        -1.05875764, -0.98116592,  1.22149851, -0.52319133,  0.21335208,
        -1.07505242],
       [ 1.        , -1.76105236,  0.51254171,  0.49225957,  0.72996619,
         0.44151497,  0.79405962,  1.6093726 , -0.90120024, -0.45286539,
        -0.4098921 , -0.07605382, -0.41510491, -0.19959036,  0.010373  ,
         0.94450322, -0.98116592,  2.10910888, -0.25386267,  0.81996395,
         0.9301872 ],
       [ 1.        , -1.75027827,  0.62816682,  0.73648988,  0.95878753,
         0.40228577,  1.02631654, -0.18979195,  0.77086793,  1.30155773,
         1.31817358,  1.89856469,  1.41205144,  1.57266648,  0.3556541 ,
        -1.05875764,  1.01919561, -0.32466148, -0.74417894, -0.84821868,
        -1.07505242],
       [ 1.        , -1.73950418, -0.56209164, -0.46245892

Performing Linear, Lasso and Ridge regression of which all Regression techniques preformed almost same

In [None]:
model = LinearRegression()
model = model.fit(X, y)
y_pred = model.predict(X)
print("Linear Regression")
print(f"R-squared:{r2_score(y, y_pred)}")
print(f"Mean squared error:{mean_squared_error(y, y_pred)}")
model = Ridge(0.5748)
model = model.fit(X, y)
y_pred = model.predict(X)
print("Ridge Regression")
print(f"R-squared:{r2_score(y, y_pred)}")
print(f"Mean squared error:{mean_squared_error(y, y_pred)}")
model = Lasso(0.5748)
model = model.fit(X, y)
y_pred = model.predict(X)
print("Lasso Regression")
print(f"R-squared:{r2_score(y, y_pred)}")
print(f"Mean squared error:{mean_squared_error(y, y_pred)}")

Linear Regression
R-squared:0.5461337860066704
Mean squared error:92014.23518960376
Ridge Regression
R-squared:0.5461298557318897
Mean squared error:92015.03199098902
Lasso Regression
R-squared:0.5459996727607728
Mean squared error:92041.42454049527


  model = cd_fast.enet_coordinate_descent(


Ans 3: Loading California dataset

In [4]:
california = fetch_california_housing()
X = california.data
y = california.target
print(X[:5])
print(y[:5])

[[ 8.32520000e+00  4.10000000e+01  6.98412698e+00  1.02380952e+00
   3.22000000e+02  2.55555556e+00  3.78800000e+01 -1.22230000e+02]
 [ 8.30140000e+00  2.10000000e+01  6.23813708e+00  9.71880492e-01
   2.40100000e+03  2.10984183e+00  3.78600000e+01 -1.22220000e+02]
 [ 7.25740000e+00  5.20000000e+01  8.28813559e+00  1.07344633e+00
   4.96000000e+02  2.80225989e+00  3.78500000e+01 -1.22240000e+02]
 [ 5.64310000e+00  5.20000000e+01  5.81735160e+00  1.07305936e+00
   5.58000000e+02  2.54794521e+00  3.78500000e+01 -1.22250000e+02]
 [ 3.84620000e+00  5.20000000e+01  6.28185328e+00  1.08108108e+00
   5.65000000e+02  2.18146718e+00  3.78500000e+01 -1.22250000e+02]]
[4.526 3.585 3.521 3.413 3.422]


Performing LassoCV and RidgeCV regression

In [None]:
model = RidgeCV()
model = model.fit(X, y)
y_pred = model.predict(X)
print("RidgeCV Regression")
print(f"R-squared:{r2_score(y, y_pred)}")
print(f"Mean squared error:{mean_squared_error(y, y_pred)}")
model = LassoCV()
model = model.fit(X, y)
y_pred = model.predict(X)
print("LassoCV Regression")
print(f"R-squared:{r2_score(y, y_pred)}")
print(f"Mean squared error:{mean_squared_error(y, y_pred)}")

RidgeCV Regression
R-squared:0.60623010176544
Mean squared error:0.5243244261574113
LassoCV Regression
R-squared:0.5902088471774514
Mean squared error:0.5456575325117344
