In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Importing Libraries

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA

Ans 1. Reading USA Housing dataset

In [None]:
df = pd.read_csv("/content/drive/My Drive/ColabFiles/USA_Housing.csv")
print(df.head())

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price
0,79545.45857,5.682861,7.009188,4.09,23086.8005,1059034.0
1,79248.64245,6.0029,6.730821,3.09,40173.07217,1505891.0
2,61287.06718,5.86589,8.512727,5.13,36882.1594,1058988.0
3,63345.24005,7.188236,5.586729,3.26,34310.24283,1260617.0
4,59982.19723,5.040555,7.839388,4.23,26354.10947,630943.5


Defining input and output variables

In [None]:
X = df.iloc[:, :-1]
y = df.iloc[:, -1]
print(y[:5])

0    1.059034e+06
1    1.505891e+06
2    1.058988e+06
3    1.260617e+06
4    6.309435e+05
Name: Price, dtype: float64

Standard Scaler to scale input data

In [None]:
scaler = StandardScaler()
scaler = scaler.fit(X)
X = scaler.transform(X)
X = np.insert(X, 0, values = 1, axis = 1)
print(X[:5])

array([[ 1.        ,  1.02865969, -0.29692705,  0.02127433,  0.08806222,
        -1.31759867],
       [ 1.        ,  1.00080775,  0.02590164, -0.25550611, -0.72230146,
         0.40399945],
       [ 1.        , -0.68462915, -0.11230283,  1.5162435 ,  0.93084045,
         0.07240989],
       [ 1.        , -0.49149907,  1.22157207, -1.39307717, -0.58453963,
        -0.18673422],
       [ 1.        , -0.80707253, -0.94483368,  0.84674187,  0.20151314,
        -0.98838741]])

KFold function for dividing input and output features into five folds

In [None]:
kfold = KFold()
print(kfold)

KFold(n_splits=5, random_state=None, shuffle=False)

Performing multiple linear regression using 5 iterations (5 folds)

In [None]:
i = 0
beta_best = []
r2_max = - 10 ** 10
for X_split, y_split in zip(kfold.split(X), kfold.split(y)):
    X_train = X[X_split[0]]
    X_test = X[X_split[1]]
    y_train = y[y_split[0]]
    y_test = y[y_split[1]]
    A = X_train.T.dot(X_train)
    B = np.linalg.inv(A)
    C = B.dot(X_train.T)
    beta = C.dot(y_train)
    print(f"For {i + 1} iteration:")
    i += 1
    print(f"Beta matrix:{beta}")
    y_predict = X_test.dot(beta)
    print(f"y predicted:{y_predict}")
    sum_square_error = np.sum((y_test - y_predict) ** 2)
    sum_square_variance = np.sum((y_test - np.mean(y_test)) ** 2)
    r2 = 1 - sum_square_error / sum_square_variance
    print(f"R-squared:{r2}")
    if r2_max < r2:
        r2_max = r2
        beta_best = beta
print(f"Beta matrix for best value of R-squared:{beta_best}")

For 1 iteration:
Beta matrix:[1231872.07704612  229923.70318635  164052.98579011  120923.57426579
    3533.77470973  151855.58542488]
y predicted:[1222472.59789716 1494131.15197358 1253670.80242078 1120389.19877662
  844315.19005474 1068260.55667213 1671245.639246   1567875.18249614
  763347.28814599 1470721.64690957 1775501.83443543  630026.09214137
  950539.75749648 1303433.26478986 1306505.35508872 1237586.54456661
 1521433.87536906  488187.16489613 1137920.58807016 1203167.64046499
 2019384.21819711 1076240.13702178  734801.62065626  997288.25431527
  900094.94841917 1473156.57255351 1237124.03621889  930137.78874904
 1405712.96683316 1373314.62542769 1246117.10168385 1224015.69281346
 1234384.89038389  611159.97921256  937440.62317097 1274345.51392185
 1330861.45766804 1161935.74224449  525180.40929258   97206.19765112
  929627.49901683 1714160.39150958  792704.11688002 1611877.74359594
  943225.3232602  1443933.98877536 1222266.83903772 1070537.83047054
 1228326.71172751 1744277.

Using best value of beta to test the performance for 30% of data

In [None]:
X_test = X[int(len(X) * 0.7):]
y_predict = X_test.dot(beta)
print(f"y predicted:{y_predict}")

y predicted:[ 856299.76967364 1278677.68878101 2020283.88816485 ... 1020714.15833879
 1263729.36347261 1301717.22291429]


Ans 2. Reading USA Housing dataset

In [None]:
df = pd.read_csv("/content/drive/My Drive/ColabFiles/USA_Housing.csv")
print(df.head())

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price
0,79545.45857,5.682861,7.009188,4.09,23086.8005,1059034.0
1,79248.64245,6.0029,6.730821,3.09,40173.07217,1505891.0
2,61287.06718,5.86589,8.512727,5.13,36882.1594,1058988.0
3,63345.24005,7.188236,5.586729,3.26,34310.24283,1260617.0
4,59982.19723,5.040555,7.839388,4.23,26354.10947,630943.5


Defining input and output variables

In [None]:
X = df.iloc[:, :-1]
y = df.iloc[:, -1]
print(y[:5])

0    1.059034e+06
1    1.505891e+06
2    1.058988e+06
3    1.260617e+06
4    6.309435e+05
Name: Price, dtype: float64

Standard Scaler to scale input data

In [None]:
scaler = StandardScaler()
scaler = scaler.fit(X)
X = scaler.transform(X)
print(X[:5])

array([[ 1.02865969, -0.29692705,  0.02127433,  0.08806222, -1.31759867],
       [ 1.00080775,  0.02590164, -0.25550611, -0.72230146,  0.40399945],
       [-0.68462915, -0.11230283,  1.5162435 ,  0.93084045,  0.07240989],
       [-0.49149907,  1.22157207, -1.39307717, -0.58453963, -0.18673422],
       [-0.80707253, -0.94483368,  0.84674187,  0.20151314, -0.98838741]])

Performing multiple linear regression using gradient descent optimization

In [None]:
X_train = X[:int(len(X) * 0.56)]
y_train = y[:int(len(X) * 0.56)]
number_of_iterations = 1000
n = len(X_train)
learning_rate = [0.001, 0.01, 0.1, 1]
beta_list = []
for l in learning_rate:
    beta = np.zeros(6)
    for i in range(number_of_iterations):
        x0_gradient = 0
        x1_gradient = 0
        x2_gradient = 0
        x3_gradient = 0
        x4_gradient = 0
        x5_gradient = 0
        for j in range(n):
            a = X_train[j, 0]
            b = X_train[j, 1]
            c = X_train[j, 2]
            d = X_train[j, 3]
            e = X_train[j, 4]
            f = y_train[j]
            x0_gradient += (beta[0] + (beta[1] * a) + (beta[2] * b) + (beta[3] * c) + (beta[4] * d) + (beta[5] * e) - f)
            x1_gradient += (beta[0] + (beta[1] * a) + (beta[2] * b) + (beta[3] * c) + (beta[4] * d) + (beta[5] * e) - f) * a
            x2_gradient += (beta[0] + (beta[1] * a) + (beta[2] * b) + (beta[3] * c) + (beta[4] * d) + (beta[5] * e) - f) * b
            x3_gradient += (beta[0] + (beta[1] * a) + (beta[2] * b) + (beta[3] * c) + (beta[4] * d) + (beta[5] * e) - f) * c
            x4_gradient += (beta[0] + (beta[1] * a) + (beta[2] * b) + (beta[3] * c) + (beta[4] * d) + (beta[5] * e) - f) * d
            x5_gradient += (beta[0] + (beta[1] * a) + (beta[2] * b) + (beta[3] * c) + (beta[4] * d) + (beta[5] * e) - f) * e
        beta[0] = beta[0] - l / n * x0_gradient
        beta[1] = beta[1] - l / n * x1_gradient
        beta[2] = beta[2] - l / n * x2_gradient
        beta[3] = beta[3] - l / n * x3_gradient
        beta[4] = beta[4] - l / n * x4_gradient
        beta[5] = beta[5] - l / n * x5_gradient
    beta_list.append(beta)
print(f"Beta matrices:{beta_list}")

Beta matrices:[array([779956.15931298, 148633.92656087, 100071.44202261,  73571.12255806,
        22428.72038852,  91893.41958638]), array([ 1.23240080e+06,  2.31659714e+05,  1.63606011e+05,  1.18757266e+05,
       -8.60666297e+00,  1.50706774e+05]), array([ 1.23244775e+06,  2.31682635e+05,  1.63635272e+05,  1.19025219e+05,
       -2.74956842e+02,  1.50705906e+05]), array([ 1.23244775e+06,  2.31682635e+05,  1.63635272e+05,  1.19025219e+05,
       -2.74956842e+02,  1.50705906e+05])]


Using values of beta, computing R2_score for validation and test set

In [None]:
X_test_list = [X[int(len(X) * 0.56):int(len(X) * 0.7)], X[int(len(X) * 0.7):]]
y_test_list = [y[int(len(y) * 0.56):int(len(y) * 0.7)], y[int(len(y) * 0.7):]]
beta_best = []
r2_max = - 10 ** 10
for beta in beta_list:
    for X_test, y_test in zip(X_test_list, y_test_list):
        X_test = np.insert(X_test, 0, values = 1, axis = 1)
        y_predict = X_test.dot(beta)
        sum_square_error = np.sum((y_test - y_predict) ** 2)
        sum_square_variance = np.sum((y_test - np.mean(y_test)) ** 2)
        r2 = 1 - sum_square_error / sum_square_variance
        if r2_max < r2:
            r2_max = r2
            beta_best = beta
print(f"Beta matrix for best value of R-squared:{beta_best}")

Beta matrix for best value of R-squared:[ 1.23240080e+06  2.31659714e+05  1.63606011e+05  1.18757266e+05
 -8.60666297e+00  1.50706774e+05]


Ans 3. Reading Car dataset

In [None]:
columns = ["symboling", "normalized_losses",
"make", "fuel_type", "aspiration","num_doors", "body_style", "drive_wheels",
"engine_location", "wheel_base", "length", "width", "height", "curb_weight",
"engine_type", "num_cylinders", "engine_size", "fuel_system", "bore", "stroke",
"compression_ratio", "horsepower", "peak_rpm", "city_mpg", "highway_mpg", "price"]
df = pd.read_csv("/content/drive/My Drive/ColabFiles/imports-85.data", names = columns)
print(df.head())

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


Replacing all ? values with NaN and converting "num_doors" words to figures

In [None]:
df = df.replace("?", np.NaN)
doors = {"two": 2, "four": 4}
df["num_doors"] = df["num_doors"].map(doors)
print(df.head())

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,,alfa-romero,gas,std,2.0,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,,alfa-romero,gas,std,2.0,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,,alfa-romero,gas,std,2.0,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164.0,audi,gas,std,4.0,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164.0,audi,gas,std,4.0,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


Dropping the rows with NaN values in price column

In [None]:
df = df[df['price'].notnull()]
df = df.reset_index()
print(df.head())

Unnamed: 0,index,symboling,normalized_losses,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,0,3,,alfa-romero,gas,std,2.0,convertible,rwd,front,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,1,3,,alfa-romero,gas,std,2.0,convertible,rwd,front,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,2,1,,alfa-romero,gas,std,2.0,hatchback,rwd,front,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,3,2,164.0,audi,gas,std,4.0,sedan,fwd,front,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,4,2,164.0,audi,gas,std,4.0,sedan,4wd,front,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


Replacing all NaN values with median

In [None]:
for column in df.columns:
    if df[column].isna().sum() > 0:
        df[column] = df[column].astype(np.float64)
        df[column] = df[column].fillna(df[column].median())
print(df.head())

Unnamed: 0,index,symboling,normalized_losses,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,0,3,115.0,alfa-romero,gas,std,2.0,convertible,rwd,front,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495
1,1,3,115.0,alfa-romero,gas,std,2.0,convertible,rwd,front,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500
2,2,1,115.0,alfa-romero,gas,std,2.0,hatchback,rwd,front,...,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500
3,3,2,164.0,audi,gas,std,4.0,sedan,fwd,front,...,109,mpfi,3.19,3.4,10.0,102.0,5500.0,24,30,13950
4,4,2,164.0,audi,gas,std,4.0,sedan,4wd,front,...,136,mpfi,3.19,3.4,8.0,115.0,5500.0,18,22,17450


Converting "num_cylinders" words to figures

In [None]:
cylinders = {'four' :4, 'six' :6, 'five' :5, 'three' :3, 'twelve' :12, 'two' :2, 'eight' :8}
df["num_cylinders"] = df["num_cylinders"].map(cylinders)
print(df.head())

Unnamed: 0,index,symboling,normalized_losses,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,0,3,115.0,alfa-romero,gas,std,2.0,convertible,rwd,front,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495
1,1,3,115.0,alfa-romero,gas,std,2.0,convertible,rwd,front,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500
2,2,1,115.0,alfa-romero,gas,std,2.0,hatchback,rwd,front,...,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500
3,3,2,164.0,audi,gas,std,4.0,sedan,fwd,front,...,109,mpfi,3.19,3.4,10.0,102.0,5500.0,24,30,13950
4,4,2,164.0,audi,gas,std,4.0,sedan,4wd,front,...,136,mpfi,3.19,3.4,8.0,115.0,5500.0,18,22,17450


Dummy encoding of "body_style" and "drive_wheels" using OneHotEncoder and deleting 1 feature each generated from it

In [None]:
encoder = OneHotEncoder()
df_style = pd.DataFrame(encoder.fit_transform(df[["body_style"]]).toarray())
df_style.columns = encoder.get_feature_names_out(["body_style"])
df = df.drop(["body_style"], axis = 1)
df = pd.concat([df, df_style], axis = 1)
df = df.iloc[:, :-1]
df_wheels = pd.DataFrame(encoder.fit_transform(df[["drive_wheels"]]).toarray())
df_wheels.columns = encoder.get_feature_names_out(["drive_wheels"])
df = df.drop(["drive_wheels"], axis = 1)
df = pd.concat([df, df_wheels], axis = 1)
df = df.iloc[:, :-1]
print(df.head())

Unnamed: 0,index,symboling,normalized_losses,make,fuel_type,aspiration,num_doors,engine_location,wheel_base,length,...,peak_rpm,city_mpg,highway_mpg,price,body_style_convertible,body_style_hardtop,body_style_hatchback,body_style_sedan,drive_wheels_4wd,drive_wheels_fwd
0,0,3,115.0,alfa-romero,gas,std,2.0,front,88.6,168.8,...,5000.0,21,27,13495,1.0,0.0,0.0,0.0,0.0,0.0
1,1,3,115.0,alfa-romero,gas,std,2.0,front,88.6,168.8,...,5000.0,21,27,16500,1.0,0.0,0.0,0.0,0.0,0.0
2,2,1,115.0,alfa-romero,gas,std,2.0,front,94.5,171.2,...,5000.0,19,26,16500,0.0,0.0,1.0,0.0,0.0,0.0
3,3,2,164.0,audi,gas,std,4.0,front,99.8,176.6,...,5500.0,24,30,13950,0.0,0.0,0.0,1.0,0.0,1.0
4,4,2,164.0,audi,gas,std,4.0,front,99.4,176.6,...,5500.0,18,22,17450,0.0,0.0,0.0,1.0,1.0,0.0


Label encoding on 'make', 'aspiration', 'engine_location' and 'fuel_type'

In [None]:
df[['make', 'aspiration', 'engine_location', 'fuel_type']] = df[['make', 'aspiration', 'engine_location', 'fuel_type']].apply(LabelEncoder().fit_transform)
print(df.head())

Unnamed: 0,index,symboling,normalized_losses,make,fuel_type,aspiration,num_doors,engine_location,wheel_base,length,...,peak_rpm,city_mpg,highway_mpg,price,body_style_convertible,body_style_hardtop,body_style_hatchback,body_style_sedan,drive_wheels_4wd,drive_wheels_fwd
0,0,3,115.0,0,1,0,2.0,0,88.6,168.8,...,5000.0,21,27,13495,1.0,0.0,0.0,0.0,0.0,0.0
1,1,3,115.0,0,1,0,2.0,0,88.6,168.8,...,5000.0,21,27,16500,1.0,0.0,0.0,0.0,0.0,0.0
2,2,1,115.0,0,1,0,2.0,0,94.5,171.2,...,5000.0,19,26,16500,0.0,0.0,1.0,0.0,0.0,0.0
3,3,2,164.0,1,1,0,4.0,0,99.8,176.6,...,5500.0,24,30,13950,0.0,0.0,0.0,1.0,0.0,1.0
4,4,2,164.0,1,1,0,4.0,0,99.4,176.6,...,5500.0,18,22,17450,0.0,0.0,0.0,1.0,1.0,0.0


For "fuel_system": replacing values containing string "mpfi" to 1 else all values to 0.
For engine_type: replacing values containing string "ohc" to 1 else all values to 0.

In [None]:
df['fuel_system'] = df['fuel_system'].replace('mpfi', 1)
df['fuel_system'][df['fuel_system'] != 1] = 0
df['engine_type'] = df['engine_type'].replace('ohc', 1)
df['engine_type'][df['engine_type'] != 1] = 0
print(df.head())

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['fuel_system'][df['fuel_system'] != 1] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['engine_type'][df['engine_type'] != 1] = 0


Unnamed: 0,index,symboling,normalized_losses,make,fuel_type,aspiration,num_doors,engine_location,wheel_base,length,...,peak_rpm,city_mpg,highway_mpg,price,body_style_convertible,body_style_hardtop,body_style_hatchback,body_style_sedan,drive_wheels_4wd,drive_wheels_fwd
0,0,3,115.0,0,1,0,2.0,0,88.6,168.8,...,5000.0,21,27,13495,1.0,0.0,0.0,0.0,0.0,0.0
1,1,3,115.0,0,1,0,2.0,0,88.6,168.8,...,5000.0,21,27,16500,1.0,0.0,0.0,0.0,0.0,0.0
2,2,1,115.0,0,1,0,2.0,0,94.5,171.2,...,5000.0,19,26,16500,0.0,0.0,1.0,0.0,0.0,0.0
3,3,2,164.0,1,1,0,4.0,0,99.8,176.6,...,5500.0,24,30,13950,0.0,0.0,0.0,1.0,0.0,1.0
4,4,2,164.0,1,1,0,4.0,0,99.4,176.6,...,5500.0,18,22,17450,0.0,0.0,0.0,1.0,1.0,0.0


Defining input and output variables

In [None]:
X = df.loc[:, df.columns != 'price']
y = df.loc[:, df.columns == 'price']
print(y[:5])

Unnamed: 0,price
0,13495
1,16500
2,16500
3,13950
4,17450


Standard Scaler to scale input data

In [None]:
scaler = StandardScaler()
scaler = scaler.fit(X)
X = scaler.transform(X)
print(X[:5])

array([[-1.74149581,  1.72504964, -0.17830627, -1.97171697,  0.33241125,
        -0.46709937, -1.15637766, -0.12309149, -1.6851071 , -0.43940949,
        -0.85346024, -2.03408071, -0.01485764, -1.60912576, -0.34365977,
         0.07538912,  1.0884772 ,  0.52240521, -1.83181951, -0.29143464,
         0.20617449, -0.24823862, -0.65224901, -0.54228772,  5.70087713,
        -0.20359464, -0.7150372 , -0.9372858 , -0.20359464, -1.19234506],
       [-1.72457173,  1.72504964, -0.17830627, -1.97171697,  0.33241125,
        -0.46709937, -1.15637766, -0.12309149, -1.6851071 , -0.43940949,
        -0.85346024, -2.03408071, -0.01485764, -1.60912576, -0.34365977,
         0.07538912,  1.0884772 ,  0.52240521, -1.83181951, -0.29143464,
         0.20617449, -0.24823862, -0.65224901, -0.54228772,  5.70087713,
        -0.20359464, -0.7150372 , -0.9372858 , -0.20359464, -1.19234506],
       [-1.70764765,  0.1271926 , -0.17830627, -1.97171697,  0.33241125,
        -0.46709937, -1.15637766, -0.12309149, -0

Dividing input and output variables into training and testing data and performing Linear regression

In [None]:
X_train = X[:int(len(X) * 0.7)]
X_test = X[int(len(X) * 0.7):]
y_train = y[:int(len(y) * 0.7)]
y_test = y[int(len(y) * 0.7):]
model = LinearRegression()
model = model.fit(X_train, y_train)
r_sq = model.score(X_test, y_test)
print(f"R-squared value before PCA:{r_sq}")

R-squared value before PCA:0.22672157499175694


Performing PCA and getting 4 reduced components

In [None]:
pca = PCA(n_components = 4)
X_transformed = pca.fit_transform(X)
X_transformed = pd.DataFrame(data = X_transformed)
print(X_transformed.head())

Unnamed: 0,0,1,2,3
0,-0.229535,4.207915,0.305627,0.65969
1,-0.228776,4.205348,0.31273,0.664253
2,0.69114,3.051111,-1.958372,-0.592041
3,-0.342292,0.623174,-1.670951,-2.425158
4,1.550377,1.362714,-0.276575,-2.444403


Dividing input and output variables into training and testing data and performing Linear regression on reduced dataset and showing the improved performance using R-squared value

In [None]:
X_train = X_transformed[:int(len(X_transformed) * 0.7)]
X_test = X_transformed[int(len(X_transformed) * 0.7):]
model = LinearRegression()
model = model.fit(X_train, y_train)
r_sq = model.score(X_test, y_test)
print(f"R-squared value after PCA:{r_sq}")

R-squared value after PCA:0.38229923221015405
