In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# Loading and Pre-Processing data


In [2]:
# Load the raw CSV into a dataframe.
df = pd.read_csv("/content/drive/MyDrive/Fuel.csv")

# Normalise every column name to lower case.
df.columns = [name.lower() for name in df.columns]

# Remove make, model and model year: the remaining columns are treated as the more
# technical components that are shared across makes and models.
df = df.drop(columns=['make', 'model', 'modelyear'])

df.head()

Unnamed: 0,vehicleclass,enginesize,cylinders,transmission,fueltype,fuelconsumption_city,fuelconsumption_hwy,fuelconsumption_comb,fuelconsumption_comb_mpg,co2emissions
0,COMPACT,2.0,4,AS5,Z,9.9,6.7,8.5,33,196
1,COMPACT,2.4,4,M6,Z,11.2,7.7,9.6,29,221
2,COMPACT,1.5,4,AV7,Z,6.0,5.8,5.9,48,136
3,SUV - SMALL,3.5,6,AS6,Z,12.7,9.1,11.1,25,255
4,SUV - SMALL,3.5,6,AS6,Z,12.1,8.7,10.6,27,244


In [3]:
# Label-encode the categorical columns: every distinct value is replaced by an integer code
# (e.g. the vehicle classes compact, SUV, wagon, ... become 0, 1, 2, ...).
# This may introduce an implicit ordinal relation between the categories, but one-hot
# encoding would introduce too many columns for our data.
for column in ('vehicleclass', 'transmission', 'fueltype'):
  df[column] = df[column].astype('category').cat.codes

df.head()

Unnamed: 0,vehicleclass,enginesize,cylinders,transmission,fueltype,fuelconsumption_city,fuelconsumption_hwy,fuelconsumption_comb,fuelconsumption_comb_mpg,co2emissions
0,0,2.0,4,10,3,9.9,6.7,8.5,33,196
1,0,2.4,4,20,3,11.2,7.7,9.6,29,221
2,0,1.5,4,17,3,6.0,5.8,5.9,48,136
3,11,3.5,6,11,3,12.7,9.1,11.1,25,255
4,11,3.5,6,11,3,12.1,8.7,10.6,27,244


In [4]:
# Add a constant column of ones; its coefficient in B plays the role of the intercept.
df["intercept"] = 1

# Move 'intercept' to the front by NAME instead of by hard-coded positions.
# A positional reorder such as iloc[:, [10, 0, ..., 9]] silently depends on the exact
# number/order of columns and shuffles the wrong column to the front when the cell is
# re-run; selecting by name gives the same layout on every run.
df = df[["intercept"] + [col for col in df.columns if col != "intercept"]]

# x: model inputs (intercept + features), y: target kept as a one-column dataframe.
x = df.drop(columns='co2emissions')
y = df[['co2emissions']]

# Splitting the data into train and test set (fixed random_state for reproducibility)
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=1) #x- inputs, y- outputs

We add a column of ones to the dataframe so that, when X is multiplied by the coefficient matrix B, the first coefficient $B_0$ is multiplied by 1 and therefore acts as the intercept.

a, b, c ... i represent the different features (columns) such as vehicleclass, enginesize, etc.
$$
X = \begin{bmatrix}
1 & a_1 & .. & i_1\\
1 & a_2 & .. & i_2\\
.. & .. & .. & .. \\
1 & a_n & .. & i_n\\
\end{bmatrix},
B = \begin{bmatrix}
B_0\\
B_1\\
B_2\\
..\\
B_9\\
\end{bmatrix}
$$
$$
$$
$$
XB = \begin{bmatrix}
B_0 + B_1a_1 .. + B_9i_1\\
B_0 + B_1a_2 .. + B_9i_2\\
..\\
B_0 + B_1a_n .. + B_9i_n\\
\end{bmatrix}
$$

XB is also our predictions matrix

# Linear regression implementation (Using the Matrix approach)


In [5]:
X = x_train.copy()
X_T = X.T
Y = y_train.copy()

# Coefficient matrix: B = (X^T X)^{-1} X^T Y.
# Instead of forming the inverse explicitly, solve the equivalent linear system
# (X^T X) B = X^T Y. This gives the same B but is faster and numerically more accurate
# than multiplying by an explicit inverse.
B = pd.DataFrame(
    np.linalg.solve((X_T @ X).to_numpy(dtype=float), (X_T @ Y).to_numpy(dtype=float)),
    index=X.columns,    # one row per input column (intercept first)
    columns=Y.columns,  # one column per target
)

In the matrix approach we work backwards from the predictions XB to find the coefficient matrix B, which can then be applied to any other data set (for example the test set).

The error (residual) vector is given by:
$$e(B) = Y - XB$$

To find the mean squared error we square each residual, sum the squares and divide by n:
$$
\ \\
MSE(B) = \frac{1}{n}e^Te \qquad (1)\\
$$
$$
\frac{1}{n}\begin{bmatrix}
e_1 & e_2 & .. & e_n\\
\end{bmatrix}\begin{bmatrix}
e_1 \\
e_2 \\
..\\
e_n
\end{bmatrix}
$$
On expanding (1) we get
$$
\ \\
MSE(B) = \frac{1}{n}(Y-XB)^T(Y − XB)\\
\ \\
\ \\
MSE(B) = \frac{1}{n}(Y^TY - 2B^TX^TY + B^TX^TXB) \qquad (2)
\ \\
$$

To find the best-fit line we must minimise the MSE. To do that we take the gradient of the MSE with respect to B and set it equal to 0:

$$
∇MSE(B) = \frac{1}{n}(∇Y^TY − 2∇B^TX^TY + ∇B^TX^TXB)\\
\ \\
\ \\
= \frac{1}{n}(0 − 2X^TY + 2X^TXB) \\
\ \\
\ \\
= \frac{2}{n}(X^TXB - X^TY) = 0
$$

This gives,
$$
\ \\X^TXB − X^TY = 0$$

and finally,
$$
\ \\
B = (X^TX)^{-1}X^TY \qquad (3)
\ \\
$$



*We use Eq (3) in the above code to find Coefficient matrix and finally the best fit line.*

In [6]:
# Predictions on the training inputs: XB.
predictions = np.dot(x_train, B)

# Flatten the predictions into a 1-D array.
pred_train = np.ravel(predictions)

In [7]:
# Wrap the flattened training predictions in a single-column dataframe for display.
pred_train_df = pd.DataFrame({'predictions': pred_train})
pred_train_df

Unnamed: 0,predictions
0,347.122747
1,299.319693
2,204.995786
3,187.046367
4,258.211739
...,...
795,209.787778
796,181.326720
797,301.018466
798,235.033800


# R-Square

In [8]:
def r2_metric(labels, predictions):
  """Return the coefficient of determination, R^2 = 1 - RSS/TSS.

  Args:
    labels: observed target values (list, array, Series or DataFrame).
    predictions: predicted values, with the same number of elements as `labels`.

  Returns:
    R^2 as a float: 1 minus the ratio of the residual sum of squares (RSS)
    to the total sum of squares around the mean of `labels` (TSS).

  Raises:
    ValueError: if `labels` and `predictions` hold a different number of values.
  """

  # Flatten both inputs first. Without this, a column vector of shape (n, 1) minus a
  # flat vector of shape (n,) broadcasts to an (n, n) matrix and the score is silently wrong.
  label_array = np.asarray(labels, dtype=float).ravel()
  pred_array = np.asarray(predictions, dtype=float).ravel()

  if label_array.shape != pred_array.shape:
    raise ValueError(
        "labels and predictions must contain the same number of values, got "
        f"{label_array.size} and {pred_array.size}"
    )

  RSS = ((label_array - pred_array)**2).sum()          # squared error of the model
  TSS = ((label_array - label_array.mean())**2).sum()  # squared error of the mean line

  r2 = 1-(RSS/TSS)

  return r2

R-Square is basically a measure of how much reduction in error happens in our best fit line compared to if we just take the mean of CO2 emissions as our predictions.

$$
R^2 = \frac{(MSE\ of\ y = {\bar{y}}\ line) - (MSE\ of\ best\ fit\ line)}{(MSE\ of\ y = {\bar{y}}\ line)}\\
$$

$$
\ \\
R^2 = 1 - \frac{RSS}{TSS}
\ \\
$$

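Here, \\
RSS = residual sum of squares, i.e. n × (MSE of the best fit line) \\
TSS = total sum of squares, i.e. n × (MSE of the mean line) \\
The factor n cancels in the ratio, so the two expressions for $R^2$ above are equivalent.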


In [9]:
# R^2 of the matrix-approach model on the training data.
r2_train = r2_metric(labels=Y, predictions=pred_train_df)

print(r2_train)

0.9043347379081124


# Running the model with Test Data

In [10]:
# Apply the coefficients fitted on the training data to the held-out test inputs.
pred_test = np.dot(x_test, B)

# Wrap the test predictions in a dataframe for display.
pred_test_df = pd.DataFrame(data=pred_test)
pred_test_df

Unnamed: 0,0
0,275.515286
1,197.677446
2,190.955384
3,162.076657
4,321.224587
...,...
262,309.615065
263,339.931802
264,215.979210
265,265.443531


In [11]:
# R^2 of the matrix-approach model on the test data.
r2_test = r2_metric(labels=y_test, predictions=pred_test_df)
r2_test

0.8921142876942829

# Comparing my model with sklearn model

In [12]:
#sklearn model
from sklearn.linear_model import LinearRegression
lr = LinearRegression()

In [13]:
# Fit the scikit-learn model on exactly the same training split as the matrix model.
lr.fit(X=x_train, y=y_train)

In [14]:
# c: intercept fitted by scikit-learn, m: one coefficient per input column.
c, m = lr.intercept_, lr.coef_
(c, m)

(array([195.27252602]),
 array([[  0.        ,   1.08197333,  10.40528847,   4.63417133,
          -0.06620878,   8.54085262, -12.33461492,  -9.45162628,
          26.51165587,  -3.00964751]]))

In [15]:
# scikit-learn predictions for the training and the test inputs.
pred_train_skl, pred_test_skl = (lr.predict(inputs) for inputs in (x_train, x_test))

In [16]:
# R^2 of the scikit-learn model on the training data.
r2_train_skl = r2_metric(labels=Y, predictions=pred_train_skl)
r2_train_skl

0.9043347379081124

In [17]:
# R^2 of the scikit-learn model on the test data.
r2_test_skl = r2_metric(labels=y_test, predictions=pred_test_skl)
r2_test_skl

0.8921142876980942

# References
All the math here was taken from \\
- [Lecture 13: Simple Linear Regression in Matrix
Format](https://www.stat.cmu.edu/~cshalizi/mreg/15/lectures/13/lecture-13.pdf)
- [R-Squared](https://www.investopedia.com/terms/r/r-squared.asp)

Math symbols
 - [LaTeX for colab](https://colab.research.google.com/github/EPS-Libraries-Berkeley/volt/blob/main/LaTeX/Equations_and_Formulas.ipynb#scrollTo=vGj8REy3bWuR)