## Importing the required libraries and loading the CSV file into a Pandas dataframe

In [13]:
import numpy as np
import pandas as pd

df = pd.read_csv('/home/shardul/PycharmProjects/GDSCML/Fuel.csv')

## Splitting the data into training and test 

In [14]:
df = df.sample(frac=1) #Shuffles the dataframe
train_ratio = 0.8    #Percentage of training data
total_rows = df.shape[0]
train_size = int(total_rows * train_ratio)
train = df[0:train_size]
test = df[train_size:]

## Analyzing the correlation between the predictor variables

In [15]:
print(df.iloc[:, np.r_[4, 5, 8:13]].corr())

                          ENGINESIZE  CYLINDERS  FUELCONSUMPTION_CITY  \
ENGINESIZE                  1.000000   0.934011              0.832225   
CYLINDERS                   0.934011   1.000000              0.796473   
FUELCONSUMPTION_CITY        0.832225   0.796473              1.000000   
FUELCONSUMPTION_HWY         0.778746   0.724594              0.965718   
FUELCONSUMPTION_COMB        0.819482   0.776788              0.995542   
FUELCONSUMPTION_COMB_MPG   -0.808554  -0.770430             -0.935613   
CO2EMISSIONS                0.874154   0.849685              0.898039   

                          FUELCONSUMPTION_HWY  FUELCONSUMPTION_COMB  \
ENGINESIZE                           0.778746              0.819482   
CYLINDERS                            0.724594              0.776788   
FUELCONSUMPTION_CITY                 0.965718              0.995542   
FUELCONSUMPTION_HWY                  1.000000              0.985804   
FUELCONSUMPTION_COMB                 0.985804              1

# All predictors have high correlation with CO2 emissions

## Creating training and test NumPy arrays of the predictor and target variables

In [16]:
x_train = train.iloc[:, np.r_[4, 5, 8:12]].to_numpy()
y_train = train.iloc[:, 12].to_numpy()

x_test = test.iloc[:, np.r_[4, 5, 8:12]].to_numpy()
y_test = test.iloc[:, 12].to_numpy()

## Function to add a column of 1s to the features vector to account for the intercept during matrix multiplication

In [17]:
def generateXvector(X):
    vector = np.c_[np.ones((len(X), 1)), X]
    return vector

## Function to generate random initial parameters vector

In [18]:
def theta_init(X):
    theta = np.random.randn(len(X[0]) + 1, 1)
    return theta

## Gradient descent function

In [19]:
def Gradient_Descent(X, y, learningrate, epochs):
    y_new = np.reshape(y, (len(y), 1))
    vectorX = generateXvector(X)
    theta = theta_init(X)
    m = len(X)
    for i in range(epochs):
        gradients = 2 / m * vectorX.T.dot(vectorX.dot(theta) - y_new) #Gradient vector of cost function
        theta = np.float128(theta - learningrate * gradients) #Updates parameters for each iteration

    return theta

## Scaling the data to remove the overflow errors occuring with unscaled data

In [20]:
from sklearn.preprocessing import StandardScaler
X_transform_train=StandardScaler().fit_transform(x_train)
X_transform_test=StandardScaler().fit_transform(x_test)

## Calling the gradient descent function and storing the parameters

In [21]:
learningrate = 0.0001
epochs = 100000
theta = Gradient_Descent(X_transform_train, y_train, learningrate, epochs)

## Scikit-Learn Linear Regression to compare it with my model

In [22]:
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(X_transform_train, y_train)
print(lin_reg.intercept_, lin_reg.coef_)

256.12661195779606 [ 13.21191318  14.31495668 -67.68265118 -18.4362907   93.57674929
 -29.18587943]


## R-squared and RMSE for both models

In [23]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
print("R-squared of my model: ", r2_score(y_test, X_transform_test.dot(theta[1:]) + theta[0]))
print("RMSE of my model: ", np.sqrt(mean_squared_error(y_test, X_transform_test.dot(theta[1:]) + theta[0])))
print("Scikit-Learn R-squared: ", r2_score(y_test, lin_reg.predict(X_transform_test)))
print("Scikit-Learn RMSE: ", np.sqrt(mean_squared_error(y_test, lin_reg.predict(X_transform_test))))

R-squared of my model:  0.8865334319976048
RMSE of my model:  20.382660249297304482
Scikit-Learn R-squared:  0.8834353558272443
Scikit-Learn RMSE:  20.65904906377307
