In [None]:
#importing data visualization and manipulation libraries

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

#importing machine learning libraries

from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.metrics import r2_score,mean_squared_error
from sklearn.model_selection import train_test_split

In [None]:
#read the vehicle CO2 emissions CSV from the Kaggle input directory into a DataFrame

df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/co2-emission-by-vehicles/CO2 Emissions_Canada.csv"
)

In [None]:
#count the missing values in each column (the author did not expect any)

df.isna().sum()

In [None]:
#give the target column a shorter name, since it is referenced throughout the notebook

df = df.rename(columns={'CO2 Emissions(g/km)': 'CO2_emission'})

In [None]:
#modelling copy: later cells drop and rename columns on df2 while the plots keep using df
df2 = df.copy(deep=True)

In [None]:
#remove the combined mpg column from the modelling copy
df2 = df2.drop(columns=['Fuel Consumption Comb (mpg)'])

In [None]:
#shorten the three fuel-consumption column names in a single rename
df2 = df2.rename(
    columns={
        'Fuel Consumption Comb (L/100 km)': 'Fuel_Cons_comb_(l/100km)',
        'Fuel Consumption Hwy (L/100 km)': 'Fuel_Cons_hwy_(l/100km)',
        'Fuel Consumption City (L/100 km)': 'Fuel_Cons_city_(l/100km)',
    }
)

In [None]:
#use an attribute-friendly name for the fuel type column
df2 = df2.rename(columns={'Fuel Type': 'Fuel_type'})

In [None]:
#updated dataset
#NOTE(review): this displays `df`, which so far only carries the CO2_emission rename;
#the column drop and the other renames in the preceding cells were applied to `df2` - confirm which frame was meant

df

In [None]:
#exploring the dataset: number of vehicles per fuel type code

df.loc[:, 'Fuel Type'].value_counts()

In [None]:
#number of vehicles per transmission code
df.loc[:, 'Transmission'].value_counts()

In [None]:
#correlation of each numeric column with the CO2 target, sorted from lowest to highest
#only numeric columns are used: df still holds text columns (Make, Model, ...) and
#recent pandas versions raise on those instead of silently skipping them

df.select_dtypes(include='number').corr()['CO2_emission'].sort_values()

In [None]:
#heatmap of the pairwise correlations between the numeric columns

plt.figure(figsize = (8,6))
#restrict to numeric columns so corr() does not fail on the text columns
corr = df.select_dtypes(include='number').corr()
#the mask is all False, so every cell is drawn; the builtin bool replaces the removed np.bool alias
sns.heatmap(corr, mask=np.zeros_like(corr, dtype=bool), cmap = 'Blues', square = True)

In [None]:
#pairwise scatter plots and per-column distributions for a quick visual overview

sns.pairplot(data=df)

In [None]:
#number of vehicles per manufacturer

make_counts = df['Make'].value_counts()   #compute the counts once and reuse them
mkI = make_counts.index
mkV = make_counts.values
plt.figure(figsize = (10,8))
#x and y are passed by keyword: newer seaborn releases reject them as positional arguments
sns.barplot(x=mkI, y=mkV)
plt.xticks(rotation='vertical')

In [None]:
#number of vehicles per vehicle class, drawn as horizontal bars
class_counts = df['Vehicle Class'].value_counts()   #compute the counts once and reuse them
mkI = class_counts.index
mkV = class_counts.values
plt.figure(figsize = (10,8))
#counts go on x and class names on y; keywords are required by newer seaborn releases
sns.barplot(x=mkV, y=mkI, orient = 'h', palette='Spectral')
plt.xticks(rotation='vertical')

In [None]:
#CO2 emission distribution per vehicle class
#(author's reading of the plot: vans typically emit more CO2 than the other classes)

plt.figure(figsize=(10, 8))
sns.boxplot(data=df, x="Vehicle Class", y="CO2_emission")
plt.xticks(rotation='vertical')

In [None]:
#one horizontal boxplot per fuel-consumption measure (city, highway, combined)
#the series is passed as x= explicitly: newer seaborn releases treat the first
#positional argument as `data`, which would change the orientation of the box
sns.boxplot(x=df['Fuel Consumption City (L/100 km)'], color = "red")
plt.show()
sns.boxplot(x=df['Fuel Consumption Hwy (L/100 km)'])
plt.show()
sns.boxplot(x=df['Fuel Consumption Comb (L/100 km)'], color = 'green')
plt.show()

In [None]:
#CO2 emission distribution per fuel type
#the box order is pinned to the label map so each readable label is guaranteed to sit
#under the right box, instead of relying on the order the codes first appear in the data
#NOTE(review): code meanings (Z/D/X/E/N) follow the dataset's data dictionary - confirm
fuel_labels = {
    'Z': 'Premium Gasoline',
    'D': 'Diesel',
    'X': 'Regular Gasoline',
    'E': 'Ethanol',
    'N': 'Natural Gas',
}
plt.figure(figsize = (10,8))
sns.boxplot(x = 'Fuel Type' , y = 'CO2_emission', data = df, order = list(fuel_labels))
plt.xticks(range(len(fuel_labels)), list(fuel_labels.values()))
plt.show()

In [None]:
#CO2 emission per cylinder count
#catplot is figure-level and creates its own figure, so a preceding plt.figure() only
#leaves an empty extra figure; height/aspect give the same 10x8 size instead
sns.catplot(x = 'Cylinders' , y = 'CO2_emission', data = df, height = 8, aspect = 1.25)
plt.show()

In [None]:
#combined fuel economy (mpg) per fuel type
#author's reading: "Ethanol typically is the most efficient fuel type"
#NOTE(review): confirm that claim against the rendered plot
#the box order is pinned to the label map so the readable labels cannot drift from the boxes
#NOTE(review): code meanings (Z/D/X/E/N) follow the dataset's data dictionary - confirm
fuel_labels = {
    'Z': 'Premium Gasoline',
    'D': 'Diesel',
    'X': 'Regular Gasoline',
    'E': 'Ethanol',
    'N': 'Natural Gas',
}

plt.figure(figsize = (10,8))
sns.boxplot(y = 'Fuel Consumption Comb (mpg)', x = 'Fuel Type', data = df, palette = 'muted', order = list(fuel_labels))
plt.xticks(range(len(fuel_labels)), list(fuel_labels.values()))

In [None]:
#distribution of combined fuel economy (mpg): density histogram with a KDE curve
#histplot(kde=True, stat='density') is the documented replacement for the deprecated distplot
plt.figure(figsize = (10,8))
sns.histplot(df['Fuel Consumption Comb (mpg)'], bins = 10, color = 'purple', kde = True, stat = 'density')

In [None]:
#remove the Make, Model and Vehicle Class columns from the modelling copy in one call
df2 = df2.drop(columns=['Make', 'Model', 'Vehicle Class'])

In [None]:
#display the modelling frame after the column drops above
df2

In [None]:
from sklearn.preprocessing import LabelEncoder

#replace the text codes in each categorical column with integer labels;
#a fresh encoder is fitted per column and `encode` ends up fitted on Transmission
for column in ('Fuel_type', 'Transmission'):
    encode = LabelEncoder()
    df2[column] = encode.fit_transform(df2[column])

In [None]:
#assigning independent (x) and dependent (y) variables:
#every column except the last is a feature, the last column is the value to predict

features = df2.iloc[:, :-1]
target = df2.iloc[:, -1]
x = features.to_numpy()
y = target.to_numpy()

In [None]:
#hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable

xTrain, xTest, yTrain, yTest = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
#linear regression model; the printed value is the R^2 score on the test split
#(the author reports roughly 0.85)
#a hand-written linear regression at the end of the notebook is compared against this model

reg = LinearRegression().fit(xTrain, yTrain)
regYpred = reg.predict(xTest)
print(reg.score(xTest, yTest))

In [None]:
#show the fitted coefficients and intercept so they can be compared with the from-scratch model

sk_coef, sk_intercept = reg.coef_, reg.intercept_
print('regression coefficient', sk_coef, 'intercept', sk_intercept)

In [None]:
#actual test targets vs. linear-regression predictions on the same test rows
f, ax = plt.subplots(figsize=(10, 8))

sns.stripplot(y = yTest.flatten(), color = 'darkmagenta', alpha = 0.7, label = 'Test Data', ax = ax)
#regYpred holds predictions for xTest, so it is labelled as predictions rather than training data
sns.stripplot(y = regYpred.flatten(), color = 'lawngreen', alpha = 0.7, label = 'Predicted Data', ax = ax)
plt.legend()
plt.show()

In [None]:
#histograms of predicted vs. actual values (the same pair of plots is drawn for all three models)
#plt.title must be called: assigning a string to it replaces the pyplot function and sets no title
#histplot(kde=True, stat='density') is the documented replacement for the deprecated distplot

sns.histplot(regYpred, bins = 20, color = 'red', kde = True, stat = 'density')
plt.title('Predicted values')
plt.show()
sns.histplot(yTest, bins = 20, kde = True, stat = 'density')
plt.title('Actual values')
plt.show()

In [None]:
#scatter of combined fuel consumption against CO2 emission with the fitted regression line

sns.regplot(
    data=df,
    x='Fuel Consumption Comb (L/100 km)',
    y='CO2_emission',
    color='blue',
)

In [None]:
#decision tree regressor; the printed score is R^2 on the test split
#(the author reports roughly 0.88, higher than the linear model)

from sklearn.tree import DecisionTreeRegressor

dtr = DecisionTreeRegressor(random_state=0).fit(xTrain, yTrain)
dtrYpred = dtr.predict(xTest)
dtrScore = r2_score(y_true=yTest, y_pred=dtrYpred)
print('Score: %.3f' % dtrScore)

In [None]:
#actual test targets vs. decision-tree predictions on the same test rows
f, ax = plt.subplots(figsize=(10, 8))

sns.stripplot(y = yTest.flatten(), color = 'darkmagenta', alpha = 0.7, label = 'Test Data', ax = ax)
#dtrYpred holds predictions for xTest, so it is labelled as predictions rather than training data
sns.stripplot(y = dtrYpred.flatten(), color = 'lawngreen', alpha = 0.7, label = 'Predicted Data', ax = ax)
plt.legend()
plt.show()

In [None]:
#decision tree: distribution of predicted values, then of the actual test values
#histplot(kde=True, stat='density') is the documented replacement for the deprecated distplot
sns.histplot(dtrYpred, bins = 20, color = 'red', kde = True, stat = 'density')
plt.title('Predicted values')
plt.show()
sns.histplot(yTest, bins = 20, kde = True, stat = 'density')
plt.title('Actual values')
plt.show()

In [None]:
#random forest regressor; the printed score is R^2 on the test split
#(the author reports roughly 0.89, the best of the three models)
#n_estimators = 20 was picked by the author after looping over candidate values

from sklearn.ensemble import RandomForestRegressor

rfr = RandomForestRegressor(n_estimators=20, random_state=0).fit(xTrain, yTrain)
rfrYpred = rfr.predict(xTest)
rfrScore = r2_score(y_true=yTest, y_pred=rfrYpred)
print('Score: %.3f' % rfrScore)

In [None]:
#actual test targets vs. random-forest predictions on the same test rows
f, ax = plt.subplots(figsize=(10, 8))

sns.stripplot(y = yTest.flatten(), color = 'darkmagenta', alpha = 0.7, label = 'Test Data', ax = ax)
#plot the random forest's own predictions (rfrYpred), not the linear model's regYpred
sns.stripplot(y = rfrYpred.flatten(), color = 'lawngreen', alpha = 0.7, label = 'Predicted Data', ax = ax)
plt.legend()
plt.show()

In [None]:
#random forest: distribution of predicted values, then of the actual test values
#histplot(kde=True, stat='density') is the documented replacement for the deprecated distplot
sns.histplot(rfrYpred, bins = 20, color = 'red', kde = True, stat = 'density')
plt.title('Predicted values')
plt.show()
sns.histplot(yTest, bins = 20, kde = True, stat = 'density')
plt.title('Actual values')
plt.show()

Below is my attempt to build my own linear regression model from scratch, to calculate the coefficient (slope) and the intercept of the regression line

In [None]:
#calculating mean of x and y values

X,Y = xTrain,yTrain
xMean = np.mean(X)
yMean = np.mean(Y)

In [None]:
#calculating variance and covariance

covar = 0
var = 0
for i in range (len(X)):
    covar += (X[i] - xMean) * (Y[i] - yMean)
    var += (X[i]-xMean) ** 2      

In [None]:
#computing coefficient and intercepts based on previous calculations

coeff = covar/var
intercept = yMean - (coeff * xMean)

print('intercept is',intercept, 'coefficient is', coeff)

This was my first attempt to build a linear regression model from scratch. Any and all critiques welcomed!