In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import r2_score
%matplotlib inline

In [None]:
# Load the fuel-consumption / CO2 dataset from the Kaggle input directory.
data = pd.read_csv('/kaggle/input/oc2emission/FuelConsumptionCo2.csv')
# Display the column labels so candidate features can be picked by name.
data.columns

Most of the columns in the dataset have no relationship with our target, CO2EMISSIONS.

MODELYEAR : Has only one value 2014, hence can be dropped.


For the remaining categorical variables, use a swarm plot and check how each one relates to CO2EMISSIONS.

In [None]:
# One point per vehicle: emissions grouped by fuel type.
sns.swarmplot(data=data, y="CO2EMISSIONS", x="FUELTYPE");

Notice, how the dots are spread vertically (Range is almost same) for all the different categories.

The difference is clear in the swarm plot of CYLINDERS vs CO2EMISSIONS, which shows a distinct upward trend.

In [None]:
# One point per vehicle: emissions grouped by cylinder count.
sns.swarmplot(data=data, y="CO2EMISSIONS", x="CYLINDERS");

You can also LabelEncode all the categorical variables to some numbers and then use sns.pairplot() to view the relationships.

### Removing redundant columns

In [None]:
# Columns that are not used as regressors in this notebook.
redundant_cols = ['MODELYEAR','MAKE','MODEL','VEHICLECLASS','TRANSMISSION','FUELTYPE']
# Rebind `data` instead of mutating it in place, and tolerate labels that are
# already gone, so re-running this cell does not raise a KeyError.
data = data.drop(columns=redundant_cols, errors='ignore')

In [None]:
# Scatter matrix of every remaining numeric column against every other.
sns.pairplot(data=data);

All our candidate variables show some type of linear relationship with CO2EMISSIONS. But there is a caveat: in linear regression the independent variables should not be correlated with each other. Let's check the correlation of each pair.

In [None]:
# Pairwise correlation table (Pearson is the pandas default, spelled out here).
data.corr(method='pearson')

In [None]:
# Same correlation matrix as above, rendered as a colour map.
sns.heatmap(data=data.corr());

#### Inference from above

ENGINESIZE and CYLINDERS are highly correlated i.e value of 0.934011
Hence, we can select ENGINESIZE


Also 'FUELCONSUMPTION_CITY', 'FUELCONSUMPTION_HWY' are correlated to FUELCONSUMPTION_COMB_MPG


It may seem that FUELCONSUMPTION_COMB_MPG is not correlated with any other variable, but it is negatively (inversely) correlated with FUELCONSUMPTION_COMB.

In [None]:
# Store the reciprocal of the combined-MPG column as a new column so it can be
# compared with CO2EMISSIONS in the scatter plots that follow.
data['INV_FUELCONSUMPTION_COMB_MPG'] = 1/data['FUELCONSUMPTION_COMB_MPG']

In [None]:
# Reciprocal MPG against emissions; label the axes so the figure stands alone.
plt.scatter(data['INV_FUELCONSUMPTION_COMB_MPG'], data['CO2EMISSIONS'])
plt.xlabel('INV_FUELCONSUMPTION_COMB_MPG')
plt.ylabel('CO2EMISSIONS');

In [None]:
# Combined fuel consumption against emissions; label the axes so the figure
# stands alone.
plt.scatter(data['FUELCONSUMPTION_COMB'], data['CO2EMISSIONS'])
plt.xlabel('FUELCONSUMPTION_COMB')
plt.ylabel('CO2EMISSIONS');

Therefore, dropping more redundant cols.

Note: I kept FUELCONSUMPTION_COMB_MPG and removed FUELCONSUMPTION_COMB because FUELCONSUMPTION_COMB_MPG has the opposite relation to ENGINESIZE.

In [None]:
# Remove the columns judged redundant above, plus the temporary reciprocal
# column. Rebinding with errors='ignore' (rather than an in-place drop) keeps
# this cell safe to re-run: already-removed labels no longer raise a KeyError.
data = data.drop(
    columns=['CYLINDERS', 'FUELCONSUMPTION_CITY', 'FUELCONSUMPTION_HWY',
             'INV_FUELCONSUMPTION_COMB_MPG', 'FUELCONSUMPTION_COMB'],
    errors='ignore',
)

In [None]:
# Show which columns are left after the drops above.
data.columns

### CREATING TRAIN TEST data frames

In [None]:
# Target vector first, then the feature frame (everything except the target).
y = data['CO2EMISSIONS']
X = data.drop(columns=['CO2EMISSIONS'])
# Fixed seed so the same rows land in train/test on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
def measure(actual, prediction):
    """Print MAE, MSE, RMSE and R^2 for a set of predictions.

    Parameters
    ----------
    actual : array-like
        Ground-truth target values.
    prediction : array-like
        Model estimates, aligned element-for-element with ``actual``.

    Returns
    -------
    None
        The four metrics are printed, not returned.
    """
    mse = metrics.mean_squared_error(actual, prediction)
    print('Mean Absolute Error:', metrics.mean_absolute_error(actual, prediction))
    print('Mean Squared Error:', mse)
    print('Root Mean Squared Error:', np.sqrt(mse))
    # r2_score is not symmetric: its signature is (y_true, y_pred), so the
    # ground truth has to be passed first.
    print('r2 score:', r2_score(actual, prediction))

# Sanity check: scoring the test targets against themselves.
measure(y_test,y_test)

In [None]:
# Baseline: ordinary least squares on ENGINESIZE alone (fit() returns the
# fitted estimator, so construction and fitting are chained).
regressor = LinearRegression().fit(X_train[['ENGINESIZE']], y_train)
measure(y_test, regressor.predict(X_test[['ENGINESIZE']]))

In [None]:
# Two-feature model: engine size plus combined MPG.
cols = ['ENGINESIZE','FUELCONSUMPTION_COMB_MPG']
regressor = LinearRegression().fit(X_train[cols], y_train)
# Keep the held-out predictions around for the diagnostic plots.
yhat = regressor.predict(X_test[cols])
measure(y_test, yhat)
# Fitted slope per feature, in the order given by `cols`.
print(regressor.coef_)

In [None]:
# Pass the vectors by keyword: current seaborn releases accept only `data`
# positionally for residplot, so the two-positional-argument form fails there.
sns.residplot(x=y_test, y=yhat)
plt.title('Residual plot of YHAT x Y_TEST')
plt.show()

In [None]:
# distplot is deprecated in seaborn; with hist=False it only drew a KDE curve,
# which kdeplot produces directly.
sns.kdeplot(x=y_test, label='Actual values')
sns.kdeplot(x=yhat, label='Predicted values')
plt.title('Comparison of predicted values with actual values')
# The curve labels only become visible once a legend is drawn.
plt.legend()
plt.show()

### Trying Normalization

In [None]:
# Independent (deep) copy so the scaling experiment leaves `data` untouched.
df = data.copy(deep=True)

In [None]:
from sklearn import preprocessing

# Min-max scale every column of df, the CO2EMISSIONS target included.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test rows influence the scaling ranges; confirm this is intended.
x = df.values #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
# Reuse the frame's own labels and index rather than a hard-coded column list,
# which would silently mislabel the data if the upstream selection changed.
df = pd.DataFrame(x_scaled, columns=df.columns, index=df.index)

In [None]:
# Same split as before, this time taken from the min-max scaled frame.
y = df['CO2EMISSIONS']
X = df.drop(columns=['CO2EMISSIONS'])
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Refit the two-feature model on the scaled split for comparison.
cols = ['ENGINESIZE','FUELCONSUMPTION_COMB_MPG']
regressor = LinearRegression().fit(X_train[cols], y_train)
yhat = regressor.predict(X_test[cols])
measure(y_test, yhat)
# Coefficients are now expressed in scaled units.
print(regressor.coef_)

# POLYNOMIAL

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model

# Single-feature inputs (ENGINESIZE only) and targets as NumPy arrays, taken
# from the current train/test split.
train_x = np.asanyarray(X_train[['ENGINESIZE']])
test_x = np.asanyarray(X_test[['ENGINESIZE']])
train_y = np.asanyarray(y_train)
test_y = np.asanyarray(y_test)

# Expand the feature to degree-2 polynomial terms; the expansion is fitted on
# the training rows and only applied to the test rows.
poly = PolynomialFeatures(degree=2)
train_x_poly = poly.fit_transform(train_x)
test_x_poly = poly.transform(test_x)

# Ordinary least squares on the expanded features.
regressor = LinearRegression().fit(train_x_poly, train_y)
yhat = regressor.predict(test_x_poly)
measure(y_test, yhat)
print(regressor.coef_)