In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import statsmodels.api as sm
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')


# Check for the VIF values of the feature variables. 
from statsmodels.stats.outliers_influence import variance_inflation_factor


In [None]:
from sklearn.metrics import mean_squared_error,r2_score

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split

In [None]:
# Load the fuel-consumption CSV (path is relative to the notebook) and
# preview the first rows.
df=pd.read_csv('../input/fuel-consumption/FuelConsumption.csv')
df.head()

In [None]:
# Number of rows for each distinct VEHICLECLASS value.
df.VEHICLECLASS.value_counts()

In [None]:
# Summary statistics for the frame's columns.
df.describe()

In [None]:
# Column dtypes and non-null counts (used to check for missing values).
df.info()

In [None]:
# No missing values found

In [None]:
# (rows, columns) of the raw frame.
df.shape

In [None]:
# Need to look for outliers

In [None]:
# Box plots of the main numeric features, used to eyeball outliers.
# Each series is passed as `x=` explicitly: seaborn deprecated positional
# data vectors and, from 0.12 on, treats the first positional argument as
# `data`, which can change the plot orientation.
box_cols = ['ENGINESIZE', 'CYLINDERS', 'FUELCONSUMPTION_CITY',
            'FUELCONSUMPTION_HWY', 'FUELCONSUMPTION_COMB_MPG']

fig, axes = plt.subplots(2, 3, figsize=(16, 8))
for ax, col in zip(axes.ravel(), box_cols):
    sns.boxplot(x=df[col], ax=ax)
    ax.set_title(col)

# The 2x3 grid has one more panel than there are columns; hide the spare.
for ax in axes.ravel()[len(box_cols):]:
    ax.set_visible(False)

fig.tight_layout()

In [None]:
# Summary statistics for ENGINESIZE alone, to put its box plot in context.
df['ENGINESIZE'].describe()

In [None]:
# There are outliers in several variables; they need outlier treatment.

In [None]:
# Histogram (20 bins) of CO2EMISSIONS, the column later used as the target.
sns.displot(df['CO2EMISSIONS'],bins=20)

In [None]:
# List the available column names before selecting model inputs.
df.columns

In [None]:
# Pairwise scatter plots across the frame's columns.
sns.pairplot(df)

In [None]:
# Correlation heatmap of the numeric columns only. At this point df may
# still hold non-numeric columns (e.g. VEHICLECLASS), and DataFrame.corr()
# raises on those in pandas >= 2.0 instead of silently skipping them.
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True)

In [None]:
# Restrict the frame to the numeric predictors under study plus the
# CO2EMISSIONS target.
df = df.loc[:, [
    'ENGINESIZE',
    'CYLINDERS',
    'FUELCONSUMPTION_CITY',
    'FUELCONSUMPTION_HWY',
    'FUELCONSUMPTION_COMB',
    'FUELCONSUMPTION_COMB_MPG',
    'CO2EMISSIONS',
]]

In [None]:
# 70/30 split with a fixed seed. Both frames still contain the
# CO2EMISSIONS target here; it is popped off in later cells.
X_train, X_test = train_test_split(
    df,
    train_size=0.7,
    test_size=0.30,
    random_state=100,
)

In [None]:
# All training columns (target included), i.e. the columns to be scaled.
cols=list(X_train.columns)
cols

In [None]:
# Dtypes and non-null counts of the training split.
X_train.info()

In [None]:
# Learn the min/max of every column from the training split only, then
# rescale the training data. `cols` still includes CO2EMISSIONS, so the
# target is scaled together with the features.
scaler = MinMaxScaler()
scaler.fit(X_train[cols])
X_train[cols] = scaler.transform(X_train[cols])

In [None]:
# Separate the target: pop() removes CO2EMISSIONS from X_train and returns it.
y_train=X_train.pop('CO2EMISSIONS')

In [None]:
# Baseline scikit-learn linear regression on all scaled features.
lm = LinearRegression()
lm.fit(X_train, y_train)
# fit() returns the estimator itself, so `model` and `lm` are the same object.
model = lm

In [None]:
# Fitted coefficients, one per feature column of X_train.
model.coef_

In [None]:
# Fitted intercept of the scikit-learn model.
model.intercept_

In [None]:
# statsmodels' OLS does not add an intercept on its own, so add an explicit
# constant column to the design matrix.
X_train_sm=sm.add_constant(X_train)


In [None]:
# Fit OLS on the full feature set and print the regression summary
# (coefficients, p-values, R-squared). The former bare `sm_lr.params` line
# was dropped: mid-cell expressions are never displayed, and the summary
# already lists the coefficients.
sm_lr = sm.OLS(y_train, X_train_sm).fit()

print(sm_lr.summary())

In [None]:
# FUELCONSUMPTION_HWY has a high p-value, i.e. it is not statistically
# significant for the model, so drop it.
# Reassigning with errors='ignore' (instead of inplace=True) keeps this cell
# safe to re-run: a second run is a no-op rather than a KeyError.
X_train = X_train.drop(columns=['FUELCONSUMPTION_HWY'], errors='ignore')

In [None]:
# Refit OLS on the reduced feature set and print the summary. The former
# bare `sm_lr.params` line was dropped: mid-cell expressions are never
# displayed, and the summary already lists the coefficients.
X_train_sm = sm.add_constant(X_train)

sm_lr = sm.OLS(y_train, X_train_sm).fit()

print(sm_lr.summary())

In [None]:
def VIF_calculator(X):
    """Return the variance inflation factor (VIF) of every column in ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix (predictors only, no target column). The VIFs are
        computed on the columns exactly as passed; no constant is added.

    Returns
    -------
    pandas.DataFrame
        One row per feature with columns ``Features`` and ``VIF`` (rounded
        to two decimals), sorted by ``VIF`` in descending order so the most
        collinear feature comes first.
    """
    vif = pd.DataFrame({
        'Features': X.columns,
        'VIF': [variance_inflation_factor(X.values, i)
                for i in range(X.shape[1])],
    })
    vif['VIF'] = vif['VIF'].round(2)
    return vif.sort_values(by='VIF', ascending=False).reset_index(drop=True)


# Check multicollinearity among the remaining predictors.
VIF_calculator(X_train)

In [None]:
# FUELCONSUMPTION_CITY has a high VIF, i.e. it is highly correlated with the
# other independent variables, so drop it.
# Reassigning with errors='ignore' (instead of inplace=True) keeps this cell
# safe to re-run: a second run is a no-op rather than a KeyError.
X_train = X_train.drop(columns=['FUELCONSUMPTION_CITY'], errors='ignore')

In [None]:
# Refit OLS on the reduced feature set and print the summary. The former
# bare `sm_lr.params` line was dropped: mid-cell expressions are never
# displayed, and the summary already lists the coefficients.
X_train_sm = sm.add_constant(X_train)

sm_lr = sm.OLS(y_train, X_train_sm).fit()

print(sm_lr.summary())



In [None]:
# Re-check the VIFs now that FUELCONSUMPTION_CITY has been dropped.
VIF_calculator(X_train)

In [None]:
# Even though the p-values look good, the VIFs tell a different story:
# ENGINESIZE has a high VIF, i.e. it is highly correlated with the other
# independent variables, so drop it.
# Reassigning with errors='ignore' (instead of inplace=True) keeps this cell
# safe to re-run: a second run is a no-op rather than a KeyError.
X_train = X_train.drop(columns=['ENGINESIZE'], errors='ignore')

In [None]:
# Refit OLS on the reduced feature set and print the summary. The former
# bare `sm_lr.params` line was dropped: mid-cell expressions are never
# displayed, and the summary already lists the coefficients.
X_train_sm = sm.add_constant(X_train)

sm_lr = sm.OLS(y_train, X_train_sm).fit()

print(sm_lr.summary())


In [None]:
# Re-check the VIFs now that ENGINESIZE has been dropped.
VIF_calculator(X_train)

In [None]:
# FUELCONSUMPTION_COMB has a high VIF, i.e. it is highly correlated with the
# other independent variables, so drop it.
# Reassigning with errors='ignore' (instead of inplace=True) keeps this cell
# safe to re-run: a second run is a no-op rather than a KeyError.
X_train = X_train.drop(columns=['FUELCONSUMPTION_COMB'], errors='ignore')

In [None]:
# Refit OLS on the reduced feature set and print the summary; this is the
# model later used for prediction. The former bare `sm_lr.params` line was
# dropped: mid-cell expressions are never displayed, and the summary already
# lists the coefficients.
X_train_sm = sm.add_constant(X_train)

sm_lr = sm.OLS(y_train, X_train_sm).fit()

print(sm_lr.summary())

In [None]:
# Final VIF check now that FUELCONSUMPTION_COMB has been dropped.
VIF_calculator(X_train)

In [None]:
# Rebind `cols` to the test frame's columns (CO2EMISSIONS is still among
# them at this point), i.e. the columns to be scaled.
cols=list(X_test.columns)
cols

In [None]:

# Apply the scaler fitted on the training split to the test columns
# (transform only; it is not re-fitted on test data).
X_test[cols]=scaler.transform(X_test[cols])

In [None]:
# Preview the scaled test rows.
X_test.head()

In [None]:
# Keep exactly the predictors that survived feature elimination on the
# training set, in the same column order the final model was fitted on,
# plus the target. Deriving the list from X_train avoids a hard-coded copy
# that could drift from the fitted model.
X_test = X_test[list(X_train.columns) + ['CO2EMISSIONS']]

In [None]:
# Separate the target: pop() removes CO2EMISSIONS from X_test and returns it.
y_test=X_test.pop('CO2EMISSIONS')

In [None]:
# Add the constant column to the test features, as was done for training,
# then predict with the last fitted OLS model.
X_test_sm=sm.add_constant(X_test)

y_pred=sm_lr.predict(X_test_sm)



In [None]:
# Show the predicted values for the test rows.
y_pred

In [None]:
# R-squared of the predictions on the held-out test split.
r2_score(y_test,y_pred)

In [None]:
# Actual vs. predicted CO2 emissions on the test set (both on the scaled
# axis, since the target was scaled together with the features).
# x/y are passed as keywords: seaborn >= 0.12 accepts only `data`
# positionally, so the two-positional-argument form raises a TypeError.
ax = sns.scatterplot(x=y_test, y=y_pred)
ax.set(xlabel='Actual CO2EMISSIONS (scaled)',
       ylabel='Predicted CO2EMISSIONS (scaled)');

# Error terms have almost a constant variance

In [None]:
# Distribution of the test-set residuals (actual - predicted).
# histplot replaces the deprecated distplot; stat='density' with kde=True
# gives a density histogram with a KDE overlay, as distplot drew by default.
residuals = y_test - y_pred
sns.histplot(residuals, kde=True, stat='density')
# Error terms are normally distributed