# CO2 Emission by Vehicles

This dataset captures how the CO2 emissions of a vehicle vary with its different features. The data was taken from the Government of Canada's official open data website. This is a compiled version containing data collected over a period of 7 years.
There are a total of 7385 rows and 12 columns. A few abbreviations have been used to describe the features.


## The Data 

### Model

- 4WD/4X4 = Four-wheel drive
- AWD = All-wheel drive
- FFV = Flexible-fuel vehicle
- SWB = Short wheelbase
- LWB = Long wheelbase
- EWB = Extended wheelbase

### Transmission

- A = Automatic
- AM = Automated manual
- AS = Automatic with select shift
- AV = Continuously variable
- M = Manual
- 3 - 10 = Number of gears

### Fuel type

- X = Regular gasoline
- Z = Premium gasoline
- D = Diesel
- E = Ethanol (E85)
- N = Natural gas

### Fuel Consumption

City and highway fuel consumption ratings are shown in litres per 100 kilometres (L/100 km). The combined rating (55% city, 45% hwy) is shown in L/100 km and in miles per gallon (mpg).

### CO2 Emissions

The tailpipe emissions of carbon dioxide (in grams per kilometre) for combined city and highway driving.


## Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## Get the data

In [None]:
# Load the dataset from a CSV file in the current working directory into a DataFrame.
emissions = pd.read_csv('CO2 Emissions_Canada.csv')

In [None]:
# Show the column labels of the loaded frame.
emissions.columns

In [None]:
# Print a per-column summary (dtype and non-null count) of the frame.
emissions.info()

In [None]:
# Preview the first rows of the frame.
emissions.head()

In [None]:
# Descriptive statistics for the numeric columns.
emissions.describe()

In [None]:
# Pairwise scatter plots (with per-variable distributions on the diagonal)
# for the numeric columns.
sns.pairplot(emissions)

In [None]:
# Histogram of the target variable.
sns.histplot(emissions['CO2 Emissions(g/km)'])
# Observation: the distribution is not balanced and is not close to a normal distribution.

In [None]:
# Correlation heatmap restricted to numeric columns (non-numeric columns are ignored).
sns.heatmap(emissions.corr(numeric_only=True))

# Observation: 'Fuel Consumption Comb (mpg)' is inversely related to CO2 emissions.

In [None]:
# Remove the combined and highway L/100 km consumption columns in one call,
# modifying the frame in place.
emissions.drop(
    columns=[
        'Fuel Consumption Comb (L/100 km)',
        'Fuel Consumption Hwy (L/100 km)',
    ],
    inplace=True,
)

In [None]:
# Number of distinct values in each column.
emissions.nunique()

## Handling missing data and possible data transformations
- Remove missing values, outliers, and unnecessary rows/ columns
- Check and impute null values
- Check Imbalanced data
- Re-indexing and reformatting our data

### 1. Missing Values

In [None]:
# Boolean mask marking every missing cell; drawn as a heatmap and then
# totalled per column.
missing_mask = emissions.isna()
sns.heatmap(missing_mask, cmap='viridis', cbar=False, yticklabels=False)
missing_mask.sum()

Assim, podemos concluir que não existem missing values neste dataset

## Handling Categorical Features

In [None]:
# Number of distinct (non-missing) manufacturers.
emissions['Make'].nunique()

In [None]:
def check_model(model):
    """Reduce a raw model name to the first known abbreviation it contains.

    The name is split on whitespace and scanned left to right; the first
    token that is one of 4WD, 4X4, AWD, FFV, SWB, LWB or EWB decides the
    result. "4X4" is reported as "4WD" because both denote four-wheel drive.

    Parameters
    ----------
    model : str
        Raw model name, e.g. "F-150 FFV 4X4". Non-string values (such as a
        missing value) are accepted and mapped to 'Unknown'.

    Returns
    -------
    str
        The matched abbreviation, or 'Unknown' when no token matches.
    """
    # Missing cells arrive as float NaN / None, which have no .split().
    if not isinstance(model, str):
        return 'Unknown'

    models_available = {"4WD", "4X4", "AWD", "FFV", "SWB", "LWB", "EWB"}
    # split() with no argument also copes with tabs and repeated spaces.
    for opt in model.split():
        if opt in models_available:
            return '4WD' if opt in ('4WD', '4X4') else opt
    return 'Unknown'

# Replace every raw model name with the abbreviation check_model derives
# from it, then show how often each resulting value occurs.
emissions['Model'] = emissions['Model'].apply(check_model)
emissions['Model'].value_counts()

In [None]:
nums = "0123456789"  # not referenced in this cell; kept in case other cells use it

# Split each transmission code into its numeric part (stored as Gear_Amount)
# and its letter part (kept as Transmission).
gear_digits = emissions['Transmission'].str.extract(r'([0-9]+)', expand=False)
# Codes without digits produce missing values, which become 0 gears.
# fillna covers every missing-value marker, whereas an `is not np.nan`
# identity test only recognises the np.nan singleton.
emissions['Gear_Amount'] = pd.to_numeric(gear_digits).fillna(0).astype(int)
emissions['Transmission'] = emissions['Transmission'].str.extract(r'([A-Z]+)', expand=False)
emissions.head()

In [None]:
# Remove the 'Vehicle Class' and 'Make' columns in place with a single call.
emissions.drop(columns=['Vehicle Class', 'Make'], inplace=True)


In [None]:
# Pie chart showing the share of rows for each value of 'Transmission'.
transmission_count = emissions['Transmission']
counts = transmission_count.value_counts()
# Slices are drawn in sorted label order.
labels = sorted(counts.index)
sizes = counts.loc[labels].tolist()

fig1, ax1 = plt.subplots()
ax1.pie(x=sizes, labels=labels, autopct='%1.1f%%')
# Equal axis scaling so the pie is drawn as a circle.
ax1.axis('equal')
plt.show()

In [None]:
# Frequency of each transmission type. value_counts() is required here:
# plotting the raw column would chart every row's label against its row
# index instead of the number of occurrences.
transmission_count = emissions['Transmission'].value_counts()
sns.set(style="darkgrid")
# Categories on the x axis, occurrence counts on the y axis, matching the labels below.
sns.barplot(x=transmission_count.index, y=transmission_count.values)
# Title and axis labels describe the transmission data actually plotted.
plt.title('Distribuição de Frequência do Tipo de Transmissão')
plt.ylabel('Número de ocorrências', fontsize=12)
plt.xlabel('Tipo de Transmissão', fontsize=12)
plt.show()

### a) Binary Encoding

In [None]:
%pip install category_encoders

#### Encode Transmission

In [None]:
import category_encoders as ce

# Binary-encode the 'Transmission' column; the encoded frame is bound to
# df_binary. Later cells refer to the result as Transmission_0..Transmission_2.
encoder_transmission = ce.BinaryEncoder(cols=['Transmission'])
df_binary = encoder_transmission.fit_transform(emissions)

df_binary.head()

#### Encode Fuel Type

In [None]:
# Binary-encode 'Fuel Type' on top of the already-encoded frame, so df_binary
# accumulates both encodings. Later cells use 'Fuel Type_0'..'Fuel Type_2'.
encoder_fuel = ce.BinaryEncoder(cols=['Fuel Type'])
df_binary = encoder_fuel.fit_transform(df_binary)

df_binary.head()

#### Encode Model

In [None]:
# Binary-encode 'Model' (later referenced as Model_0..Model_2), then rebind
# `emissions` to the fully encoded frame so the following cells work on it.
encoder_model = ce.BinaryEncoder(cols=['Model'])
df_binary = encoder_model.fit_transform(df_binary)
emissions = df_binary
emissions.head()


In [None]:
# Correlation heatmap of the encoded frame. numeric_only=True matches the
# earlier correlation plot and stops a stray non-numeric column from raising.
sns.heatmap(emissions.corr(numeric_only=True))

In [None]:
# Remove the 'Cylinders' column in place and preview the result.
emissions.drop(columns='Cylinders', inplace=True)
emissions.head()

In [None]:
# Correlation heatmap after dropping 'Cylinders'. numeric_only=True matches
# the earlier correlation plot and stops a stray non-numeric column from raising.
sns.heatmap(emissions.corr(numeric_only=True))

# Model Training and Testing

In [None]:
# Show the column labels available for feature selection.
emissions.columns

### X and y arrays

In [None]:
# Predictor columns: the binary-encoded categoricals plus the numeric features.
feature_columns = [
    'Model_0', 'Model_1', 'Model_2',
    'Engine Size(L)',
    'Transmission_0', 'Transmission_1', 'Transmission_2',
    'Fuel Type_0', 'Fuel Type_1', 'Fuel Type_2',
    'Fuel Consumption City (L/100 km)',
    'Fuel Consumption Comb (mpg)',
    'Gear_Amount',
]
# Column the models are trained to predict.
target_column = 'CO2 Emissions(g/km)'

X = emissions[feature_columns]
y = emissions[target_column]

## Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 40% of the rows for testing; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=2023)

In [None]:
# Distribution of the target in the training split.
sns.histplot(y_train)

A partição não afetou significativamente a distribuição dos dados, que continua enviesada para a esquerda.

In [None]:
# Distribution of the target in the test split, for comparison with the training split.
sns.histplot(y_test)

## Creating and Training the Model

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Ordinary least-squares linear regression fitted on the training split.
lm = LinearRegression()
lm.fit(X_train,y_train)

### Model Evaluation

In [None]:
# Intercept term of the fitted linear model.
print(lm.intercept_)

In [None]:
# One row per feature holding the coefficient the linear model assigned to it.
coeff_df = pd.DataFrame(
    data=lm.coef_,
    index=X.columns,
    columns=['Coefficient'],
)
coeff_df

### Predictions from our Model

In [None]:
# Linear-model predictions for the held-out test features.
predictions = lm.predict(X_test)

In [None]:
# Actual values (x axis) against predicted values (y axis).
plt.scatter(y_test,predictions)

In [None]:
# Histogram of the residuals (actual minus predicted); the trailing semicolon
# suppresses the notebook's text output for the returned object.
sns.histplot((y_test-predictions), bins=50);

### Regression Evaluation Metrics

In [None]:
from sklearn import metrics

In [None]:
# Error metrics for the linear model on the test split. The MSE is computed
# once and reused for the RMSE.
linear_mse = metrics.mean_squared_error(y_test, predictions)
print('MAE:', metrics.mean_absolute_error(y_test, predictions))
print('MSE:', linear_mse)
print('RMSE:', np.sqrt(linear_mse))

In [None]:
from sklearn.metrics import r2_score
# Coefficient of determination of the linear model on the test split.
r_squared = r2_score(y_test, predictions)
print('R2:', r_squared)

## Random Forest

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [None]:
# Re-split for the random forest with a 30% test share. This rebinds the
# X_train/X_test/y_train/y_test names used by the linear model above.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2023)

In [None]:
# Random forest of 100 trees with a fixed seed, fitted on the training split.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [None]:
# Random-forest predictions for the held-out test features.
y_pred = model.predict(X_test)

In [None]:
# Error metrics for the random forest on the test split.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r_squared = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)

# Report in the fixed order MSE, MAE, RMSE, R^2.
for metric_label, metric_value in (
    ("MSE", mse),
    ("MAE", mae),
    ("RMSE", rmse),
    ("R^2", r_squared),
):
    print(f"{metric_label}: {metric_value}")

# GridSearch

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, ConfusionMatrixDisplay

In [None]:
# Hyper-parameter values to try; the grid search evaluates every combination.
param_grid = dict(
    n_estimators=[10, 50],
    max_features=['sqrt', 'log2'],
    max_depth=[10, 20, 30],
    criterion=['poisson', 'squared_error', 'friedman_mse'],
    ccp_alpha=[0.0, 0.01, 0.02],
)

In [None]:
# Exhaustive search over param_grid using 5-fold cross-validation, starting
# from the random-forest estimator defined above; verbose=3 prints progress.
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, verbose=3)

In [None]:
# Run the search on the training split only; the test split stays unseen.
grid_search.fit(X_train, y_train)

In [None]:
# Parameter combination that scored best during cross-validation.
grid_search.best_params_

In [None]:
# Estimator refitted with the best parameter combination found by the search.
best_model = grid_search.best_estimator_
# Predict on the held-out split only, so the predictions line up one-to-one
# with y_test. Predicting on the full X would include training rows and
# produce an array of a different length than y_test.
y_prediction = best_model.predict(X_test)

In [None]:
# classification_report expects discrete class labels, but the tuned forest
# is a regressor with continuous output, so regression metrics are reported.
# Predictions are computed here on X_test so they align with y_test.
tuned_pred = best_model.predict(X_test)
tuned_mse = mean_squared_error(y_test, tuned_pred)
print(f"MSE: {tuned_mse}")
print(f"MAE: {mean_absolute_error(y_test, tuned_pred)}")
print(f"RMSE: {np.sqrt(tuned_mse)}")
print(f"R^2: {r2_score(y_test, tuned_pred)}")

In [None]:
# A confusion matrix is only defined for classification. For a regressor,
# plot actual against predicted values on the test split instead.
# Predictions are computed here on X_test so they align with y_test.
tuned_pred = best_model.predict(X_test)
plt.scatter(y_test, tuned_pred)
plt.xlabel('Actual CO2 Emissions(g/km)')
plt.ylabel('Predicted CO2 Emissions(g/km)')
plt.show()