In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)


In [None]:
# Load the raw emissions table from the Kaggle input directory.
raw_csv_path = '/kaggle/input/co2-emission-by-vehicles/CO2 Emissions_Canada.csv'
df = pd.read_csv(raw_csv_path)

# Print (rows, columns), then preview the first rows as the cell output.
print(df.shape)
df.head()

# Data Analysis

## Missing Values

In [None]:
# Number of missing entries in each column.
df.isna().sum()

## Numerical Features

In [None]:
# A column counts as numerical when its dtype is anything other than object ('O').
num_features = [name for name, dtype in df.dtypes.items() if dtype != 'O']
df[num_features].head()

### Discrete features

In [None]:
# Numerical columns with fewer than 25 distinct values (a missing value counts
# as one value) are treated as discrete.
discrete_features = [
    name for name in num_features
    if df[name].nunique(dropna=False) < 25
]
print(discrete_features)

In [None]:
# For every discrete feature: print each level's share of all rows, print the
# raw level counts, and draw those counts as a bar chart.
n_rows = len(df)
for feature in discrete_features:
    data = df.copy()
    level_counts = data[feature].value_counts()

    # Rows per level (counting non-null target values) divided by the total row count.
    print(data.groupby(feature)['CO2 Emissions(g/km)'].count() / n_rows)

    # Number of rows per level.
    print(level_counts)

    level_counts.plot.bar()
    plt.show()

In [None]:
# For every discrete feature: print each level's share of the summed target,
# then plot the median target value per level.
for feature in discrete_features:
    data = df.copy()
    emission_by_level = data.groupby(feature)['CO2 Emissions(g/km)']

    # Fraction of the total emission contributed by each level of the feature.
    print(emission_by_level.sum() / data['CO2 Emissions(g/km)'].sum())

    emission_by_level.median().plot.bar()
    plt.xlabel(feature)
    plt.ylabel('Emission')
    plt.show()

##### Observation:
1. The majority of cars have 4 cylinders (43%), followed by 6 cylinders (33%) and 8 cylinders (18%)
2. Together, these three groups account for 94.7% of total CO2 emissions
3. Emissions increase as the number of cylinders increases

### Continuous Features

In [None]:
# Numerical features that were not classified as discrete are continuous.
discrete_lookup = set(discrete_features)
continuous_features = [name for name in num_features if name not in discrete_lookup]
print(continuous_features)

#### Distribution

In [None]:
# Histogram (25 bins) of every continuous feature to inspect its distribution.
for feature in continuous_features:
    data = df.copy()

    axis = data[feature].hist(bins=25)
    axis.set_xlabel(feature)
    axis.set_ylabel('count')
    plt.show()

The features are slightly right-skewed, so a log transformation could be applied to reduce the skew.

In [None]:
## Log transformation and relationship with target variable

# Scatter log(feature) against log(target) for every continuous feature;
# the target column itself is skipped.
for feature in continuous_features:
    if feature == 'CO2 Emissions(g/km)':
        continue

    data = df.copy()
    data[feature] = np.log(data[feature])
    log_emission = np.log(data['CO2 Emissions(g/km)'])

    plt.scatter(data[feature], log_emission)
    plt.xlabel(feature)
    plt.ylabel('emission')
    plt.show()

##### Observation:
1. Emission increases with increase in engine size
2. Emission increases with increase in Fuel Consumption (City, Hwy, Comb (L/100km))
3. Emission decreases with increase in Fuel Consumption Comb (mpg)

### Continuous Outliers

In [None]:
# Box plot of every continuous feature after a log transform, to expose outliers.
for feature in continuous_features:
    # assign() returns a new frame, so `df` itself is never modified.
    data = df.assign(**{feature: np.log(df[feature])})

    data.boxplot(column=feature)
    plt.ylabel('values')
    plt.show()

## Categorical Features

In [None]:
# Every column not classified as numerical is treated as categorical.
numeric_lookup = set(num_features)
cat_features = [name for name in df.columns if name not in numeric_lookup]
df[cat_features].head()

In [None]:
# Report how many distinct values (a missing value counts as one) each
# categorical feature has.
for feature in cat_features:
    n_categories = df[feature].nunique(dropna=False)
    print('{}: {} categories'.format(feature, n_categories))

We can drop the 'Model' feature, since 'Make' already provides an overview of the vehicle's specifications.

In [None]:
# Exclude 'Model' from the categorical features. Rebuilding the list (instead
# of calling list.remove) keeps this cell safe to re-run: a second execution
# is a no-op rather than a ValueError.
cat_features = [feature for feature in cat_features if feature != 'Model']

In [None]:
## relationship with target variable

# Bar chart of the median target value per category, for every categorical feature.
for feature in cat_features:
    data = df.copy()
    median_emission = data.groupby(feature)['CO2 Emissions(g/km)'].median()

    median_emission.plot.bar()
    plt.xlabel(feature)
    plt.ylabel('emission')
    plt.show()

##### Observations:
1. Bugatti has the highest emission
2. Vans (Cargo and Passenger) have the highest emission
3. AM5 transmissions have among the lowest emissions
4. Ranked by emission, the fuel types are E > Z > D > X > N

# Feature Engineering

In [None]:
# Work on an independent copy so the feature engineering leaves `df` untouched.
dataset = df.copy(deep=True)
dataset.head()

## Transformation of Continuous Values

In [None]:
# Log-transform every column listed in continuous_features in one vectorised
# step. The values are read from `df` (the frame `dataset` was copied from)
# instead of from `dataset` itself, so re-running this cell does not apply the
# logarithm a second time as long as `df` still holds the raw data.
dataset[continuous_features] = np.log(df[continuous_features])

## Handling rare Categorical Features

We cannot group the categories of 'Make' or 'Transmission' into a single "rare" label: these features have many categories, and the individual categories have a relationship with Emission. The other categorical features do not contain enough rare categories to need an explicit "rare" label.

### Converting Categorical features to Numerical features

Since each categorical feature has a strong relationship with Emission, we should use ordinal encoding, with the categories ordered by their emission.

We will not convert 'Model' as we are dropping the feature later

In [None]:
# Target-guided ordinal encoding: within each categorical feature, categories
# are ranked by the maximum target value observed for them (lowest max -> 0)
# and replaced by that rank.
for feature in cat_features:
    ordinal_labels = dataset.groupby(feature)['CO2 Emissions(g/km)'].max().sort_values().index
    ordinal_dict = {k: i for i, k in enumerate(ordinal_labels)}
    # Map the column of `dataset` itself. The previous version read from
    # `data`, a frame that only existed as a leftover of an earlier plotting
    # loop, so the cell depended on hidden kernel state.
    dataset[feature] = dataset[feature].map(ordinal_dict)

In [None]:
# Drop the un-encoded 'Model' column. Rebinding (no inplace=True) together
# with errors='ignore' makes the cell safe to re-run: if the column is already
# gone, nothing happens instead of a KeyError being raised.
dataset = dataset.drop(columns=['Model'], errors='ignore')
dataset.head()

In [None]:
# Persist the engineered frame to disk without writing the index column.
processed_path = 'processed_data.csv'
dataset.to_csv(processed_path, index=False)

# Feature Selection

In [None]:
# Reload the engineered data from disk. Note that this rebinds `df`, which
# until now referred to the raw table.
processed_csv = 'processed_data.csv'
df = pd.read_csv(processed_csv)
df.head()

In [None]:
# Separate predictors and target. The target is selected by name, matching
# the column dropped from X, so the split no longer depends on the target
# happening to be the last column of the file.
target_column = 'CO2 Emissions(g/km)'
X = df.drop(columns=target_column)
y = df[target_column]

## Dropping with correlation

In [None]:
# Pairwise correlation between the predictors, drawn as an annotated heatmap.
cor = X.corr()
sns.heatmap(data=cor, annot=True, cmap='CMRmap_r')
plt.show()

We can drop one column from each pair of features whose correlation exceeds 0.85.

In [None]:
# Remove the predictors chosen for removal after inspecting the correlation heatmap.
redundant_columns = [
    'Cylinders',
    'Fuel Consumption Hwy (L/100 km)',
    'Fuel Consumption Comb (L/100 km)',
]
X = X.drop(columns=redundant_columns)

## Feature Scaling

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
split = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split

In [None]:
# Fit the min-max scaler on the training split only; fit() returns the scaler itself.
scaler = MinMaxScaler().fit(X_train)
scaler

In [None]:
# Apply the scaler fitted on the training split to both splits.
X_train_scaled, X_test_scaled = (
    scaler.transform(part) for part in (X_train, X_test)
)

# Models

## Linear Regression

In [None]:
# Ordinary least squares on the scaled training split; fit() returns the estimator.
linear_regressor = LinearRegression().fit(X_train_scaled, y_train)
linear_regressor

In [None]:
# Score of the fitted linear model on the held-out test split
# (R^2 for scikit-learn regressors).
linear_regressor.score(X=X_test_scaled, y=y_test)

### Cross Validation

In [None]:
# 5-fold cross-validation on the scaled training split; report the mean fold score.
scores = cross_val_score(estimator=linear_regressor, X=X_train_scaled, y=y_train, cv=5)
print(scores.mean())

## Ridge Regression

In [None]:
# Ridge regression (alpha = 1) on the scaled training split.
ridge_regressor = Ridge(alpha=1).fit(X_train_scaled, y_train)
ridge_regressor

In [None]:
# Score of the fitted ridge model on the held-out test split.
ridge_regressor.score(X=X_test_scaled, y=y_test)

## Lasso Regression

In [None]:
# Lasso regression (alpha = 0.005) on the scaled training split.
lasso_regr = Lasso(alpha=0.005).fit(X_train_scaled, y_train)
lasso_regr

In [None]:
# Score of the fitted lasso model on the held-out test split.
lasso_regr.score(X=X_test_scaled, y=y_test)

In [None]:
# Display the coefficients learned by the fitted Lasso model.
lasso_regr.coef_

## Random Forest Regression

In [None]:
# Split again with the same arguments and seed (20% test, random_state 0);
# the random forest below is trained on these unscaled splits.
split = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split

In [None]:
# Random forest on the unscaled training split. random_state is fixed so the
# bootstrap samples and feature sub-sampling, and therefore the fitted model
# and its score, are the same on every run.
regressor = RandomForestRegressor(random_state=0)

regressor.fit(X_train, y_train)

In [None]:
# Score of the fitted random forest on the held-out test split.
regressor.score(X=X_test, y=y_test)