In [None]:
# import library 
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# read the 2008 and 2018 fuel economy CSV files into one dataframe per year
df_08, df_18 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/green-vehicle-guide-datafile/all_alpha_08.csv',
        '../input/green-vehicle-guide-datafile/all_alpha_18.csv',
    )
)

In [None]:
# view 2008 dataset
df_08.head()

In [None]:
# view 2018 dataset
df_18.head()

In [None]:
df_08.shape

In [None]:
df_18.shape

In [None]:
df_08.columns.values

In [None]:
df_18.columns.values

# Drop Extraneous Columns

In [None]:
# remove the 2008 columns that are not carried through the rest of the analysis
df_08 = df_08.drop(columns=['Stnd', 'Underhood ID', 'FE Calc Appr', 'Unadj Cmb MPG'])

# show the first row to confirm the columns are gone
df_08.head(n=1)

In [None]:
# remove the 2018 columns that are not carried through the rest of the analysis
df_18 = df_18.drop(columns=['Stnd', 'Stnd Description', 'Underhood ID', 'Comb CO2'])

# show the first row to confirm the columns are gone
df_18.head(n=1)

# Rename Columns

In [None]:
# the 2008 file labels this column 'Sales Area'; relabel it 'Cert Region'
df_08 = df_08.rename(columns={'Sales Area': 'Cert Region'})

# show the first row to confirm the new label
df_08.head(n=1)

In [None]:
# 2008 labels: trim surrounding whitespace, lowercase, spaces -> underscores
df_08.columns = [label.strip().lower().replace(" ", "_") for label in df_08.columns]

# show the first row to confirm the new labels
df_08.head(n=1)

In [None]:
# 2018 labels: trim surrounding whitespace, lowercase, spaces -> underscores
df_18.columns = [label.strip().lower().replace(" ", "_") for label in df_18.columns]

# show the first row to confirm the new labels
df_18.head(n=1)

In [None]:
# confirm column labels for 2008 and 2018 datasets are identical
df_08.columns == df_18.columns

In [None]:
# make sure they're all identical like this
(df_08.columns == df_18.columns).all()

# Filter by Certification Region

In [None]:

# filter both datasets down to the rows certified to California standards
# NOTE(review): assumes California is coded as "CA" in cert_region - confirm
# against the unique() output below before relying on it
# .copy() gives each frame its own data, so the in-place cleaning steps that
# follow do not operate on a slice of the unfiltered frame
df_08 = df_08.query('cert_region == "CA"').copy()
df_18 = df_18.query('cert_region == "CA"').copy()

# confirm only certification region is California
df_08['cert_region'].unique()


In [None]:
# confirm only certification region is California
df_08['cert_region'].unique()

In [None]:
# confirm only certification region is California
df_18['cert_region'].unique()

In [None]:
# remove the certification region column from both datasets
df_08 = df_08.drop(columns='cert_region')
df_18 = df_18.drop(columns='cert_region')

In [None]:
df_08.shape

In [None]:
df_18.shape

# Drop Rows with Missing Values

In [None]:
# view missing value count for each feature in 2008
df_08.isnull().sum()

In [None]:
# view missing value count for each feature in 2018
df_18.isnull().sum()

In [None]:
# discard every row that has at least one missing value, in both datasets
df_08 = df_08.dropna(axis=0, how='any')
df_18 = df_18.dropna(axis=0, how='any')

In [None]:
# checks if any of columns in 2008 have null values - should print False
df_08.isnull().sum().any()

In [None]:
# checks if any of columns in 2018 have null values - should print False
df_18.isnull().sum().any()

# Dedupe Data

In [None]:
# number of fully duplicated rows: 2008 on the first line, 2018 on the second
print(df_08.duplicated().sum(), df_18.duplicated().sum(), sep="\n")

In [None]:
# keep the first occurrence of each duplicated row in both datasets
df_08 = df_08.drop_duplicates(keep='first')
df_18 = df_18.drop_duplicates(keep='first')

In [None]:
# duplicate counts after the dedupe (2008, then 2018) - both should be 0
print(df_08.duplicated().sum(), df_18.duplicated().sum(), sep="\n")

# Fixing cyl Data Type

In [None]:
# check value counts for the 2008 cyl column
df_08['cyl'].value_counts()

In [None]:
# Extract int from strings in the 2008 cyl column
# - raw string: '\d' inside a normal string literal is an invalid escape sequence
# - expand=False makes str.extract return a Series rather than a one-column
#   DataFrame, so a Series is what gets cast and assigned back to the column
df_08['cyl'] = df_08['cyl'].str.extract(r'(\d+)', expand=False).astype(int)

In [None]:
# Check value counts for 2008 cyl column again to confirm the change
df_08['cyl'].value_counts()

In [None]:
# cast the 2018 cyl column to int
df_18 = df_18.astype({'cyl': int})

# Fixing `air_pollution_score` Data Type
# Figuring out the issue
Looks like this isn't going to be as simple as converting the datatype. Calling `astype(float)` directly on the 2008 `air_pollution_score` column raises an error, because the air pollution score value in one of the rows is "6/4" - let's check it out.

In [None]:
df_08[df_08.air_pollution_score == '6/4']

# It's not just the air pollution score!
The mpg columns and greenhouse gas scores also seem to have the same problem - maybe that's why these were all saved as strings! According to [this link](http://www.fueleconomy.gov/feg/findacarhelp.shtml#airPollutionScore), which I found from the PDF documentation:

    "If a vehicle can operate on more than one type of fuel, an estimate is provided for each fuel type."
    
Ohh.. so all vehicles with more than one fuel type, or hybrids, like the one above (it uses ethanol AND gas) will have a string that holds two values - one for each. This is a little tricky, so I'm going to show you how to do it with the 2008 dataset, and then you'll try it with the 2018 dataset.

In [None]:
# 2008 rows whose fuel value contains a "/", i.e. lists more than one fuel
hb_08 = df_08.loc[df_08.fuel.str.contains('/')]
hb_08

In [None]:
# 2018 rows whose fuel value contains a "/", i.e. lists more than one fuel
hb_18 = df_18.loc[df_18.fuel.str.contains('/')]
hb_18

In [None]:
# two independent copies of the 2008 hybrids:
# df1 will keep the data for the first fuel type, df2 for the second
df1, df2 = hb_08.copy(), hb_08.copy()

# both copies start out identical - this is what each looks like
df1

In [None]:
# the 2008 columns that hold two "/"-separated values per hybrid row
split_columns = ['fuel', 'air_pollution_score', 'city_mpg', 'hwy_mpg', 'cmb_mpg', 'greenhouse_gas_score']

# df1 keeps the part before the "/", df2 keeps the part after it
for column in split_columns:
    df1[column] = [value.split("/")[0] for value in df1[column]]
    df2[column] = [value.split("/")[1] for value in df2[column]]

In [None]:
# this dataframe holds info for the FIRST fuel type of the hybrid
# aka the values before the "/"s
df1

In [None]:
# this dataframe holds info for the SECOND fuel type of the hybrid
# aka the values after the "/"s
df2

In [None]:
# stack the first-fuel and second-fuel rows into one frame of replacement rows
# (pd.concat is used because DataFrame.append was removed in pandas 2.0)
new_rows = pd.concat([df1, df2])

# now we have separate rows for each fuel type of each vehicle!
new_rows

In [None]:
# drop the original hybrid rows
df_08.drop(hb_08.index, inplace=True)

# add in our newly separated rows, renumbering the index from 0
# (pd.concat is used because DataFrame.append was removed in pandas 2.0)
df_08 = pd.concat([df_08, new_rows], ignore_index=True)

In [None]:
# check that all the original hybrid rows with "/"s are gone
df_08[df_08['fuel'].str.contains('/')]

In [None]:
df_08.shape

# Repeat this process for the 2018 dataset

In [None]:
# two independent copies of the 2018 hybrids, hb_18:
# df1 will keep the data for the first fuel type, df2 for the second
df1, df2 = hb_18.copy(), hb_18.copy()

### Split values for `fuel`, `city_mpg`, `hwy_mpg`, `cmb_mpg`
You don't need to split for `air_pollution_score` or `greenhouse_gas_score` here because these columns are already ints in the 2018 dataset.

In [None]:
# the 2018 columns that hold two "/"-separated values per hybrid row
split_columns = ['fuel', 'city_mpg', 'hwy_mpg', 'cmb_mpg']

# df1 keeps the part before the "/", df2 keeps the part after it
for column in split_columns:
    df1[column] = [value.split("/")[0] for value in df1[column]]
    df2[column] = [value.split("/")[1] for value in df2[column]]

In [None]:
# stack the first-fuel and second-fuel rows into one frame of replacement rows
# (pd.concat is used because DataFrame.append was removed in pandas 2.0)
new_rows = pd.concat([df1, df2])

# drop each hybrid row from the original 2018 dataframe
# do this by using Pandas drop function with hb_18's index
df_18.drop(hb_18.index, inplace=True)

# add the separated rows back to df_18, renumbering the index from 0
df_18 = pd.concat([df_18, new_rows], ignore_index=True)

In [None]:
# check that they're gone
df_18[df_18['fuel'].str.contains('/')]

In [None]:
df_18.shape

### Now we can comfortably continue the changes needed for `air_pollution_score`! Here they are again:
- 2008: convert string to float
- 2018: convert int to float

In [None]:
# cast the 2008 air pollution score column to float
df_08['air_pollution_score'] = df_08['air_pollution_score'].astype(float)

In [None]:
# cast the 2018 air pollution score column to float
df_18['air_pollution_score'] = df_18['air_pollution_score'].astype(float)

## Fix `city_mpg`, `hwy_mpg`, `cmb_mpg` datatypes
    2008 and 2018: convert string to float

In [None]:
# cast the three mpg columns to float in both datasets (2018 first, then 2008)
mpg_columns = ['city_mpg', 'hwy_mpg', 'cmb_mpg']
df_18 = df_18.astype(dict.fromkeys(mpg_columns, float))
df_08 = df_08.astype(dict.fromkeys(mpg_columns, float))

## Fix `greenhouse_gas_score` datatype
    2008: convert from float to int

In [None]:
# cast the 2008 greenhouse gas score column to int
df_08 = df_08.astype({'greenhouse_gas_score': int})

## All the datatypes are now fixed! Take one last check to confirm all the changes.

In [None]:
df_08.dtypes

In [None]:
df_18.dtypes

In [None]:
df_08.dtypes == df_18.dtypes

# Drawing Conclusions

### Q1: Are more unique models using alternative sources of fuel? By how much?

Let's first look at what the sources of fuel are and which ones are alternative sources.

In [None]:
df_08.fuel.value_counts()

In [None]:
df_18.fuel.value_counts()

Looks like the alternative sources of fuel available in 2008 are CNG and ethanol, and those in 2018 are ethanol and electricity. (You can use Google if you aren't sure which ones are alternative sources of fuel!)

In [None]:
# count the distinct 2008 models whose fuel is CNG or ethanol
alt_08 = df_08.loc[df_08['fuel'].isin(["CNG", "ethanol"]), 'model'].nunique()
alt_08

In [None]:
# count the distinct 2018 models whose fuel is Ethanol or Electricity
alt_18 = df_18.loc[df_18['fuel'].isin(["Ethanol", "Electricity"]), 'model'].nunique()
alt_18

In [None]:
# bar chart comparing the alternative-fuel model counts of the two years
axes = plt.gca()
axes.bar(["2008", "2018"], [alt_08, alt_18])
axes.set_title("Number of Unique Models Using Alternative Fuels")
axes.set_xlabel("Year")
axes.set_ylabel("Number of Unique Models");

Since 2008, the number of unique models using alternative sources of fuel increased by 24. We can also look at proportions.

In [None]:
# number of distinct models in each year's dataset
total_08, total_18 = df_08['model'].nunique(), df_18['model'].nunique()
total_08, total_18

In [None]:
# share of each year's distinct models that use an alternative fuel
prop_08, prop_18 = alt_08 / total_08, alt_18 / total_18
prop_08, prop_18

In [None]:
# bar chart comparing the alternative-fuel model proportions of the two years
axes = plt.gca()
axes.bar(["2008", "2018"], [prop_08, prop_18])
axes.set_title("Proportion of Unique Models Using Alternative Fuels")
axes.set_xlabel("Year")
axes.set_ylabel("Proportion of Unique Models");

### Q2: How much have vehicle classes improved in fuel economy?  

Let's look at the average fuel economy for each vehicle class for both years.

In [None]:
# mean combined mpg per vehicle class, 2008
veh_08 = df_08.groupby('veh_class')['cmb_mpg'].mean()
veh_08

In [None]:
# mean combined mpg per vehicle class, 2018
veh_18 = df_18.groupby('veh_class')['cmb_mpg'].mean()
veh_18

In [None]:
# 2018 mean minus 2008 mean for each vehicle class
# (classes present in only one of the two years come out as NaN)
inc = veh_18.sub(veh_08)
inc

In [None]:
# keep only the classes with a value in both years, then chart the change
inc = inc.dropna()
fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(inc.index, inc)
ax.set_title('Improvements in Fuel Economy from 2008 to 2018 by Vehicle Class')
ax.set_xlabel('Vehicle Class')
ax.set_ylabel('Increase in Average Combined MPG');

### Q3: What are the characteristics of SmartWay vehicles? Have they changed over time?

We can analyze this by filtering each dataframe by SmartWay classification and exploring these datasets.

In [None]:
# smartway labels for 2008
df_08.smartway.unique()

In [None]:
# 2008 rows whose smartway label is "yes"
smart_08 = df_08[df_08['smartway'] == "yes"]

In [None]:
# explore smartway vehicles in 2008
smart_08.describe()

Use what you've learned so far to further explore this dataset on 2008 smartway vehicles.

In [None]:
# smartway labels for 2018
df_18.smartway.unique()

In [None]:
# 2018 rows whose smartway label is "Yes" or "Elite"
smart_18 = df_18[df_18['smartway'].isin(["Yes", "Elite"])]

In [None]:
smart_18.describe()

Use what you've learned so far to further explore this dataset on 2018 smartway vehicles.

### Q4: What features are associated with better fuel economy?

You can explore trends between cmb_mpg and the other features in this dataset, or filter this dataset like in the previous question and explore the properties of that dataset. For example, you can select all vehicles with above-average fuel economy (a `cmb_mpg` greater than the mean for that year) like this.

In [None]:
# 2008 vehicles whose combined mpg is above the 2008 mean
top_08 = df_08[df_08['cmb_mpg'] > df_08['cmb_mpg'].mean()]
top_08.describe()

In [None]:
# 2018 vehicles whose combined mpg is above the 2018 mean
top_18 = df_18[df_18['cmb_mpg'] > df_18['cmb_mpg'].mean()]
top_18.describe()

# Merging Datasets

# Create combined dataset

In [None]:
# rename 2008 columns: keep the first 10 characters of each label and add a
# "_2008" suffix so they can be told apart from the 2018 columns after merging
# labels that already end in "_2008" are left unchanged, so re-running this
# cell does not suffix them a second time (which would break the merge key)
df_08.rename(
    columns=lambda x: x if x.endswith("_2008") else x[:10] + "_2008",
    inplace=True,
)

In [None]:
# view to check names
df_08.head()

In [None]:
# inner-join the two years on model name (model_2008 on the left, model on the right)
df = pd.merge(df_08, df_18, how='inner', left_on='model_2008', right_on='model')

In [None]:
# view to check merge
df.head()

# Results with Merged Dataset
#### Q5: For all of the models that were produced in 2008 that are still being produced now, how much has the mpg improved and which vehicle improved the most?


In [None]:
# mean combined mpg in 2008 and in 2018 for each model
# the two numeric columns are selected before aggregating: on pandas >= 2.0,
# mean() over the whole frame raises a TypeError on the non-numeric columns
model_mpg = df.groupby('model')[['cmb_mpg_2008', 'cmb_mpg']].mean()

### 1. Create a new dataframe, `model_mpg`, that contain the mean combined mpg values in 2008 and 2018 for each unique model

To do this, group by `model` and find the mean `cmb_mpg_2008` and mean `cmb_mpg` for each.

In [None]:
model_mpg.head()

### 2. Create a new column, `mpg_change`, with the change in mpg
Subtract the mean mpg in 2008 from that in 2018 to get the change in mpg

In [None]:
# change in mean combined mpg per model: 2018 value minus 2008 value
model_mpg['mpg_change'] = model_mpg['cmb_mpg'].sub(model_mpg['cmb_mpg_2008'])

In [None]:
model_mpg.head()

### 3. Find the vehicle that improved the most
Find the max mpg change, and then use query or indexing to see what model it is!

In [None]:
# largest mpg change across all models
max_change = model_mpg.mpg_change.max()
max_change

In [None]:
# row(s) whose mpg change equals the maximum
model_mpg.query('mpg_change == @max_change')

Pandas also has a useful [`idxmax`](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.idxmax.html) function you can use to find the index of the row containing a column's maximum value!

In [None]:
# index label (the model name) of the row with the largest mpg change
idx = model_mpg['mpg_change'].idxmax()
idx

In [None]:
model_mpg.loc[idx]

In [None]:
df.head()

In [None]:
# NOTE(review): this import sits in the last cell rather than with the other
# imports at the top of the notebook - consider moving it there
from pandas_profiling import ProfileReport 

# build a profiling report for the merged dataframe
# NOTE(review): the html/style option presumably widens the rendered report -
# confirm against the installed pandas_profiling version
profile = ProfileReport( df, title='Pandas profiling report ' , html={'style':{'full_width':True}})

# render the report inline in the notebook
profile.to_notebook_iframe()