In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Data Analysis 

In [None]:
import pandas as pd
import numpy as np
import re
from matplotlib import pyplot as plt

In [None]:
# Load the raw fuel-economy table (column names come from the first CSV row)
# and preview its first five records.
df = pd.read_csv(filepath_or_buffer="../input/fuel-economy/database.csv", header=0)
df.head(n=5)

In [None]:
# List every column label of the raw frame.
df.columns

In [None]:
total = 0  # denominator read by percentage(); assigned below once the counts are known


def sum_(x):
    """Add ``x`` to the module-level ``total`` and return ``x`` unchanged.

    No longer called by this cell (the total is computed with ``.sum()``);
    the definition is kept so the name remains available.
    """
    global total
    total += x
    return x


def percentage(x):
    """Format ``x`` as a share of the module-level ``total``, e.g. ``"12.34%"``."""
    return ("%.2f" % (x / total * 100)) + "%"


percentages = []  # never read in this notebook; left as it was
unique_counts = df["Class"].value_counts()
# Sum the counts directly rather than accumulating a global through a
# side-effecting apply().
total = unique_counts.sum()
unique_counts = unique_counts.apply(percentage)

pd.DataFrame(unique_counts).head(5)

## We will also reduce the number of columns and focus mainly on the ones that we understand.

We should reduce the varying MPG columns into one. We can try to eliminate some columns by determining if we can remove one or two fuel type columns from the three given

In [None]:
# Show the distinct values of each fuel-type column so they can be compared.
fuel_type_lines = (
    "Fuel Type: %s\n\n\n" % (df["Fuel Type"].unique()),
    "Fuel Type 1: %s\n\n\n" % (df["Fuel Type 1"].unique()),
    "Fuel Type 2: %s" % (df["Fuel Type 2"].unique()),
)
print(*fuel_type_lines)

In [None]:
# Count the missing entries in each fuel-type column.
print("Fuel Type: ", df["Fuel Type"].isnull().sum())
print("Fuel Type 1: ", df["Fuel Type 1"].isnull().sum())
print("Fuel Type 2: ", df["Fuel Type 2"].isnull().sum())

In [None]:
# Show the column labels again before choosing the fuel-related subset.
df.columns

## Let's put our main focus onto the fuel data

In [None]:
# Identification / rating columns that are carried over unchanged.
identity_fields = [
    "Vehicle ID", "Year", "Make", "Model", "Class",
    "Fuel Type", "Fuel Type 1", "Fuel Type 2",
    "Fuel Economy Score", "GHG Score", "Alternative Fuel/Technology",
]
unique_columns = df[identity_fields]

# Contiguous label slice from the first city-MPG column to the highway utility factor.
semi_relevant_MPG_columns = df.loc[:, "City MPG (FT1)":"Highway Utility Factor"]

# Remaining columns are picked by a substring match on their label.
is_cost_or_emission = (
    df.columns.str.contains("Annual Fuel Cost")
    | df.columns.str.contains("Tailpipe CO2")
    | df.columns.str.contains("Annual Consumption in Barrels")
    | df.columns.str.contains("Hours to Charge")
)
col = df.columns[is_cost_or_emission]

fuelDf = pd.concat([unique_columns, semi_relevant_MPG_columns, df[col.values]], axis=1)

## Let's Check the Parameters

In [None]:
def find_inconsistencies(df):
    """Draw one 40-bin histogram per non-object column of ``df``.

    A column is skipped when its label is duplicated (``df[column]`` is then
    a DataFrame rather than a Series) or when its dtype is ``object``.
    Panels are laid out 11 per row on one 48x48-inch figure with at least
    3 rows; further rows are added when more than 33 columns are plotted.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are plotted.

    Returns
    -------
    None
        The figure is created through ``plt.subplots`` as a side effect.
    """
    panels_per_row = 11

    plottable = [
        column
        for column in df.columns
        # Compare against the builtin `object`; the `np.object` alias is not
        # available in current NumPy releases.
        if isinstance(df[column], pd.Series) and df[column].dtype != object
    ]

    # Ceiling division, so the grid always has room for every plotted column.
    n_rows = max(3, -(-len(plottable) // panels_per_row))
    fig, ax = plt.subplots(n_rows, panels_per_row, figsize=(48, 48), squeeze=False)

    for position, column in enumerate(plottable):
        x, y = divmod(position, panels_per_row)
        # Read from the frame that was passed in (not a notebook global) and
        # drop NaN so the bin edges are well defined.
        values = df[column].dropna()
        if not values.empty:
            ax[x, y].hist(values, bins=np.linspace(values.min(), values.max(), 40))
        ax[x, y].set_title(column)
        ax[x, y].set_xlabel(column)
        ax[x, y].set_ylabel("counts")

In [None]:
# timeDf is a second name for fuelDf (plain assignment, no copy is made).
timeDf = fuelDf
# Plot a histogram for each non-object column to eyeball gaps and odd values.
find_inconsistencies(timeDf)

## Here are the conclusions that I have made from the plot above:

- Only Annual Fuel Cost, Annual Consumption in Barrels, Tailpipe CO2 in Grams/Miles, CITY MPG, Highway MPG have consistent data for all vehicles from 1984 - 2017 
- We can calculate the amount of CO2 released for most cars from the statistics above (some values are already provided), assuming 55% city driving and 45% highway driving.
- Unfortunately, the dataset does not provide these values for every car, so we will have to compute the missing ones ourselves.
- Normal distribution is evident for Fuel Economy and GHG Scores



## More questions to ask
- Is Vehicle ID not unique?
- Which pieces of data shall we include in the frontend application? I think the amount of CO2, some basic stats, and MPG.

## Now let's take a look at all the columns necessary to compute CO2 for non-electric Cars. For electric cars, we need to additionally make sure electrical consumption for highway and normal road exist

## Let's check the number of empty fields we have in columns that are important for both electric and non-electric cars

In [None]:
# Label substrings that mark the columns needed for the CO2 / electricity checks.
vital_keywords = (
    "Tailpipe CO2",
    "City MPG",
    "Highway MPG",
    "Annual Fuel Cost",
    "Annual Consumption in Barrels",
    "Fuel Type 1",
    "Fuel Type 2",
    "Electricity Consumption",
)

# OR the per-keyword matches together into one boolean mask over the columns.
indicator = np.zeros(len(fuelDf.columns), dtype=bool)
for keyword in vital_keywords:
    indicator = indicator | fuelDf.columns.str.contains(keyword)

vital_statistics = fuelDf[fuelDf.columns[indicator]]

In [None]:
## Collect the rows whose vital statistics for one fuel slot are zero or negative
def check_vital(df, f_type : str):
    """Return the rows of ``df`` with a non-positive vital statistic for ``f_type``.

    Four columns are inspected, in this order: highway MPG, annual fuel cost,
    annual consumption in barrels, and city MPG, each suffixed with
    ``(<f_type>)``. The matches for each column are stacked one after another,
    so a row that fails several checks appears once per failed check.
    """
    templates = (
        "Highway MPG (%s)",
        "Annual Fuel Cost (%s)",
        "Annual Consumption in Barrels (%s)",
        "City MPG (%s)",
    )
    flagged = [df[df[template % f_type] <= 0] for template in templates]
    return pd.concat(flagged)
    
# Show the FT1 rows flagged by check_vital (any inspected column <= 0).
check_vital(vital_statistics, "FT1")

In [None]:
## 1. Make sure that non-electric cars have the Tailpipe CO2 in Grams/Mile
## 2. Make sure electric cars have the highway and city electric consumptions.

def check_diff_types_vital(df, f_type : str, f_type_name : str):
    """Print the share of vehicles whose required figures are zero or negative.

    Three percentages are printed:

    1. non-electric vehicles with ``Tailpipe CO2 in Grams/Mile (<name>) <= 0``;
    2. electric vehicles with a city OR highway electricity consumption ``<= 0``;
    3. non-electric vehicles with ``Tailpipe CO2 (<name>) <= 0``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the fuel-type column and the columns listed above.
    f_type : str
        Name of the fuel-type column, e.g. ``"Fuel Type 1"``. A vehicle counts
        as electric when this value contains ``"Electricity"``; rows where it
        is missing belong to neither group.
    f_type_name : str
        Suffix used in the CO2 column labels, e.g. ``"FT1"``.

    Returns
    -------
    None
    """
    def _share(flagged, population):
        # Percentage of `population` rows that are also `flagged`; 0.0 for an
        # empty population instead of a ZeroDivisionError.
        whole = population.sum()
        return (flagged & population).sum() / whole * 100 if whole else 0.0

    has_fuel = df[f_type].notna()
    is_electric = df[f_type].str.contains("Electricity", na=False)
    is_combustion = has_fuel & ~is_electric

    percent = _share(df["Tailpipe CO2 in Grams/Mile (%s)" % (f_type_name)] <= 0, is_combustion)
    print("%.2f%% of Tailpipe CO2 in Grams/Mile for %s unfilled" % (percent, f_type_name)) ## double % escape

    # Each comparison needs its own parentheses: `|` binds tighter than `<=`.
    no_consumption = (df["City Electricity Consumption"] <= 0) | (df["Highway Electricity Consumption"] <= 0)
    percent = _share(no_consumption, is_electric)
    print("%.2f%% of Electricity Consumptions for %s unfilled" % (percent, f_type_name))

    # This check reads the "Tailpipe CO2 (<name>)" column, so the message names that column.
    percent = _share(df["Tailpipe CO2 (%s)" % (f_type_name)] <= 0, is_combustion)
    print("%.2f%% of Tailpipe CO2 for %s unfilled" % (percent, f_type_name)) ## double % escape
    
    
# Print the unfilled-value percentages for the primary fuel (FT1 columns).
check_diff_types_vital(vital_statistics, "Fuel Type 1", "FT1")

In [None]:
# Which Fuel Type 1 values occur on rows reporting exactly zero CO2 grams/mile?
zero_gpm_ft1 = vital_statistics["Tailpipe CO2 in Grams/Mile (FT1)"] == 0
vital_statistics[zero_gpm_ft1]["Fuel Type 1"].value_counts()

# Every non-CO2-emission vehicle under Fuel Type 1 vehicles is Electric

In [None]:
# Repeat the vital-statistics check for FT2, using only rows with no missing values.
vital_statistics2 = vital_statistics.dropna(axis=0, how="any")
check_vital(vital_statistics2, "FT2")

We just found some fuel costs for FT2 that are 0 for all Electric cars. It shouldn't be too big of a problem going forward, but we'll keep this in mind

In [None]:
# Which Fuel Type 2 values occur on rows reporting exactly zero CO2 grams/mile?
zero_gpm_ft2 = vital_statistics2["Tailpipe CO2 in Grams/Mile (FT2)"] == 0
vital_statistics2[zero_gpm_ft2]["Fuel Type 2"].value_counts()

## Every non-emission-CO2 vehicle is electric

In [None]:
# Print the unfilled-value percentages for the secondary fuel (FT2 columns),
# using the frame from which rows with missing values were dropped.
check_diff_types_vital(vital_statistics2, "Fuel Type 2", "FT2")

## We can see that there's a large amount of Tailpipe CO2 values unfilled for both FT1 and FT2. Let's see if we can fill the missing information.

For electric cars, we use this equation:
```
Number of barrels consumed annually * 42 gallons/(# of barrel) * (0.55 * City Miles/Gallon * City Electric Consumption + 0.45 * Highway Miles/Gallon * Highway Electric Consumption)
```

For others, we use this equation:
```
Number of barrels consumed annually * 42 gallons/(# of barrel) * (0.55 * City Miles/Gallon + 0.45 * Highway Miles/Gallon) * CO2 Grams/Mile
```

We can safely assume that the energy consumption is measured in (Wh/mile)
https://insideevs.com/reviews/343702/electric-car-energy-consumption-epa-compared-april-1-2019/
    
    
## We made sure all of the above variables are available for every car and that certain variables are available for cars associated with the fuel type that they fall under. Let's fill in CO2 values and create a new column for Watt-hour consumption!


### 1. CO2 Calculation: Let's quickly remind ourselves of the unique values that fall under both Fuel Types, to make sure we know what to calculate

In [None]:
## Reminder of the distinct values in each of the three fuel-type columns
fuel_type_lines = (
    "Fuel Type: %s\n\n\n" % (df["Fuel Type"].unique()),
    "Fuel Type 1: %s\n\n\n" % (df["Fuel Type 1"].unique()),
    "Fuel Type 2: %s" % (df["Fuel Type 2"].unique()),
)
print(*fuel_type_lines)

In [None]:
## We need a CO2 column for each Fuel Type, and a total CO2 column if Fuel Type contains an 'and'
# Conversion factor applied to the "Annual Consumption in Barrels" columns
# by the calculate* functions defined in this cell.
GPB = 42 ## Gallons per Barrel 


# Planning notes kept as a bare string literal: the text is never executed.
# The Wh sketches refer to "City/Highway Electric Consumption (FTn)", whereas
# calculateWh1 / calculateWh2 read "City Electricity Consumption" and
# "Highway Electricity Consumption".
'''
    Update for:
    Non-Electric Car FT1 CO2:
         x["Tailpipe CO2 (FT1)"] = x["Annual Consumption in Barrels (FT1)"] * GPB \
    * (0.55 * x["City MPG (FT1)"] + 0.45 * x["Highway MPG (FT1)"]) \
    * x["Tailpipe CO2 in Grams/Mile (FT1)"]
    
    Non-Electric Car FT2 CO2 :
    if not np.isnan(x["Fuel Type 2"])
        x["Tailpipe CO2 (FT2)"] = x["Annual Consumption in Barrels (FT2)"] * GPB \
    * (0.55 * x["City MPG (FT2)"] + 0.45 * x["Highway MPG (FT2)"]) \
    * x["Tailpipe CO2 in Grams/Mile (FT2)"]
    
    
    Electric Car FT1 Wh
        x["Wh (FT1)"] = x["Annual Consumption in Barrels (FT1)"] * GPB \
    * (0.55 * x["City MPG (FT1)"] * x["City Electric Consumption (FT1)"] \
    + 0.45 * x["Highway MPG (FT1)"] * x["Highway Electric Consumption (FT1)"]) \
        
    
    Electric Car FT2 Wh
        x["Wh (FT2)"] = x["Annual Consumption in Barrels (FT2)"] * GPB \
    * (0.55 * x["City MPG (FT2)"] * x["City Electric Consumption (FT2)"] \
    + 0.45 * x["Highway MPG (FT2)"] * x["Highway Electric Consumption (FT2)"]) \
        
    if not np.isnan(x["Wh (FT1)"] + x["Wh (FT2)"]):
        x["Wh (Total)"] = x["Wh (FT1)"] + x["Wh (FT2)"]

'''
def calculateCO2_1(x):
    """Compute the Tailpipe CO2 figure for fuel slot 1 of one vehicle row ``x``.

    Multiplies the annual consumption in barrels by ``GPB``, by the
    0.55 / 0.45 weighted blend of city and highway MPG, and by the
    CO2 grams-per-mile value (all FT1 columns).
    """
    gallons = x["Annual Consumption in Barrels (FT1)"] * GPB
    blended_mpg = 0.55 * x["City MPG (FT1)"] + 0.45 * x["Highway MPG (FT1)"]
    return gallons * blended_mpg * x["Tailpipe CO2 in Grams/Mile (FT1)"]

def calculateCO2_2(x):
    """Compute the Tailpipe CO2 figure for fuel slot 2 of one vehicle row ``x``.

    Multiplies the annual consumption in barrels by ``GPB``, by the
    0.55 / 0.45 weighted blend of city and highway MPG, and by the
    CO2 grams-per-mile value (all FT2 columns).
    """
    gallons = x["Annual Consumption in Barrels (FT2)"] * GPB
    blended_mpg = 0.55 * x["City MPG (FT2)"] + 0.45 * x["Highway MPG (FT2)"]
    return gallons * blended_mpg * x["Tailpipe CO2 in Grams/Mile (FT2)"]

def calculateWh1(x):
    """Compute the ``Wh (FT1)`` figure for one vehicle row ``x``.

    Multiplies the FT1 annual consumption in barrels by ``GPB`` and by the
    0.55 / 0.45 weighted sum of (MPG x electricity consumption) for city and
    highway driving.
    """
    gallons = x["Annual Consumption in Barrels (FT1)"] * GPB
    city_term = 0.55 * x["City MPG (FT1)"] * x["City Electricity Consumption"]
    highway_term = 0.45 * x["Highway MPG (FT1)"] * x["Highway Electricity Consumption"]
    return gallons * (city_term + highway_term)

def calculateWh2(x):
    """Compute the ``Wh (FT2)`` figure for one vehicle row ``x``.

    Multiplies the FT2 annual consumption in barrels by ``GPB`` and by the
    0.55 / 0.45 weighted sum of (MPG x electricity consumption) for city and
    highway driving.
    """
    gallons = x["Annual Consumption in Barrels (FT2)"] * GPB
    city_term = 0.55 * x["City MPG (FT2)"] * x["City Electricity Consumption"]
    highway_term = 0.45 * x["Highway MPG (FT2)"] * x["Highway Electricity Consumption"]
    return gallons * (city_term + highway_term)

def updateCO2(x):
    """Fill in the CO2 / Wh figures of one vehicle row and return the new row.

    For each fuel slot (FT1 always, FT2 only when "Fuel Type 2" is present):

    * fuel equal to ``"Electricity"``: store the calculateWh* result in
      ``"Wh (FTn)"``;
    * any other fuel: replace ``"Tailpipe CO2 (FTn)"`` with the calculateCO2_*
      result, but only when it currently holds the ``-1`` placeholder.

    Parameters
    ----------
    x : pandas.Series
        One row of the vehicle table (as passed by ``DataFrame.apply(axis=1)``).

    Returns
    -------
    pandas.Series
        A copy of ``x`` with the values above filled in. ``"Wh (FT1)"`` and
        ``"Wh (FT2)"`` are always present (NaN when not applicable), so every
        returned row has the same labels.
    """
    x = x.copy()  # leave the caller's row untouched

    for wh_column in ("Wh (FT1)", "Wh (FT2)"):
        if wh_column not in x:
            x[wh_column] = float("nan")

    second_fuel = x["Fuel Type 2"]
    # A missing second fuel is a real NaN (which never equals the string
    # "nan"); the literal "nan" is still treated as missing as well.
    if pd.notna(second_fuel) and second_fuel != "nan":
        if second_fuel == "Electricity":
            x["Wh (FT2)"] = calculateWh2(x)
        elif x["Tailpipe CO2 (FT2)"] == -1:
            ## Only calculate if doesn't exist
            x["Tailpipe CO2 (FT2)"] = calculateCO2_2(x)

    if x["Fuel Type 1"] == "Electricity":
        x["Wh (FT1)"] = calculateWh1(x)
    elif x["Tailpipe CO2 (FT1)"] == -1:
        x["Tailpipe CO2 (FT1)"] = calculateCO2_1(x)

    return x
            

def getTotals(x):
    """Add the combined Wh and CO2 totals to one vehicle row.

    ``"Wh (TOTAL)"`` is the sum of the two Wh columns.
    ``"Tailpipe CO2 (Total)"`` is the sum of the FT1 and FT2 CO2 columns when
    "Fuel Type" contains the word "and" surrounded by whitespace, and ``-1``
    otherwise.

    NOTE(review): single-fuel vehicles therefore get a total of -1 rather than
    their FT1 value; confirm this is what downstream rankings expect.

    Parameters
    ----------
    x : pandas.Series
        One row carrying "Fuel Type", "Wh (FT1)", "Wh (FT2)",
        "Tailpipe CO2 (FT1)" and "Tailpipe CO2 (FT2)".

    Returns
    -------
    pandas.Series
        A copy of ``x`` with the two total columns added.
    """
    x = x.copy()  # leave the caller's row untouched
    x["Wh (TOTAL)"] = x["Wh (FT1)"] + x["Wh (FT2)"]

    fuel_type = x["Fuel Type"]
    # Raw string: "\s" is not a valid escape in a normal string literal.
    # The isinstance guard keeps a missing (NaN) fuel type from reaching re.search.
    if isinstance(fuel_type, str) and re.search(r"\sand\s", fuel_type):
        x["Tailpipe CO2 (Total)"] = x["Tailpipe CO2 (FT1)"] + x["Tailpipe CO2 (FT2)"]
    else:
        x["Tailpipe CO2 (Total)"] = -1

    return x
    
    
# Labels of the two per-fuel CO2 columns (the FT2 label previously used braces
# instead of parentheses and matched no column).
columns = ["Tailpipe CO2 (FT1)", "Tailpipe CO2 (FT2)"]


## Before we output our results, let's write some tests

In [None]:
# Hand-picked sample rows for manual inspection; nothing is asserted here.
# `.iloc[[0]]` keeps each of the first four samples as a one-row DataFrame.
test_1 = vital_statistics2.iloc[[0]]  ## first row with no missing values

second_fuel_is_electric = vital_statistics2["Fuel Type 2"] == "Electricity"
test_2 = vital_statistics2[second_fuel_is_electric].iloc[[0]]  ## sample for the Wh (FT2) branch

ft1_co2_is_placeholder = vital_statistics["Tailpipe CO2 (FT1)"] == -1
test_3 = vital_statistics[ft1_co2_is_placeholder].iloc[[0]]  ## sample whose Tailpipe CO2 (FT1) is -1

first_fuel_is_electric = vital_statistics["Fuel Type 1"] == "Electricity"
test_4 = vital_statistics[first_fuel_is_electric].iloc[[0]]  ## sample for the Wh (FT1) branch

# Third row whose Fuel Type contains "and" (returned as a Series).
test_5 = fuelDf[fuelDf["Fuel Type"].str.contains("and")].iloc[2]
test_5





## Now that we're sure that everything works, let's apply the function onto the actual dataframe!

In [None]:
# Run updateCO2 on every row of the raw frame (axis=1 passes one row per call),
# then show the shape of the result.
finalDf = df.apply(updateCO2, axis=1)
finalDf.shape

## Fillna for group calculation

In [None]:
# Replace missing values with 0 in every int64 / float64 column.
for name, dtype in finalDf.dtypes.items():
    if dtype in (np.int64, np.float64):
        finalDf[name] = finalDf[name].fillna(0)
        

        

## Replace -1 in Tailpipe CO2 (FT2) and Tailpipe CO2 (FT1) with 0

In [None]:
# Turn the -1 placeholder into 0 in both per-fuel CO2 columns.
for co2_column in ("Tailpipe CO2 (FT1)", "Tailpipe CO2 (FT2)"):
    finalDf[co2_column] = finalDf[co2_column].replace({-1 : 0})

## Combines Totals for Fuel Type 1 + Fuel Type 2

In [None]:
# Add "Wh (TOTAL)" and "Tailpipe CO2 (Total)" to every row via getTotals.
finalDf = finalDf.apply(getTotals, axis=1)

In [None]:
# Earlier sanity checks, left disabled:
# finalDf.shape
# list(finalDf.columns)
# Distinct values of the "Fuel Type" column after processing.
finalDf["Fuel Type"].unique()

In [None]:
# Columns kept in the final table, in output order.
final_columns = [
    "Vehicle ID", "Fuel Type", "Fuel Type 1", "Fuel Type 2",
    "Year", "Make", "Model", "Class",
    "Fuel Economy Score", "GHG Score",
    "Tailpipe CO2 (FT1)", "Tailpipe CO2 (FT2)", "Tailpipe CO2 (Total)",
    "Wh (FT1)", "Wh (FT2)", "Wh (TOTAL)",
    "Annual Consumption in Barrels (FT1)", "Annual Consumption in Barrels (FT2)",
    "City MPG (FT1)", "City MPG (FT2)",
    "Highway MPG (FT1)", "Highway MPG (FT2)",
    "City Electricity Consumption", "Highway Electricity Consumption",
]
finalDf = finalDf[final_columns]

In [None]:
# Distribution of the FT1 tailpipe CO2 values among non-electric vehicles.
not_electric_ft1 = ~(fuelDf["Fuel Type 1"] == "Electricity")
fuelDf[not_electric_ft1]["Tailpipe CO2 (FT1)"].value_counts()

In [None]:
# Distribution of the FT2 tailpipe CO2 values over all vehicles.
fuelDf["Tailpipe CO2 (FT2)"].value_counts()

## If the fuel type is not electricity, Tailpipe CO2 cannot be equal to 0. A non-electric car always has a CO2 value, or else the value is undefined.

In [None]:
# Descriptive statistics of the Wh and CO2 figures, grouped by each fuel slot.
by_first_fuel = finalDf.groupby("Fuel Type 1")
by_second_fuel = finalDf.groupby("Fuel Type 2")

Type_1_Averages = by_first_fuel[["Wh (FT1)", "Tailpipe CO2 (FT1)"]].describe()
Type_2_Averages = by_second_fuel[["Wh (FT2)", "Tailpipe CO2 (FT2)"]].describe()
Type_2_Averages


In [None]:
# Write the two grouped summaries and the final table to the working directory.
# Only results.csv is written without its index column.
Type_1_Averages.to_csv("./averages_1.csv")
Type_2_Averages.to_csv("./averages_2.csv")
finalDf.to_csv("./results.csv", index=False)

In [None]:
# List the working directory, e.g. to check that the CSV files were written.
%ls

## Alright, let's make sure we have filled out the CO2 Production and Electric Consumption of all vehicles

In [None]:
# Re-run the unfilled-value report on the processed table for the primary fuel.
check_diff_types_vital(finalDf, "Fuel Type 1", "FT1")

In [None]:
# Re-run the report for the secondary fuel, limited to vehicles that have one.
has_second_fuel = finalDf["Fuel Type 2"].notna()
fuelType2Check = finalDf[has_second_fuel]
check_diff_types_vital(fuelType2Check, "Fuel Type 2", "FT2")

In [None]:
# Percentage of rows whose Wh value is still missing. The mean of the boolean
# NaN mask is the missing fraction; it is scaled by 100 because it is printed
# with a percent sign.
# NOTE: an earlier cell replaces NaN with 0 in all int64/float64 columns, so
# after a full top-to-bottom run these figures are 0 by construction.
Percent_Unfilled_FT1 = finalDf["Wh (FT1)"].isna().mean() * 100
Percent_Unfilled_FT2 = finalDf["Wh (FT2)"].isna().mean() * 100

print("Percent of Unfilled Wh for FT1 : %.2f%%" % Percent_Unfilled_FT1)
print("Percent of Unfilled Wh for FT2 : %.2f%%" % Percent_Unfilled_FT2)

## Alright great! All Electric and Fuel Consumption Values are filled! Now, let's analyze some important statistics so that my friends can use them! Let's call them Friend A and Friend B, and let's start with Friend A!

   ### Problem Statement for Friend A:

&nbsp;&nbsp;&nbsp; Friend A is interested in finding out the statistics behind Mini Countryman. Let's find out more about it in order to assess its environmental quality and any other relevant statistics!
    
      

### 1. What is a Mini Cooper Countryman?

&nbsp;&nbsp;&nbsp; To be frank, I have no idea what a Mini Cooper Countryman is, so let's dive into it!

![Insert Image Here](https://cdn.pixabay.com/photo/2020/05/02/16/22/iguanas-5122093__340.jpg)

## Let's try to see if we can find a Mini under Make

In [None]:
interestedColumns = ["Year", "Make", "Model", "Class", "Transmission","Fuel Type", "Fuel Type 1", "Fuel Type 2", "Tailpipe CO2 (FT1)", "Tailpipe CO2 (FT2)", 
                                                    "Tailpipe CO2 (Total)", "Wh (TOTAL)", "GHG Score", "Fuel Economy Score", "GHG Score (Alt Fuel)"]

# finalDf was cut down to a fixed column list in an earlier cell and no longer
# carries every column requested above (e.g. "Transmission"), so selecting them
# directly raises a KeyError on a fresh run. Fetch whatever is missing from the
# raw frame; the join aligns on the row index, which finalDf shares with df.
missing_columns = [name for name in interestedColumns if name not in finalDf.columns]
mini_rows = finalDf[finalDf["Make"] == "MINI"]
CooperCollection = mini_rows.join(df[missing_columns])[interestedColumns]

In [None]:
# Preview the first rows of the MINI subset.
CooperCollection.head()

## Yes, We have found MINI under Make (the Brand)!

Now, let's rank them by CO2 Production in Ascending Order

In [None]:
# Keep subcompact MINIs whose transmission label mentions 6, 7 or 8 speeds,
# ordered from the lowest to the highest total tailpipe CO2.
gearbox = CooperCollection["Transmission"]
has_wanted_gearbox = (
    (gearbox.str.contains("6-Speed"))
    | (gearbox.str.contains("8-Speed"))
    | (gearbox.str.contains("7-Speed"))
)
indicator = (CooperCollection["Class"] == "Subcompact Cars") & has_wanted_gearbox

Best_CO2_Savings = CooperCollection[indicator].sort_values(by="Tailpipe CO2 (Total)", ascending=True)
Best_CO2_Savings

In [None]:
# Display the ranked MINI table again.
Best_CO2_Savings

In [None]:
# Number of ranked MINI rows (shape of the total-CO2 column).
Best_CO2_Savings["Tailpipe CO2 (Total)"].shape

In [None]:
# Build a readable "<year> <make> <model>" label for each car. assign() returns
# a new frame, so no column is written into a slice of CooperCollection.
Best_CO2_Savings = Best_CO2_Savings.assign(
    Indice=Best_CO2_Savings["Year"].astype(str) + " " + Best_CO2_Savings["Make"] + " " + Best_CO2_Savings["Model"]
)

co2_total = Best_CO2_Savings["Tailpipe CO2 (Total)"]
bins = np.linspace(co2_total.min(), co2_total.max(), 10)
# Use the computed edges (previously built but never passed to the plot); fall
# back to a plain bin count when all values are equal and the edges collapse.
hist_bins = bins if co2_total.min() < co2_total.max() else 10

ax = Best_CO2_Savings[["Tailpipe CO2 (Total)"]].plot.hist(bins=hist_bins, title="Tailpipe CO2 Production")
# On a histogram the x-axis carries the CO2 values and the y-axis the counts.
ax.set(xlabel="Amount of CO2 in Grams", ylabel="Number of cars");

In [None]:
# Take an explicit copy so the column assignment below writes to an independent
# frame rather than to a slice of Best_CO2_Savings.
data_we_need = Best_CO2_Savings[["Indice", "Tailpipe CO2 (Total)", "GHG Score", "Fuel Economy Score"]].copy()

# Min-max scale the total CO2 to the [0, 1] range (NaN when all values are equal).
CO2 = Best_CO2_Savings["Tailpipe CO2 (Total)"]
normalized_CO2 = (CO2 - CO2.min())/(CO2.max() - CO2.min())
data_we_need["Tailpipe CO2 (Total)"] = normalized_CO2

# The two score columns are plotted alongside, on their original scale.
data_we_need.plot.bar(x="Indice", xlabel="Car Type", ylabel="Normalized CO2 values", title="Normalized CO2 Consumption from grams of CO2", figsize=(24,12))

## It seems that ~20 MINI Cooper Countryman cars really care about minimizing CO2 production, while the other ~15 do not. Notice how those ~15 cars with the worst pollution have a GHG Score and Fuel Economy Score of -1. Perhaps those scores are being withheld, or maybe the data was simply not collected. We'll explore this more when I have time.

![Mini Cooper](https://upload.wikimedia.org/wikipedia/commons/f/f2/2018_Mini_Countryman_Cooper_Automatic_1.5_Front.jpg)

### Anyway, since only 5-door Subcompact Cars with Manual 6-Speed MINIs are considered to be Mini Countrymen, we recommend the Cooper (5-door) as the most environmentally friendly car out of all Countrymen

### Yay, we have finished helping friend A! Let's help out Friend B now!

## Friend B Requirements

Friend B is interested in a mini SUV that's the most environmentally friendly. Let's get onto it!


# 1. What is a Mini SUV?


In [None]:
# List the distinct vehicle classes to find how SUVs are labelled.
finalDf["Class"].unique()

## That's right, we want to find Sport Utility Vehicles. Let's grab ourselves a list of all of them

In [None]:
# finalDf was cut down to a fixed column list in an earlier cell and lacks some
# of the interestedColumns (e.g. "Transmission"), so selecting them directly
# raises a KeyError. Fetch the missing ones from the raw frame; the join aligns
# on the row index, which finalDf shares with df.
suv_rows = finalDf[finalDf["Class"].str.contains("Sport Utility Vehicle")]
missing_suv_columns = [name for name in interestedColumns if name not in finalDf.columns]
SUVs = suv_rows.join(df[missing_suv_columns])[interestedColumns]


## Looks like the SUVs that produce the least pollution are electric vehicles, some of them being Teslas. Let's rank the electric vehicles by ascending watt-hours consumed annually.

In [None]:
# Electric SUVs, ordered by total tailpipe CO2 and then by total Wh (both ascending).
is_electric_suv = SUVs["Fuel Type"] == "Electricity"
EVs = SUVs[is_electric_suv].sort_values(by=["Tailpipe CO2 (Total)", "Wh (TOTAL)"], ascending=True)
EVs

In [None]:
# Label each electric SUV as "<year> <make> <model>" and chart its total Wh.
ev_year = EVs["Year"].astype(str)
EVs["Indice"] = ev_year + " " + EVs["Make"] + " " + EVs["Model"]
EVs.plot.bar(
    x="Indice",
    y="Wh (TOTAL)",
    rot=45,
    figsize=(24,12),
    ylabel="Energy Consumption in Watt-Hours",
    xlabel="Car Type",
    title="Energy Consumption of Electric SUVs",
)

## The Teslas are very efficient. We have finished helping all of our friends! Hurray!