# Using Pandas

In [60]:
import pandas as pd
import numpy as np
pd.set_option('display.max_rows', 200)
## to make it possible to display multiple output inside one cell 
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

<b>load the data from the vehicles.csv file into pandas data frame

In [61]:
## Your Code here
veh_df=pd.read_csv("data/vehicles.csv")

First exploration of the dataset:

- How many observations does it have?
- Look at all the columns: do you understand what they mean?
- Look at the raw data: do you see anything weird?
- Look at the data types: are they the expected ones for the information the column contains?

In [62]:
# How many observations does it have?
len(veh_df)

35952

In [63]:
#Look at all the columns: do you understand what they mean?
veh_df.columns

Index(['Make', 'Model', 'Year', 'Engine Displacement', 'Cylinders',
       'Transmission', 'Drivetrain', 'Vehicle Class', 'Fuel Type',
       'Fuel Barrels/Year', 'City MPG', 'Highway MPG', 'Combined MPG',
       'CO2 Emission Grams/Mile', 'Fuel Cost/Year'],
      dtype='object')

In [64]:
#Look at the raw data: do you see anything weird?
veh_df
#inconsistencies in labelling make, DJ PO vehicle vs post office dj
#2 wheel drive showing up in 3 different columns (make,drivetrain, vehicle class)
#Drivetrain rear wheel vs 2 wheel drive
#2 wheel drive also in vehicle class, duplicated information
#combined MPG- assume it's supposed to be an average of highway and city, but not always true


Unnamed: 0,Make,Model,Year,Engine Displacement,Cylinders,Transmission,Drivetrain,Vehicle Class,Fuel Type,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG,CO2 Emission Grams/Mile,Fuel Cost/Year
0,AM General,DJ Po Vehicle 2WD,1984,2.5,4.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,19.388824,18,17,17,522.764706,1950
1,AM General,FJ8c Post Office,1984,4.2,6.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,13,13,13,683.615385,2550
2,AM General,Post Office DJ5 2WD,1985,2.5,4.0,Automatic 3-spd,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,20.600625,16,17,16,555.437500,2100
3,AM General,Post Office DJ8 2WD,1985,4.2,6.0,Automatic 3-spd,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,13,13,13,683.615385,2550
4,ASC Incorporated,GNX,1987,3.8,6.0,Automatic 4-spd,Rear-Wheel Drive,Midsize Cars,Premium,20.600625,14,21,16,555.437500,2550
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35947,smart,fortwo coupe,2013,1.0,3.0,Auto(AM5),Rear-Wheel Drive,Two Seaters,Premium,9.155833,34,38,36,244.000000,1100
35948,smart,fortwo coupe,2014,1.0,3.0,Auto(AM5),Rear-Wheel Drive,Two Seaters,Premium,9.155833,34,38,36,243.000000,1100
35949,smart,fortwo coupe,2015,1.0,3.0,Auto(AM5),Rear-Wheel Drive,Two Seaters,Premium,9.155833,34,38,36,244.000000,1100
35950,smart,fortwo coupe,2016,0.9,3.0,Auto(AM6),Rear-Wheel Drive,Two Seaters,Premium,9.155833,34,39,36,246.000000,1100


In [65]:
#Look at the data types: are they the expected ones for the information the column contains?

#From a first glance this seems correct. For efficiency, cyclinders could be int instead of float.  MPG could be float


### Cleaning and wrangling data

- Some car brand names refer to the same brand. Replace all brand names that contain the word "Dutton" for simply "Dutton". If you find similar examples, clean their names too. Use `loc` with boolean indexing.

- Convert CO2 Emissions from Grams/Mile to Grams/Km

- Create a binary column that solely indicates if the transmission of a car is automatic or manual. Use `pandas.Series.str.startswith` and .

- convert MPG columns to km_per_liter

In [66]:
#Some car brand names refer to the same brand. Replace all brand names that contain the word "Dutton" for simply "Dutton". If you find similar examples, clean their names too. 
#Use `loc` with boolean indexing.
veh_df[veh_df["Make"].str.contains("Dutton")]

Unnamed: 0,Make,Model,Year,Engine Displacement,Cylinders,Transmission,Drivetrain,Vehicle Class,Fuel Type,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG,CO2 Emission Grams/Mile,Fuel Cost/Year
11012,"E. P. Dutton, Inc.",Funeral Coach,1985,4.1,8.0,Automatic 4-spd,Front-Wheel Drive,Special Purpose Vehicles,Regular,19.388824,15,21,17,522.764706,1950
30164,S and S Coach Company E.p. Dutton,Funeral Coach 2WD,1984,6.0,8.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,32.961,9,11,10,888.7,3350
31754,Superior Coaches Div E.p. Dutton,Funeral Coach 2WD,1984,6.0,8.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,32.961,10,11,10,888.7,3350


In [67]:
veh_df["Make"].value_counts().sort_index(ascending=True)

AM General                               4
ASC Incorporated                         1
Acura                                  302
Alfa Romeo                              41
American Motors Corporation             22
Aston Martin                           133
Audi                                   890
Aurora Cars Ltd                          1
Autokraft Limited                        4
BMW                                   1677
BMW Alpina                               3
Bentley                                116
Bertone                                  5
Bill Dovell Motor Car Company            2
Bitter Gmbh and Co. Kg                   3
Bugatti                                  8
Buick                                  537
CCC Engineering                          2
CX Automotive                           16
Cadillac                               508
Chevrolet                             3643
Chrysler                               641
Consulier Industries Inc                 3
Dabryan Coa

In [68]:

def data_consistency_replacements(col_name,old_val,new_val):
    '''Used to make info in a column consistent in terms of naming
    Uses old value to search for part of string
    Then replaces it with new string
    Requires column name for search'''
    veh_df.loc[veh_df[col_name].str.contains(old_val), col_name] = new_val

data_consistency_replacements("Make","Dutton","Dutton")
data_consistency_replacements("Make","BMW","BMW")
data_consistency_replacements("Make","Grumman","Grumman")
data_consistency_replacements("Make","PAS","GMC")



veh_df[veh_df["Make"]=="Dutton"]


Unnamed: 0,Make,Model,Year,Engine Displacement,Cylinders,Transmission,Drivetrain,Vehicle Class,Fuel Type,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG,CO2 Emission Grams/Mile,Fuel Cost/Year
11012,Dutton,Funeral Coach,1985,4.1,8.0,Automatic 4-spd,Front-Wheel Drive,Special Purpose Vehicles,Regular,19.388824,15,21,17,522.764706,1950
30164,Dutton,Funeral Coach 2WD,1984,6.0,8.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,32.961,9,11,10,888.7,3350
31754,Dutton,Funeral Coach 2WD,1984,6.0,8.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,32.961,10,11,10,888.7,3350


In [69]:
# Convert CO2 Emissions from Grams/Mile to Grams/Km

#gather values of grams/mile from dataframe as series
grams_mile=veh_df["CO2 Emission Grams/Mile"]

#convert values to grams/KM
grams_km=lambda x:x*0.621372737

#re-introduce the values into the dataframe
veh_df["CO2 Emission Grams/Mile"]=grams_km(grams_mile)


#rename column
veh_df.rename(columns={"CO2 Emission Grams/Mile":"CO2 Emissions Gram/KM"},inplace=True)
veh_df

Unnamed: 0,Make,Model,Year,Engine Displacement,Cylinders,Transmission,Drivetrain,Vehicle Class,Fuel Type,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG,CO2 Emissions Gram/KM,Fuel Cost/Year
0,AM General,DJ Po Vehicle 2WD,1984,2.5,4.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,19.388824,18,17,17,324.831736,1950
1,AM General,FJ8c Post Office,1984,4.2,6.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,13,13,13,424.779963,2550
2,AM General,Post Office DJ5 2WD,1985,2.5,4.0,Automatic 3-spd,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,20.600625,16,17,16,345.133720,2100
3,AM General,Post Office DJ8 2WD,1985,4.2,6.0,Automatic 3-spd,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,13,13,13,424.779963,2550
4,ASC Incorporated,GNX,1987,3.8,6.0,Automatic 4-spd,Rear-Wheel Drive,Midsize Cars,Premium,20.600625,14,21,16,345.133720,2550
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35947,smart,fortwo coupe,2013,1.0,3.0,Auto(AM5),Rear-Wheel Drive,Two Seaters,Premium,9.155833,34,38,36,151.614948,1100
35948,smart,fortwo coupe,2014,1.0,3.0,Auto(AM5),Rear-Wheel Drive,Two Seaters,Premium,9.155833,34,38,36,150.993575,1100
35949,smart,fortwo coupe,2015,1.0,3.0,Auto(AM5),Rear-Wheel Drive,Two Seaters,Premium,9.155833,34,38,36,151.614948,1100
35950,smart,fortwo coupe,2016,0.9,3.0,Auto(AM6),Rear-Wheel Drive,Two Seaters,Premium,9.155833,34,39,36,152.857693,1100


In [70]:
# Create a binary column that solely indicates if the transmission of a car is automatic or manual. Use `pandas.Series.str.startswith` and .
#Check data to understand it's underlying structure
veh_df['Transmission'].value_counts()

# Check whether str.contains has manual or automatic in true/false format
is_auto=veh_df['Transmission'].str.contains("Auto")
#append this as new column in the data


veh_df['Auto or Manual?']=is_auto
veh_df['Auto or Manual?']=veh_df['Auto or Manual?'].map({True:"Automatic",False:"Manual"})
veh_df

Automatic 4-spd                     10585
Manual 5-spd                         7787
Automatic (S6)                       2631
Automatic 3-spd                      2597
Manual 6-spd                         2423
Automatic 5-spd                      2171
Automatic 6-spd                      1432
Manual 4-spd                         1306
Automatic (S8)                        960
Automatic (S5)                        822
Automatic (variable gear ratios)      675
Automatic 7-spd                       662
Automatic (S7)                        261
Auto(AM-S7)                           256
Automatic 8-spd                       243
Automatic (S4)                        229
Auto(AM7)                             157
Auto(AV-S6)                           145
Auto(AM6)                             110
Auto(AM-S6)                            92
Automatic 9-spd                        90
Manual 3-spd                           74
Manual 7-spd                           68
Auto(AV-S7)                       

Unnamed: 0,Make,Model,Year,Engine Displacement,Cylinders,Transmission,Drivetrain,Vehicle Class,Fuel Type,Fuel Barrels/Year,City MPG,Highway MPG,Combined MPG,CO2 Emissions Gram/KM,Fuel Cost/Year,Auto or Manual?
0,AM General,DJ Po Vehicle 2WD,1984,2.5,4.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,19.388824,18,17,17,324.831736,1950,Automatic
1,AM General,FJ8c Post Office,1984,4.2,6.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,13,13,13,424.779963,2550,Automatic
2,AM General,Post Office DJ5 2WD,1985,2.5,4.0,Automatic 3-spd,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,20.600625,16,17,16,345.133720,2100,Automatic
3,AM General,Post Office DJ8 2WD,1985,4.2,6.0,Automatic 3-spd,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,13,13,13,424.779963,2550,Automatic
4,ASC Incorporated,GNX,1987,3.8,6.0,Automatic 4-spd,Rear-Wheel Drive,Midsize Cars,Premium,20.600625,14,21,16,345.133720,2550,Automatic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35947,smart,fortwo coupe,2013,1.0,3.0,Auto(AM5),Rear-Wheel Drive,Two Seaters,Premium,9.155833,34,38,36,151.614948,1100,Automatic
35948,smart,fortwo coupe,2014,1.0,3.0,Auto(AM5),Rear-Wheel Drive,Two Seaters,Premium,9.155833,34,38,36,150.993575,1100,Automatic
35949,smart,fortwo coupe,2015,1.0,3.0,Auto(AM5),Rear-Wheel Drive,Two Seaters,Premium,9.155833,34,38,36,151.614948,1100,Automatic
35950,smart,fortwo coupe,2016,0.9,3.0,Auto(AM6),Rear-Wheel Drive,Two Seaters,Premium,9.155833,34,39,36,152.857693,1100,Automatic


In [71]:
# convert MPG columns to km_per_liter

def mpg_to_kpl_converter(col_name):
    
    #gather values of MPG from dataframe as series
    mpg_list=veh_df[col_name]
    #convert values to grams/KM
    mpg_to_kpl=lambda x:round(x*0.425144,2)
    #re-introduce the values into the dataframe
    veh_df[col_name]=mpg_to_kpl(mpg_list)
    #rename column
    new_col_name=col_name.replace("MPG","KM/Liter")
    veh_df.rename(columns={col_name:new_col_name},inplace=True)

mpg_to_kpl_converter("City MPG")
mpg_to_kpl_converter("Highway MPG")
mpg_to_kpl_converter("Combined MPG")

veh_df

Unnamed: 0,Make,Model,Year,Engine Displacement,Cylinders,Transmission,Drivetrain,Vehicle Class,Fuel Type,Fuel Barrels/Year,City KM/Liter,Highway KM/Liter,Combined KM/Liter,CO2 Emissions Gram/KM,Fuel Cost/Year,Auto or Manual?
0,AM General,DJ Po Vehicle 2WD,1984,2.5,4.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,19.388824,7.65,7.23,7.23,324.831736,1950,Automatic
1,AM General,FJ8c Post Office,1984,4.2,6.0,Automatic 3-spd,2-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,5.53,5.53,5.53,424.779963,2550,Automatic
2,AM General,Post Office DJ5 2WD,1985,2.5,4.0,Automatic 3-spd,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,20.600625,6.80,7.23,6.80,345.133720,2100,Automatic
3,AM General,Post Office DJ8 2WD,1985,4.2,6.0,Automatic 3-spd,Rear-Wheel Drive,Special Purpose Vehicle 2WD,Regular,25.354615,5.53,5.53,5.53,424.779963,2550,Automatic
4,ASC Incorporated,GNX,1987,3.8,6.0,Automatic 4-spd,Rear-Wheel Drive,Midsize Cars,Premium,20.600625,5.95,8.93,6.80,345.133720,2550,Automatic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35947,smart,fortwo coupe,2013,1.0,3.0,Auto(AM5),Rear-Wheel Drive,Two Seaters,Premium,9.155833,14.45,16.16,15.31,151.614948,1100,Automatic
35948,smart,fortwo coupe,2014,1.0,3.0,Auto(AM5),Rear-Wheel Drive,Two Seaters,Premium,9.155833,14.45,16.16,15.31,150.993575,1100,Automatic
35949,smart,fortwo coupe,2015,1.0,3.0,Auto(AM5),Rear-Wheel Drive,Two Seaters,Premium,9.155833,14.45,16.16,15.31,151.614948,1100,Automatic
35950,smart,fortwo coupe,2016,0.9,3.0,Auto(AM6),Rear-Wheel Drive,Two Seaters,Premium,9.155833,14.45,16.58,15.31,152.857693,1100,Automatic


Converting Grams/Mile to Grams/Km

1 Mile = 1.60934 Km

Grams/Mile * Mile/Km -> Grams/Mile * 1 Mile/1.60934Km

$$ \frac{Grams}{Mile} * \frac{Mile}{Km} $$

$$ \frac{Grams}{Mile} * \frac{1 Mile}{1.60934Km}  $$

convert MPG columns to km_per_liter

MPG = Miles/Gallon -> Km/Liter

1 Mile = 1.60934 Km

1 Gallon = 3.78541 Liters

$$ \frac{Miles}{Gallon} -> \frac{Miles}{Gallon} * \frac{Km}{Miles} * \frac{Gallon}{Liters}$$

$$ \frac{Miles}{Gallon} -> \frac{Miles}{Gallon} * \frac{1.60934Km}{ 1Miles} * \frac{1 Gallon}{3.78541 Liters}$$

* ( 1.60934 / 3.78541 )


### Gathering insights:

- How many car makers are there? How many models? Which car maker has the most cars in the dataset?

- When were these cars made? How big is the engine of these cars?

- What's the frequency of different transmissions, drivetrains and fuel types?

- What's the car that consumes the least/most fuel?

In [72]:

# How many car makers are there? How many models? Which car maker has the most cars in the dataset?


make_list=list(set(veh_df['Make']))
model_list=list(set(veh_df['Model']))
most_cars=veh_df['Make'].mode()

print(" ".join(most_cars),"has the most car models in the list")
print("There are",len(make_list),"makes of cars")
print("There are",len(model_list),"models of cars")

Chevrolet has the most car models in the list
There are 121 makes of cars
There are 3608 models of cars


In [73]:
# When were these cars made? How big is the engine of these cars?
# When were these cars made? How big is the engine of these cars?
min_year=min(veh_df["Year"])
max_year=max(veh_df["Year"])
min_cyclinders=int(min(veh_df["Cylinders"]))
max_cyclinders=int(max(veh_df["Cylinders"]))

print(f"The cars were built between {min_year} and {max_year}")
print(f"The engines are between a {min_cyclinders} and {max_cyclinders} cylinders")

The cars were built between 1984 and 2017
The engines are between a 2 and 16 cylinders


In [74]:
# What's the frequency of different transmissions, drivetrains and fuel types?
def data_consistency_replacements(col_name,old_val,new_val):
    '''Used to Transmission info in a column consistent in terms of naming
    Uses old value to search for part of string
    Then replaces it with new string
    Requires column name for search'''
    temp_str=new_val+" "+veh_df["Auto or Manual?"]
    veh_df.loc[veh_df[col_name].str.contains(old_val), col_name] = temp_str

data_consistency_replacements("Transmission","1","Variable Speed")
data_consistency_replacements("Transmission","variable","Variable Speed")
data_consistency_replacements("Transmission","AV","Variable Speed")
data_consistency_replacements("Transmission","2","2 Speed")
data_consistency_replacements("Transmission","3","3 Speed")
data_consistency_replacements("Transmission","4","4 Speed")
data_consistency_replacements("Transmission","5","5 Speed")
data_consistency_replacements("Transmission","6","6 Speed")
data_consistency_replacements("Transmission","7","7 Speed")
data_consistency_replacements("Transmission","8","8 Speed")
data_consistency_replacements("Transmission","9","9 Speed")
veh_df["Transmission"].value_counts()

4 Speed Automatic           10816
5 Speed Manual               7788
6 Speed Automatic            4271
5 Speed Automatic            3007
3 Speed Automatic            2599
6 Speed Manual               2423
7 Speed Automatic            1336
4 Speed Manual               1306
8 Speed Automatic            1214
Variable Speed Automatic      930
9 Speed Automatic             117
3 Speed Manual                 74
7 Speed Manual                 71
Name: Transmission, dtype: int64

In [75]:
#drivetrains and fuel types?

def data_consistency_replacements(col_name,old_val,new_val):
    '''For data cleaning, takes records that contain old value, and replaces it with new, consistent value.  
    Makes search and replace in specific column name'''
    
    veh_df.loc[veh_df[col_name].str.contains(old_val), col_name] = new_val
    
data_consistency_replacements("Drivetrain","4-Wheel","All-Wheel Drive")

two_wheel=['Front','Rear','2']    
for i in two_wheel:
    data_consistency_replacements("Drivetrain",i,"Two-Wheel Drive")

veh_df["Drivetrain"].value_counts()


Two-Wheel Drive    26194
All-Wheel Drive     9758
Name: Drivetrain, dtype: int64

In [76]:
petrol_list=['Regular','Premium','Gasoline','Midgrade']
data_consistency_replacements("Fuel Type",'or','Hybrid')

for i in petrol_list:
    data_consistency_replacements("Fuel Type",i,"Petrol")


veh_df['Fuel Type'].value_counts()



Petrol    33618
Hybrid     1363
Diesel      911
CNG          60
Name: Fuel Type, dtype: int64

In [77]:
# What's the car that consumes the least/most fuel?
min_fuel=min(veh_df["Fuel Barrels/Year"])
max_fuel=max(veh_df["Fuel Barrels/Year"])

min_df=veh_df.loc[veh_df["Fuel Barrels/Year"]==min_fuel,"Make"]
max_df=veh_df.loc[veh_df["Fuel Barrels/Year"]==max_fuel,"Make"]
print (" ".join(set(min_df)),"uses the least fuel")
print (" ".join(set(max_df)),"uses the most fuel")

Honda uses the least fuel
Lamborghini uses the most fuel


<b> (Optional)

What brand has the worse CO2 Emissions on average?

Hint: use the function `sort_values()`

In [78]:
## your Code here

min_CO2=min(veh_df["CO2 Emissions Gram/KM"])
max_CO2=max(veh_df["CO2 Emissions Gram/KM"])

min_df=veh_df['Make'][veh_df["CO2 Emissions Gram/KM"]==min_CO2]
max_df=veh_df['Make'][veh_df["CO2 Emissions Gram/KM"]==max_CO2]
print (" ".join(set(min_df)),"uses the least CO2")
print (" ".join(set(max_df)),"uses the most CO2")


BMW uses the least CO2
Lamborghini uses the most CO2


Do cars with automatic transmission consume more fuel than cars with manual transmission on average?

In [79]:
## Your Code is here 
#columns to use: Fuel Barrels/Year
            # Auto or Manual?

auto_avg=round(np.mean(veh_df['Fuel Barrels/Year'][veh_df['Auto or Manual?']=="Automatic"]),2)
manual_avg=round(np.mean(veh_df['Fuel Barrels/Year'][veh_df['Auto or Manual?']=="Manual"]),2)

print(f'Automatic transmissions use {auto_avg} barrels of fuel a year, while manual transmissions use {manual_avg} barrels of fuel each year')

Automatic transmissions use 18.04 barrels of fuel a year, while manual transmissions use 16.7 barrels of fuel each year
