# Imports


In [1]:
import pandas as pd
import re # for regex functionality
import matplotlib.pyplot as plt # Import for visuals
from matplotlib.ticker import FuncFormatter
import seaborn as sns

# Adjust display settings for better readability
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)

## Code Start

In [2]:
fuel_econ = pd.read_csv('../data/vehicles.csv', dtype={74: str, 75: str, 77: str})#, low_memory=False)

#print(fuel_econ.columns)
print(fuel_econ['atvType'].unique()) # see all the unique fuel types we are working with.

# nan appears to be regular,mid, premium indicating that gasoline appears as nan

#fuel_type_check = fuel_econ[fuel_econ['model'].str.contains('sentra', case=False, na=False)]
#print(fuel_type_check['atvType'].unique())

# Also noted a datatype warning so I will need to verify what datatype to utilize for this column if kept.
# Check the datatypes of the specified columns

# Used to check dtypes to resolve warning regarding varying data types in three columns.
#columns_to_check = [74, 75, 77]
#datatypes = fuel_econ.iloc[:, columns_to_check].dtypes
#print(datatypes)

# chose to set datatype on columns to str on load to resolve warning.



[nan 'Diesel' 'Hybrid' 'Bifuel (CNG)' 'CNG' 'FFV' 'EV' 'Bifuel (LPG)'
 'Plug-in Hybrid' 'FCV']


In [3]:
# created this code to find all columns with base in their name in order to solve a naming descrepency.  Needed to capitalize the M in baseModel

# Read the CSV file to get the column names
file_path = '../data/vehicles.csv'
df_temp = pd.read_csv(file_path, nrows=0) # return only column names

# Print out columns contained within csv altogether.
print(fuel_econ.columns)
#count rows and columns
print(fuel_econ.shape)
# 84 columns this will be fun

# Find columns that contain 'base' in their names - Due to conflict with naming documentation and actual column names.
# use col.lower to removed conflicts in locating the data that may be case sensitive.
base_columns = [col for col in df_temp.columns if 'base' in col.lower()]

print("Columns that include 'base' in their name:")
print(base_columns) # Shows column basemodel based on documentation, real name = baseModel

# Determin columns that include fuel in their name.
ftype_columns = [col for col in df_temp.columns if 'fuel' in col.lower()]

print("Columns that include 'fuel' in their name:")
print(ftype_columns)


Index(['barrels08', 'barrelsA08', 'charge120', 'charge240', 'city08',
       'city08U', 'cityA08', 'cityA08U', 'cityCD', 'cityE', 'cityUF', 'co2',
       'co2A', 'co2TailpipeAGpm', 'co2TailpipeGpm', 'comb08', 'comb08U',
       'combA08', 'combA08U', 'combE', 'combinedCD', 'combinedUF', 'cylinders',
       'displ', 'drive', 'engId', 'eng_dscr', 'feScore', 'fuelCost08',
       'fuelCostA08', 'fuelType', 'fuelType1', 'ghgScore', 'ghgScoreA',
       'highway08', 'highway08U', 'highwayA08', 'highwayA08U', 'highwayCD',
       'highwayE', 'highwayUF', 'hlv', 'hpv', 'id', 'lv2', 'lv4', 'make',
       'model', 'mpgData', 'phevBlended', 'pv2', 'pv4', 'range', 'rangeCity',
       'rangeCityA', 'rangeHwy', 'rangeHwyA', 'trany', 'UCity', 'UCityA',
       'UHighway', 'UHighwayA', 'VClass', 'year', 'youSaveSpend', 'baseModel',
       'guzzler', 'trans_dscr', 'tCharger', 'sCharger', 'atvType', 'fuelType2',
       'rangeA', 'evMotor', 'mfrCode', 'c240Dscr', 'charge240b', 'c240bDscr',
       'createdOn'

In [4]:
print(fuel_econ.head(1))
print('-'*20)

# Filter the DataFrame
barrels_filtered_df = fuel_econ[(fuel_econ['barrels08'].notna()) & 
                        (fuel_econ['barrels08'] != 0.0) & 
                        (fuel_econ['barrelsA08'].notna()) & 
                        (fuel_econ['barrelsA08'] != 0.0)]

# Select the first row that meets the criteria
selected_row = barrels_filtered_df.iloc[0]

selected_row_fuelTypes = barrels_filtered_df['fuelType'].unique()

print(selected_row_fuelTypes)  # see the fuel types that are associated with column 'barrelsA08' that contain values above 0
print('-'*20)
print()

   barrels08  barrelsA08  charge120  charge240  city08  city08U  cityA08  cityA08U  cityCD  cityE  cityUF  co2  co2A  co2TailpipeAGpm  co2TailpipeGpm  comb08  comb08U  combA08  combA08U  combE  combinedCD  combinedUF  cylinders  displ             drive  engId eng_dscr  feScore  fuelCost08  fuelCostA08 fuelType         fuelType1  ghgScore  ghgScoreA  highway08  highway08U  highwayA08  highwayA08U  highwayCD  highwayE  highwayUF  hlv  hpv  id  lv2  lv4        make               model mpgData  phevBlended  pv2  pv4  range  rangeCity  rangeCityA  rangeHwy  rangeHwyA         trany    UCity  UCityA  UHighway  UHighwayA       VClass  year  youSaveSpend baseModel guzzler trans_dscr tCharger sCharger atvType fuelType2 rangeA evMotor mfrCode c240Dscr  charge240b c240bDscr                     createdOn                    modifiedOn startStop  phevCity  phevHwy  phevComb
0  14.167143         0.0        0.0        0.0      19      0.0        0       0.0     0.0    0.0     0.0   -1    -1            

### Get Overview of data

In [5]:
# Cleaned up method

spacer = ['-'*20, '-'*2]
titleList = ['Data Discovery', 'Annual fuel consumption', 'Charging time', 'Mileage/Type', 'Carbon Emissions per mile', 'MPG/MPKwh']
descript_dict = {
    'barrels': 'annual petroleum consumption for fuelType1 and fuelType2 (1 barrel = 42 gallons)',
    'barrels08': 'annual petroleum consumption for fuelType1',
    'barrelsA08': 'annual petroleum consumption for fuelType2',
    'charge120': 'time to charge at 120V (hours)',
    'charge240': 'time to charge at 240V (hours)',
    'city08': 'city MPG for fuelType1',
    'city08U': 'unrounded city MPG for fuelType1',
    'cityA08': 'city MPG for fuelType2',
    'cityA08U': 'unrounded city MPG for fuelType2',
    'cityCD': 'city gasoline consumption in charge depleting mode (gallons/100 miles)',
    'cityE': 'city electricity consumption (kw-hrs/100 miles)',
    'cityUF': 'EPA city utility factor for PHEV',
    'co2': 'tailpipe CO2 for fuelType1 (grams/mile)',
    'co2A': 'tailpipe CO2 for fuelType2 (grams/mile)',
    'co2TailpipeGpm': 'tailpipe CO2 for fuelType1 (grams/mile)',
    'co2TailpipeAGpm': 'tailpipe CO2 for fuelType2 (grams/mile)',
    'comb08': 'combined MPG for fuelType1',
    'comb08U': 'unrounded combined MPG for fuelType1',
    'combA08': 'combined MPG for fuelType2',
    'combA08U': 'unrounded combined MPG for fuelType2',
    'combE': 'combined electricity consumption (kw-hrs/100 miles)',
    'combinedCD': 'combined gasoline consumption in charge depleting mode (gallons/100 miles)',
    'combinedUF': 'EPA combined utility factor for PHEV',
    'cylinders': 'engine cylinders',
    'displ': 'engine displacement (liters)',
    'drive': 'drive axle type',
    'engId': 'EPA model type index',
    'eng_dscr': 'engine descriptor',
    'feScore': 'EPA Fuel Economy Score (-1 = Not available)',
    'fuelCost08': 'annual fuel cost for fuelType1 ($)',
    'fuelCostA08': 'annual fuel cost for fuelType2 ($)',
    'fuelType': 'fuel type (fuelType1 and fuelType2, if applicable)',
    'fuelType1': 'primary fuel type for single or conventional fuel vehicles',
    'fuelType2': 'alternative fuel type for dual fuel vehicles',
    'ghgScore': 'EPA GHG score for fuelType1 (-1 = Not available)',
    'ghgScoreA': 'EPA GHG score for fuelType2 (-1 = Not available)',
    'guzzler': 'subject to gas guzzler tax (G or T)',
    'trans_dscr': 'transmission descriptor',
    'highway08': 'highway MPG for fuelType1',
    'highway08U': 'unrounded highway MPG for fuelType1',
    'highwayA08': 'highway MPG for fuelType2',
    'highwayA08U': 'unrounded highway MPG for fuelType2',
    'highwayCD': 'highway gasoline consumption in charge depleting mode (gallons/100 miles)',
    'highwayE': 'highway electricity consumption (kw-hrs/100 miles)',
    'highwayUF': 'EPA highway utility factor for PHEV',
    'hlv': 'hatchback luggage volume (cubic feet)',
    'hpv': 'hatchback passenger volume (cubic feet)',
    'id': 'vehicle record ID',
    'lv2': '2-door luggage volume (cubic feet)',
    'lv4': '4-door luggage volume (cubic feet)',
    'make': 'manufacturer (division)',
    'mfrCode': '3-character manufacturer code',
    'model': 'model name (carline)',
    'mpgData': 'has My MPG data',
    'phevBlended': 'PHEV operates on a blend of gasoline and electricity in charge depleting mode',
    'pv2': '2-door passenger volume (cubic feet)',
    'pv4': '4-door passenger volume (cubic feet)',
    'range': 'estimated range for fuelType1',
    'rangeCity': 'estimated city range for fuelType1',
    'rangeCityA': 'EPA city range for fuelType2',
    'rangeHwy': 'estimated highway range for fuelType1',
    'rangeHwyA': 'EPA highway range for fuelType2',
    'trany': 'transmission',
    'UCity': 'unadjusted city MPG for fuelType1',
    'UCityA': 'unadjusted city MPG for fuelType2',
    'UHighway': 'unadjusted highway MPG for fuelType1',
    'UHighwayA': 'unadjusted highway MPG for fuelType2',
    'VClass': 'EPA vehicle size class',
    'year': 'model year',
    'youSaveSpend': 'savings/spending over 5 years compared to an average car',
    'sCharger': 'supercharged (S)',
    'tCharger': 'turbocharged (T)',
    'c240Dscr': 'electric vehicle charger description',
    'charge240b': 'time to charge an electric vehicle at 240V (hours)',
    'c240bDscr': 'electric vehicle alternate charger description',
    'createdOn': 'vehicle record creation date',
    'modifiedOn': 'last modified date of the vehicle record',
    'startStop': 'vehicle has stop-start technology',
    'phevCity': 'EPA composite gasoline-electricity city MPGe for plug-in hybrid vehicles',
    'phevHwy': 'EPA composite gasoline-electricity highway MPGe for plug-in hybrid vehicles',
    'phevComb': 'EPA composite gasoline-electricity combined city-highway MPGe for plug-in hybrid vehicles',
    'baseModel': 'base model name',
    'atvType': 'type of alternative fuel or advanced technology vehicle',
    'rangeA': 'EPA range for fuelType2',
    'evMotor': 'electric motor (kw-hrs)'
}


# get names and unique values for each column in the column list
with open('../Documents/Column_Descript_Unique_Values.txt', 'w') as file:
    for col in fuel_econ:
        unique_values = fuel_econ[col].unique()
        description = descript_dict.get(col, "No description available")
        file.write(f'{col}:  {description}\n')
        file.write(f"{spacer[1]}\n")
        for value in unique_values:
            file.write(f"{value}, ")
        file.write("\n" + "-"*10 + "\n")







In [24]:
# Define the list of columns you need
columns = [
    'barrels08', 'charge240',
    'cityE', 'co2',
    'co2TailpipeGpm', 'comb08',
    'combE','cylinders',
    'displ', 'drive', 
    'engId', 'eng_dscr', 
    'feScore', 'fuelCost08',
    'fuelType', 'fuelType1', 
    'ghgScore', 'id',
    'make', 'model', 
    'mpgData', 'range',
    'rangeCity', 'rangeHwy',
    'trany', 'VClass', 
    'year', 'youSaveSpend', 
    'baseModel', 'trans_dscr', 
    'tCharger', 'sCharger', 
    'atvType', 'evMotor', 
    'mfrCode', 'c240Dscr', 
    'charge240b', 'fuelType2'
    ] # 37 total

# filter out all fuel types other than regu

# Read the CSV file, selecting only the specified columns
df = pd.read_csv('../data/vehicles.csv', usecols=columns)

# print(df['fuelType'].unique())
print(f'{df.shape} Before')
remove = ['Diesel', 'CNG', 'Gasoline or natural gas',
 'Gasoline or E85', 'Gasoline or propane' 'Premium or E85',
 'Premium Gas or Electricity', 'Regular Gas and Electricity',
 'Premium and Electricity', 'Regular Gas or Electricity', 'Hydrogen', 'Gasoline or propane', 'Premium or E85']

keep_rows = df[~df['fuelType'].isin(remove)]

print(f'{keep_rows.shape} After')

print(keep_rows['fuelType'].unique())
print(keep_rows['fuelType1'].unique())
print(keep_rows['fuelType1'].unique())
print(keep_rows['atvType'].unique())

print(keep_rows[keep_rows['atvType'] == 'Hybrid'].head())


'''
unique_ev_combinations = df[['VClass', 'fuelType', 'fuelType1', 'combE']].drop_duplicates(subset=['VClass','fuelType' ,'fuelType1'])  # check diff fuel types for vehicles that are electric/hybrid unique combos using subset
print(unique_ev_combinations.sort_values(by='VClass')) # show them and order them by VClass
unique_ev_combinations = df[['VClass', 'fuelType', 'fuelType1', 'combE']]


print(unique_ev_combinations.shape)
print(41*.13/100) # approx 41Kwh per 100 miles, and approx 13 cents per Kwh, multiply Kwh by cost per Kwh to get total for 100 miles then divide to get per mile.  .0533 cents per mile.
# combE returns a number of Kwh per 100 miles.
print(2.50/22)  # guessing fuel prices at 2.50/gallon and approx 22 miles per gallon equals .1136 cents per mile
print(unique_ev_combinations['VClass'].unique())  # See the diff unique Vehicle classes that I can compare that use electricity or gas/electric
'''


(48351, 38) Before
(45042, 38) After
['Regular' 'Premium' 'Electricity' 'Midgrade']
['Regular Gasoline' 'Premium Gasoline' 'Electricity' 'Midgrade Gasoline']
['Regular Gasoline' 'Premium Gasoline' 'Electricity' 'Midgrade Gasoline']
[nan 'Hybrid' 'EV']
      barrels08  charge240  cityE  co2  co2TailpipeGpm  comb08  combE  cylinders  displ              drive  engId eng_dscr  feScore  fuelCost08 fuelType         fuelType1  ghgScore     id    make    model mpgData  range  rangeCity  rangeHwy                             trany        VClass  year  youSaveSpend baseModel trans_dscr tCharger sCharger atvType fuelType2 evMotor mfrCode c240Dscr  charge240b
6230   5.613396        0.0    0.0   -1      167.679245      53    0.0        3.0    1.0  Front-Wheel Drive      0      NaN       -1         900  Regular  Regular Gasoline        -1  15606   Honda  Insight       Y      0        0.0       0.0                      Manual 5-spd   Two Seaters  2000          3750   Insight        SIL      NaN      N

  df = pd.read_csv('../data/vehicles.csv', usecols=columns)


"\nunique_ev_combinations = df[['VClass', 'fuelType', 'fuelType1', 'combE']].drop_duplicates(subset=['VClass','fuelType' ,'fuelType1'])  # check diff fuel types for vehicles that are electric/hybrid unique combos using subset\nprint(unique_ev_combinations.sort_values(by='VClass')) # show them and order them by VClass\nunique_ev_combinations = df[['VClass', 'fuelType', 'fuelType1', 'combE']]\n\n\nprint(unique_ev_combinations.shape)\nprint(41*.13/100) # approx 41Kwh per 100 miles, and approx 13 cents per Kwh, multiply Kwh by cost per Kwh to get total for 100 miles then divide to get per mile.  .0533 cents per mile.\n# combE returns a number of Kwh per 100 miles.\nprint(2.50/22)  # guessing fuel prices at 2.50/gallon and approx 22 miles per gallon equals .1136 cents per mile\nprint(unique_ev_combinations['VClass'].unique())  # See the diff unique Vehicle classes that I can compare that use electricity or gas/electric\n"