# Imports


In [49]:
import pandas as pd
import re # for regex functionality
import matplotlib.pyplot as plt # Import for visuals
from matplotlib.ticker import FuncFormatter
import seaborn as sns

# Printing preferences: show every column (no "..." truncation) and keep
# wide frames on one logical line instead of wrapping them into blocks.
pd.options.display.max_columns = None
pd.options.display.expand_frame_repr = False

## Code Start

In [50]:
# Loading the file without dtype hints raised a mixed-dtype warning for the
# columns at positions 74, 75 and 77, so those three are forced to str on load.
# (The warning was investigated with fuel_econ.iloc[:, [74, 75, 77]].dtypes.)
mixed_dtype_positions = (74, 75, 77)
fuel_econ = pd.read_csv(
    '../data/vehicles.csv',
    dtype=dict.fromkeys(mixed_dtype_positions, str),
)

# Show every distinct atvType label in the data.
# Author's observation: NaN appears to correspond to ordinary gasoline vehicles
# (regular / midgrade / premium), i.e. gasoline shows up as NaN in this column.
atv_type_values = fuel_econ['atvType'].unique()
print(atv_type_values)



[nan 'Diesel' 'Hybrid' 'Bifuel (CNG)' 'CNG' 'FFV' 'EV' 'Bifuel (LPG)'
 'Plug-in Hybrid' 'FCV']


In [51]:
# Column-name lookup written to resolve a naming discrepancy: the documentation
# spells the column "basemodel", while the CSV header uses "baseModel".

# Parse only the header row (nrows=0) so that just the column names are loaded
file_path = '../data/vehicles.csv'
df_temp = pd.read_csv(file_path, nrows=0)

# Full column index of the loaded data, followed by its (rows, columns) shape
print(fuel_econ.columns)
print(fuel_econ.shape)

# Names are lower-cased before matching so the search is case-insensitive.
base_columns = list(filter(lambda name: 'base' in name.lower(), df_temp.columns))
print("Columns that include 'base' in their name:")
print(base_columns)  # the documented "basemodel" is really named "baseModel"

# Same search for every column whose name mentions "fuel"
ftype_columns = list(filter(lambda name: 'fuel' in name.lower(), df_temp.columns))
print("Columns that include 'fuel' in their name:")
print(ftype_columns)


Index(['barrels08', 'barrelsA08', 'charge120', 'charge240', 'city08',
       'city08U', 'cityA08', 'cityA08U', 'cityCD', 'cityE', 'cityUF', 'co2',
       'co2A', 'co2TailpipeAGpm', 'co2TailpipeGpm', 'comb08', 'comb08U',
       'combA08', 'combA08U', 'combE', 'combinedCD', 'combinedUF', 'cylinders',
       'displ', 'drive', 'engId', 'eng_dscr', 'feScore', 'fuelCost08',
       'fuelCostA08', 'fuelType', 'fuelType1', 'ghgScore', 'ghgScoreA',
       'highway08', 'highway08U', 'highwayA08', 'highwayA08U', 'highwayCD',
       'highwayE', 'highwayUF', 'hlv', 'hpv', 'id', 'lv2', 'lv4', 'make',
       'model', 'mpgData', 'phevBlended', 'pv2', 'pv4', 'range', 'rangeCity',
       'rangeCityA', 'rangeHwy', 'rangeHwyA', 'trany', 'UCity', 'UCityA',
       'UHighway', 'UHighwayA', 'VClass', 'year', 'youSaveSpend', 'baseModel',
       'guzzler', 'trans_dscr', 'tCharger', 'sCharger', 'atvType', 'fuelType2',
       'rangeA', 'evMotor', 'mfrCode', 'c240Dscr', 'charge240b', 'c240bDscr',
       'createdOn'

In [None]:
print(fuel_econ.head(1))
print('-' * 20)

# Keep rows where both petroleum-consumption columns are present and non-zero.
# The notna() checks are required because NaN != 0.0 evaluates to True.
primary_barrels_ok = fuel_econ['barrels08'].notna() & (fuel_econ['barrels08'] != 0.0)
alternate_barrels_ok = fuel_econ['barrelsA08'].notna() & (fuel_econ['barrelsA08'] != 0.0)
filtered_df = fuel_econ[primary_barrels_ok & alternate_barrels_ok]

# First matching row (raises IndexError if nothing matched)
selected_row = filtered_df.iloc[0]

# Despite the name, this is computed over ALL matching rows, not only selected_row:
# the fuel types of vehicles whose 'barrelsA08' value is above 0.
selected_row_fuelTypes = filtered_df['fuelType'].unique()
print(selected_row_fuelTypes)



   barrels08  barrelsA08  charge120  charge240  city08  city08U  cityA08  cityA08U  cityCD  cityE  cityUF  co2  co2A  co2TailpipeAGpm  co2TailpipeGpm  comb08  comb08U  combA08  combA08U  combE  combinedCD  combinedUF  cylinders  displ             drive  engId eng_dscr  feScore  fuelCost08  fuelCostA08 fuelType         fuelType1  ghgScore  ghgScoreA  highway08  highway08U  highwayA08  highwayA08U  highwayCD  highwayE  highwayUF  hlv  hpv  id  lv2  lv4        make               model mpgData  phevBlended  pv2  pv4  range  rangeCity  rangeCityA  rangeHwy  rangeHwyA         trany    UCity  UCityA  UHighway  UHighwayA       VClass  year  youSaveSpend baseModel guzzler trans_dscr tCharger sCharger atvType fuelType2 rangeA evMotor mfrCode c240Dscr  charge240b c240bDscr                     createdOn                    modifiedOn startStop  phevCity  phevHwy  phevComb
0  14.167143         0.0        0.0        0.0      19      0.0        0       0.0     0.0    0.0     0.0   -1    -1            

In [52]:
# Columns kept for the analysis (order matters to later cells that index with this list)
columns = [
    'charge120', 'charge240', 'city08', 'cityA08', 'cityCD', 'cityE',
    'cityUF', 'co2', 'co2A', 'co2TailpipeAGpm', 'co2TailpipeGpm', 'comb08',
    'combA08', 'combE', 'fuelType1', 'fuelType2', 'combinedCD', 'combinedUF',
    'cylinders', 'displ', 'drive', 'engId', 'fuelType', 'highway08',
    'highwayA08', 'highwayCD', 'highwayE', 'highwayUF', 'id', 'make',
    'model', 'UCity', 'UCityA', 'UHighway', 'UHighwayA', 'VClass',
    'year', 'pv2', 'pv4', 'rangeA', 'rangeCityA', 'rangeHwyA',
    'startStop', 'phevCity', 'phevHwy', 'phevComb', 'baseModel',
]

# Re-read the CSV, loading only the columns listed above
df = pd.read_csv('../data/vehicles.csv', usecols=columns)

# Vehicles whose fuelType mentions electricity (pure EVs and gas/electric combinations)
uses_electricity = df['fuelType'].str.contains('Electricity', case=False, na=False)
ev_df = df[uses_electricity]

# One row per (VClass, fuelType) pair; combE is carried along from the first row of each pair
combo_keys = ['VClass', 'fuelType']
unique_ev_combinations = ev_df[combo_keys + ['combE']].drop_duplicates(subset=combo_keys)
print(unique_ev_combinations.sort_values(by='VClass'))  # ordered by vehicle class

print(unique_ev_combinations.shape)

# Back-of-the-envelope running cost for electricity.
# combE is kWh per 100 miles; with roughly 41 kWh/100 mi at roughly $0.13 per kWh:
# kWh * price = dollars per 100 miles, divided by 100 = dollars per mile
# (about $0.0533, i.e. about 5.3 cents per mile).
kwh_per_100_miles = 41
dollars_per_kwh = .13
print(kwh_per_100_miles * dollars_per_kwh / 100)

# Same estimate for gasoline: a guessed $2.50 per gallon at roughly 22 miles per gallon
# gives about $0.1136 per mile (about 11.4 cents per mile).
dollars_per_gallon = 2.50
miles_per_gallon = 22
print(dollars_per_gallon / miles_per_gallon)

# Vehicle classes available for comparison among electric and gas/electric vehicles
print(unique_ev_combinations['VClass'].unique())



                                   VClass                     fuelType    combE
23022                        Compact Cars                  Electricity  71.0000
23033                        Compact Cars   Premium Gas or Electricity  36.0000
29785                        Compact Cars      Premium and Electricity  40.0000
29495                        Compact Cars   Regular Gas or Electricity  31.0000
32921                          Large Cars   Premium Gas or Electricity  59.0000
27214                          Large Cars      Premium and Electricity  52.0000
24767                          Large Cars                  Electricity  38.0000
31469                        Midsize Cars      Premium and Electricity  54.0000
23031                        Midsize Cars                  Electricity  34.0000
24686                        Midsize Cars  Regular Gas and Electricity  29.0000
7138               Midsize Station Wagons                  Electricity  40.0000
35983                    Minicompact Car

In [53]:
# Every distinct fuelType label present in the full dataset
all_fuel_type_labels = fuel_econ['fuelType'].unique()
print(all_fuel_type_labels)

# Next step: filter the data down to cars that are gasoline, diesel,
# hybrid gas/electric or electric. TODO: list the exact fuelType values to keep.

['Regular' 'Premium' 'Diesel' 'CNG' 'Gasoline or natural gas'
 'Gasoline or E85' 'Electricity' 'Gasoline or propane' 'Premium or E85'
 'Midgrade' 'Premium Gas or Electricity' 'Regular Gas and Electricity'
 'Premium and Electricity' 'Regular Gas or Electricity' 'Hydrogen']


In [54]:
# fuel_econ['fuelType'].count() # Starting with 48351 records for diff vehicles

# Human-readable description for each CSV column, keyed by the column name
# exactly as it appears in the CSV header (lookups are case-sensitive).
# Note: the key is "atvType" (capital T) to match the real column name; the
# all-lowercase spelling would never match a DataFrame column, the same
# documentation-vs-header casing issue that affected "baseModel".
column_descriptions = {
    "atvType": "type of alternative fuel or advanced technology vehicle",
    "barrels08": "annual petroleum consumption in barrels for fuelType1",
    "barrelsA08": "annual petroleum consumption in barrels for fuelType2",
    "charge120": "time to charge an electric vehicle in hours at 120 V",
    "charge240": "time to charge an electric vehicle in hours at 240 V",
    "city08": "city MPG for fuelType1",
    "city08U": "unrounded city MPG for fuelType1",
    "cityA08": "city MPG for fuelType2",
    "cityA08U": "unrounded city MPG for fuelType2",
    "cityCD": "city gasoline consumption (gallons/100 miles) in charge depleting mode",
    "cityE": "city electricity consumption in kw-hrs/100 miles",
    "cityMpk": "city miles per Kilogram for Hydrogen",
    "cityUmpk": "unrounded city miles per Kilogram for Hydrogen",
    "cityUF": "EPA city utility factor (share of electricity) for PHEV",
    "co2": "tailpipe CO2 in grams/mile for fuelType1",
    "co2A": "tailpipe CO2 in grams/mile for fuelType2",
    "co2TailpipeAGpm": "tailpipe CO2 in grams/mile for fuelType2",
    "co2TailpipeGpm": "tailpipe CO2 in grams/mile for fuelType1",
    "comb08": "combined MPG for fuelType1",
    "comb08U": "unrounded combined MPG for fuelType1",
    "combA08": "combined MPG for fuelType2",
    "combA08U": "unrounded combined MPG for fuelType2",
    "combE": "combined electricity consumption in kw-hrs/100 miles",
    "combMpk": "combined miles per Kilogram for Hydrogen",
    "combUmpk": "unrounded combined miles per Kilogram for Hydrogen",
    "combinedCD": "combined gasoline consumption (gallons/100 miles) in charge depleting mode",
    "combinedUF": "EPA combined utility factor (share of electricity) for PHEV",
    "cylinders": "engine cylinders",
    "displ": "engine displacement in liters",
    "drive": "drive axle type",
    "engId": "EPA model type index",
    "eng_dscr": "engine descriptor",
    "evMotor": "electric motor (kw-hrs)",
    "feScore": "EPA Fuel Economy Score",
    "fuelCost08": "annual fuel cost for fuelType1",
    "fuelCostA08": "annual fuel cost for fuelType2",
    "fuelType": "fuel type with fuelType1 and fuelType2",
    "fuelType1": "fuel type 1",
    "fuelType2": "fuel type 2",
    "ghgScore": "EPA GHG score",
    "ghgScoreA": "EPA GHG score for dual fuel vehicle running on the alternative fuel",
    "guzzler": "if G or T, this vehicle is subject to the gas guzzler tax",
    "highway08": "highway MPG for fuelType1",
    "highway08U": "unrounded highway MPG for fuelType1",
    "highwayA08": "highway MPG for fuelType2",
    "highwayA08U": "unrounded highway MPG for fuelType2",
    "highwayCD": "highway gasoline consumption (gallons/100miles) in charge depleting mode",
    "highwayE": "highway electricity consumption in kw-hrs/100 miles",
    "highwayMpk": "highway miles per Kilogram for Hydrogen",
    "highwayUmpk": "unrounded highway miles per Kilogram for Hydrogen",
    "highwayUF": "EPA highway utility factor (share of electricity) for PHEV",
    "hlv": "hatchback luggage volume (cubic feet)",
    "hpv": "hatchback passenger volume (cubic feet)",
    "id": "vehicle record id",
    "lv2": "2 door luggage volume (cubic feet)",
    "lv4": "4 door luggage volume (cubic feet)",
    "make": "manufacturer (division)",
    "mfrCode": "3-character manufacturer code",
    "model": "model name",
    "mpgData": "has My MPG data",
    "phevBlended": "if true, this vehicle operates on a blend of gasoline and electricity in charge depleting mode",
    "pv2": "2-door passenger volume (cubic feet)",
    "pv4": "4-door passenger volume (cubic feet)",
    "rangeA": "EPA range for fuelType2",
    "rangeCityA": "EPA city range for fuelType2",
    "rangeHwyA": "EPA highway range for fuelType2",
    "trans_dscr": "transmission descriptor",
    "trany": "transmission",
    "UCity": "unadjusted city MPG for fuelType1",
    "UCityA": "unadjusted city MPG for fuelType2",
    "UHighway": "unadjusted highway MPG for fuelType1",
    "UHighwayA": "unadjusted highway MPG for fuelType2",
    "VClass": "EPA vehicle size class",
    "year": "model year",
    "youSaveSpend": "you save/spend over 5 years compared to an average car",
    "sCharger": "if S, this vehicle is supercharged",
    "tCharger": "if T, this vehicle is turbocharged",
    "c240Dscr": "electric vehicle charger description",
    "charge240b": "time to charge an electric vehicle in hours at 240 V using the alternate charger",
    "c240bDscr": "electric vehicle alternate charger description",
    "createdOn": "date the vehicle record was created",
    "modifiedOn": "date the vehicle record was last modified",
    "startStop": "vehicle has stop-start technology",
    "phevCity": "EPA composite gasoline-electricity city MPGe for plug-in hybrid vehicles",
    "phevHwy": "EPA composite gasoline-electricity highway MPGe for plug-in hybrid vehicles",
    "phevComb": "EPA composite gasoline-electricity combined city-highway MPGe for plug-in hybrid vehicles",
    "baseModel": "base model name",
}

# For every column of df, show its distinct values on the console and append the
# same information (plus the column's description) to a plain-text report.
# The report is opened with an explicit UTF-8 encoding so that the result does not
# depend on the platform's default text encoding.
# NOTE: the '../Documents/' directory must already exist; open() will not create it.
with open('../Documents/unique_values.txt', 'w', encoding='utf-8') as file:
    for col in df:  # iterating a DataFrame yields its column labels
        unique_values = df[col].unique()
        # Columns missing from column_descriptions get a placeholder text
        description = column_descriptions.get(col, "No description available")

        # Console copy of the unique values
        print(f"Column: {col}")
        print(unique_values)
        print("\n" + "-"*40 + "\n")

        # File copy: header line, short rule, the values (each followed by ", "), short rule
        file.write(f"\nColumn: {col}: {description}\n")
        file.write("-" * 10 + "\n")
        file.write("".join(f"{value}, " for value in unique_values))
        file.write("\n" + "-" * 10 + "\n")



Column: charge120
[0.  3.2]

----------------------------------------

Column: charge240
[ 0.    7.    4.    8.    5.    6.    1.5  12.    2.5  10.    0.67  3.6
  3.    3.5   2.    2.75  3.7   4.5   2.7   1.9   9.3   5.5   5.3   3.75
  2.3   2.2   2.25  9.    2.6  13.    9.5  11.    8.5   9.2   1.3   5.8
  6.25 10.5   2.4   3.3  15.3  11.2  11.6  15.   11.8  14.7   7.5  10.7
  8.8  10.9  11.4   3.4   7.8   1.7  12.5  10.3  10.1   6.3   8.4   8.7
 11.5  10.4  14.    8.1   9.4  11.9   7.2   6.5   8.9  12.75  8.25 10.2
 19.    6.1  16.   12.7   7.9   7.7  10.75  7.4   2.8   3.2   6.7   9.9
  7.6  14.5  15.2   6.2  14.6   7.1   5.1  17.    9.8  17.6   6.9   8.32
  2.1  15.5  13.3  18.6   8.2   9.1  11.7 ]

----------------------------------------

Column: city08
[ 19   9  23  10  17  21  22  18  12  20  14  11  15  13  16  25  24  26
  31  27  30  38  28  43  35  33  29  39  37   8   7  34  32  36  49  81
  45  48  42   6  44  74  84  40  87  41  51  62  59  79  50  52 102 106
  94 126  53

In [55]:
# Row count before filtering
total_rows = len(df)
print(f"{total_rows}\n----------")

# fuelType labels to keep.
# NOTE(review): 'Diesel' and 'Midgrade' are not in this list, so those rows are
# dropped along with CNG/E85/propane/hydrogen. An earlier note in this notebook
# says diesel should be kept - confirm which is intended.
fuel_types = [
    'Regular',
    'Premium',
    'Electricity',
    'Premium Gas or Electricity',
    'Regular Gas and Electricity',
    'Premium and Electricity',
    'Regular Gas or Electricity',
]

# Keep only the wanted fuel types, with columns arranged in the order of `columns`
keep_mask = df['fuelType'].isin(fuel_types)
filtered_df = df[keep_mask][columns]

# Inspect the distinct values of the three fuel columns, then the filtered row count
for fuel_col in ('fuelType', 'fuelType1'):
    print(f"{filtered_df[fuel_col].unique()}\n--")
print(f"{filtered_df['fuelType2'].unique()}\n----------\n{len(filtered_df)}")

summary_cols = ['fuelType', 'fuelType1', 'fuelType2', 'VClass']

# One example row per fuelType among vehicles whose secondary fuel is electricity
ft = (
    filtered_df[filtered_df['fuelType2'] == 'Electricity']
    .drop_duplicates(subset=['fuelType'])
)
print(ft[summary_cols])

# One example row per fuelType among vehicles that do not use electricity at all
mentions_electricity = filtered_df['fuelType'].str.contains('Electricity', na=False)
non_electric_df = filtered_df[~mentions_electricity]
print('\n', non_electric_df[summary_cols].drop_duplicates(subset=['fuelType']))




48351
----------
['Regular' 'Premium' 'Electricity' 'Premium Gas or Electricity'
 'Regular Gas and Electricity' 'Premium and Electricity'
 'Regular Gas or Electricity']
--
['Regular Gasoline' 'Premium Gasoline' 'Electricity']
--
[nan 'Electricity']
----------
45247
                          fuelType         fuelType1    fuelType2        VClass
23033   Premium Gas or Electricity  Premium Gasoline  Electricity  Compact Cars
24686  Regular Gas and Electricity  Regular Gasoline  Electricity  Midsize Cars
27214      Premium and Electricity  Premium Gasoline  Electricity    Large Cars
29495   Regular Gas or Electricity  Regular Gasoline  Electricity  Compact Cars

   fuelType         fuelType1 fuelType2        VClass
0  Regular  Regular Gasoline       NaN   Two Seaters
4  Premium  Premium Gasoline       NaN  Compact Cars
