# Imports

In [8]:
import sqlite3
import pandas as pd
import re # for regex functionality
import matplotlib.pyplot as plt # Import for visuals
from matplotlib.ticker import FuncFormatter
import seaborn as sns

# Display configuration: lift the per-cell character limit and the total
# display width limit so wide text columns are shown without wrapping.
pd.options.display.max_colwidth = None
pd.options.display.width = None

### Read CSV file to data frame and load into sqlite database.

In [27]:
# Read the raw CSV. Columns 74, 75 and 77 held mixed value types, so they are
# forced to str to get a consistent dtype.
vehicle_data = pd.read_csv('../data/vehicles.csv', dtype={74: str, 75: str, 77: str})

# Create/connect to the SQLite database and (re)write the full table.
# if_exists='replace' makes this cell safe to re-run; try/finally guarantees the
# connection is released even if the write fails.
connection = sqlite3.connect('../data/vehicles_data.db')
try:
    vehicle_data.to_sql('vehicles_data_sql', connection, if_exists='replace', index=False)
finally:
    connection.close()

### Test reading the data back from the SQLite database

In [None]:
# Round-trip check: read the whole table back from SQLite into a DataFrame.
# try/finally guarantees the connection is closed even if the query fails.
connection = sqlite3.connect('../data/vehicles_data.db')
try:
    df = pd.read_sql_query('SELECT * FROM vehicles_data_sql', connection)
finally:
    connection.close()

# Uncomment to preview the first rows:
# print(df.head(2))

### List which columns to keep

In [3]:
# Columns selected from the raw table (37 in total) that may hold usable data.
# Order matters: it is the column order of the SELECT built from this list.
keep_columns = [
    # consumption / emissions / efficiency
    'barrels08', 'charge240', 'cityE', 'co2', 'co2TailpipeGpm', 'comb08', 'combE',
    # engine and drivetrain
    'cylinders', 'displ', 'drive', 'engId', 'eng_dscr',
    # scores, cost and fuel
    'feScore', 'fuelCost08', 'fuelType', 'fuelType1', 'ghgScore', 'id',
    # vehicle identity and range
    'make', 'model', 'mpgData', 'range', 'rangeCity', 'rangeHwy',
    'trany', 'VClass', 'year', 'youSaveSpend', 'baseModel', 'trans_dscr',
    # forced induction, alternative tech and manufacturer
    'tCharger', 'sCharger', 'atvType', 'evMotor', 'mfrCode',
    # charging details
    'c240Dscr', 'charge240b',
]

### List which fuel types to remove

In [4]:
# fuelType values whose rows are excluded from the analysis.
# One entry per line so a missing comma (which silently concatenates two
# adjacent string literals into one bogus value) is easy to spot.
remove_fuel_types = [
    'Diesel',
    'CNG',
    'Gasoline or natural gas',
    'Gasoline or E85',
    'Gasoline or propane',
    'Premium or E85',
    'Premium Gas or Electricity',
    'Regular Gas and Electricity',
    'Premium and Electricity',
    'Regular Gas or Electricity',
    'Hydrogen',
]


### Clean initial data - by loading only what I need from the sqlite database.

In [36]:
# Build the SELECT: column names must be interpolated (identifiers cannot be
# bound as parameters), but the fuel-type values are passed as bound
# parameters so quoting/escaping is handled by the sqlite3 driver.
keep_col_str = ', '.join(keep_columns)
fuel_placeholders = ', '.join('?' for _ in remove_fuel_types)  # one qmark placeholder per value

sqlite_query = f"""
SELECT {keep_col_str}
FROM vehicles_data_sql
WHERE fuelType NOT IN ({fuel_placeholders})
"""

# try/finally guarantees the connection is closed even if the query fails.
connection = sqlite3.connect('../data/vehicles_data.db')
try:
    df = pd.read_sql_query(sqlite_query, connection, params=list(remove_fuel_types))
finally:
    connection.close()

df

Unnamed: 0,barrels08,charge240,cityE,co2,co2TailpipeGpm,comb08,combE,cylinders,displ,drive,...,youSaveSpend,baseModel,trans_dscr,tCharger,sCharger,atvType,evMotor,mfrCode,c240Dscr,charge240b
0,14.167143,0.0,0.0,-1,423.190476,21,0.0,4.0,2.0,Rear-Wheel Drive,...,-2750,Spider,,,,,,,,0.0
1,27.046364,0.0,0.0,-1,807.909091,11,0.0,12.0,4.9,Rear-Wheel Drive,...,-13000,Testarossa,,,,,,,,0.0
2,11.018889,0.0,0.0,-1,329.148148,27,0.0,4.0,2.2,Front-Wheel Drive,...,-500,Charger,SIL,,,,,,,0.0
3,27.046364,0.0,0.0,-1,807.909091,11,0.0,8.0,5.2,Rear-Wheel Drive,...,-13000,B150/B250 Wagon,,,,,,,,0.0
4,15.658421,0.0,0.0,-1,467.736842,19,0.0,4.0,2.2,4-Wheel or All-Wheel Drive,...,-7500,Legacy/Outback,,T,,,,,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45037,13.523182,0.0,0.0,-1,403.954545,22,0.0,4.0,2.2,Front-Wheel Drive,...,-2250,Legacy/Outback,CLKUP,,,,,,,0.0
45038,12.935217,0.0,0.0,-1,386.391304,23,0.0,4.0,2.2,Front-Wheel Drive,...,-2000,Legacy/Outback,,,,,,,,0.0
45039,14.167143,0.0,0.0,-1,423.190476,21,0.0,4.0,2.2,4-Wheel or All-Wheel Drive,...,-2750,Legacy/Outback,CLKUP,,,,,,,0.0
45040,14.167143,0.0,0.0,-1,423.190476,21,0.0,4.0,2.2,4-Wheel or All-Wheel Drive,...,-2750,Legacy/Outback,,,,,,,,0.0


In [None]:
# Deprecated: only the needed columns and rows are now loaded directly from the SQLite database, so this pandas-based cleaning is no longer run.

#vehicle_data_cleaned = vehicle_data[~vehicle_data['fuelType'].isin(remove_fuel_types)] #remove unwanted fuel types

#vehicle_data_cleaned = vehicle_data_cleaned[keep_columns] # remove unwanted columns

#vehicle_data_cleaned.shape # check data shape

(45042, 37)

### Write data to txt file - all unique values for viewing and adjustment.

In [None]:
# Write to txt file so I can view the data
#---------------
# Separator strings for the text report: a 20-dash and a 2-dash rule.
# Only spacer[1] is used by the report writer in this cell.
spacer = ['-'*20, '-'*2]
#titleList = ['Data Discovery', 'Annual fuel consumption', 'Charging time', 'Mileage/Type', 'Carbon Emissions per mile', 'MPG/MPKwh']

# Maps a dataset column name to a human-readable description. The report
# writer looks columns up with .get(), so columns missing from this dict fall
# back to a default message instead of raising.
# NOTE(review): descriptions presumably come from the dataset's published
# data dictionary — confirm the source and record it here.
descript_dict = {
    'barrels': 'annual petroleum consumption for fuelType1 and fuelType2 (1 barrel = 42 gallons)',
    'barrels08': 'annual petroleum consumption for fuelType1',
    'barrelsA08': 'annual petroleum consumption for fuelType2',
    'charge120': 'time to charge at 120V (hours)',
    'charge240': 'time to charge at 240V (hours)',
    'city08': 'city MPG for fuelType1',
    'city08U': 'unrounded city MPG for fuelType1',
    'cityA08': 'city MPG for fuelType2',
    'cityA08U': 'unrounded city MPG for fuelType2',
    'cityCD': 'city gasoline consumption in charge depleting mode (gallons/100 miles)',
    'cityE': 'city electricity consumption (kw-hrs/100 miles)',
    'cityUF': 'EPA city utility factor for PHEV',
    'co2': 'tailpipe CO2 for fuelType1 (grams/mile)',
    'co2A': 'tailpipe CO2 for fuelType2 (grams/mile)',
    'co2TailpipeGpm': 'tailpipe CO2 for fuelType1 (grams/mile)',
    'co2TailpipeAGpm': 'tailpipe CO2 for fuelType2 (grams/mile)',
    'comb08': 'combined MPG for fuelType1',
    'comb08U': 'unrounded combined MPG for fuelType1',
    'combA08': 'combined MPG for fuelType2',
    'combA08U': 'unrounded combined MPG for fuelType2',
    'combE': 'combined electricity consumption (kw-hrs/100 miles)',
    'combinedCD': 'combined gasoline consumption in charge depleting mode (gallons/100 miles)',
    'combinedUF': 'EPA combined utility factor for PHEV',
    'cylinders': 'engine cylinders',
    'displ': 'engine displacement (liters)',
    'drive': 'drive axle type',
    'engId': 'EPA model type index',
    'eng_dscr': 'engine descriptor',
    'feScore': 'EPA Fuel Economy Score (-1 = Not available)',
    'fuelCost08': 'annual fuel cost for fuelType1 ($)',
    'fuelCostA08': 'annual fuel cost for fuelType2 ($)',
    'fuelType': 'fuel type (fuelType1 and fuelType2, if applicable)',
    'fuelType1': 'primary fuel type for single or conventional fuel vehicles',
    'fuelType2': 'alternative fuel type for dual fuel vehicles',
    'ghgScore': 'EPA GHG score for fuelType1 (-1 = Not available)',
    'ghgScoreA': 'EPA GHG score for fuelType2 (-1 = Not available)',
    'guzzler': 'subject to gas guzzler tax (G or T)',
    'trans_dscr': 'transmission descriptor',
    'highway08': 'highway MPG for fuelType1',
    'highway08U': 'unrounded highway MPG for fuelType1',
    'highwayA08': 'highway MPG for fuelType2',
    'highwayA08U': 'unrounded highway MPG for fuelType2',
    'highwayCD': 'highway gasoline consumption in charge depleting mode (gallons/100 miles)',
    'highwayE': 'highway electricity consumption (kw-hrs/100 miles)',
    'highwayUF': 'EPA highway utility factor for PHEV',
    'hlv': 'hatchback luggage volume (cubic feet)',
    'hpv': 'hatchback passenger volume (cubic feet)',
    'id': 'vehicle record ID',
    'lv2': '2-door luggage volume (cubic feet)',
    'lv4': '4-door luggage volume (cubic feet)',
    'make': 'manufacturer (division)',
    'mfrCode': '3-character manufacturer code',
    'model': 'model name (carline)',
    'mpgData': 'has My MPG data',
    'phevBlended': 'PHEV operates on a blend of gasoline and electricity in charge depleting mode',
    'pv2': '2-door passenger volume (cubic feet)',
    'pv4': '4-door passenger volume (cubic feet)',
    'range': 'estimated range for fuelType1',
    'rangeCity': 'estimated city range for fuelType1',
    'rangeCityA': 'EPA city range for fuelType2',
    'rangeHwy': 'estimated highway range for fuelType1',
    'rangeHwyA': 'EPA highway range for fuelType2',
    'trany': 'transmission',
    'UCity': 'unadjusted city MPG for fuelType1',
    'UCityA': 'unadjusted city MPG for fuelType2',
    'UHighway': 'unadjusted highway MPG for fuelType1',
    'UHighwayA': 'unadjusted highway MPG for fuelType2',
    'VClass': 'EPA vehicle size class',
    'year': 'model year',
    'youSaveSpend': 'savings/spending over 5 years compared to an average car',
    'sCharger': 'supercharged (S)',
    'tCharger': 'turbocharged (T)',
    'c240Dscr': 'electric vehicle charger description',
    'charge240b': 'time to charge an electric vehicle at 240V (hours)',
    'c240bDscr': 'electric vehicle alternate charger description',
    'createdOn': 'vehicle record creation date',
    'modifiedOn': 'last modified date of the vehicle record',
    'startStop': 'vehicle has stop-start technology',
    'phevCity': 'EPA composite gasoline-electricity city MPGe for plug-in hybrid vehicles',
    'phevHwy': 'EPA composite gasoline-electricity highway MPGe for plug-in hybrid vehicles',
    'phevComb': 'EPA composite gasoline-electricity combined city-highway MPGe for plug-in hybrid vehicles',
    'baseModel': 'base model name',
    'atvType': 'type of alternative fuel or advanced technology vehicle',
    'rangeA': 'EPA range for fuelType2',
    'evMotor': 'electric motor (kw-hrs)'
}


# Write, for every column of the filtered frame, its name, description and
# all distinct values to a text file for manual inspection.
# `df` is the frame loaded from the SQLite query cell; the previous name
# `vehicle_data_cleaned` only exists in the commented-out deprecated cell and
# raises NameError on a fresh kernel.
# encoding is set explicitly so non-ASCII values do not depend on the
# platform's default encoding.
with open('../Documents/Column_Descript_Unique_Values.txt', 'w', encoding='utf-8') as file:
    for col in df.columns:
        unique_values = df[col].unique()
        # Columns without an entry in descript_dict get a placeholder text.
        description = descript_dict.get(col, "No description available")
        file.write(f'{col}:  {description}\n')
        file.write(f"{spacer[1]}\n")
        # Same "value, value, " layout as before, emitted in a single write.
        file.write(''.join(f"{value}, " for value in unique_values))
        file.write("\n" + "-"*10 + "\n")

#------------