In [1]:
from pathlib import Path
import datetime as dt
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

In [2]:
# Read the data for EV registrations in California
df = pd.read_csv(
    Path("../../../../data/raw_data/ca_ev_registrations_public.csv"),
    # Key the dtype override by column name rather than by position, so it
    # cannot silently land on a different column if the file layout changes.
    # The GEOID must stay a string to keep its leading zero (e.g. "06099").
    dtype={"County GEOID": str},
    parse_dates=["Registration Valid Date"],
    date_format="%Y-%m-%d",
)

# NOTE(review): the preview below shows a trailing "\r" on every "Vehicle ID"
# value. The column is not used further down, so it is left untouched here;
# strip it before relying on the ID - confirm against the raw file.
print("Shape:", df.shape)
display(df.head())

Shape: (2542443, 9)


Unnamed: 0,Vehicle ID,County GEOID,Registration Valid Date,DMV ID,DMV Snapshot,Registration Expiration Date,State Abbreviation,Geography,Vehicle Name
0,CA-002-03597\r,6099,2011-01-01,2,CA Registration Data from CA (12/31/2011),,CA,County,Chevrolet Volt
1,CA-002-03598\r,6105,2011-01-01,2,CA Registration Data from CA (12/31/2011),,CA,County,Nissan Leaf
2,CA-002-03599\r,6103,2011-01-01,2,CA Registration Data from CA (12/31/2011),,CA,County,Chevrolet Volt
3,CA-002-03600\r,6099,2011-01-01,2,CA Registration Data from CA (12/31/2011),,CA,County,Tesla Roadster
4,CA-002-03601\r,6099,2011-01-01,2,CA Registration Data from CA (12/31/2011),,CA,County,Tesla Roadster


In [3]:
# Familiarize with the columns: print each column name next to its position
for position, column_name in enumerate(df.columns):
    print(f"{position} {column_name}")

0 Vehicle ID
1 County GEOID
2 Registration Valid Date
3 DMV ID
4 DMV Snapshot
5 Registration Expiration Date
6 State Abbreviation
7 Geography
8 Vehicle Name


In [4]:
# Familiarize with the data types
# Prints a summary of df: index range, column names, dtypes and memory usage.
# Useful here to confirm that "County GEOID" was read as a string (object)
# and "Registration Valid Date" was parsed as a datetime.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2542443 entries, 0 to 2542442
Data columns (total 9 columns):
 #   Column                        Dtype         
---  ------                        -----         
 0   Vehicle ID                    object        
 1   County GEOID                  object        
 2   Registration Valid Date       datetime64[ns]
 3   DMV ID                        int64         
 4   DMV Snapshot                  object        
 5   Registration Expiration Date  float64       
 6   State Abbreviation            object        
 7   Geography                     object        
 8   Vehicle Name                  object        
dtypes: datetime64[ns](1), float64(1), int64(1), object(6)
memory usage: 174.6+ MB


In [5]:
# Before breaking the GEOID string into numeric parts, list its distinct
# values in sorted order to spot missing or non-numeric entries
df["County GEOID"].drop_duplicates().sort_values().to_numpy()

array(['06001', '06003', '06005', '06007', '06009', '06011', '06013',
       '06015', '06017', '06019', '06021', '06023', '06025', '06027',
       '06029', '06031', '06033', '06035', '06037', '06039', '06041',
       '06043', '06045', '06047', '06049', '06051', '06053', '06055',
       '06057', '06059', '06061', '06063', '06065', '06067', '06069',
       '06071', '06073', '06075', '06077', '06079', '06081', '06083',
       '06085', '06087', '06089', '06091', '06093', '06095', '06097',
       '06099', '06101', '06103', '06105', '06107', '06109', '06111',
       '06113', '06115', 'Unknown'], dtype=object)

In [6]:
# Obtain the County Name for each California GEOID from the Census Bureau's
# 2020 county code reference file (pipe-delimited text)
# The County Name will be used later to obtain the ZIP codes for each county
source = "https://www2.census.gov/geo/docs/reference/codes2020/cou/st06_ca_cou2020.txt"

geoid_map_df = pd.read_csv(source, sep="|")

print(f"Shape: {geoid_map_df.shape}")
geoid_map_df.head()

Shape: (58, 7)


Unnamed: 0,STATE,STATEFP,COUNTYFP,COUNTYNS,COUNTYNAME,CLASSFP,FUNCSTAT
0,CA,6,1,1675839,Alameda County,H1,A
1,CA,6,3,1675840,Alpine County,H1,A
2,CA,6,5,1675841,Amador County,H1,A
3,CA,6,7,1675842,Butte County,H1,A
4,CA,6,9,1675885,Calaveras County,H1,A


In [7]:
# Familiarize with the GEOID map df data types
# STATEFP and COUNTYFP are read as integers here, whereas the registration
# data holds the combined "County GEOID" as a string. The GEOID therefore has
# to be split and converted to integers before the two tables can be matched.
geoid_map_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58 entries, 0 to 57
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   STATE       58 non-null     object
 1   STATEFP     58 non-null     int64 
 2   COUNTYFP    58 non-null     int64 
 3   COUNTYNS    58 non-null     int64 
 4   COUNTYNAME  58 non-null     object
 5   CLASSFP     58 non-null     object
 6   FUNCSTAT    58 non-null     object
dtypes: int64(3), object(4)
memory usage: 3.3+ KB


In [8]:
# Build a lookup from (STATEFP, COUNTYFP) to COUNTYNAME
# Both FIPS parts are integers in geoid_map_df, so the keys are integer tuples
# Example Output: {(6, 1): 'Alameda County', (6, 3): 'Alpine County', ...}
fips_indexed = geoid_map_df.set_index(['STATEFP', 'COUNTYFP'])
geoid_to_countyname_map = fips_indexed['COUNTYNAME'].to_dict()

# Split the GEOID string into its state part (first two characters) and its
# county part (the rest), converted to nullable integers
# Values that are not numeric (such as 'Unknown') become <NA>
geoid_text = df['County GEOID']
df['STATEFP'] = pd.to_numeric(geoid_text.str[:2], errors='coerce').astype('Int64')
df['COUNTYFP'] = pd.to_numeric(geoid_text.str[2:], errors='coerce').astype('Int64')

# Pair up the two parts row by row
# Example Tuple: (6, 99)
state_county_fp_tuples = list(zip(df['STATEFP'], df['COUNTYFP']))

# Look up every pair in the dictionary to get the County Name
# Pairs without a match fall back to NaN
lookup_county = geoid_to_countyname_map.get
df['County'] = [lookup_county(fips_pair, np.nan) for fips_pair in state_county_fp_tuples]

print(f"Shape: {df.shape}")
display(df.head())

Shape: (2542443, 12)


Unnamed: 0,Vehicle ID,County GEOID,Registration Valid Date,DMV ID,DMV Snapshot,Registration Expiration Date,State Abbreviation,Geography,Vehicle Name,STATEFP,COUNTYFP,County
0,CA-002-03597\r,6099,2011-01-01,2,CA Registration Data from CA (12/31/2011),,CA,County,Chevrolet Volt,6,99,Stanislaus County
1,CA-002-03598\r,6105,2011-01-01,2,CA Registration Data from CA (12/31/2011),,CA,County,Nissan Leaf,6,105,Trinity County
2,CA-002-03599\r,6103,2011-01-01,2,CA Registration Data from CA (12/31/2011),,CA,County,Chevrolet Volt,6,103,Tehama County
3,CA-002-03600\r,6099,2011-01-01,2,CA Registration Data from CA (12/31/2011),,CA,County,Tesla Roadster,6,99,Stanislaus County
4,CA-002-03601\r,6099,2011-01-01,2,CA Registration Data from CA (12/31/2011),,CA,County,Tesla Roadster,6,99,Stanislaus County


In [9]:
# Check if there are any missing values (nan) among the distinct County Names
df["County"].drop_duplicates().sort_values().to_numpy()

array(['Alameda County', 'Alpine County', 'Amador County', 'Butte County',
       'Calaveras County', 'Colusa County', 'Contra Costa County',
       'Del Norte County', 'El Dorado County', 'Fresno County',
       'Glenn County', 'Humboldt County', 'Imperial County',
       'Inyo County', 'Kern County', 'Kings County', 'Lake County',
       'Lassen County', 'Los Angeles County', 'Madera County',
       'Marin County', 'Mariposa County', 'Mendocino County',
       'Merced County', 'Modoc County', 'Mono County', 'Monterey County',
       'Napa County', 'Nevada County', 'Orange County', 'Placer County',
       'Plumas County', 'Riverside County', 'Sacramento County',
       'San Benito County', 'San Bernardino County', 'San Diego County',
       'San Francisco County', 'San Joaquin County',
       'San Luis Obispo County', 'San Mateo County',
       'Santa Barbara County', 'Santa Clara County', 'Santa Cruz County',
       'Shasta County', 'Sierra County', 'Siskiyou County',
       'Solano 

In [10]:
# Obtain the Zip Codes for each County in California

# Webpage URL to scrape the ZIP codes for each county in California
source = "https://www.unitedstateszipcodes.org/ca/"

# Important Note: Set up headers to mimic a browser request
# Does not work otherwise - Page blocks the request
# Learned of this from:
# https://zenscrape.com/5-approaches-to-write-unblock-able-data-scraping-in-python/
# NOTE(review): "br" is advertised in Accept-Encoding; presumably the response
# can only be decoded if a brotli package is installed - confirm, or drop "br".
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept-Encoding": "gzip, deflate, br",
    "Connection": "keep-alive",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Referer": "https://www.google.com/",
    "Upgrade-Insecure-Requests": "1"
}

# Make a request to the webpage
# The timeout keeps the cell from hanging forever if the server never answers
response = requests.get(source, headers=headers, timeout=30)

# Stop here if the request was not successful. Only printing a message would
# leave ca_zip_df undefined (or stale from an earlier run), and the following
# cell would then fail or silently use old data.
if response.status_code != 200:
    raise RuntimeError(f"Failed to retrieve the webpage. Status code: {response.status_code}")

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.content, "html.parser")

# Find all the relevant div elements with the class 'list-group-item'
items = soup.find_all("div", class_="list-group-item")

# List to store the extracted data
data = []

# Iterate through each item and extract the required information
for item in items:
    row = item.find_all("div", class_="row")[0]
    zip_code = row.find("div", class_="prefix-col1").text.strip()
    type_ = row.find("div", class_="prefix-col2").text.strip()
    city = row.find("div", class_="prefix-col3").text.strip()
    county = row.find("div", class_="prefix-col4").text.strip()
    area_codes = [a.text.strip() for a in row.find("div", class_="prefix-col5").find_all("a")]

    # Append the extracted data to the list
    data.append({
        "Zip Code": zip_code,
        "Type": type_,
        "Cities": city,
        "County": county,
        "Area Codes": ", ".join(area_codes)
    })

# An empty result means the page layout changed or the request was served a
# different page; fail loudly instead of building a DataFrame without columns
if not data:
    raise RuntimeError("No ZIP code rows were found on the webpage.")

# Convert the list of dictionaries to DataFrame
ca_zip_df = pd.DataFrame(data)

print("Shape:", ca_zip_df.shape)
display(ca_zip_df.head())

Shape: (2655, 5)


Unnamed: 0,Zip Code,Type,Cities,County,Area Codes
0,90001,Standard,"Los Angeles, Firestone Park, Firestone Pk",Los Angeles County,"Area Code 213, Area Code 323"
1,90002,Standard,"Los Angeles, Watts",Los Angeles County,"Area Code 213, Area Code 310, Area Code 323, A..."
2,90003,Standard,Los Angeles,Los Angeles County,"Area Code 213, Area Code 323"
3,90004,Standard,"Los Angeles, Oakwood",Los Angeles County,"Area Code 213, Area Code 323"
4,90005,Standard,"Los Angeles, Sanford",Los Angeles County,"Area Code 213, Area Code 323"


In [11]:
# Create a dictionary for County to Zip Code mapping
# CAUTION: a county has many ZIP codes (ca_zip_df has one row per ZIP code),
# but the registrations are only known at county level. One ZIP code therefore
# has to stand in for the whole county. The last ZIP code listed for each
# county is kept, which is what building the dictionary straight from the
# non-unique index did implicitly. The resulting "Zip Code" is a county
# placeholder, not the vehicle's real ZIP code.
county_to_zip = (
    ca_zip_df
    .drop_duplicates(subset="County", keep="last")
    .set_index("County")["Zip Code"]
    .to_dict()
)

# Map each County to its placeholder Zip Code in a new column
# Series.map is vectorised; counties without a match (including missing
# County values) become NaN
df["Zip Code"] = df["County"].map(county_to_zip)

print("Shape:", df.shape)
display(df.head(2))

Shape: (2542443, 13)


Unnamed: 0,Vehicle ID,County GEOID,Registration Valid Date,DMV ID,DMV Snapshot,Registration Expiration Date,State Abbreviation,Geography,Vehicle Name,STATEFP,COUNTYFP,County,Zip Code
0,CA-002-03597\r,6099,2011-01-01,2,CA Registration Data from CA (12/31/2011),,CA,County,Chevrolet Volt,6,99,Stanislaus County,95397
1,CA-002-03598\r,6105,2011-01-01,2,CA Registration Data from CA (12/31/2011),,CA,County,Nissan Leaf,6,105,Trinity County,96093


In [12]:
# "Vehicle Name" holds the make and the model together, e.g. "Chevrolet Volt"
# Split it at the first space only: the first word becomes "make" and
# everything after it becomes "model"
# NOTE(review): a make written as two words would be cut after its first
# word - verify against the distinct vehicle names
make_model_parts = df["Vehicle Name"].str.split(" ", n=1, expand=True)
df["make"] = make_model_parts[0]
df["model"] = make_model_parts[1]

print(f"Shape: {df.shape}")
display(df.head(2))

Shape: (2542443, 15)


Unnamed: 0,Vehicle ID,County GEOID,Registration Valid Date,DMV ID,DMV Snapshot,Registration Expiration Date,State Abbreviation,Geography,Vehicle Name,STATEFP,COUNTYFP,County,Zip Code,make,model
0,CA-002-03597\r,6099,2011-01-01,2,CA Registration Data from CA (12/31/2011),,CA,County,Chevrolet Volt,6,99,Stanislaus County,95397,Chevrolet,Volt
1,CA-002-03598\r,6105,2011-01-01,2,CA Registration Data from CA (12/31/2011),,CA,County,Nissan Leaf,6,105,Trinity County,96093,Nissan,Leaf


In [13]:
# Keep only the columns needed for the analysis in a new DataFrame
select_columns = [
    "Registration Valid Date",
    "State Abbreviation",
    "Zip Code",
    "make",
    "model",
]

df_clean = df[select_columns]

print(f"Shape: {df_clean.shape}")
display(df_clean.head())

Shape: (2542443, 5)


Unnamed: 0,Registration Valid Date,State Abbreviation,Zip Code,make,model
0,2011-01-01,CA,95397,Chevrolet,Volt
1,2011-01-01,CA,96093,Nissan,Leaf
2,2011-01-01,CA,96092,Chevrolet,Volt
3,2011-01-01,CA,95397,Tesla,Roadster
4,2011-01-01,CA,95397,Tesla,Roadster


In [14]:
# Rename columns to manageable names
# "make" and "model" already carry their final names, so only the other
# three columns need an entry
column_renames = {
    "Registration Valid Date": "registration_date",
    "State Abbreviation": "state",
    "Zip Code": "zip_code",
}
df_clean = df_clean.rename(columns=column_renames)

print(f"Shape: {df_clean.shape}")
display(df_clean.head())

Shape: (2542443, 5)


Unnamed: 0,registration_date,state,zip_code,make,model
0,2011-01-01,CA,95397,Chevrolet,Volt
1,2011-01-01,CA,96093,Nissan,Leaf
2,2011-01-01,CA,96092,Chevrolet,Volt
3,2011-01-01,CA,95397,Tesla,Roadster
4,2011-01-01,CA,95397,Tesla,Roadster


In [15]:
# View data types before saving the cleaned data
# Prints a summary of df_clean (columns, dtypes, memory usage) as a final
# check of what is about to be written to disk
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2542443 entries, 0 to 2542442
Data columns (total 5 columns):
 #   Column             Dtype         
---  ------             -----         
 0   registration_date  datetime64[ns]
 1   state              object        
 2   zip_code           object        
 3   make               object        
 4   model              object        
dtypes: datetime64[ns](1), object(4)
memory usage: 97.0+ MB


In [16]:
# Export to a new CSV file
file_name = "ca_ev_registrations.csv"
file_path = Path(f"../../../../data/processed_data/{file_name}")

# to_csv replaces an existing file by default, so there is no need to delete
# it first; just tell the reader that it is about to be overwritten
if file_path.exists():
    print("File already exists. Overwriting file.")

# Make sure the output folder exists so the write cannot fail on a fresh checkout
file_path.parent.mkdir(parents=True, exist_ok=True)

# Write the file once (it was previously written twice in a row)
print("Creating file...")
df_clean.to_csv(file_path, index=False)
print(f"File saved as {file_name}")

File already exists. Overwriting file.
Creating file...
File saved as ca_ev_registrations.csv
