In [1]:
from pathlib import Path
import datetime as dt
import pandas as pd
import requests
from bs4 import BeautifulSoup

#### Florida EV Vehicle Registration: Raw Dataset

In [2]:
# Load the raw Florida EV registration export; the registration date column
# is parsed while reading, using the explicit %m/%d/%Y format.
raw_csv_path = Path("../../../../data/raw_data/fl_ev_registrations_public.csv")
df = pd.read_csv(
    raw_csv_path,
    parse_dates=["Registration Valid Date"],
    date_format="%m/%d/%Y",
)

print("Shape:", df.shape)
display(df.head())

Shape: (353974, 7)


Unnamed: 0,DMV ID,DMV Snapshot (Date),County,Vehicle Name,Registration Valid Date,Registration Expiration Date,Technology
0,1,Registration Data from FPL (6/30/2018),Dade,Tesla Model X,2018-06-30,,BEV
1,1,Registration Data from FPL (6/30/2018),Dade,Tesla Model X,2018-06-30,,BEV
2,1,Registration Data from FPL (6/30/2018),Dade,Tesla Model X,2018-06-30,,BEV
3,1,Registration Data from FPL (6/30/2018),Dade,Tesla Model X,2018-06-30,,BEV
4,1,Registration Data from FPL (6/30/2018),Dade,Tesla Model X,2018-06-30,,BEV


In [3]:
# List every column next to its positional index for quick reference.
for position, column_name in enumerate(df.columns):
    print(position, column_name)

0 DMV ID
1 DMV Snapshot (Date)
2 County
3 Vehicle Name
4 Registration Valid Date
5 Registration Expiration Date
6 Technology


In [4]:
# Summarize the dtype and non-null count of every column in the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 353974 entries, 0 to 353973
Data columns (total 7 columns):
 #   Column                        Non-Null Count   Dtype         
---  ------                        --------------   -----         
 0   DMV ID                        353974 non-null  int64         
 1   DMV Snapshot (Date)           353974 non-null  object        
 2   County                        353974 non-null  object        
 3   Vehicle Name                  353974 non-null  object        
 4   Registration Valid Date       353974 non-null  datetime64[ns]
 5   Registration Expiration Date  0 non-null       float64       
 6   Technology                    353754 non-null  object        
dtypes: datetime64[ns](1), float64(1), int64(1), object(4)
memory usage: 18.9+ MB


In [5]:
# Inspect the distinct county labels in alphabetical order to spot
# placeholder entries (e.g. "Unknown") and spellings that differ from the
# ZIP code reference data.
# Reference https://www.unitedstateszipcodes.org/fl/

counties_sorted = df["County"].sort_values()
counties_sorted.unique()

array(['Alachua', 'Baker', 'Bay', 'Bradford', 'Brevard', 'Broward',
       'Calhoun', 'Charlotte', 'Citrus', 'Clay', 'Collier', 'Columbia',
       'Dade', 'Desoto', 'Dixie', 'Duval', 'Escambia', 'Flagler',
       'Franklin', 'Gadsden', 'Gadsen', 'Gilchrist', 'Glades', 'Gulf',
       'Hamilton', 'Hardee', 'Hendry', 'Hernando', 'Highlands',
       'Hillsborough', 'Holmes', 'Indian River', 'Jackson', 'Jefferson',
       'Lake', 'Lee', 'Leon', 'Levy', 'Liberty', 'Madison', 'Manatee',
       'Marion', 'Martin', 'Monroe', 'Nassau', 'Okaloosa', 'Okeechobee',
       'Orange', 'Osceola', 'Other', 'Palm Beach', 'Pasco', 'Pinellas',
       'Polk', 'Putnam', 'Santa Rosa', 'Sarasota', 'Seminole',
       'St. Johns', 'St. Lucie', 'Sumter', 'Suwannee', 'Taylor', 'Union',
       'Unknown', 'Volusia', 'Wakulla', 'Walton', 'Washington'],
      dtype=object)

**Note:**

- Ensure the County values are the same in the main dataset and the ZIP code data.
- This is necessary to obtain the ZIP codes later.

In [6]:
# Normalize the county names so they match the labels used by the ZIP code data
# Reference https://www.unitedstateszipcodes.org/fl/

# Raw labels that need an explicit replacement instead of the generic suffix
county_corrections = {
    "Unknown": "Unknown",
    "Other": "Unknown",
    "Dade": "Miami-Dade County",
    "Gadsen": "Gadsden County"
}


def _normalize_county(name):
    """Return the county label in "<Name> County" form.

    Missing values pass through untouched, explicit corrections take
    precedence, and labels that already end with " County" are returned
    as-is so re-running this cell does not append the suffix a second time.
    """
    if pd.isna(name):
        return name
    cleaned = name.strip()
    if cleaned in county_corrections:
        return county_corrections[cleaned]
    if cleaned.endswith(" County"):
        return cleaned
    return cleaned + " County"


# Apply the corrections and add " County" for other values
df["County"] = df["County"].map(_normalize_county)

# Show the normalized labels for verification
df["County"].sort_values().unique()

array(['Alachua County', 'Baker County', 'Bay County', 'Bradford County',
       'Brevard County', 'Broward County', 'Calhoun County',
       'Charlotte County', 'Citrus County', 'Clay County',
       'Collier County', 'Columbia County', 'Desoto County',
       'Dixie County', 'Duval County', 'Escambia County',
       'Flagler County', 'Franklin County', 'Gadsden County',
       'Gilchrist County', 'Glades County', 'Gulf County',
       'Hamilton County', 'Hardee County', 'Hendry County',
       'Hernando County', 'Highlands County', 'Hillsborough County',
       'Holmes County', 'Indian River County', 'Jackson County',
       'Jefferson County', 'Lake County', 'Lee County', 'Leon County',
       'Levy County', 'Liberty County', 'Madison County',
       'Manatee County', 'Marion County', 'Martin County',
       'Miami-Dade County', 'Monroe County', 'Nassau County',
       'Okaloosa County', 'Okeechobee County', 'Orange County',
       'Osceola County', 'Palm Beach County', 'Pasco Count

In [7]:
# Number of registrations whose county is labelled "Unknown"
is_unknown_county = df["County"] == "Unknown"
df.loc[is_unknown_county, "County"].count()

10611

#### Florida Zip Code: Raw Dataset

In [8]:
# Obtain the Zip Codes for each County in Florida

# Webpage URL to scrape the ZIP codes for each county in Florida
source = "https://www.unitedstateszipcodes.org/fl/"

# Important Note: Set up headers to mimic a browser request
# Does not work otherwise - Page blocks the request
# Learned of this from:
# https://zenscrape.com/5-approaches-to-write-unblock-able-data-scraping-in-python/
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept-Encoding": "gzip, deflate, br",
    "Connection": "keep-alive",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Referer": "https://www.google.com/",
    "Upgrade-Insecure-Requests": "1"
}

# Make a request to the webpage. The timeout (in seconds) keeps the cell from
# hanging indefinitely when the server never answers.
response = requests.get(source, headers=headers, timeout=30)

# Check if the request was successful
if response.status_code == 200:
    
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, "html.parser")
    
    # Find all the relevant div elements with the class 'list-group-item'
    items = soup.find_all("div", class_="list-group-item")
    
    # List to store the extracted data (one dictionary per ZIP code entry)
    data = []
    
    # Iterate through each item and extract the required information
    for item in items:
        # Only the first "row" div of an item is read
        row = item.find_all("div", class_="row")[0]
        zip_code = row.find("div", class_="prefix-col1").text.strip()
        type_ = row.find("div", class_="prefix-col2").text.strip()
        city = row.find("div", class_="prefix-col3").text.strip()
        county = row.find("div", class_="prefix-col4").text.strip()
        # An entry can link several area codes; they are joined into one string below
        area_codes = [a.text.strip() for a in row.find("div", class_="prefix-col5").find_all("a")]
        
        # Append the extracted data to the list
        data.append({
            "Zip Code": zip_code,
            "Type": type_,
            "Cities": city,
            "County": county,
            "Area Codes": ", ".join(area_codes)
        })
    
    # Convert the list of dictionaries to DataFrame
    fl_zip_df = pd.DataFrame(data)
    
    print("Shape:", fl_zip_df.shape)
    display(fl_zip_df.head())

else:
    # NOTE: `fl_zip_df` is not created on this path, so the cells that use it
    # cannot run until the request succeeds
    print(f"Failed to retrieve the webpage. Status code: {response.status_code}")

Shape: (1495, 5)


Unnamed: 0,Zip Code,Type,Cities,County,Area Codes
0,32003,Standard,"Fleming Island, Fleming Isle, Orange Park",Clay County,Area Code 904
1,32004,PO Box,"Ponte Vedra Beach, Ponte Vedra",St. Johns County,Area Code 904
2,32006,PO Box,"Fleming Island, Fleming Isle, Orange Park",Clay County,Area Code 904
3,32007,PO Box,Bostwick,Putnam County,Area Code 386
4,32008,Standard,Branford,Suwannee County,"Area Code 352, Area Code 386"


In [9]:
# Verify that the county labels of the main DataFrame also appear in the
# ZIP code DataFrame

# Distinct county labels of each DataFrame, alphabetically ordered
unique_counties_main_df = df["County"].sort_values().unique()
unique_counties_zip_df = fl_zip_df["County"].sort_values().unique()

# Sets make the membership comparison straightforward
set_counties_main_df = set(unique_counties_main_df)
set_counties_zip_df = set(unique_counties_zip_df)

# Labels shared by both DataFrames (set intersection)
common_counties = set_counties_main_df & set_counties_zip_df

# Labels found only in the main DataFrame (set difference)
unique_counties_to_main_df = set_counties_main_df - set_counties_zip_df

# Tally each group
total_unique_counties_main_df = len(set_counties_main_df)
count_common_counties = len(common_counties)
count_unique_counties_to_main_df = len(unique_counties_to_main_df)

print(f"Total unique values in `df['County'`]: {total_unique_counties_main_df}")
print(f"Values in `df['County']` also in `fl_zip_df['County']`: {count_common_counties}")
print(f"Values in `df['County']` not in `fl_zip_df['County']`: {count_unique_counties_to_main_df}")

Total unique values in `df['County'`]: 67
Values in `df['County']` also in `fl_zip_df['County']`: 66
Values in `df['County']` not in `fl_zip_df['County']`: 1


##### Main Dataset: Create Zip Code Column

In [10]:
# With the county labels confirmed to line up across both DataFrames,
# attach a Zip Code to every row of the main DataFrame

# Build the County -> Zip Code lookup.
# NOTE(review): fl_zip_df holds many Zip Codes per county, so only the last
# Zip Code listed for each county survives in this dictionary - confirm that a
# single representative Zip Code per county is what the analysis needs.
county_to_zip = dict(zip(fl_zip_df["County"], fl_zip_df["Zip Code"]))


def _zip_for_county(county_name):
    """Return the Zip Code stored for the county, or pd.NA when there is none."""
    return county_to_zip.get(county_name, pd.NA)


# Create the new column by looking up each row's county
df["Zip Code"] = df["County"].apply(_zip_for_county)

print("Shape:", df.shape)
display(df.head(2))

Shape: (353974, 8)


Unnamed: 0,DMV ID,DMV Snapshot (Date),County,Vehicle Name,Registration Valid Date,Registration Expiration Date,Technology,Zip Code
0,1,Registration Data from FPL (6/30/2018),Miami-Dade County,Tesla Model X,2018-06-30,,BEV,33299
1,1,Registration Data from FPL (6/30/2018),Miami-Dade County,Tesla Model X,2018-06-30,,BEV,33299


In [11]:
# Count the number of missing values in the Zip Code column
# (rows whose county had no entry in the County -> Zip Code lookup)
df["Zip Code"].isna().sum()

10611

##### Main Dataset: Create State Column

In [12]:
# Create a "State" column with the value "FL" for Florida
# (the same constant is assigned to every row)
df["State"] = "FL"

##### Main Dataset: Create a Make and Model Column

In [13]:
# "Vehicle Name" holds the make and the model in a single string,
# so break it apart into two dedicated columns

# Splitting only once (`n=1`) at the first space keeps multi-word models
# such as "Model X" together; `expand=True` yields one column per piece
name_parts = df["Vehicle Name"].str.split(" ", n=1, expand=True)
df[["make", "model"]] = name_parts

print("Shape:", df.shape)
display(df.head(2))

Shape: (353974, 11)


Unnamed: 0,DMV ID,DMV Snapshot (Date),County,Vehicle Name,Registration Valid Date,Registration Expiration Date,Technology,Zip Code,State,make,model
0,1,Registration Data from FPL (6/30/2018),Miami-Dade County,Tesla Model X,2018-06-30,,BEV,33299,FL,Tesla,Model X
1,1,Registration Data from FPL (6/30/2018),Miami-Dade County,Tesla Model X,2018-06-30,,BEV,33299,FL,Tesla,Model X


#### Florida EV Vehicle Registration: Processed Dataset

In [14]:
# Keep only the columns the analysis relies on
select_columns = [
    "Registration Valid Date",
    "State",
    "Zip Code",
    "make",
    "model",
]

df_clean = df.loc[:, select_columns]

print("Shape:", df_clean.shape)
display(df_clean.head())

Shape: (353974, 5)


Unnamed: 0,Registration Valid Date,State,Zip Code,make,model
0,2018-06-30,FL,33299,Tesla,Model X
1,2018-06-30,FL,33299,Tesla,Model X
2,2018-06-30,FL,33299,Tesla,Model X
3,2018-06-30,FL,33299,Tesla,Model X
4,2018-06-30,FL,33299,Tesla,Model X


In [15]:
# Switch the title-case headers to snake_case names;
# "make" and "model" already follow that convention and keep their names

column_renames = {
    "Registration Valid Date": "registration_date",
    "State": "state",
    "Zip Code": "zip_code",
}
df_clean = df_clean.rename(columns=column_renames)

print("Shape:", df_clean.shape)
display(df_clean.head())

Shape: (353974, 5)


Unnamed: 0,registration_date,state,zip_code,make,model
0,2018-06-30,FL,33299,Tesla,Model X
1,2018-06-30,FL,33299,Tesla,Model X
2,2018-06-30,FL,33299,Tesla,Model X
3,2018-06-30,FL,33299,Tesla,Model X
4,2018-06-30,FL,33299,Tesla,Model X


In [16]:
# View data types before saving the cleaned data
# (also reports the non-null count of each column)
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 353974 entries, 0 to 353973
Data columns (total 5 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   registration_date  353974 non-null  datetime64[ns]
 1   state              353974 non-null  object        
 2   zip_code           343363 non-null  object        
 3   make               353974 non-null  object        
 4   model              353974 non-null  object        
dtypes: datetime64[ns](1), object(4)
memory usage: 13.5+ MB


In [17]:
# Count missing values in each column of the cleaned frame
df_clean.isnull().sum()

registration_date        0
state                    0
zip_code             10611
make                     0
model                    0
dtype: int64

In [18]:
# Export the cleaned data to a CSV file in the processed-data folder
file_name = "fl_ev_registrations.csv"
file_path = Path(f"../../../../data/processed_data/{file_name}")

# Make sure the destination folder exists before writing
file_path.parent.mkdir(parents=True, exist_ok=True)

# `to_csv` replaces an existing file on its own, so deleting the previous
# export first is unnecessary; the message is kept to signal the overwrite
if file_path.exists():
    print("File already exists. Overwriting file.")

print("Creating file...")
df_clean.to_csv(file_path, index=False)
print(f"File saved as {file_name}")

File already exists. Overwriting file.
Creating file...
File saved as fl_ev_registrations.csv
