In [1]:
from pathlib import Path
import datetime as dt
import pandas as pd

In [2]:
# Create a function to save the DataFrames to CSV files
def save_csv_file(df, file_path):
    """
    Save a DataFrame to a CSV file at the specified file path.

    The DataFrame index is not written to the file. If the parent directory
    of the target does not exist, an error message is printed and nothing is
    saved. If a file already exists at the target, it is removed and replaced.

    Parameters:
    - df: DataFrame to save
    - file_path: Destination of the CSV file, given as a `pathlib.Path` or a
      plain string

    Returns:
    - None
    """

    # Accept plain strings as well as Path objects; the checks below rely on
    # Path attributes (`parent`, `name`, `exists`)
    file_path = Path(file_path)

    # Check if the parent directory exists
    if not file_path.parent.exists():
        print(f"Error: The directory `{file_path.parent}` does not exist.")
        return

    # Remove any previous version of the file so it is replaced by the new one
    if file_path.exists():
        print(f"File `{file_path.name}` already exists. Overwriting file.")
        file_path.unlink()

    # Save the DataFrame to the specified file path
    df.to_csv(file_path, index=False)
    print(f"File saved as `{file_path.name}`")

### California EV Registrations DataFrame (Clean)

In [3]:
# Read the processed California EV registrations, parsing `registration_date`
# as a datetime, and order the rows chronologically
ca_ev_df = (
    pd.read_csv(
        Path("../../../../../data/processed_data/ca_ev_registrations.csv"),
        parse_dates=["registration_date"],
    )
    .sort_values("registration_date")
)

print("Shape:", ca_ev_df.shape)
ca_ev_df

Shape: (2542443, 6)


Unnamed: 0,registration_date,state,county,zip_codes,make,model
0,2010-01-01,CA,Alameda County,"[94501, 94502, 94536, 94537, 94538, 94539, 945...",Tesla,Roadster
498,2010-01-01,CA,San Diego County,"[91901, 91902, 91903, 91905, 91906, 91908, 919...",Tesla,Roadster
499,2010-01-01,CA,San Diego County,"[91901, 91902, 91903, 91905, 91906, 91908, 919...",Tesla,Roadster
500,2010-01-01,CA,San Diego County,"[91901, 91902, 91903, 91905, 91906, 91908, 919...",Tesla,Roadster
501,2010-01-01,CA,San Diego County,"[91901, 91902, 91903, 91905, 91906, 91908, 919...",Tesla,Roadster
...,...,...,...,...,...,...
2123462,2020-01-01,CA,Los Angeles County,"[90001, 90002, 90003, 90004, 90005, 90006, 900...",Chevrolet,Volt
2123463,2020-01-01,CA,Los Angeles County,"[90001, 90002, 90003, 90004, 90005, 90006, 900...",Chevrolet,Volt
2123464,2020-01-01,CA,Los Angeles County,"[90001, 90002, 90003, 90004, 90005, 90006, 900...",Chevrolet,Volt
2123457,2020-01-01,CA,Los Angeles County,"[90001, 90002, 90003, 90004, 90005, 90006, 900...",Chevrolet,Volt


In [4]:
# Summarize the EV registrations DataFrame: column names, dtypes, and memory usage
ca_ev_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2542443 entries, 0 to 2542442
Data columns (total 6 columns):
 #   Column             Dtype         
---  ------             -----         
 0   registration_date  datetime64[ns]
 1   state              object        
 2   county             object        
 3   zip_codes          object        
 4   make               object        
 5   model              object        
dtypes: datetime64[ns](1), object(5)
memory usage: 135.8+ MB


In [5]:
# Count the missing values in each column
# Both `county` and `zip_codes` contain missing values,
# while the remaining columns are fully populated
ca_ev_df.isnull().sum()

registration_date        0
state                    0
county               19504
zip_codes            19504
make                     0
model                    0
dtype: int64

# Attention: `county` and `zip_codes` are each missing in 19,504 rows

In [6]:
# Select the registrations that have no zip codes
# (in the output below, these rows are also missing their county)
missing_values = ca_ev_df.loc[ca_ev_df["zip_codes"].isna()]
missing_values

Unnamed: 0,registration_date,state,county,zip_codes,make,model
743,2010-01-01,CA,,,Ford,Ranger
744,2010-01-01,CA,,,Ford,Ranger
745,2010-01-01,CA,,,Mini,Hardtop 2 Door EV
746,2010-01-01,CA,,,Mini,Hardtop 2 Door EV
747,2010-01-01,CA,,,Mini,Hardtop 2 Door EV
...,...,...,...,...,...,...
2540809,2020-01-01,CA,,,Tesla,Model S
2540810,2020-01-01,CA,,,Tesla,Model S
2540803,2020-01-01,CA,,,Tesla,Model S
2542441,2020-01-01,CA,,,Volvo,XC90 Plug In


### California Population DataFrame (Clean)

In [7]:
# Read the processed population data for California, parsing `date` as a
# datetime, order the rows chronologically, and rename `date` to `year`
ca_pop_df = (
    pd.read_csv(
        Path("../../../../../data/processed_data/ca_population.csv"),
        parse_dates=["date"],
    )
    .sort_values("date")
    .rename(columns={"date": "year"})
)

print("Shape:", ca_pop_df.shape)
ca_pop_df

Shape: (870, 5)


Unnamed: 0,year,state,county,zip_codes,population
0,2010-01-01,CA,Alameda County,"[94501, 94502, 94536, 94537, 94538, 94539, 945...",1512986
31,2010-01-01,CA,Plumas County,"[95915, 95923, 95934, 95947, 95956, 95971, 959...",19914
32,2010-01-01,CA,Riverside County,"[91752, 92201, 92202, 92203, 92210, 92211, 922...",2201576
33,2010-01-01,CA,Sacramento County,"[94203, 94204, 94205, 94206, 94207, 94208, 942...",1421383
34,2010-01-01,CA,San Benito County,"[95023, 95024, 95043, 95045, 95075]",55516
...,...,...,...,...,...
835,2024-01-01,CA,Merced County,"[93635, 95301, 95340, 95348, 95341, 95334, 953...",280132
836,2024-01-01,CA,Modoc County,"[96101, 96134, 96104, 96015, 96054, 96112, 961...",9350
837,2024-01-01,CA,Mono County,"[93514, 93546, 96107, 93541, 93517, 93512, 935...",26456
824,2024-01-01,CA,Imperial County,"[92243, 92231, 92227, 92251, 92250, 92249, 922...",174094


In [8]:
# Summarize the population DataFrame: column names, non-null counts, dtypes, and memory usage
ca_pop_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 870 entries, 0 to 869
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   year        870 non-null    datetime64[ns]
 1   state       870 non-null    object        
 2   county      870 non-null    object        
 3   zip_codes   870 non-null    object        
 4   population  870 non-null    int64         
dtypes: datetime64[ns](1), int64(1), object(3)
memory usage: 40.8+ KB


In [9]:
# Count the missing values in each column of the population DataFrame
ca_pop_df.isnull().sum()

year          0
state         0
county        0
zip_codes     0
population    0
dtype: int64

### Prepare EV Registration DataFrame for Merge

In [10]:
# Derive an integer `year` column from `registration_date`;
# it is the key used later to merge with the population DataFrame
ca_ev_df = ca_ev_df.assign(year=ca_ev_df["registration_date"].dt.year)

print("Shape:", ca_ev_df.shape)
ca_ev_df

Shape: (2542443, 7)


Unnamed: 0,registration_date,state,county,zip_codes,make,model,year
0,2010-01-01,CA,Alameda County,"[94501, 94502, 94536, 94537, 94538, 94539, 945...",Tesla,Roadster,2010
498,2010-01-01,CA,San Diego County,"[91901, 91902, 91903, 91905, 91906, 91908, 919...",Tesla,Roadster,2010
499,2010-01-01,CA,San Diego County,"[91901, 91902, 91903, 91905, 91906, 91908, 919...",Tesla,Roadster,2010
500,2010-01-01,CA,San Diego County,"[91901, 91902, 91903, 91905, 91906, 91908, 919...",Tesla,Roadster,2010
501,2010-01-01,CA,San Diego County,"[91901, 91902, 91903, 91905, 91906, 91908, 919...",Tesla,Roadster,2010
...,...,...,...,...,...,...,...
2123462,2020-01-01,CA,Los Angeles County,"[90001, 90002, 90003, 90004, 90005, 90006, 900...",Chevrolet,Volt,2020
2123463,2020-01-01,CA,Los Angeles County,"[90001, 90002, 90003, 90004, 90005, 90006, 900...",Chevrolet,Volt,2020
2123464,2020-01-01,CA,Los Angeles County,"[90001, 90002, 90003, 90004, 90005, 90006, 900...",Chevrolet,Volt,2020
2123457,2020-01-01,CA,Los Angeles County,"[90001, 90002, 90003, 90004, 90005, 90006, 900...",Chevrolet,Volt,2020


In [11]:
# Count the registrations for every combination of `year`, `state`, `county`,
# and `registration_date`, storing the count in a new `ev_registrations` column.
# `dropna=False` keeps the rows with a missing `county` as their own group.
ca_ev_df_group = (
    ca_ev_df
    .groupby(['year', 'state', 'county', "registration_date"], dropna=False)
    .size()
    .reset_index(name='ev_registrations')
)

print("Shape:", ca_ev_df_group.shape)
ca_ev_df_group

Shape: (600, 5)


Unnamed: 0,year,state,county,registration_date,ev_registrations
0,2010,CA,Alameda County,2010-01-01,20
1,2010,CA,Amador County,2010-01-01,1
2,2010,CA,Contra Costa County,2010-01-01,10
3,2010,CA,Fresno County,2010-01-01,2
4,2010,CA,Humboldt County,2010-01-01,2
...,...,...,...,...,...
595,2020,CA,Tuolumne County,2020-01-01,240
596,2020,CA,Ventura County,2020-01-01,13080
597,2020,CA,Yolo County,2020-01-01,2948
598,2020,CA,Yuba County,2020-01-01,289


In [12]:
# Count the missing values in each column of the grouped DataFrame
ca_ev_df_group.isnull().sum()

year                  0
state                 0
county               11
registration_date     0
ev_registrations      0
dtype: int64

In [13]:
# Inspect the groups whose `county` is missing (NaN) to confirm they survived the grouping
# Important to keep as they have valuable information on registration counts
ca_ev_df_group.loc[ca_ev_df_group["county"].isna(), :]

Unnamed: 0,year,state,county,registration_date,ev_registrations
30,2010,CA,,2010-01-01,11
77,2011,CA,,2011-01-01,195
130,2012,CA,,2012-01-01,302
187,2013,CA,,2013-01-01,578
245,2014,CA,,2014-01-01,956
304,2015,CA,,2015-01-01,1047
363,2016,CA,,2016-01-01,1166
422,2017,CA,,2017-01-01,2118
481,2018,CA,,2018-01-01,3955
540,2019,CA,,2019-01-01,4622


### Prepare Population DataFrame for Merge

In [14]:
# Change the value of the year from DateTime to int ready for merging.
# The conversion is guarded so this cell can be re-run safely: once `year`
# holds integers, calling the `.dt` accessor on it would raise an AttributeError.
if pd.api.types.is_datetime64_any_dtype(ca_pop_df['year']):
    ca_pop_df['year'] = ca_pop_df['year'].dt.year

print("Shape:", ca_pop_df.shape)
ca_pop_df

Shape: (870, 5)


Unnamed: 0,year,state,county,zip_codes,population
0,2010,CA,Alameda County,"[94501, 94502, 94536, 94537, 94538, 94539, 945...",1512986
31,2010,CA,Plumas County,"[95915, 95923, 95934, 95947, 95956, 95971, 959...",19914
32,2010,CA,Riverside County,"[91752, 92201, 92202, 92203, 92210, 92211, 922...",2201576
33,2010,CA,Sacramento County,"[94203, 94204, 94205, 94206, 94207, 94208, 942...",1421383
34,2010,CA,San Benito County,"[95023, 95024, 95043, 95045, 95075]",55516
...,...,...,...,...,...
835,2024,CA,Merced County,"[93635, 95301, 95340, 95348, 95341, 95334, 953...",280132
836,2024,CA,Modoc County,"[96101, 96134, 96104, 96015, 96054, 96112, 961...",9350
837,2024,CA,Mono County,"[93514, 93546, 96107, 93541, 93517, 93512, 935...",26456
824,2024,CA,Imperial County,"[92243, 92231, 92227, 92251, 92250, 92249, 922...",174094


In [15]:
# Collect the unique years present in the grouped EV registrations DataFrame
# (returned as a NumPy array). They are used to filter the population
# DataFrame down to the years that also have EV registration data.

select_years = ca_ev_df_group['year'].unique()
select_years

array([2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020])

In [16]:
# Boolean mask marking the population rows whose year also appears
# in the EV registrations DataFrame
filter_years = ca_pop_df['year'].isin(select_years)

# Keep only the matching rows and renumber the index from zero
ca_pop_filtered = ca_pop_df[filter_years].reset_index(drop=True)

ca_pop_filtered

Unnamed: 0,year,state,county,zip_codes,population
0,2010,CA,Alameda County,"[94501, 94502, 94536, 94537, 94538, 94539, 945...",1512986
1,2010,CA,Plumas County,"[95915, 95923, 95934, 95947, 95956, 95971, 959...",19914
2,2010,CA,Riverside County,"[91752, 92201, 92202, 92203, 92210, 92211, 922...",2201576
3,2010,CA,Sacramento County,"[94203, 94204, 94205, 94206, 94207, 94208, 942...",1421383
4,2010,CA,San Benito County,"[95023, 95024, 95043, 95045, 95075]",55516
...,...,...,...,...,...
633,2020,CA,Madera County,"[93601, 93604, 93610, 93614, 93636, 93637, 936...",156345
634,2020,CA,Los Angeles County,"[90001, 90002, 90003, 90004, 90005, 90006, 900...",9992813
635,2020,CA,Lassen County,"[96009, 96068, 96109, 96113, 96114, 96117, 961...",32315
636,2020,CA,Lake County,"[95422, 95423, 95424, 95426, 95435, 95443, 954...",68199


### Merge the Population DataFrame with EV Registration DataFrame

In [17]:
# Combine the filtered population data with the grouped EV registrations so that
# each county in California has its population and its registration count side by side.
# An outer join keeps rows that appear in only one of the two DataFrames;
# the result is ordered by year, state, and county.
ca_ev_adoption_df = (
    ca_pop_filtered
    .merge(ca_ev_df_group, how='outer', on=['year', 'state', 'county'])
    .sort_values(['year', 'state', 'county'])
)

print("Shape:", ca_ev_adoption_df.shape)
ca_ev_adoption_df

Shape: (649, 7)


Unnamed: 0,year,state,county,zip_codes,population,registration_date,ev_registrations
0,2010,CA,Alameda County,"[94501, 94502, 94536, 94537, 94538, 94539, 945...",1512986.0,2010-01-01,20.0
1,2010,CA,Alpine County,"[95646, 96120, 96156]",1161.0,NaT,
2,2010,CA,Amador County,"[95601, 95629, 95640, 95642, 95654, 95665, 956...",37886.0,2010-01-01,1.0
3,2010,CA,Butte County,"[95914, 95916, 95917, 95926, 95927, 95928, 959...",219949.0,NaT,
4,2010,CA,Calaveras County,"[95221, 95222, 95223, 95224, 95225, 95226, 952...",45468.0,NaT,
...,...,...,...,...,...,...,...
644,2020,CA,Tuolumne County,"[95305, 95309, 95310, 95314, 95321, 95327, 953...",55379.0,2020-01-01,240.0
645,2020,CA,Ventura County,"[91319, 91320, 91358, 91360, 91361, 91362, 913...",843371.0,2020-01-01,13080.0
646,2020,CA,Yolo County,"[95605, 95606, 95607, 95612, 95616, 95617, 956...",216291.0,2020-01-01,2948.0
647,2020,CA,Yuba County,"[95692, 95901, 95903, 95918, 95919, 95922, 959...",81958.0,2020-01-01,289.0


In [18]:
# Count the missing values per column after the merge, to decide which
# `nan` values must be kept and which rows can be removed
ca_ev_adoption_df.isnull().sum()

year                  0
state                 0
county               11
zip_codes            11
population           11
registration_date    49
ev_registrations     49
dtype: int64

In [19]:
# Inspect the rows with a missing `ev_registrations` value to see if they are necessary
ca_ev_adoption_df.loc[ca_ev_adoption_df["ev_registrations"].isna(), :]

Unnamed: 0,year,state,county,zip_codes,population,registration_date,ev_registrations
1,2010,CA,Alpine County,"[95646, 96120, 96156]",1161.0,NaT,
3,2010,CA,Butte County,"[95914, 95916, 95917, 95926, 95927, 95928, 959...",219949.0,NaT,
4,2010,CA,Calaveras County,"[95221, 95222, 95223, 95224, 95225, 95226, 952...",45468.0,NaT,
5,2010,CA,Colusa County,"[95912, 95932, 95950, 95955, 95970, 95979, 95987]",21437.0,NaT,
7,2010,CA,Del Norte County,"[95531, 95532, 95538, 95543, 95548, 95567]",28566.0,NaT,
8,2010,CA,El Dorado County,"[95613, 95614, 95619, 95623, 95633, 95634, 956...",181136.0,NaT,
10,2010,CA,Glenn County,"[95913, 95920, 95939, 95943, 95951, 95963, 95988]",28127.0,NaT,
12,2010,CA,Imperial County,"[92222, 92227, 92231, 92232, 92233, 92243, 922...",174716.0,NaT,
13,2010,CA,Inyo County,"[92328, 92384, 92389, 93513, 93514, 93515, 935...",18511.0,NaT,
15,2010,CA,Kings County,"[93202, 93204, 93212, 93230, 93232, 93239, 932...",152370.0,NaT,


In [20]:
# Discard the rows that have no value in the `ev_registrations` column
ca_ev_adoption_df = ca_ev_adoption_df.dropna(subset=['ev_registrations'])

# Verify that no missing `ev_registrations` values remain
ca_ev_adoption_df.isna().sum()

year                  0
state                 0
county               11
zip_codes            11
population           11
registration_date     0
ev_registrations      0
dtype: int64

In [21]:
ca_ev_adoption_df.loc[ca_ev_adoption_df["zip_codes"].isna(), :]

Unnamed: 0,year,state,county,zip_codes,population,registration_date,ev_registrations
58,2010,CA,,,,2010-01-01,11.0
117,2011,CA,,,,2011-01-01,195.0
176,2012,CA,,,,2012-01-01,302.0
235,2013,CA,,,,2013-01-01,578.0
294,2014,CA,,,,2014-01-01,956.0
353,2015,CA,,,,2015-01-01,1047.0
412,2016,CA,,,,2016-01-01,1166.0
471,2017,CA,,,,2017-01-01,2118.0
530,2018,CA,,,,2018-01-01,3955.0
589,2019,CA,,,,2019-01-01,4622.0


In [22]:
# Order the rows by year, state, county, and registration date
ca_ev_adoption_df = ca_ev_adoption_df.sort_values(
    ['year', 'state', 'county', 'registration_date']
)

print("Shape:", ca_ev_adoption_df.shape)
ca_ev_adoption_df

Shape: (600, 7)


Unnamed: 0,year,state,county,zip_codes,population,registration_date,ev_registrations
0,2010,CA,Alameda County,"[94501, 94502, 94536, 94537, 94538, 94539, 945...",1512986.0,2010-01-01,20.0
2,2010,CA,Amador County,"[95601, 95629, 95640, 95642, 95654, 95665, 956...",37886.0,2010-01-01,1.0
6,2010,CA,Contra Costa County,"[94505, 94506, 94507, 94509, 94511, 94513, 945...",1052540.0,2010-01-01,10.0
9,2010,CA,Fresno County,"[93210, 93234, 93242, 93602, 93605, 93606, 936...",932039.0,2010-01-01,2.0
11,2010,CA,Humboldt County,"[95501, 95502, 95503, 95511, 95514, 95518, 955...",135009.0,2010-01-01,2.0
...,...,...,...,...,...,...,...
644,2020,CA,Tuolumne County,"[95305, 95309, 95310, 95314, 95321, 95327, 953...",55379.0,2020-01-01,240.0
645,2020,CA,Ventura County,"[91319, 91320, 91358, 91360, 91361, 91362, 913...",843371.0,2020-01-01,13080.0
646,2020,CA,Yolo County,"[95605, 95606, 95607, 95612, 95616, 95617, 956...",216291.0,2020-01-01,2948.0
647,2020,CA,Yuba County,"[95692, 95901, 95903, 95918, 95919, 95922, 959...",81958.0,2020-01-01,289.0


In [23]:
# Summarize the merged DataFrame: column names, non-null counts, dtypes, and memory usage
ca_ev_adoption_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 600 entries, 0 to 648
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   year               600 non-null    int32         
 1   state              600 non-null    object        
 2   county             589 non-null    object        
 3   zip_codes          589 non-null    object        
 4   population         589 non-null    float64       
 5   registration_date  600 non-null    datetime64[ns]
 6   ev_registrations   600 non-null    float64       
dtypes: datetime64[ns](1), float64(2), int32(1), object(3)
memory usage: 35.2+ KB


In [24]:
# Total number of EV registrations per year per county in California,
# aligned row by row with the merged DataFrame (`dropna=False` keeps the
# rows with a missing `county` as their own group)
yearly_county_totals = (
    ca_ev_adoption_df
    .groupby(['year', 'county'], dropna=False)['ev_registrations']
    .transform('sum')
)

# Build a new DataFrame so the merged one is left untouched:
# replace the counts with the yearly totals, keep the first row for each
# `year`/`county` pair, and drop the no-longer-needed `registration_date`
ca_ev_adopt_year = (
    ca_ev_adoption_df
    .assign(ev_registrations=yearly_county_totals)
    .drop_duplicates(subset=['year', 'county'])
    .drop(columns=['registration_date'])
)

print("Shape:", ca_ev_adopt_year.shape)
ca_ev_adopt_year

Shape: (600, 6)


Unnamed: 0,year,state,county,zip_codes,population,ev_registrations
0,2010,CA,Alameda County,"[94501, 94502, 94536, 94537, 94538, 94539, 945...",1512986.0,20.0
2,2010,CA,Amador County,"[95601, 95629, 95640, 95642, 95654, 95665, 956...",37886.0,1.0
6,2010,CA,Contra Costa County,"[94505, 94506, 94507, 94509, 94511, 94513, 945...",1052540.0,10.0
9,2010,CA,Fresno County,"[93210, 93234, 93242, 93602, 93605, 93606, 936...",932039.0,2.0
11,2010,CA,Humboldt County,"[95501, 95502, 95503, 95511, 95514, 95518, 955...",135009.0,2.0
...,...,...,...,...,...,...
644,2020,CA,Tuolumne County,"[95305, 95309, 95310, 95314, 95321, 95327, 953...",55379.0,240.0
645,2020,CA,Ventura County,"[91319, 91320, 91358, 91360, 91361, 91362, 913...",843371.0,13080.0
646,2020,CA,Yolo County,"[95605, 95606, 95607, 95612, 95616, 95617, 956...",216291.0,2948.0
647,2020,CA,Yuba County,"[95692, 95901, 95903, 95918, 95919, 95922, 959...",81958.0,289.0


In [25]:
# Write the yearly per-county DataFrame to the processed data directory
file_name = "ca_ev_registration_population.csv"
file_path = Path("../../../../../data/processed_data") / file_name
save_csv_file(ca_ev_adopt_year, file_path)

File saved as `ca_ev_registration_population.csv`
