In [1]:
from pathlib import Path
import datetime as dt
import pandas as pd

In [2]:
# Create a function to save the DataFrames to CSV files
def save_csv_file(df, file_path):
    """
    Save a DataFrame to a CSV file at the specified file path.

    The DataFrame index is not written. If the target file already exists it
    is overwritten. If the parent directory does not exist, an error message
    is printed and nothing is written.

    Parameters:
    - df: DataFrame to save
    - file_path: Destination of the CSV file, given as a `pathlib.Path`,
      a string or any other path-like object

    Returns:
    - None
    """

    # Accept plain strings as well as Path objects: `.parent`, `.exists()`
    # and `.name` used below only exist on Path
    file_path = Path(file_path)

    # Refuse to write when the parent directory is missing
    if not file_path.parent.exists():
        print(f"Error: The directory `{file_path.parent}` does not exist.")
        return

    # `to_csv` opens the target for writing and replaces any previous content,
    # so the existing file does not have to be deleted first
    if file_path.exists():
        print(f"File `{file_path.name}` already exists. Overwriting file.")

    # Save the DataFrame to the specified file path
    df.to_csv(file_path, index=False)
    print(f"File saved as `{file_path.name}`")

### Florida EV Registrations DataFrame (Clean)

In [3]:
# Read the processed Florida EV registrations, parsing `registration_date`
# as datetimes, and order the rows by registration date in the same expression
fl_ev_df = pd.read_csv(
    Path("../../../../../data/processed_data/fl_ev_registrations.csv"),
    parse_dates=["registration_date"],
).sort_values("registration_date")

print("Shape:", fl_ev_df.shape)
fl_ev_df

Shape: (353974, 6)


Unnamed: 0,registration_date,state,county,zip_codes,make,model
0,2018-06-30,FL,Miami-Dade County,"[33002, 33010, 33011, 33012, 33013, 33014, 330...",Tesla,Model X
19628,2018-06-30,FL,Walton County,"[32422, 32433, 32434, 32435, 32439, 32454, 324...",BMW,X5
19627,2018-06-30,FL,Walton County,"[32422, 32433, 32434, 32435, 32439, 32454, 324...",Tesla,Model X
19626,2018-06-30,FL,Walton County,"[32422, 32433, 32434, 32435, 32439, 32454, 324...",Tesla,Model X
19625,2018-06-30,FL,Walton County,"[32422, 32433, 32434, 32435, 32439, 32454, 324...",Tesla,Model X
...,...,...,...,...,...,...
291822,2021-07-19,FL,Orange County,"[32703, 32704, 32709, 32710, 32712, 32733, 327...",Volvo,XC90 Plug In
291821,2021-07-19,FL,Martin County,"[33455, 33475, 34956, 34957, 34958, 34990, 349...",Volvo,XC90 Plug In
291820,2021-07-19,FL,Collier County,"[34101, 34102, 34103, 34104, 34105, 34106, 341...",Volvo,XC90 Plug In
291818,2021-07-19,FL,Osceola County,"[33848, 34739, 34741, 34742, 34743, 34744, 347...",Volvo,XC90 Plug In


In [4]:
# Print the column dtypes, non-null counts and memory usage
# of the EV registrations DataFrame
fl_ev_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 353974 entries, 0 to 353973
Data columns (total 6 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   registration_date  353974 non-null  datetime64[ns]
 1   state              353974 non-null  object        
 2   county             353974 non-null  object        
 3   zip_codes          343363 non-null  object        
 4   make               353974 non-null  object        
 5   model              353974 non-null  object        
dtypes: datetime64[ns](1), object(5)
memory usage: 18.9+ MB


In [5]:
# Count the missing values per column
# `zip_codes` has missing values
# `county` marks its missing values with the string "Unknown",
# so they are not counted as missing here
fl_ev_df.isna().sum()

registration_date        0
state                    0
county                   0
zip_codes            10611
make                     0
model                    0
dtype: int64

In [6]:
# Normalise the county spelling `Desoto County` -> `DeSoto County`
# so it matches the county names of the population DataFrame later on...
select_column, select_row, change_value = "county", "Desoto County", "DeSoto County"

# boolean mask of the rows that still carry the old spelling
filter_county = fl_ev_df[select_column].eq(select_row)

# overwrite the county name on the masked rows only
fl_ev_df.loc[filter_county, select_column] = change_value

# show a few of the corrected rows to confirm the change was made
fl_ev_df.loc[fl_ev_df[select_column].eq(change_value)].head()

Unnamed: 0,registration_date,state,county,zip_codes,make,model
919,2018-06-30,FL,DeSoto County,"[34265, 34267]",Porsche,Cayenne S E-Hybrid
1537,2018-06-30,FL,DeSoto County,"[34265, 34267]",Mitsubishi,Outlander Plug In
1536,2018-06-30,FL,DeSoto County,"[34265, 34267]",Chrysler,Pacifica
1535,2018-06-30,FL,DeSoto County,"[34265, 34267]",Volvo,S90 Plug In
1534,2018-06-30,FL,DeSoto County,"[34265, 34267]",Ford,Fusion Energi


In [7]:
# Replace the placeholder county `Unknown` with `NA`
# so these rows are reported as missing by the DataFrame...
select_column, select_row, change_value = "county", "Unknown", pd.NA

# boolean mask of the rows whose county is the placeholder
filter_county = fl_ev_df[select_column].eq(select_row)

# blank out the county on the masked rows only
fl_ev_df.loc[filter_county, select_column] = change_value

# list the rows whose county is now missing to confirm the change was made
fl_ev_df[fl_ev_df["county"].isna()]

Unnamed: 0,registration_date,state,county,zip_codes,make,model
28743,2018-06-30,FL,,,Chevrolet,Volt
28585,2018-06-30,FL,,,Volvo,XC90 Plug In
28584,2018-06-30,FL,,,Volvo,XC90 Plug In
28583,2018-06-30,FL,,,Volvo,XC90 Plug In
28582,2018-06-30,FL,,,Volvo,XC90 Plug In
...,...,...,...,...,...,...
291896,2021-07-19,FL,,,Volvo,XC60 Plug In
291929,2021-07-19,FL,,,Volvo,XC60 Plug In
291922,2021-07-19,FL,,,Volvo,XC90 Plug In
291917,2021-07-19,FL,,,Volvo,XC90 Plug In


In [8]:
# Re-count the missing values per column:
# `county` now reports its missing values as `NA`
fl_ev_df.isna().sum()

registration_date        0
state                    0
county               10611
zip_codes            10611
make                     0
model                    0
dtype: int64

### Florida Population DataFrame (Clean)

In [9]:
# Read the processed Florida population data (parsing `date` as datetimes),
# order the rows by date, then expose the `date` column under the name `year`
fl_pop_df = (
    pd.read_csv(
        Path("../../../../../data/processed_data/fl_population.csv"),
        parse_dates=["date"],
    )
    .sort_values("date")
    .rename(columns={"date": "year"})
)

print("Shape:", fl_pop_df.shape)
fl_pop_df

Shape: (1005, 5)


Unnamed: 0,year,state,county,zip_codes,population
0,2010-01-01,FL,Alachua County,"[32601, 32602, 32603, 32604, 32605, 32606, 326...",247614
36,2010-01-01,FL,Levy County,"[32621, 32625, 32626, 32639, 32644, 32668, 326...",40720
37,2010-01-01,FL,Liberty County,"[32321, 32334, 32335, 32360]",8349
38,2010-01-01,FL,Madison County,"[32059, 32331, 32340, 32341, 32350]",19248
39,2010-01-01,FL,Manatee County,"[34201, 34202, 34203, 34204, 34205, 34206, 342...",323424
...,...,...,...,...,...
957,2024-01-01,FL,Gilchrist County,"[32693, 32619]",16430
956,2024-01-01,FL,Gadsden County,"[32351, 32333, 32352, 32324, 32343, 32332, 32330]",43746
955,2024-01-01,FL,Franklin County,"[32322, 32328, 32320, 32323]",11780
961,2024-01-01,FL,Hardee County,"[33873, 33834, 33890, 33865]",25736


In [10]:
# Print the column dtypes, non-null counts and memory usage
# of the population DataFrame
fl_pop_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1005 entries, 0 to 1004
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   year        1005 non-null   datetime64[ns]
 1   state       1005 non-null   object        
 2   county      1005 non-null   object        
 3   zip_codes   1005 non-null   object        
 4   population  1005 non-null   int64         
dtypes: datetime64[ns](1), int64(1), object(3)
memory usage: 47.1+ KB


In [11]:
# Count the missing values per column of the population DataFrame
fl_pop_df.isna().sum()

year          0
state         0
county        0
zip_codes     0
population    0
dtype: int64

### Prepare EV Registration DataFrame for Merge

In [12]:
# Extract the calendar year of every registration date and store it in a
# `year` column, the key used later to merge with the population DataFrame
registration_years = fl_ev_df["registration_date"].dt.year
fl_ev_df["year"] = registration_years

print("Shape:", fl_ev_df.shape)
fl_ev_df

Shape: (353974, 7)


Unnamed: 0,registration_date,state,county,zip_codes,make,model,year
0,2018-06-30,FL,Miami-Dade County,"[33002, 33010, 33011, 33012, 33013, 33014, 330...",Tesla,Model X,2018
19628,2018-06-30,FL,Walton County,"[32422, 32433, 32434, 32435, 32439, 32454, 324...",BMW,X5,2018
19627,2018-06-30,FL,Walton County,"[32422, 32433, 32434, 32435, 32439, 32454, 324...",Tesla,Model X,2018
19626,2018-06-30,FL,Walton County,"[32422, 32433, 32434, 32435, 32439, 32454, 324...",Tesla,Model X,2018
19625,2018-06-30,FL,Walton County,"[32422, 32433, 32434, 32435, 32439, 32454, 324...",Tesla,Model X,2018
...,...,...,...,...,...,...,...
291822,2021-07-19,FL,Orange County,"[32703, 32704, 32709, 32710, 32712, 32733, 327...",Volvo,XC90 Plug In,2021
291821,2021-07-19,FL,Martin County,"[33455, 33475, 34956, 34957, 34958, 34990, 349...",Volvo,XC90 Plug In,2021
291820,2021-07-19,FL,Collier County,"[34101, 34102, 34103, 34104, 34105, 34106, 341...",Volvo,XC90 Plug In,2021
291818,2021-07-19,FL,Osceola County,"[33848, 34739, 34741, 34742, 34743, 34744, 347...",Volvo,XC90 Plug In,2021


In [13]:
# Count the rows for every (`year`, `state`, `county`, `registration_date`)
# combination to get the number of registrations per date, year, state and county.
# `dropna=False` keeps the groups whose county is missing.
# The counts are stored in a column called `ev_registrations`
group_keys = ['year', 'state', 'county', "registration_date"]
fl_ev_df_group = (
    fl_ev_df
    .groupby(group_keys, dropna=False)
    .size()
    .reset_index(name='ev_registrations')
)

print("Shape:", fl_ev_df_group.shape)
fl_ev_df_group

Shape: (399, 5)


Unnamed: 0,year,state,county,registration_date,ev_registrations
0,2018,FL,Alachua County,2018-06-30,335
1,2018,FL,Alachua County,2018-12-31,431
2,2018,FL,Baker County,2018-06-30,3
3,2018,FL,Baker County,2018-12-31,2
4,2018,FL,Bay County,2018-06-30,141
...,...,...,...,...,...
394,2021,FL,Volusia County,2021-07-19,1458
395,2021,FL,Wakulla County,2021-07-19,38
396,2021,FL,Walton County,2021-07-19,314
397,2021,FL,Washington County,2021-07-19,14


In [14]:
# Count the missing values per column of the grouped DataFrame
fl_ev_df_group.isna().sum()

year                 0
state                0
county               6
registration_date    0
ev_registrations     0
dtype: int64

In [15]:
# Show the grouped rows whose county is missing to verify that the former
# `Unknown` values survived the grouping.
# They must be kept because they carry registration counts
fl_ev_df_group[fl_ev_df_group["county"].isna()]

Unnamed: 0,year,state,county,registration_date,ev_registrations
129,2018,FL,,2018-06-30,709
130,2018,FL,,2018-12-31,1092
197,2019,FL,,2019-12-31,1778
330,2020,FL,,2020-06-30,1769
331,2020,FL,,2020-12-31,2226
398,2021,FL,,2021-07-19,3037


### Prepare Population DataFrame for Merge

In [16]:
# Change the value of the year from DateTime to int ready for merging with
# the integer `year` column of the EV registrations.
# The conversion is guarded so that re-running this cell is harmless: once the
# column holds integers it no longer has a `.dt` accessor, and an unguarded
# conversion would raise an AttributeError on the second run.
if pd.api.types.is_datetime64_any_dtype(fl_pop_df['year']):
    fl_pop_df['year'] = fl_pop_df['year'].dt.year

print("Shape:", fl_pop_df.shape)
fl_pop_df

Shape: (1005, 5)


Unnamed: 0,year,state,county,zip_codes,population
0,2010,FL,Alachua County,"[32601, 32602, 32603, 32604, 32605, 32606, 326...",247614
36,2010,FL,Levy County,"[32621, 32625, 32626, 32639, 32644, 32668, 326...",40720
37,2010,FL,Liberty County,"[32321, 32334, 32335, 32360]",8349
38,2010,FL,Madison County,"[32059, 32331, 32340, 32341, 32350]",19248
39,2010,FL,Manatee County,"[34201, 34202, 34203, 34204, 34205, 34206, 342...",323424
...,...,...,...,...,...
957,2024,FL,Gilchrist County,"[32693, 32619]",16430
956,2024,FL,Gadsden County,"[32351, 32333, 32352, 32324, 32343, 32332, 32330]",43746
955,2024,FL,Franklin County,"[32322, 32328, 32320, 32323]",11780
961,2024,FL,Hardee County,"[33873, 33834, 33890, 33865]",25736


In [17]:
# Collect the distinct years that occur in the grouped EV registrations.
# They are used to restrict the population DataFrame
# to the years that the EV registrations DataFrame actually covers

select_years = pd.unique(fl_ev_df_group['year'])
select_years

array([2018, 2019, 2020, 2021])

In [18]:
# Boolean mask of the population rows whose year also occurs in the EV registrations
filter_years = fl_pop_df['year'].isin(select_years)

# Keep only the masked rows in a new DataFrame and renumber its index from 0
fl_pop_filtered = fl_pop_df[filter_years].reset_index(drop=True)

fl_pop_filtered

Unnamed: 0,year,state,county,zip_codes,population
0,2018,FL,Nassau County,"[32009, 32011, 32034, 32035, 32041, 32046, 32097]",85936
1,2018,FL,Osceola County,"[33848, 34739, 34741, 34742, 34743, 34744, 347...",368456
2,2018,FL,Orange County,"[32703, 32704, 32709, 32710, 32712, 32733, 327...",1381540
3,2018,FL,Okeechobee County,"[34972, 34973, 34974]",41654
4,2018,FL,Okaloosa County,"[32531, 32536, 32537, 32539, 32540, 32541, 325...",206934
...,...,...,...,...,...
263,2021,FL,Gulf County,"[32456, 32457, 32465]",14462
264,2021,FL,Glades County,"[33471, 33944]",12266
265,2021,FL,Gilchrist County,"[32619, 32693]",18316
266,2021,FL,Gadsden County,"[32324, 32330, 32332, 32333, 32343, 32351, 323...",43599


### Merge the Population DataFrame with EV Registration DataFrame

In [20]:
# Outer-join the filtered population rows with the grouped EV registrations
# on `year`, `state` and `county`, keeping rows that exist in only one of them.
# The result holds the population and the number of registrations per county,
# which is what is needed to look at the EV adoption rate per county in Florida.
# The merged rows are then ordered by year, state and county
merge_keys = ['year', 'state', 'county']
fl_ev_adoption_df = (
    fl_pop_filtered
    .merge(fl_ev_df_group, how='outer', on=merge_keys)
    .sort_values(merge_keys)
)

print("Shape:", fl_ev_adoption_df.shape)
fl_ev_adoption_df

Shape: (403, 7)


Unnamed: 0,year,state,county,zip_codes,population,registration_date,ev_registrations
0,2018,FL,Alachua County,"[32601, 32602, 32603, 32604, 32605, 32606, 326...",268851.0,2018-06-30,335.0
1,2018,FL,Alachua County,"[32601, 32602, 32603, 32604, 32605, 32606, 326...",268851.0,2018-12-31,431.0
2,2018,FL,Baker County,"[32040, 32063, 32072, 32087]",28353.0,2018-06-30,3.0
3,2018,FL,Baker County,"[32040, 32063, 32072, 32087]",28353.0,2018-12-31,2.0
4,2018,FL,Bay County,"[32401, 32402, 32403, 32404, 32405, 32406, 324...",186240.0,2018-06-30,141.0
...,...,...,...,...,...,...,...
398,2021,FL,Volusia County,"[32105, 32114, 32115, 32116, 32117, 32118, 321...",566368.0,2021-07-19,1458.0
399,2021,FL,Wakulla County,"[32326, 32327, 32346, 32355, 32358]",34270.0,2021-07-19,38.0
400,2021,FL,Walton County,"[32422, 32433, 32434, 32435, 32439, 32454, 324...",80021.0,2021-07-19,314.0
401,2021,FL,Washington County,"[32427, 32428, 32437, 32462, 32463]",24867.0,2021-07-19,14.0


In [21]:
# Count the missing values per column after the merge, to decide which
# `nan` values have to be kept and which ones can be removed
fl_ev_adoption_df.isna().sum()

year                 0
state                0
county               6
zip_codes            6
population           6
registration_date    4
ev_registrations     4
dtype: int64

In [22]:
# List the rows that have no value in the `ev_registrations` column
# to see whether they are necessary
fl_ev_adoption_df[fl_ev_adoption_df["ev_registrations"].isna()]

Unnamed: 0,year,state,county,zip_codes,population,registration_date,ev_registrations
63,2018,FL,Lafayette County,"[32013, 32066]",8691.0,NaT,
164,2019,FL,Lafayette County,"[32013, 32066]",8422.0,NaT,
264,2020,FL,Lafayette County,"[32013, 32066]",8218.0,NaT,
367,2021,FL,Lafayette County,"[32013, 32066]",7952.0,NaT,


In [23]:
# Remove, in place, the unnecessary rows that have `nan` values
# present in the `ev_registrations` column...
fl_ev_adoption_df.dropna(subset=['ev_registrations'], inplace=True)

# Confirm the removal: `ev_registrations` and `registration_date`
# should now report zero missing values
fl_ev_adoption_df.isna().sum()

year                 0
state                0
county               6
zip_codes            6
population           6
registration_date    0
ev_registrations     0
dtype: int64

In [24]:
# List the rows that have no value in the `zip_codes` column to see whether they are necessary.
# Confirmed that these rows have to be kept: they still hold registration counts
fl_ev_adoption_df[fl_ev_adoption_df["zip_codes"].isna()]

Unnamed: 0,year,state,county,zip_codes,population,registration_date,ev_registrations
130,2018,FL,,,,2018-06-30,709.0
131,2018,FL,,,,2018-12-31,1092.0
199,2019,FL,,,,2019-12-31,1778.0
333,2020,FL,,,,2020-06-30,1769.0
334,2020,FL,,,,2020-12-31,2226.0
402,2021,FL,,,,2021-07-19,3037.0


In [25]:
# Order the rows by year, state, county and registration date
sort_keys = ['year', 'state', 'county', 'registration_date']
fl_ev_adoption_df = fl_ev_adoption_df.sort_values(sort_keys)

print("Shape:", fl_ev_adoption_df.shape)
fl_ev_adoption_df

Shape: (399, 7)


Unnamed: 0,year,state,county,zip_codes,population,registration_date,ev_registrations
0,2018,FL,Alachua County,"[32601, 32602, 32603, 32604, 32605, 32606, 326...",268851.0,2018-06-30,335.0
1,2018,FL,Alachua County,"[32601, 32602, 32603, 32604, 32605, 32606, 326...",268851.0,2018-12-31,431.0
2,2018,FL,Baker County,"[32040, 32063, 32072, 32087]",28353.0,2018-06-30,3.0
3,2018,FL,Baker County,"[32040, 32063, 32072, 32087]",28353.0,2018-12-31,2.0
4,2018,FL,Bay County,"[32401, 32402, 32403, 32404, 32405, 32406, 324...",186240.0,2018-06-30,141.0
...,...,...,...,...,...,...,...
398,2021,FL,Volusia County,"[32105, 32114, 32115, 32116, 32117, 32118, 321...",566368.0,2021-07-19,1458.0
399,2021,FL,Wakulla County,"[32326, 32327, 32346, 32355, 32358]",34270.0,2021-07-19,38.0
400,2021,FL,Walton County,"[32422, 32433, 32434, 32435, 32439, 32454, 324...",80021.0,2021-07-19,314.0
401,2021,FL,Washington County,"[32427, 32428, 32437, 32462, 32463]",24867.0,2021-07-19,14.0


In [26]:
# Print the column dtypes, non-null counts and memory usage
# of the merged DataFrame
fl_ev_adoption_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 399 entries, 0 to 402
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   year               399 non-null    int32         
 1   state              399 non-null    object        
 2   county             393 non-null    object        
 3   zip_codes          393 non-null    object        
 4   population         393 non-null    float64       
 5   registration_date  399 non-null    datetime64[ns]
 6   ev_registrations   399 non-null    float64       
dtypes: datetime64[ns](1), float64(2), int32(1), object(3)
memory usage: 23.4+ KB


### Group EV Adoption by Year

In [27]:
# Total `ev_registrations` of every (`year`, `county`) pair, broadcast back onto
# each row; `dropna=False` keeps the rows whose county is missing.
# NOTE(review): this adds up the counts of all registration dates within a year.
# If the dated rows are cumulative snapshots rather than new registrations,
# the yearly total double counts - confirm against the data source.
yearly_totals = (
    fl_ev_adoption_df
    .groupby(['year', 'county'], dropna=False)['ev_registrations']
    .transform('sum')
)

# Build the yearly DataFrame as a new object so the original DataFrame is not overwritten:
# - replace the per-date counts by the yearly totals
# - keep the first row of every (`year`, `county`) pair
# - drop the `registration_date` column, which is no longer needed
fl_ev_adopt_year = (
    fl_ev_adoption_df
    .assign(ev_registrations=yearly_totals)
    .drop_duplicates(subset=['year', 'county'])
    .drop(columns=['registration_date'])
)

print("Shape:", fl_ev_adopt_year.shape)
fl_ev_adopt_year

Shape: (268, 6)


Unnamed: 0,year,state,county,zip_codes,population,ev_registrations
0,2018,FL,Alachua County,"[32601, 32602, 32603, 32604, 32605, 32606, 326...",268851.0,766.0
2,2018,FL,Baker County,"[32040, 32063, 32072, 32087]",28353.0,5.0
4,2018,FL,Bay County,"[32401, 32402, 32403, 32404, 32405, 32406, 324...",186240.0,300.0
6,2018,FL,Bradford County,"[32042, 32044, 32058, 32091, 32622]",27752.0,22.0
8,2018,FL,Brevard County,"[32754, 32775, 32780, 32781, 32782, 32783, 327...",595203.0,1881.0
...,...,...,...,...,...,...
398,2021,FL,Volusia County,"[32105, 32114, 32115, 32116, 32117, 32118, 321...",566368.0,1458.0
399,2021,FL,Wakulla County,"[32326, 32327, 32346, 32355, 32358]",34270.0,38.0
400,2021,FL,Walton County,"[32422, 32433, 32434, 32435, 32439, 32454, 324...",80021.0,314.0
401,2021,FL,Washington County,"[32427, 32428, 32437, 32462, 32463]",24867.0,14.0


In [28]:
# Write the yearly registrations/population DataFrame
# to the processed data directory as a CSV file
file_name = "fl_ev_registration_population.csv"
file_path = Path("../../../../../data/processed_data") / file_name
save_csv_file(fl_ev_adopt_year, file_path)

File saved as `fl_ev_registration_population.csv`
