In [1]:
from pathlib import Path
import datetime as dt
import pandas as pd

In [2]:
# Create a function to save the DataFrames to CSV files
def save_csv_file(df, file_path):
    """
    Save a DataFrame to a CSV file at the specified file path.

    An existing file at `file_path` is overwritten. If the parent directory
    does not exist, an error message is printed and nothing is written.

    Parameters:
    - df: DataFrame to save
    - file_path: Path (or path string) to save the CSV file

    Returns:
    - None
    """

    # Accept plain strings as well as Path objects
    file_path = Path(file_path)

    # Check if the parent directory exists
    if not file_path.parent.exists():
        print(f"Error: The directory `{file_path.parent}` does not exist.")
        return

    # The old file is intentionally NOT deleted first: `to_csv` replaces its
    # contents anyway, and deleting up front would lose the previous data if
    # the write failed part-way through
    if file_path.exists():
        print(f"File `{file_path.name}` already exists. Overwriting file.")

    # Save the DataFrame to the specified file path (without the index column)
    df.to_csv(file_path, index=False)
    print(f"File saved as `{file_path.name}`")

### Texas EV Registrations DataFrame (Clean)

In [3]:
# Load the EV registrations data for Texas (parsing `registration_date`
# as a datetime) and order the rows by registration date
tx_ev_df = pd.read_csv(
    Path("../../../../../data/processed_data/tx_ev_registrations.csv"),
    parse_dates=["registration_date"],
).sort_values("registration_date")

print("Shape:", tx_ev_df.shape)
tx_ev_df

Shape: (2274866, 7)


Unnamed: 0,registration_date,state,county,zip_code,make,model,year
619961,2017-07-01,TX,Travis County,78721,NISSAN,LEAF,2013
345572,2017-07-01,TX,Harris County,77008,CHEVROLET,VOLT,2012
1936284,2017-07-01,TX,Comal County,78130,NISSAN,LEAF,2015
2094853,2017-07-01,TX,Dallas County,75254,CHEVROLET,VOLT,2017
847936,2017-07-01,TX,Hidalgo County,78501,TESLA,MODEL X,2017
...,...,...,...,...,...,...,...
118603,2024-07-01,TX,Travis County,78739,TESLA,MODEL 3,2019
1411725,2024-07-01,TX,Denton County,75007,TESLA,MODEL Y,2023
115202,2024-07-01,TX,Denton County,76226,TESLA,MODEL 3,2023
922701,2024-07-01,TX,Nueces County,78405,FORD,FOCUS,2017


In [4]:
# Inspect the column data types and memory usage of the EV registrations DataFrame
tx_ev_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2274866 entries, 619961 to 74877
Data columns (total 7 columns):
 #   Column             Dtype         
---  ------             -----         
 0   registration_date  datetime64[ns]
 1   state              object        
 2   county             object        
 3   zip_code           object        
 4   make               object        
 5   model              object        
 6   year               int64         
dtypes: datetime64[ns](1), int64(1), object(5)
memory usage: 138.8+ MB


In [5]:
# Check for missing values per column
# `county` is the column that has missing (`NaN`) values
# NOTE(review): rows with a missing `county` appear to carry the placeholder
# "Error" in `zip_code`, so `zip_code` is not counted as missing - confirm upstream
tx_ev_df.isnull().sum()

registration_date       0
state                   0
county               4686
zip_code                0
make                    0
model                   0
year                    0
dtype: int64

In [6]:
# Boolean mask of the registrations whose `county` is missing
filter_rows = tx_ev_df["county"].isnull()

# Display those rows to see what the incomplete records look like
tx_ev_df.loc[filter_rows]

Unnamed: 0,registration_date,state,county,zip_code,make,model,year
2269181,2017-07-01,TX,,Error,FORD,FUSION,2014
1626947,2017-07-01,TX,,Error,TESLA,MODEL X,2017
474055,2017-08-01,TX,,Error,TOYOTA,PRIUS PRIME,2017
1944968,2017-08-01,TX,,Error,BMW,I3,2014
1964946,2017-08-01,TX,,Error,BMW,330E,2017
...,...,...,...,...,...,...,...
555978,2024-06-01,TX,,Error,HONDA,CLARITY,2018
1840462,2024-06-01,TX,,Error,HONDA,CLARITY,2018
338853,2024-06-01,TX,,Error,CHEVROLET,VOLT,2013
1498072,2024-06-01,TX,,Error,TESLA,MODEL S,2019


### Texas Population DataFrame (Clean)

In [7]:
# Load the population data for Texas (parsing `date` as a datetime),
# sort it by date, then expose the `date` column under the name `year`
tx_pop_df = (
    pd.read_csv(
        Path("../../../../../data/processed_data/tx_population.csv"),
        parse_dates=["date"],
    )
    .sort_values("date")
    .rename(columns={"date": "year"})
)

print("Shape:", tx_pop_df.shape)
tx_pop_df

Shape: (3811, 5)


Unnamed: 0,year,state,county,zip_codes,population
0,2010-01-01,TX,Anderson County,"[75763, 75779, 75801, 75802, 75803, 75832, 758...",58493
161,2010-01-01,TX,McMullen County,"[78007, 78072]",711
162,2010-01-01,TX,Medina County,"[78009, 78016, 78039, 78056, 78059, 78066, 788...",46114
163,2010-01-01,TX,Menard County,"[76841, 76848, 76859]",2230
164,2010-01-01,TX,Midland County,"[79701, 79702, 79703, 79704, 79705, 79706, 797...",136974
...,...,...,...,...,...
3646,2024-01-01,TX,Grayson County,"[75092, 75090, 75020, 76273, 75495, 75021, 750...",141272
3647,2024-01-01,TX,Gregg County,"[75605, 75604, 75662, 75601, 75647, 75603, 75693]",130580
3648,2024-01-01,TX,Grimes County,"[77868, 77363, 77831, 77861, 77830, 77876]",29742
3682,2024-01-01,TX,Jones County,"[79601, 79553, 79525, 79501, 79520, 79503]",37863


In [8]:
# Inspect the non-null counts and column data types of the population DataFrame
tx_pop_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3811 entries, 0 to 3810
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   year        3811 non-null   datetime64[ns]
 1   state       3811 non-null   object        
 2   county      3811 non-null   object        
 3   zip_codes   3811 non-null   object        
 4   population  3811 non-null   int64         
dtypes: datetime64[ns](1), int64(1), object(3)
memory usage: 178.6+ KB


In [9]:
# Count the missing values in each column of the population DataFrame
tx_pop_df.isnull().sum()

year          0
state         0
county        0
zip_codes     0
population    0
dtype: int64

### Prepare EV Registration DataFrame for Merge

In [10]:
# Count the registrations for every (`year`, `state`, `county`, `registration_date`)
# combination and store the count in a column called `ev_registrations`
# `dropna=False` keeps the groups whose `county` is missing
tx_ev_df_group = (
    tx_ev_df
    .groupby(["year", "state", "county", "registration_date"], dropna=False)
    .size()
    .reset_index(name="ev_registrations")
)

print("Shape:", tx_ev_df_group.shape)
tx_ev_df_group

Shape: (46278, 5)


Unnamed: 0,year,state,county,registration_date,ev_registrations
0,1987,TX,Harris County,2019-01-01,1
1,1988,TX,Jefferson County,2018-11-01,2
2,1993,TX,Dallas County,2022-01-01,1
3,1993,TX,Dallas County,2022-12-01,5
4,1993,TX,Dallas County,2023-01-01,8
...,...,...,...,...,...
46273,2025,TX,Travis County,2024-05-01,4
46274,2025,TX,Travis County,2024-06-01,1
46275,2025,TX,Williamson County,2024-05-01,1
46276,2025,TX,Williamson County,2024-06-01,1


In [11]:
# Check for missing values in the grouped DataFrame
# (groups with a missing `county` are retained because the groupby used `dropna=False`)
tx_ev_df_group.isnull().sum()

year                   0
state                  0
county               499
registration_date      0
ev_registrations       0
dtype: int64

In [12]:
# Inspect whether the groups with a missing (`NaN`) county are kept after grouping
# Important to keep as they still carry registration counts
tx_ev_df_group.loc[tx_ev_df_group["county"].isna(), :]

Unnamed: 0,year,state,county,registration_date,ev_registrations
371,2008,TX,,2022-01-01,1
372,2008,TX,,2023-01-01,6
373,2008,TX,,2024-01-01,6
433,2009,TX,,2021-10-01,2
584,2010,TX,,2020-12-01,1
...,...,...,...,...,...
46243,2024,TX,,2024-03-01,9
46244,2024,TX,,2024-04-01,3
46245,2024,TX,,2024-05-01,5
46246,2024,TX,,2024-06-01,1


### Prepare Population DataFrame for Merge

In [13]:
# Change the value of the year from DateTime to int ready for merging
# The dtype guard keeps this cell re-runnable: after the first run `year` is
# already an integer column and calling `.dt` on it would raise an AttributeError
if pd.api.types.is_datetime64_any_dtype(tx_pop_df['year']):
    tx_pop_df['year'] = tx_pop_df['year'].dt.year

print("Shape:", tx_pop_df.shape)
tx_pop_df

Shape: (3811, 5)


Unnamed: 0,year,state,county,zip_codes,population
0,2010,TX,Anderson County,"[75763, 75779, 75801, 75802, 75803, 75832, 758...",58493
161,2010,TX,McMullen County,"[78007, 78072]",711
162,2010,TX,Medina County,"[78009, 78016, 78039, 78056, 78059, 78066, 788...",46114
163,2010,TX,Menard County,"[76841, 76848, 76859]",2230
164,2010,TX,Midland County,"[79701, 79702, 79703, 79704, 79705, 79706, 797...",136974
...,...,...,...,...,...
3646,2024,TX,Grayson County,"[75092, 75090, 75020, 76273, 75495, 75021, 750...",141272
3647,2024,TX,Gregg County,"[75605, 75604, 75662, 75601, 75647, 75603, 75693]",130580
3648,2024,TX,Grimes County,"[77868, 77363, 77831, 77861, 77830, 77876]",29742
3682,2024,TX,Jones County,"[79601, 79553, 79525, 79501, 79520, 79503]",37863


In [14]:
# Collect the unique `year` values found in the grouped EV registrations DataFrame
# (returned as a NumPy array)
# This will be used to filter the population DataFrame
# and keep only the years that are present in the EV registrations DataFrame

select_years = tx_ev_df_group['year'].unique()
select_years

array([1987, 1988, 1993, 1994, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
       2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015,
       2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2025],
      dtype=int64)

In [15]:
# Boolean mask: population rows whose year also occurs in the EV registrations data
filter_years = tx_pop_df['year'].isin(select_years)

# Build the filtered population DataFrame and renumber its rows from 0
tx_pop_filtered = tx_pop_df.loc[filter_years].reset_index(drop=True)

tx_pop_filtered

Unnamed: 0,year,state,county,zip_codes,population
0,2010,TX,Anderson County,"[75763, 75779, 75801, 75802, 75803, 75832, 758...",58493
1,2010,TX,McMullen County,"[78007, 78072]",711
2,2010,TX,Medina County,"[78009, 78016, 78039, 78056, 78059, 78066, 788...",46114
3,2010,TX,Menard County,"[76841, 76848, 76859]",2230
4,2010,TX,Midland County,"[79701, 79702, 79703, 79704, 79705, 79706, 797...",136974
...,...,...,...,...,...
3806,2024,TX,Grayson County,"[75092, 75090, 75020, 76273, 75495, 75021, 750...",141272
3807,2024,TX,Gregg County,"[75605, 75604, 75662, 75601, 75647, 75603, 75693]",130580
3808,2024,TX,Grimes County,"[77868, 77363, 77831, 77861, 77830, 77876]",29742
3809,2024,TX,Jones County,"[79601, 79553, 79525, 79501, 79520, 79503]",37863


### Merge the Population DataFrame with EV Registration DataFrame

# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

In [16]:
# Outer-merge the filtered population DataFrame with the grouped EV registrations
# on `year`, `state` and `county`, then sort by those same keys
# This puts the number of registrations and the population of each Texas county
# side by side so the EV adoption rate per county can be examined
# NOTE(review): `year` in the registrations data spans 1987-2025 while
# `registration_date` spans 2017-2024, so it is presumably the vehicle model
# year - confirm it is the intended key for joining with the population year
tx_ev_adoption_df = (
    tx_pop_filtered
    .merge(tx_ev_df_group, how='outer', on=['year', 'state', 'county'])
    .sort_values(['year', 'state', 'county'])
)

print("Shape:", tx_ev_adoption_df.shape)
tx_ev_adoption_df

Shape: (47671, 7)


Unnamed: 0,year,state,county,zip_codes,population,registration_date,ev_registrations
0,1987,TX,Harris County,,,2019-01-01,1.0
1,1988,TX,Jefferson County,,,2018-11-01,2.0
2,1993,TX,Dallas County,,,2022-01-01,1.0
3,1993,TX,Dallas County,,,2022-12-01,5.0
4,1993,TX,Dallas County,,,2023-01-01,8.0
...,...,...,...,...,...,...,...
47666,2025,TX,Travis County,,,2024-05-01,4.0
47667,2025,TX,Travis County,,,2024-06-01,1.0
47668,2025,TX,Williamson County,,,2024-05-01,1.0
47669,2025,TX,Williamson County,,,2024-06-01,1.0


In [17]:
# Count the null values per column after the outer merge, to decide which
# `nan` rows must be kept and which can be removed
tx_ev_adoption_df.isnull().sum()

year                    0
state                   0
county                499
zip_codes             964
population            964
registration_date    1393
ev_registrations     1393
dtype: int64

In [18]:
# Show the merged rows with no `zip_codes`, i.e. the rows that did not
# receive population data from `tx_pop_filtered`
tx_ev_adoption_df.loc[tx_ev_adoption_df["zip_codes"].isnull()]

Unnamed: 0,year,state,county,zip_codes,population,registration_date,ev_registrations
0,1987,TX,Harris County,,,2019-01-01,1.0
1,1988,TX,Jefferson County,,,2018-11-01,2.0
2,1993,TX,Dallas County,,,2022-01-01,1.0
3,1993,TX,Dallas County,,,2022-12-01,5.0
4,1993,TX,Dallas County,,,2023-01-01,8.0
...,...,...,...,...,...,...,...
47666,2025,TX,Travis County,,,2024-05-01,4.0
47667,2025,TX,Travis County,,,2024-06-01,1.0
47668,2025,TX,Williamson County,,,2024-05-01,1.0
47669,2025,TX,Williamson County,,,2024-06-01,1.0


In [19]:
# Remove the unnecessary rows that have `nan` values
# present in the `ev_registrations` column, i.e. the population rows that
# the outer merge left without any matching registration group
tx_ev_adoption_df.dropna(subset=['ev_registrations'], inplace=True)

# Confirm the removal of the unnecessary rows
tx_ev_adoption_df.isna().sum()

year                   0
state                  0
county               499
zip_codes            964
population           964
registration_date      0
ev_registrations       0
dtype: int64

In [20]:
# Check the rows with missing values in the `zip_codes` column to see if they are necessary
# These rows are kept: they have no population data but still carry `ev_registrations` counts
tx_ev_adoption_df.loc[tx_ev_adoption_df["zip_codes"].isnull(), :]

Unnamed: 0,year,state,county,zip_codes,population,registration_date,ev_registrations
0,1987,TX,Harris County,,,2019-01-01,1.0
1,1988,TX,Jefferson County,,,2018-11-01,2.0
2,1993,TX,Dallas County,,,2022-01-01,1.0
3,1993,TX,Dallas County,,,2022-12-01,5.0
4,1993,TX,Dallas County,,,2023-01-01,8.0
...,...,...,...,...,...,...,...
47666,2025,TX,Travis County,,,2024-05-01,4.0
47667,2025,TX,Travis County,,,2024-06-01,1.0
47668,2025,TX,Williamson County,,,2024-05-01,1.0
47669,2025,TX,Williamson County,,,2024-06-01,1.0


In [21]:
# Order the rows by year, state, county and registration date
tx_ev_adoption_df = tx_ev_adoption_df.sort_values(
    ['year', 'state', 'county', 'registration_date']
)

print("Shape:", tx_ev_adoption_df.shape)
tx_ev_adoption_df

Shape: (46278, 7)


Unnamed: 0,year,state,county,zip_codes,population,registration_date,ev_registrations
0,1987,TX,Harris County,,,2019-01-01,1.0
1,1988,TX,Jefferson County,,,2018-11-01,2.0
2,1993,TX,Dallas County,,,2022-01-01,1.0
3,1993,TX,Dallas County,,,2022-12-01,5.0
4,1993,TX,Dallas County,,,2023-01-01,8.0
...,...,...,...,...,...,...,...
47666,2025,TX,Travis County,,,2024-05-01,4.0
47667,2025,TX,Travis County,,,2024-06-01,1.0
47668,2025,TX,Williamson County,,,2024-05-01,1.0
47669,2025,TX,Williamson County,,,2024-06-01,1.0


In [22]:
# Inspect the non-null counts and data types of the merged DataFrame
tx_ev_adoption_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 46278 entries, 0 to 47670
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   year               46278 non-null  int64         
 1   state              46278 non-null  object        
 2   county             45779 non-null  object        
 3   zip_codes          45314 non-null  object        
 4   population         45314 non-null  float64       
 5   registration_date  46278 non-null  datetime64[ns]
 6   ev_registrations   46278 non-null  float64       
dtypes: datetime64[ns](1), float64(2), int64(1), object(3)
memory usage: 2.8+ MB


### Group EV Adoption by Year

In [23]:
# Total number of registrations per `year` and `county`, aligned row by row with
# the merged DataFrame (`dropna=False` keeps the rows with a missing county)
yearly_county_totals = (
    tx_ev_adoption_df
    .groupby(['year', 'county'], dropna=False)['ev_registrations']
    .transform('sum')
)

# Build a new yearly DataFrame per county in Texas without touching the original:
# replace the per-date counts with the yearly totals, keep the first row of every
# (`year`, `county`) pair and drop the no longer needed `registration_date` column
tx_ev_adopt_year = (
    tx_ev_adoption_df
    .assign(ev_registrations=yearly_county_totals)
    .drop_duplicates(subset=['year', 'county'])
    .drop(columns=['registration_date'])
)

print("Shape:", tx_ev_adopt_year.shape)
tx_ev_adopt_year

Shape: (2625, 6)


Unnamed: 0,year,state,county,zip_codes,population,ev_registrations
0,1987,TX,Harris County,,,1.0
1,1988,TX,Jefferson County,,,2.0
2,1993,TX,Dallas County,,,32.0
8,1994,TX,Dallas County,,,11.0
10,1998,TX,Cameron County,,,1.0
...,...,...,...,...,...,...
47662,2025,TX,Tarrant County,,,6.0
47664,2025,TX,Taylor County,,,1.0
47665,2025,TX,Travis County,,,7.0
47668,2025,TX,Williamson County,,,2.0


In [24]:
# Count the missing values per column in the yearly DataFrame
tx_ev_adopt_year.isnull().sum()

year                  0
state                 0
county               18
zip_codes           207
population          207
ev_registrations      0
dtype: int64

In [25]:
# Show the yearly rows whose `county` is missing
tx_ev_adopt_year.loc[tx_ev_adopt_year["county"].isnull(), :]

Unnamed: 0,year,state,county,zip_codes,population,ev_registrations
371,2008,TX,,,,13.0
433,2009,TX,,,,2.0
810,2010,TX,,,,14.0
2360,2011,TX,,,,18.0
4591,2012,TX,,,,38.0
7758,2013,TX,,,,91.0
11172,2014,TX,,,,125.0
14568,2015,TX,,,,106.0
18149,2016,TX,,,,202.0
22743,2017,TX,,,,262.0


In [26]:
# Show the yearly rows that have no population data (missing `zip_codes`)
tx_ev_adopt_year.loc[tx_ev_adopt_year["zip_codes"].isnull(), :]

Unnamed: 0,year,state,county,zip_codes,population,ev_registrations
0,1987,TX,Harris County,,,1.0
1,1988,TX,Jefferson County,,,2.0
2,1993,TX,Dallas County,,,32.0
8,1994,TX,Dallas County,,,11.0
10,1998,TX,Cameron County,,,1.0
...,...,...,...,...,...,...
47662,2025,TX,Tarrant County,,,6.0
47664,2025,TX,Taylor County,,,1.0
47665,2025,TX,Travis County,,,7.0
47668,2025,TX,Williamson County,,,2.0


In [27]:
# Write the yearly DataFrame to a CSV file in the processed data folder
file_name = "tx_ev_registration_population.csv"
file_path = Path("../../../../../data/processed_data") / file_name
save_csv_file(tx_ev_adopt_year, file_path)

File `tx_ev_registration_population.csv` already exists. Overwriting file.
File saved as `tx_ev_registration_population.csv`
