In [1]:
from pathlib import Path
import pandas as pd
import numpy as np

In [2]:
# Create a function to save the DataFrames to CSV files
def save_csv_file(df, file_path):
    """ 
    Save a DataFrame to a CSV file at the specified file path.

    Parameters:
    - df: DataFrame to save
    - file_path: Path to save the CSV file
    """
    
    # Check if the parent directory exists
    if not file_path.parent.exists():
        print(f"Error: The directory `{file_path.parent}` does not exist.")
        return
    
    if file_path.exists():
        print(f"File `{file_path.name}` already exists. Overwriting file.")
        file_path.unlink()
    
    # Save the DataFrame to the specified file path
    df.to_csv(file_path, index=False)
    print(f"File saved as `{file_path.name}`")

In [3]:
# Load the TX data
tx_df = pd.read_csv(
    Path("../../../../../data/processed_data/tx_ev_main_dataset.csv"),
)

# Display the data
print("Shape:", tx_df.shape)
tx_df.head()

Shape: (1158, 8)


Unnamed: 0,year,state,county,zip_codes,population,ev_registrations,cumulative_ev_stations,median_income
0,2017,TX,Anderson County,"[75763, 75779, 75801, 75802, 75803, 75832, 758...",58175.0,1,0,42313
1,2017,TX,Andrews County,[79714],17603.0,1,0,70753
2,2017,TX,Angelina County,"[75901, 75902, 75903, 75904, 75915, 75941, 759...",87572.0,2,0,46472
3,2017,TX,Aransas County,"[78358, 78381, 78382]",25392.0,1,2,44601
4,2017,TX,Archer County,"[76351, 76366, 76370, 76379, 76389]",8783.0,2,0,63192


In [4]:
# Load the FL data
fl_df = pd.read_csv(
    Path("../../../../../data/processed_data/fl_ev_main_dataset.csv"),
)

# Display the data
print("Shape:", fl_df.shape)
fl_df.head()

Shape: (268, 8)


Unnamed: 0,year,state,county,zip_codes,population,ev_registrations,cumulative_ev_stations,median_income
0,2018,FL,Alachua County,"[32601, 32602, 32603, 32604, 32605, 32606, 326...",268851.0,766,12,49078
1,2018,FL,Baker County,"[32040, 32063, 32072, 32087]",28353.0,5,0,61769
2,2018,FL,Bay County,"[32401, 32402, 32403, 32404, 32405, 32406, 324...",186240.0,300,0,51829
3,2018,FL,Bradford County,"[32042, 32044, 32058, 32091, 32622]",27752.0,22,0,46197
4,2018,FL,Brevard County,"[32754, 32775, 32780, 32781, 32782, 32783, 327...",595203.0,1881,21,54359


In [5]:
# Load the CA data
ca_df = pd.read_csv(
    Path("../../../../../data/processed_data/ca_ev_main_dataset.csv"),
)

# Display the data
print("Shape:", ca_df.shape)
ca_df.head()

Shape: (600, 8)


Unnamed: 0,year,state,county,zip_codes,population,ev_registrations,cumulative_ev_stations,median_income
0,2010,CA,Alameda County,"[94501, 94502, 94536, 94537, 94538, 94539, 945...",1512986.0,20,0,69384
1,2010,CA,Amador County,"[95601, 95629, 95640, 95642, 95654, 95665, 956...",37886.0,1,1,54758
2,2010,CA,Contra Costa County,"[94505, 94506, 94507, 94509, 94511, 94513, 945...",1052540.0,10,0,78385
3,2010,CA,Fresno County,"[93210, 93234, 93242, 93602, 93605, 93606, 936...",932039.0,2,0,46430
4,2010,CA,Humboldt County,"[95501, 95502, 95503, 95511, 95514, 95518, 955...",135009.0,2,0,40089


In [6]:
# Concatenate the dataframes into one to make the data easier to work with
df = pd.concat([tx_df, fl_df, ca_df])

# Sort the data by year, state, and county
df.sort_values(by=["year", "state", "county"], inplace=True)

# Display the data
print("Shape:", df.shape)
df.head()

Shape: (2026, 8)


Unnamed: 0,year,state,county,zip_codes,population,ev_registrations,cumulative_ev_stations,median_income
0,2010,CA,Alameda County,"[94501, 94502, 94536, 94537, 94538, 94539, 945...",1512986.0,20,0,69384
1,2010,CA,Amador County,"[95601, 95629, 95640, 95642, 95654, 95665, 956...",37886.0,1,1,54758
2,2010,CA,Contra Costa County,"[94505, 94506, 94507, 94509, 94511, 94513, 945...",1052540.0,10,0,78385
3,2010,CA,Fresno County,"[93210, 93234, 93242, 93602, 93605, 93606, 936...",932039.0,2,0,46430
4,2010,CA,Humboldt County,"[95501, 95502, 95503, 95511, 95514, 95518, 955...",135009.0,2,0,40089


In [7]:
# Calculate EV adoption rate by county : EV Adoption Rate = EV Registrations / Population
# Remember it will be decimal value. Later we can multiply by 100 to get percentage when plotting
# We will not multiply by 100 here to keep the data consistent with the other columns
df["ev_adoption_rate"] = df["ev_registrations"] / df["population"]

# Display the data
print("Shape:", df.shape)
df.head()

Shape: (2026, 9)


Unnamed: 0,year,state,county,zip_codes,population,ev_registrations,cumulative_ev_stations,median_income,ev_adoption_rate
0,2010,CA,Alameda County,"[94501, 94502, 94536, 94537, 94538, 94539, 945...",1512986.0,20,0,69384,1.3e-05
1,2010,CA,Amador County,"[95601, 95629, 95640, 95642, 95654, 95665, 956...",37886.0,1,1,54758,2.6e-05
2,2010,CA,Contra Costa County,"[94505, 94506, 94507, 94509, 94511, 94513, 945...",1052540.0,10,0,78385,1e-05
3,2010,CA,Fresno County,"[93210, 93234, 93242, 93602, 93605, 93606, 936...",932039.0,2,0,46430,2e-06
4,2010,CA,Humboldt County,"[95501, 95502, 95503, 95511, 95514, 95518, 955...",135009.0,2,0,40089,1.5e-05


In [8]:
# Inspect the columns data types
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2026 entries, 0 to 1157
Data columns (total 9 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   year                    2026 non-null   int64  
 1   state                   2026 non-null   object 
 2   county                  2011 non-null   object 
 3   zip_codes               2011 non-null   object 
 4   population              2011 non-null   float64
 5   ev_registrations        2026 non-null   int64  
 6   cumulative_ev_stations  2026 non-null   int64  
 7   median_income           2011 non-null   object 
 8   ev_adoption_rate        2011 non-null   float64
dtypes: float64(2), int64(3), object(4)
memory usage: 158.3+ KB


In [9]:
# Inspect the columns with missing values
df.isnull().sum()

year                       0
state                      0
county                    15
zip_codes                 15
population                15
ev_registrations           0
cumulative_ev_stations     0
median_income             15
ev_adoption_rate          15
dtype: int64

In [10]:
# Get a closer look at the rows with missing values
# They have a good metric of `ev_registrations` however since we are doing a deep dive into counties
# we will drop these rows
df[df["ev_adoption_rate"].isnull()]

Unnamed: 0,year,state,county,zip_codes,population,ev_registrations,cumulative_ev_stations,median_income,ev_adoption_rate
30,2010,CA,,,,11,0,,
77,2011,CA,,,,195,0,,
130,2012,CA,,,,302,0,,
187,2013,CA,,,,578,0,,
245,2014,CA,,,,956,0,,
304,2015,CA,,,,1047,0,,
363,2016,CA,,,,1166,0,,
422,2017,CA,,,,2118,0,,
481,2018,CA,,,,3955,0,,
66,2018,FL,,,,1801,0,,


In [11]:
# Drop the rows with missing values to make the data consistent
df.dropna(inplace=True)

# Confirm the missing values have been removed
df.isnull().sum()

year                      0
state                     0
county                    0
zip_codes                 0
population                0
ev_registrations          0
cumulative_ev_stations    0
median_income             0
ev_adoption_rate          0
dtype: int64

In [12]:
# Inspect unique values in median_income as they are not converting to float
df["median_income"].sort_values(ascending=True).unique()

array(['-', '100,189', '100,310', ..., '98,692', '99,406', '99,716'],
      dtype=object)

In [13]:
# Replace the '-' values with NaN
df["median_income"] = df["median_income"].replace('-', np.nan)

# Remove the `,` from the values and convert the column to float
df["median_income"] = df["median_income"].str.replace(',', '').astype(float)

# Confirm the changes to the values
df["median_income"].sort_values(ascending=True).unique()

array([ 22716.,  25058.,  25098., ..., 128091., 130890.,     nan])

In [14]:
# For the one Null value we will substitute it with the median value of the whole column
# This is becasue the data is sorted by year, state, and county so the median value will be a good representation
df[df["median_income"].isnull()]

Unnamed: 0,year,state,county,zip_codes,population,ev_registrations,cumulative_ev_stations,median_income,ev_adoption_rate
589,2020,TX,Jeff Davis County,"[79734, 79854]",1984.0,5,0,,0.00252


In [15]:
# Obtain the index of the null values
null_index = df[df["median_income"].isnull()].index

# Iterate through the index and replace the null value with the median value of the column
for i in null_index:
    state = df.iloc[i]["state"]
    median_income = df[df["state"] == state]["median_income"].median()
    df.at[i, "median_income"] = median_income

# Inspect the sections where the values were previously null
# Confirm the values have been replaced
df.iloc[null_index]

Unnamed: 0,year,state,county,zip_codes,population,ev_registrations,cumulative_ev_stations,median_income,ev_adoption_rate
466,2018,CA,Santa Cruz County,"[95001, 95003, 95005, 95006, 95007, 95010, 950...",273841.0,3988,12,78041.0,0.014563


In [16]:
# Convert the population and median_income columns to integers
df = df.astype(
    {
        "population": int,
        "median_income": int,
    }
)

# Confirm the changes to the data types
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2011 entries, 0 to 1157
Data columns (total 9 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   year                    2011 non-null   int64  
 1   state                   2011 non-null   object 
 2   county                  2011 non-null   object 
 3   zip_codes               2011 non-null   object 
 4   population              2011 non-null   int32  
 5   ev_registrations        2011 non-null   int64  
 6   cumulative_ev_stations  2011 non-null   int64  
 7   median_income           2011 non-null   int32  
 8   ev_adoption_rate        2011 non-null   float64
dtypes: float64(1), int32(2), int64(3), object(3)
memory usage: 205.9+ KB


In [17]:
# Refresh the data once more to ensure it is consistent
df.sort_values(by=["year", "state", "county"], inplace=True)
df.reset_index(drop=True, inplace=True)

# Display the data
print("Shape:", df.shape)
df.head()

Shape: (2011, 9)


Unnamed: 0,year,state,county,zip_codes,population,ev_registrations,cumulative_ev_stations,median_income,ev_adoption_rate
0,2010,CA,Alameda County,"[94501, 94502, 94536, 94537, 94538, 94539, 945...",1512986,20,0,69384,1.3e-05
1,2010,CA,Amador County,"[95601, 95629, 95640, 95642, 95654, 95665, 956...",37886,1,1,54758,2.6e-05
2,2010,CA,Contra Costa County,"[94505, 94506, 94507, 94509, 94511, 94513, 945...",1052540,10,0,78385,1e-05
3,2010,CA,Fresno County,"[93210, 93234, 93242, 93602, 93605, 93606, 936...",932039,2,0,46430,2e-06
4,2010,CA,Humboldt County,"[95501, 95502, 95503, 95511, 95514, 95518, 955...",135009,2,0,40089,1.5e-05


In [18]:
# Calculate the growth rate of EV adoption by county
# EV Growth Rate = (EV Adoption Rate - Previous EV Adoption Rate) / Previous EV Adoption Rate
# We will not multiply by 100 here to keep the data consistent with the other columns
# We will do that when we plot the data
df["ev_growth_rate"] = df.groupby(["county"])["ev_adoption_rate"].pct_change()

# Display the data
print("Shape:", df.shape)
df.head()

Shape: (2011, 10)


Unnamed: 0,year,state,county,zip_codes,population,ev_registrations,cumulative_ev_stations,median_income,ev_adoption_rate,ev_growth_rate
0,2010,CA,Alameda County,"[94501, 94502, 94536, 94537, 94538, 94539, 945...",1512986,20,0,69384,1.3e-05,
1,2010,CA,Amador County,"[95601, 95629, 95640, 95642, 95654, 95665, 956...",37886,1,1,54758,2.6e-05,
2,2010,CA,Contra Costa County,"[94505, 94506, 94507, 94509, 94511, 94513, 945...",1052540,10,0,78385,1e-05,
3,2010,CA,Fresno County,"[93210, 93234, 93242, 93602, 93605, 93606, 936...",932039,2,0,46430,2e-06,
4,2010,CA,Humboldt County,"[95501, 95502, 95503, 95511, 95514, 95518, 955...",135009,2,0,40089,1.5e-05,


In [19]:
# Confirm null values are only the initial years of each state
df.isnull().sum()

year                        0
state                       0
county                      0
zip_codes                   0
population                  0
ev_registrations            0
cumulative_ev_stations      0
median_income               0
ev_adoption_rate            0
ev_growth_rate            340
dtype: int64

In [20]:
# Save the DataFrame to a CSV file
file_name = "ev_main_dataset.csv"
file_path = Path(f"../../../../../data/processed_data/{file_name}")
save_csv_file(df, file_path)

File saved as `ev_main_dataset.csv`
