In [1]:
from pathlib import Path
import datetime as dt
import pandas as pd

In [2]:
# Create a function to save the DataFrames to CSV files
def save_csv_file(df, file_path):
    """
    Save a DataFrame to a CSV file at the specified file path.

    The DataFrame index is not written. If the parent directory of
    `file_path` does not exist, an error message is printed and nothing is
    saved. If the file already exists, a notice is printed and the file is
    overwritten.

    Parameters:
    - df: DataFrame to save
    - file_path: Path to save the CSV file (a `pathlib.Path` or a string)

    Returns:
    - None
    """

    # Accept plain strings as well as Path objects; `.parent`, `.exists()`
    # and `.name` below are only available on Path
    file_path = Path(file_path)

    # Check if the parent directory exists
    if not file_path.parent.exists():
        print(f"Error: The directory `{file_path.parent}` does not exist.")
        return

    # No explicit delete is needed: `to_csv` replaces the contents of an
    # existing file, and keeping the old file until the write starts avoids
    # losing it if something fails beforehand
    if file_path.exists():
        print(f"File `{file_path.name}` already exists. Overwriting file.")

    # Save the DataFrame to the specified file path
    df.to_csv(file_path, index=False)
    print(f"File saved as `{file_path.name}`")

### California Main Dataset

In [3]:
# Read the main California table (the frame the station data is merged into)
# and order its rows by year, state, and county in the same step
ca_df = pd.read_csv(
    Path("../../../../../data/processed_data/ca_ev_registration_population.csv"),
).sort_values(by=["year", "state", "county"])

print("Shape:", ca_df.shape)
ca_df

Shape: (600, 6)


Unnamed: 0,year,state,county,zip_codes,population,ev_registrations
0,2010,CA,Alameda County,"[94501, 94502, 94536, 94537, 94538, 94539, 945...",1512986.0,20.0
1,2010,CA,Amador County,"[95601, 95629, 95640, 95642, 95654, 95665, 956...",37886.0,1.0
2,2010,CA,Contra Costa County,"[94505, 94506, 94507, 94509, 94511, 94513, 945...",1052540.0,10.0
3,2010,CA,Fresno County,"[93210, 93234, 93242, 93602, 93605, 93606, 936...",932039.0,2.0
4,2010,CA,Humboldt County,"[95501, 95502, 95503, 95511, 95514, 95518, 955...",135009.0,2.0
...,...,...,...,...,...,...
595,2020,CA,Tuolumne County,"[95305, 95309, 95310, 95314, 95321, 95327, 953...",55379.0,240.0
596,2020,CA,Ventura County,"[91319, 91320, 91358, 91360, 91361, 91362, 913...",843371.0,13080.0
597,2020,CA,Yolo County,"[95605, 95606, 95607, 95612, 95616, 95617, 956...",216291.0,2948.0
598,2020,CA,Yuba County,"[95692, 95901, 95903, 95918, 95919, 95922, 959...",81958.0,289.0


### California EV Station Dataset

In [4]:
# Read the California EV station progression (stations added per year and the
# running total for each county) and tag every row with the state code so the
# frame carries the same key columns as the main DataFrame
ca_stations_df = pd.read_csv(
    Path("../../../../../data/processed_data/ca_evstation_progression.csv"),
).assign(state="CA")

print("Shape:", ca_stations_df.shape)
ca_stations_df

Shape: (596, 5)


Unnamed: 0,year,county,ev_station_count,cumulative_ev_stations,state
0,1995,Los Angeles County,1,1,CA
1,1996,Los Angeles County,1,2,CA
2,1997,Los Angeles County,4,6,CA
3,1997,Riverside County,1,1,CA
4,1997,San Diego County,1,1,CA
...,...,...,...,...,...
591,2024,Tehama County,1,16,CA
592,2024,Tulare County,8,75,CA
593,2024,Tuolumne County,9,31,CA
594,2024,Ventura County,14,265,CA


### Merge the two Datasets

In [5]:
# Left-join the station columns onto the main frame: every row of `ca_df` is
# kept, and rows without a matching (year, state, county) in `ca_stations_df`
# get NaN in the station columns
merge_keys = ["year", "state", "county"]
ca_df_clean = pd.merge(ca_df, ca_stations_df, how="left", on=merge_keys)

print("Shape:", ca_df_clean.shape)
ca_df_clean

Shape: (600, 8)


Unnamed: 0,year,state,county,zip_codes,population,ev_registrations,ev_station_count,cumulative_ev_stations
0,2010,CA,Alameda County,"[94501, 94502, 94536, 94537, 94538, 94539, 945...",1512986.0,20.0,,
1,2010,CA,Amador County,"[95601, 95629, 95640, 95642, 95654, 95665, 956...",37886.0,1.0,1.0,1.0
2,2010,CA,Contra Costa County,"[94505, 94506, 94507, 94509, 94511, 94513, 945...",1052540.0,10.0,,
3,2010,CA,Fresno County,"[93210, 93234, 93242, 93602, 93605, 93606, 936...",932039.0,2.0,,
4,2010,CA,Humboldt County,"[95501, 95502, 95503, 95511, 95514, 95518, 955...",135009.0,2.0,,
...,...,...,...,...,...,...,...,...
595,2020,CA,Tuolumne County,"[95305, 95309, 95310, 95314, 95321, 95327, 953...",55379.0,240.0,2.0,9.0
596,2020,CA,Ventura County,"[91319, 91320, 91358, 91360, 91361, 91362, 913...",843371.0,13080.0,51.0,117.0
597,2020,CA,Yolo County,"[95605, 95606, 95607, 95612, 95616, 95617, 956...",216291.0,2948.0,14.0,53.0
598,2020,CA,Yuba County,"[95692, 95901, 95903, 95918, 95919, 95922, 959...",81958.0,289.0,3.0,3.0


In [6]:
# Remove the per-year `ev_station_count` column and keep the running total in
# the `cumulative_ev_stations` column.
# Reassigning instead of dropping in place, together with `errors="ignore"`,
# lets this cell be re-run without a KeyError once the column is already gone.
ca_df_clean = ca_df_clean.drop(columns=["ev_station_count"], errors="ignore")

print("Shape:", ca_df_clean.shape)
ca_df_clean

Shape: (600, 7)


Unnamed: 0,year,state,county,zip_codes,population,ev_registrations,cumulative_ev_stations
0,2010,CA,Alameda County,"[94501, 94502, 94536, 94537, 94538, 94539, 945...",1512986.0,20.0,
1,2010,CA,Amador County,"[95601, 95629, 95640, 95642, 95654, 95665, 956...",37886.0,1.0,1.0
2,2010,CA,Contra Costa County,"[94505, 94506, 94507, 94509, 94511, 94513, 945...",1052540.0,10.0,
3,2010,CA,Fresno County,"[93210, 93234, 93242, 93602, 93605, 93606, 936...",932039.0,2.0,
4,2010,CA,Humboldt County,"[95501, 95502, 95503, 95511, 95514, 95518, 955...",135009.0,2.0,
...,...,...,...,...,...,...,...
595,2020,CA,Tuolumne County,"[95305, 95309, 95310, 95314, 95321, 95327, 953...",55379.0,240.0,9.0
596,2020,CA,Ventura County,"[91319, 91320, 91358, 91360, 91361, 91362, 913...",843371.0,13080.0,117.0
597,2020,CA,Yolo County,"[95605, 95606, 95607, 95612, 95616, 95617, 956...",216291.0,2948.0,53.0
598,2020,CA,Yuba County,"[95692, 95901, 95903, 95918, 95919, 95922, 959...",81958.0,289.0,3.0


In [7]:
# Count the missing entries (NaN/None) in each column
ca_df_clean.isna().sum()

year                        0
state                       0
county                     11
zip_codes                  11
population                 11
ev_registrations            0
cumulative_ev_stations    231
dtype: int64

In [8]:
# Fill the NaNs in the `cumulative_ev_stations` column.
# `ca_stations_df` only has a row for a (year, county) pair when stations were
# added that year, so a NaN left by the merge means "no new stations this year",
# not "no stations at all". Filling with 0 would reset a county's running total
# in every such year, so carry the latest known total forward instead. 0 is used
# only for years before the county's first recorded station and for rows whose
# county is missing (those cannot be matched).
totals_by_year = ca_stations_df.pivot_table(
    index="year",
    columns="county",
    values="cumulative_ev_stations",
    aggfunc="max",
)

# Cover every year from the earliest to the latest seen in either frame so the
# forward fill also reaches years that have no station rows at all
year_span = pd.RangeIndex(
    int(min(totals_by_year.index.min(), ca_df_clean["year"].min())),
    int(max(totals_by_year.index.max(), ca_df_clean["year"].max())) + 1,
    name="year",
)
carried_totals = (
    totals_by_year.reindex(year_span)
    .ffill()
    .reset_index()
    .melt(id_vars="year", var_name="county", value_name="carried_total")
)

# A left merge keeps the row order of `ca_df_clean`, and `carried_totals` holds
# exactly one row per (year, county), so the result lines up row for row
carried = ca_df_clean[["year", "county"]].merge(
    carried_totals,
    how="left",
    on=["year", "county"],
)["carried_total"]

ca_df_clean["cumulative_ev_stations"] = (
    ca_df_clean["cumulative_ev_stations"]
    .fillna(pd.Series(carried.to_numpy(), index=ca_df_clean.index))
    .fillna(0)
)

# Confirm that there are no missing values or NaNs in the `cumulative_ev_stations` column
ca_df_clean.isnull().sum()

year                       0
state                      0
county                    11
zip_codes                 11
population                11
ev_registrations           0
cumulative_ev_stations     0
dtype: int64

In [9]:
# Inspect the data types and non-null counts of every column
# (the two count columns are converted to integers in the next cell)
ca_df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   year                    600 non-null    int64  
 1   state                   600 non-null    object 
 2   county                  589 non-null    object 
 3   zip_codes               589 non-null    object 
 4   population              589 non-null    float64
 5   ev_registrations        600 non-null    float64
 6   cumulative_ev_stations  600 non-null    float64
dtypes: float64(3), int64(1), object(3)
memory usage: 32.9+ KB


In [10]:
# Cast the `cumulative_ev_stations` and `ev_registrations` columns to integers;
# every other column keeps its current dtype
count_columns = ("cumulative_ev_stations", "ev_registrations")
ca_df_clean = ca_df_clean.astype(dict.fromkeys(count_columns, int))

ca_df_clean

Unnamed: 0,year,state,county,zip_codes,population,ev_registrations,cumulative_ev_stations
0,2010,CA,Alameda County,"[94501, 94502, 94536, 94537, 94538, 94539, 945...",1512986.0,20,0
1,2010,CA,Amador County,"[95601, 95629, 95640, 95642, 95654, 95665, 956...",37886.0,1,1
2,2010,CA,Contra Costa County,"[94505, 94506, 94507, 94509, 94511, 94513, 945...",1052540.0,10,0
3,2010,CA,Fresno County,"[93210, 93234, 93242, 93602, 93605, 93606, 936...",932039.0,2,0
4,2010,CA,Humboldt County,"[95501, 95502, 95503, 95511, 95514, 95518, 955...",135009.0,2,0
...,...,...,...,...,...,...,...
595,2020,CA,Tuolumne County,"[95305, 95309, 95310, 95314, 95321, 95327, 953...",55379.0,240,9
596,2020,CA,Ventura County,"[91319, 91320, 91358, 91360, 91361, 91362, 913...",843371.0,13080,117
597,2020,CA,Yolo County,"[95605, 95606, 95607, 95612, 95616, 95617, 956...",216291.0,2948,53
598,2020,CA,Yuba County,"[95692, 95901, 95903, 95918, 95919, 95922, 959...",81958.0,289,3


In [11]:
# Write the merged California dataset to the processed-data folder as a CSV
file_name = "ca_main_dataset.csv"
file_path = Path(f"../../../../../data/processed_data/{file_name}")
save_csv_file(df=ca_df_clean, file_path=file_path)

File saved as `ca_main_dataset.csv`
