In [1]:
from pathlib import Path
import datetime as dt
import pandas as pd

In [2]:
# Create a function to save the DataFrames to CSV files
def save_csv_file(df, file_path):
    """ 
    Save a DataFrame to a CSV file at the specified file path.

    Parameters:
    - df: DataFrame to save
    - file_path: Path to save the CSV file
    """
    
    # Check if the parent directory exists
    if not file_path.parent.exists():
        print(f"Error: The directory `{file_path.parent}` does not exist.")
        return
    
    if file_path.exists():
        print(f"File `{file_path.name}` already exists. Overwriting file.")
        file_path.unlink()
    
    # Save the DataFrame to the specified file path
    df.to_csv(file_path, index=False)
    print(f"File saved as `{file_path.name}`")

### Texas Main Dataset

In [3]:
# Load the main DataFrame that we will be merging the data into
tx_df = pd.read_csv(
    Path("../../../../../data/processed_data/tx_ev_main_dataset.csv"),
)

# Sort the DataFrame by year, state, and county
tx_df.sort_values(["year", "state", "county"], inplace=True)

print("Shape:", tx_df.shape)
tx_df

Shape: (1158, 7)


Unnamed: 0,year,state,county,zip_codes,population,ev_registrations,median_income
0,2017,TX,Anderson County,"[75763, 75779, 75801, 75802, 75803, 75832, 758...",58175.0,1.0,42313
1,2017,TX,Andrews County,[79714],17603.0,1.0,70753
2,2017,TX,Angelina County,"[75901, 75902, 75903, 75904, 75915, 75941, 759...",87572.0,2.0,46472
3,2017,TX,Aransas County,"[78358, 78381, 78382]",25392.0,1.0,44601
4,2017,TX,Archer County,"[76351, 76366, 76370, 76379, 76389]",8783.0,2.0,63192
...,...,...,...,...,...,...,...
1153,2022,TX,Wood County,"[75410, 75444, 75494, 75497, 75765, 75773, 75783]",46930.0,188.0,61748
1154,2022,TX,Yoakum County,"[79323, 79355]",7484.0,10.0,80317
1155,2022,TX,Young County,"[76372, 76374, 76450, 76460, 76481]",18012.0,94.0,65565
1156,2022,TX,Zapata County,"[78067, 78076, 78564]",13838.0,10.0,35061


### Texas EV Station Dataset

In [4]:
# Load the DataFrame with the number of EV stations in California
# Showing increases in the number of EV stations each year
tx_stations_df = pd.read_csv(
    Path("../../../../../data/processed_data/tx_evstation_progression.csv"),
)

# Create a new column for the state (to match the format of the main DataFrame)
tx_stations_df["state"] = "TX"

print("Shape:", tx_stations_df.shape)
tx_stations_df

Shape: (587, 5)


Unnamed: 0,year,county,ev_station_count,cumulative_ev_stations,state
0,2010,Dallas County,1,1,TX
1,2011,Bell County,2,2,TX
2,2011,Bexar County,5,5,TX
3,2011,Brazos County,1,1,TX
4,2011,Cameron County,1,1,TX
...,...,...,...,...,...
582,2024,Webb County,1,6,TX
583,2024,Wharton County,2,8,TX
584,2024,Willacy County,1,1,TX
585,2024,Williamson County,12,108,TX


### Merge the two Datasets

In [5]:
tx_df_clean = tx_df.merge(
    tx_stations_df,
    how="left",
    left_on=["year", "state", "county"],
    right_on=["year", "state", "county"],
)

print("Shape:", tx_df_clean.shape)
tx_df_clean

Shape: (1158, 9)


Unnamed: 0,year,state,county,zip_codes,population,ev_registrations,median_income,ev_station_count,cumulative_ev_stations
0,2017,TX,Anderson County,"[75763, 75779, 75801, 75802, 75803, 75832, 758...",58175.0,1.0,42313,,
1,2017,TX,Andrews County,[79714],17603.0,1.0,70753,,
2,2017,TX,Angelina County,"[75901, 75902, 75903, 75904, 75915, 75941, 759...",87572.0,2.0,46472,,
3,2017,TX,Aransas County,"[78358, 78381, 78382]",25392.0,1.0,44601,1.0,2.0
4,2017,TX,Archer County,"[76351, 76366, 76370, 76379, 76389]",8783.0,2.0,63192,,
...,...,...,...,...,...,...,...,...,...
1153,2022,TX,Wood County,"[75410, 75444, 75494, 75497, 75765, 75773, 75783]",46930.0,188.0,61748,,
1154,2022,TX,Yoakum County,"[79323, 79355]",7484.0,10.0,80317,1.0,1.0
1155,2022,TX,Young County,"[76372, 76374, 76450, 76460, 76481]",18012.0,94.0,65565,,
1156,2022,TX,Zapata County,"[78067, 78076, 78564]",13838.0,10.0,35061,,


In [6]:
# Remove the `ev_station_count` column and keep the `cumulative_ev_station_count` column
tx_df_clean.drop(columns=["ev_station_count"], inplace=True)

print("Shape:", tx_df_clean.shape)
tx_df_clean

Shape: (1158, 8)


Unnamed: 0,year,state,county,zip_codes,population,ev_registrations,median_income,cumulative_ev_stations
0,2017,TX,Anderson County,"[75763, 75779, 75801, 75802, 75803, 75832, 758...",58175.0,1.0,42313,
1,2017,TX,Andrews County,[79714],17603.0,1.0,70753,
2,2017,TX,Angelina County,"[75901, 75902, 75903, 75904, 75915, 75941, 759...",87572.0,2.0,46472,
3,2017,TX,Aransas County,"[78358, 78381, 78382]",25392.0,1.0,44601,2.0
4,2017,TX,Archer County,"[76351, 76366, 76370, 76379, 76389]",8783.0,2.0,63192,
...,...,...,...,...,...,...,...,...
1153,2022,TX,Wood County,"[75410, 75444, 75494, 75497, 75765, 75773, 75783]",46930.0,188.0,61748,
1154,2022,TX,Yoakum County,"[79323, 79355]",7484.0,10.0,80317,1.0
1155,2022,TX,Young County,"[76372, 76374, 76450, 76460, 76481]",18012.0,94.0,65565,
1156,2022,TX,Zapata County,"[78067, 78076, 78564]",13838.0,10.0,35061,


In [7]:
# Check for missing values or NaNs
tx_df_clean.isnull().sum()

year                        0
state                       0
county                      0
zip_codes                   0
population                  0
ev_registrations            0
median_income               0
cumulative_ev_stations    886
dtype: int64

In [8]:
# Replace the NaNs in the `cumulative_ev_stations` column with 0
tx_df_clean["cumulative_ev_stations"] = tx_df_clean["cumulative_ev_stations"].fillna(0).infer_objects(copy=False)

# Confirm that there are no missing values or NaNs in the `cumulative_ev_stations` column
tx_df_clean.isnull().sum()

year                      0
state                     0
county                    0
zip_codes                 0
population                0
ev_registrations          0
median_income             0
cumulative_ev_stations    0
dtype: int64

In [9]:
# Inspect the Data Types
tx_df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1158 entries, 0 to 1157
Data columns (total 8 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   year                    1158 non-null   int64  
 1   state                   1158 non-null   object 
 2   county                  1158 non-null   object 
 3   zip_codes               1158 non-null   object 
 4   population              1158 non-null   float64
 5   ev_registrations        1158 non-null   float64
 6   median_income           1158 non-null   object 
 7   cumulative_ev_stations  1158 non-null   float64
dtypes: float64(3), int64(1), object(4)
memory usage: 72.5+ KB


In [10]:
# Convert the `cumulative_ev_stations` and `ev_registrations` columns to integers
tx_df_clean = tx_df_clean.astype(
    {
        "cumulative_ev_stations": int,
        "ev_registrations": int,
    }
)

tx_df_clean

Unnamed: 0,year,state,county,zip_codes,population,ev_registrations,median_income,cumulative_ev_stations
0,2017,TX,Anderson County,"[75763, 75779, 75801, 75802, 75803, 75832, 758...",58175.0,1,42313,0
1,2017,TX,Andrews County,[79714],17603.0,1,70753,0
2,2017,TX,Angelina County,"[75901, 75902, 75903, 75904, 75915, 75941, 759...",87572.0,2,46472,0
3,2017,TX,Aransas County,"[78358, 78381, 78382]",25392.0,1,44601,2
4,2017,TX,Archer County,"[76351, 76366, 76370, 76379, 76389]",8783.0,2,63192,0
...,...,...,...,...,...,...,...,...
1153,2022,TX,Wood County,"[75410, 75444, 75494, 75497, 75765, 75773, 75783]",46930.0,188,61748,0
1154,2022,TX,Yoakum County,"[79323, 79355]",7484.0,10,80317,1
1155,2022,TX,Young County,"[76372, 76374, 76450, 76460, 76481]",18012.0,94,65565,0
1156,2022,TX,Zapata County,"[78067, 78076, 78564]",13838.0,10,35061,0


In [11]:
# Reorder columns
columns_order = ["year", "state", "county", "zip_codes", "population",
                 "ev_registrations", "cumulative_ev_stations", "median_income"]

tx_df_clean = tx_df_clean[columns_order]

print("Shape:", tx_df_clean.shape)
tx_df_clean

Shape: (1158, 8)


Unnamed: 0,year,state,county,zip_codes,population,ev_registrations,cumulative_ev_stations,median_income
0,2017,TX,Anderson County,"[75763, 75779, 75801, 75802, 75803, 75832, 758...",58175.0,1,0,42313
1,2017,TX,Andrews County,[79714],17603.0,1,0,70753
2,2017,TX,Angelina County,"[75901, 75902, 75903, 75904, 75915, 75941, 759...",87572.0,2,0,46472
3,2017,TX,Aransas County,"[78358, 78381, 78382]",25392.0,1,2,44601
4,2017,TX,Archer County,"[76351, 76366, 76370, 76379, 76389]",8783.0,2,0,63192
...,...,...,...,...,...,...,...,...
1153,2022,TX,Wood County,"[75410, 75444, 75494, 75497, 75765, 75773, 75783]",46930.0,188,0,61748
1154,2022,TX,Yoakum County,"[79323, 79355]",7484.0,10,1,80317
1155,2022,TX,Young County,"[76372, 76374, 76450, 76460, 76481]",18012.0,94,0,65565
1156,2022,TX,Zapata County,"[78067, 78076, 78564]",13838.0,10,0,35061


In [12]:
# Save the DataFrame to a CSV file
file_name = "tx_ev_main_dataset.csv"
file_path = Path(f"../../../../../data/processed_data/{file_name}")
save_csv_file(tx_df_clean, file_path)

File saved as `tx_main_dataset.csv`
