In [1]:
from pathlib import Path
import datetime as dt
import pandas as pd

In [2]:
# Read the statewide registration CSV into a DataFrame
registration_path = Path("../../../../../data/processed_data/states_registrations.csv")
registration_df = pd.read_csv(registration_path)

registration_df

Unnamed: 0,year,state,electric,phev,hev,biodiesel,ethanol,cng,propane,hydrogen,methanol,gasoline,diesel,unknown
0,2016,Alabama,500,900,29100,0,428300,20100,0,0,0,3777300,126500,53900
1,2016,Alaska,200,200,5000,0,55700,4900,0,0,0,525900,44800,19400
2,2016,Arizona,4700,4400,89600,0,427300,17500,0,0,100,4805000,179500,112800
3,2016,Arkansas,200,500,19100,0,320500,12600,0,0,0,2097800,96800,22200
4,2016,California,141500,116700,966700,0,1322600,80600,0,1300,400,27241000,710400,115500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
352,2022,Virginia,56600,21700,198400,40000,496200,300,0,0,0,6643300,153700,31900
353,2022,Washington,104100,31400,270200,67500,348300,100,100,0,0,5650700,277400,52700
354,2022,West Virginia,1900,1400,18300,15600,127500,100,0,0,0,1267500,45700,10900
355,2022,Wisconsin,15700,10000,105200,46500,549700,300,0,0,0,4577400,144500,26900


In [3]:
# Collect each distinct value of the `state` column, in order of first appearance
states_list = registration_df["state"].drop_duplicates().tolist()

states_list

['Alabama',
 'Alaska',
 'Arizona',
 'Arkansas',
 'California',
 'Colorado',
 'Connecticut',
 'Delaware',
 'District of Columbia',
 'Florida',
 'Georgia',
 'Hawaii',
 'Idaho',
 'Illinois',
 'Indiana',
 'Iowa',
 'Kansas',
 'Kentucky',
 'Louisiana',
 'Maine',
 'Maryland',
 'Massachusetts',
 'Michigan',
 'Minnesota',
 'Mississippi',
 'Missouri',
 'Montana',
 'Nebraska',
 'Nevada',
 'New Hampshire',
 'New Jersey',
 'New Mexico',
 'New York',
 'North Carolina',
 'North Dakota',
 'Ohio',
 'Oklahoma',
 'Oregon',
 'Pennsylvania',
 'Rhode Island',
 'South Carolina',
 'South Dakota',
 'Tennessee',
 'Texas',
 'Utah',
 'Vermont',
 'Virginia',
 'Washington',
 'West Virginia',
 'Wisconsin',
 'Wyoming']

In [4]:
# Read the statewide population CSV into a DataFrame
population_path = Path("../../../../../data/processed_data/states_population.csv")
population_df = pd.read_csv(population_path)

population_df

Unnamed: 0,year,state,population
0,2010,Alabama,4785437
1,2010,Alaska,713910
2,2010,Arizona,6407172
3,2010,Arkansas,2921964
4,2010,California,37319502
...,...,...,...
695,2023,Virginia,8715698
696,2023,Washington,7812880
697,2023,West Virginia,1770071
698,2023,Wisconsin,5910955


In [11]:
def process_state_data(population_df, registration_df):
    """Attach per-year state population totals to the registration table.

    Population rows are summed per (year, state) pair and then left-joined
    onto the registration rows, so every registration row is kept; a row with
    no matching population entry ends up with a missing population value.

    Args:
        population_df: DataFrame with `year`, `state` and `population` columns.
        registration_df: DataFrame with `year` and `state` columns plus any
            number of additional columns.

    Returns:
        A new DataFrame sorted by year then state, with a fresh 0..n-1 index,
        whose first three columns are `year`, `state`, `population`, followed
        by the remaining columns in their merged order.
    """
    join_keys = ["year", "state"]

    # Collapse the population data to one total per (year, state) pair
    population_totals = population_df.groupby(join_keys, as_index=False)["population"].sum()

    # Left join keeps every registration row, then order rows by year and state
    merged = (
        registration_df
        .merge(population_totals, on=join_keys, how="left")
        .sort_values(by=join_keys)
        .reset_index(drop=True)
    )

    # Move the key columns and population to the front; the rest keep their order
    leading = ["year", "state", "population"]
    trailing = [name for name in merged.columns if name not in leading]

    return merged[leading + trailing]

In [12]:
# Merge the population and registration data into a single DataFrame
state_df = process_state_data(population_df, registration_df)

# Save the DataFrame to a CSV file
file_name = "states_analysis.csv"
file_path = Path(f"../../../../../data/processed_data/{file_name}")

# Stop here if the target directory is missing: printing and carrying on would
# only let `to_csv` fail a few lines later with a less helpful error.
if not file_path.parent.exists():
    raise FileNotFoundError(
        f"Error: The directory `{file_path.parent}` does not exist."
    )

# Tell the reader that an earlier export is about to be replaced
if file_path.exists():
    print(f"File `{file_path.name}` already exists. Overwriting file.")
    file_path.unlink()

# index=False keeps the row index out of the CSV
state_df.to_csv(file_path, index=False)
print(f"File saved as `{file_path.name}`")

# Display the DataFrame
display(state_df)

File saved as `states_analysis.csv`


Unnamed: 0,year,state,population,electric,phev,hev,biodiesel,ethanol,cng,propane,hydrogen,methanol,gasoline,diesel,unknown
0,2016,Alabama,4863525.0,500,900,29100,0,428300,20100,0,0,0,3777300,126500,53900
1,2016,Alaska,741456.0,200,200,5000,0,55700,4900,0,0,0,525900,44800,19400
2,2016,Arizona,6941072.0,4700,4400,89600,0,427300,17500,0,0,100,4805000,179500,112800
3,2016,Arkansas,2989918.0,200,500,19100,0,320500,12600,0,0,0,2097800,96800,22200
4,2016,California,39167117.0,141500,116700,966700,0,1322600,80600,0,1300,400,27241000,710400,115500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
352,2022,Virginia,8679099.0,56600,21700,198400,40000,496200,300,0,0,0,6643300,153700,31900
353,2022,Washington,7784477.0,104100,31400,270200,67500,348300,100,100,0,0,5650700,277400,52700
354,2022,West Virginia,1774035.0,1900,1400,18300,15600,127500,100,0,0,0,1267500,45700,10900
355,2022,Wisconsin,5890543.0,15700,10000,105200,46500,549700,300,0,0,0,4577400,144500,26900
