In [1]:
from pathlib import Path
import pandas as pd

In [2]:
# Load the data for States adoption rate analysis
df = pd.read_csv(
    Path("../../../../data/processed_data/states_adoption_rates.csv"),
)

# Display the data
print("Shape:", df.shape)
df

Shape: (357, 18)


Unnamed: 0,year,state,population,electric,phev,hev,biodiesel,ethanol,cng,propane,hydrogen,methanol,gasoline,diesel,unknown,total_vehicles,ev_adoption_rate,gas_adoption_rate
0,2016,Alabama,4863525.0,500,900,29100,0,428300,20100,0,0,0,3777300,126500,53900,4436600,0.011270,85.139521
1,2016,Alaska,741456.0,200,200,5000,0,55700,4900,0,0,0,525900,44800,19400,656100,0.030483,80.155464
2,2016,Arizona,6941072.0,4700,4400,89600,0,427300,17500,0,0,100,4805000,179500,112800,5640900,0.083320,85.181443
3,2016,Arkansas,2989918.0,200,500,19100,0,320500,12600,0,0,0,2097800,96800,22200,2569700,0.007783,81.635989
4,2016,California,39167117.0,141500,116700,966700,0,1322600,80600,0,1300,400,27241000,710400,115500,30696700,0.460962,88.742438
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
352,2022,Virginia,8679099.0,56600,21700,198400,40000,496200,300,0,0,0,6643300,153700,31900,7642100,0.740634,86.930294
353,2022,Washington,7784477.0,104100,31400,270200,67500,348300,100,100,0,0,5650700,277400,52700,6802500,1.530320,83.067990
354,2022,West Virginia,1774035.0,1900,1400,18300,15600,127500,100,0,0,0,1267500,45700,10900,1488900,0.127611,85.129962
355,2022,Wisconsin,5890543.0,15700,10000,105200,46500,549700,300,0,0,0,4577400,144500,26900,5476200,0.286695,83.587159


In [3]:
# Calculate percentage growth for EV and gasoline vehicles year over year

# Create a copy of the dataframe
stats_df = df.copy()

# Create new columns for the growth rates
stats_df['ev_growth_rate'] = stats_df.groupby('state')['electric'].pct_change() * 100
stats_df['gas_growth_rate'] = stats_df.groupby('state')['gasoline'].pct_change() * 100

stats_df

Unnamed: 0,year,state,population,electric,phev,hev,biodiesel,ethanol,cng,propane,hydrogen,methanol,gasoline,diesel,unknown,total_vehicles,ev_adoption_rate,gas_adoption_rate,ev_growth_rate,gas_growth_rate
0,2016,Alabama,4863525.0,500,900,29100,0,428300,20100,0,0,0,3777300,126500,53900,4436600,0.011270,85.139521,,
1,2016,Alaska,741456.0,200,200,5000,0,55700,4900,0,0,0,525900,44800,19400,656100,0.030483,80.155464,,
2,2016,Arizona,6941072.0,4700,4400,89600,0,427300,17500,0,0,100,4805000,179500,112800,5640900,0.083320,85.181443,,
3,2016,Arkansas,2989918.0,200,500,19100,0,320500,12600,0,0,0,2097800,96800,22200,2569700,0.007783,81.635989,,
4,2016,California,39167117.0,141500,116700,966700,0,1322600,80600,0,1300,400,27241000,710400,115500,30696700,0.460962,88.742438,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
352,2022,Virginia,8679099.0,56600,21700,198400,40000,496200,300,0,0,0,6643300,153700,31900,7642100,0.740634,86.930294,84.364821,-0.265726
353,2022,Washington,7784477.0,104100,31400,270200,67500,348300,100,100,0,0,5650700,277400,52700,6802500,1.530320,83.067990,55.838323,-2.439572
354,2022,West Virginia,1774035.0,1900,1400,18300,15600,127500,100,0,0,0,1267500,45700,10900,1488900,0.127611,85.129962,90.000000,-0.063077
355,2022,Wisconsin,5890543.0,15700,10000,105200,46500,549700,300,0,0,0,4577400,144500,26900,5476200,0.286695,83.587159,68.817204,-1.329999


In [4]:
# Inspect the values of the new columns `ev_growth_rate` and `gas_growth_rate`

stats_df[["state", "ev_growth_rate", "gas_growth_rate", "electric", "gasoline", "total_vehicles"]
   ].sort_values(by=["ev_growth_rate", "gas_growth_rate"], ascending=False)

Unnamed: 0,state,ev_growth_rate,gas_growth_rate,electric,gasoline,total_vehicles
85,North Dakota,inf,0.088292,100,566800,775100
138,Oklahoma,208.333333,-1.053022,3700,3204200,4126000
96,Vermont,133.333333,-0.555982,700,518700,616100
342,Oklahoma,129.577465,3.160626,16300,3538100,4249900
125,Minnesota,113.043478,2.994874,4900,4078700,5012200
...,...,...,...,...,...,...
46,Virginia,,,3100,6554200,7403200
47,Washington,,,14900,5329200,6228400
48,West Virginia,,,100,1286500,1524400
49,Wisconsin,,,2600,4500500,5368300


In [5]:
# Inspect the missing values in the dataframe
stats_df.isnull().sum()

year                  0
state                 0
population            7
electric              0
phev                  0
hev                   0
biodiesel             0
ethanol               0
cng                   0
propane               0
hydrogen              0
methanol              0
gasoline              0
diesel                0
unknown               0
total_vehicles        0
ev_adoption_rate      0
gas_adoption_rate     0
ev_growth_rate       51
gas_growth_rate      51
dtype: int64

In [6]:
# Replace the infinite values with NaN
# inf values were caused by division by zero when calculating the growth rates
stats_df.replace([float('-inf'), float('inf')], pd.NA, inplace=True)

# Confirm that the infinite values have been replaced
stats_df[["state", "ev_growth_rate", "gas_growth_rate", "electric", "gasoline", "total_vehicles"]
   ].sort_values(by=["ev_growth_rate", "gas_growth_rate"], ascending=False)

Unnamed: 0,state,ev_growth_rate,gas_growth_rate,electric,gasoline,total_vehicles
138,Oklahoma,208.333333,-1.053022,3700,3204200,4126000
96,Vermont,133.333333,-0.555982,700,518700,616100
342,Oklahoma,129.577465,3.160626,16300,3538100,4249900
125,Minnesota,113.043478,2.994874,4900,4078700,5012200
309,Arkansas,112.5,-0.539793,5100,2229500,2685400
...,...,...,...,...,...,...
46,Virginia,,,3100,6554200,7403200
47,Washington,,,14900,5329200,6228400
48,West Virginia,,,100,1286500,1524400
49,Wisconsin,,,2600,4500500,5368300


In [7]:
# Remove rows with missing values from the `ev_growth_rate` and `gas_growth_rate` columns
stats_df.dropna(subset=['ev_growth_rate', 'gas_growth_rate'], axis=0, inplace=True)

# Confirm that the rows with missing values have been removed
stats_df.isnull().sum()

year                 0
state                0
population           6
electric             0
phev                 0
hev                  0
biodiesel            0
ethanol              0
cng                  0
propane              0
hydrogen             0
methanol             0
gasoline             0
diesel               0
unknown              0
total_vehicles       0
ev_adoption_rate     0
gas_adoption_rate    0
ev_growth_rate       0
gas_growth_rate      0
dtype: int64

In [8]:
# Inspection of the data types of the columns
stats_df.dtypes

year                   int64
state                 object
population           float64
electric               int64
phev                   int64
hev                    int64
biodiesel              int64
ethanol                int64
cng                    int64
propane                int64
hydrogen               int64
methanol               int64
gasoline               int64
diesel                 int64
unknown                int64
total_vehicles         int64
ev_adoption_rate     float64
gas_adoption_rate    float64
ev_growth_rate        object
gas_growth_rate      float64
dtype: object

In [9]:
# Convert the `ev_growth_rate` to numeric data type as it is currently an object
stats_df["ev_growth_rate"] = stats_df["ev_growth_rate"].apply(pd.to_numeric, errors='coerce')

stats_df.dtypes

year                   int64
state                 object
population           float64
electric               int64
phev                   int64
hev                    int64
biodiesel              int64
ethanol                int64
cng                    int64
propane                int64
hydrogen               int64
methanol               int64
gasoline               int64
diesel                 int64
unknown                int64
total_vehicles         int64
ev_adoption_rate     float64
gas_adoption_rate    float64
ev_growth_rate       float64
gas_growth_rate      float64
dtype: object

In [10]:
# Save the DataFrame to a CSV file
def save_csv_file(df, file_path):
    """ 
    Save a DataFrame to a CSV file at the specified file path.

    Parameters:
    - df: DataFrame to save
    - file_path: Path to save the CSV file
    """
    
    # Check if the parent directory exists
    if not file_path.parent.exists():
        print(f"Error: The directory `{file_path.parent}` does not exist.")
        return
    
    if file_path.exists():
        print(f"File `{file_path.name}` already exists. Overwriting file.")
        file_path.unlink()
    
    # Save the DataFrame to the specified file path
    df.to_csv(file_path, index=False)
    print(f"File saved as `{file_path.name}`")

In [11]:
# Save the cleaned data to a new CSV file
file_name = "states_percent_change.csv"
file_path = Path(f"../../../../data/processed_data/{file_name}")
save_csv_file(stats_df, file_path)

# Display the shape of the cleaned data
print("Shape:", stats_df.shape)
stats_df

File `states_percent_change.csv` already exists. Overwriting file.
File saved as `states_percent_change.csv`
Shape: (305, 20)


Unnamed: 0,year,state,population,electric,phev,hev,biodiesel,ethanol,cng,propane,hydrogen,methanol,gasoline,diesel,unknown,total_vehicles,ev_adoption_rate,gas_adoption_rate,ev_growth_rate,gas_growth_rate
51,2017,Alabama,4874486.0,800,1100,30600,0,476700,18500,0,0,0,3750700,131500,49000,4458900,0.017942,84.117159,60.000000,-0.704207
52,2017,Alaska,739700.0,400,300,5200,0,59400,4400,0,0,0,506800,43200,16400,636100,0.062883,79.673007,100.000000,-3.631869
53,2017,Arizona,7044008.0,7200,5800,96000,0,484600,16600,0,0,100,4940200,187500,108300,5846300,0.123155,84.501309,53.191489,2.813736
54,2017,Arkansas,3001345.0,300,600,20000,0,352700,10800,0,0,0,2097400,100100,19800,2601700,0.011531,80.616520,50.000000,-0.019068
55,2017,California,39358497.0,189700,159600,1039300,0,1495800,79300,0,3200,400,28171500,738600,122200,31999600,0.592820,88.037038,34.063604,3.415807
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
352,2022,Virginia,8679099.0,56600,21700,198400,40000,496200,300,0,0,0,6643300,153700,31900,7642100,0.740634,86.930294,84.364821,-0.265726
353,2022,Washington,7784477.0,104100,31400,270200,67500,348300,100,100,0,0,5650700,277400,52700,6802500,1.530320,83.067990,55.838323,-2.439572
354,2022,West Virginia,1774035.0,1900,1400,18300,15600,127500,100,0,0,0,1267500,45700,10900,1488900,0.127611,85.129962,90.000000,-0.063077
355,2022,Wisconsin,5890543.0,15700,10000,105200,46500,549700,300,0,0,0,4577400,144500,26900,5476200,0.286695,83.587159,68.817204,-1.329999
