In [1]:
from pathlib import Path
import datetime as dt
import pandas as pd

In [2]:
# Create a function to save the DataFrames to CSV files
def save_csv_file(df, file_path):
    """
    Save a DataFrame to a CSV file at the specified file path.

    An existing file at that location is replaced. The parent directory is
    never created: if it is missing, an error message is printed and nothing
    is written.

    Parameters:
    - df: DataFrame to save (written without its index)
    - file_path: Destination of the CSV file; a `pathlib.Path`, a string,
      or any other path-like object

    Returns:
    - None
    """

    # Normalise to a Path so callers may pass a plain string as well;
    # the attribute access below (`.parent`, `.name`) requires a Path object.
    file_path = Path(file_path)

    # Refuse to write when the target directory is missing
    if not file_path.parent.exists():
        print(f"Error: The directory `{file_path.parent}` does not exist.")
        return

    # Tell the user about the overwrite and remove the previous file first
    if file_path.exists():
        print(f"File `{file_path.name}` already exists. Overwriting file.")
        file_path.unlink()

    # Save the DataFrame to the specified file path
    df.to_csv(file_path, index=False)
    print(f"File saved as `{file_path.name}`")

In [None]:
# Read the main Florida dataset (`fl_ev_registration_population.csv`);
# the income data is merged onto this DataFrame later in the notebook.
fl_df = pd.read_csv(Path("../../../../../data/processed_data/fl_ev_registration_population.csv"))

# Report the dimensions, then render the DataFrame as the cell output
print(f"Shape: {fl_df.shape}")
fl_df

In [None]:
# Read the Florida county income data (`fl_county_income.csv`)
fl_income_df = pd.read_csv(Path("../../../../../data/processed_data/fl_county_income.csv"))

# Report the dimensions and preview the first rows
print(f"Shape: {fl_income_df.shape}")
fl_income_df.head()

In [None]:
# Sanity-check the join key before merging: collect the distinct county names
# from each DataFrame ('county' in the main data, 'County' in the income data).
main_df_counties = set(fl_df['county'])
income_df_counties = set(fl_income_df['County'])

# Symmetric difference: names that occur in exactly one of the two DataFrames
mismatches = main_df_counties ^ income_df_counties

# An empty set means every county name lines up
print("Counties that don't match:", mismatches)

Note: We confirmed that `Lafayette County` is not present in the main DataFrame we are merging into, so no action is needed. Had it been present under a different name, we would have corrected the name before merging.

In [None]:
# Unpivot 'fl_income_df' from wide to long format:
#   - 'County' is kept as the identifier column,
#   - every other column header moves into a new 'year' column,
#   - the values of those columns move into a new 'median_income' column.
fl_income_df = fl_income_df.melt(
    id_vars=["County"],
    var_name="year",
    value_name="median_income",
)

# Report the new dimensions and preview the first rows
print(f"Shape: {fl_income_df.shape}")
fl_income_df.head()

In [None]:
# Keep only the four-digit year from each value of the 'year' column.
# The pattern is a raw string so that `\d` reaches the regex engine as-is
# instead of being treated as an (invalid) string escape sequence.
# expand=False makes str.extract return a Series for the single capture group;
# values without four consecutive digits become NaN.
fl_income_df['year'] = fl_income_df['year'].str.extract(r'(\d{4})', expand=False)

# Confirm that the 'year' column only contains year values
print("Shape:", fl_income_df.shape)
fl_income_df

In [None]:
# Inspect the column data types of the reshaped income DataFrame
# (the 'year' column is converted to an integer type in the next cell)
fl_income_df.info()

In [None]:
# Cast 'year' to an integer data type
fl_income_df = fl_income_df.assign(year=fl_income_df["year"].astype(int))

# Re-check the dtypes to confirm the conversion
fl_income_df.info()

In [None]:
# Left-join the reshaped Florida income data onto the main Florida DataFrame.
# how='left' keeps every row of fl_df; rows with no matching (county, year)
# pair in the income data get NaN in 'median_income'.
# The right-hand key column 'County' is redundant next to 'county', so it is
# dropped in the same chain (no in-place mutation).
fl_df_clean = pd.merge(
    fl_df,
    fl_income_df,
    left_on=['county', 'year'],
    right_on=['County', 'year'],
    how='left',
).drop(columns=['County'])

print("Shape:", fl_df_clean.shape)
fl_df_clean.head()

In [None]:
# Inspect for missing values: count the null entries in each column of the merged DataFrame
fl_df_clean.isnull().sum()

In [None]:
# Save the merged Florida DataFrame to a CSV file in the processed-data directory
file_name = "fl_ev_main_dataset.csv"  # plain string: there is nothing to interpolate
file_path = Path("../../../../../data/processed_data") / file_name
save_csv_file(fl_df_clean, file_path)