In [1]:
from pathlib import Path
import datetime as dt
import pandas as pd

In [2]:
# Create a function to save the DataFrames to CSV files
def save_csv_file(df, file_path):
    """ 
    Save a DataFrame to a CSV file at the specified file path.

    Parameters:
    - df: DataFrame to save
    - file_path: Path to save the CSV file
    """
    
    # Check if the parent directory exists
    if not file_path.parent.exists():
        print(f"Error: The directory `{file_path.parent}` does not exist.")
        return
    
    if file_path.exists():
        print(f"File `{file_path.name}` already exists. Overwriting file.")
        file_path.unlink()
    
    # Save the DataFrame to the specified file path
    df.to_csv(file_path, index=False)
    print(f"File saved as `{file_path.name}`")

In [3]:
# Load the main Florida DataFrame that we will be merging with the income data
fl_df = pd.read_csv(
    Path("../../../../../data/processed_data/fl_ev_registration_population.csv"),
)

print("Shape:", fl_df.shape)
fl_df

Shape: (268, 6)


Unnamed: 0,year,state,county,zip_codes,population,ev_registrations
0,2018,FL,Alachua County,"[32601, 32602, 32603, 32604, 32605, 32606, 326...",268851.0,766.0
1,2018,FL,Baker County,"[32040, 32063, 32072, 32087]",28353.0,5.0
2,2018,FL,Bay County,"[32401, 32402, 32403, 32404, 32405, 32406, 324...",186240.0,300.0
3,2018,FL,Bradford County,"[32042, 32044, 32058, 32091, 32622]",27752.0,22.0
4,2018,FL,Brevard County,"[32754, 32775, 32780, 32781, 32782, 32783, 327...",595203.0,1881.0
...,...,...,...,...,...,...
263,2021,FL,Volusia County,"[32105, 32114, 32115, 32116, 32117, 32118, 321...",566368.0,1458.0
264,2021,FL,Wakulla County,"[32326, 32327, 32346, 32355, 32358]",34270.0,38.0
265,2021,FL,Walton County,"[32422, 32433, 32434, 32435, 32439, 32454, 324...",80021.0,314.0
266,2021,FL,Washington County,"[32427, 32428, 32437, 32462, 32463]",24867.0,14.0


In [4]:
# Load the Florida Income DataFrame
fl_income_df = pd.read_csv(
    Path("../../../../../data/processed_data/fl_county_income.csv"),
)

# Display the DataFrame
print("Shape:", fl_income_df.shape)
fl_income_df.head()

Shape: (67, 5)


Unnamed: 0,County,2018 Median Income,2019 Median Income,2020 Median Income,2021 Median Income
0,Alachua County,49078,49689,50089,53314
1,Baker County,61769,63275,62299,63860
2,Bay County,51829,54316,56483,60473
3,Bradford County,46197,45921,43580,48803
4,Brevard County,54359,56775,59359,63632


In [5]:
# Inspect both County Values in both DataFrames
# To ensure that the county names are the same in both DataFrames, we will compare the unique county values in each DataFrame.
main_df_counties = set(fl_df['county'])
income_df_counties = set(fl_income_df['County']) 

# Find the counties that don't match
mismatches = main_df_counties.symmetric_difference(income_df_counties)

# Show mismatches
print("Counties that don't match:", mismatches)

Counties that don't match: {'Lafayette County', nan}


Note: Confirmed `Lafayette County` is not in the main dataframe we will be merging into. No action is needed. If it was and if it were named differently we would fix the name and then merge it.

In [6]:
# Reshape the DataFrame 'fl_income_df' from wide format to long format using the melt function
# Param 1: The DataFrame to be reshaped
# Param 2: Column(s) to use as identifier variables (these columns will remain as-is)
# Param 3: Name of the new column that will contain the original column names (years in this case)
# Param 4: Name of the new column that will contain the values from the original columns
fl_income_df = pd.melt(
    fl_income_df, 
    id_vars=["County"], 
    var_name="year", 
    value_name="median_income", 
)

# Display the DataFrame 
print("Shape:", fl_income_df.shape)
fl_income_df.head()

Shape: (268, 3)


Unnamed: 0,County,year,median_income
0,Alachua County,2018 Median Income,49078
1,Baker County,2018 Median Income,61769
2,Bay County,2018 Median Income,51829
3,Bradford County,2018 Median Income,46197
4,Brevard County,2018 Median Income,54359


In [7]:
# From the 'year' column, extract only the year values using the 'str.extract' function
# The regular expression '\d{4}' will match any four consecutive digits
fl_income_df['year'] = fl_income_df['year'].str.extract('(\d{4})')

# Confirm that the 'year' column only contains year values
print("Shape:", fl_income_df.shape)
fl_income_df

Shape: (268, 3)


Unnamed: 0,County,year,median_income
0,Alachua County,2018,49078
1,Baker County,2018,61769
2,Bay County,2018,51829
3,Bradford County,2018,46197
4,Brevard County,2018,54359
...,...,...,...
263,Union County,2021,55463
264,Volusia County,2021,56786
265,Wakulla County,2021,72941
266,Walton County,2021,68111


In [8]:
# Inspect the data types of the columns in both DataFrames
fl_income_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 268 entries, 0 to 267
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   County         268 non-null    object
 1   year           268 non-null    object
 2   median_income  268 non-null    object
dtypes: object(3)
memory usage: 6.4+ KB


In [9]:
# Convert the 'year' column to an integer data type
fl_income_df["year"] = fl_income_df["year"].astype(int)

fl_income_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 268 entries, 0 to 267
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   County         268 non-null    object
 1   year           268 non-null    int32 
 2   median_income  268 non-null    object
dtypes: int32(1), object(2)
memory usage: 5.4+ KB


In [10]:
# Merge the main California DataFrame with the reshaped California Income DataFrame
fl_df_clean = pd.merge(fl_df, fl_income_df, left_on=['county', 'year'], right_on=['County', 'year'], how='left')

# Remove the redundant 'County' column
fl_df_clean.drop(columns=['County'], inplace=True)

print("Shape:", fl_df_clean.shape)
fl_df_clean.head()

Shape: (268, 7)


Unnamed: 0,year,state,county,zip_codes,population,ev_registrations,median_income
0,2018,FL,Alachua County,"[32601, 32602, 32603, 32604, 32605, 32606, 326...",268851.0,766.0,49078
1,2018,FL,Baker County,"[32040, 32063, 32072, 32087]",28353.0,5.0,61769
2,2018,FL,Bay County,"[32401, 32402, 32403, 32404, 32405, 32406, 324...",186240.0,300.0,51829
3,2018,FL,Bradford County,"[32042, 32044, 32058, 32091, 32622]",27752.0,22.0,46197
4,2018,FL,Brevard County,"[32754, 32775, 32780, 32781, 32782, 32783, 327...",595203.0,1881.0,54359


In [11]:
# Inspect for missing values
fl_df_clean.isnull().sum()

year                0
state               0
county              4
zip_codes           4
population          4
ev_registrations    0
median_income       4
dtype: int64

In [12]:
# Save the cleaned California DataFrame to a CSV file
file_name = f"fl_ev_main_dataset.csv"
file_path = Path(f"../../../../../data/processed_data/{file_name}")
save_csv_file(fl_df_clean, file_path)

File `fl_ev_main_dataset.csv` already exists. Overwriting file.
File saved as `fl_ev_main_dataset.csv`
