In [1]:
from pathlib import Path
import datetime as dt
import pandas as pd

In [2]:
# Create a function to save the DataFrames to CSV files
def save_csv_file(df, file_path):
    """ 
    Save a DataFrame to a CSV file at the specified file path.

    Parameters:
    - df: DataFrame to save
    - file_path: Path to save the CSV file
    """
    
    # Check if the parent directory exists
    if not file_path.parent.exists():
        print(f"Error: The directory `{file_path.parent}` does not exist.")
        return
    
    if file_path.exists():
        print(f"File `{file_path.name}` already exists. Overwriting file.")
        file_path.unlink()
    
    # Save the DataFrame to the specified file path
    df.to_csv(file_path, index=False)
    print(f"File saved as `{file_path.name}`")

In [3]:
# Load the main California DataFrame that we will be merging with the income data
ca_df = pd.read_csv(
    Path("../../../../../data/processed_data/ca_main_dataset.csv"),
)

print("Shape:", ca_df.shape)
ca_df

Shape: (600, 7)


Unnamed: 0,year,state,county,zip_codes,population,ev_registrations,cumulative_ev_stations
0,2010,CA,Alameda County,"[94501, 94502, 94536, 94537, 94538, 94539, 945...",1512986.0,20,0
1,2010,CA,Amador County,"[95601, 95629, 95640, 95642, 95654, 95665, 956...",37886.0,1,1
2,2010,CA,Contra Costa County,"[94505, 94506, 94507, 94509, 94511, 94513, 945...",1052540.0,10,0
3,2010,CA,Fresno County,"[93210, 93234, 93242, 93602, 93605, 93606, 936...",932039.0,2,0
4,2010,CA,Humboldt County,"[95501, 95502, 95503, 95511, 95514, 95518, 955...",135009.0,2,0
...,...,...,...,...,...,...,...
595,2020,CA,Tuolumne County,"[95305, 95309, 95310, 95314, 95321, 95327, 953...",55379.0,240,9
596,2020,CA,Ventura County,"[91319, 91320, 91358, 91360, 91361, 91362, 913...",843371.0,13080,117
597,2020,CA,Yolo County,"[95605, 95606, 95607, 95612, 95616, 95617, 956...",216291.0,2948,53
598,2020,CA,Yuba County,"[95692, 95901, 95903, 95918, 95919, 95922, 959...",81958.0,289,3


In [4]:
# Load the California Income DataFrame
ca_income_df = pd.read_csv(
    Path("../../../../../data/processed_data/ca_county_income.csv"),
)

# Display the DataFrame
print("Shape:", ca_income_df.shape)
ca_income_df.head()

Shape: (58, 12)


Unnamed: 0,County,2010 Median Income,2011 Median Income,2012 Median Income,2013 Median Income,2014 Median Income,2015 Median Income,2016 Median Income,2017 Median Income,2018 Median Income,2019 Median Income,2020 Median Income
0,Alameda County,69384,70821,71516,72112,73775,75619,79831,85743,92574,99406,104888
1,Alpine County,63478,59018,59931,58636,61343,52917,62375,63438,64688,63750,85750
2,Amador County,54758,56180,53462,53684,52964,54171,57032,60636,61198,62772,65187
3,Butte County,43170,42971,43339,43752,43165,43444,44366,46516,48443,52537,54972
4,Calaveras County,54971,55256,54686,55295,54936,53233,53502,54800,58151,63158,67054


In [5]:
# Inspect both County Values in both DataFrames
# To ensure that the county names are the same in both DataFrames, we will compare the unique county values in each DataFrame.
main_df_counties = set(ca_df['county'])
income_df_counties = set(ca_income_df['County']) 

# Find the counties that don't match
mismatches = main_df_counties.symmetric_difference(income_df_counties)

# Show mismatches
print("Counties that don't match:", mismatches)

Counties that don't match: {nan}


In [6]:
# Reshape the DataFrame 'ca_income_df' from wide format to long format using the melt function
# Param 1: The DataFrame to be reshaped
# Param 2: Column(s) to use as identifier variables (these columns will remain as-is)
# Param 3: Name of the new column that will contain the original column names (years in this case)
# Param 4: Name of the new column that will contain the values from the original columns
ca_income_df = pd.melt(
    ca_income_df, 
    id_vars=["County"], 
    var_name="year", 
    value_name="median_income", 
)

# Display the DataFrame 
print("Shape:", ca_income_df.shape)
ca_income_df.head()

Shape: (638, 3)


Unnamed: 0,County,year,median_income
0,Alameda County,2010 Median Income,69384
1,Alpine County,2010 Median Income,63478
2,Amador County,2010 Median Income,54758
3,Butte County,2010 Median Income,43170
4,Calaveras County,2010 Median Income,54971


In [7]:
# From the 'year' column, extract only the year values using the 'str.extract' function
# The regular expression '\d{4}' will match any four consecutive digits
ca_income_df['year'] = ca_income_df['year'].str.extract('(\d{4})')

# Confirm that the 'year' column only contains year values
print("Shape:", ca_income_df.shape)
ca_income_df

Shape: (638, 3)


Unnamed: 0,County,year,median_income
0,Alameda County,2010,69384
1,Alpine County,2010,63478
2,Amador County,2010,54758
3,Butte County,2010,43170
4,Calaveras County,2010,54971
...,...,...,...
633,Tulare County,2020,52534
634,Tuolumne County,2020,60509
635,Ventura County,2020,89295
636,Yolo County,2020,73746


In [8]:
# Inspect the data types of the columns in both DataFrames
ca_income_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 638 entries, 0 to 637
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   County         638 non-null    object
 1   year           638 non-null    object
 2   median_income  638 non-null    object
dtypes: object(3)
memory usage: 15.1+ KB


In [9]:
# Convert the 'year' column to an integer data type
ca_income_df["year"] = ca_income_df["year"].astype(int)

ca_income_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 638 entries, 0 to 637
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   County         638 non-null    object
 1   year           638 non-null    int32 
 2   median_income  638 non-null    object
dtypes: int32(1), object(2)
memory usage: 12.6+ KB


In [10]:
# Merge the main California DataFrame with the reshaped California Income DataFrame
ca_df_clean = pd.merge(ca_df, ca_income_df, left_on=['county', 'year'], right_on=['County', 'year'], how='left')

# Remove the redundant 'County' column
ca_df_clean.drop(columns=['County'], inplace=True)

print("Shape:", ca_df_clean.shape)
ca_df_clean.head()

Shape: (600, 8)


Unnamed: 0,year,state,county,zip_codes,population,ev_registrations,cumulative_ev_stations,median_income
0,2010,CA,Alameda County,"[94501, 94502, 94536, 94537, 94538, 94539, 945...",1512986.0,20,0,69384
1,2010,CA,Amador County,"[95601, 95629, 95640, 95642, 95654, 95665, 956...",37886.0,1,1,54758
2,2010,CA,Contra Costa County,"[94505, 94506, 94507, 94509, 94511, 94513, 945...",1052540.0,10,0,78385
3,2010,CA,Fresno County,"[93210, 93234, 93242, 93602, 93605, 93606, 936...",932039.0,2,0,46430
4,2010,CA,Humboldt County,"[95501, 95502, 95503, 95511, 95514, 95518, 955...",135009.0,2,0,40089


In [11]:
# Inspect for missing values
ca_df_clean.isnull().sum()

year                       0
state                      0
county                    11
zip_codes                 11
population                11
ev_registrations           0
cumulative_ev_stations     0
median_income             11
dtype: int64

In [12]:
# Save the cleaned California DataFrame to a CSV file
file_name = f"ca_ev_main_dataset.csv"
file_path = Path(f"../../../../../data/processed_data/{file_name}")
save_csv_file(ca_df_clean, file_path)

File saved as `ca_ev_main_dataset.csv`
