In [1]:
from pathlib import Path
import datetime as dt
import pandas as pd

In [2]:
# Create a function to save the DataFrames to CSV files
def save_csv_file(df, file_path):
    """
    Save a DataFrame to a CSV file at the specified file path.

    The DataFrame index is not written to the file. If the target file
    already exists it is removed and replaced. If the parent directory does
    not exist, an error message is printed and nothing is written (no
    exception is raised).

    Parameters:
    - df: DataFrame to save
    - file_path: Path (or path string) to save the CSV file

    Returns:
    - None
    """

    # Accept plain strings as well as Path objects so `.parent` / `.name`
    # below are always available
    file_path = Path(file_path)

    # Check if the parent directory exists
    if not file_path.parent.exists():
        print(f"Error: The directory `{file_path.parent}` does not exist.")
        return

    if file_path.exists():
        print(f"File `{file_path.name}` already exists. Overwriting file.")
        file_path.unlink()

    # Save the DataFrame to the specified file path
    df.to_csv(file_path, index=False)
    print(f"File saved as `{file_path.name}`")

In [3]:
# Read the main Texas table (EV registrations and population per county and year).
# The income data is merged onto this DataFrame later in the notebook.
tx_main_csv = Path("../../../../../data/processed_data/tx_ev_registration_population.csv")
tx_df = pd.read_csv(tx_main_csv)

print("Shape:", tx_df.shape)
tx_df

Shape: (1630, 6)


Unnamed: 0,year,state,county,zip_codes,population,ev_registrations
0,2017,TX,Anderson County,"[75763, 75779, 75801, 75802, 75803, 75832, 758...",58175.0,1.0
1,2017,TX,Andrews County,[79714],17603.0,1.0
2,2017,TX,Angelina County,"[75901, 75902, 75903, 75904, 75915, 75941, 759...",87572.0,2.0
3,2017,TX,Aransas County,"[78358, 78381, 78382]",25392.0,1.0
4,2017,TX,Archer County,"[76351, 76366, 76370, 76379, 76389]",8783.0,2.0
...,...,...,...,...,...,...
1625,2024,TX,Yoakum County,"[79355, 79376]",1685.0,6.0
1626,2024,TX,Young County,"[76450, 76374, 76460, 76481]",17309.0,43.0
1627,2024,TX,Zapata County,"[78076, 78067, 78564]",13896.0,6.0
1628,2024,TX,Zavala County,"[78839, 78872, 78829]",9700.0,3.0


In [4]:
# Read the Texas county income table (one row per county, one median-income column per year)
tx_income_csv = Path("../../../../../data/processed_data/tx_county_income.csv")
tx_income_df = pd.read_csv(tx_income_csv)

# Show the dimensions and the first few rows
print("Shape:", tx_income_df.shape)
tx_income_df.head()

Shape: (254, 7)


Unnamed: 0,County,2017 Median Income,2018 Median Income,2019 Median Income,2020 Median Income,2021 Median Income,2022 Median Income
0,Anderson County,42313,43355,43455,45847,49736,57445
1,Andrews County,70753,74233,76158,75147,80518,86458
2,Angelina County,46472,47714,50453,49684,52377,57055
3,Aransas County,44601,44865,45137,47924,51509,58168
4,Archer County,63192,64476,63835,63958,67083,69954


In [5]:
# Compare the county names used by the two DataFrames.
# The merge below joins on the county name, so any spelling mismatch between
# the two sources would silently produce unmatched rows.
main_df_counties = set(tx_df['county'])
income_df_counties = set(tx_income_df['County'])

# Show the names that appear on only one side
print("Counties in main_df but not in income_df_counties:", main_df_counties - income_df_counties)
print("Counties in income_df_counties but not in main_df:", income_df_counties - main_df_counties)

Counties in main_df but not in income_df_counties: {nan}
Counties in income_df_counties but not in main_df: {'Motley County', 'Cottle County', 'Hartley County', 'Schleicher County', 'Kenedy County', 'Kent County', 'Reagan County', 'Crane County', 'Glasscock County', 'Terrell County', 'McMullen County'}


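Note: The 11 counties listed above appear only in the income data and have no rows in the main DataFrame, so the left merge onto the main DataFrame simply ignores them and no county renaming is needed. The `nan` entry means the main DataFrame contains rows with a missing county name; those rows cannot match any income record.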

In [6]:
# Reshape 'tx_income_df' from wide format (one income column per year) to long
# format (one row per county and year) with DataFrame.melt:
#   id_vars    - identifier column(s) kept as-is on every output row
#   var_name   - name of the new column holding the original column names (the year labels)
#   value_name - name of the new column holding the values from those columns
tx_income_df = tx_income_df.melt(
    id_vars=["County"],
    var_name="year",
    value_name="median_income",
)

# Show the dimensions and the first few rows of the reshaped DataFrame
print("Shape:", tx_income_df.shape)
tx_income_df.head()

Shape: (1524, 3)


Unnamed: 0,County,year,median_income
0,Anderson County,2017 Median Income,42313
1,Andrews County,2017 Median Income,70753
2,Angelina County,2017 Median Income,46472
3,Aransas County,2017 Median Income,44601
4,Archer County,2017 Median Income,63192


In [7]:
# From the 'year' column, keep only the year using the 'str.extract' function.
# The pattern captures the first run of four consecutive digits. It is written
# as a raw string so that '\d' reaches the regex engine unchanged instead of
# being treated as an (invalid) string escape sequence.
# expand=False makes str.extract return a Series rather than a one-column DataFrame.
tx_income_df['year'] = tx_income_df['year'].str.extract(r'(\d{4})', expand=False)

# Confirm that the 'year' column only contains year values
print("Shape:", tx_income_df.shape)
tx_income_df

Shape: (1524, 3)


Unnamed: 0,County,year,median_income
0,Anderson County,2017,42313
1,Andrews County,2017,70753
2,Angelina County,2017,46472
3,Aransas County,2017,44601
4,Archer County,2017,63192
...,...,...,...
1519,Wood County,2022,61748
1520,Yoakum County,2022,80317
1521,Young County,2022,65565
1522,Zapata County,2022,35061


In [8]:
# Inspect the column data types of the reshaped income DataFrame
# (only `tx_income_df` is inspected here; the main DataFrame is not)
tx_income_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1524 entries, 0 to 1523
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   County         1524 non-null   object
 1   year           1524 non-null   object
 2   median_income  1524 non-null   object
dtypes: object(3)
memory usage: 35.8+ KB


In [9]:
# Cast 'year' from string to integer so it can be matched against the
# 'year' column of the main DataFrame in the merge
# NOTE(review): `median_income` is still reported as `object` by info() and is
# not converted here - confirm whether it should be made numeric before analysis.
tx_income_df = tx_income_df.astype({"year": int})

tx_income_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1524 entries, 0 to 1523
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   County         1524 non-null   object
 1   year           1524 non-null   int32 
 2   median_income  1524 non-null   object
dtypes: int32(1), object(2)
memory usage: 29.9+ KB


In [10]:
# Left-join the reshaped Texas income data onto the main Texas DataFrame,
# matching on county name and year. Every row of the main DataFrame is kept;
# rows without a matching income record get NaN in 'median_income'.
# The right-hand key column 'County' duplicates 'county', so it is dropped.
tx_df_clean = tx_df.merge(
    tx_income_df,
    how='left',
    left_on=['county', 'year'],
    right_on=['County', 'year'],
).drop(columns=['County'])

print("Shape:", tx_df_clean.shape)
tx_df_clean.head()

Shape: (1630, 7)


Unnamed: 0,year,state,county,zip_codes,population,ev_registrations,median_income
0,2017,TX,Anderson County,"[75763, 75779, 75801, 75802, 75803, 75832, 758...",58175.0,1.0,42313
1,2017,TX,Andrews County,[79714],17603.0,1.0,70753
2,2017,TX,Angelina County,"[75901, 75902, 75903, 75904, 75915, 75941, 759...",87572.0,2.0,46472
3,2017,TX,Aransas County,"[78358, 78381, 78382]",25392.0,1.0,44601
4,2017,TX,Archer County,"[76351, 76366, 76370, 76379, 76389]",8783.0,2.0,63192


In [11]:
# Count missing values per column after the merge.
# 'median_income' is NaN wherever a (county, year) pair has no income record,
# e.g. years outside the 2017-2022 income columns or rows with a missing county.
tx_df_clean.isna().sum()

year                  0
state                 0
county                8
zip_codes             8
population            8
ev_registrations      0
median_income       472
dtype: int64

In [12]:
# Keep only the rows that have a value in the `median_income` column
tx_df_clean = tx_df_clean.dropna(subset=['median_income'])

# Re-check the per-column missing-value counts
tx_df_clean.isna().sum()

year                0
state               0
county              0
zip_codes           0
population          0
ev_registrations    0
median_income       0
dtype: int64

In [13]:
# Save the cleaned Texas DataFrame to a CSV file in the processed-data directory
file_name = "tx_ev_main_dataset.csv"
file_path = Path("../../../../../data/processed_data") / file_name
save_csv_file(tx_df_clean, file_path)

File `tx_ev_main_dataset.csv` already exists. Overwriting file.
File saved as `tx_ev_main_dataset.csv`
