In [1]:
from pathlib import Path
import datetime as dt
import pandas as pd

In [2]:
# Read the data for US population from 2010 to 2019
df_2019 = pd.read_csv(
    Path("../../../../../data/raw_data/population_2010-2019.csv"),
    encoding="ISO-8859-1",
)

print("Shape:", df_2019.shape)
display(df_2019.head())

Shape: (3193, 164)


Unnamed: 0,SUMLEV,REGION,DIVISION,STATE,COUNTY,STNAME,CTYNAME,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010,...,RDOMESTICMIG2019,RNETMIG2011,RNETMIG2012,RNETMIG2013,RNETMIG2014,RNETMIG2015,RNETMIG2016,RNETMIG2017,RNETMIG2018,RNETMIG2019
0,40,3,6,1,0,Alabama,Alabama,4779736,4780125,4785437,...,1.917501,0.578434,1.186314,1.522549,0.563489,0.626357,0.745172,1.090366,1.773786,2.483744
1,50,3,6,1,1,Alabama,Autauga County,54571,54597,54773,...,4.84731,6.018182,-6.226119,-3.902226,1.970443,-1.712875,4.777171,0.849656,0.540916,4.560062
2,50,3,6,1,3,Alabama,Baldwin County,182265,182265,183112,...,24.017829,16.64187,17.488579,22.751474,20.184334,17.725964,21.279291,22.398256,24.727215,24.380567
3,50,3,6,1,5,Alabama,Barbour County,27457,27455,27327,...,-5.690302,0.292676,-6.897817,-8.132185,-5.140431,-15.724575,-18.238016,-24.998528,-8.754922,-5.165664
4,50,3,6,1,7,Alabama,Bibb County,22915,22915,22870,...,1.385134,-4.998356,-3.787545,-5.797999,1.331144,1.329817,-0.708717,-3.234669,-6.857092,1.831952


In [3]:
# Read the data for US population from 2020 to 2023
df_2023 = pd.read_csv(
    Path("../../../../../data/raw_data/population_2020-2023.csv"),
    encoding="ISO-8859-1",
)

print("Shape:", df_2023.shape)
display(df_2023.head())

Shape: (3195, 67)


Unnamed: 0,SUMLEV,REGION,DIVISION,STATE,COUNTY,STNAME,CTYNAME,ESTIMATESBASE2020,POPESTIMATE2020,POPESTIMATE2021,...,RNATURALCHG2023,RINTERNATIONALMIG2021,RINTERNATIONALMIG2022,RINTERNATIONALMIG2023,RDOMESTICMIG2021,RDOMESTICMIG2022,RDOMESTICMIG2023,RNETMIG2021,RNETMIG2022,RNETMIG2023
0,40,3,6,1,0,Alabama,Alabama,5024294,5031864,5050380,...,-0.306805,0.358254,0.864061,1.057514,5.497784,5.622917,6.038672,5.856038,6.486978,7.096186
1,50,3,6,1,1,Alabama,Autauga County,58809,58915,59203,...,1.549122,0.253983,0.369969,0.566346,4.097597,8.526095,8.178699,4.351581,8.896064,8.745044
2,50,3,6,1,3,Alabama,Baldwin County,231768,233227,239439,...,-0.435967,0.444288,1.02887,1.163912,29.500747,28.95652,27.213932,29.945035,29.98539,28.377843
3,50,3,6,1,5,Alabama,Barbour County,25229,24969,24533,...,-3.368165,0.0,0.081246,0.527544,-12.645954,9.627689,-1.826113,-12.645954,9.708935,-1.29857
4,50,3,6,1,7,Alabama,Bibb County,22301,22188,22359,...,-2.280294,0.044896,0.045101,0.045606,11.403686,-13.665577,-3.146805,11.448582,-13.620476,-3.101199


In [4]:
# Function to reshape the DataFrame structure
def reshape_data(df):
    # Define the columns to keep and the columns to unpivot
    keep_columns = ["STNAME", "CTYNAME"]
    unpivot_columns = [col for col in df.columns if col.startswith('POPESTIMATE')]
    
    # Reshape the DataFrame to longer format
    df = df.melt(
        id_vars=keep_columns,
        value_vars=unpivot_columns,
        var_name="Year", # New column name for the unpivoted columns
        value_name="Population", # New column name for the values 
    )

    return df

In [5]:
# Cleanup function to reorganize columns and format data
def format_columns(df):
    # Obtain the year from the "POPESTIMATE" value (after it has been unpivoted)
    # and convert it to an integer
    df['Year'] = df['Year'].str.extract('(\d{4})').astype(int)

    # Convert the "Year" column to a datetime object
    df['Date'] = pd.to_datetime(df['Year'], format='%Y')

    # Remove the "Year" column (No longer needed)
    df.drop(columns=['Year'], inplace=True)
    
    return df

In [6]:
# Function to remove the total population for the state in the DataFrame
# Necessary to avoid skewing the data when calculating the population growth
def remove_total(df, filter_state):
    df = df.loc[df["CTYNAME"] != filter_state].reset_index(drop=True)
    return df

In [7]:
# Cleanup function to rename columns and reorder them
def rename_columns(df):
    df = df.rename(
        columns={
            "Date": "date",
            "STNAME": "state",
            "CTYNAME": "county",
            "Population": "population",
        },
    )

    df = df[["date", "state", "county", "population"]]

    return df 

In [8]:
# Function to filter population data by state
def filter_by_state(df, filter_state):
    # Select columns to keep
    selected_columns = ["STNAME", "CTYNAME"] + \
        [col for col in df.columns if col.startswith('POPESTIMATE')]
    
    # Filter the DataFrame by selected state
    df = df.loc[df["STNAME"] == filter_state, selected_columns].reset_index(drop=True)

    # Reshape and reorganize the DataFrame
    df = reshape_data(df)
    df = format_columns(df)
    df = remove_total(df, filter_state)
    df = rename_columns(df)

    return df

In [9]:
# Obtain filtered population data for states
# Concatenate the population data for each state
# from 2010 to 2019 and 2020 to 2023.

# California
df_ca_2019 = filter_by_state(df_2019, "California")
df_ca_2023 = filter_by_state(df_2023, "California")
df_ca = pd.concat([df_ca_2019, df_ca_2023])

# Texas
df_tx_2019 = filter_by_state(df_2019, "Texas")
df_tx_2023 = filter_by_state(df_2023, "Texas")
df_tx = pd.concat([df_tx_2019, df_tx_2023])

# Florida
df_fl_2019 = filter_by_state(df_2019, "Florida")
df_fl_2023 = filter_by_state(df_2023, "Florida")
df_fl = pd.concat([df_fl_2019, df_fl_2023])

# Display the population data for each state
print("Shape (California):", df_ca.shape)
display(df_ca)

print("Shape (Texas):", df_tx.shape)
display(df_tx)

print("Shape (Florida):", df_fl.shape)
display(df_fl)


Shape (California): (812, 4)


Unnamed: 0,date,state,county,population
0,2010-01-01,California,Alameda County,1512986
1,2010-01-01,California,Alpine County,1161
2,2010-01-01,California,Amador County,37886
3,2010-01-01,California,Butte County,219949
4,2010-01-01,California,Calaveras County,45468
...,...,...,...,...
227,2023-01-01,California,Tulare County,479468
228,2023-01-01,California,Tuolumne County,54204
229,2023-01-01,California,Ventura County,829590
230,2023-01-01,California,Yolo County,220544


Shape (Texas): (3556, 4)


Unnamed: 0,date,state,county,population
0,2010-01-01,Texas,Anderson County,58493
1,2010-01-01,Texas,Andrews County,14849
2,2010-01-01,Texas,Angelina County,86905
3,2010-01-01,Texas,Aransas County,23181
4,2010-01-01,Texas,Archer County,9118
...,...,...,...,...
1011,2023-01-01,Texas,Wood County,47921
1012,2023-01-01,Texas,Yoakum County,7468
1013,2023-01-01,Texas,Young County,18124
1014,2023-01-01,Texas,Zapata County,13736


Shape (Florida): (938, 4)


Unnamed: 0,date,state,county,population
0,2010-01-01,Florida,Alachua County,247614
1,2010-01-01,Florida,Baker County,27066
2,2010-01-01,Florida,Bay County,169206
3,2010-01-01,Florida,Bradford County,28536
4,2010-01-01,Florida,Brevard County,543965
...,...,...,...,...
263,2023-01-01,Florida,Union County,15532
264,2023-01-01,Florida,Volusia County,590357
265,2023-01-01,Florida,Wakulla County,36449
266,2023-01-01,Florida,Walton County,86354


In [10]:
# Reference the data for the Texas zip codes
tx_zipcodes_df = pd.read_csv(
    Path("../../../../../data/processed_data/tx_zipcodes.csv"),
    encoding="ISO-8859-1",
)

print("Shape:", tx_zipcodes_df.shape)
display(tx_zipcodes_df.head())

Shape: (2661, 5)


Unnamed: 0,Zip Code,Type,Cities,County,Area Codes
0,73301,Unique,Austin,Travis County,Area Code 512
1,73344,Unique,Austin,Travis County,Area Code 512
2,73960,PO Box,Texhoma,Sherman County,Area Code 806
3,75001,Standard,Addison,Dallas County,"Area Code 214, Area Code 469, Area Code 945, A..."
4,75002,Standard,"Allen, Lucas, Parker",Collin County,"Area Code 214, Area Code 469, Area Code 945, A..."


In [11]:
# Reference the data for the California zip codes
ca_zipcodes_df = pd.read_csv(
    Path("../../../../../data/processed_data/ca_zipcodes.csv"),
    encoding="ISO-8859-1",
)

print("Shape:", ca_zipcodes_df.shape)
display(ca_zipcodes_df.head())

Shape: (2655, 5)


Unnamed: 0,Zip Code,Type,Cities,County,Area Codes
0,90001,Standard,"Los Angeles, Firestone Park, Firestone Pk",Los Angeles County,"Area Code 213, Area Code 323"
1,90002,Standard,"Los Angeles, Watts",Los Angeles County,"Area Code 213, Area Code 310, Area Code 323, A..."
2,90003,Standard,Los Angeles,Los Angeles County,"Area Code 213, Area Code 323"
3,90004,Standard,"Los Angeles, Oakwood",Los Angeles County,"Area Code 213, Area Code 323"
4,90005,Standard,"Los Angeles, Sanford",Los Angeles County,"Area Code 213, Area Code 323"


In [12]:
# Reference the data for the Florida zip codes
fl_zipcodes_df = pd.read_csv(
    Path("../../../../../data/processed_data/fl_zipcodes.csv"),
    encoding="ISO-8859-1",
)

print("Shape:", fl_zipcodes_df.shape)
display(fl_zipcodes_df.head())

Shape: (1495, 5)


Unnamed: 0,Zip Code,Type,Cities,County,Area Codes
0,32003,Standard,"Fleming Island, Fleming Isle, Orange Park",Clay County,Area Code 904
1,32004,PO Box,"Ponte Vedra Beach, Ponte Vedra",St. Johns County,Area Code 904
2,32006,PO Box,"Fleming Island, Fleming Isle, Orange Park",Clay County,Area Code 904
3,32007,PO Box,Bostwick,Putnam County,Area Code 386
4,32008,Standard,Branford,Suwannee County,"Area Code 352, Area Code 386"


In [13]:
# Create zip_codes column in the population data and add all zipcodes for each county
def add_zipcodes(df, zipcodes_df):
    # Group by County and aggregate zip codes into a list
    zipcodes_grouped = zipcodes_df.groupby('County')['Zip Code'].apply(list).reset_index()
    
    # Merge the Main DataFrame with the zip codes DataFrame
    # Keep the 'County' and 'Zip Code' columns from the zip codes DataFrame
    df = df.merge(
        # Rename the columns to match the main DataFrame
        zipcodes_grouped.rename(columns={'Zip Code': 'zip_codes'}), how="left", left_on="county", right_on="County"
        )
    
    # Remove the 'County' column (No longer needed)
    df.drop(columns=['County'], inplace=True)

    # Reorder the columns to match the main DataFrame
    df = df[["date", "state", "county", "zip_codes", "population"]]
    
    return df

In [14]:
# Add zip codes to the population data for each state
df_ca = add_zipcodes(df_ca, ca_zipcodes_df)
df_tx = add_zipcodes(df_tx, tx_zipcodes_df)
df_fl = add_zipcodes(df_fl, fl_zipcodes_df)

# Display the population data for each state with zip codes
print("Shape (California):", df_ca.shape)
display(df_ca)

print("Shape (Texas):", df_tx.shape)
display(df_tx)

print("Shape (Florida):", df_fl.shape)
display(df_fl)

Shape (California): (812, 5)


Unnamed: 0,date,state,county,zip_codes,population
0,2010-01-01,California,Alameda County,"[94501, 94502, 94536, 94537, 94538, 94539, 945...",1512986
1,2010-01-01,California,Alpine County,"[95646, 96120, 96156]",1161
2,2010-01-01,California,Amador County,"[95601, 95629, 95640, 95642, 95654, 95665, 956...",37886
3,2010-01-01,California,Butte County,"[95914, 95916, 95917, 95926, 95927, 95928, 959...",219949
4,2010-01-01,California,Calaveras County,"[95221, 95222, 95223, 95224, 95225, 95226, 952...",45468
...,...,...,...,...,...
807,2023-01-01,California,Tulare County,"[93201, 93207, 93208, 93218, 93219, 93221, 932...",479468
808,2023-01-01,California,Tuolumne County,"[95305, 95309, 95310, 95314, 95321, 95327, 953...",54204
809,2023-01-01,California,Ventura County,"[91319, 91320, 91358, 91360, 91361, 91362, 913...",829590
810,2023-01-01,California,Yolo County,"[95605, 95606, 95607, 95612, 95616, 95617, 956...",220544


Shape (Texas): (3556, 5)


Unnamed: 0,date,state,county,zip_codes,population
0,2010-01-01,Texas,Anderson County,"[75763, 75779, 75801, 75802, 75803, 75832, 758...",58493
1,2010-01-01,Texas,Andrews County,[79714],14849
2,2010-01-01,Texas,Angelina County,"[75901, 75902, 75903, 75904, 75915, 75941, 759...",86905
3,2010-01-01,Texas,Aransas County,"[78358, 78381, 78382]",23181
4,2010-01-01,Texas,Archer County,"[76351, 76366, 76370, 76379, 76389]",9118
...,...,...,...,...,...
3551,2023-01-01,Texas,Wood County,"[75410, 75444, 75494, 75497, 75765, 75773, 75783]",47921
3552,2023-01-01,Texas,Yoakum County,"[79323, 79355]",7468
3553,2023-01-01,Texas,Young County,"[76372, 76374, 76450, 76460, 76481]",18124
3554,2023-01-01,Texas,Zapata County,"[78067, 78076, 78564]",13736


Shape (Florida): (938, 5)


Unnamed: 0,date,state,county,zip_codes,population
0,2010-01-01,Florida,Alachua County,"[32601, 32602, 32603, 32604, 32605, 32606, 326...",247614
1,2010-01-01,Florida,Baker County,"[32040, 32063, 32072, 32087]",27066
2,2010-01-01,Florida,Bay County,"[32401, 32402, 32403, 32404, 32405, 32406, 324...",169206
3,2010-01-01,Florida,Bradford County,"[32042, 32044, 32058, 32091, 32622]",28536
4,2010-01-01,Florida,Brevard County,"[32754, 32775, 32780, 32781, 32782, 32783, 327...",543965
...,...,...,...,...,...
933,2023-01-01,Florida,Union County,"[32026, 32054, 32083, 32697]",15532
934,2023-01-01,Florida,Volusia County,"[32105, 32114, 32115, 32116, 32117, 32118, 321...",590357
935,2023-01-01,Florida,Wakulla County,"[32326, 32327, 32346, 32355, 32358]",36449
936,2023-01-01,Florida,Walton County,"[32422, 32433, 32434, 32435, 32439, 32454, 324...",86354


#### Add 2024 Population Data with Zip Codes

In [15]:
# Create a function to format the DataFrame for 2024 data
# Format the 2024 DataFrame to match the structure of the 2010-2023 data
def format_2024_df(df, state):
    # Create a column for the state
    df["state"] = state

    # Remove the "city" column
    df.drop(columns=["city"], inplace=True)

    # Group by state and county to aggregate zip codes into a list
    # Zip codes will be grouped to match the format of the 2010-2023 data
    df = df.groupby(["state", "county"]).agg({
        "zip": lambda x: list(x),
        "population": "sum",
    }).reset_index()

    # Create a column for the date (2024)
    df['date'] = pd.to_datetime('2024', format='%Y')

    # Reorder the columns to match the structure of the 2010-2023 data
    df = df[['date', 'state', 'county', 'zip', 'population']]

    # Rename the "zip" column to "zip_codes"
    df.rename(columns={'zip': 'zip_codes'}, inplace=True)

    return df


In [16]:
# Read the data for Texas population in 2024
df_tx_2024 = pd.read_csv(
    Path("../../../../../data/raw_data/tx_population_2024.csv"),
    encoding="ISO-8859-1",
)

print("Shape:", df_tx_2024.shape)
display(df_tx_2024.head())

Shape: (1934, 5)


Unnamed: 0,zip,population,city,county,state
0,77494,130920,Katy,Fort Bend County,Tx
1,77449,122098,Katy,Harris County,Tx
2,78660,113386,Pflugerville,Travis County,Tx
3,77084,108557,Houston,Harris County,Tx
4,77433,107887,Cypress,Harris County,Tx


In [17]:
# Read the data for California population in 2024
df_ca_2024 = pd.read_csv(
    Path("../../../../../data/raw_data/ca_population_2024.csv"),
    encoding="ISO-8859-1",
)

print("Shape:", df_ca_2024.shape)
display(df_ca_2024.head())

Shape: (1765, 5)


Unnamed: 0,zip,population,city,county,state
0,90011,106042,Los Angeles,Los Angeles County,Ca
1,90650,101983,Norwalk,Los Angeles County,Ca
2,94565,100826,Pittsburg,Contra Costa County,Ca
3,92336,100571,Fontana,San Bernardino County,Ca
4,91331,99804,Pacoima,Los Angeles County,Ca


In [18]:
# Read the data for Florida population in 2024
df_fl_2024 = pd.read_csv(
    Path("../../../../../data/raw_data/fl_population_2024.csv"),
    encoding="ISO-8859-1",
)

print("Shape:", df_fl_2024.shape)
display(df_fl_2024.head())

Shape: (992, 5)


Unnamed: 0,zip,population,city,county,state
0,34787,86913,Winter Garden,Orange County,Fl
1,34953,78558,Port Saint Lucie,St. Lucie County,Fl
2,33311,75435,Fort Lauderdale,Broward County,Fl
3,33411,73919,West Palm Beach,Palm Beach County,Fl
4,33024,73405,Hollywood,Broward County,Fl


In [19]:
# Format the 2024 population data for each state
# Then concatenate the 2024 data with the 2010-2023 data
# Reset the index and sort the DataFrame by date and county

# Texas
df_tx_2024 = format_2024_df(df_tx_2024, "Texas")
df_tx = pd.concat([df_tx, df_tx_2024])
df_tx["state"] = "TX"
df_tx.reset_index(drop=True, inplace=True)
df_tx.sort_values(by=["date", "county"], inplace=True)

# California
df_ca_2024 = format_2024_df(df_ca_2024, "California")
df_ca = pd.concat([df_ca, df_ca_2024])
df_ca["state"] = "CA"
df_ca.reset_index(drop=True, inplace=True)
df_ca.sort_values(by=["date", "county"], inplace=True)

# Florida
df_fl_2024 = format_2024_df(df_fl_2024, "Florida")
df_fl = pd.concat([df_fl, df_fl_2024])
df_fl["state"] = "FL"
df_fl.reset_index(drop=True, inplace=True)
df_fl.sort_values(by=["date", "county"], inplace=True)

print("Shape (Texas):", df_tx.shape)
display(df_tx)

print("Shape (California):", df_ca.shape)
display(df_ca)

print("Shape (Florida):", df_fl.shape)
display(df_fl)

Shape (Texas): (3811, 5)


Unnamed: 0,date,state,county,zip_codes,population
0,2010-01-01,TX,Anderson County,"[75763, 75779, 75801, 75802, 75803, 75832, 758...",58493
1,2010-01-01,TX,Andrews County,[79714],14849
2,2010-01-01,TX,Angelina County,"[75901, 75902, 75903, 75904, 75915, 75941, 759...",86905
3,2010-01-01,TX,Aransas County,"[78358, 78381, 78382]",23181
4,2010-01-01,TX,Archer County,"[76351, 76366, 76370, 76379, 76389]",9118
...,...,...,...,...,...
3806,2024-01-01,TX,Wood County,"[75773, 75494, 75765, 75783, 75410, 75497]",49284
3807,2024-01-01,TX,Yoakum County,"[79355, 79376]",1685
3808,2024-01-01,TX,Young County,"[76450, 76374, 76460, 76481]",17309
3809,2024-01-01,TX,Zapata County,"[78076, 78067, 78564]",13896


Shape (California): (870, 5)


Unnamed: 0,date,state,county,zip_codes,population
0,2010-01-01,CA,Alameda County,"[94501, 94502, 94536, 94537, 94538, 94539, 945...",1512986
1,2010-01-01,CA,Alpine County,"[95646, 96120, 96156]",1161
2,2010-01-01,CA,Amador County,"[95601, 95629, 95640, 95642, 95654, 95665, 956...",37886
3,2010-01-01,CA,Butte County,"[95914, 95916, 95917, 95926, 95927, 95928, 959...",219949
4,2010-01-01,CA,Calaveras County,"[95221, 95222, 95223, 95224, 95225, 95226, 952...",45468
...,...,...,...,...,...
865,2024-01-01,CA,Tulare County,"[93257, 93274, 93291, 93277, 93292, 93618, 932...",499653
866,2024-01-01,CA,Tuolumne County,"[95370, 95327, 95321, 95379, 95383, 95329, 953...",55850
867,2024-01-01,CA,Ventura County,"[93033, 93065, 93030, 93063, 93003, 93010, 930...",853164
868,2024-01-01,CA,Yolo County,"[95616, 95691, 95695, 95618, 95776, 95605, 956...",218797


Shape (Florida): (1005, 5)


Unnamed: 0,date,state,county,zip_codes,population
0,2010-01-01,FL,Alachua County,"[32601, 32602, 32603, 32604, 32605, 32606, 326...",247614
1,2010-01-01,FL,Baker County,"[32040, 32063, 32072, 32087]",27066
2,2010-01-01,FL,Bay County,"[32401, 32402, 32403, 32404, 32405, 32406, 324...",169206
3,2010-01-01,FL,Bradford County,"[32042, 32044, 32058, 32091, 32622]",28536
4,2010-01-01,FL,Brevard County,"[32754, 32775, 32780, 32781, 32782, 32783, 327...",543965
...,...,...,...,...,...
1000,2024-01-01,FL,Union County,"[32054, 32083, 32026, 32697]",17444
1001,2024-01-01,FL,Volusia County,"[32174, 32725, 32738, 32724, 32114, 32720, 321...",564255
1002,2024-01-01,FL,Wakulla County,"[32327, 32346, 32358, 32355]",34228
1003,2024-01-01,FL,Walton County,"[32459, 32433, 32439, 32550, 32435, 32455, 32461]",75592


In [20]:
# Create a function to save the DataFrames to CSV files
def save_csv_file(df, file_path):
    """ 
    Save a DataFrame to a CSV file at the specified file path.

    Parameters:
    - df: DataFrame to save
    - file_path: Path to save the CSV file
    """
    
    # Check if the parent directory exists
    if not file_path.parent.exists():
        print(f"Error: The directory `{file_path.parent}` does not exist.")
        return
    
    if file_path.exists():
        print(f"File `{file_path.name}` already exists. Overwriting file.")
        file_path.unlink()
    
    # Save the DataFrame to the specified file path
    df.to_csv(file_path, index=False)
    print(f"File saved as `{file_path.name}`")

In [21]:
# Save the DataFrames to separate CSV files
states = {
    "ca": df_ca,
    "tx": df_tx,
    "fl": df_fl,
}

for states, df in states.items():
    file_name = f"{states}_population.csv"
    file_path = Path(f"../../../../../data/processed_data/{file_name}")
    save_csv_file(df, file_path)

File `ca_population.csv` already exists. Overwriting file.
File saved as `ca_population.csv`
File `tx_population.csv` already exists. Overwriting file.
File saved as `tx_population.csv`
File `fl_population.csv` already exists. Overwriting file.
File saved as `fl_population.csv`
