In [99]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from functools import reduce


In [100]:

# Load files with low_memory=False
ACS_2019 = pd.read_csv("/Users/architamisra/Library/CloudStorage/OneDrive-HarvardUniversity/DS Final Project/1. Data/3. US Census/ACSDP1Y2019.DP05-Data.csv", low_memory=False)
ACS_2021 = pd.read_csv("/Users/architamisra/Library/CloudStorage/OneDrive-HarvardUniversity/DS Final Project/1. Data/3. US Census/ACSDP1Y2021.DP05-Data.csv", low_memory=False)
ACS_2022 = pd.read_csv("/Users/architamisra/Library/CloudStorage/OneDrive-HarvardUniversity/DS Final Project/1. Data/3. US Census/ACSDP1Y2022.DP05-Data.csv", low_memory=False)

In [101]:

# Add suffixes to columns
ACS_2019.columns = [f"{col}_2019" if col != 'GEO_ID' else col for col in ACS_2019.columns]
ACS_2021.columns = [f"{col}_2021" if col != 'GEO_ID' else col for col in ACS_2021.columns]
ACS_2022.columns = [f"{col}_2022" if col != 'GEO_ID' else col for col in ACS_2022.columns]

# Merge the dataframes
dataframes = [ACS_2019, ACS_2021, ACS_2022]


In [102]:

# Use reduce to merge all dataframes on 'GEO_ID'
merged_ACS = reduce(lambda left, right: pd.merge(left, right, on='GEO_ID', how='outer'), dataframes)

# Display the first few rows of the merged dataframe
merged_ACS.head()


Unnamed: 0,GEO_ID,NAME_2019,DP05_0001E_2019,DP05_0001EA_2019,DP05_0001M_2019,DP05_0001MA_2019,DP05_0002E_2019,DP05_0002M_2019,DP05_0002MA_2019,DP05_0002EA_2019,...,DP05_0089PEA_2022,DP05_0090PE_2022,DP05_0090PEA_2022,DP05_0090PM_2022,DP05_0090PMA_2022,DP05_0091PE_2022,DP05_0091PEA_2022,DP05_0091PM_2022,DP05_0091PMA_2022,Unnamed: 730_2022
0,Geography,Geographic Area Name,Estimate!!SEX AND AGE!!Total population,Annotation of Estimate!!SEX AND AGE!!Total pop...,Margin of Error!!SEX AND AGE!!Total population,Annotation of Margin of Error!!SEX AND AGE!!To...,Estimate!!SEX AND AGE!!Total population!!Male,Margin of Error!!SEX AND AGE!!Total population...,Annotation of Margin of Error!!SEX AND AGE!!To...,Annotation of Estimate!!SEX AND AGE!!Total pop...,...,"Annotation of Percent!!CITIZEN, VOTING AGE POP...","Percent!!CITIZEN, VOTING AGE POPULATION!!Citiz...","Annotation of Percent!!CITIZEN, VOTING AGE POP...","Percent Margin of Error!!CITIZEN, VOTING AGE P...",Annotation of Percent Margin of Error!!CITIZEN...,"Percent!!CITIZEN, VOTING AGE POPULATION!!Citiz...","Annotation of Percent!!CITIZEN, VOTING AGE POP...","Percent Margin of Error!!CITIZEN, VOTING AGE P...",Annotation of Percent Margin of Error!!CITIZEN...,
1,0500000US01003,"Baldwin County, Alabama",223234,,*****,*****,109192,1466,,,...,,48.1,,0.4,,51.9,,0.4,,
2,0500000US01015,"Calhoun County, Alabama",113605,,*****,*****,54285,701,,,...,,47.8,,0.3,,52.2,,0.3,,
3,0500000US01043,"Cullman County, Alabama",83768,,*****,*****,40579,905,,,...,,49.0,,0.6,,51.0,,0.6,,
4,0500000US01049,"DeKalb County, Alabama",71513,,*****,*****,35688,884,,,...,,49.6,,0.9,,50.4,,0.9,,


In [103]:
# Let's export the merged_ACS DataFrame to a CSV file.
output_file_path = 'merged_ACS.csv'
merged_ACS.to_csv(output_file_path, index=False)

In [104]:
# Assuming 'merged_ACS' is my merged DataFrame
columns_to_check = ['DP05_0001E_2019', 'DP05_0001E_2021', 'DP05_0001E_2022']
for column in columns_to_check:
    print(f"Column {column} exists: {column in merged_ACS.columns}")

Column DP05_0001E_2019 exists: True
Column DP05_0001E_2021 exists: True
Column DP05_0001E_2022 exists: True


In [105]:
#CLEANING 

# Get the headers from the second row, which contains the descriptive text.
descriptive_headers = merged_ACS.iloc[0]
header_dict = descriptive_headers.to_dict()

# Filter out the columns that contain 'Estimate!!' and also include 'Geography' and 'Geographic Area Name'
estimate_columns = [col for col, desc in header_dict.items() if "Estimate!!" in str(desc) and not desc.startswith("Annotation of Estimate!!")]


# We need to make sure to include 'GEO_ID' and 'NAME' columns which might not contain 'Estimate!!'
estimate_columns.extend(['GEO_ID', 'NAME_2019', 'NAME_2021', 'NAME_2022'])



In [106]:
# Ensure 'GEO_ID' and 'NAME_20xx' are at the beginning of the estimate_columns list
if 'GEO_ID' in merged_ACS.columns:
    estimate_columns.insert(0, 'GEO_ID')
if 'NAME_2019' in merged_ACS.columns:
    estimate_columns.insert(1, 'NAME_2019')
if 'NAME_2021' in merged_ACS.columns:
    estimate_columns.insert(2, 'NAME_2021')
if 'NAME_2022' in merged_ACS.columns:
    estimate_columns.insert(3, 'NAME_2022')


##observation: some county data is missing in diff years!

In [107]:

filtered_ACS = merged_ACS[estimate_columns]
filtered_ACS.head()


Unnamed: 0,GEO_ID,NAME_2019,NAME_2021,NAME_2022,DP05_0001E_2019,DP05_0002E_2019,DP05_0003E_2019,DP05_0004E_2019,DP05_0005E_2019,DP05_0006E_2019,...,DP05_0086E_2022,DP05_0087E_2022,DP05_0088E_2022,DP05_0089E_2022,DP05_0090E_2022,DP05_0091E_2022,GEO_ID.1,NAME_2019.1,NAME_2021.1,NAME_2022.1
0,Geography,Geographic Area Name,Geographic Area Name,Geographic Area Name,Estimate!!SEX AND AGE!!Total population,Estimate!!SEX AND AGE!!Total population!!Male,Estimate!!SEX AND AGE!!Total population!!Female,Estimate!!SEX AND AGE!!Total population!!Sex r...,Estimate!!SEX AND AGE!!Total population!!Under...,Estimate!!SEX AND AGE!!Total population!!5 to ...,...,Estimate!!HISPANIC OR LATINO AND RACE!!Total p...,Estimate!!HISPANIC OR LATINO AND RACE!!Total p...,Estimate!!Total housing units,"Estimate!!CITIZEN, VOTING AGE POPULATION!!Citi...","Estimate!!CITIZEN, VOTING AGE POPULATION!!Citi...","Estimate!!CITIZEN, VOTING AGE POPULATION!!Citi...",Geography,Geographic Area Name,Geographic Area Name,Geographic Area Name
1,0500000US01003,"Baldwin County, Alabama","Baldwin County, Alabama","Baldwin County, Alabama",223234,109192,114042,95.7,10616,12826,...,1825,9670,132299,189876,91296,98580,0500000US01003,"Baldwin County, Alabama","Baldwin County, Alabama","Baldwin County, Alabama"
2,0500000US01015,"Calhoun County, Alabama","Calhoun County, Alabama","Calhoun County, Alabama",113605,54285,59320,91.5,6699,5534,...,330,2334,53408,90642,43301,47341,0500000US01015,"Calhoun County, Alabama","Calhoun County, Alabama","Calhoun County, Alabama"
3,0500000US01043,"Cullman County, Alabama","Cullman County, Alabama","Cullman County, Alabama",83768,40579,43189,94.0,5310,4563,...,897,1872,39893,69096,33827,35269,0500000US01043,"Cullman County, Alabama","Cullman County, Alabama","Cullman County, Alabama"
4,0500000US01049,"DeKalb County, Alabama","DeKalb County, Alabama","DeKalb County, Alabama",71513,35688,35825,99.6,4578,4292,...,453,2053,31022,52110,25847,26263,0500000US01049,"DeKalb County, Alabama","DeKalb County, Alabama","DeKalb County, Alabama"


In [93]:
# Assuming filtered_ACS' is my clean DataFrame
columns_to_check = ['DP05_0001E_2019', 'DP05_0001E_2021', 'DP05_0001E_2022']
for column in columns_to_check:
    print(f"Column {column} exists: {column in filtered_ACS.columns}")

Column DP05_0001E_2019 exists: True
Column DP05_0001E_2021 exists: True
Column DP05_0001E_2022 exists: True


In [109]:
# Let's export the filtered_ACS DataFrame to a CSV file.
output_file_path = 'filtered_ACS.csv'
filtered_ACS.to_csv(output_file_path, index=False)
