In [1]:
# Load necessary libraries
%load_ext autotime
import pandas as pd
import requests
from bs4 import BeautifulSoup
from census import Census
import os
from dotenv import load_dotenv


time: 2.42 s (started: 2024-11-07 10:56:16 -05:00)


In [2]:
# Load the variables defined in the local `key.env` file, then read the
# Census API key from the process environment (None when API_KEY is not set).
load_dotenv(dotenv_path='key.env')
api_key = os.environ.get('API_KEY')

time: 0 ns (started: 2024-11-07 10:56:19 -05:00)


In [3]:
# Census variable codes (table B02001) requested for every year, in the
# column order used for the output. The trailing comments give each
# variable's label.
race_variable = [
    'B02001_001E',  # Estimate!!Total
    'B02001_002E',  # Estimate!!Total!!White alone
    'B02001_003E',  # Estimate!!Total!!Black or African American alone
    'B02001_004E',  # Estimate!!Total!!American Indian and Alaska Native alone
    'B02001_005E',  # Estimate!!Total!!Asian alone
    'B02001_006E',  # Estimate!!Total!!Native Hawaiian and Other Pacific Islander alone
    'B02001_007E',  # Estimate!!Total!!Some other race alone
    'B02001_008E',  # Estimate!!Total!!Two or more races
    'B02001_009E',  # Estimate!!Total!!Two or more races!!Two races including Some other race
    'B02001_010E',  # Estimate!!Total!!Two or more races!!Two races excluding Some other race, and three or more races
]



time: 15 ms (started: 2024-11-07 10:56:19 -05:00)


In [4]:
def fetch_variable_labels(year, timeout=30):
    """Scrape the variable-code -> label table for one ACS 1-year vintage.

    Parameters
    ----------
    year : int or str
        Vintage interpolated into the ``.../{year}/acs/acs1/variables.html`` URL.
    timeout : float or tuple, optional
        Timeout in seconds handed to ``requests.get`` so a stalled connection
        cannot block the caller forever. Defaults to 30.

    Returns
    -------
    dict
        Maps the stripped text of the first cell of each table row to the
        stripped text of the second cell. The dict is empty when the request
        fails, the response status is not 200, or the page has no ``<table>``.
    """
    variables_url = f'https://api.census.gov/data/{year}/acs/acs1/variables.html'

    try:
        data_response = requests.get(variables_url, timeout=timeout)
    except requests.RequestException as exc:
        # Treat a transport failure like a missing page: report it and let
        # the caller move on to the next year.
        print(f"Could not fetch variable labels for {year}: {exc}")
        return {}

    if data_response.status_code != 200:
        return {}

    soup = BeautifulSoup(data_response.content, 'html.parser')
    table = soup.find('table')  # only the first table on the page is used
    if table is None:
        return {}

    variable_label_mapping = {}
    for row in table.find_all('tr')[1:]:  # [1:] skips the header row
        cols = row.find_all('td')
        if len(cols) >= 2:
            variable_code = cols[0].text.strip()
            variable_label = cols[1].text.strip()
            variable_label_mapping[variable_code] = variable_label

    return variable_label_mapping


time: 15 ms (started: 2024-11-07 10:56:19 -05:00)


In [5]:
def fetch_census_data(year, variable_label_mapping, state=48, timeout=30):
    """Fetch the `race_variable` estimates for one state and one ACS 1-year vintage.

    Parameters
    ----------
    year : int or str
        Vintage interpolated into the API URL; stored as-is under ``'YEAR'``.
    variable_label_mapping : dict
        Variable code -> column label. A code missing from the mapping is
        used as its own label.
    state : int or str, optional
        Value used in the ``for=state:`` clause of the query. Defaults to 48.
    timeout : float or tuple, optional
        Timeout in seconds handed to ``requests.get``. Defaults to 30.

    Returns
    -------
    dict or None
        ``{'YEAR': ..., 'NAME': ..., <label>: int-or-None, ...}`` built from
        the first data row of the response. ``None`` when the request fails,
        the status is not 200, the body is not JSON, or there is no data row.
    """
    url = (
        f'https://api.census.gov/data/{year}/acs/acs1'
        f'?get=NAME,{",".join(race_variable)}&for=state:{state}'
    )
    # Only send a key when one is configured; otherwise the literal text
    # "None" would be submitted as the key.
    if api_key:
        url += f'&key={api_key}'

    try:
        data_response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Census request failed for {year}: {exc}")
        return None

    if data_response.status_code != 200:
        return None

    try:
        data = data_response.json()
    except ValueError as exc:
        # A 200 response whose body is not JSON cannot be turned into a row.
        print(f"Census response for {year} was not valid JSON: {exc}")
        return None

    # Expected shape: [header_row, data_row, ...]
    if not isinstance(data, list) or len(data) < 2:
        return None

    # Look values up by column name rather than by position so the result
    # does not depend on the order in which the API returns columns.
    record = dict(zip(data[0], data[1]))

    row_dict = {
        'YEAR': year,
        'NAME': record.get('NAME'),
    }
    for var_code in race_variable:
        label = variable_label_mapping.get(var_code, var_code)
        raw_value = record.get(var_code)
        # A missing or null estimate stays None instead of crashing int().
        row_dict[label] = int(raw_value) if raw_value is not None else None
    return row_dict
        

time: 16 ms (started: 2024-11-07 10:56:19 -05:00)


In [6]:
def collect_all_years_data(start_year=1926, end_year=2026):
    """Collect one row of `race_variable` estimates per year into a DataFrame.

    The variable labels are looked up year by year only until a page yields
    at least one of the requested codes. That mapping is then reused for
    every later year, so the column names stay the same across the frame.

    Parameters
    ----------
    start_year, end_year : int
        Inclusive range of years to try. Years for which
        ``fetch_census_data`` returns nothing are skipped.

    Returns
    -------
    pandas.DataFrame or None
        Columns are ``YEAR``, ``NAME`` and one column per entry of
        `race_variable`, named by its label or by the code itself when no
        label was found. ``None`` when no year produced data.
    """
    all_data = []
    first_year_labels = None

    for year in range(start_year, end_year + 1):
        print(f"Processing year {year}...")

        # The label page is only needed until a usable mapping exists;
        # fetching and parsing it again for later years is wasted work.
        if first_year_labels is None:
            variable_labels = fetch_variable_labels(year) or {}
            wanted_labels = {
                code: label
                for code, label in variable_labels.items()
                if code in race_variable
            }
            # An empty mapping must not latch, or every later year is skipped.
            first_year_labels = wanted_labels or None

        if first_year_labels:
            year_data = fetch_census_data(year, first_year_labels)
            if year_data:
                all_data.append(year_data)

    if not all_data:
        return None

    df = pd.DataFrame(all_data)

    # Same fallback as fetch_census_data: a code without a label is its own
    # column name, so a partial mapping cannot raise KeyError here.
    column_order = ['YEAR', 'NAME'] + [
        first_year_labels.get(var, var) for var in race_variable
    ]
    return df[column_order]

time: 16 ms (started: 2024-11-07 10:56:19 -05:00)


In [7]:
# Run the full collection and write the result to CSV.
print("Starting Data Collection...")
df = collect_all_years_data()

if df is not None:
    # NOTE(review): the file name says "acs5" but the requests in this
    # notebook go to the acs1 endpoints. The path is left unchanged so
    # existing consumers of the file keep working; confirm and rename.
    output_files = "TX_census_data_with_labels_acs5.csv"
    df.to_csv(output_files, index=False)
    print(f"Saved {len(df)} rows to {output_files}")
else:
    # collect_all_years_data returns None when no year produced data.
    print("No data collected; nothing was written.")

Starting Data Coleection...
Processing year 1926...
Processing year 1927...
Processing year 1928...
Processing year 1929...
Processing year 1930...
Processing year 1931...
Processing year 1932...
Processing year 1933...
Processing year 1934...
Processing year 1935...
Processing year 1936...
Processing year 1937...
Processing year 1938...
Processing year 1939...
Processing year 1940...
Processing year 1941...
Processing year 1942...
Processing year 1943...
Processing year 1944...
Processing year 1945...
Processing year 1946...
Processing year 1947...
Processing year 1948...
Processing year 1949...
Processing year 1950...
Processing year 1951...
Processing year 1952...
Processing year 1953...
Processing year 1954...
Processing year 1955...
Processing year 1956...
Processing year 1957...
Processing year 1958...
Processing year 1959...
Processing year 1960...
Processing year 1961...
Processing year 1962...
Processing year 1963...
Processing year 1964...
Processing year 1965...
Processing y