In [14]:
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
import seaborn as sns
from sklearn.linear_model import LassoCV

In [15]:
# Set pandas display options to show up to 100 rows
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_colwidth', None)

# I - Census 2021 DEMOGRAPHICS Data 
## a. (cleaning and converting individual state file into usable data)

In [16]:
# Load in Alabama DEMOGRAPHIC file for all districts
data = pd.read_csv('data/ALLCD_DP05_AL.csv', dtype={'PROFLN': str})
data.rename(columns = {'GEONAME' :'District'}, inplace=True)

# make a dictionary of PROFLN keys to TITLE values
title_dict = dict(zip(data['PROFLN'], data['TITLE']))

# convert the dictionary to dataframe and observe 
title_df = pd.DataFrame(list(title_dict.items()), columns=['PROFLN', 'TITLE'])
title_df.head(20)

Unnamed: 0,PROFLN,TITLE
0,0.0,ACS DEMOGRAPHIC AND HOUSING ESTIMATES
1,0.5,
2,0.8,SEX AND AGE
3,1.0,Total population
4,2.0,Male
5,3.0,Female
6,4.0,Sex ratio (males per 100 females)
7,4.3,
8,5.0,Under 5 years
9,6.0,5 to 9 years


### In the above table, you can see that the PROFLN number is associated with a Title which will eventually become our columns headers.
### By combining titles we get more accurate information (i.e. index 4 will become a combination of PROFLN 1&2 to create the title "Total Population - Male"

In [17]:
title_dict

{'0': 'ACS DEMOGRAPHIC AND HOUSING ESTIMATES',
 '0.5': nan,
 '0.8': 'SEX AND AGE',
 '1': 'Total population',
 '2': 'Male',
 '3': 'Female',
 '4': 'Sex ratio (males per 100 females)',
 '4.3': nan,
 '5': 'Under 5 years',
 '6': '5 to 9 years',
 '7': '10 to 14 years',
 '8': '15 to 19 years',
 '9': '20 to 24 years',
 '10': '25 to 34 years',
 '11': '35 to 44 years',
 '12': '45 to 54 years',
 '13': '55 to 59 years',
 '14': '60 to 64 years',
 '15': '65 to 74 years',
 '16': '75 to 84 years',
 '17': '85 years and over',
 '17.3': nan,
 '18': 'Median age (years)',
 '18.3': nan,
 '19': 'Under 18 years',
 '20': '16 years and over',
 '21': '18 years and over',
 '22': '21 years and over',
 '23': '62 years and over',
 '24': '65 years and over',
 '24.3': nan,
 '25': '18 years and over',
 '26': 'Male',
 '27': 'Female',
 '28': 'Sex ratio (males per 100 females)',
 '28.3': nan,
 '29': '65 years and over',
 '30': 'Male',
 '31': 'Female',
 '32': 'Sex ratio (males per 100 females)',
 '32.3': nan,
 '32.5': 'RAC

In [18]:
## Combining PROFLN numbers which will become new TITLEs

# "Total Population-" Category
for i in range(2, 25):
    data.loc[data['PROFLN'] == str(i), 'PROFLN'] = '1-' + str(i)

# "Total Population- 18 years and over-" Category
for i in range(25, 29):
    data.loc[data['PROFLN'] == str(i), 'PROFLN'] = '1-25' + ('' if i == 25 else '-' + str(i))

#"Total Population- 65 years and over-" Category
for i in range(29, 33):
    data.loc[data['PROFLN'] == str(i), 'PROFLN'] = '1-29' + ('' if i == 29 else '-' + str(i))

# "RACE-" Category
for i in range(33, 37):
    data.loc[data['PROFLN'] == str(i), 'PROFLN'] = '32.5-' + str(i)

# "RACE- One race-" Category
for i in range(37, 58):
    data.loc[data['PROFLN'] == str(i), 'PROFLN'] = '32.5-36-' + str(i)

# "RACE- Two or more races-" Category
for i in range(58, 63):
    data.loc[data['PROFLN'] == str(i), 'PROFLN'] = '32.5-58' + ('' if i == 58 else '-' + str(i))

#"RACE- Race alone or in combination with one or more other races-" Category
for i in range(63, 70):
    data.loc[data['PROFLN'] == str(i), 'PROFLN'] = '32.5-62.5-' + str(i)

# 'HISPANIC OR LATINO AND RACE-" Category
for i in range(70, 86):
    data.loc[data['PROFLN'] == str(i), 'PROFLN'] = '69.5-' + str(i)

# 'CITIZEN, VOTING AGE POPULATION' Category
for i in range(87, 90):
    data.loc[data['PROFLN'] == str(i), 'PROFLN'] = '86.5-' + str(i)

In [19]:
# Translate new profln numbers to combo titles
def translate_profln(profln):
    # split the PROFLN value into individual keys
    keys = profln.split('-')

    # translate each key from the title_dict
    titles = [str(title_dict.get(key, '')) for key in keys]

    # concat translated title strings
    translated_title = ' - '.join(title for title in titles if title)

    return translated_title

# replace TITLE values with their new names using function
data['TITLE'] = data['PROFLN'].apply(translate_profln)
data['TITLE'].value_counts()


TITLE
nan                                                                                                              84
RACE - Two or more races                                                                                         14
RACE - One race                                                                                                  14
Total population - 65 years and over                                                                             14
Total population - 18 years and over                                                                             14
RACE - One race - Native Hawaiian                                                                                 7
RACE - Two or more races - Black or African American and American Indian and Alaska Native                        7
RACE - One race - Other Asian                                                                                     7
RACE - Race alone or in combination with one or more other races -

In [20]:
# remove rows with NaN's, placeholder 'N' for 'Not Large Enough Sample Size', and placeholder '(X)'' rows
invalid_rows = data['PRF_ESTIMATE'].isin(['N', '(X)']) | data['PRF_ESTIMATE'].isna()

# filter out these rows from the Dataframe
data = data[~invalid_rows]



In [21]:
# remove rows of duplicate information or strong overlap

# List of PROFLN values to remove (many values are repeats from other categories and subcategories)
profln_to_remove = ['1-29', '1-25', '32.5-33', '32.5-36', '32.5-58', 
                    '32.5-62.5-63', '32.5-62.5-64', '32.5-62.5-65', 
                    '32.5-62.5-66', '32.5-62.5-67', '32.5-62.5-68', 
                    '32.5-62.5-69', '69.5-70', '69.5-76', '69.5-77', 
                    '69.5-78', '69.5-79', '69.5-80', '69.5-81', '69.5-82']

# filter the data to observe which rows we are removing
rows_to_remove = data[data['PROFLN'].isin(profln_to_remove)]
print(rows_to_remove['TITLE'])

# Remove these rows from the DataFrame
data = data[~data['PROFLN'].isin(profln_to_remove)]

31                                               Total population - 18 years and over
36                                               Total population - 65 years and over
42                                                            RACE - Total population
46                                                                    RACE - One race
68                                                           RACE - Two or more races
                                            ...                                      
734                     HISPANIC OR LATINO AND RACE - Black or African American alone
735             HISPANIC OR LATINO AND RACE - American Indian and Alaska Native alone
736                                         HISPANIC OR LATINO AND RACE - Asian alone
737    HISPANIC OR LATINO AND RACE - Native Hawaiian and Other Pacific Islander alone
738                               HISPANIC OR LATINO AND RACE - Some other race alone
Name: TITLE, Length: 138, dtype: object


In [22]:
# Delete margin of error columns
data = data.drop(columns=['PRF_MG_ERROR', 'PCT_MG_ERROR'])

# Some MEDIAN and MEAN values are not listed in PCT_ESTIMATE column, but are relevant
data.loc[data['PCT_ESTIMATE'] == '(X)', 'PCT_ESTIMATE'] = data['PRF_ESTIMATE']

# Convert all the values to numeric
data['PCT_ESTIMATE'] = data['PCT_ESTIMATE'].str.replace(',', '').astype(float)
data['PRF_ESTIMATE'] = data['PRF_ESTIMATE'].str.replace(',', '').astype(float)

data

Unnamed: 0,TBLID,GEOID,District,PROFLN,TITLE,PRF_ESTIMATE,PCT_ESTIMATE
3,DP05,5001800US0101,"Congressional District 1 (118th Congress), Alabama",1,Total population,727212.0,727212.0
4,DP05,5001800US0101,"Congressional District 1 (118th Congress), Alabama",1-2,Total population - Male,353457.0,48.6
5,DP05,5001800US0101,"Congressional District 1 (118th Congress), Alabama",1-3,Total population - Female,373755.0,51.4
6,DP05,5001800US0101,"Congressional District 1 (118th Congress), Alabama",1-4,Total population - Sex ratio (males per 100 females),94.6,94.6
8,DP05,5001800US0101,"Congressional District 1 (118th Congress), Alabama",1-5,Total population - Under 5 years,41746.0,5.7
...,...,...,...,...,...,...,...
741,DP05,5001800US0107,"Congressional District 7 (118th Congress), Alabama",69.5-85,"HISPANIC OR LATINO AND RACE - Two races excluding Some other race, and Three or more races",14020.0,2.0
743,DP05,5001800US0107,"Congressional District 7 (118th Congress), Alabama",86,Total housing units,344719.0,344719.0
746,DP05,5001800US0107,"Congressional District 7 (118th Congress), Alabama",86.5-87,"CITIZEN, VOTING AGE POPULATION - Citizen, 18 and over population",546972.0,546972.0
747,DP05,5001800US0107,"Congressional District 7 (118th Congress), Alabama",86.5-88,"CITIZEN, VOTING AGE POPULATION - Male",252332.0,46.1


In [23]:
## Converting the long district titles to their state and district codes ('AL-01' = 'Alabama District 1')
# State Mapping
states = {
    'Alabama': 'AL', 'Alaska': 'AK', 'Arizona': 'AZ', 'Arkansas': 'AR', 'California': 'CA',
    'Colorado': 'CO', 'Connecticut': 'CT', 'Delaware': 'DE', 'Florida': 'FL', 'Georgia': 'GA',
    'Hawaii': 'HI', 'Idaho': 'ID', 'Illinois': 'IL', 'Indiana': 'IN', 'Iowa': 'IA',
    'Kansas': 'KS', 'Kentucky': 'KY', 'Louisiana': 'LA', 'Maine': 'ME', 'Maryland': 'MD',
    'Massachusetts': 'MA', 'Michigan': 'MI', 'Minnesota': 'MN', 'Mississippi': 'MS',
    'Missouri': 'MO', 'Montana': 'MT', 'Nebraska': 'NE', 'Nevada': 'NV', 'New Hampshire': 'NH',
    'New Jersey': 'NJ', 'New Mexico': 'NM', 'New York': 'NY', 'North Carolina': 'NC',
    'North Dakota': 'ND', 'Ohio': 'OH', 'Oklahoma': 'OK', 'Oregon': 'OR', 'Pennsylvania': 'PA',
    'Rhode Island': 'RI', 'South Carolina': 'SC', 'South Dakota': 'SD', 'Tennessee': 'TN',
    'Texas': 'TX', 'Utah': 'UT', 'Vermont': 'VT', 'Virginia': 'VA', 'Washington': 'WA',
    'West Virginia': 'WV', 'Wisconsin': 'WI', 'Wyoming': 'WY'
}

def format_district(district):
    # Extract the state name and district number from the district string
    match = re.search(r'Congressional District (\d+|\(at Large\)) \(118th Congress\), ([A-Za-z\s]+)', district)
    if match:
        district_num, state_name = match.groups()
        # Convert the district number to 'AL' for 'at large' districts
        if district_num == '(at Large)':
            district_num = 'AL'
        else:
            district_num = district_num.zfill(2)  # Pad single digit numbers with a leading zero
        # Convert state name to its abbreviation
        state_abbr = states.get(state_name.strip(), state_name)
        # Return the formatted district name
        return f'{state_abbr}-{district_num}'
    
# Apply the function to the 'District' column in both dataframes to replace values with their short-hand
data['Formatted_District'] = data['District'].apply(format_district)



In [24]:
#ensure all districts have the same number of values
data['Formatted_District'].value_counts()


Formatted_District
AL-01    61
AL-02    61
AL-03    61
AL-04    61
AL-05    61
AL-06    61
AL-07    61
Name: count, dtype: int64

In [25]:
# Pivot the dataframe separately into two dataframes with 'PRF_ESTIMATE' (raw numbers) and 'PCT_ESTIMATE' (%'s)
prf_estimate_df = data.pivot_table(index=['Formatted_District', 'GEOID'], columns='TITLE', values='PRF_ESTIMATE')
pct_estimate_df = data.pivot_table(index=['Formatted_District', 'GEOID'], columns='TITLE', values='PCT_ESTIMATE')

# Reset the column names after pivot
prf_estimate_df.columns = [f'{col}' for col in prf_estimate_df.columns]
pct_estimate_df.columns = [f'{col}' for col in pct_estimate_df.columns]

# Reset the index
prf_estimate_df.reset_index(inplace=True)
pct_estimate_df.reset_index(inplace=True)

prf_estimate_df

Unnamed: 0,Formatted_District,GEOID,"CITIZEN, VOTING AGE POPULATION - Citizen, 18 and over population","CITIZEN, VOTING AGE POPULATION - Female","CITIZEN, VOTING AGE POPULATION - Male",HISPANIC OR LATINO AND RACE - Cuban,HISPANIC OR LATINO AND RACE - Hispanic or Latino (of any race),HISPANIC OR LATINO AND RACE - Mexican,HISPANIC OR LATINO AND RACE - Other Hispanic or Latino,HISPANIC OR LATINO AND RACE - Puerto Rican,...,Total population - 65 years and over - Male,Total population - 65 years and over - Sex ratio (males per 100 females),Total population - 75 to 84 years,Total population - 85 years and over,Total population - Female,Total population - Male,Total population - Median age (years),Total population - Sex ratio (males per 100 females),Total population - Under 18 years,Total population - Under 5 years
0,AL-01,5001800US0101,555123.0,289663.0,265460.0,1480.0,25346.0,14082.0,6571.0,3213.0,...,61808.0,82.8,40508.0,12282.0,373755.0,353457.0,41.0,94.6,162658.0,41746.0
1,AL-02,5001800US0102,539010.0,284652.0,254358.0,479.0,29072.0,18554.0,5484.0,4555.0,...,54456.0,76.9,38134.0,12806.0,370483.0,345729.0,39.0,93.3,165222.0,41997.0
2,AL-03,5001800US0103,558552.0,289423.0,269129.0,766.0,22960.0,11728.0,6468.0,3998.0,...,55915.0,79.6,39234.0,8784.0,368853.0,352436.0,40.0,95.5,152113.0,39510.0
3,AL-04,5001800US0104,541258.0,280884.0,260374.0,1322.0,52766.0,37454.0,11988.0,2002.0,...,59392.0,79.6,40315.0,12003.0,369637.0,353433.0,40.5,95.6,165303.0,45914.0
4,AL-05,5001800US0105,557827.0,285421.0,272406.0,1481.0,41789.0,20557.0,13835.0,5916.0,...,54646.0,78.2,37413.0,12440.0,370374.0,359010.0,40.5,96.9,157595.0,37683.0
5,AL-06,5001800US0106,531046.0,277760.0,253286.0,1180.0,36253.0,24216.0,9624.0,1233.0,...,52246.0,79.7,35741.0,11321.0,363521.0,342976.0,39.6,94.3,161580.0,41408.0
6,AL-07,5001800US0107,546972.0,294640.0,252332.0,1024.0,29248.0,18521.0,8699.0,1004.0,...,50770.0,71.7,32485.0,12120.0,377358.0,338855.0,37.5,89.8,155471.0,41833.0


# I - Census 2021 DEMOGRAPHICS Data 
## b. (converting previous steps into functions to apply and join all 50 state files)

In [26]:
#LOAD FUNCTION

def load_and_rename_data(csv):
    data = pd.read_csv(csv, dtype={'PROFLN': str})
    data.rename(columns = {'GEONAME' :'District'}, inplace=True)
    title_dict = dict(zip(data['PROFLN'], data['TITLE']))
    return data, title_dict

In [27]:
#UPDATE PROFLN FUNCTION

def update_profln_values(data):
    for i in range(2, 25):
        data.loc[data['PROFLN'] == str(i), 'PROFLN'] = '1-' + str(i)
    for i in range(25, 29):
        data.loc[data['PROFLN'] == str(i), 'PROFLN'] = '1-25' + ('' if i == 25 else '-' + str(i))
    for i in range(29, 33):
        data.loc[data['PROFLN'] == str(i), 'PROFLN'] = '1-29' + ('' if i == 29 else '-' + str(i))
    for i in range(33, 37):
        data.loc[data['PROFLN'] == str(i), 'PROFLN'] = '32.5-' + str(i)
    for i in range(37, 58):
        data.loc[data['PROFLN'] == str(i), 'PROFLN'] = '32.5-36-' + str(i)
    for i in range(58, 63):
        data.loc[data['PROFLN'] == str(i), 'PROFLN'] = '32.5-58' + ('' if i == 58 else '-' + str(i))
    for i in range(63, 70):
        data.loc[data['PROFLN'] == str(i), 'PROFLN'] = '32.5-62.5-' + str(i)
    for i in range(70, 86):
        data.loc[data['PROFLN'] == str(i), 'PROFLN'] = '69.5-' + str(i)
    for i in range(87, 90):
        data.loc[data['PROFLN'] == str(i), 'PROFLN'] = '86.5-' + str(i)
    return data

In [36]:
## REFORM TITLES FUNCTION

def translate_profln(profln, title_dict):
    # Split the PROFLN value into keys
    keys = profln.split('-')

    # Translate each key using the title_dict, ensuring that all elements are converted to strings
    titles = [str(title_dict.get(key, '')) for key in keys]

    # Concatenate the translated titles
    translated_title = ' - '.join(title for title in titles if title)

    return translated_title

In [29]:
# CLEAN UP DATA AND DROP REDUNDANT INFO FUNCTION

def clean_up_data(data):
    # drop unneccessary rows and rows with placeholders
    invalid_rows = data['PRF_ESTIMATE'].isin(['N', '(X)']) | data['PRF_ESTIMATE'].isna()
    data = data[~invalid_rows]

    # The rows associated with these index markers contain duplicate data
    profln_to_remove = ['1-29', '1-25', '32.5-33', '32.5-36', '32.5-58', 
                        '32.5-62.5-63', '32.5-62.5-64', '32.5-62.5-65', 
                        '32.5-62.5-66', '32.5-62.5-67', '32.5-62.5-68', 
                        '32.5-62.5-69', '69.5-70', '69.5-76', '69.5-77', 
                        '69.5-78', '69.5-79', '69.5-80', '69.5-81', '69.5-82']
    data = data[~data['PROFLN'].isin(profln_to_remove)]

    # Delete the margin of error columns
    data = data.drop(columns=['PRF_MG_ERROR', 'PCT_MG_ERROR'])

    # Replace '(X)' values in PCT_ESTIMATE with their PRF_ESTIMATE values (average and median values, usually)
    data.loc[data['PCT_ESTIMATE'] == '(X)', 'PCT_ESTIMATE'] = data['PRF_ESTIMATE']

    # convert to numeric
    data['PCT_ESTIMATE'] = data['PCT_ESTIMATE'].str.replace(',', '').astype(float)
    data['PRF_ESTIMATE'] = data['PRF_ESTIMATE'].str.replace(',', '').astype(float)
    
    return data


In [30]:
# CONVERT TO STATE-DISTRICT FUNCTION

def format_districts(data):
    # Mapping of state names to abbreviations
    states = {
        'Alabama': 'AL', 'Alaska': 'AK', 'Arizona': 'AZ', 'Arkansas': 'AR', 'California': 'CA',
        'Colorado': 'CO', 'Connecticut': 'CT', 'Delaware': 'DE', 'Florida': 'FL', 'Georgia': 'GA',
        'Hawaii': 'HI', 'Idaho': 'ID', 'Illinois': 'IL', 'Indiana': 'IN', 'Iowa': 'IA',
        'Kansas': 'KS', 'Kentucky': 'KY', 'Louisiana': 'LA', 'Maine': 'ME', 'Maryland': 'MD',
        'Massachusetts': 'MA', 'Michigan': 'MI', 'Minnesota': 'MN', 'Mississippi': 'MS',
        'Missouri': 'MO', 'Montana': 'MT', 'Nebraska': 'NE', 'Nevada': 'NV', 'New Hampshire': 'NH',
        'New Jersey': 'NJ', 'New Mexico': 'NM', 'New York': 'NY', 'North Carolina': 'NC',
        'North Dakota': 'ND', 'Ohio': 'OH', 'Oklahoma': 'OK', 'Oregon': 'OR', 'Pennsylvania': 'PA',
        'Rhode Island': 'RI', 'South Carolina': 'SC', 'South Dakota': 'SD', 'Tennessee': 'TN',
        'Texas': 'TX', 'Utah': 'UT', 'Vermont': 'VT', 'Virginia': 'VA', 'Washington': 'WA',
        'West Virginia': 'WV', 'Wisconsin': 'WI', 'Wyoming': 'WY'
    }

    # function to format a single district
    def format_district(district):
        # extract the state name and district number from the district string
        match = re.search(r'Congressional District (\d+|\(at Large\)) \(118th Congress\), ([A-Za-z\s]+)', district)
        if match:
            district_num, state_name = match.groups()
            # convert the district number to 'AL' for 'at large' districts
            if district_num == '(at Large)':
                district_num = 'AL'
            else:
                district_num = district_num.zfill(2)  # pad single digit numbers with a leading zero
            # convert the state name to its abbreviation
            state_abbr = states.get(state_name.strip(), state_name)
            # return the formatted district name
            return f'{state_abbr}-{district_num}'
        else:
            return district

    # Apply the function to the District column
    data['Formatted_District'] = data['District'].apply(format_district)

    return data


In [32]:
## TRANSPOSE DATAFRAME FUNCTION
def reorient_dfs(data):
    # Pivot the dataframe separately for 'PRF_ESTIMATE' and 'PCT_ESTIMATE'
    prf_estimate_df = data.pivot_table(index=['Formatted_District', 'GEOID'], columns='TITLE', values='PRF_ESTIMATE')
    pct_estimate_df = data.pivot_table(index=['Formatted_District', 'GEOID'], columns='TITLE', values='PCT_ESTIMATE')

    # Reset the column names after pivot
    prf_estimate_df.columns = [f'{col}' for col in prf_estimate_df.columns]
    pct_estimate_df.columns = [f'{col}' for col in pct_estimate_df.columns]

    # Reset the index
    prf_estimate_df.reset_index(inplace=True)
    pct_estimate_df.reset_index(inplace=True)

    return prf_estimate_df, pct_estimate_df


# I- Census 2021 DEMOGRAPHIC Data
## c. process all 50 states demographic files

In [37]:
## PROCESS ALL 50 STATES FUNCTION
def process_demographics(file_state_list):
    prf_dfs = []
    pct_dfs = []
    
    for file_state in file_state_list:
        csv, state = file_state

        data, title_dict = load_and_rename_data(csv)
        data = update_profln_values(data)
        data['TITLE'] = data['PROFLN'].apply(lambda x: translate_profln(x, title_dict))
        data = clean_up_data(data)
        data = format_districts(data)

        prf_estimate_df, pct_estimate_df = reorient_dfs(data)

        prf_dfs.append(prf_estimate_df)
        pct_dfs.append(pct_estimate_df)
    
    df_prfdemo_all = pd.concat(prf_dfs, ignore_index=True)
    df_pctdemo_all = pd.concat(pct_dfs, ignore_index=True)
    
    return df_prfdemo_all, df_pctdemo_all


In [38]:
file_state_list = [
 ('data/ALLCD_DP05_AK.csv', 'AK'),('data/ALLCD_DP05_AL.csv', 'AL'),('data/ALLCD_DP05_AR.csv', 'AR'),
 ('data/ALLCD_DP05_AZ.csv', 'AZ'),('data/ALLCD_DP05_CA.csv', 'CA'),('data/ALLCD_DP05_CO.csv', 'CO'),
 ('data/ALLCD_DP05_CT.csv', 'CT'),('data/ALLCD_DP05_DE.csv', 'DE'),('data/ALLCD_DP05_FL.csv', 'FL'),
 ('data/ALLCD_DP05_GA.csv', 'GA'),('data/ALLCD_DP05_HI.csv', 'HI'),('data/ALLCD_DP05_IA.csv', 'IA'),
 ('data/ALLCD_DP05_ID.csv', 'ID'),('data/ALLCD_DP05_IL.csv', 'IL'),('data/ALLCD_DP05_IN.csv', 'IN'),
 ('data/ALLCD_DP05_KS.csv', 'KS'),('data/ALLCD_DP05_KY.csv', 'KY'),('data/ALLCD_DP05_LA.csv', 'LA'),
 ('data/ALLCD_DP05_MA.csv', 'MA'),('data/ALLCD_DP05_MD.csv', 'MD'),('data/ALLCD_DP05_ME.csv', 'ME'),
 ('data/ALLCD_DP05_MI.csv', 'MI'),('data/ALLCD_DP05_MN.csv', 'MN'),('data/ALLCD_DP05_MO.csv', 'MO'),
 ('data/ALLCD_DP05_MS.csv', 'MS'),('data/ALLCD_DP05_MT.csv', 'MT'),('data/ALLCD_DP05_NC.csv', 'NC'),
 ('data/ALLCD_DP05_ND.csv', 'ND'),('data/ALLCD_DP05_NE.csv', 'NE'),('data/ALLCD_DP05_NH.csv', 'NH'),
 ('data/ALLCD_DP05_NJ.csv', 'NJ'),('data/ALLCD_DP05_NM.csv', 'NM'),('data/ALLCD_DP05_NV.csv', 'NV'),
 ('data/ALLCD_DP05_NY.csv', 'NY'),('data/ALLCD_DP05_OH.csv', 'OH'),('data/ALLCD_DP05_OK.csv', 'OK'),
 ('data/ALLCD_DP05_OR.csv', 'OR'),('data/ALLCD_DP05_PA.csv', 'PA'),('data/ALLCD_DP05_RI.csv', 'RI'),
 ('data/ALLCD_DP05_SC.csv', 'SC'),('data/ALLCD_DP05_SD.csv', 'SD'),('data/ALLCD_DP05_TN.csv', 'TN'),
 ('data/ALLCD_DP05_TX.csv', 'TX'),('data/ALLCD_DP05_UT.csv', 'UT'),('data/ALLCD_DP05_VA.csv', 'VA'),
 ('data/ALLCD_DP05_VT.csv', 'VT'),('data/ALLCD_DP05_WA.csv', 'WA'),('data/ALLCD_DP05_WI.csv', 'WI'),
 ('data/ALLCD_DP05_WV.csv', 'WV'),('data/ALLCD_DP05_WY.csv', 'WY')
]


df_prfdemo_all, df_pctdemo_all = process_demographics(file_state_list)


In [39]:
df_prfdemo_all

Unnamed: 0,Formatted_District,GEOID,"CITIZEN, VOTING AGE POPULATION - Citizen, 18 and over population","CITIZEN, VOTING AGE POPULATION - Female","CITIZEN, VOTING AGE POPULATION - Male",HISPANIC OR LATINO AND RACE - Cuban,HISPANIC OR LATINO AND RACE - Hispanic or Latino (of any race),HISPANIC OR LATINO AND RACE - Mexican,HISPANIC OR LATINO AND RACE - Other Hispanic or Latino,HISPANIC OR LATINO AND RACE - Puerto Rican,...,Total population - 65 years and over - Male,Total population - 65 years and over - Sex ratio (males per 100 females),Total population - 75 to 84 years,Total population - 85 years and over,Total population - Female,Total population - Male,Total population - Median age (years),Total population - Sex ratio (males per 100 females),Total population - Under 18 years,Total population - Under 5 years
0,AK-AL,5001800US0200,533852.0,252485.0,281367.0,3182.0,52291.0,22299.0,18157.0,8653.0,...,49676.0,101.9,24410.0,6709.0,349552.0,383121.0,35.6,109.6,179401.0,46198.0
1,AL-01,5001800US0101,555123.0,289663.0,265460.0,1480.0,25346.0,14082.0,6571.0,3213.0,...,61808.0,82.8,40508.0,12282.0,373755.0,353457.0,41.0,94.6,162658.0,41746.0
2,AL-02,5001800US0102,539010.0,284652.0,254358.0,479.0,29072.0,18554.0,5484.0,4555.0,...,54456.0,76.9,38134.0,12806.0,370483.0,345729.0,39.0,93.3,165222.0,41997.0
3,AL-03,5001800US0103,558552.0,289423.0,269129.0,766.0,22960.0,11728.0,6468.0,3998.0,...,55915.0,79.6,39234.0,8784.0,368853.0,352436.0,40.0,95.5,152113.0,39510.0
4,AL-04,5001800US0104,541258.0,280884.0,260374.0,1322.0,52766.0,37454.0,11988.0,2002.0,...,59392.0,79.6,40315.0,12003.0,369637.0,353433.0,40.5,95.6,165303.0,45914.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
430,WI-07,5001800US5507,579658.0,284696.0,294962.0,675.0,19067.0,11203.0,5014.0,2175.0,...,77436.0,93.4,45479.0,17297.0,365784.0,378859.0,44.9,103.6,157839.0,36524.0
431,WI-08,5001800US5508,562233.0,281545.0,280688.0,548.0,44334.0,29112.0,7778.0,6896.0,...,62612.0,87.9,38348.0,14223.0,365825.0,371023.0,41.1,101.4,162548.0,40347.0
432,WV-01,5001800US5401,696591.0,357316.0,339275.0,404.0,9502.0,3745.0,3927.0,1426.0,...,88221.0,83.6,55153.0,19269.0,447881.0,434099.0,44.0,96.9,180487.0,43443.0
433,WV-02,5001800US5402,712176.0,356501.0,355675.0,491.0,20848.0,7160.0,9278.0,3919.0,...,81831.0,87.8,51215.0,16552.0,447718.0,453261.0,41.6,101.2,180905.0,45341.0
