In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
# Set pandas display options to show up to 100 rows
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_colwidth', None)

In [3]:
data = pd.read_csv('data/upper/ALLSLDU_DP05_AL.csv', dtype={'Line Number': str})

In [4]:
# Load in Alabama DEMOGRAPHIC file for all districts

data.rename(columns={
    'GEONAME': 'District',
    'Line Number': 'PROFLN',
    'Title':'TITLE',
    'Estimate': 'PRF_ESTIMATE',
    'Percent Estimate':'PCT_ESTIMATE', 
    'Margin of Error': 'PRF_MG_ERROR',
    'Percent Margin of Error': 'PCT_MG_ERROR'
}, inplace=True)

# 'Title':'TITLE',
#     'TITLE': 'Title',
# make a dictionary of Line Number keys to Title values
title_dict = dict(zip(data['PROFLN'], data['TITLE']))

# convert the dictionary to dataframe and observe 
title_df = pd.DataFrame(list(title_dict.items()), columns=['PROFLN', 'TITLE'])
title_df.head(10)

Unnamed: 0,PROFLN,TITLE
0,0.0,ACS DEMOGRAPHIC AND HOUSING ESTIMATES
1,0.5,
2,0.8,SEX AND AGE
3,1.0,Total population
4,2.0,Male
5,3.0,Female
6,4.0,Sex ratio (males per 100 females)
7,4.3,
8,5.0,Under 5 years
9,6.0,5 to 9 years


In [5]:
title_dict.values()

dict_values(['ACS DEMOGRAPHIC AND HOUSING ESTIMATES', nan, 'SEX AND AGE', 'Total population', 'Male', 'Female', 'Sex ratio (males per 100 females)', nan, 'Under 5 years', '5 to 9 years', '10 to 14 years', '15 to 19 years', '20 to 24 years', '25 to 34 years', '35 to 44 years', '45 to 54 years', '55 to 59 years', '60 to 64 years', '65 to 74 years', '75 to 84 years', '85 years and over', nan, 'Median age (years)', nan, 'Under 18 years', '16 years and over', '18 years and over', '21 years and over', '62 years and over', '65 years and over', nan, '18 years and over', 'Male', 'Female', 'Sex ratio (males per 100 females)', nan, '65 years and over', 'Male', 'Female', 'Sex ratio (males per 100 females)', nan, 'RACE', 'Total population', 'One race', 'Two or more races', nan, 'One race', 'White', 'Black or African American', 'American Indian and Alaska Native', 'Cherokee tribal grouping', 'Chippewa tribal grouping', 'Navajo tribal grouping', 'Sioux tribal grouping', 'Asian', 'Asian Indian', 'Chin

In [6]:
title_dict

{'0': 'ACS DEMOGRAPHIC AND HOUSING ESTIMATES',
 '0.5': nan,
 '0.8': 'SEX AND AGE',
 '1': 'Total population',
 '2': 'Male',
 '3': 'Female',
 '4': 'Sex ratio (males per 100 females)',
 '4.3': nan,
 '5': 'Under 5 years',
 '6': '5 to 9 years',
 '7': '10 to 14 years',
 '8': '15 to 19 years',
 '9': '20 to 24 years',
 '10': '25 to 34 years',
 '11': '35 to 44 years',
 '12': '45 to 54 years',
 '13': '55 to 59 years',
 '14': '60 to 64 years',
 '15': '65 to 74 years',
 '16': '75 to 84 years',
 '17': '85 years and over',
 '17.3': nan,
 '18': 'Median age (years)',
 '18.3': nan,
 '19': 'Under 18 years',
 '20': '16 years and over',
 '21': '18 years and over',
 '22': '21 years and over',
 '23': '62 years and over',
 '24': '65 years and over',
 '24.3': nan,
 '25': '18 years and over',
 '26': 'Male',
 '27': 'Female',
 '28': 'Sex ratio (males per 100 females)',
 '28.3': nan,
 '29': '65 years and over',
 '30': 'Male',
 '31': 'Female',
 '32': 'Sex ratio (males per 100 females)',
 '32.3': nan,
 '32.5': 'RAC

In [7]:
## Combining PROFLN numbers which will become new TITLEs

# "Total Population-" Category
for i in range(2, 25):
    data.loc[data['PROFLN'] == str(i), 'PROFLN'] = '1-' + str(i)

# "Total Population- 18 years and over-" Category
for i in range(25, 29):
    data.loc[data['PROFLN'] == str(i), 'PROFLN'] = '1-25' + ('' if i == 25 else '-' + str(i))

#"Total Population- 65 years and over-" Category
for i in range(29, 33):
    data.loc[data['PROFLN'] == str(i), 'PROFLN'] = '1-29' + ('' if i == 29 else '-' + str(i))

# "RACE-" Category
for i in range(33, 37):
    data.loc[data['PROFLN'] == str(i), 'PROFLN'] = '32.5-' + str(i)

# "RACE- One race-" Category
for i in range(37, 58):
    data.loc[data['PROFLN'] == str(i), 'PROFLN'] = '32.5-36-' + str(i)

# "RACE- Two or more races-" Category
for i in range(58, 63):
    data.loc[data['PROFLN'] == str(i), 'PROFLN'] = '32.5-58' + ('' if i == 58 else '-' + str(i))

#"RACE- Race alone or in combination with one or more other races-" Category
for i in range(63, 70):
    data.loc[data['PROFLN'] == str(i), 'PROFLN'] = '32.5-62.5-' + str(i)

# 'HISPANIC OR LATINO AND RACE-" Category
for i in range(70, 86):
    data.loc[data['PROFLN'] == str(i), 'PROFLN'] = '69.5-' + str(i)

# 'CITIZEN, VOTING AGE POPULATION' Category
for i in range(87, 90):
    data.loc[data['PROFLN'] == str(i), 'PROFLN'] = '86.5-' + str(i)

In [8]:
# Translate new profln numbers to combo titles
def translate_profln(profln):
    # split the PROFLN value into individual keys
    keys = profln.split('-')

    # translate each key from the title_dict
    titles = [str(title_dict.get(key, '')) for key in keys]

    # concat translated title strings
    translated_title = ' - '.join(title for title in titles if title)

    return translated_title

# replace TITLE values with their new names using function
data['TITLE'] = data['PROFLN'].apply(translate_profln)
data['TITLE'].value_counts()


TITLE
nan                                                                                                              420
RACE - Two or more races                                                                                          70
RACE - One race                                                                                                   70
Total population - 65 years and over                                                                              70
Total population - 18 years and over                                                                              70
RACE - One race - Native Hawaiian                                                                                 35
RACE - Two or more races - Black or African American and American Indian and Alaska Native                        35
RACE - One race - Other Asian                                                                                     35
RACE - Race alone or in combination with one or more other

In [9]:
# remove rows with NaN's, placeholder 'N' for 'Not Large Enough Sample Size', and placeholder '(X)'' rows
invalid_rows = data['PRF_ESTIMATE'].isin(['N', '(X)']) | data['PRF_ESTIMATE'].isna()

# filter out these rows from the Dataframe
data = data[~invalid_rows]



In [10]:
# remove rows of duplicate information or strong overlap

# List of PROFLN values to remove (many values are repeats from other categories and subcategories)
profln_to_remove = ['1-29', '1-25', '32.5-33', '32.5-36', '32.5-58', 
                    '32.5-62.5-63', '32.5-62.5-64', '32.5-62.5-65', 
                    '32.5-62.5-66', '32.5-62.5-67', '32.5-62.5-68', 
                    '32.5-62.5-69', '69.5-70', '69.5-76', '69.5-77', 
                    '69.5-78', '69.5-79', '69.5-80', '69.5-81', '69.5-82']

# filter the data to observe which rows we are removing
rows_to_remove = data[data['PROFLN'].isin(profln_to_remove)]
print(rows_to_remove['TITLE'])

# Remove these rows from the DataFrame
data = data[~data['PROFLN'].isin(profln_to_remove)]

31                                                Total population - 18 years and over
36                                                Total population - 65 years and over
42                                                             RACE - Total population
46                                                                     RACE - One race
68                                                            RACE - Two or more races
                                             ...                                      
3730                     HISPANIC OR LATINO AND RACE - Black or African American alone
3731             HISPANIC OR LATINO AND RACE - American Indian and Alaska Native alone
3732                                         HISPANIC OR LATINO AND RACE - Asian alone
3733    HISPANIC OR LATINO AND RACE - Native Hawaiian and Other Pacific Islander alone
3734                               HISPANIC OR LATINO AND RACE - Some other race alone
Name: TITLE, Length: 700, dtype: object


In [11]:
# Delete margin of error columns
data = data.drop(columns=['PRF_MG_ERROR', 'PCT_MG_ERROR'])

# Some MEDIAN and MEAN values are not listed in PCT_ESTIMATE column, but are relevant
data.loc[data['PCT_ESTIMATE'] == '(X)', 'PCT_ESTIMATE'] = data['PRF_ESTIMATE']

# Convert all the values to numeric
data['PCT_ESTIMATE'] = data['PCT_ESTIMATE'].str.replace(',', '').astype(float)
data['PRF_ESTIMATE'] = data['PRF_ESTIMATE'].str.replace(',', '').astype(float)

data

Unnamed: 0,GEOID,District,PROFLN,TITLE,PRF_ESTIMATE,PCT_ESTIMATE
3,610U800US01001,"State Senate District 1 (2022), Alabama",1,Total population,144420.0,144420.0
4,610U800US01001,"State Senate District 1 (2022), Alabama",1-2,Total population - Male,70028.0,48.5
5,610U800US01001,"State Senate District 1 (2022), Alabama",1-3,Total population - Female,74392.0,51.5
6,610U800US01001,"State Senate District 1 (2022), Alabama",1-4,Total population - Sex ratio (males per 100 females),94.1,94.1
8,610U800US01001,"State Senate District 1 (2022), Alabama",1-5,Total population - Under 5 years,7209.0,5.0
...,...,...,...,...,...,...
3737,610U800US01035,"State Senate District 35 (2022), Alabama",69.5-85,"HISPANIC OR LATINO AND RACE - Two races excluding Some other race, and Three or more races",2714.0,2.0
3739,610U800US01035,"State Senate District 35 (2022), Alabama",86,Total housing units,58835.0,58835.0
3742,610U800US01035,"State Senate District 35 (2022), Alabama",86.5-87,"CITIZEN, VOTING AGE POPULATION - Citizen, 18 and over population",101758.0,101758.0
3743,610U800US01035,"State Senate District 35 (2022), Alabama",86.5-88,"CITIZEN, VOTING AGE POPULATION - Male",48181.0,47.3


In [12]:
## Converting the long district titles to their state and district codes ('AL-01' = 'Alabama District 1')
# State Mapping
states = {
    'Alabama': 'AL', 'Alaska': 'AK', 'Arizona': 'AZ', 'Arkansas': 'AR', 'California': 'CA',
    'Colorado': 'CO', 'Connecticut': 'CT', 'Delaware': 'DE', 'Florida': 'FL', 'Georgia': 'GA',
    'Hawaii': 'HI', 'Idaho': 'ID', 'Illinois': 'IL', 'Indiana': 'IN', 'Iowa': 'IA',
    'Kansas': 'KS', 'Kentucky': 'KY', 'Louisiana': 'LA', 'Maine': 'ME', 'Maryland': 'MD',
    'Massachusetts': 'MA', 'Michigan': 'MI', 'Minnesota': 'MN', 'Mississippi': 'MS',
    'Missouri': 'MO', 'Montana': 'MT', 'Nebraska': 'NE', 'Nevada': 'NV', 'New Hampshire': 'NH',
    'New Jersey': 'NJ', 'New Mexico': 'NM', 'New York': 'NY', 'North Carolina': 'NC',
    'North Dakota': 'ND', 'Ohio': 'OH', 'Oklahoma': 'OK', 'Oregon': 'OR', 'Pennsylvania': 'PA',
    'Rhode Island': 'RI', 'South Carolina': 'SC', 'South Dakota': 'SD', 'Tennessee': 'TN',
    'Texas': 'TX', 'Utah': 'UT', 'Vermont': 'VT', 'Virginia': 'VA', 'Washington': 'WA',
    'West Virginia': 'WV', 'Wisconsin': 'WI', 'Wyoming': 'WY'
}

def format_senate_district(district):
    # Regular expression to extract the district number and state from the new format
    match = re.search(r'State Senate District (\d+) \(\d+\), ([A-Za-z\s]+)', district)
    if match:
        district_num, state_name = match.groups()
        district_num = district_num.zfill(2)  # Pad the district number with leading zeros if necessary
        state_abbr = states.get(state_name.strip(), state_name)  # Get the state abbreviation
        return f'{state_abbr}-Sen-{district_num}'  # Format it as required
    return district  # Return the original if no match found


data['Formatted_District'] = data['District'].apply(format_senate_district)

In [13]:
#ensure all districts have the same number of values
data['Formatted_District'].value_counts()


Formatted_District
AL-Sen-01    65
AL-Sen-27    65
AL-Sen-21    65
AL-Sen-22    65
AL-Sen-23    65
AL-Sen-24    65
AL-Sen-25    65
AL-Sen-26    65
AL-Sen-28    65
AL-Sen-19    65
AL-Sen-29    65
AL-Sen-30    65
AL-Sen-31    65
AL-Sen-32    65
AL-Sen-33    65
AL-Sen-34    65
AL-Sen-20    65
AL-Sen-18    65
AL-Sen-02    65
AL-Sen-09    65
AL-Sen-03    65
AL-Sen-04    65
AL-Sen-05    65
AL-Sen-06    65
AL-Sen-07    65
AL-Sen-08    65
AL-Sen-10    65
AL-Sen-17    65
AL-Sen-11    65
AL-Sen-12    65
AL-Sen-13    65
AL-Sen-14    65
AL-Sen-15    65
AL-Sen-16    65
AL-Sen-35    65
Name: count, dtype: int64

In [14]:
# Pivot the dataframe separately into two dataframes with 'PRF_ESTIMATE' (raw numbers) and 'PCT_ESTIMATE' (%'s)
prf_estimate_df = data.pivot_table(index=['Formatted_District', 'GEOID'], columns='TITLE', values='PRF_ESTIMATE')
pct_estimate_df = data.pivot_table(index=['Formatted_District', 'GEOID'], columns='TITLE', values='PCT_ESTIMATE')

# Reset the column names after pivot
prf_estimate_df.columns = [f'{col}' for col in prf_estimate_df.columns]
pct_estimate_df.columns = [f'{col}' for col in pct_estimate_df.columns]

# Reset the index
prf_estimate_df.reset_index(inplace=True)
pct_estimate_df.reset_index(inplace=True)

prf_estimate_df

Unnamed: 0,Formatted_District,GEOID,"CITIZEN, VOTING AGE POPULATION - Citizen, 18 and over population","CITIZEN, VOTING AGE POPULATION - Female","CITIZEN, VOTING AGE POPULATION - Male",HISPANIC OR LATINO AND RACE - Cuban,HISPANIC OR LATINO AND RACE - Hispanic or Latino (of any race),HISPANIC OR LATINO AND RACE - Mexican,HISPANIC OR LATINO AND RACE - Other Hispanic or Latino,HISPANIC OR LATINO AND RACE - Puerto Rican,...,Total population - 65 years and over - Male,Total population - 65 years and over - Sex ratio (males per 100 females),Total population - 75 to 84 years,Total population - 85 years and over,Total population - Female,Total population - Male,Total population - Median age (years),Total population - Sex ratio (males per 100 females),Total population - Under 18 years,Total population - Under 5 years
0,AL-Sen-01,610U800US01001,113137.0,58694.0,54443.0,283.0,5335.0,4073.0,682.0,297.0,...,12026.0,81.1,8383.0,2322.0,74392.0,70028.0,41.4,94.1,29418.0,7209.0
1,AL-Sen-02,610U800US01002,110858.0,55307.0,55551.0,168.0,9178.0,4724.0,2193.0,2093.0,...,8016.0,83.1,5398.0,1740.0,74024.0,75866.0,36.2,102.5,34273.0,7307.0
2,AL-Sen-03,610U800US01003,109617.0,56710.0,52907.0,307.0,12025.0,7599.0,2984.0,1135.0,...,10685.0,79.1,6993.0,2643.0,75076.0,73194.0,40.0,97.5,34540.0,9186.0
3,AL-Sen-04,610U800US01004,107936.0,54844.0,53092.0,165.0,5460.0,3868.0,1194.0,233.0,...,12327.0,83.7,8943.0,2191.0,70579.0,69592.0,42.4,98.6,30683.0,7948.0
4,AL-Sen-05,610U800US01005,104142.0,53002.0,51140.0,56.0,2642.0,1611.0,797.0,178.0,...,11753.0,84.1,7849.0,2427.0,67554.0,67891.0,41.9,100.5,30380.0,7563.0
5,AL-Sen-06,610U800US01006,109876.0,56733.0,53143.0,249.0,10245.0,6237.0,3433.0,326.0,...,11405.0,76.2,8237.0,2650.0,73638.0,70689.0,41.3,96.0,31875.0,8699.0
6,AL-Sen-07,610U800US01007,114223.0,59492.0,54731.0,323.0,7030.0,4278.0,1488.0,941.0,...,11457.0,76.6,8122.0,2830.0,76588.0,72117.0,40.2,94.2,31713.0,9811.0
7,AL-Sen-08,610U800US01008,113191.0,57590.0,55601.0,56.0,8726.0,4630.0,3694.0,346.0,...,12047.0,87.6,7630.0,1946.0,75302.0,74201.0,41.4,98.5,33074.0,8090.0
8,AL-Sen-09,610U800US01009,105286.0,54200.0,51086.0,75.0,18207.0,10402.0,6807.0,923.0,...,10837.0,75.9,7400.0,2962.0,74595.0,73350.0,39.6,98.3,36296.0,9782.0
9,AL-Sen-10,610U800US01010,113324.0,59005.0,54319.0,239.0,10021.0,6094.0,3240.0,448.0,...,12710.0,79.3,8539.0,2908.0,76816.0,72828.0,41.7,94.8,32616.0,8219.0


In [15]:
#LOAD FUNCTION

def load_and_rename_data(csv):
    data = pd.read_csv(csv, dtype={'Line Number': str})
    data.rename(columns={
            'GEONAME': 'District',
            'Line Number': 'PROFLN',
            'Title': 'TITLE',
            'Estimate': 'PRF_ESTIMATE',
            'Percent Estimate':'PCT_ESTIMATE', 
            'Margin of Error': 'PRF_MG_ERROR',
            'Percent Margin of Error': 'PCT_MG_ERROR'
             }, inplace=True)
     # Filter out rows where 'District' mentions "not defined"
    data = data[~data['District'].str.contains("not defined", case=False, na=False)]
    return data

In [16]:
## translate PROFLN number into associated title
def translate_profln(profln, title_dict):
    # split the PROFLN value into keys by '-' for multi-labeled categories
    keys = profln.split('-')
    # translate each key using the title_dict
    titles = [str(title_dict.get(key, '')) for key in keys]
    # concatenate the translated titles
    translated_title = ' - '.join(title for title in titles if title)

    return translated_title

In [17]:
def update_titles_demo(data):
    # Create the title_dict
    title_dict = dict(zip(data['PROFLN'], data['TITLE']))
    # Update PROFLN values
    for i in range(2, 25):
        data.loc[data['PROFLN'] == str(i), 'PROFLN'] = '1-' + str(i)
    for i in range(25, 29):
        data.loc[data['PROFLN'] == str(i), 'PROFLN'] = '1-25' + ('' if i == 25 else '-' + str(i))
    for i in range(29, 33):
        data.loc[data['PROFLN'] == str(i), 'PROFLN'] = '1-29' + ('' if i == 29 else '-' + str(i))
    for i in range(33, 37):
        data.loc[data['PROFLN'] == str(i), 'PROFLN'] = '32.5-' + str(i)
    for i in range(37, 58):
        data.loc[data['PROFLN'] == str(i), 'PROFLN'] = '32.5-36-' + str(i)
    for i in range(58, 63):
        data.loc[data['PROFLN'] == str(i), 'PROFLN'] = '32.5-58' + ('' if i == 58 else '-' + str(i))
    for i in range(63, 70):
        data.loc[data['PROFLN'] == str(i), 'PROFLN'] = '32.5-62.5-' + str(i)
    for i in range(70, 86):
        data.loc[data['PROFLN'] == str(i), 'PROFLN'] = '69.5-' + str(i)
    for i in range(87, 90):
        data.loc[data['PROFLN'] == str(i), 'PROFLN'] = '86.5-' + str(i)

    # Update TITLE values
    data['TITLE'] = data['PROFLN'].apply(lambda x: translate_profln(x, title_dict)) #using translate_profln function

    return data


In [18]:
# CLEAN UP DATA AND DROP REDUNDANT INFO FUNCTION

def clean_up_data_demo(data):
    # drop unneccessary rows and rows with placeholders
    invalid_rows = data['PRF_ESTIMATE'].isin(['N', '(X)']) | data['PRF_ESTIMATE'].isna()
    data = data[~invalid_rows]

    # The rows associated with these index markers (profln) contain duplicated data
    profln_to_remove = ['1-29', '1-25', '32.5-33', '32.5-36', '32.5-58', 
                        '32.5-62.5-63', '32.5-62.5-64', '32.5-62.5-65', 
                        '32.5-62.5-66', '32.5-62.5-67', '32.5-62.5-68', 
                        '32.5-62.5-69', '69.5-70', '69.5-76', '69.5-77', 
                        '69.5-78', '69.5-79', '69.5-80', '69.5-81', '69.5-82']
    data = data[~data['PROFLN'].isin(profln_to_remove)]

    # Delete the margin of error columns
    data = data.drop(columns=['PRF_MG_ERROR', 'PCT_MG_ERROR'])

    # Replace '(X)' values in PCT_ESTIMATE with their PRF_ESTIMATE values (usually, average and median values)
    data.loc[data['PCT_ESTIMATE'] == '(X)', 'PCT_ESTIMATE'] = data['PRF_ESTIMATE']

    # convert to numeric
    data['PCT_ESTIMATE'] = data['PCT_ESTIMATE'].str.replace(',', '').astype(float)
    data['PRF_ESTIMATE'] = data['PRF_ESTIMATE'].str.replace(',', '').astype(float)
    
    return data


In [19]:
# SHORTHAND STATE-DISTRICT FUNCTION

def format_districts(data):
    # dictionary of state names to abbreviations
    states = {
        'Alabama': 'AL', 'Alaska': 'AK', 'Arizona': 'AZ', 'Arkansas': 'AR', 'California': 'CA',
        'Colorado': 'CO', 'Connecticut': 'CT', 'Delaware': 'DE', 'Florida': 'FL', 'Georgia': 'GA',
        'Hawaii': 'HI', 'Idaho': 'ID', 'Illinois': 'IL', 'Indiana': 'IN', 'Iowa': 'IA',
        'Kansas': 'KS', 'Kentucky': 'KY', 'Louisiana': 'LA', 'Maine': 'ME', 'Maryland': 'MD',
        'Massachusetts': 'MA', 'Michigan': 'MI', 'Minnesota': 'MN', 'Mississippi': 'MS',
        'Missouri': 'MO', 'Montana': 'MT', 'Nebraska': 'NE', 'Nevada': 'NV', 'New Hampshire': 'NH',
        'New Jersey': 'NJ', 'New Mexico': 'NM', 'New York': 'NY', 'North Carolina': 'NC',
        'North Dakota': 'ND', 'Ohio': 'OH', 'Oklahoma': 'OK', 'Oregon': 'OR', 'Pennsylvania': 'PA',
        'Rhode Island': 'RI', 'South Carolina': 'SC', 'South Dakota': 'SD', 'Tennessee': 'TN',
        'Texas': 'TX', 'Utah': 'UT', 'Vermont': 'VT', 'Virginia': 'VA', 'Washington': 'WA',
        'West Virginia': 'WV', 'Wisconsin': 'WI', 'Wyoming': 'WY'
    }

    # function to format a single district
    def format_district(district):
        # extract the state name and district number from the district string
        match = re.search(r'State Senate District (\d+) \(\d+\), ([A-Za-z\s]+)', district)
        if match:
            district_num, state_name = match.groups()
            district_num = district_num.zfill(2)  # Pad the district number with leading zeros if necessary
            state_abbr = states.get(state_name.strip(), state_name)  # Get the state abbreviation
            return f'{state_abbr}-Sen-{district_num}'  # Format it as required
        else:
            return district

    # Apply the function to the District column
    data['Formatted_District'] = data['District'].apply(format_district)

    return data


In [20]:
## TRANSPOSE DATAFRAME FUNCTION
def reorient_dfs(data):
    # Pivot the dataframe separately for 'PRF_ESTIMATE' and 'PCT_ESTIMATE'
    prf_estimate_df = data.pivot_table(index=['Formatted_District', 'GEOID'], columns='TITLE', values='PRF_ESTIMATE')
    pct_estimate_df = data.pivot_table(index=['Formatted_District', 'GEOID'], columns='TITLE', values='PCT_ESTIMATE')

    # Reset the column names after pivot
    prf_estimate_df.columns = [f'{col}' for col in prf_estimate_df.columns]
    pct_estimate_df.columns = [f'{col}' for col in pct_estimate_df.columns]

    # Reset the index
    prf_estimate_df.reset_index(inplace=True)
    pct_estimate_df.reset_index(inplace=True)

    return prf_estimate_df, pct_estimate_df


# Process all 50 states demographic characteristics

In [21]:
## PROCESS ALL 50 STATES FUNCTION
def process_demographics(file_state_list):
    prf_dfs = []
    pct_dfs = []
    
    for file_state in file_state_list:
        csv, state = file_state

        data = load_and_rename_data(csv)
        data = update_titles_demo(data)
        data = clean_up_data_demo(data)
        data = format_districts(data)

        prf_estimate_df, pct_estimate_df = reorient_dfs(data)

        prf_dfs.append(prf_estimate_df)
        pct_dfs.append(pct_estimate_df)
    
    df_prfdemo_all = pd.concat(prf_dfs, ignore_index=True)
    df_pctdemo_all = pd.concat(pct_dfs, ignore_index=True)
    
    return df_prfdemo_all, df_pctdemo_all


In [22]:
file_state_list = [
 ('data/upper/ALLSLDU_DP05_AK.csv', 'AK'),('data/upper/ALLSLDU_DP05_AL.csv', 'AL'),('data/upper/ALLSLDU_DP05_AR.csv', 'AR'),
 ('data/upper/ALLSLDU_DP05_AZ.csv', 'AZ'),('data/upper/ALLSLDU_DP05_CA.csv', 'CA'),('data/upper/ALLSLDU_DP05_CO.csv', 'CO'),
 ('data/upper/ALLSLDU_DP05_CT.csv', 'CT'),('data/upper/ALLSLDU_DP05_DE.csv', 'DE'),('data/upper/ALLSLDU_DP05_FL.csv', 'FL'),
 ('data/upper/ALLSLDU_DP05_GA.csv', 'GA'),('data/upper/ALLSLDU_DP05_HI.csv', 'HI'),('data/upper/ALLSLDU_DP05_IA.csv', 'IA'),
 ('data/upper/ALLSLDU_DP05_ID.csv', 'ID'),('data/upper/ALLSLDU_DP05_IL.csv', 'IL'),('data/upper/ALLSLDU_DP05_IN.csv', 'IN'),
 ('data/upper/ALLSLDU_DP05_KS.csv', 'KS'),('data/upper/ALLSLDU_DP05_KY.csv', 'KY'),('data/upper/ALLSLDU_DP05_LA.csv', 'LA'),
 ('data/upper/ALLSLDU_DP05_MA.csv', 'MA'),('data/upper/ALLSLDU_DP05_MD.csv', 'MD'),('data/upper/ALLSLDU_DP05_ME.csv', 'ME'),
 ('data/upper/ALLSLDU_DP05_MI.csv', 'MI'),('data/upper/ALLSLDU_DP05_MN.csv', 'MN'),('data/upper/ALLSLDU_DP05_MO.csv', 'MO'),
 ('data/upper/ALLSLDU_DP05_MS.csv', 'MS'),('data/upper/ALLSLDU_DP05_MT.csv', 'MT'),('data/upper/ALLSLDU_DP05_NC.csv', 'NC'),
 ('data/upper/ALLSLDU_DP05_ND.csv', 'ND'),('data/upper/ALLSLDU_DP05_NE.csv', 'NE'),('data/upper/ALLSLDU_DP05_NH.csv', 'NH'),
 ('data/upper/ALLSLDU_DP05_NJ.csv', 'NJ'),('data/upper/ALLSLDU_DP05_NM.csv', 'NM'),('data/upper/ALLSLDU_DP05_NV.csv', 'NV'),
 ('data/upper/ALLSLDU_DP05_NY.csv', 'NY'),('data/upper/ALLSLDU_DP05_OH.csv', 'OH'),('data/upper/ALLSLDU_DP05_OK.csv', 'OK'),
 ('data/upper/ALLSLDU_DP05_OR.csv', 'OR'),('data/upper/ALLSLDU_DP05_PA.csv', 'PA'),('data/upper/ALLSLDU_DP05_RI.csv', 'RI'),
 ('data/upper/ALLSLDU_DP05_SC.csv', 'SC'),('data/upper/ALLSLDU_DP05_SD.csv', 'SD'),('data/upper/ALLSLDU_DP05_TN.csv', 'TN'),
 ('data/upper/ALLSLDU_DP05_TX.csv', 'TX'),('data/upper/ALLSLDU_DP05_UT.csv', 'UT'),('data/upper/ALLSLDU_DP05_VA.csv', 'VA'),
 ('data/upper/ALLSLDU_DP05_VT.csv', 'VT'),('data/upper/ALLSLDU_DP05_WA.csv', 'WA'),('data/upper/ALLSLDU_DP05_WI.csv', 'WI'),
 ('data/upper/ALLSLDU_DP05_WV.csv', 'WV'),('data/upper/ALLSLDU_DP05_WY.csv', 'WY')
]


df_prfdemo_all, df_pctdemo_all = process_demographics(file_state_list)


In [23]:
df_prfdemo_all.columns

Index(['Formatted_District', 'GEOID',
       'CITIZEN, VOTING AGE POPULATION - Citizen, 18 and over population',
       'CITIZEN, VOTING AGE POPULATION - Female',
       'CITIZEN, VOTING AGE POPULATION - Male',
       'HISPANIC OR LATINO AND RACE - Cuban',
       'HISPANIC OR LATINO AND RACE - Hispanic or Latino (of any race)',
       'HISPANIC OR LATINO AND RACE - Mexican',
       'HISPANIC OR LATINO AND RACE - Other Hispanic or Latino',
       'HISPANIC OR LATINO AND RACE - Puerto Rican',
       'HISPANIC OR LATINO AND RACE - Two or more races',
       'HISPANIC OR LATINO AND RACE - Two races excluding Some other race, and Three or more races',
       'HISPANIC OR LATINO AND RACE - Two races including Some other race',
       'RACE - One race',
       'RACE - One race - American Indian and Alaska Native',
       'RACE - One race - Asian', 'RACE - One race - Asian Indian',
       'RACE - One race - Black or African American',
       'RACE - One race - Chamorro', 'RACE - One race - Chi

In [24]:
df_pctdemo_all.shape

(1942, 67)

# Housing Characteristics

In [25]:
## LOAD_AND_RENAME_DATA FUNCTION REMAINS THE SAME
AL_housing_df = load_and_rename_data('data/upper/ALLSLDU_DP04_AL.csv')

In [26]:
#Profln values and their corresponding TITLE/associations have changed. Revamping title formatting
def update_titles_housing(data):
    # Create the title_dict
    title_dict = dict(zip(data['PROFLN'], data['TITLE']))
    
    #Update PROFLN numbers by category
    for i in range(1, 6): 
        data.loc[data['PROFLN'] == str(i), 'PROFLN'] = '0.8-' + str(i) # "HOUSING OCCUPANCY-" Category
    for i in range(6, 16): 
        data.loc[data['PROFLN'] == str(i), 'PROFLN'] = '5.5-' + str(i) # "UNITS IN STRUCTURE-" Category
    for i in range(16, 27): 
        data.loc[data['PROFLN'] == str(i), 'PROFLN'] = '15.5-' + str(i) # "YEAR STRUCTURE BUILT-" Category
    for i in range(27, 38): 
        data.loc[data['PROFLN'] == str(i), 'PROFLN'] = '26.5-' + str(i) # "ROOMS-" Category
    for i in range(39, 45): 
        data.loc[data['PROFLN'] == str(i), 'PROFLN'] = '37.5-' + str(i) # "BEDROOMS-" Category
    for i in range(45, 57): 
        data.loc[data['PROFLN'] == str(i), 'PROFLN'] = '44.5-' + str(i) # "HOUSING TENURE-" Category
    for i in range(57, 62): 
        data.loc[data['PROFLN'] == str(i), 'PROFLN'] = '56.5-' + str(i) # "VEHICLES AVAILABLE-" Category
    for i in range(62, 72): 
        data.loc[data['PROFLN'] == str(i), 'PROFLN'] = '61.5-' + str(i) # "HOUSE HEATING FUEL-" Category
    for i in range(72, 76): 
        data.loc[data['PROFLN'] == str(i), 'PROFLN'] = '71.5-' + str(i) # "SELECTED CHARACTERISTICS-" Category
    for i in range(76, 80): 
        data.loc[data['PROFLN'] == str(i), 'PROFLN'] = '75.5-' + str(i) # "OCCUPANTS PER ROOM-" Category
    for i in range(80, 90): 
        data.loc[data['PROFLN'] == str(i), 'PROFLN'] = '79.5-' + str(i)  # "VALUE-" Category
    for i in range(90, 93): 
        data.loc[data['PROFLN'] == str(i), 'PROFLN'] = '89.5-' + str(i)   # "MORTGAGE STATUS-" Category
    for i in range(126, 136):
        data.loc[data['PROFLN'] == str(i), 'PROFLN'] = '125.5-' + str(i) # "GROSS RENT-" Category
        
    ###LONG TITLES BELOW###
    
    # "SELECTED MONTHLY OWNER COSTS (SMOC)- Housing units with a mortgage- " Category
    for i in range(93, 102):
        data.loc[data['PROFLN'] == str(i), 'PROFLN'] = '92.5-93' + ('' if i == 93 else '-' + str(i))
    # "SELECTED MONTHLY OWNER COSTS (SMOC)- Housing units without a mortgage- " Category
    for i in range(102, 110):
        data.loc[data['PROFLN'] == str(i), 'PROFLN'] = '92.5-102' + ('' if i == 102 else '-' + str(i))  
    # "SELECTED MONTHLY OWNER COSTS AS A PERCENTAGE OF HOUSEHOLD INCOME (SMOCAPI)- 
    # Housing units with a mortgage (excluding units where SMOCAPI cannot be computed)-" Category
    for i in range(110, 116):
        data.loc[data['PROFLN'] == str(i), 'PROFLN'] = '109.5-110' + ('' if i == 110 else '-' + str(i))  
    # "SELECTED MONTHLY OWNER COSTS AS A PERCENTAGE OF HOUSEHOLD INCOME (SMOCAPI)-
    # Housing unit without a mortgage (excluding units where SMOCAPI cannot be computed)-" Category
    for i in range(117, 125):
        data.loc[data['PROFLN'] == str(i), 'PROFLN'] = '109.5-117' + ('' if i == 117 else '-' + str(i))
    #"GROSS RENT AS A PERCENTAGE OF HOUSEHOLD INCOME (GRAPI)-" Category
    for i in range(136, 143):
        data.loc[data['PROFLN'] == str(i), 'PROFLN'] = '135.5-' + str(i)
        
    # Translate TITLE values using previously defined 'translate_profln' function
    data['TITLE'] = data['PROFLN'].apply(lambda x: translate_profln(x, title_dict))

    return data

In [27]:
AL_housing_df = update_titles_housing(AL_housing_df)

In [28]:
import pandas as pd

# Temporary settings to display more data
with pd.option_context('display.max_rows', None):
    print(AL_housing_df['TITLE'].unique())


['SELECTED HOUSING CHARACTERISTICS' 'nan' 'HOUSING OCCUPANCY'
 'HOUSING OCCUPANCY - Total housing units'
 'HOUSING OCCUPANCY - Occupied housing units'
 'HOUSING OCCUPANCY - Vacant housing units'
 'HOUSING OCCUPANCY - Homeowner vacancy rate'
 'HOUSING OCCUPANCY - Rental vacancy rate' 'UNITS IN STRUCTURE'
 'UNITS IN STRUCTURE - Total housing units'
 'UNITS IN STRUCTURE - 1-unit, detached'
 'UNITS IN STRUCTURE - 1-unit, attached' 'UNITS IN STRUCTURE - 2 units'
 'UNITS IN STRUCTURE - 3 or 4 units' 'UNITS IN STRUCTURE - 5 to 9 units'
 'UNITS IN STRUCTURE - 10 to 19 units'
 'UNITS IN STRUCTURE - 20 or more units'
 'UNITS IN STRUCTURE - Mobile home'
 'UNITS IN STRUCTURE - Boat, RV, van, etc.' 'YEAR STRUCTURE BUILT'
 'YEAR STRUCTURE BUILT - Total housing units'
 'YEAR STRUCTURE BUILT - Built 2020 or later'
 'YEAR STRUCTURE BUILT - Built 2010 to 2019'
 'YEAR STRUCTURE BUILT - Built 2000 to 2009'
 'YEAR STRUCTURE BUILT - Built 1990 to 1999'
 'YEAR STRUCTURE BUILT - Built 1980 to 1989'
 'YEAR STR

In [29]:
# CLEAN UP DATA AND DROP REDUNDANT INFO FROM HOUSING DATASET FUNCTION

def clean_up_data_housing(data):
    # drop unneccessary rows and rows with placeholders
    invalid_rows = data['PRF_ESTIMATE'].isin(['N', '(X)']) | data['PRF_ESTIMATE'].isna()
    data = data[~invalid_rows]

    # Drop rows where 'TITLE' contains 'Not computed'
    data = data[~data['TITLE'].str.contains('Not computed', na=False)]

    # The rows associated with these (profln) contain duplicated data
    profln_to_remove = ['5.5-6','15.5-16', '26.5-27', '38',                        # Duplicated Total Housing Units
                        '44.5-45','44.5-50', '56.5-57', '61.5-62','71.5-72', '75.5-76', #Dup. Occupied HU
                        '79.5-80', '89.5-90', '92.5-93','109.5-110','92.5-102', '109.5-117', #Mortgage/No Mortgage
                        '135.5-136']                                                       #Dup. HU Renting
    
    data = data[~data['PROFLN'].isin(profln_to_remove)]

    # Delete the margin of error columns
    data = data.drop(columns=['PRF_MG_ERROR', 'PCT_MG_ERROR'])

    # Replace '(X)' values in PCT_ESTIMATE with their PRF_ESTIMATE values (usually average and median values)
    data.loc[data['PCT_ESTIMATE'] == '(X)', 'PCT_ESTIMATE'] = data['PRF_ESTIMATE']

    # Remove '+' and ',' from 'PCT_ESTIMATE' and 'PRF_ESTIMATE', then convert to float
    for col in ['PCT_ESTIMATE', 'PRF_ESTIMATE']:
        data[col] = data[col].str.replace(',', '').str.replace('+', '').astype(float)
    
    return data



In [30]:
AL_housing_df = clean_up_data_housing(AL_housing_df)

In [31]:
format_districts(AL_housing_df)

Unnamed: 0,GEOID,District,PROFLN,TITLE,PRF_ESTIMATE,PCT_ESTIMATE,Formatted_District
3,610U800US01001,"State Senate District 1 (2022), Alabama",0.8-1,HOUSING OCCUPANCY - Total housing units,66317.0,66317.0,AL-Sen-01
4,610U800US01001,"State Senate District 1 (2022), Alabama",0.8-2,HOUSING OCCUPANCY - Occupied housing units,57820.0,87.2,AL-Sen-01
5,610U800US01001,"State Senate District 1 (2022), Alabama",0.8-3,HOUSING OCCUPANCY - Vacant housing units,8497.0,12.8,AL-Sen-01
7,610U800US01001,"State Senate District 1 (2022), Alabama",0.8-4,HOUSING OCCUPANCY - Homeowner vacancy rate,1.1,1.1,AL-Sen-01
8,610U800US01001,"State Senate District 1 (2022), Alabama",0.8-5,HOUSING OCCUPANCY - Rental vacancy rate,3.8,3.8,AL-Sen-01
...,...,...,...,...,...,...,...
6503,610U800US01035,"State Senate District 35 (2022), Alabama",135.5-138,GROSS RENT AS A PERCENTAGE OF HOUSEHOLD INCOME (GRAPI) - 15.0 to 19.9 percent,1714.0,13.3,AL-Sen-35
6504,610U800US01035,"State Senate District 35 (2022), Alabama",135.5-139,GROSS RENT AS A PERCENTAGE OF HOUSEHOLD INCOME (GRAPI) - 20.0 to 24.9 percent,1367.0,10.6,AL-Sen-35
6505,610U800US01035,"State Senate District 35 (2022), Alabama",135.5-140,GROSS RENT AS A PERCENTAGE OF HOUSEHOLD INCOME (GRAPI) - 25.0 to 29.9 percent,1433.0,11.1,AL-Sen-35
6506,610U800US01035,"State Senate District 35 (2022), Alabama",135.5-141,GROSS RENT AS A PERCENTAGE OF HOUSEHOLD INCOME (GRAPI) - 30.0 to 34.9 percent,1120.0,8.7,AL-Sen-35


In [32]:
AL_housing_prf, AL_housing_pct = reorient_dfs(AL_housing_df)

In [33]:
AL_housing_pct

Unnamed: 0,Formatted_District,GEOID,BEDROOMS - 1 bedroom,BEDROOMS - 2 bedrooms,BEDROOMS - 3 bedrooms,BEDROOMS - 4 bedrooms,BEDROOMS - 5 or more bedrooms,BEDROOMS - No bedroom,"GROSS RENT - $1,000 to $1,499","GROSS RENT - $1,500 to $1,999",...,YEAR STRUCTURE BUILT - Built 1939 or earlier,YEAR STRUCTURE BUILT - Built 1940 to 1949,YEAR STRUCTURE BUILT - Built 1950 to 1959,YEAR STRUCTURE BUILT - Built 1960 to 1969,YEAR STRUCTURE BUILT - Built 1970 to 1979,YEAR STRUCTURE BUILT - Built 1980 to 1989,YEAR STRUCTURE BUILT - Built 1990 to 1999,YEAR STRUCTURE BUILT - Built 2000 to 2009,YEAR STRUCTURE BUILT - Built 2010 to 2019,YEAR STRUCTURE BUILT - Built 2020 or later
0,AL-Sen-01,610U800US01001,5.4,22.2,50.4,16.6,4.3,1.1,12.4,3.2,...,4.1,3.4,8.5,12.8,14.9,13.7,16.8,15.2,10.4,0.1
1,AL-Sen-02,610U800US01002,8.4,17.0,39.6,26.0,7.1,1.8,42.1,8.5,...,0.8,0.6,3.3,8.1,7.8,17.4,21.0,24.4,16.2,0.2
2,AL-Sen-03,610U800US01003,6.5,20.2,48.5,18.9,4.1,1.7,20.1,3.3,...,3.3,2.4,6.8,12.6,14.6,15.5,16.4,15.1,13.2,0.2
3,AL-Sen-04,610U800US01004,4.4,26.2,50.5,14.4,2.8,1.7,11.0,1.3,...,4.9,3.0,6.2,12.1,18.5,15.0,20.4,11.9,8.0,0.1
4,AL-Sen-05,610U800US01005,4.4,25.4,49.9,16.2,2.4,1.6,9.2,1.0,...,5.8,5.0,7.3,9.0,16.7,14.7,18.2,14.8,8.4,0.1
5,AL-Sen-06,610U800US01006,4.8,22.8,55.2,12.5,3.1,1.6,8.3,0.7,...,5.2,3.8,8.0,14.3,17.1,13.4,16.8,12.6,8.7,0.1
6,AL-Sen-07,610U800US01007,8.3,18.1,42.3,23.8,6.2,1.3,26.3,6.2,...,3.4,3.2,8.3,20.9,14.3,14.8,13.1,12.8,9.1,0.2
7,AL-Sen-08,610U800US01008,4.4,22.1,52.9,16.5,3.5,0.6,12.5,2.1,...,4.7,3.8,5.7,10.0,14.6,15.3,22.2,15.6,7.9,0.3
8,AL-Sen-09,610U800US01009,5.6,23.8,49.1,17.0,3.5,1.0,15.7,4.3,...,3.6,2.8,7.9,14.2,14.9,16.2,18.5,13.3,8.4,0.2
9,AL-Sen-10,610U800US01010,4.4,26.4,51.7,13.1,3.1,1.3,10.4,1.1,...,6.8,6.0,11.5,13.1,15.4,13.3,15.8,12.7,5.2,0.2


In [34]:
## PROCESS ALL 50 STATES HOUSING FUNCTION
def process_housing(file_state_list):
    prf_dfs = []
    pct_dfs = []
    
    for file_state in file_state_list:
        csv, state = file_state

        data = load_and_rename_data(csv)
        data = update_titles_housing(data)
        data = clean_up_data_housing(data)
        data = format_districts(data)

        prf_estimate_df, pct_estimate_df = reorient_dfs(data)

        prf_dfs.append(prf_estimate_df)
        pct_dfs.append(pct_estimate_df)
    
    df_prfhousing_all = pd.concat(prf_dfs, ignore_index=True)
    df_pcthousing_all = pd.concat(pct_dfs, ignore_index=True)
    
    return df_prfhousing_all, df_pcthousing_all


In [35]:
file_state_list = [
 ('data/upper/ALLSLDU_DP04_AK.csv', 'AK'),('data/upper/ALLSLDU_DP04_AL.csv', 'AL'),('data/upper/ALLSLDU_DP04_AR.csv', 'AR'),
 ('data/upper/ALLSLDU_DP04_AZ.csv', 'AZ'),('data/upper/ALLSLDU_DP04_CA.csv', 'CA'),('data/upper/ALLSLDU_DP04_CO.csv', 'CO'),
 ('data/upper/ALLSLDU_DP04_CT.csv', 'CT'),('data/upper/ALLSLDU_DP04_DE.csv', 'DE'),('data/upper/ALLSLDU_DP04_FL.csv', 'FL'),
 ('data/upper/ALLSLDU_DP04_GA.csv', 'GA'),('data/upper/ALLSLDU_DP04_HI.csv', 'HI'),('data/upper/ALLSLDU_DP04_IA.csv', 'IA'),
 ('data/upper/ALLSLDU_DP04_ID.csv', 'ID'),('data/upper/ALLSLDU_DP04_IL.csv', 'IL'),('data/upper/ALLSLDU_DP04_IN.csv', 'IN'),
 ('data/upper/ALLSLDU_DP04_KS.csv', 'KS'),('data/upper/ALLSLDU_DP04_KY.csv', 'KY'),('data/upper/ALLSLDU_DP04_LA.csv', 'LA'),
 ('data/upper/ALLSLDU_DP04_MA.csv', 'MA'),('data/upper/ALLSLDU_DP04_MD.csv', 'MD'),('data/upper/ALLSLDU_DP04_ME.csv', 'ME'),
 ('data/upper/ALLSLDU_DP04_MI.csv', 'MI'),('data/upper/ALLSLDU_DP04_MN.csv', 'MN'),('data/upper/ALLSLDU_DP04_MO.csv', 'MO'),
 ('data/upper/ALLSLDU_DP04_MS.csv', 'MS'),('data/upper/ALLSLDU_DP04_MT.csv', 'MT'),('data/upper/ALLSLDU_DP04_NC.csv', 'NC'),
 ('data/upper/ALLSLDU_DP04_ND.csv', 'ND'),('data/upper/ALLSLDU_DP04_NE.csv', 'NE'),('data/upper/ALLSLDU_DP04_NH.csv', 'NH'),
 ('data/upper/ALLSLDU_DP04_NJ.csv', 'NJ'),('data/upper/ALLSLDU_DP04_NM.csv', 'NM'),('data/upper/ALLSLDU_DP04_NV.csv', 'NV'),
 ('data/upper/ALLSLDU_DP04_NY.csv', 'NY'),('data/upper/ALLSLDU_DP04_OH.csv', 'OH'),('data/upper/ALLSLDU_DP04_OK.csv', 'OK'),
 ('data/upper/ALLSLDU_DP04_OR.csv', 'OR'),('data/upper/ALLSLDU_DP04_PA.csv', 'PA'),('data/upper/ALLSLDU_DP04_RI.csv', 'RI'),
 ('data/upper/ALLSLDU_DP04_SC.csv', 'SC'),('data/upper/ALLSLDU_DP04_SD.csv', 'SD'),('data/upper/ALLSLDU_DP04_TN.csv', 'TN'),
 ('data/upper/ALLSLDU_DP04_TX.csv', 'TX'),('data/upper/ALLSLDU_DP04_UT.csv', 'UT'),('data/upper/ALLSLDU_DP04_VA.csv', 'VA'),
 ('data/upper/ALLSLDU_DP04_VT.csv', 'VT'),('data/upper/ALLSLDU_DP04_WA.csv', 'WA'),('data/upper/ALLSLDU_DP04_WI.csv', 'WI'),
 ('data/upper/ALLSLDU_DP04_WV.csv', 'WV'),('data/upper/ALLSLDU_DP04_WY.csv', 'WY')
]


df_prfhousing_all, df_pcthousing_all = process_housing(file_state_list)


In [36]:
df_prfhousing_all.columns

Index(['Formatted_District', 'GEOID', 'BEDROOMS - 1 bedroom',
       'BEDROOMS - 2 bedrooms', 'BEDROOMS - 3 bedrooms',
       'BEDROOMS - 4 bedrooms', 'BEDROOMS - 5 or more bedrooms',
       'BEDROOMS - No bedroom', 'GROSS RENT - $1,000 to $1,499',
       'GROSS RENT - $1,500 to $1,999',
       ...
       'YEAR STRUCTURE BUILT - Built 1939 or earlier',
       'YEAR STRUCTURE BUILT - Built 1940 to 1949',
       'YEAR STRUCTURE BUILT - Built 1950 to 1959',
       'YEAR STRUCTURE BUILT - Built 1960 to 1969',
       'YEAR STRUCTURE BUILT - Built 1970 to 1979',
       'YEAR STRUCTURE BUILT - Built 1980 to 1989',
       'YEAR STRUCTURE BUILT - Built 1990 to 1999',
       'YEAR STRUCTURE BUILT - Built 2000 to 2009',
       'YEAR STRUCTURE BUILT - Built 2010 to 2019',
       'YEAR STRUCTURE BUILT - Built 2020 or later'],
      dtype='object', length=125)

In [37]:
df_prfhousing_all

Unnamed: 0,Formatted_District,GEOID,BEDROOMS - 1 bedroom,BEDROOMS - 2 bedrooms,BEDROOMS - 3 bedrooms,BEDROOMS - 4 bedrooms,BEDROOMS - 5 or more bedrooms,BEDROOMS - No bedroom,"GROSS RENT - $1,000 to $1,499","GROSS RENT - $1,500 to $1,999",...,YEAR STRUCTURE BUILT - Built 1939 or earlier,YEAR STRUCTURE BUILT - Built 1940 to 1949,YEAR STRUCTURE BUILT - Built 1950 to 1959,YEAR STRUCTURE BUILT - Built 1960 to 1969,YEAR STRUCTURE BUILT - Built 1970 to 1979,YEAR STRUCTURE BUILT - Built 1980 to 1989,YEAR STRUCTURE BUILT - Built 1990 to 1999,YEAR STRUCTURE BUILT - Built 2000 to 2009,YEAR STRUCTURE BUILT - Built 2010 to 2019,YEAR STRUCTURE BUILT - Built 2020 or later
0,"State Senate District A (2022), Alaska",610U800US0200A,2920.0,5192.0,6222.0,2319.0,685.0,1250.0,1455.0,685.0,...,1723.0,770.0,1276.0,1523.0,4250.0,3876.0,2600.0,1677.0,884.0,9.0
1,"State Senate District B (2022), Alaska",610U800US0200B,2661.0,5275.0,5600.0,1944.0,338.0,766.0,1812.0,1005.0,...,1115.0,403.0,679.0,1298.0,3945.0,4070.0,2197.0,1794.0,1073.0,10.0
2,"State Senate District C (2022), Alaska",610U800US0200C,3517.0,5179.0,6358.0,2486.0,739.0,1762.0,1094.0,645.0,...,261.0,379.0,900.0,1192.0,3215.0,5299.0,3781.0,3722.0,1278.0,14.0
3,"State Senate District D (2022), Alaska",610U800US0200D,2386.0,4760.0,7103.0,2743.0,721.0,1335.0,1051.0,454.0,...,75.0,78.0,559.0,1357.0,4112.0,4473.0,2710.0,4172.0,1459.0,53.0
4,"State Senate District E (2022), Alaska",610U800US0200E,704.0,2511.0,5674.0,3781.0,809.0,589.0,711.0,805.0,...,50.0,55.0,463.0,534.0,4187.0,3785.0,1850.0,2560.0,584.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1937,WY-Sen-27,610U800US56027,702.0,2506.0,2357.0,1870.0,1143.0,100.0,973.0,143.0,...,410.0,216.0,1450.0,648.0,1883.0,996.0,554.0,880.0,1587.0,54.0
1938,WY-Sen-28,610U800US56028,1309.0,2495.0,2685.0,1886.0,607.0,149.0,647.0,129.0,...,2201.0,449.0,1259.0,715.0,1302.0,1192.0,436.0,793.0,784.0,0.0
1939,WY-Sen-29,610U800US56029,625.0,1851.0,3808.0,1660.0,868.0,153.0,526.0,149.0,...,519.0,454.0,2648.0,1086.0,1686.0,945.0,366.0,744.0,517.0,0.0
1940,WY-Sen-30,610U800US56030,549.0,1461.0,3784.0,1700.0,560.0,64.0,416.0,146.0,...,556.0,416.0,369.0,414.0,1724.0,1293.0,607.0,1551.0,1118.0,70.0


# SOCIAL CHARACTERISTICS

In [38]:
## LOAD_AND_RENAME_DATA FUNCTION REMAINS THE SAME
AL_social_df = load_and_rename_data('data/upper/ALLSLDU_DP02_AL.csv')

In [39]:
def update_titles_social(data):
    # Create the title_dict
    title_dict = dict(zip(data['PROFLN'], data['TITLE']))

    ####### DICTIONARIES FOR PROFLN NUMBERS THAT NEED A DIRECT MAPPING#######
    # "HOUSEHOLDS BY TYPE" category
    households_by_type_dict = {
        '2': '1-2', '3': '1-2-3', '4': '1-4', '5': '1-4-5',
        '6': '1-6', '7': '1-6-7', '8': '1-6-8', '9': '1-6-9',
        '10': '1-10', '11': '1-10-11', '12': '1-10-12', '13': '1-10-13'
    }
    # "FERTILITY" category
    fertility_dict = {
        '37': '36.5-37', '38': '36.5-38', '39': '36.5-37-39', 
        '40': '36.5-37-40', '41': '36.5-37-41', '42': '36.5-37-42', 
        '43': '36.5-37-43'
    }
    # "GRANDPARENTS" category
    grandparents_dict = {
        '44': '43.5-44', '45': '43.5-44-45', '45.9': '45-45.9',
        '46': '45-45.9-46', '47': '45-45.9-47', '48': '45-45.9-48',
        '49': '45-45.9-49', '51': '50-51', '52': '50-52'
    }
    # "VETERAN STATUS" category
    veteran_status_dict = {
        '69': '68.5-69', '70': '68.5-70'
    }
    # "PLACE OF BIRTH" category
    place_of_birth_dict = {
        '88': '87.5-88', '89': '87.5-89', '90': '87.5-89-90', 
        '91': '87.5-89-91', '92': '87.5-89-92', '93': '87.5-89-93', 
        '94': '87.5-94'
    }
    # "U.S. CITIZENSHIP STATUS" category
    citizenship_status_dict = {
        '96': '95-96', '97': '95-97'
    }
    # "YEAR OF ENTRY" category
    year_of_entry_dict = {
        '99': '98-99', '100': '98-99-100', '101': '98-99-101',
        '102': '98-102', '103': '98-102-103', '104': '98-102-104'
    }
    # "LANGUAGE SPOKEN AT HOME" category
    language_spoken_dict = {
        '112': '111.5-112', '113': '111.5-113', '114': '111.5-114',
        '115': '111.5-114-115', '116': '111.5-116', '117': '111.5-114-117',
        '118': '111.5-118', '119': '111.5-118-119', '120': '111.5-120',
        '121': '111.5-120-121', '122': '111.5-114-122', '123': '111.5-114-122-123'
    }
    # "COMPUTERS AND INTERNET USE" category
    computers_and_internet_use_dict = {
        '152': '151.5-152', 
        '153': '151.5-152-153', 
        '154': '151.5-152-154'
    }
    

    ###### PROFLN NUMBERS THAT FOLLOW A SIMPLE PATTERN OVER A LONG RANGE######
    # "RELATIONSHIP" category
    for i in range(19, 25):
        data.loc[data['PROFLN'] == str(i), 'PROFLN'] = '18-' + str(i)
    # "MARITAL STATUS" category
    for i in range(26, 31):
        data.loc[data['PROFLN'] == str(i), 'PROFLN'] = '25-' + str(i)
    for i in range(32, 37):
        data.loc[data['PROFLN'] == str(i), 'PROFLN'] = '31-' + str(i)
    # "SCHOOL ENROLLMENT" category
    for i in range(54, 59):
        data.loc[data['PROFLN'] == str(i), 'PROFLN'] = '53-' + str(i)
    # "EDUCATIONAL ATTAINMENT" category
    for i in range(59, 69):
        data.loc[data['PROFLN'] == str(i), 'PROFLN'] = '58.5-59' + ('' if i == 59 else '-' + str(i))
    # "DISABILITY STATUS OF THE CIVILIAN NONINSTITUTIONALIZED POPULATION" category
    for i in range(72, 79):
        data.loc[data['PROFLN'] == str(i), 'PROFLN'] = '71-' + str(i)
    # "RESIDENCE 1 YEAR AGO" category
    for i in range(80, 88):
        data.loc[data['PROFLN'] == str(i), 'PROFLN'] = '78.5-' + str(i)
    # "WORLD REGION OF BIRTH OF FOREIGN BORN" category
    for i in range(105, 112):
        data.loc[data['PROFLN'] == str(i), 'PROFLN'] = '104.5-' + str(i)
    # "ANCESTRY" category
    for i in range(125, 152):
        data.loc[data['PROFLN'] == str(i), 'PROFLN'] = '123.5-' + str(i)



    ###### DIRECT MAPPING #####
    for old, new in households_by_type_dict.items():
        data.loc[data['PROFLN'] == old, 'PROFLN'] = new
    for old, new in fertility_dict.items():
        data.loc[data['PROFLN'] == old, 'PROFLN'] = new
    for old, new in grandparents_dict.items():
        data.loc[data['PROFLN'] == old, 'PROFLN'] = new
    for old, new in veteran_status_dict.items():
        data.loc[data['PROFLN'] == old, 'PROFLN'] = new
    for old, new in place_of_birth_dict.items():
        data.loc[data['PROFLN'] == old, 'PROFLN'] = new
    for old, new in citizenship_status_dict.items():
        data.loc[data['PROFLN'] == old, 'PROFLN'] = new
    for old, new in year_of_entry_dict.items():
        data.loc[data['PROFLN'] == old, 'PROFLN'] = new
    for old, new in language_spoken_dict.items():
        data.loc[data['PROFLN'] == old, 'PROFLN'] = new
    for old, new in computers_and_internet_use_dict.items():
        data.loc[data['PROFLN'] == old, 'PROFLN'] = new

    # Translate TITLE values using previously defined 'translate_profln' function
    data['PROFLN'] = data['PROFLN'].astype(str)
    data['TITLE'] = data['PROFLN'].apply(lambda x: translate_profln(x, title_dict))



    return data


In [40]:
AL_social_df = update_titles_social(AL_social_df)

# Unhash here

In [41]:
# CLEAN UP DATA AND DROP REDUNDANT INFO FROM SOCIAL DATASET FUNCTION

def clean_up_data_social(data):
    # data = data.copy()                <---------- unhash
    
    # drop unneccessary rows and rows with placeholders
    invalid_rows = (data['PRF_ESTIMATE'].isin(['N', '(X)']) | 
                data['PCT_ESTIMATE'].isin(['N', '(X)']) |
                data['PRF_ESTIMATE'].isna())
    data = data[~invalid_rows]

    # Replace problematic placeholders in 'PCT_ESTIMATE' with '0' directly
    data['PCT_ESTIMATE'].replace({'-': '0'}, inplace=True)

    # Drop rows where 'TITLE' contains 'Not computed'
    # data = data[~data['TITLE'].str.contains('Not computed', na=False)]  <-------------- unhash

    # The rows associated with these (profln) contain duplicated data
    profln_to_remove = ['50','95','98-102', 
                        '1','152','124']  # Duplicated Grandparents, Foreign born, Total households, total pop               
    
    data = data[~data['PROFLN'].isin(profln_to_remove)]

    # Delete the margin of error columns
    data = data.drop(columns=['PRF_MG_ERROR', 'PCT_MG_ERROR'])

    # Replace '(X)' values in PCT_ESTIMATE with their PRF_ESTIMATE values (usually average and median values)
    data.loc[data['PCT_ESTIMATE'] == '(X)', 'PCT_ESTIMATE'] = data['PRF_ESTIMATE']

    # Remove '+' and ',' from 'PCT_ESTIMATE' and 'PRF_ESTIMATE', then convert to float
    for col in ['PCT_ESTIMATE', 'PRF_ESTIMATE']:
        data[col] = data[col].str.replace(',', '').str.replace('+', '').astype(float)
    
    return data




In [42]:
AL_social_df = clean_up_data_social(AL_social_df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['PCT_ESTIMATE'].replace({'-': '0'}, inplace=True)


In [43]:
format_districts(AL_social_df)

Unnamed: 0,GEOID,District,PROFLN,TITLE,PRF_ESTIMATE,PCT_ESTIMATE,Formatted_District
4,610U800US01001,"State Senate District 1 (2022), Alabama",1-2,Total households - Married-couple household,29923.0,51.8,AL-Sen-01
5,610U800US01001,"State Senate District 1 (2022), Alabama",1-2-3,Total households - Married-couple household - With children of the householder under 18 years,10418.0,18.0,AL-Sen-01
6,610U800US01001,"State Senate District 1 (2022), Alabama",1-4,Total households - Cohabiting couple household,2547.0,4.4,AL-Sen-01
7,610U800US01001,"State Senate District 1 (2022), Alabama",1-4-5,Total households - Cohabiting couple household - With children of the householder under 18 years,801.0,1.4,AL-Sen-01
8,610U800US01001,"State Senate District 1 (2022), Alabama",1-6,"Total households - Male householder, no spouse/partner present",9144.0,15.8,AL-Sen-01
...,...,...,...,...,...,...,...
6993,610U800US01035,"State Senate District 35 (2022), Alabama",123.5-150,ANCESTRY - Welsh,1168.0,0.9,AL-Sen-35
6994,610U800US01035,"State Senate District 35 (2022), Alabama",123.5-151,ANCESTRY - West Indian (excluding Hispanic origin groups),223.0,0.2,AL-Sen-35
6997,610U800US01035,"State Senate District 35 (2022), Alabama",151.5-152,COMPUTERS AND INTERNET USE - Total households,51317.0,51317.0,AL-Sen-35
6998,610U800US01035,"State Senate District 35 (2022), Alabama",151.5-152-153,COMPUTERS AND INTERNET USE - Total households - With a computer,47594.0,92.7,AL-Sen-35


In [44]:
AL_social_prf, AL_social_pct = reorient_dfs(AL_social_df)

In [45]:
AL_social_prf

Unnamed: 0,Formatted_District,GEOID,ANCESTRY - American,ANCESTRY - Arab,ANCESTRY - Czech,ANCESTRY - Danish,ANCESTRY - Dutch,ANCESTRY - English,ANCESTRY - French (except Basque),ANCESTRY - French Canadian,...,Total households - Married-couple household - With children of the householder under 18 years,VETERAN STATUS - Civilian population 18 years and over,VETERAN STATUS - Civilian veterans,WORLD REGION OF BIRTH OF FOREIGN BORN - Africa,WORLD REGION OF BIRTH OF FOREIGN BORN - Asia,WORLD REGION OF BIRTH OF FOREIGN BORN - Europe,"WORLD REGION OF BIRTH OF FOREIGN BORN - Foreign-born population, excluding population born at sea",WORLD REGION OF BIRTH OF FOREIGN BORN - Latin America,WORLD REGION OF BIRTH OF FOREIGN BORN - Northern America,WORLD REGION OF BIRTH OF FOREIGN BORN - Oceania
0,AL-Sen-01,610U800US01001,19088.0,241.0,236.0,168.0,934.0,16953.0,1667.0,200.0,...,10418.0,114890.0,8744.0,92.0,901.0,301.0,3350.0,1895.0,144.0,17.0
1,AL-Sen-02,610U800US01002,14174.0,512.0,114.0,245.0,947.0,16766.0,2317.0,361.0,...,12209.0,115151.0,13666.0,869.0,4513.0,1634.0,10614.0,3242.0,331.0,25.0
2,AL-Sen-03,610U800US01003,24926.0,707.0,172.0,114.0,1100.0,15159.0,1548.0,500.0,...,10595.0,113692.0,10522.0,264.0,1263.0,430.0,7174.0,4986.0,186.0,45.0
3,AL-Sen-04,610U800US01004,34779.0,101.0,11.0,76.0,1161.0,13322.0,1486.0,95.0,...,9649.0,109428.0,7873.0,14.0,385.0,319.0,2651.0,1847.0,66.0,20.0
4,AL-Sen-05,610U800US01005,30242.0,71.0,5.0,30.0,1411.0,13692.0,1376.0,137.0,...,10032.0,105024.0,7290.0,0.0,295.0,220.0,1732.0,1095.0,122.0,0.0
5,AL-Sen-06,610U800US01006,43145.0,63.0,36.0,46.0,1098.0,11653.0,742.0,83.0,...,8459.0,112333.0,7546.0,0.0,553.0,274.0,4947.0,4101.0,19.0,0.0
6,AL-Sen-07,610U800US01007,15784.0,183.0,198.0,133.0,1658.0,17192.0,2387.0,514.0,...,9167.0,116647.0,11774.0,638.0,2181.0,1103.0,6847.0,2713.0,207.0,5.0
7,AL-Sen-08,610U800US01008,22737.0,89.0,74.0,138.0,1479.0,14947.0,1552.0,209.0,...,10621.0,116367.0,9433.0,120.0,474.0,306.0,4679.0,3499.0,240.0,40.0
8,AL-Sen-09,610U800US01009,42656.0,118.0,205.0,50.0,674.0,13425.0,1976.0,306.0,...,10617.0,111301.0,10442.0,389.0,1428.0,762.0,10321.0,7607.0,109.0,26.0
9,AL-Sen-10,610U800US01010,25183.0,189.0,7.0,58.0,789.0,12487.0,1345.0,52.0,...,9137.0,117010.0,8687.0,150.0,697.0,437.0,5792.0,4363.0,113.0,32.0


In [46]:
## PROCESS ALL 50 STATES SOCIAL FUNCTION
def process_social(file_state_list):
    prf_dfs = []
    pct_dfs = []
    
    for file_state in file_state_list:
        csv, state = file_state

        data = load_and_rename_data(csv)
        data = update_titles_social(data)
        data = clean_up_data_social(data)
        data = format_districts(data)

        prf_estimate_df, pct_estimate_df = reorient_dfs(data)

        prf_dfs.append(prf_estimate_df)
        pct_dfs.append(pct_estimate_df)
    
    df_prfsocial_all = pd.concat(prf_dfs, ignore_index=True)
    df_pctsocial_all = pd.concat(pct_dfs, ignore_index=True)
    
    return df_prfsocial_all, df_pctsocial_all


In [47]:
import pandas as pd

def process_social(file_state_list):
    prf_dfs = []
    pct_dfs = []
    error_files = []

    for file_state in file_state_list:
        csv, state = file_state
        try:
            data = load_and_rename_data(csv)
            data = update_titles_social(data)
            data = clean_up_data_social(data)
            data = format_districts(data)

            prf_estimate_df, pct_estimate_df = reorient_dfs(data)

            prf_dfs.append(prf_estimate_df)
            pct_dfs.append(pct_estimate_df)

        except ValueError as e:
            print(f"Error processing file: {csv}")
            error_files.append((csv, str(e)))
            # Optionally, print or log the rows causing the error
            problematic_rows = data[(data['PCT_ESTIMATE'].str.contains('-', na=False)) |
                                    (data['PRF_ESTIMATE'].str.contains('-', na=False))]
            print(problematic_rows)

    df_prfsocial_all = pd.concat(prf_dfs, ignore_index=True) if prf_dfs else pd.DataFrame()
    df_pctsocial_all = pd.concat(pct_dfs, ignore_index=True) if pct_dfs else pd.DataFrame()

    return df_prfsocial_all, df_pctsocial_all, error_files

# Usage
file_state_list = file_state_list = [
 ('data/upper/ALLSLDU_DP02_AK.csv', 'AK'),('data/upper/ALLSLDU_DP02_AL.csv', 'AL'),('data/upper/ALLSLDU_DP02_AR.csv', 'AR'),
 ('data/upper/ALLSLDU_DP02_AZ.csv', 'AZ'),('data/upper/ALLSLDU_DP02_CA.csv', 'CA'),('data/upper/ALLSLDU_DP02_CO.csv', 'CO'),
 ('data/upper/ALLSLDU_DP02_CT.csv', 'CT'),('data/upper/ALLSLDU_DP02_DE.csv', 'DE'),('data/upper/ALLSLDU_DP02_FL.csv', 'FL'),
 ('data/upper/ALLSLDU_DP02_GA.csv', 'GA'),('data/upper/ALLSLDU_DP02_HI.csv', 'HI'),('data/upper/ALLSLDU_DP02_IA.csv', 'IA'),
 ('data/upper/ALLSLDU_DP02_ID.csv', 'ID'),('data/upper/ALLSLDU_DP02_IL.csv', 'IL'),('data/upper/ALLSLDU_DP02_IN.csv', 'IN'),
 ('data/upper/ALLSLDU_DP02_KS.csv', 'KS'),('data/upper/ALLSLDU_DP02_KY.csv', 'KY'),('data/upper/ALLSLDU_DP02_LA.csv', 'LA'),
 ('data/upper/ALLSLDU_DP02_MA.csv', 'MA'),('data/upper/ALLSLDU_DP02_MD.csv', 'MD'),('data/upper/ALLSLDU_DP02_ME.csv', 'ME'),
 ('data/upper/ALLSLDU_DP02_MI.csv', 'MI'),('data/upper/ALLSLDU_DP02_MN.csv', 'MN'),('data/upper/ALLSLDU_DP02_MO.csv', 'MO'),
 ('data/upper/ALLSLDU_DP02_MS.csv', 'MS'),('data/upper/ALLSLDU_DP02_MT.csv', 'MT'),('data/upper/ALLSLDU_DP02_NC.csv', 'NC'),
 ('data/upper/ALLSLDU_DP02_ND.csv', 'ND'),('data/upper/ALLSLDU_DP02_NE.csv', 'NE'),('data/upper/ALLSLDU_DP02_NH.csv', 'NH'),
 ('data/upper/ALLSLDU_DP02_NJ.csv', 'NJ'),('data/upper/ALLSLDU_DP02_NM.csv', 'NM'),('data/upper/ALLSLDU_DP02_NV.csv', 'NV'),
 ('data/upper/ALLSLDU_DP02_NY.csv', 'NY'),('data/upper/ALLSLDU_DP02_OH.csv', 'OH'),('data/upper/ALLSLDU_DP02_OK.csv', 'OK'),
 ('data/upper/ALLSLDU_DP02_OR.csv', 'OR'),('data/upper/ALLSLDU_DP02_PA.csv', 'PA'),('data/upper/ALLSLDU_DP02_RI.csv', 'RI'),
 ('data/upper/ALLSLDU_DP02_SC.csv', 'SC'),('data/upper/ALLSLDU_DP02_SD.csv', 'SD'),('data/upper/ALLSLDU_DP02_TN.csv', 'TN'),
 ('data/upper/ALLSLDU_DP02_TX.csv', 'TX'),('data/upper/ALLSLDU_DP02_UT.csv', 'UT'),('data/upper/ALLSLDU_DP02_VA.csv', 'VA'),
 ('data/upper/ALLSLDU_DP02_VT.csv', 'VT'),('data/upper/ALLSLDU_DP02_WA.csv', 'WA'),('data/upper/ALLSLDU_DP02_WI.csv', 'WI'),
 ('data/upper/ALLSLDU_DP02_WV.csv', 'WV'),('data/upper/ALLSLDU_DP02_WY.csv', 'WY')
]
df_prfsocial_all, df_pctsocial_all, error_files = process_social(file_state_list)

if error_files:
    print("Files with errors:", error_files)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['PCT_ESTIMATE'].replace({'-': '0'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['PCT_ESTIMATE'].replace({'-': '0'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['PCT_ESTIMATE'].replace({'-': '0'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dat

In [48]:
error_files

[]

In [189]:
file_state_list = [
 ('data/upper/ALLSLDU_DP02_AK.csv', 'AK'),('data/upper/ALLSLDU_DP02_AL.csv', 'AL'),('data/upper/ALLSLDU_DP02_AR.csv', 'AR'),
 ('data/upper/ALLSLDU_DP02_AZ.csv', 'AZ'),('data/upper/ALLSLDU_DP02_CA.csv', 'CA'),('data/upper/ALLSLDU_DP02_CO.csv', 'CO'),
 ('data/upper/ALLSLDU_DP02_CT.csv', 'CT'),('data/upper/ALLSLDU_DP02_DE.csv', 'DE'),('data/upper/ALLSLDU_DP02_FL.csv', 'FL'),
 ('data/upper/ALLSLDU_DP02_GA.csv', 'GA'),('data/upper/ALLSLDU_DP02_HI.csv', 'HI'),('data/upper/ALLSLDU_DP02_IA.csv', 'IA'),
 ('data/upper/ALLSLDU_DP02_ID.csv', 'ID'),('data/upper/ALLSLDU_DP02_IL.csv', 'IL'),('data/upper/ALLSLDU_DP02_IN.csv', 'IN'),
 ('data/upper/ALLSLDU_DP02_KS.csv', 'KS'),('data/upper/ALLSLDU_DP02_KY.csv', 'KY'),('data/upper/ALLSLDU_DP02_LA.csv', 'LA'),
 ('data/upper/ALLSLDU_DP02_MA.csv', 'MA'),('data/upper/ALLSLDU_DP02_MD.csv', 'MD'),('data/upper/ALLSLDU_DP02_ME.csv', 'ME'),
 ('data/upper/ALLSLDU_DP02_MI.csv', 'MI'),('data/upper/ALLSLDU_DP02_MN.csv', 'MN'),('data/upper/ALLSLDU_DP02_MO.csv', 'MO'),
 ('data/upper/ALLSLDU_DP02_MS.csv', 'MS'),('data/upper/ALLSLDU_DP02_MT.csv', 'MT'),('data/upper/ALLSLDU_DP02_NC.csv', 'NC'),
 ('data/upper/ALLSLDU_DP02_ND.csv', 'ND'),('data/upper/ALLSLDU_DP02_NE.csv', 'NE'),('data/upper/ALLSLDU_DP02_NH.csv', 'NH'),
 ('data/upper/ALLSLDU_DP02_NJ.csv', 'NJ'),('data/upper/ALLSLDU_DP02_NM.csv', 'NM'),('data/upper/ALLSLDU_DP02_NV.csv', 'NV'),
 ('data/upper/ALLSLDU_DP02_NY.csv', 'NY'),('data/upper/ALLSLDU_DP02_OH.csv', 'OH'),('data/upper/ALLSLDU_DP02_OK.csv', 'OK'),
 ('data/upper/ALLSLDU_DP02_OR.csv', 'OR'),('data/upper/ALLSLDU_DP02_PA.csv', 'PA'),('data/upper/ALLSLDU_DP02_RI.csv', 'RI'),
 ('data/upper/ALLSLDU_DP02_SC.csv', 'SC'),('data/upper/ALLSLDU_DP02_SD.csv', 'SD'),('data/upper/ALLSLDU_DP02_TN.csv', 'TN'),
 ('data/upper/ALLSLDU_DP02_TX.csv', 'TX'),('data/upper/ALLSLDU_DP02_UT.csv', 'UT'),('data/upper/ALLSLDU_DP02_VA.csv', 'VA'),
 ('data/upper/ALLSLDU_DP02_VT.csv', 'VT'),('data/upper/ALLSLDU_DP02_WA.csv', 'WA'),('data/upper/ALLSLDU_DP02_WI.csv', 'WI'),
 ('data/upper/ALLSLDU_DP02_WV.csv', 'WV'),('data/upper/ALLSLDU_DP02_WY.csv', 'WY')
]


df_prfsocial_all, df_pctsocial_all = process_social(file_state_list)

In [190]:
df_prfsocial_all

Unnamed: 0,Formatted_District,GEOID,ANCESTRY - American,ANCESTRY - Arab,ANCESTRY - Czech,ANCESTRY - Danish,ANCESTRY - Dutch,ANCESTRY - English,ANCESTRY - French (except Basque),ANCESTRY - French Canadian,...,Total households - Married-couple household - With children of the householder under 18 years,VETERAN STATUS - Civilian population 18 years and over,VETERAN STATUS - Civilian veterans,WORLD REGION OF BIRTH OF FOREIGN BORN - Africa,WORLD REGION OF BIRTH OF FOREIGN BORN - Asia,WORLD REGION OF BIRTH OF FOREIGN BORN - Europe,"WORLD REGION OF BIRTH OF FOREIGN BORN - Foreign-born population, excluding population born at sea",WORLD REGION OF BIRTH OF FOREIGN BORN - Latin America,WORLD REGION OF BIRTH OF FOREIGN BORN - Northern America,WORLD REGION OF BIRTH OF FOREIGN BORN - Oceania
0,"State Senate District A (2022), Alaska",610U800US0200A,812.0,73.0,128.0,188.0,451.0,3068.0,825.0,220.0,...,2280.0,28035.0,2791.0,50.0,2046.0,237.0,2958.0,380.0,201.0,44.0
1,"State Senate District B (2022), Alaska",610U800US0200B,1419.0,52.0,90.0,299.0,491.0,3697.0,985.0,212.0,...,2641.0,28310.0,2163.0,61.0,1762.0,250.0,3125.0,569.0,208.0,275.0
2,"State Senate District C (2022), Alaska",610U800US0200C,941.0,54.0,214.0,305.0,652.0,3592.0,954.0,501.0,...,2694.0,27593.0,3072.0,69.0,2719.0,274.0,3706.0,416.0,150.0,78.0
3,"State Senate District D (2022), Alaska",610U800US0200D,1469.0,71.0,244.0,523.0,867.0,4852.0,764.0,310.0,...,2629.0,28249.0,2958.0,13.0,506.0,212.0,1075.0,246.0,70.0,28.0
4,"State Senate District E (2022), Alaska",610U800US0200E,2055.0,69.0,229.0,473.0,725.0,4125.0,949.0,360.0,...,2752.0,27101.0,2487.0,49.0,1430.0,440.0,3280.0,881.0,167.0,313.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1937,WY-Sen-27,610U800US56027,549.0,0.0,65.0,340.0,220.0,2337.0,366.0,94.0,...,1229.0,13741.0,1335.0,0.0,140.0,57.0,376.0,119.0,60.0,0.0
1938,WY-Sen-28,610U800US56028,729.0,55.0,96.0,150.0,108.0,1776.0,939.0,100.0,...,1334.0,14347.0,1388.0,0.0,82.0,108.0,246.0,13.0,38.0,5.0
1939,WY-Sen-29,610U800US56029,791.0,2.0,352.0,91.0,239.0,1921.0,215.0,143.0,...,1355.0,14643.0,1366.0,0.0,65.0,59.0,399.0,254.0,14.0,7.0
1940,WY-Sen-30,610U800US56030,982.0,0.0,62.0,122.0,382.0,2208.0,470.0,85.0,...,1336.0,14151.0,1155.0,7.0,14.0,78.0,416.0,257.0,60.0,0.0


In [191]:
# Assuming 'data' is your DataFrame
nan_counts = data.isna().sum()
print(nan_counts)


GEOID                 0
District              0
PROFLN                0
TITLE                 0
PRF_ESTIMATE          0
PCT_ESTIMATE          0
Formatted_District    0
dtype: int64


# Economic Characteristics

In [193]:
AL_econ_df = load_and_rename_data('data/upper/ALLSLDU_DP03_AL.csv')

In [194]:
def update_titles_econ(data):
   
    # Shorten two long titles--
    #INCOME AND BENEFITS (IN 2021 INFLATION-ADJUSTED DOLLARS)
    data.loc[data['PROFLN'] == '50.5', 'TITLE'] = 'INCOME AND BENEFITS' 
    #PERCENTAGE OF FAMILIES AND PEOPLE WHOSE INCOME IN THE PAST 12 MONTHS IS BELOW THE POVERTY LEVEL
    data.loc[data['PROFLN'] == '118.5', 'TITLE'] = 'INCOME BELOW POVERTY LEVEL' 
   
    # now create the title_dict
    title_dict = dict(zip(data['PROFLN'], data['TITLE']))

    ####### DICTIONARIES FOR PROFLN NUMBERS THAT NEED A DIRECT MAPPING#######
    
    # Dictionary "MISC FEMALE EMPLOYMENT"
    misc_female_employment_dict = {
        '15': '0.8-1-14-15',
        '16': '0.8-10-16',
        '17': '0.8-10-16-17'
    }
    # Dictionary "MISC INCOME AND BENEFITS" category
    misc_income_and_benefits_dict = {
        '65': '50.5-51-64-65',
        '66': '50.5-51-66',
        '67': '50.5-51-66-67',
        '68': '50.5-51-68',
        '69': '50.5-51-68-69',
        '70': '50.5-51-70',
        '71': '50.5-51-70-71',
        '72': '50.5-51-72',
        '73': '50.5-51-72-73',
        '74': '50.5-51-74',
    }
    
    # Dictionary HEALTH INSURANCE COVERAGE category
    health_insurance_coverage_dict = {
        '95': '94.5-95',
        '96': '94.5-95-96',
        '97': '94.5-95-97',
        '98': '94.5-95-98',
        '99': '94.5-95-99',
        '100': '94.5-100',
        '101': '94.5-100-101',
        '103': '102-103',
        '104': '94.5-104',
        '105': '94.5-104-105',
        '106': '94.5-104-106',
        '107': '94.5-104-107',
        '108': '94.5-104-108',
        '109': '94.5-109',
        '110': '94.5-109-110',
        '111': '94.5-109-111',
        '112': '94.5-109-112',
        '113': '94.5-109-113',
        '114': '94.5-114',
        '115': '94.5-114-115',
        '116': '94.5-114-116',
        '117': '94.5-114-117',
        '118': '94.5-114-118'
    }
    # Dictionary FAMILY POVERTY LEVEL
    family_poverty_level_dict = {
        '119': '118.5-119',
        '120': '118.5-119-120',
        '121': '118.5-119-121',
        '122': '118.5-122',
        '123': '118.5-122-123',
        '124': '118.5-122-124',
        '125': '118.5-125',
        '126': '118.5-125-126',
        '127': '118.5-125-127'
    }


    ###### PROFLN NUMBERS THAT FOLLOW A SIMPLE PATTERN OVER A LONG RANGE######
    # EMPLOYMENT STATUS category
    for i in range(1, 10):
        data.loc[data['PROFLN'] == str(i), 'PROFLN'] = '0.8-1' + ('' if i == 1 else '-' + str(i))
    # "FEMALE EMPLOYMENT STATUS" category
    for i in range(10, 15):
        data.loc[data['PROFLN'] == str(i), 'PROFLN'] = '0.8-10' + ('' if i == 10 else '-' + str(i))
    # "COMMUTING TO WORK" category
    for i in range(18, 26):
        data.loc[data['PROFLN'] == str(i), 'PROFLN'] = '17.5-' + str(i)
    # "OCCUPATION" category
    for i in range(26, 32):
        data.loc[data['PROFLN'] == str(i), 'PROFLN'] = '25.5-' + str(i)
    # "INDUSTRY" category
    for i in range(33, 46):
        data.loc[data['PROFLN'] == str(i), 'PROFLN'] = '31.5-' + str(i)
    #"CLASS OF WORKER" category
    for i in range(47, 51):
        data.loc[data['PROFLN'] == str(i), 'PROFLN'] = '45.5-' + str(i)
    # "INCOME AND BENEFITS" category
    for i in range(52, 65):
        data.loc[data['PROFLN'] == str(i), 'PROFLN'] = '50.5-51-' + str(i)
    # FAMILY INCOME AND BENEFITS category
    for i in range(75, 88):
        data.loc[data['PROFLN'] == str(i), 'PROFLN'] = '50.5-75' + ('' if i == 75 else '-' + str(i))
    # NON-FAMILY INCOME AND BENEFITS category
    for i in range(88, 95):
        data.loc[data['PROFLN'] == str(i), 'PROFLN'] = '50.5-' + str(i)
    # ALL PEOPLE POVERTY LEVEL category
    for i in range (128,138):
        data.loc[data['PROFLN'] == str(i), 'PROFLN'] = '118.5-128'+ ('' if i == 128 else '-' + str(i))
    
    
    ###### DIRECT MAPPING #####
    for old, new in misc_female_employment_dict.items(): #Misc Female Employment
        data.loc[data['PROFLN'] == old, 'PROFLN'] = new
    for old, new in misc_income_and_benefits_dict.items(): #Misc income and benefits
        data.loc[data['PROFLN'] == old, 'PROFLN'] = new
    for old, new in health_insurance_coverage_dict.items(): #Health Insurance Coverage
        data.loc[data['PROFLN'] == old, 'PROFLN'] = new
    for old, new in family_poverty_level_dict.items():     # Family poverty level
        data.loc[data['PROFLN'] == old, 'PROFLN'] = new
    

    # Translate TITLE values using previously defined 'translate_profln' function
    data['PROFLN'] = data['PROFLN'].astype(str)
    data['TITLE'] = data['PROFLN'].apply(lambda x: translate_profln(x, title_dict))

    return data


In [195]:
AL_econ_df = update_titles_econ(AL_econ_df)
AL_econ_df[['PROFLN','TITLE']]

Unnamed: 0,PROFLN,TITLE
0,0,SELECTED ECONOMIC CHARACTERISTICS
1,0.5,
2,0.8,EMPLOYMENT STATUS
3,0.8-1,EMPLOYMENT STATUS - Population 16 years and over
4,0.8-1-2,EMPLOYMENT STATUS - Population 16 years and over - In labor force
...,...,...
5875,118.5-128-133,INCOME BELOW POVERTY LEVEL - All people - 18 years and over
5876,118.5-128-134,INCOME BELOW POVERTY LEVEL - All people - 18 to 64 years
5877,118.5-128-135,INCOME BELOW POVERTY LEVEL - All people - 65 years and over
5878,118.5-128-136,INCOME BELOW POVERTY LEVEL - All people - People in families


In [196]:
def clean_up_data_econ(data):
    
    # Replace '(X)' values in PCT_ESTIMATE with their PRF_ESTIMATE values (and vice versa)
    data.loc[data['PCT_ESTIMATE'] == '(X)', 'PCT_ESTIMATE'] = data['PRF_ESTIMATE']
    data.loc[data['PRF_ESTIMATE'] == '(X)', 'PRF_ESTIMATE'] = data['PCT_ESTIMATE']
    
    
    # Drop unnecessary rows and rows with placeholders
    invalid_rows = (data['PRF_ESTIMATE'].isin(['N', '(X)']) | 
                data['PCT_ESTIMATE'].isin(['N', '(X)']) |
                data['PRF_ESTIMATE'].isna())
    data = data[~invalid_rows]

    # Drop rows where 'TITLE' contains 'Not computed'
    data = data[~data['TITLE'].str.contains('Not computed', na=False)]

    # The rows associated with these (profln) contain duplicated data
    profln_to_remove = ['25.5-26', '32', '46',   # Duplicated 'Employed population 16+'
                        '0.8-1-8']               # Duplicated 'Civilian labor force'
    
    data = data[~data['PROFLN'].isin(profln_to_remove)]

    # Delete the margin of error columns
    data = data.drop(columns=['PRF_MG_ERROR', 'PCT_MG_ERROR'])


    # Convert to number - Remove '+' and ',' from 'PCT_ESTIMATE' and 'PRF_ESTIMATE', then convert to float
    for col in ['PCT_ESTIMATE', 'PRF_ESTIMATE']:
        data[col] = data[col].str.replace(',', '').str.replace('+', '').astype(float)
    
    return data


In [197]:
AL_econ_df = clean_up_data_econ(AL_econ_df)

In [198]:
format_districts(AL_econ_df)

Unnamed: 0,GEOID,District,PROFLN,TITLE,PRF_ESTIMATE,PCT_ESTIMATE,Formatted_District
3,610U800US01001,"State Senate District 1 (2022), Alabama",0.8-1,EMPLOYMENT STATUS - Population 16 years and over,118470.0,118470.0,AL-Sen-01
4,610U800US01001,"State Senate District 1 (2022), Alabama",0.8-1-2,EMPLOYMENT STATUS - Population 16 years and over - In labor force,69215.0,58.4,AL-Sen-01
5,610U800US01001,"State Senate District 1 (2022), Alabama",0.8-1-3,EMPLOYMENT STATUS - Population 16 years and over - Civilian labor force,69103.0,58.3,AL-Sen-01
6,610U800US01001,"State Senate District 1 (2022), Alabama",0.8-1-4,EMPLOYMENT STATUS - Population 16 years and over - Employed,66408.0,56.1,AL-Sen-01
7,610U800US01001,"State Senate District 1 (2022), Alabama",0.8-1-5,EMPLOYMENT STATUS - Population 16 years and over - Unemployed,2695.0,2.3,AL-Sen-01
...,...,...,...,...,...,...,...
5875,610U800US01035,"State Senate District 35 (2022), Alabama",118.5-128-133,INCOME BELOW POVERTY LEVEL - All people - 18 years and over,11.7,11.7,AL-Sen-35
5876,610U800US01035,"State Senate District 35 (2022), Alabama",118.5-128-134,INCOME BELOW POVERTY LEVEL - All people - 18 to 64 years,12.9,12.9,AL-Sen-35
5877,610U800US01035,"State Senate District 35 (2022), Alabama",118.5-128-135,INCOME BELOW POVERTY LEVEL - All people - 65 years and over,7.6,7.6,AL-Sen-35
5878,610U800US01035,"State Senate District 35 (2022), Alabama",118.5-128-136,INCOME BELOW POVERTY LEVEL - All people - People in families,11.4,11.4,AL-Sen-35


In [199]:
AL_econ_prf, AL_econ_pct = reorient_dfs(AL_econ_df)
AL_econ_prf

Unnamed: 0,Formatted_District,GEOID,CLASS OF WORKER - Government workers,CLASS OF WORKER - Private wage and salary workers,CLASS OF WORKER - Self-employed in own not incorporated business workers,CLASS OF WORKER - Unpaid family workers,"COMMUTING TO WORK - Car, truck, or van -- carpooled","COMMUTING TO WORK - Car, truck, or van -- drove alone",COMMUTING TO WORK - Mean travel time to work (minutes),COMMUTING TO WORK - Other means,...,INDUSTRY - Public administration,INDUSTRY - Retail trade,"INDUSTRY - Transportation and warehousing, and utilities",INDUSTRY - Wholesale trade,"OCCUPATION - Management, business, science, and arts occupations","OCCUPATION - Natural resources, construction, and maintenance occupations","OCCUPATION - Production, transportation, and material moving occupations",OCCUPATION - Sales and office occupations,OCCUPATION - Service occupations,Total households
0,AL-Sen-01,610U800US01001,11258.0,50980.0,4036.0,134.0,5495.0,55729.0,26.7,714.0,...,3706.0,9727.0,3478.0,1536.0,21398.0,7544.0,11515.0,15205.0,10746.0,57820.0
1,AL-Sen-02,610U800US01002,13049.0,59582.0,2729.0,116.0,3470.0,61986.0,21.3,859.0,...,7091.0,8357.0,3186.0,877.0,39920.0,3870.0,7880.0,14133.0,9673.0,58433.0
2,AL-Sen-03,610U800US01003,10159.0,53954.0,3232.0,104.0,4089.0,58277.0,22.7,410.0,...,3851.0,7106.0,2944.0,1509.0,25889.0,6262.0,12844.0,12822.0,9632.0,57828.0
3,AL-Sen-04,610U800US01004,7728.0,47676.0,4330.0,87.0,7043.0,49190.0,26.8,203.0,...,2285.0,7948.0,4197.0,1297.0,16331.0,8170.0,13452.0,12747.0,9121.0,53289.0
4,AL-Sen-05,610U800US01005,8854.0,43252.0,2862.0,159.0,5081.0,46320.0,29.1,464.0,...,2604.0,6543.0,3271.0,1563.0,16555.0,6549.0,11418.0,11756.0,8849.0,50505.0
5,AL-Sen-06,610U800US01006,7994.0,49517.0,3895.0,71.0,5456.0,52705.0,25.4,398.0,...,2237.0,7009.0,3395.0,1593.0,17975.0,7197.0,15290.0,11415.0,9600.0,55286.0
6,AL-Sen-07,610U800US01007,11644.0,52412.0,4150.0,75.0,4339.0,55640.0,21.1,715.0,...,5031.0,7883.0,2548.0,1222.0,32395.0,4274.0,8451.0,12841.0,10320.0,62193.0
7,AL-Sen-08,610U800US01008,9512.0,50282.0,5052.0,50.0,5208.0,53766.0,26.2,823.0,...,3450.0,6624.0,3288.0,1120.0,23132.0,6717.0,13392.0,12538.0,9117.0,56197.0
8,AL-Sen-09,610U800US01009,8926.0,50153.0,3896.0,161.0,5853.0,52312.0,24.8,317.0,...,3495.0,6951.0,3272.0,1711.0,23424.0,6446.0,11800.0,11655.0,9811.0,56232.0
9,AL-Sen-10,610U800US01010,9026.0,49037.0,4332.0,52.0,5087.0,53278.0,26.0,384.0,...,2783.0,7978.0,3365.0,1102.0,17917.0,7400.0,14325.0,11891.0,10914.0,55298.0


In [200]:
## PROCESS ALL 50 STATES HOUSING FUNCTION
def process_econ(file_state_list):
    prf_dfs = []
    pct_dfs = []
    
    for file_state in file_state_list:
        csv, state = file_state

        data = load_and_rename_data(csv)
        data = update_titles_econ(data)
        data = clean_up_data_econ(data)
        data = format_districts(data)

        prf_estimate_df, pct_estimate_df = reorient_dfs(data)

        prf_dfs.append(prf_estimate_df)
        pct_dfs.append(pct_estimate_df)
    
    df_prfecon_all = pd.concat(prf_dfs, ignore_index=True)
    df_pctecon_all = pd.concat(pct_dfs, ignore_index=True)
    
    return df_prfecon_all, df_pctecon_all


In [201]:
file_state_list = [
 ('data/upper/ALLSLDU_DP03_AK.csv', 'AK'),('data/upper/ALLSLDU_DP03_AL.csv', 'AL'),('data/upper/ALLSLDU_DP03_AR.csv', 'AR'),
 ('data/upper/ALLSLDU_DP03_AZ.csv', 'AZ'),('data/upper/ALLSLDU_DP03_CA.csv', 'CA'),('data/upper/ALLSLDU_DP03_CO.csv', 'CO'),
 ('data/upper/ALLSLDU_DP03_CT.csv', 'CT'),('data/upper/ALLSLDU_DP03_DE.csv', 'DE'),('data/upper/ALLSLDU_DP03_FL.csv', 'FL'),
 ('data/upper/ALLSLDU_DP03_GA.csv', 'GA'),('data/upper/ALLSLDU_DP03_HI.csv', 'HI'),('data/upper/ALLSLDU_DP03_IA.csv', 'IA'),
 ('data/upper/ALLSLDU_DP03_ID.csv', 'ID'),('data/upper/ALLSLDU_DP03_IL.csv', 'IL'),('data/upper/ALLSLDU_DP03_IN.csv', 'IN'),
 ('data/upper/ALLSLDU_DP03_KS.csv', 'KS'),('data/upper/ALLSLDU_DP03_KY.csv', 'KY'),('data/upper/ALLSLDU_DP03_LA.csv', 'LA'),
 ('data/upper/ALLSLDU_DP03_MA.csv', 'MA'),('data/upper/ALLSLDU_DP03_MD.csv', 'MD'),('data/upper/ALLSLDU_DP03_ME.csv', 'ME'),
 ('data/upper/ALLSLDU_DP03_MI.csv', 'MI'),('data/upper/ALLSLDU_DP03_MN.csv', 'MN'),('data/upper/ALLSLDU_DP03_MO.csv', 'MO'),
 ('data/upper/ALLSLDU_DP03_MS.csv', 'MS'),('data/upper/ALLSLDU_DP03_MT.csv', 'MT'),('data/upper/ALLSLDU_DP03_NC.csv', 'NC'),
 ('data/upper/ALLSLDU_DP03_ND.csv', 'ND'),('data/upper/ALLSLDU_DP03_NE.csv', 'NE'),('data/upper/ALLSLDU_DP03_NH.csv', 'NH'),
 ('data/upper/ALLSLDU_DP03_NJ.csv', 'NJ'),('data/upper/ALLSLDU_DP03_NM.csv', 'NM'),('data/upper/ALLSLDU_DP03_NV.csv', 'NV'),
 ('data/upper/ALLSLDU_DP03_NY.csv', 'NY'),('data/upper/ALLSLDU_DP03_OH.csv', 'OH'),('data/upper/ALLSLDU_DP03_OK.csv', 'OK'),
 ('data/upper/ALLSLDU_DP03_OR.csv', 'OR'),('data/upper/ALLSLDU_DP03_PA.csv', 'PA'),('data/upper/ALLSLDU_DP03_RI.csv', 'RI'),
 ('data/upper/ALLSLDU_DP03_SC.csv', 'SC'),('data/upper/ALLSLDU_DP03_SD.csv', 'SD'),('data/upper/ALLSLDU_DP03_TN.csv', 'TN'),
 ('data/upper/ALLSLDU_DP03_TX.csv', 'TX'),('data/upper/ALLSLDU_DP03_UT.csv', 'UT'),('data/upper/ALLSLDU_DP03_VA.csv', 'VA'),
 ('data/upper/ALLSLDU_DP03_VT.csv', 'VT'),('data/upper/ALLSLDU_DP03_WA.csv', 'WA'),('data/upper/ALLSLDU_DP03_WI.csv', 'WI'),
 ('data/upper/ALLSLDU_DP03_WV.csv', 'WV'),('data/upper/ALLSLDU_DP03_WY.csv', 'WY')
]


df_prfecon_all, df_pctecon_all = process_econ(file_state_list)


ValueError: could not convert string to float: '-'

In [None]:
df_pctecon_all

In [202]:
import pandas as pd

def process_econ(file_state_list):
    prf_dfs = []
    pct_dfs = []
    error_files = []

    for file_state in file_state_list:
        csv, state = file_state
        try:
            data = load_and_rename_data(csv)
            data = update_titles_econ(data)
            data = clean_up_data_econ(data)
            data = format_districts(data)

            prf_estimate_df, pct_estimate_df = reorient_dfs(data)

            prf_dfs.append(prf_estimate_df)
            pct_dfs.append(pct_estimate_df)

        except ValueError as e:
            print(f"Error processing file: {csv}")
            error_files.append((csv, str(e)))
            # Optionally, print or log the rows causing the error
            problematic_rows = data[(data['PCT_ESTIMATE'].str.contains('-', na=False)) |
                                    (data['PRF_ESTIMATE'].str.contains('-', na=False))]
            print(problematic_rows)

    df_prfecon_all = pd.concat(prf_dfs, ignore_index=True) if prf_dfs else pd.DataFrame()
    df_pctecon_all = pd.concat(pct_dfs, ignore_index=True) if pct_dfs else pd.DataFrame()

    return df_prfecon_all, df_pctecon_all, error_files

# Usage
file_state_list = [
 ('data/upper/ALLSLDU_DP03_AK.csv', 'AK'),('data/upper/ALLSLDU_DP03_AL.csv', 'AL'),('data/upper/ALLSLDU_DP03_AR.csv', 'AR'),
 ('data/upper/ALLSLDU_DP03_AZ.csv', 'AZ'),('data/upper/ALLSLDU_DP03_CA.csv', 'CA'),('data/upper/ALLSLDU_DP03_CO.csv', 'CO'),
 ('data/upper/ALLSLDU_DP03_CT.csv', 'CT'),('data/upper/ALLSLDU_DP03_DE.csv', 'DE'),('data/upper/ALLSLDU_DP03_FL.csv', 'FL'),
 ('data/upper/ALLSLDU_DP03_GA.csv', 'GA'),('data/upper/ALLSLDU_DP03_HI.csv', 'HI'),('data/upper/ALLSLDU_DP03_IA.csv', 'IA'),
 ('data/upper/ALLSLDU_DP03_ID.csv', 'ID'),('data/upper/ALLSLDU_DP03_IL.csv', 'IL'),('data/upper/ALLSLDU_DP03_IN.csv', 'IN'),
 ('data/upper/ALLSLDU_DP03_KS.csv', 'KS'),('data/upper/ALLSLDU_DP03_KY.csv', 'KY'),('data/upper/ALLSLDU_DP03_LA.csv', 'LA'),
 ('data/upper/ALLSLDU_DP03_MA.csv', 'MA'),('data/upper/ALLSLDU_DP03_MD.csv', 'MD'),('data/upper/ALLSLDU_DP03_ME.csv', 'ME'),
 ('data/upper/ALLSLDU_DP03_MI.csv', 'MI'),('data/upper/ALLSLDU_DP03_MN.csv', 'MN'),('data/upper/ALLSLDU_DP03_MO.csv', 'MO'),
 ('data/upper/ALLSLDU_DP03_MS.csv', 'MS'),('data/upper/ALLSLDU_DP03_MT.csv', 'MT'),('data/upper/ALLSLDU_DP03_NC.csv', 'NC'),
 ('data/upper/ALLSLDU_DP03_ND.csv', 'ND'),('data/upper/ALLSLDU_DP03_NE.csv', 'NE'),('data/upper/ALLSLDU_DP03_NH.csv', 'NH'),
 ('data/upper/ALLSLDU_DP03_NJ.csv', 'NJ'),('data/upper/ALLSLDU_DP03_NM.csv', 'NM'),('data/upper/ALLSLDU_DP03_NV.csv', 'NV'),
 ('data/upper/ALLSLDU_DP03_NY.csv', 'NY'),('data/upper/ALLSLDU_DP03_OH.csv', 'OH'),('data/upper/ALLSLDU_DP03_OK.csv', 'OK'),
 ('data/upper/ALLSLDU_DP03_OR.csv', 'OR'),('data/upper/ALLSLDU_DP03_PA.csv', 'PA'),('data/upper/ALLSLDU_DP03_RI.csv', 'RI'),
 ('data/upper/ALLSLDU_DP03_SC.csv', 'SC'),('data/upper/ALLSLDU_DP03_SD.csv', 'SD'),('data/upper/ALLSLDU_DP03_TN.csv', 'TN'),
 ('data/upper/ALLSLDU_DP03_TX.csv', 'TX'),('data/upper/ALLSLDU_DP03_UT.csv', 'UT'),('data/upper/ALLSLDU_DP03_VA.csv', 'VA'),
 ('data/upper/ALLSLDU_DP03_VT.csv', 'VT'),('data/upper/ALLSLDU_DP03_WA.csv', 'WA'),('data/upper/ALLSLDU_DP03_WI.csv', 'WI'),
 ('data/upper/ALLSLDU_DP03_WV.csv', 'WV'),('data/upper/ALLSLDU_DP03_WY.csv', 'WY')
]

df_prfecon_all, df_pctecon_all, error_files = process_econ(file_state_list)

if error_files:
    print("Files with errors:", error_files)


Error processing file: data/upper/ALLSLDU_DP03_ND.csv
               GEOID                                       District  \
4460  610U800US38027  State Senate District 27 (2022), North Dakota   
6708  610U800US38040  State Senate District 40 (2022), North Dakota   

             PROFLN  \
4460  50.5-51-72-73   
6708  118.5-125-127   

                                                                                                                                               TITLE  \
4460                      INCOME AND BENEFITS - Total households - With cash public assistance income - Mean cash public assistance income (dollars)   
6708  INCOME BELOW POVERTY LEVEL - Families with female householder, no spouse present - With related children of the householder under 5 years only   

     PRF_ESTIMATE PRF_MG_ERROR PCT_ESTIMATE PCT_MG_ERROR  
4460            -           **            -          (X)  
6708            -          (X)            -           **  
Error processing file: data/

def find_placeholders(data, columns):
    for col in columns:
        # Find rows where the column has the placeholder '-'
        placeholder_rows = data[data[col] == '-']
        if not placeholder_rows.empty:
            print(f"Placeholders found in column '{col}':")
            print(placeholder_rows[[col]])  # Display only the problematic column for clarity
        else:
            print(f"No placeholders found in column '{col}'.")

# Specify the columns you want to check for placeholders
columns_to_check = ['PCT_ESTIMATE', 'PRF_ESTIMATE']

# Call the function with your DataFrame and the columns list
find_placeholders(data, columns_to_check)


import pandas as pd
import numpy as np

def clean_up_data_housing(data):
    # Columns to clean
    columns_to_clean = ['PCT_ESTIMATE', 'PRF_ESTIMATE']

    # Replace placeholders with NaN
    for col in columns_to_clean:
        data[col] = pd.to_numeric(data[col], errors='coerce')

    return data

def process_housing(file_state_list):
    prf_dfs = []
    pct_dfs = []

    for csv, state in file_state_list:
        data = load_and_rename_data(csv)
        data = update_titles_housing(data)
        data = clean_up_data_housing(data)
        data = format_districts(data)

        prf_estimate_df, pct_estimate_df = reorient_dfs(data)

        prf_dfs.append(prf_estimate_df)
        pct_dfs.append(pct_estimate_df)

    df_prfhousing_all = pd.concat(prf_dfs, ignore_index=True)
    df_pcthousing_all = pd.concat(pct_dfs, ignore_index=True)

    return df_prfhousing_all, df_pcthousing_all

# Assuming file_state_list is defined and contains paths and state codes for your datasets
df_prfhousing_all, df_pcthousing_all = process_housing(file_state_list)


import pandas as pd

def load_and_rename_data(csv):
    # Loading and renaming data as per your setup
    data = pd.read_csv(csv, dtype={'Line Number': str})
    data.rename(columns={
        'GEONAME': 'District',
        'Line Number': 'PROFLN',
        'Title': 'TITLE',
        'Estimate': 'PRF_ESTIMATE',
        'Percent Estimate': 'PCT_ESTIMATE',
        'Margin of Error': 'PRF_MG_ERROR',
        'Percent Margin of Error': 'PCT_MG_ERROR'
    }, inplace=True)
    return data

def check_for_placeholders(data, file_name):
    # Check columns for placeholders
    problematic_columns = ['PRF_ESTIMATE', 'PCT_ESTIMATE']
    for column in problematic_columns:
        if data[column].astype(str).str.contains('-').any():
            print(f"Placeholder found in {file_name} within column {column}")

def process_files(file_state_list):
    for file_path, state in file_state_list:
        data = load_and_rename_data(file_path)
        check_for_placeholders(data, file_path)  # Check and log if placeholders are found

# Example file_state_list
file_state_list = [
    ('data/upper/ALLSLDU_DP02_AK.csv', 'AK'), 
    ('data/upper/ALLSLDU_DP02_AL.csv', 'AL')
    # Add other files as needed
]

process_files(file_state_list)


import pandas as pd

def load_and_rename_data(csv):
    data = pd.read_csv(csv, dtype={'Line Number': str})
    data.rename(columns={
        'GEONAME': 'District',
        'Line Number': 'PROFLN',
        'Title': 'TITLE',
        'Estimate': 'PRF_ESTIMATE',
        'Percent Estimate': 'PCT_ESTIMATE',
        'Margin of Error': 'PRF_MG_ERROR',
        'Percent Margin of Error': 'PCT_MG_ERROR'
    }, inplace=True)
    return data

def find_non_numeric_entries(data, file_name):
    problematic_columns = ['PRF_ESTIMATE', 'PCT_ESTIMATE']
    issues_found = False
    for column in problematic_columns:
        # Attempt to convert to float and catch failures
        try:
            pd.to_numeric(data[column], errors='raise')
        except Exception as e:
            issues_found = True
            problematic_rows = data[data[column].apply(lambda x: not is_convertible_to_float(x))]
            print(f"Issues in file: {file_name} within column {column}")
            print(problematic_rows[[column]].drop_duplicates())  # Show unique non-convertible entries
    return issues_found

def is_convertible_to_float(x):
    try:
        float(x)
        return True
    except:
        return False

def process_files(file_state_list):
    for file_path, state in file_state_list:
        data = load_and_rename_data(file_path)
        if find_non_numeric_entries(data, file_path):
            print(f"Non-numeric entries found in {file_path}")

# Example usage
file_state_list = [
    ('data/upper/ALLSLDU_DP02_AK.csv', 'AK'),
    ('data/upper/ALLSLDU_DP02_AL.csv', 'AL')
    # add more files as needed
]

process_files(file_state_list)


import pandas as pd
import numpy as np

def clean_up_data_social(data):
    # Drop rows with placeholders or missing 'PRF_ESTIMATE'
    invalid_rows = data['PRF_ESTIMATE'].isin(['N', '(X)', '-', '']) | data['PRF_ESTIMATE'].isna()
    data = data[~invalid_rows]

    # Drop rows where 'TITLE' contains 'Not computed'
    data = data[~data['TITLE'].str.contains('Not computed', na=False)]

    # Replace problematic placeholders in 'PCT_ESTIMATE' and 'PRF_ESTIMATE' before conversion
    data['PCT_ESTIMATE'].replace({'(X)': np.nan, '-': np.nan, '': np.nan}, inplace=True)
    data['PRF_ESTIMATE'].replace({'(X)': np.nan, '-': np.nan, '': np.nan}, inplace=True)

    # Convert 'PCT_ESTIMATE' and 'PRF_ESTIMATE' to float, handling non-convertible values
    data['PCT_ESTIMATE'] = pd.to_numeric(data['PCT_ESTIMATE'].str.replace(',', '').str.replace('+', ''), errors='coerce')
    data['PRF_ESTIMATE'] = pd.to_numeric(data['PRF_ESTIMATE'].str.replace(',', '').str.replace('+', ''), errors='coerce')

    return data
