In [1]:
import os
import regex as re
import numpy as np
import pandas as pd

from tqdm import tqdm

In [2]:
# List the contents of the census data directory (displayed as the cell output).
os.listdir("../data/census")

['.DS_Store', 'cleaned', 'raw']

In [3]:
# Load the gzip-compressed CSV extract from the raw census folder into a DataFrame.
df = pd.read_csv("../data/census/raw/usa_00008.csv.gz", compression = 'gzip')

In [4]:
# Keep only the rows whose YEAR is 2020 and whose SAMPLE identifier is 202003.
df = df[(df['YEAR'] == 2020) & (df['SAMPLE'] == 202003)]

<h2>Columns</h2>

* Sex 
    * 1 = male
    * 2 = female

<h2>Recode Age</h2>

In [5]:
# Drop under-age rows: keep AGE >= 18. Rows with a missing AGE also fail the
# comparison and are removed here.
# .copy() detaches the result from the frame it was sliced from, so the column
# assignments in the following cells write to an independent DataFrame instead
# of raising SettingWithCopyWarning.
df = df[df['AGE'] >= 18].copy()

In [6]:
def recode_age(s):
    """Collapse an age in years into a coarse age-group code.

    Returns
    -------
    1 for 18-34, 2 for 35-54, 3 for 55 and over, 9 when the value is
    missing (``pd.isnull``). Any other value (e.g. below 18) falls through
    every branch and yields ``None``.
    """
    # Closed age bands: (lowest age, highest age, group code).
    for lowest, highest, code in ((18, 34, 1), (35, 54, 2)):
        if lowest <= s <= highest:
            return code
    if s >= 55:
        return 3
    # NaN fails every comparison above, so missing values end up here.
    if pd.isnull(s):
        return 9
    return None

In [7]:
# Bucket every AGE value with recode_age (applied element-wise) and store the codes as a new column.
df['age_recoded'] = df['AGE'].apply(recode_age)

<h2>Recode Race</h2>

In [8]:
def recode_race(s):
    """Derive a combined race/ethnicity code from one row.

    ``s`` is a row-like object exposing ``'HISPAN'`` and ``'RACE'`` keys.
    The HISPAN check takes precedence over RACE.

    Returns
    -------
    4 when HISPAN is one of 1, 2, 3, 4, 9; otherwise 1 for RACE 1, 2 for
    RACE 2, 3 for RACE 4/5/6, and 9 for every other RACE value.

    NOTE(review): HISPAN code 9 is grouped with codes 1-4 here. Confirm
    against the extract's codebook that 9 really denotes a Hispanic-origin
    category and not an unreported value.
    """
    if s['HISPAN'] in (1, 2, 3, 4, 9):
        return 4

    # RACE is only consulted once the HISPAN check has failed.
    race = s['RACE']
    if race == 1.0:
        return 1
    if race == 2.0:
        return 2
    if race in (4.0, 5.0, 6.0):
        return 3
    return 9

In [9]:
# Row-wise apply (axis = 1 hands each row to recode_race) because the recode reads both HISPAN and RACE.
df['race_recoded'] = df.apply(recode_race, axis = 1)

<h2>Recode Gender</h2>

In [10]:
# Boolean indicator: True where SEX == 1 (male, per the column notes above), False for every other value.
df['male'] = df['SEX'] == 1

<h2>Recode Education</h2>

In [11]:
def recode_education(s):
    """Collapse an education code into three categories.

    Returns
    -------
    1 for codes 2 through 63, 2 for codes of 64 and above except 999, and
    3 for everything else (codes below 2, the 999 sentinel, and values such
    as NaN that fail every comparison).
    """
    if 2 <= s <= 63:
        return 1
    # 999 is excluded from the upper category and falls through to 3.
    if s >= 64 and s != 999:
        return 2
    return 3

In [12]:
# recode_education's cut-points (2-63, >= 64, 999) are on the scale of the IPUMS
# *detailed* education variable, EDUCD. The general EDUC variable only ranges
# over 0-11, so recoding it would put every row in category 1 or 3 and never
# produce category 2. Use EDUCD when the extract contains it; fall back to EDUC
# so the cell still runs on extracts that lack the detailed column.
education_source = 'EDUCD' if 'EDUCD' in df.columns else 'EDUC'
df['education_recoded'] = df[education_source].apply(recode_education)

<h2>Recode Region</h2>

In [13]:
### https://www2.census.gov/geo/pdfs/maps-data/maps/reference/us_regdiv.pdf
def recode_region(s):
    """Map a state FIPS code to a region code.

    Returns
    -------
    1, 2, 3 or 4 according to the membership table below, or ``None`` when
    the code is not listed in any region.
    """
    # (region code, state FIPS codes belonging to that region)
    region_members = (
        # 1 - Northeast (New England plus NJ, NY, PA)
        (1, (9.0, 23.0, 25.0, 33.0, 44.0, 50.0, 34.0, 36.0, 42.0)),
        # 2 - Midwest
        (2, (18.0, 17.0, 26.0, 39.0, 55.0, 19.0, 20.0, 27.0, 29.0, 31.0, 38.0, 46.0)),
        # 3 - South
        (3, (10.0, 11.0, 12.0, 13.0, 24.0, 37.0, 45.0, 51.0, 54.0, 1.0, 21.0, 28.0,
             47.0, 5.0, 22.0, 40.0, 48.0)),
        # 4 - West
        (4, (4.0, 8.0, 16.0, 35.0, 30.0, 49.0, 32.0, 56.0, 2.0, 6.0, 15.0, 41.0, 53.0)),
    )

    for region_code, fips_codes in region_members:
        if s in fips_codes:
            return region_code
    return None

In [14]:
# Map each STATEFIP value to its region code via recode_region.
# Note: 'region' is not among the columns kept for the stratification table below,
# so it does not reach the exported file.
df['region'] = df['STATEFIP'].apply(recode_region)

<h2>Prepare Stratification Table</h2>

In [17]:
# Columns carried into the stratification table: the state code, PERWT (summed per cell below),
# and the recoded demographic variables.
targets = ['STATEFIP', 'PERWT', 'age_recoded', 'race_recoded', 'male', 'education_recoded']

In [18]:
# Restrict the frame to the columns listed in targets; every other column is dropped.
df = df[targets]

In [23]:
# Sum PERWT within every observed combination of state and recoded demographics;
# reset_index turns the group keys back into ordinary columns (one row per cell).
strats = df.groupby(['STATEFIP', 'age_recoded', 'race_recoded', 'male', 'education_recoded'])['PERWT'].sum().reset_index()

In [24]:
# Grand total of PERWT across all cells; used as the denominator for the cell proportions.
tot = strats['PERWT'].sum()

In [25]:
# Each cell's share of the overall PERWT total. Shares sum to 1 across the whole table,
# not within each state.
strats['prop'] = strats['PERWT'] / tot

In [26]:
# Write the stratification table to the cleaned folder, omitting the DataFrame index.
strats.to_csv("../data/census/cleaned/post_stratification_data_by_state.csv", index = False)