In [1]:
import json
import os

import pandas as pd
import requests
from bs4 import BeautifulSoup

### Census.gov API Call

In [2]:
# Not referenced in the visible cells; kept in case another cell reads it.
response = None

# Census SAIPE time-series endpoints: one row per state/year, and one row per year for the US.
url1 = 'https://api.census.gov/data/timeseries/poverty/saipe?get=NAME,SAEPOVRTALL_PT,SAEPOVRT0_17_PT,SAEMHI_PT,YEAR&for=state:*'
url2 = 'https://api.census.gov/data/timeseries/poverty/saipe?get=SAEPOVRTALL_PT,SAEPOVRT0_17_PT,SAEMHI_PT,YEAR&for=us'

# A timeout keeps the notebook from hanging on an unresponsive server, and
# raise_for_status() stops here on a 4xx/5xx reply instead of failing later
# when the body is decoded as JSON.
poverty_response1 = requests.get(url1, timeout=30)
poverty_response1.raise_for_status()
poverty_response2 = requests.get(url2, timeout=30)
poverty_response2.raise_for_status()

Convert the API responses to JSON

In [3]:
# Decode both HTTP responses (state-level first, national second) in one pass.
poverty_response_json1, poverty_response_json2 = (
    api_response.json()
    for api_response in (poverty_response1, poverty_response2)
)

Load the poverty data into DataFrames

In [4]:
# The first element of each payload is used as the header row; the rest are data rows.
state_header, *state_rows = poverty_response_json1
nat_header, *nat_rows = poverty_response_json2

poverty_state = pd.DataFrame(state_rows, columns=state_header)
poverty_nat = pd.DataFrame(nat_rows, columns=nat_header)

In [5]:
# Show the column labels of the state frame, then the national frame.
for poverty_frame in (poverty_state, poverty_nat):
    print(poverty_frame.columns)

Index(['NAME', 'SAEPOVRTALL_PT', 'SAEPOVRT0_17_PT', 'SAEMHI_PT', 'YEAR',
       'state'],
      dtype='object')
Index(['SAEPOVRTALL_PT', 'SAEPOVRT0_17_PT', 'SAEMHI_PT', 'YEAR', 'us'], dtype='object')


In [6]:
# One mapping from Census variable codes to short names, applied to both frames.
# Keys that a frame does not contain are ignored by rename().
poverty_renames = {
    "NAME": "STATE",
    "SAEPOVRTALL_PT": "PR_ALL",
    "SAEPOVRT0_17_PT": "PR_YOUTH",
    'SAEMHI_PT': 'MED_HH_INCOME',
}

# Drop the geography column that the API appends, then apply the short names.
poverty_state = poverty_state.drop(columns='state').rename(columns=poverty_renames)
poverty_nat = poverty_nat.drop(columns='us').rename(columns=poverty_renames)

In [7]:
# Target dtypes shared by the state and national poverty frames.
poverty_dtypes = {
    'PR_ALL': float,
    'PR_YOUTH': float,
    'MED_HH_INCOME': float,
    'YEAR': int,
}

poverty_state = poverty_state.astype(poverty_dtypes)
poverty_nat = poverty_nat.astype(poverty_dtypes)

In [8]:
# Confirm the conversions: state frame first, then national.
for poverty_frame in (poverty_state, poverty_nat):
    print(poverty_frame.dtypes)

STATE             object
PR_ALL           float64
PR_YOUTH         float64
MED_HH_INCOME    float64
YEAR               int32
dtype: object
PR_ALL           float64
PR_YOUTH         float64
MED_HH_INCOME    float64
YEAR               int32
dtype: object


In [9]:
# Keep only the rows for 2017 and later in both poverty frames.
nat_recent = poverty_nat['YEAR'] >= 2017
poverty_nat = poverty_nat[nat_recent]

state_recent = poverty_state['YEAR'] >= 2017
poverty_state = poverty_state[state_recent]

In [10]:
# Areas to exclude from the state-level poverty data.
excluded_areas = ['Guam', 'Puerto Rico', 'Virgin Islands', 'District of Columbia']

# A single membership mask replaces four successive != filters.
in_excluded_area = poverty_state['STATE'].isin(excluded_areas)
poverty_state = poverty_state[~in_excluded_area]

# National salary data

In [11]:
# Read each yearly national workbook and tag its rows with the survey year in one step.
national2017_df = pd.read_excel("data/national_M2017_dl.xlsx").assign(Year=2017)
national2018_df = pd.read_excel("data/national_M2018_dl.xlsx").assign(Year=2018)
national2019_df = pd.read_excel("data/national_M2019_dl.xlsx").assign(Year=2019)
national2020_df = pd.read_excel("data/national_M2020_dl.xlsx").assign(Year=2020)
national2021_df = pd.read_excel("data/national_M2021_dl.xlsx").assign(Year=2021)

Add Year columns to data

Reusable function to convert column names to upper case

In [12]:
def upper_columns(data):
    """Upper-case the column labels of ``data`` in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose ``columns`` attribute is replaced with upper-cased labels.

    Returns
    -------
    None
        The frame is modified in place; nothing is returned, so a call used as
        the last line of a cell produces no output.
    """
    # Read the labels from data.columns explicitly instead of iterating the frame.
    # Non-string labels (e.g. an integer header) have no .upper(), so they are
    # kept unchanged rather than raising AttributeError.
    data.columns = [
        name.upper() if isinstance(name, str) else name
        for name in data.columns
    ]

In [13]:
# Upper-case the headers of every yearly national frame (modified in place).
for yearly_frame in (
    national2017_df,
    national2018_df,
    national2019_df,
    national2020_df,
    national2021_df,
):
    upper_columns(yearly_frame)

Rename columns that don't match across years

In [14]:
# These three frames carry the occupation group as O_GROUP; rename it to
# OCC_GROUP, the label selected from every yearly frame in the next step.
group_rename = {'O_GROUP': 'OCC_GROUP'}

national2019_df = national2019_df.rename(columns=group_rename)
national2020_df = national2020_df.rename(columns=group_rename)
national2021_df = national2021_df.rename(columns=group_rename)

Filter columns and set correct column order

In [15]:
# Columns (and their order) kept from every yearly national frame.
national_columns = [
    'OCC_CODE', 'OCC_TITLE', 'OCC_GROUP', 'TOT_EMP', 'EMP_PRSE', 'H_MEAN',
    'A_MEAN', 'MEAN_PRSE', 'H_PCT10', 'H_PCT25', 'H_MEDIAN', 'H_PCT75',
    'H_PCT90', 'A_PCT10', 'A_PCT25', 'A_MEDIAN', 'A_PCT75', 'A_PCT90',
    'YEAR',
]

# Selecting with the same list gives every year an identical layout for concatenation.
national2017_ordered = national2017_df[national_columns]
national2018_ordered = national2018_df[national_columns]
national2019_ordered = national2019_df[national_columns]
national2020_ordered = national2020_df[national_columns]
national2021_ordered = national2021_df[national_columns]

Concat all National data into one df

In [16]:
# Stack the yearly national frames row-wise; each frame keeps its own index values.
national_frames = [
    national2017_ordered,
    national2018_ordered,
    national2019_ordered,
    national2020_ordered,
    national2021_ordered,
]
national_df = pd.concat(national_frames, axis=0)

# Only the last expression of a cell is displayed, so a bare `.shape` in the
# middle of the cell was silently discarded; print it explicitly.
print(national_df.shape)
national_df

Unnamed: 0,OCC_CODE,OCC_TITLE,OCC_GROUP,TOT_EMP,EMP_PRSE,H_MEAN,A_MEAN,MEAN_PRSE,H_PCT10,H_PCT25,H_MEDIAN,H_PCT75,H_PCT90,A_PCT10,A_PCT25,A_MEDIAN,A_PCT75,A_PCT90,YEAR
0,00-0000,All Occupations,total,142549250,0.1,24.34,50620,0.1,9.6,11.91,18.12,29.38,46.23,19970,24770,37690,61110,96150,2017
1,11-0000,Management Occupations,major,7280330,0.2,57.65,119910,0.1,23.19,33.6,49.32,71.83,#,48220,69880,102590,149410,#,2017
2,11-1000,Top Executives,minor,2473740,0.3,61.55,128020,0.2,20.74,31.74,49.58,78.72,#,43140,66030,103120,163740,#,2017
3,11-1010,Chief Executives,broad,210160,0.7,94.25,196050,0.4,32.74,54.55,88.11,#,#,68110,113470,183270,#,#,2017
4,11-1011,Chief Executives,detailed,210160,0.7,94.25,196050,0.4,32.74,54.55,88.11,#,#,68110,113470,183270,#,#,2017
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1398,53-7081,Refuse and Recyclable Material Collectors,detailed,126050,1.5,20.57,42780,0.7,11.99,14.56,18.51,23.5,29.96,24950,30270,38500,48870,62310,2021
1399,53-7120,"Tank Car, Truck, and Ship Loaders",broad,12090,6.8,26.6,55330,2.6,15.91,18.54,23.75,37.31,38.39,33090,38570,49390,77610,79860,2021
1400,53-7121,"Tank Car, Truck, and Ship Loaders",detailed,12090,6.8,26.6,55330,2.6,15.91,18.54,23.75,37.31,38.39,33090,38570,49390,77610,79860,2021
1401,53-7190,Miscellaneous Material Moving Workers,broad,22470,4.7,19.32,40190,1.8,13.68,14.4,17.38,22.78,29.24,28450,29960,36150,47380,60830,2021


In [17]:
# Narrow the combined national frame to the columns selected below.
national_keep = [
    'OCC_TITLE', 'OCC_GROUP', 'TOT_EMP', 'H_MEAN',
    'A_MEAN', 'H_MEDIAN', 'A_MEDIAN', 'YEAR',
]
national_df = national_df.loc[:, national_keep]
national_df

Unnamed: 0,OCC_TITLE,OCC_GROUP,TOT_EMP,H_MEAN,A_MEAN,H_MEDIAN,A_MEDIAN,YEAR
0,All Occupations,total,142549250,24.34,50620,18.12,37690,2017
1,Management Occupations,major,7280330,57.65,119910,49.32,102590,2017
2,Top Executives,minor,2473740,61.55,128020,49.58,103120,2017
3,Chief Executives,broad,210160,94.25,196050,88.11,183270,2017
4,Chief Executives,detailed,210160,94.25,196050,88.11,183270,2017
...,...,...,...,...,...,...,...,...
1398,Refuse and Recyclable Material Collectors,detailed,126050,20.57,42780,18.51,38500,2021
1399,"Tank Car, Truck, and Ship Loaders",broad,12090,26.6,55330,23.75,49390,2021
1400,"Tank Car, Truck, and Ship Loaders",detailed,12090,26.6,55330,23.75,49390,2021
1401,Miscellaneous Material Moving Workers,broad,22470,19.32,40190,17.38,36150,2021


Remove rows that contain a placeholder marker (`#`, `*`, or `**`) in any column:

In [18]:
# Marker strings that stand in for a value in the source data.
placeholder_markers = ['#', '*', '**']

# Flag every row where at least one column holds a marker, then drop those rows.
cols = list(national_df.columns)
has_placeholder = national_df[cols].isin(placeholder_markers).any(axis=1)
national_df = national_df[~has_placeholder]
national_df.head(1)

Unnamed: 0,OCC_TITLE,OCC_GROUP,TOT_EMP,H_MEAN,A_MEAN,H_MEDIAN,A_MEDIAN,YEAR
0,All Occupations,total,142549250,24.34,50620,18.12,37690,2017


Convert types:

In [19]:
# Inspect the column labels and current dtypes before converting them.
for schema_part in (national_df.columns, national_df.dtypes):
    print(schema_part)

Index(['OCC_TITLE', 'OCC_GROUP', 'TOT_EMP', 'H_MEAN', 'A_MEAN', 'H_MEDIAN',
       'A_MEDIAN', 'YEAR'],
      dtype='object')
OCC_TITLE    object
OCC_GROUP    object
TOT_EMP      object
H_MEAN       object
A_MEAN       object
H_MEDIAN     object
A_MEDIAN     object
YEAR          int64
dtype: object


In [31]:
# Numeric dtypes for the national salary frame: employment count and year as
# integers, wage columns as floats.
national_dtypes = {
    'TOT_EMP': int,
    'H_MEAN': float,
    'A_MEAN': float,
    'H_MEDIAN': float,
    'A_MEDIAN': float,
    'YEAR': int,
}
national_df = national_df.astype(national_dtypes)
print(national_df.dtypes)

OCC_TITLE     object
OCC_GROUP     object
TOT_EMP        int32
H_MEAN       float64
A_MEAN       float64
H_MEDIAN     float64
A_MEDIAN     float64
YEAR           int32
dtype: object


# State salary data

Read in state salary data from excel

In [21]:
# Read each yearly state workbook and tag its rows with the survey year in one step.
state2017_df = pd.read_excel("data/state_M2017_dl.xlsx").assign(Year=2017)
state2018_df = pd.read_excel("data/state_M2018_dl.xlsx").assign(Year=2018)
state2019_df = pd.read_excel("data/state_M2019_dl.xlsx").assign(Year=2019)
state2020_df = pd.read_excel("data/state_M2020_dl.xlsx").assign(Year=2020)
state2021_df = pd.read_excel("data/state_M2021_dl.xlsx").assign(Year=2021)

In [22]:
# Upper-case the headers of every yearly state frame (modified in place).
for yearly_frame in (
    state2017_df,
    state2018_df,
    state2019_df,
    state2020_df,
    state2021_df,
):
    upper_columns(yearly_frame)

Rename columns that are unmatched

In [23]:
# The 2019-2021 frames are renamed from AREA_TITLE / O_GROUP; the 2017-2018
# frames are renamed from LOC_Q. Both end up with the labels used for selection.
late_layout_renames = {'AREA_TITLE': 'STATE', 'O_GROUP': 'OCC_GROUP'}
early_layout_renames = {'LOC_Q': 'LOC_QUOTIENT'}

state2019_df = state2019_df.rename(columns=late_layout_renames)
state2020_df = state2020_df.rename(columns=late_layout_renames)
state2021_df = state2021_df.rename(columns=late_layout_renames)

state2017_df = state2017_df.rename(columns=early_layout_renames)
state2018_df = state2018_df.rename(columns=early_layout_renames)


Filter columns and set correct column order

In [24]:
# Columns (and their order) kept from every yearly state frame.
state_columns = [
    'AREA', 'STATE', 'OCC_CODE', 'OCC_TITLE', 'OCC_GROUP', 'TOT_EMP',
    'EMP_PRSE', 'JOBS_1000', 'LOC_QUOTIENT', 'H_MEAN', 'A_MEAN', 'MEAN_PRSE',
    'H_PCT10', 'H_PCT25', 'H_MEDIAN', 'H_PCT75', 'H_PCT90', 'A_PCT10',
    'A_PCT25', 'A_MEDIAN', 'A_PCT75', 'A_PCT90', 'YEAR',
]

# Selecting with the same list gives every year an identical layout for concatenation.
state2017_ordered = state2017_df[state_columns]
state2018_ordered = state2018_df[state_columns]
state2019_ordered = state2019_df[state_columns]
state2020_ordered = state2020_df[state_columns]
state2021_ordered = state2021_df[state_columns]

Concat all State data into one df

In [25]:
# Stack the yearly state frames row-wise and report the combined size.
state_frames = [
    state2017_ordered,
    state2018_ordered,
    state2019_ordered,
    state2020_ordered,
    state2021_ordered,
]
state_df = pd.concat(state_frames, axis=0)
state_df.shape

(183936, 23)

Remove non-state areas (Guam, Puerto Rico, Virgin Islands, and the District of Columbia)

In [26]:
# Areas to exclude from the state-level salary data.
excluded_areas = ['Guam', 'Puerto Rico', 'Virgin Islands', 'District of Columbia']

# A single membership mask replaces four successive != filters.
in_excluded_area = state_df['STATE'].isin(excluded_areas)
state_df = state_df[~in_excluded_area]
state_df.columns

Index(['AREA', 'STATE', 'OCC_CODE', 'OCC_TITLE', 'OCC_GROUP', 'TOT_EMP',
       'EMP_PRSE', 'JOBS_1000', 'LOC_QUOTIENT', 'H_MEAN', 'A_MEAN',
       'MEAN_PRSE', 'H_PCT10', 'H_PCT25', 'H_MEDIAN', 'H_PCT75', 'H_PCT90',
       'A_PCT10', 'A_PCT25', 'A_MEDIAN', 'A_PCT75', 'A_PCT90', 'YEAR'],
      dtype='object')

Filter out unnecessary columns

In [27]:
# Narrow the combined state frame to the columns selected below.
state_keep = [
    'STATE', 'OCC_TITLE', 'OCC_GROUP', 'TOT_EMP',
    'JOBS_1000', 'LOC_QUOTIENT', 'H_MEAN', 'A_MEAN',
    'H_MEDIAN', 'A_MEDIAN', 'YEAR',
]
state_df = state_df.loc[:, state_keep]

Remove missing data

In [28]:
# Marker strings that stand in for a value in the source data.
placeholder_markers = ['#', '*', '**']

# Flag every row where at least one column holds a marker, then drop those rows.
cols = list(state_df.columns)
has_placeholder = state_df[cols].isin(placeholder_markers).any(axis=1)
state_df = state_df[~has_placeholder]

Convert types

In [29]:
# Every wage column is cast to float so state_df matches the dtypes applied to
# national_df; A_MEAN was previously the only wage column cast to int.
state_df = state_df.astype({
    'TOT_EMP': int,
    'JOBS_1000': float,
    'LOC_QUOTIENT': float,
    'H_MEAN': float,
    'A_MEAN': float,
    'H_MEDIAN': float,
    'A_MEDIAN': float,
    'YEAR': int,
})


In [32]:
# Per-column null counts for each of the four cleaned frames.
for cleaned_frame in (poverty_nat, poverty_state, national_df, state_df):
    print(cleaned_frame.isna().sum())

PR_ALL           0
PR_YOUTH         0
MED_HH_INCOME    0
YEAR             0
dtype: int64
STATE            0
PR_ALL           0
PR_YOUTH         0
MED_HH_INCOME    0
YEAR             0
dtype: int64
OCC_TITLE    0
OCC_GROUP    0
TOT_EMP      0
H_MEAN       0
A_MEAN       0
H_MEDIAN     0
A_MEDIAN     0
YEAR         0
dtype: int64
STATE           0
OCC_TITLE       0
OCC_GROUP       0
TOT_EMP         0
JOBS_1000       0
LOC_QUOTIENT    0
H_MEAN          0
A_MEAN          0
H_MEDIAN        0
A_MEDIAN        0
YEAR            0
dtype: int64


In [30]:
# DataFrame.to_csv does not create missing parent directories, so make sure
# the destination folder exists before writing (no-op if it already does).
os.makedirs('output', exist_ok=True)

# Write the four cleaned frames without their index column.
poverty_state.to_csv('output/state_poverty.csv', index=False)
poverty_nat.to_csv('output/national_poverty.csv', index=False)
national_df.to_csv('output/national_salary.csv', index=False)
state_df.to_csv('output/state_salary.csv', index=False)