In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 


## 1. Download the topical data file, in SAS format, from:

https://www2.census.gov/programs-surveys/nsch/datasets/2021/nsch_2021_topical_SAS.zip. Turn it into a data frame. We're only interested in the following columns:

- FIPSST
- VEGETABLES
- FRUIT
- SUGARDRINK

In [4]:
data = "data/nsch_2021_topical_SAS/nsch_2021_topical.sas7bdat"

In [87]:
df = pd.read_sas(data, format='sas7bdat', encoding="ISO-8859-1")

In [88]:
df.head()

Unnamed: 0,FIPSST,STRATUM,HHID,FORMTYPE,TOTKIDS_R,TENURE,HHLANGUAGE,SC_AGE_YEARS,SC_SEX,K2Q35A_1_YEARS,...,HHCOUNT_IF,HIGRADE,HIGRADE_TVIS,FPL_I1,FPL_I2,FPL_I3,FPL_I4,FPL_I5,FPL_I6,FWC
0,48,1,21000002,T1,2.0,1.0,3.0,2.0,1.0,,...,0.0,3.0,4.0,400.0,400.0,400.0,400.0,400.0,400.0,2068.528102
1,2,1,21000009,T3,1.0,2.0,1.0,12.0,1.0,,...,0.0,2.0,2.0,139.0,139.0,139.0,139.0,139.0,139.0,172.907963
2,40,1,21000017,T2,2.0,1.0,1.0,8.0,1.0,,...,0.0,3.0,4.0,395.0,395.0,395.0,395.0,395.0,395.0,2169.272128
3,26,1,21000030,T1,1.0,1.0,1.0,2.0,2.0,,...,0.0,2.0,2.0,336.0,336.0,336.0,336.0,336.0,336.0,884.19635
4,22,1,21000031,T1,3.0,3.0,1.0,5.0,2.0,,...,0.0,2.0,2.0,96.0,96.0,96.0,96.0,96.0,96.0,812.410936


In [89]:
sorted(df.columns)

['A1_ACTIVE',
 'A1_AGE',
 'A1_BORN',
 'A1_DEPLSTAT',
 'A1_EMPLOYED',
 'A1_GRADE',
 'A1_GRADE_IF',
 'A1_LIVEUSA',
 'A1_MARITAL',
 'A1_MENTHEALTH',
 'A1_PHYSHEALTH',
 'A1_RELATION',
 'A1_SEX',
 'A2_ACTIVE',
 'A2_AGE',
 'A2_BORN',
 'A2_DEPLSTAT',
 'A2_EMPLOYED',
 'A2_GRADE',
 'A2_LIVEUSA',
 'A2_MARITAL',
 'A2_MENTHEALTH',
 'A2_PHYSHEALTH',
 'A2_RELATION',
 'A2_SEX',
 'ACE1',
 'ACE10',
 'ACE11',
 'ACE12',
 'ACE3',
 'ACE4',
 'ACE5',
 'ACE6',
 'ACE7',
 'ACE8',
 'ACE9',
 'ADDTREAT',
 'AGEPOS4',
 'ALLERGIES',
 'ALLERGIES_CURR',
 'ALLERGIES_DESC',
 'ALTHEALTH',
 'APPOINTMENT',
 'ARRANGEHC',
 'ARTHRITIS',
 'ARTHRITIS_CURR',
 'ARTHRITIS_DESC',
 'ASKQUESTION',
 'ASKQUESTION2',
 'ATHOMEHC',
 'AUTISMMED',
 'AUTISMTREAT',
 'AVAILABLE',
 'AVOIDCHG',
 'BEDTIME',
 'BESTFORCHILD',
 'BIRTHWT',
 'BIRTHWT_L',
 'BIRTHWT_OZ_S',
 'BIRTHWT_VL',
 'BIRTH_MO',
 'BIRTH_YR',
 'BIRTH_YR_F',
 'BLINDNESS',
 'BLOOD',
 'BLOOD_DESC',
 'BLOOD_OTHER',
 'BLOOD_SCREEN',
 'BMICLASS',
 'BORNUSA',
 'BREASTFEDEND_DAY_S',
 'BREAST

In [90]:
cols = ["FIPSST", "VEGETABLES", "FRUIT", "SUGARDRINK"]
df = df[cols]

In [91]:
df.head()

Unnamed: 0,FIPSST,VEGETABLES,FRUIT,SUGARDRINK
0,48,3.0,2.0,1.0
1,2,,,
2,40,,,
3,26,2.0,2.0,2.0
4,22,2.0,3.0,2.0


## 2. Turn the FIPSST column into an integer, and make it the index.

In [92]:
df["FIPSST"] = df.FIPSST.astype("int8")

In [93]:
df = df.set_index("FIPSST")

## 3. What percentage of children had, on average, less than one vegetable per day during the week preceding the study?

In [104]:
df.loc[df['VEGETABLES'] < 4, 'VEGETABLES'].count() / df['VEGETABLES'].count()

0.47014480177883833

## 4. What percentage of children had, on average, less than one vegetable per day and less than one fruit per day during the week preceding the study?

In [105]:
df.loc[(df['VEGETABLES'] < 4) & (df["FRUIT"] < 4), 'VEGETABLES'].count() / df['VEGETABLES'].count()

0.2626498183198655

## 5. What percentage of children had, on average, less than one vegetable per day and less than one fruit per day and did have a sugary drink during the week preceding the study?

In [106]:
df.loc[(df['VEGETABLES'] < 4) & (df["FRUIT"] < 4) & (df["SUGARDRINK"] > 1), 'VEGETABLES'].count() / df['VEGETABLES'].count()

0.1590107923423179

## 6. Download the FIPS state reference info, in CSV format, from https://www2.census.gov/geo/docs/reference/state.txt. Turn this into a data frame, with the STATE column as the index. The only other column we care about is STATE_NAME.

In [107]:
url = "https://www2.census.gov/geo/docs/reference/state.txt"
df_states = pd.read_table(url, delimiter="|", header=0)
df_states = df_states[["STATE", "STATE_NAME"]]

In [108]:
df_states.head()

Unnamed: 0,STATE,STATE_NAME
0,1,Alabama
1,2,Alaska
2,4,Arizona
3,5,Arkansas
4,6,California


In [109]:
df = pd.merge(df, df_states, left_index=True, right_on="STATE")

In [110]:
df.head()

Unnamed: 0,VEGETABLES,FRUIT,SUGARDRINK,STATE,STATE_NAME
43,3.0,2.0,1.0,48,Texas
43,,,,48,Texas
43,,,,48,Texas
43,2.0,4.0,2.0,48,Texas
43,,,,48,Texas


## 7. What percentage of children, per state, had, on average, less than one vegetaable per day during the week preceding the study?


In [112]:
(df.loc[df['VEGETABLES'] < 4, ['STATE_NAME','VEGETABLES']].groupby('STATE_NAME').count()['VEGETABLES'] / df.groupby('STATE_NAME')['VEGETABLES'].count()).sort_values() * 100

STATE_NAME
Vermont                 30.000000
Maine                   33.501259
District of Columbia    35.218509
Minnesota               38.235294
New Hampshire           38.977636
Montana                 39.265537
Kansas                  40.860215
Oregon                  41.118421
Tennessee               41.964286
Alaska                  42.045455
California              42.151163
Colorado                42.574257
Ohio                    42.660550
Washington              42.896936
Massachusetts           42.948718
Wisconsin               43.283582
Maryland                43.971631
North Dakota            44.342508
Wyoming                 44.943820
Connecticut             45.405405
Iowa                    46.089385
North Carolina          46.710526
New Mexico              46.905537
South Carolina          47.040498
Michigan                47.222222
Missouri                47.321429
Pennsylvania            47.826087
Virginia                47.854785
West Virginia           48.245614
Geo