# NHANES 2021-2023 Inferential Analytics Assignment

In [12]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency, ttest_ind, pearsonr
import matplotlib.pyplot as plt
import seaborn as sns

In [28]:
demographic_path = 'DEMO_L.XPT'
demo = pd.read_sas(demographic_path, format='xport') #, encoding='latin1')  # Try 'latin1' or 'iso-8859-1' if UTF-8 fails
demo

Unnamed: 0,SEQN,SDDSRVYR,RIDSTATR,RIAGENDR,RIDAGEYR,RIDAGEMN,RIDRETH1,RIDRETH3,RIDEXMON,RIDEXAGM,...,DMDHRGND,DMDHRAGZ,DMDHREDZ,DMDHRMAZ,DMDHSEDZ,WTINT2YR,WTMEC2YR,SDMVSTRA,SDMVPSU,INDFMPIR
0,130378.0,12.0,2.0,1.0,43.0,,5.0,6.0,2.0,,...,,,,,,50055.450807,54374.463898,173.0,2.0,5.00
1,130379.0,12.0,2.0,1.0,66.0,,3.0,3.0,2.0,,...,,,,,,29087.450605,34084.721548,173.0,2.0,5.00
2,130380.0,12.0,2.0,2.0,44.0,,2.0,2.0,1.0,,...,,,,,,80062.674301,81196.277992,174.0,1.0,1.41
3,130381.0,12.0,2.0,2.0,5.0,,5.0,7.0,1.0,71.0,...,2.0,2.0,2.0,3.0,,38807.268902,55698.607106,182.0,2.0,1.53
4,130382.0,12.0,2.0,1.0,2.0,,3.0,3.0,2.0,34.0,...,2.0,2.0,3.0,1.0,2.0,30607.519774,36434.146346,182.0,2.0,3.60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11928,142306.0,12.0,2.0,1.0,9.0,,2.0,2.0,1.0,111.0,...,1.0,3.0,3.0,3.0,,11147.192563,13459.129019,176.0,1.0,2.01
11929,142307.0,12.0,2.0,2.0,49.0,,4.0,4.0,2.0,,...,,,,,,69419.620456,64962.328962,181.0,1.0,
11930,142308.0,12.0,2.0,1.0,50.0,,2.0,2.0,1.0,,...,,,,,,32696.313477,44367.534132,183.0,2.0,1.95
11931,142309.0,12.0,2.0,1.0,40.0,,2.0,2.0,1.0,,...,,,,,,30547.974564,46249.361849,176.0,1.0,3.11


In [16]:
survey_huq_path = 'HUQ_L.XPT'
access = pd.read_sas(survey_huq_path, format='xport')
access

Unnamed: 0,SEQN,HUQ010,HUQ030,HUQ042,HUQ055,HUQ090
0,130378.0,1.0,1.0,1.0,2.0,2.0
1,130379.0,3.0,1.0,1.0,1.0,2.0
2,130380.0,3.0,1.0,1.0,1.0,2.0
3,130381.0,1.0,1.0,1.0,2.0,2.0
4,130382.0,3.0,1.0,1.0,2.0,
...,...,...,...,...,...,...
11928,142306.0,1.0,2.0,,2.0,2.0
11929,142307.0,3.0,1.0,1.0,2.0,2.0
11930,142308.0,2.0,2.0,,2.0,2.0
11931,142309.0,1.0,1.0,4.0,2.0,2.0


In [17]:
fasting_glucose_path = 'GLU_L.XPT'
glucose = pd.read_sas(fasting_glucose_path, format='xport')
glucose

Unnamed: 0,SEQN,WTSAF2YR,LBXGLU,LBDGLUSI
0,130378.0,1.200253e+05,113.0,6.27
1,130379.0,5.397605e-79,99.0,5.50
2,130380.0,1.450908e+05,156.0,8.66
3,130386.0,8.259962e+04,100.0,5.55
4,130394.0,1.004203e+05,88.0,4.88
...,...,...,...,...
3991,142301.0,3.112337e+04,110.0,6.11
3992,142303.0,1.095823e+05,160.0,8.88
3993,142305.0,8.479001e+04,132.0,7.33
3994,142308.0,5.397605e-79,,


In [18]:
crp_path = 'HSCRP_L.XPT'
crp = pd.read_sas(crp_path, format='xport')
crp

Unnamed: 0,SEQN,WTPH2YR,LBXHSCRP,LBDHRPLC
0,130378.0,5.604213e+04,1.78,5.397605e-79
1,130379.0,3.743571e+04,2.03,5.397605e-79
2,130380.0,8.532884e+04,5.62,5.397605e-79
3,130381.0,5.397605e-79,,
4,130382.0,5.963893e+04,,
...,...,...,...,...
8722,142306.0,5.397605e-79,,
8723,142307.0,6.899418e+04,4.30,5.397605e-79
8724,142308.0,5.397605e-79,,
8725,142309.0,4.628442e+04,0.53,5.397605e-79


In [19]:
body_path = 'BMX_L.XPT'
body = pd.read_sas(body_path, format='xport')
body

Unnamed: 0,SEQN,BMDSTATS,BMXWT,BMIWT,BMXRECUM,BMIRECUM,BMXHEAD,BMIHEAD,BMXHT,BMIHT,...,BMXLEG,BMILEG,BMXARML,BMIARML,BMXARMC,BMIARMC,BMXWAIST,BMIWAIST,BMXHIP,BMIHIP
0,130378.0,1.0,86.9,,,,,,179.5,,...,42.8,,42.0,,35.7,,98.3,,102.9,
1,130379.0,1.0,101.8,,,,,,174.2,,...,38.5,,38.7,,33.7,,114.7,,112.4,
2,130380.0,1.0,69.4,,,,,,152.9,,...,38.5,,35.5,,36.3,,93.5,,98.0,
3,130381.0,1.0,34.3,,,,,,120.1,,...,,,25.4,,23.4,,70.4,,,
4,130382.0,3.0,13.6,,,1.0,,,,1.0,...,,,,1.0,,1.0,,1.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8855,142306.0,1.0,25.3,,,,,,128.0,,...,32.0,,25.0,,19.0,,57.7,,,
8856,142307.0,3.0,,1.0,,,,,143.8,,...,,1.0,34.0,,35.4,,,1.0,,1.0
8857,142308.0,1.0,79.3,,,,,,173.3,,...,41.8,,40.0,,30.6,,98.4,,97.7,
8858,142309.0,1.0,81.9,,,,,,179.1,,...,44.0,,40.0,,30.8,,96.0,,103.3,


In [21]:
bp_path = 'BPXO_L.XPT'
bp = pd.read_sas(bp_path, format='xport')
bp

Unnamed: 0,SEQN,BPAOARM,BPAOCSZ,BPXOSY1,BPXODI1,BPXOSY2,BPXODI2,BPXOSY3,BPXODI3,BPXOPLS1,BPXOPLS2,BPXOPLS3
0,130378.0,b'R',4.0,135.0,98.0,131.0,96.0,132.0,94.0,82.0,79.0,82.0
1,130379.0,b'R',4.0,121.0,84.0,117.0,76.0,113.0,76.0,72.0,71.0,73.0
2,130380.0,b'R',4.0,111.0,79.0,112.0,80.0,104.0,76.0,84.0,83.0,77.0
3,130386.0,b'R',4.0,110.0,72.0,120.0,74.0,115.0,75.0,59.0,64.0,64.0
4,130387.0,b'R',4.0,143.0,76.0,136.0,74.0,145.0,78.0,80.0,80.0,77.0
...,...,...,...,...,...,...,...,...,...,...,...,...
7796,142306.0,b'R',2.0,,,,,,,,,
7797,142307.0,b'R',4.0,127.0,75.0,132.0,73.0,131.0,72.0,71.0,70.0,67.0
7798,142308.0,b'R',3.0,106.0,65.0,106.0,69.0,112.0,74.0,58.0,61.0,69.0
7799,142309.0,b'R',3.0,127.0,81.0,125.0,82.0,128.0,81.0,80.0,79.0,83.0


In [24]:
demo['DMDMARTZ'] = demo['DMDMARTZ'].map({1: 'Married', 2: 'Not Married'})

In [25]:
demo['DMDEDUC2'] = demo['DMDEDUC2'].map({5: "Bachelor's or Higher", 1: 'Less than Bachelor’s', 2: 'Less than Bachelor’s'})


In [26]:
demo['RIDAGEYR'].describe()


Unnamed: 0,RIDAGEYR
count,11933.0
mean,38.31786
std,25.60199
min,5.397605e-79
25%,13.0
50%,37.0
75%,62.0
max,80.0


In [37]:
df['PAD680'] = df['PAD680'].replace([7777, 9999], np.nan)
df['PAD680'].dropna(inplace=True)

NameError: name 'df' is not defined

In [31]:
body['WHD020'] = body['WHD020'].replace([7777, 9999], np.nan)
body['WHD020'].dropna(inplace=True)


KeyError: 'WHD020'

In [33]:
demo[['DMDMARTZ', 'DMDEDUC2', 'RIDAGEYR']].info()
demo[['DMDMARTZ', 'DMDEDUC2']].value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11933 entries, 0 to 11932
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   DMDMARTZ  7792 non-null   float64
 1   DMDEDUC2  7794 non-null   float64
 2   RIDAGEYR  11933 non-null  float64
dtypes: float64(3)
memory usage: 279.8 KB


Unnamed: 0_level_0,Unnamed: 1_level_0,count
DMDMARTZ,DMDEDUC2,Unnamed: 2_level_1
1.0,5.0,1631
1.0,4.0,1151
1.0,3.0,842
2.0,4.0,692
3.0,4.0,525
2.0,3.0,523
3.0,5.0,517
2.0,5.0,473
3.0,3.0,382
1.0,2.0,302


In [38]:
# Question 1
contingency_table = pd.crosstab(demo['DMDMARTZ'], demo['DMDEDUC2'])
chi2, p, dof, expected = chi2_contingency(contingency_table)
print(f'Chi-Square Test:\n Chi2 = {chi2}, p-value = {p}')

Chi-Square Test:
 Chi2 = 199.95521135886386, p-value = 1.1485405257909855e-31


## Question 1 Summary
### Association between Marital Status and Education Level
Based on the chi-square test, I found that...


In [36]:
# Question 2
married = demo[demo['DMDMARTZ'] == 'Married']['PAD680']
not_married = demo[demo['DMDMARTZ'] == 'Not Married']['PAD680']
t_stat, p_val = ttest_ind(married, not_married)
print(f'T-test:\n t-stat = {t_stat}, p-value = {p_val}')

KeyError: 'PAD680'

## Question 2 Summary
### Difference in Mean Sedentary Behavior TIme by Marital Status
...

In [39]:
# Question 3
sns.lmplot(data=demo, x='RIDAGEYR', y='BPXOSY3', hue='DMDMARTZ')
plt.show()

KeyError: "['BPXOSY3'] not in index"

## Question 3 Summary
### Effect of Age and Marital Status on Systolic Blood Pressure
...

In [40]:
# Question 4
corr, p_value = pearsonr(demo['WHD020'], demo['PAD680'])
print(f'Correlation between weight and sedentary behavior:\n Correlation = {corr}, p-value = {p_value}')

KeyError: 'WHD020'

## Question 4 Summary
### Correlation between Self-Reported Weight and Sedentary Behavior
...

In [None]:
# Question 5


## Question 5 Summary
### tbd

In [42]:
# Visualization
sns.boxplot(x='DMDMARTZ', y='PAD680', data=demo)
plt.show()

ValueError: Could not interpret value `PAD680` for `y`. An entry with this name does not appear in `data`.