In [37]:
import pandas as pd
import numpy as np
import scipy.stats as stats


In [56]:
demo = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2021-2022/DEMO_L.XPT', format='xport')
bp = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2021-2022/BPXO_L.XPT', format='xport')
body = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2021-2022/BMX_L.XPT', format='xport')
chol_total = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2021-2022/TCHOL_L.XPT', format='xport')
glycohemo = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2021-2022/GHB_L.XPT', format='xport')
crp = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2021-2022/HSCRP_L.XPT', format='xport')
dm = pd.read_sas('https://wwwn.cdc.gov/Nchs/Nhanes/2021-2022/DIQ_L.XPT', format='xport')

In [58]:
#Question 1: "Is there an association between marital status (married or not married) and education level (bachelor’s degree or higher vs. less than a bachelor’s degree)?"
marital_status = demo.DMDMARTZ
educ_status = demo.DMDEDUC2

In [40]:
#Data Cleaning
marital_status = marital_status.replace([7777, 9999, np.nan], np.nan)  # Remove placeholder values in marital status
marital_status = educ_status.replace([7777, 9999, np.nan], np.nan)  # Remove placeholder values in education level

#Remove Null values
ms_null_count = pd.isna(marital_status).value_counts()
educ_null_count = pd.isna(educ_status).value_counts()

print("Marital Status Null Count:\n", ms_null_count)
print("Education Level Null Count:\n", educ_null_count)

demo_nonull = demo.dropna(subset=['DMDMARTZ', 'DMDEDUC2'])
demo_nonull[['DMDMARTZ','DMDEDUC2']]

print("\nCleaned Data:")
print(demo_nonull[['DMDMARTZ', 'DMDEDUC2']].head())

Marital Status Null Count:
 DMDEDUC2
False    7794
True     4139
Name: count, dtype: int64
Education Level Null Count:
 DMDEDUC2
False    7794
True     4139
Name: count, dtype: int64

Cleaned Data:
   DMDMARTZ  DMDEDUC2
0       1.0       5.0
1       1.0       5.0
2       1.0       3.0
6       3.0       2.0
7       1.0       3.0


In [41]:
#Recode Marital Status and Education Level variable
marital_status = marital_status.map({1: 'Married', 2: 'Not Married'}).dropna()
educ_status = educ_status.map({1: 'College', 2: 'No College'}).dropna()



In [42]:
#Frequency Counts
print("Marital Status Distribution:\n", marital_status.value_counts())
print("Education Level Distribution:\n", educ_status.value_counts())

Marital Status Distribution:
 DMDEDUC2
Not Married    666
Married        373
Name: count, dtype: int64
Education Level Distribution:
 DMDEDUC2
No College    666
College       373
Name: count, dtype: int64


In [43]:
#Contingency Table
contingency_table = pd.crosstab(marital_status, educ_status)
print("\nContingency Table:\n", contingency_table)


Contingency Table:
 DMDEDUC2     College  No College
DMDEDUC2                        
Married          373           0
Not Married        0         666


In [44]:
chi2_stat, p_val, dof, expected = stats.chi2_contingency(contingency_table)

In [45]:
#Statistics
print("\nChi-Square Statistic:", chi2_stat)
print("p-value:", p_val)
print("Degrees of Freedom:", dof)
print("Expected Frequencies Table:\n", expected)


Chi-Square Statistic: 1034.6589609655696
p-value: 5.257557282321254e-227
Degrees of Freedom: 1
Expected Frequencies Table:
 [[133.906641 239.093359]
 [239.093359 426.906641]]


In [46]:
#Results
if p_val < 0.05:
    print("\nThere is no statistically significant association between marital status and education level.")
else:
    print("\nThere is a statistically significant association between marital status and education level.")


There is no statistically significant association between marital status and education level.


In [55]:
#Question 2: "Is there a difference in the mean sedentary behavior time between those who are married and those who are not married?"

sedentary_behavior = demo. PAD680

#Data Cleaning
sedentary_behavior = sedentary_behavior.replace([7777, 9999, np.nan], np.nan)  # Remove placeholder values


AttributeError: 'DataFrame' object has no attribute 'PAD680'

In [54]:
#Recode Marital Status
demo['marital_status'] = demo['DMDMARTZ'].map({1: 'Married', 2: 'Not Married'}).dropna()
demo_cleaned = demo.dropna(subset=['marital_status', 'PAD680'])




KeyError: ['PAD680']

In [53]:
#Checks distirbution of sedentary behvaior time for each marital group
married_data = demo_cleaned[demo_cleaned['marital_status'] == 'Married']['PAD680']
not_married_data = demo_cleaned[demo_cleaned['marital_status'] == 'Not Married']['PAD680']

NameError: name 'demo_cleaned' is not defined

In [52]:
#Statistics
t_stat, p_val = stats.ttest_ind(married_data, not_married_data)
print(f"\nT-statistic for Sedentary Behavior Comparison: {t_stat}")
print(f"P-value for Sedentary Behavior Comparison: {p_val}")



NameError: name 'married_data' is not defined

In [51]:
#Results
alpha = 0.05
if p_val < alpha:
    print("\nResult: Significant difference in sedentary behavior time by marital status.")
else:
    print("\nResult: No significant difference in sedentary behavior time by marital status.")


Result: Significant difference in sedentary behavior time by marital status.
