I do some basic analysis of 2017 CDC immunization data here, based on my solutions to a problem set for Applied Data Science with Python from Coursera.

In [1]:
import pandas as pd
import re
import scipy.stats as stats
import numpy as np

In [3]:
def proportion_of_education(path='NISPUF17.csv'):
    """Return the share of children whose mother has each education level.

    Parameters
    ----------
    path : str, optional
        Location of the survey CSV; its first column is used as the index.
        Defaults to 'NISPUF17.csv' in the working directory.

    Returns
    -------
    dict
        Maps each of the four education labels to the fraction of ALL rows
        whose ``EDUC1`` code matches it (1-4, in the order listed below).
        Rows with any other or missing ``EDUC1`` value still count in the
        denominator. Every value is NaN if the file has no rows.
    """
    education = pd.read_csv(path, index_col=0)['EDUC1']
    labels_by_code = {
        1: "less than high school",
        2: "high school",
        3: "more than high school but not college",
        4: "college",
    }
    # The mean of a boolean mask is the fraction of True entries; this is
    # vectorized, unlike builtin sum(), which walks the Series one element
    # at a time. float() keeps the displayed dict free of numpy scalar reprs.
    return {
        label: float((education == code).mean())
        for code, label in labels_by_code.items()
    }
# Bare call as the cell's last expression so the returned dict is displayed as the cell output.
proportion_of_education()

{'less than high school': 0.10202002459160373,
 'high school': 0.172352011241876,
 'more than high school but not college': 0.24588090637625154,
 'college': 0.47974705779026877}

In [7]:
def average_influenza_doses(path='NISPUF17.csv'):
    """Return the mean number of flu vaccine doses by breastfeeding status.

    Only children whose breastfeeding status is known are considered:
    ``CBF_01 == 1`` (received breastmilk) and ``CBF_01 == 2`` (did not).
    Rows with a missing ``P_NUMFLU`` are excluded from each mean.

    Parameters
    ----------
    path : str, optional
        Location of the survey CSV; its first column is used as the index.
        Defaults to 'NISPUF17.csv' in the working directory.

    Returns
    -------
    tuple of float
        ``(mean_doses_breastfed, mean_doses_not_breastfed)``. A group with no
        usable rows yields NaN rather than raising ZeroDivisionError.
    """
    df = pd.read_csv(path, index_col=0)

    def mean_doses(cbf_code):
        # One code path for both groups instead of two copy-pasted pipelines.
        doses = df.loc[df['CBF_01'] == cbf_code, 'P_NUMFLU']
        # `>= 0` evaluates to False for NaN, so this mask is what removes the
        # missing dose counts (it would also drop negative values, if any).
        return float(doses[doses >= 0].mean())

    return (mean_doses(1), mean_doses(2))
# Call the function once and unpack the pair: every call re-reads and
# re-parses the CSV, so indexing two separate calls does the work twice.
breastfed_mean, not_breastfed_mean = average_influenza_doses()
print('Breastmilk: {0}\nNo breastmilk: {1}'
      .format(breastfed_mean, not_breastfed_mean))

Breastmilk: 1.8799187420058687
No breastmilk: 1.5963945918878317


In [5]:
def chickenpox_by_sex(path='NISPUF17.csv'):
    """Return, per sex, the chickenpox ratio among varicella-vaccinated children.

    Among children with at least one varicella dose (``P_NUMVRC > 0``), the
    ratio is

        (# with HAD_CPOX == 1, i.e. contracted chickenpox)
        ---------------------------------------------------
        (# with HAD_CPOX == 2, i.e. did not contract it)

    Rows with any other ``HAD_CPOX`` value are ignored.

    Parameters
    ----------
    path : str, optional
        Location of the survey CSV; its first column is used as the index.
        Defaults to 'NISPUF17.csv' in the working directory.

    Returns
    -------
    dict
        ``{'male': ratio, 'female': ratio}`` where ``SEX == 1`` is male and
        ``SEX == 2`` is female. A ratio is NaN when its denominator is zero.
    """
    df = pd.read_csv(path, index_col=0)
    # NaN dose counts compare False here, so unknown vaccination status is excluded.
    vaccinated = df[df['P_NUMVRC'] > 0]

    ratios = {}
    for label, sex_code in (('male', 1), ('female', 2)):
        had_cpox = vaccinated.loc[vaccinated['SEX'] == sex_code, 'HAD_CPOX']
        contracted = int((had_cpox == 1).sum())
        not_contracted = int((had_cpox == 2).sum())
        # The ratio is undefined without a comparison group; report NaN
        # explicitly instead of depending on int-vs-numpy division behaviour.
        ratios[label] = (
            contracted / not_contracted if not_contracted else float('nan')
        )
    return ratios
# Bare call as the cell's last expression so the returned dict is displayed as the cell output.
chickenpox_by_sex()

{'male': 0.009675583380762664, 'female': 0.0077918259335489565}

In [6]:
def corr_chickenpox(path='NISPUF17.csv'):
    """Return the Pearson correlation between HAD_CPOX and P_NUMVRC.

    Cleaning applied before correlating:
      * keep only rows with ``HAD_CPOX <= 2`` (this also drops missing values,
        since NaN compares False);
      * drop rows whose ``P_NUMVRC`` (number of varicella doses) is missing.

    ``HAD_CPOX`` is correlated as coded in the file, so the sign of the result
    follows that coding (chickenpox_by_sex treats 1 as "had chickenpox" and
    2 as "did not").

    Parameters
    ----------
    path : str, optional
        Location of the survey CSV; its first column is used as the index.
        Defaults to 'NISPUF17.csv' in the working directory.

    Returns
    -------
    float
        The Pearson correlation coefficient; the p-value is discarded.
    """
    df = pd.read_csv(path, index_col=0)
    # Build both masks from single columns rather than calling notna() on the
    # whole frame just to read one column of the result.
    usable = df[(df['HAD_CPOX'] <= 2) & df['P_NUMVRC'].notna()]
    corr, _ = stats.pearsonr(usable['HAD_CPOX'], usable['P_NUMVRC'])
    return float(corr)
# Bare call as the cell's last expression so the returned coefficient is displayed as the cell output.
corr_chickenpox()

0.07044873460148