# Exploring US Births

## Data Description

U.S. births data from two sources: daily counts for the years 1994 to 2003 from the Centers for Disease Control and Prevention's National Center for Health Statistics (CDC NCHS), and daily counts for the years 2000 to 2014 from the Social Security Administration (SSA).

Header - Definition:
* year - Year
* month - Month
* date_of_month - Day number of the month
* day_of_week - Day of week, where 1 is Monday and 7 is Sunday
* births - Number of births

## Reading and Cleaning Data

In [13]:
# Read the raw CSV text; the with-block closes the file as soon as it is read.
with open("US_births_1994-2003_CDC_NCHS.csv", "r") as births_file:
    births_content = births_file.read()
births_split = births_content.split("\n")
# Preview the header line plus the first nine data lines.
births_split[0:10]

['year,month,date_of_month,day_of_week,births',
 '1994,1,1,6,8096',
 '1994,1,2,7,7772',
 '1994,1,3,1,10142',
 '1994,1,4,2,11248',
 '1994,1,5,3,11053',
 '1994,1,6,4,11406',
 '1994,1,7,5,11251',
 '1994,1,8,6,8653',
 '1994,1,9,7,7910']

## Converting Data from String to Lists

In [14]:
def read_csv(file):
    """Read a CSV file with a header row into a list of integer rows.

    The first line is treated as a header and discarded. Every remaining
    non-blank line is split on commas and each field is converted with
    ``int``.

    Args:
        file: Path of the CSV file to read.

    Returns:
        A list containing one list of ints per data line, in file order.
        An empty or header-only file yields an empty list.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field on a data line is not a valid integer.
    """
    final_list = []
    # The context manager closes the handle even if a conversion fails.
    with open(file, "r") as csv_file:
        next(csv_file, None)  # discard the header; tolerate an empty file
        for line in csv_file:
            line = line.strip()
            # A trailing newline or stray blank line would otherwise reach
            # int("") and raise ValueError.
            if not line:
                continue
            final_list.append([int(field) for field in line.split(",")])
    return final_list

# Parse the CDC file into rows of integers and preview the first nine rows.
cdc_list = read_csv("US_births_1994-2003_CDC_NCHS.csv")
cdc_list[0:9]

[[1994, 1, 1, 6, 8096],
 [1994, 1, 2, 7, 7772],
 [1994, 1, 3, 1, 10142],
 [1994, 1, 4, 2, 11248],
 [1994, 1, 5, 3, 11053],
 [1994, 1, 6, 4, 11406],
 [1994, 1, 7, 5, 11251],
 [1994, 1, 8, 6, 8653],
 [1994, 1, 9, 7, 7910]]

## Birth Counts by Month

In [15]:
def month_births(births_list):
    """Total the births recorded for each month.

    Args:
        births_list: Rows in which index 1 holds the month and index 4
            holds the number of births.

    Returns:
        A dict mapping each month found in the rows to the sum of its
        births, keyed in order of first appearance.
    """
    totals = {}
    for row in births_list:
        count = row[4]
        month_key = row[1]
        # Start from zero the first time a month is seen.
        totals[month_key] = totals.get(month_key, 0) + count
    return totals

# Total births per month across every row of the CDC list.
cdc_month_births = month_births(cdc_list)
cdc_month_births

{1: 3232517,
 2: 3018140,
 3: 3322069,
 4: 3185314,
 5: 3350907,
 6: 3296530,
 7: 3498783,
 8: 3525858,
 9: 3439698,
 10: 3378814,
 11: 3171647,
 12: 3301860}

## Birth Counts by Day of Week

In [16]:
def dow_births(births_list):
    """Total the births recorded for each day of the week.

    Args:
        births_list: Rows in which index 3 holds the day of week and
            index 4 holds the number of births.

    Returns:
        A dict mapping each day-of-week value found in the rows to the
        sum of its births, keyed in order of first appearance.
    """
    totals_by_day = {}
    for record in births_list:
        count = record[4]
        weekday = record[3]
        if weekday not in totals_by_day:
            # First row for this weekday: seed the running total.
            totals_by_day[weekday] = count
            continue
        totals_by_day[weekday] = totals_by_day[weekday] + count
    return totals_by_day

# Total births per day of week across every row of the CDC list.
cdc_dow_births = dow_births(cdc_list)
cdc_dow_births

{1: 5789166,
 2: 6446196,
 3: 6322855,
 4: 6288429,
 5: 6233657,
 6: 4562111,
 7: 4079723}

## General Function to Count Births

In [17]:
def calc_counts(data, column):
    """Total the births for each distinct value of one column.

    Args:
        data: Rows in which index 4 holds the number of births.
        column: 1-based position of the column to group by (the value
            at index ``column - 1`` of each row is used as the key).

    Returns:
        A dict mapping each distinct value of the chosen column to the
        sum of births over the rows carrying that value, keyed in order
        of first appearance.
    """
    totals = {}
    for row in data:
        amount = row[4]
        key = row[column - 1]
        try:
            totals[key] = totals[key] + amount
        except KeyError:
            # First time this key is seen.
            totals[key] = amount
    return totals

# Column 4 (1-based) is day_of_week: totals per day of week.
cdc_dow_births = calc_counts(cdc_list,4)
cdc_dow_births

{1: 5789166,
 2: 6446196,
 3: 6322855,
 4: 6288429,
 5: 6233657,
 6: 4562111,
 7: 4079723}

In [18]:
# Column 3 (1-based) is date_of_month: totals per day number of the month.
cdc_dom_births = calc_counts(cdc_list,3)
cdc_dom_births

{1: 1276557,
 2: 1288739,
 3: 1304499,
 4: 1288154,
 5: 1299953,
 6: 1304474,
 7: 1310459,
 8: 1312297,
 9: 1303292,
 10: 1320764,
 11: 1314361,
 12: 1318437,
 13: 1277684,
 14: 1320153,
 15: 1319171,
 16: 1315192,
 17: 1324953,
 18: 1326855,
 19: 1318727,
 20: 1324821,
 21: 1322897,
 22: 1317381,
 23: 1293290,
 24: 1288083,
 25: 1272116,
 26: 1284796,
 27: 1294395,
 28: 1307685,
 29: 1223161,
 30: 1202095,
 31: 746696}

In [19]:
# Column 2 (1-based) is month: totals per month.
cdc_month_births = calc_counts(cdc_list,2)
cdc_month_births

{1: 3232517,
 2: 3018140,
 3: 3322069,
 4: 3185314,
 5: 3350907,
 6: 3296530,
 7: 3498783,
 8: 3525858,
 9: 3439698,
 10: 3378814,
 11: 3171647,
 12: 3301860}

In [20]:
# Column 1 (1-based) is year: totals per year.
cdc_year_births = calc_counts(cdc_list,1)
cdc_year_births

{1994: 3952767,
 1995: 3899589,
 1996: 3891494,
 1997: 3880894,
 1998: 3941553,
 1999: 3959417,
 2000: 4058814,
 2001: 4025933,
 2002: 4021726,
 2003: 4089950}

## Functions to Return the Min/Max Values in a Dictionary and the Year-over-Year Differences within a Range of Years

In [21]:
def min_max_dict(dict_list):
    """Return the smallest and largest values stored in a dictionary.

    Args:
        dict_list: Dictionary whose values are mutually comparable.

    Returns:
        A ``(minimum, maximum)`` tuple of the dictionary's values. An
        empty dictionary yields ``(0, 0)``, as the earlier implementation
        did.
    """
    if not dict_list:
        return 0, 0
    values = dict_list.values()
    # min()/max() work directly on the values, so negative values are
    # handled correctly (seeding the maximum with 0 did not).
    return min(values), max(values)

def year_diff(same_col, same_val, year_from, year_to, data=None):
    """Compute year-over-year birth differences for one column value.

    For every pair of consecutive years ``y`` and ``y + 1`` with
    ``year_from <= y < year_to``, the births of rows whose column
    ``same_col`` equals ``same_val`` are summed per year and the
    difference ``total(y) - total(y + 1)`` is reported.

    Args:
        same_col: 1-based position of the column to filter on.
        same_val: Value the filtered column must equal.
        year_from: First year of the range (inclusive).
        year_to: Last year of the range (inclusive as the second year
            of the final pair).
        data: Optional rows shaped like ``read_csv`` output (year at
            index 0, births at index 4). When omitted, the CDC file
            "US_births_1994-2003_CDC_NCHS.csv" is read.

    Returns:
        A dict mapping ``"<year>-<year + 1>"`` to the difference, in
        ascending year order. Pairs for which either year has no
        matching rows are left out.
    """
    if data is None:
        data = read_csv("US_births_1994-2003_CDC_NCHS.csv")

    # One pass over the rows: births per year for the selected value only.
    totals = {}
    for row in data:
        if row[same_col - 1] == same_val:
            year = row[0]
            totals[year] = totals.get(year, 0) + row[4]

    diffs = {}
    for year in range(year_from, year_to):
        following = year + 1
        # Skip pairs the data does not cover instead of failing on them.
        if year in totals and following in totals:
            label = str(year) + "-" + str(following)
            diffs[label] = totals[year] - totals[following]
    return diffs

# Smallest and largest yearly birth totals in the CDC data, as (min, max).
min_max_yr = min_max_dict(cdc_year_births)
min_max_yr

(3880894, 4089950)

In [22]:
# Column 2 (month) equal to 1: year-over-year differences in January births
# from 1994 to 2003, each computed as the earlier year minus the later year.
month_year_diff = year_diff(2,1,1994,2003)
month_year_diff

{'1994-1995': 4692,
 '1995-1996': 1730,
 '1996-1997': -2928,
 '1997-1998': -2129,
 '1998-1999': 158,
 '1999-2000': -10926,
 '2000-2001': -5090,
 '2001-2002': 4524,
 '2002-2003': 871}

## Combining the Births Data from CDC and SSA

In [23]:
# Read the raw SSA CSV text; the with-block closes the file as soon as it is read.
with open("US_births_2000-2014_SSA.csv", "r") as births_file:
    births_content = births_file.read()
births_split = births_content.split("\n")
# Preview the header line plus the first eight data lines.
births_split[0:9]

['year,month,date_of_month,day_of_week,births',
 '2000,1,1,6,9083',
 '2000,1,2,7,8006',
 '2000,1,3,1,11363',
 '2000,1,4,2,13032',
 '2000,1,5,3,12558',
 '2000,1,6,4,12466',
 '2000,1,7,5,12516',
 '2000,1,8,6,8934']

In [24]:
# Parse the SSA file into rows of integers and preview the first nine rows.
ssa_list = read_csv("US_births_2000-2014_SSA.csv")
ssa_list[0:9]

[[2000, 1, 1, 6, 9083],
 [2000, 1, 2, 7, 8006],
 [2000, 1, 3, 1, 11363],
 [2000, 1, 4, 2, 13032],
 [2000, 1, 5, 3, 12558],
 [2000, 1, 6, 4, 12466],
 [2000, 1, 7, 5, 12516],
 [2000, 1, 8, 6, 8934],
 [2000, 1, 9, 7, 7949]]

In [25]:
# Column 1 (1-based) is year: SSA totals per year.
ssa_year_births = calc_counts(ssa_list,1)
ssa_year_births

{2000: 4149598,
 2001: 4110963,
 2002: 4099313,
 2003: 4163060,
 2004: 4186863,
 2005: 4211941,
 2006: 4335154,
 2007: 4380784,
 2008: 4310737,
 2009: 4190991,
 2010: 4055975,
 2011: 4006908,
 2012: 4000868,
 2013: 3973337,
 2014: 4010532}

In [26]:
# Build one combined list: CDC rows for 1994-1999, followed by every SSA row.
births_list = []
for yr in cdc_year_births:
    # Keep only the CDC years that come before the SSA data starts.
    if not (1994 <= yr < 2000):
        continue
    births_list.extend(row for row in cdc_list if row[0] == yr)

births_list.extend(ssa_list)
# Yearly totals over the combined rows (column 1 is the year).
year_births = calc_counts(births_list,1)
year_births

{1994: 3952767,
 1995: 3899589,
 1996: 3891494,
 1997: 3880894,
 1998: 3941553,
 1999: 3959417,
 2000: 4149598,
 2001: 4110963,
 2002: 4099313,
 2003: 4163060,
 2004: 4186863,
 2005: 4211941,
 2006: 4335154,
 2007: 4380784,
 2008: 4310737,
 2009: 4190991,
 2010: 4055975,
 2011: 4006908,
 2012: 4000868,
 2013: 3973337,
 2014: 4010532}