## Introduction To The Dataset

In [3]:
# Read the whole CSV as text and split it into lines. The context manager
# closes the file handle as soon as the read finishes instead of leaving it
# open for the lifetime of the kernel.
with open("US_births_1994-2003_CDC_NCHS.csv", 'r') as f:
    data = f.read().split('\n')

In [4]:
# Preview the header line and the first nine raw data lines (still strings).
data[0:10]

['year,month,date_of_month,day_of_week,births',
 '1994,1,1,6,8096',
 '1994,1,2,7,7772',
 '1994,1,3,1,10142',
 '1994,1,4,2,11248',
 '1994,1,5,3,11053',
 '1994,1,6,4,11406',
 '1994,1,7,5,11251',
 '1994,1,8,6,8653',
 '1994,1,9,7,7910']

## Converting Data Into A List Of Lists

In [5]:
def read_csv(filename):
    """Read a headered CSV of integers into a list of integer rows.

    Args:
        filename: Path to a comma-separated text file whose first line is a
            header and whose remaining fields are all integer literals.

    Returns:
        A list containing one list of ints per non-blank data line, in file
        order. The header line is not included.

    Raises:
        ValueError: If a field on a non-blank data line is not an integer.
    """
    # The context manager closes the file handle even if reading fails.
    with open(filename) as csv_file:
        lines = csv_file.read().split('\n')

    final_list = []
    # lines[0] is the header, so start from the second line.
    for row in lines[1:]:
        # A trailing newline (or a stray blank line) yields an empty string;
        # int('') would raise ValueError, so such lines are skipped.
        if not row.strip():
            continue
        final_list.append([int(value) for value in row.split(',')])
    return final_list

# Parse the CDC births file into a list of integer rows (header dropped).
cdc_list = read_csv("US_births_1994-2003_CDC_NCHS.csv")

In [6]:
# Preview the first ten parsed rows to confirm every field is now an int.
cdc_list[0:10]

[[1994, 1, 1, 6, 8096],
 [1994, 1, 2, 7, 7772],
 [1994, 1, 3, 1, 10142],
 [1994, 1, 4, 2, 11248],
 [1994, 1, 5, 3, 11053],
 [1994, 1, 6, 4, 11406],
 [1994, 1, 7, 5, 11251],
 [1994, 1, 8, 6, 8653],
 [1994, 1, 9, 7, 7910],
 [1994, 1, 10, 1, 10498]]

## Calculating Number of Births Each Month

In [7]:
def read_csv(filename):
    """Read a headered CSV of integers into a list of integer rows.

    Args:
        filename: Path to a comma-separated text file whose first line is a
            header and whose remaining fields are all integer literals.

    Returns:
        A list containing one list of ints per non-blank data line, in file
        order. The header line is not included.

    Raises:
        ValueError: If a field on a non-blank data line is not an integer.
    """
    # The context manager closes the file handle even if reading fails.
    with open(filename) as csv_file:
        lines = csv_file.read().split('\n')

    final_list = []
    # lines[0] is the header, so start from the second line.
    for row in lines[1:]:
        # A trailing newline (or a stray blank line) yields an empty string;
        # int('') would raise ValueError, so such lines are skipped.
        if not row.strip():
            continue
        final_list.append([int(value) for value in row.split(',')])
    return final_list

# Load the CDC births file into a list of integer rows using read_csv.
cdc_list = read_csv("US_births_1994-2003_CDC_NCHS.csv")

def month_births(data):
    """Total the births recorded for each month.

    Args:
        data: Iterable of rows where index 1 holds the month and index 4
            holds the number of births.

    Returns:
        A dict mapping each month value seen in ``data`` to the sum of the
        births of all rows with that month.
    """
    totals = {}

    for record in data:
        month_key, count = record[1], record[4]
        if month_key not in totals:
            # First row for this month starts the running total.
            totals[month_key] = count
        else:
            totals[month_key] = totals[month_key] + count
    return totals

# Aggregate the CDC rows into total births per month.
cdc_month_births = month_births(cdc_list)

In [8]:
# Display total births keyed by month number.
cdc_month_births

{1: 3232517,
 2: 3018140,
 3: 3322069,
 4: 3185314,
 5: 3350907,
 6: 3296530,
 7: 3498783,
 8: 3525858,
 9: 3439698,
 10: 3378814,
 11: 3171647,
 12: 3301860}

## Calculating Number Of Births Each Day Of Week

In [9]:
def dow_births(data):
    """Total the births recorded for each day of the week.

    Args:
        data: Iterable of rows where index 3 holds the day of the week and
            index 4 holds the number of births.

    Returns:
        A dict mapping each day-of-week value seen in ``data`` to the sum of
        the births of all rows with that day.
    """
    totals_by_dow = {}

    for entry in data:
        weekday = entry[3]
        count = entry[4]
        # Extend the running total if the day was seen before, else start it.
        totals_by_dow[weekday] = (
            totals_by_dow[weekday] + count if weekday in totals_by_dow else count
        )
    return totals_by_dow

# Aggregate the CDC rows into total births per day of the week.
cdc_dow_births = dow_births(cdc_list)

In [10]:
# Display total births keyed by day-of-week number.
cdc_dow_births

{1: 5789166,
 2: 6446196,
 3: 6322855,
 4: 6288429,
 5: 6233657,
 6: 4562111,
 7: 4079723}

## Creating A More General Function To Count Births

In [11]:
def calc_counts(data, column):
    """Total the births for each distinct value found in one column.

    Args:
        data: Iterable of rows where index 4 holds the number of births.
        column: Index of the field whose values become the dictionary keys.

    Returns:
        A dict mapping each distinct ``row[column]`` value to the sum of the
        births of all rows sharing that value.
    """
    totals = {}
    # Unique marker so "key not present yet" cannot be confused with any
    # stored total.
    missing = object()

    for row in data:
        key, amount = row[column], row[4]
        running = totals.get(key, missing)
        totals[key] = amount if running is missing else running + amount
    return totals

# Column indices follow the CSV header:
# 0 = year, 1 = month, 2 = date_of_month, 3 = day_of_week (4 = births).
cdc_year_births = calc_counts(cdc_list, 0)
cdc_month_births = calc_counts(cdc_list, 1)
cdc_dom_births = calc_counts(cdc_list, 2)
cdc_dow_births = calc_counts(cdc_list, 3)

In [12]:
# Display total births keyed by year.
cdc_year_births

{1994: 3952767,
 1995: 3899589,
 1996: 3891494,
 1997: 3880894,
 1998: 3941553,
 1999: 3959417,
 2000: 4058814,
 2001: 4025933,
 2002: 4021726,
 2003: 4089950}

In [13]:
# Display total births keyed by month number.
cdc_month_births

{1: 3232517,
 2: 3018140,
 3: 3322069,
 4: 3185314,
 5: 3350907,
 6: 3296530,
 7: 3498783,
 8: 3525858,
 9: 3439698,
 10: 3378814,
 11: 3171647,
 12: 3301860}

In [14]:
# Display total births keyed by day of the month.
cdc_dom_births

{1: 1276557,
 2: 1288739,
 3: 1304499,
 4: 1288154,
 5: 1299953,
 6: 1304474,
 7: 1310459,
 8: 1312297,
 9: 1303292,
 10: 1320764,
 11: 1314361,
 12: 1318437,
 13: 1277684,
 14: 1320153,
 15: 1319171,
 16: 1315192,
 17: 1324953,
 18: 1326855,
 19: 1318727,
 20: 1324821,
 21: 1322897,
 22: 1317381,
 23: 1293290,
 24: 1288083,
 25: 1272116,
 26: 1284796,
 27: 1294395,
 28: 1307685,
 29: 1223161,
 30: 1202095,
 31: 746696}

In [15]:
# Display total births keyed by day-of-week number.
cdc_dow_births

{1: 5789166,
 2: 6446196,
 3: 6322855,
 4: 6288429,
 5: 6233657,
 6: 4562111,
 7: 4079723}

## Function That Calculates The MIN and MAX Values For Any Dictionary

In [16]:
def MinMax(dictname):
    """Print the largest and then the smallest value stored in a dictionary.

    Args:
        dictname: Dictionary whose values can be compared with ``<`` and
            ``>``.

    Returns:
        None. The maximum is printed on the first line and the minimum on
        the second; for an empty dictionary both lines read ``None``.
    """
    largest = smallest = None

    # A single pass tracks both extremes; None means "nothing seen yet".
    for value in dictname.values():
        if largest is None or value > largest:
            largest = value
        if smallest is None or value < smallest:
            smallest = value

    for extreme in (largest, smallest):
        print(extreme)


In [17]:
# Print the maximum and then the minimum total births found in the
# cdc_dow_births dictionary (one value per line).
MinMax(cdc_dow_births)

6446196
4079723


## A Function That Compares Total Births Across Consecutive Years To Show Whether Births Are Increasing Or Decreasing

In [18]:
def check_birth_growth(birth_data_file):
    """Print, year by year, whether total births rose, fell or stayed equal.

    The file is loaded with ``read_csv`` and totalled per year with
    ``calc_counts`` (column 0). Each year is then compared with the year
    before it, and one status line per year is printed.

    Args:
        birth_data_file: Path of the births CSV file to analyse.

    Returns:
        None. Output is written with ``print``.
    """
    rows = read_csv(birth_data_file)
    year_totals = calc_counts(rows, 0)

    # None (rather than 0) marks "no previous year yet", so a year whose
    # total is genuinely 0 still serves as the baseline for the next year.
    previous_year_birth = None

    # Visit the years in ascending order so every comparison is against the
    # chronologically preceding year, whatever the row order in the file.
    for year in sorted(year_totals):
        current_year_birth = int(year_totals[year])
        if previous_year_birth is None:
            growth_status = "Growth of births in {} not available.".format(year)
        elif current_year_birth > previous_year_birth:
            growth_status = "Births increased in {}.".format(year)
        elif current_year_birth < previous_year_birth:
            growth_status = "Births decreased in {}.".format(year)
        else:
            # Assignment, not comparison: an equal year must report its own
            # message instead of repeating the previous year's status.
            growth_status = "Births in {} was same as previous year.".format(year)
        print(growth_status)
        previous_year_birth = current_year_birth

In [19]:
# Report the year-over-year birth trend for the CDC file.
check_birth_growth(birth_data_file="US_births_1994-2003_CDC_NCHS.csv")

Growth of births in 1994 not available.
Births decreased in 1995.
Births decreased in 1996.
Births decreased in 1997.
Births increased in 1998.
Births increased in 1999.
Births increased in 2000.
Births decreased in 2001.
Births decreased in 2002.
Births increased in 2003.


## Combining The CDC Data With The SSA Data And Handling The Overlapping Time Period

In [30]:
# Read in second dataset US_births_2000-2014_SSA.csv
ssa_list = read_csv("US_births_2000-2014_SSA.csv")


def combine_birth_lists(cdc_rows, ssa_rows):
    """Merge two births tables, averaging births on dates present in both.

    Each row is ``[year, month, date_of_month, day_of_week, births]``. Two
    rows describe the same date when their first four fields are equal.
    Neither input list is modified, so the calling cell can be re-run and
    produce the same result.

    Args:
        cdc_rows: Rows from the CDC file.
        ssa_rows: Rows from the SSA file.

    Returns:
        A 4-tuple ``(averaged, cdc_overlap, ssa_overlap, combined)``:
        ``averaged`` holds one row per shared date with the mean of the two
        birth counts (a float); ``cdc_overlap`` and ``ssa_overlap`` hold
        copies of the original rows for those shared dates; ``combined`` is
        the sorted list of CDC-only rows, SSA-only rows and averaged rows.
    """
    # Index the CDC rows by date so each SSA row is matched with one lookup
    # instead of being compared against a single CDC row or rescanning the
    # whole CDC list.
    cdc_by_date = {tuple(row[:4]): row for row in cdc_rows}

    averaged = []
    cdc_overlap = []
    ssa_overlap = []
    ssa_only = []

    for ssa_row in ssa_rows:
        date_key = tuple(ssa_row[:4])
        cdc_row = cdc_by_date.get(date_key)
        if cdc_row is None:
            # Date exists only in the SSA data: keep the row unchanged.
            ssa_only.append(ssa_row)
            continue

        # Date exists in both datasets: replace the pair by their average.
        avg_births = (float(cdc_row[4]) + float(ssa_row[4])) / 2.0
        averaged.append(list(date_key) + [avg_births])
        cdc_overlap.append(list(cdc_row[:5]))
        ssa_overlap.append(list(ssa_row[:5]))

    # CDC rows whose date was not matched by any SSA row are kept unchanged.
    overlap_dates = {tuple(row[:4]) for row in cdc_overlap}
    cdc_only = [row for row in cdc_rows if tuple(row[:4]) not in overlap_dates]

    # Combine everything, including the averaged rows, and sort for export.
    combined = sorted(cdc_only + ssa_only + averaged)
    return averaged, cdc_overlap, ssa_overlap, combined


# dup_list: averaged rows for dates present in both datasets.
# remove_cdc_list / remove_ssa_list: the original rows those averages replace.
# clean_list: the de-duplicated, sorted union of both datasets.
dup_list, remove_cdc_list, remove_ssa_list, clean_list = combine_birth_lists(
    cdc_list, ssa_list
)
        
        





    
