# Python Basic programming:

In this notebook, we will explore two datsets on births in the U.S. This data set was compiled by FiveThirtyEight. 

US_births.csv contains U.S. births data for the years 1994 to 2003, as provided by the Centers for Disease Control and Prevention's National Center for Health Statistics.

SSA_births.csv contains U.S. births data for the years 2000 to 2014, as provided by the Social Security Administration.

## Overview

The dataset contains the following columns:

year: Year (1994 to 2014).
month: Month (1 to 12).
date_of_month: Day number of the month (1 to 31).
day_of_week: Day of week (1 to 7).
births: Number of births that day.

In [7]:
# Read csv file and print 5 records
open_file = open("US_Births.csv")
read_file = open_file.read()
split_file = read_file.split("\n")
split_file[0:10]


['year,month,date_of_month,day_of_week,births',
 '1994,1,1,6,8096',
 '1994,1,2,7,7772',
 '1994,1,3,1,10142',
 '1994,1,4,2,11248',
 '1994,1,5,3,11053',
 '1994,1,6,4,11406',
 '1994,1,7,5,11251',
 '1994,1,8,6,8653',
 '1994,1,9,7,7910']

In [9]:
# Convert csv file into list of list

csv_list = []
for row in split_file:
    split_row = row.split(",")
    csv_list.append(split_row)
    
csv_list[0:10]    

[['year', 'month', 'date_of_month', 'day_of_week', 'births'],
 ['1994', '1', '1', '6', '8096'],
 ['1994', '1', '2', '7', '7772'],
 ['1994', '1', '3', '1', '10142'],
 ['1994', '1', '4', '2', '11248'],
 ['1994', '1', '5', '3', '11053'],
 ['1994', '1', '6', '4', '11406'],
 ['1994', '1', '7', '5', '11251'],
 ['1994', '1', '8', '6', '8653'],
 ['1994', '1', '9', '7', '7910']]

In [18]:
# Create function to read csv and convert to list of list

def read_csv(file_name,header = True):
    file = open(file_name).read().split("\n")
    list_file = []
    if header == True:
        file = file[1:len(file)]
    for row in file:
        split_row = row.split(",")
        list_file.append(split_row)
    return list_file    
    

In [19]:
csv_list = read_csv("US_Births.csv")
csv_list[0:10]

[['1994', '1', '1', '6', '8096'],
 ['1994', '1', '2', '7', '7772'],
 ['1994', '1', '3', '1', '10142'],
 ['1994', '1', '4', '2', '11248'],
 ['1994', '1', '5', '3', '11053'],
 ['1994', '1', '6', '4', '11406'],
 ['1994', '1', '7', '5', '11251'],
 ['1994', '1', '8', '6', '8653'],
 ['1994', '1', '9', '7', '7910'],
 ['1994', '1', '10', '1', '10498']]

In [25]:
def calc_counts(birth_list,idx):
    birth_dict = {}
    count = 0
    for row in birth_list:
        month = row[idx]
        births = int(row[4])
        if month in birth_dict:
            birth_dict[month] = count + births
        else:
            birth_dict[month] = count
    return birth_dict



In [27]:
cdc_year_births = calc_counts(csv_list,0)
cdc_month_births = calc_counts(csv_list,1)
cdc_dom_births = calc_counts(csv_list,2)
cdc_dow_births = calc_counts(csv_list,3)

In [28]:
cdc_year_births

{'1994': 8809,
 '1995': 7596,
 '1996': 11924,
 '1997': 11818,
 '1998': 11389,
 '1999': 9335,
 '2000': 7721,
 '2001': 10018,
 '2002': 12339,
 '2003': 12374}

In [29]:
cdc_dow_births

{'1': 12823,
 '2': 14438,
 '3': 12374,
 '4': 6628,
 '5': 10218,
 '6': 8646,
 '7': 7645}

In [40]:
# function to get min and max values

def calc_min_max(dict_data):
    max_key = max(dict_data, key = lambda x: dict_data.get(x))
    min_key = min(dict_data, key = lambda x: dict_data.get(x))
    return [min_key,dict_data[min_key],max_key,dict_data[max_key]]
    

In [41]:
year_min_max = calc_min_max(cdc_year_births)
year_min_max

['1995', 7596, '2003', 12374]

In [45]:
# number of births in saturday change between 1994 and 2003
def calc_counts1(birth_list,idx,idx1):
    birth_dict = {}
    count = 0
    for row in birth_list:
        year_day = row[idx]+"-"+row[idx1]
        births = int(row[4])
        if year_day in birth_dict:
            birth_dict[year_day] = count + births
        else:
            birth_dict[year_day] = count
    return birth_dict

In [50]:
cdc_births = calc_counts1(csv_list,0,3)
cdc_births

{'1994-1': 8454,
 '1994-2': 11131,
 '1994-3': 12398,
 '1994-4': 12189,
 '1994-5': 12051,
 '1994-6': 8809,
 '1994-7': 7192,
 '1995-1': 7027,
 '1995-2': 9447,
 '1995-3': 11897,
 '1995-4': 12530,
 '1995-5': 12207,
 '1995-6': 9093,
 '1995-7': 7596,
 '1996-1': 11911,
 '1996-2': 11924,
 '1996-3': 7092,
 '1996-4': 10321,
 '1996-5': 12260,
 '1996-6': 9103,
 '1996-7': 8193,
 '1997-1': 12206,
 '1997-2': 13521,
 '1997-3': 11818,
 '1997-4': 7055,
 '1997-5': 10075,
 '1997-6': 8679,
 '1997-7': 7864,
 '1998-1': 11980,
 '1998-2': 13637,
 '1998-3': 13297,
 '1998-4': 11389,
 '1998-5': 7020,
 '1998-6': 7735,
 '1998-7': 7829,
 '1999-1': 11579,
 '1999-2': 13158,
 '1999-3': 12629,
 '1999-4': 11935,
 '1999-5': 9335,
 '1999-6': 6674,
 '1999-7': 7432,
 '2000-1': 6566,
 '2000-2': 10070,
 '2000-3': 12858,
 '2000-4': 13532,
 '2000-5': 13234,
 '2000-6': 9177,
 '2000-7': 7721,
 '2001-1': 10018,
 '2001-2': 6443,
 '2001-3': 10735,
 '2001-4': 13762,
 '2001-5': 13918,
 '2001-6': 9365,
 '2001-7': 7679,
 '2002-1': 12906,

In [86]:
# check for only saturday for each year by removing arguments and passing min and max year

def calc_counts1(birth_list,year_min,year_max):
    birth_dict = {}
    count = 0
    for row in birth_list:
        year = int(row[0])
        births = int(row[4])
        if ((row[3] == '6') & (year >= year_min) & (year <= year_max)):
            if year in birth_dict:
                birth_dict[year] = count + births
            else:
                birth_dict[year] = count
    return birth_dict

In [87]:
# select births between years
cdc_births = calc_counts1(csv_list,1994,2003)
cdc_births

{1994: 8809,
 1995: 9093,
 1996: 9103,
 1997: 8679,
 1998: 7735,
 1999: 6674,
 2000: 9177,
 2001: 9365,
 2002: 8953,
 2003: 8646}

In [90]:
# Create function to read csv and convert to list of list to integer values

def read_csv_int(file_name,header = True):
    file = open(file_name).read().split("\n")
    list_file = []
    if header == True:
        file = file[1:len(file)]
    for row in file:
        split_row = row.split(",")
        int_list = []
        for i in split_row:
            int_list.append(int(i))
        list_file.append(int_list)
    return list_file   


In [93]:
ssa_list_int = read_csv_int("SSA_Births.csv")
ssa_list_int[0:10]

[[2000, 1, 1, 6, 9083],
 [2000, 1, 2, 7, 8006],
 [2000, 1, 3, 1, 11363],
 [2000, 1, 4, 2, 13032],
 [2000, 1, 5, 3, 12558],
 [2000, 1, 6, 4, 12466],
 [2000, 1, 7, 5, 12516],
 [2000, 1, 8, 6, 8934],
 [2000, 1, 9, 7, 7949],
 [2000, 1, 10, 1, 11668]]

In [95]:
us_list_int = read_csv_int("US_Births.csv")
us_list_int[0:10]

[[1994, 1, 1, 6, 8096],
 [1994, 1, 2, 7, 7772],
 [1994, 1, 3, 1, 10142],
 [1994, 1, 4, 2, 11248],
 [1994, 1, 5, 3, 11053],
 [1994, 1, 6, 4, 11406],
 [1994, 1, 7, 5, 11251],
 [1994, 1, 8, 6, 8653],
 [1994, 1, 9, 7, 7910],
 [1994, 1, 10, 1, 10498]]

In [109]:
# combine CDC and SSA Data
cdc_year_births = calc_counts(us_list_int,0)
ssa_year_births = calc_counts(ssa_list_int,0)
print("CDC Births: " , cdc_year_births)
print("SSA Births: " , ssa_year_births)

combine_dict = {}
for key in cdc_year_births:
    if key in ssa_year_births:
        combine_dict[key] = cdc_year_births[key]+ssa_year_births[key]
    else:
        combine_dict[key] = cdc_year_births[key]

for key in ssa_year_births:
    if key not in combine_dict:
        combine_dict[key] = ssa_year_births[key]
    
        
        
print("Combine Births: ", combine_dict)  

CDC Births:  {1994: 8809, 1995: 7596, 1996: 11924, 1997: 11818, 1998: 11389, 1999: 9335, 2000: 7721, 2001: 10018, 2002: 12339, 2003: 12374}
SSA Births:  {2000: 7892, 2001: 10272, 2002: 12582, 2003: 12540, 2004: 10130, 2005: 8635, 2006: 7569, 2007: 11102, 2008: 12906, 2009: 11667, 2010: 9751, 2011: 8035, 2012: 10634, 2013: 12525, 2014: 11990}
Combine Births:  {1994: 8809, 1995: 7596, 1996: 11924, 1997: 11818, 1998: 11389, 1999: 9335, 2000: 15613, 2001: 20290, 2002: 24921, 2003: 24914, 2004: 10130, 2005: 8635, 2006: 7569, 2007: 11102, 2008: 12906, 2009: 11667, 2010: 9751, 2011: 8035, 2012: 10634, 2013: 12525, 2014: 11990}
