In [1]:
import os
import pandas as pd
import urllib.request
import zipfile

In [2]:
# Directory that holds the downloaded archive, the extracted state files and the
# preserved dataframes. Resolve it to an absolute path once, up front, so the
# location stays valid even if the working directory changes later in the session.
data_path = os.path.abspath("raw_data/state")

# exist_ok=True creates the directory (and any missing parents) only when needed,
# without a separate check-then-create step.
os.makedirs(data_path, exist_ok=True)

In [3]:
# Download the names-by-state archive from the U.S. Social Security Administration
# website and unpack it into data_path. Each step is skipped when its result is
# already on disk.
#
# Files are addressed with explicit paths instead of os.chdir(): data_path is two
# directories deep, so returning with a single "../" would leave the kernel in
# "raw_data" and break every later path that is relative to the notebook directory.

ssa_url = 'https://www.ssa.gov/oact/babynames/state/namesbystate.zip'
zip_path = os.path.join(data_path, 'namesbystate.zip')

if not os.path.isfile(zip_path):
    print("Downloading.")
    # Fetch to a temporary name and rename on success, so an interrupted download
    # is never mistaken for a complete archive on the next run.
    partial_path = zip_path + '.part'
    urllib.request.urlretrieve(ssa_url, partial_path)
    os.replace(partial_path, zip_path)
else:
    print("Data already downloaded.")

# The first and last state files stand in for "everything was extracted".
# The comparison is case-insensitive.
# NOTE(review): the archive may store its members as upper-case ".TXT" -- confirm
# against the actual zip listing.
present = {entry.lower() for entry in os.listdir(data_path)}
if not {"ak.txt", "wy.txt"} <= present:
    print("Extracting.")
    with zipfile.ZipFile(zip_path) as zf:
        zf.extractall(data_path)
else:
    print("Data already extracted.")

Downloading.
Extracting.


In [None]:
# Create pandas dataframes from the U.S. Social Security baby-names-by-state files
# and preserve them (as pickles inside data_path) for later use in other notebooks.
#
# Three dataframes are produced:
#   yob   - one row per state / sex / year / name, with added 'pct' and 'ranked'
#   names - one row per unique (name, sex): year span and pct summaries
#   years - one row per year: birth totals, new names, unique names, sex ratio

redo_dataframes = False  # set True to rebuild even when the preserved files exist

# Use explicit paths instead of os.chdir(): data_path is two directories deep, so
# stepping back with a single "../" would leave the kernel in the wrong directory.
yob_pickle = os.path.join(data_path, "yob.preserve")
names_pickle = os.path.join(data_path, "names.preserve")
years_pickle = os.path.join(data_path, "years.preserve")

# Rebuild when asked to, or when any of the three files written below is missing.
# The names checked here must match the names written, otherwise the cache is
# never considered valid.
preserved_files = (yob_pickle, names_pickle, years_pickle)

if redo_dataframes or not all(os.path.isfile(p) for p in preserved_files):

    print("Processing.")

    # Read the individual state files (AK.txt ... WY.txt) and assemble them into a
    # single dataframe. The files are discovered on disk rather than hard-coded:
    # any two-letter "<XX>.txt" entry counts, matched case-insensitively.
    state_files = sorted(
        entry for entry in os.listdir(data_path)
        if len(entry) == 6 and entry[:2].isalpha() and entry.lower().endswith(".txt")
    )
    if not state_files:
        raise FileNotFoundError("No state files (e.g. AK.txt) found in " + data_path)

    # The state code is read from the first column of each file, so nothing needs
    # to be derived from the file name.
    part_columns = ['state', 'sex', 'year', 'name', 'births']
    parts = [
        pd.read_csv(os.path.join(data_path, entry), names=part_columns)
        for entry in state_files
    ]
    yob = pd.concat(parts, ignore_index=True)

    # Add column 'pct': the number of births in a row divided by the total number
    # of births of that sex in that year, multiplied by 100 to turn it into a
    # percentage and reduce leading zeroes. transform() keeps the original row
    # index, which groupby().apply() does not guarantee across pandas versions.
    # NOTE(review): totals and ranks are taken per (year, sex) across all states,
    # as the original grouping did; add 'state' to the keys if per-state figures
    # are what is wanted.
    births_per_year_sex = yob.groupby(['year', 'sex'])['births'].transform('sum')
    yob['pct'] = yob['births'].astype(float) / births_per_year_sex * 100

    # Add the rank of each row within its year and sex (1 = most births).
    yob['ranked'] = yob.groupby(['year', 'sex'])['births'].rank(ascending=False)
    yob.to_pickle(yob_pickle)

    # names dataframe: discards individual birth or pct values, and instead
    # collects data on unique names. One row per unique combination of name and sex.
    # The per-state rows are first collapsed to one row per (name, sex, year), so
    # that year_count counts years (not state rows) and pct_sum / pct_max refer to
    # whole-year percentages.
    per_year = yob.groupby(['name', 'sex', 'year'], as_index=False)['pct'].sum()
    by_name_sex = per_year.groupby(['name', 'sex'])
    names = pd.DataFrame({
        'year_count': by_name_sex['year'].count(),
        'year_min': by_name_sex['year'].min(),
        'year_max': by_name_sex['year'].max(),
        'pct_sum': by_name_sex['pct'].sum(),
        'pct_max': by_name_sex['pct'].max(),
    }).reset_index()
    names = names[['name', 'sex', 'year_count', 'year_min', 'year_max', 'pct_sum', 'pct_max']]
    # Same layout as before: all 'F' rows first, then 'M', longest-lived names on top.
    names = names.sort_values(
        ['sex', 'year_count', 'name'], ascending=[True, False, True]
    ).reset_index(drop=True)
    names.to_pickle(names_pickle)

    # years dataframe: discards individual name data, aggregating by year.
    births_by_sex = yob.pivot_table('births', index='year', columns='sex', aggfunc='sum')
    total = pd.DataFrame({
        'year': births_by_sex.index,
        'births_f': births_by_sex['F'].values,
        'births_m': births_by_sex['M'].values,
    })
    total['births_t'] = total.births_f + total.births_m

    # Number of (name, sex) combinations whose first recorded year is each year.
    newnames = (
        names.groupby('year_min').size()
        .rename('new_names').rename_axis('year').reset_index()
    )

    # Distinct names per year, for every year present in the data. No hard-coded
    # year range, so newly published years are picked up automatically.
    uniquenames = (
        yob.groupby('year')['name'].nunique()
        .rename('unique_names').reset_index()
    )

    years = total.merge(newnames, on='year').merge(uniquenames, on='year')
    years['sexratio'] = 100.0 * years.births_m / years.births_f
    years.to_pickle(years_pickle)

else:

    print("Reading from preserve.")
    # These pickles are produced by this notebook; do not point this at files
    # from an untrusted source, since unpickling can execute arbitrary code.
    yob = pd.read_pickle(yob_pickle)
    names = pd.read_pickle(names_pickle)
    years = pd.read_pickle(years_pickle)