# Frequency of Baby Names US


## Check out data and paths 

In [None]:
import pandas as pd
import numpy as np
        

In [None]:
# Check out one particular year / file (1880) before loading all of them
path1880 = '../datasets/babynames/yob1880.txt'

In [None]:
# Read the 1880 file, supplying the column labels explicitly via `names`.
names1880 = pd.read_csv(
    path1880,
    names=['names', 'sex', 'births'],
)
names1880

In [None]:
# Total number of births in the 1880 file, one sum per value of 'sex'.
names1880.groupby('sex')['births'].sum()

## Concatenate all files into a DataFrame

In [None]:
# Load every yearly file into its own frame and tag the rows with the year,
# collecting the frames in `blobs` so they can be combined afterwards.
columns = ['names', 'sex', 'births']
years = range(1880, 2011)  # 1880..2010; range() stops before 2011
blobs = []

for year in years:
    path = '../datasets/babynames/yob%d.txt' % year
    # assign() returns the freshly read frame with the extra 'year' column.
    frame = pd.read_csv(path, names=columns).assign(year=year)
    blobs.append(frame)

In [None]:
#print('LAST FRAME:' '\n', frame)
#print('ALL BLOBS:' '\n', blobs)

In [None]:
# Stack the per-year frames into one DataFrame; ignore_index=True renumbers
# the rows instead of keeping each file's own row labels.
names = pd.concat(blobs, ignore_index=True)

In [None]:
# Display the combined frame built from all yearly files.
names


## Explore the data

In [None]:
# What's the total number of births per year, split by sex?
# Sum the 'births' column into a year x sex table. Only 'births' is
# aggregated: 'sex' is already the column key and 'names' holds text, so
# neither belongs in `values`; indexing by 'year' alone gives one row per year.
total_births = names.pivot_table('births', index='year', columns='sex', aggfunc='sum')
total_births[:10]

#mean_ratings = pd.pivot_table(data, values=['rating'], index=['title'], columns='gender', aggfunc={'rating': np.mean}

In [None]:
# NOTE: the plot may only render the second time this cell is run --
# possibly a problem with the Jupyter server.
total_births.plot(title='Total births by sex and year')

## Extending the data using 'group operations'

In [None]:
# Insert column 'prop' with the fraction of babies given each name,
# e.g. prop == 0.02 means 2 out of 100 babies in the group have that name.
def add_prop(group: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``group`` with a ``prop`` column added.

    ``prop`` is each row's ``births`` divided by the sum of ``births`` over
    the whole group.

    Args:
        group: DataFrame with a numeric ``births`` column.

    Returns:
        A new DataFrame with the extra ``prop`` column; ``group`` itself is
        left unmodified.
    """
    # Cast to float so the ratio is never floored by integer division.
    births = group.births.astype(float)
    # assign() builds a new frame instead of writing into the object that
    # the caller (e.g. groupby.apply) passed in.
    return group.assign(prop=births / births.sum())

# Then add this column to the dataset, computing it per (year, sex) group.
# group_keys=False keeps the original row index instead of prepending
# (year, sex) index levels; otherwise 'year' and 'sex' would exist both as
# index levels and as columns, and a later groupby(['year', 'sex']) on the
# result would be ambiguous.
names = names.groupby(['year', 'sex'], group_keys=False).apply(add_prop)


In [None]:
# Sanity check: the 'prop' values of every (year, sex) group should sum to 1
# (compared with a floating-point tolerance).
np.allclose(
    names.groupby(['year', 'sex'])['prop'].sum(),
    1,
)

In [None]:
# Display the frame again to inspect it after the groupby/apply step.
names


In [None]:
# Get the top 1000 names for each sex/year combination
def get_top1000(group: pd.DataFrame) -> pd.DataFrame:
    """Return the rows of ``group`` with the most births, at most 1000.

    Args:
        group: DataFrame with a sortable ``births`` column.

    Returns:
        The rows of ``group`` ordered by ``births`` descending, truncated to
        the first 1000; all rows are returned when there are fewer.
    """
    # sort_values replaces the removed ``sort_index(by=...)`` spelling.
    return group.sort_values(by='births', ascending=False)[:1000]

# Apply get_top1000 to every (year, sex) group.
# group_keys=False stops the group keys from being prepended as index
# levels; 'year' and 'sex' are already columns, and having them in the index
# as well makes later operations keyed on those labels ambiguous.
grouped = names.groupby(['year', 'sex'], group_keys=False)
top1000 = grouped.apply(get_top1000)

In [None]:
# Display the per-group top-1000 selection.
top1000

## Analyzing naming trends

In [None]:
# Partition top1000 on its 'sex' column: rows marked 'M' go to `boys`,
# rows marked 'F' go to `girls`.
boys = top1000.loc[top1000['sex'] == 'M']
girls = top1000.loc[top1000['sex'] == 'F']

In [None]:
# Checking: re-display total_births to inspect its row/column layout.
total_births

In [None]:
# Create a subset DataFrame with the names of interest.
# total_births has 'sex' in its column labels, so it has no per-name columns
# to select from; build a year x name table of summed births from top1000
# instead. reset_index(drop=True) discards whatever index groupby.apply left
# behind so that 'year' is looked up only as a column.
births_by_name = top1000.reset_index(drop=True).pivot_table(
    'births', index='year', columns='names', aggfunc='sum')
subset = births_by_name[['John', 'Harry', 'Mary', 'Marilyn']]
subset.plot(subplots=True, figsize=(12, 10), grid=False, title="Number of births per year")