In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Turn off HTML table rendering so DataFrames are shown as plain text
pd.set_option('display.notebook_repr_html', False)
# Use matplotlib's built-in 'ggplot' style sheet for every figure
plt.style.use('ggplot')
# Show the installed pandas version (useful when reproducing the results)
pd.__version__

## Data

In [None]:
# The data set contains the top 1000 baby names in the US
# for each year from 1880 until 2008.
# The path is relative to the notebook's working directory.
names_df = pd.read_csv('data/baby-names2.csv')
# Preview the first rows to check the columns that were loaded
names_df.head()

## Explore the data

In [None]:
# Peek at the final rows recorded for 1969 (tail() shows the last five
# rows of the selection, in the order they appear in the data)
names_df.loc[names_df['year'] == 1969].tail()

In [None]:
# Split the data set into one frame per sex
boys_df = names_df.loc[names_df['sex'] == 'boy']
girls_df = names_df.loc[names_df['sex'] == 'girl']
girls_df.head()

In [None]:
# Count the boy-name rows recorded for each year (showing the first 5 years)
boys_df.groupby('year').size().head()

In [None]:
# Count rows per (year, sex) pair; the result has a hierarchical index,
# so selecting 1969 leaves one count per sex for that year
names_df.groupby(['year', 'sex']).size().loc[1969]

In [None]:
# Look up the 1969 boy name with the largest proportion:
# idxmax() yields the row label, which is then used to fetch the full row
rowid = boys_df.loc[boys_df['year'] == 1969, 'prop'].idxmax()
boys_df.loc[rowid]

In [None]:
# Helper used to find the most popular name within one group of rows
def get_max_proportion(group):
    """Return the row of ``group`` whose ``prop`` value is the largest."""
    top_label = group['prop'].idxmax()
    return group.loc[top_label]
    
# Apply get_max_proportion to every year: one row per year holding the
# name with the highest proportion in that year
popular_boys_df = boys_df.groupby('year').apply(get_max_proportion)
# Preview the last rows of the result
popular_boys_df.tail()

In [None]:
# Area chart of the top boy name's proportion for each year.
# A shrinking area may indicate more diversity in chosen boy names over time.
popular_boys_df['prop'].plot(
    kind='area',
    title='Proportion most popular boy name',
    color='#00A99D',
    alpha=.5,
    legend=True,
)

In [None]:
# Index the boys by (name, year) so that a name can be looked up directly
boys_index_df = boys_df.set_index(['name', 'year'])
boys_index_df.head()

In [None]:
# Narcissistic query: how popular has my first name been?
# Selecting 'Leon' on the (name, year) index leaves the rows for that name.
leon_popularity = boys_index_df.loc['Leon']
leon_popularity['prop'].plot(
    kind='area',
    title='Popularity of the name Leon',
    color='#00A99D',
    alpha=.5,
    legend=True,
)

In [None]:
# My name was the most popular in the year...
# (after selecting 'Leon' the remaining index level is the year,
# so idxmax() returns the year with the highest proportion)
boys_index_df.loc['Leon'].prop.idxmax()

In [None]:
# Highest boy name proportion ever...
# idxmax() on the (name, year) index returns that pair as a tuple
boys_index_df.prop.idxmax()

In [None]:
# Calculate the mean proportion for each girl name in the set,
# averaged over the rows in which the name appears
mean_prop = girls_df.groupby('name')['prop'].mean()

# Show the 5 names with the highest mean proportion
mean_prop.sort_values(ascending=False).head()

In [None]:
# Retrieve summary statistics (as produced by describe()) for each year
# NOTE(review): head(24) presumably targets the first three years on pandas
# versions where describe() yields 8 statistic rows per group — TODO confirm
result = girls_df.groupby('year').describe()
result.head(24)

## Calculating Baby Name Diversity

In [None]:
# Take the girl names of 1972 and rank them from the largest
# proportion down to the smallest
girls_df_1972 = girls_df.loc[girls_df['year'] == 1972]
girls_df_1972_sorted = girls_df_1972.sort_values('prop', ascending=False)
girls_df_1972_sorted.head()


In [None]:
# What share of all girl names given in 1972 is covered by these top names?
# (the running total of the proportions peaks at that share)
girls_df_1972_sorted.prop.cumsum().max()

In [None]:
# Where does the cumulative proportion cross the 50% mark?
girls_df_1972_cumsum = girls_df_1972_sorted.prop.cumsum()

# Series.searchsorted returns a plain scalar for a scalar needle in
# pandas >= 0.24 and a 1-element array before that; np.atleast_1d
# normalises both so that `crossover[0]` keeps working on every version.
crossover = np.atleast_1d(girls_df_1972_cumsum.searchsorted(.5))
print(crossover)

# Show only the cumulative values around the crossover instead of printing
# the whole series; max(..., 0) keeps the window from wrapping around to the
# end of the series when the crossover is one of the first two positions.
girls_df_1972_cumsum.iloc[max(crossover[0] - 2, 0):crossover[0] + 3]

In [None]:
# The crossover can be seen as a measure of diversity:
# it is the zero-based position at which the ranked names reach 50% of all
# the names given (so crossover + 1 names are needed to get there).
# The higher the value, the more varied the name giving.
# np.ravel makes this work whether `crossover` is a scalar (what
# Series.searchsorted returns in pandas >= 0.24) or a 1-element array.
np.ravel(crossover)[0]

In [None]:
# Let's calculate this crossover point for boys and girls per year
def get_crossover(group, quantile):
    """Return the position at which the ranked names reach ``quantile``.

    The rows of ``group`` are ranked by ``prop`` from largest to smallest and
    their proportions are accumulated. The result is the zero-based position
    of the first row whose cumulative proportion is >= ``quantile``.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to rank; must contain a numeric ``prop`` column.
    quantile : float
        Cumulative proportion to look for (e.g. ``.5`` for 50%).

    Returns
    -------
    int
        Zero-based crossover position. Equals ``len(group)`` when the
        cumulative proportion never reaches ``quantile``.
    """
    # sort_values replaces DataFrame.sort_index(by=...), which newer pandas
    # releases no longer accept.
    ranked = group.sort_values(by='prop', ascending=False)
    cumulative = ranked['prop'].cumsum().values
    # np.searchsorted on the raw values always yields a scalar for a scalar
    # needle, whereas Series.searchsorted returns a scalar or a 1-element
    # array depending on the pandas version.
    return int(np.searchsorted(cumulative, quantile))

# Cumulative proportion at which the crossover is measured
q = .5

# Per-year crossover for each sex; rename() sets the series name,
# which the plot legend picks up
girls_crossover = (
    girls_df.groupby('year')
    .apply(get_crossover, quantile=q)
    .rename('girls')
)
boys_crossover = (
    boys_df.groupby('year')
    .apply(get_crossover, quantile=q)
    .rename('boys')
)

# Plot the crossover counts over time as two translucent area charts
girls_crossover.plot(kind='area', color='#00A99D', alpha=.5, legend=True)
boys_crossover.plot(kind='area', color='#F5CA0C', alpha=.5, legend=True,
                    title='Baby name diversity')