In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found.
for folder, _subfolders, leaf_names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, leaf) for leaf in leaf_names)
    for full_path in full_paths:
        print(full_path)

# Any results you write to the current directory are saved as output.

In [None]:
data = pd.read_csv("/kaggle/input/us-baby-names/NationalNames.csv", delimiter=",",
               index_col='Id')
data.head()

In [None]:
data.isnull().sum()

# Interesting Trends in US baby names:
## 1. Popularity
- Top 5 popular names in 1880, 1950, 2010
- Names that were not popular in the early 1900s but have become popular now.
- Names that were popular in the early 1900s but have become obsolete now.

## 2. Patterns
- Length of names
- Distribution of last letter among boys and girls
- Distribution of the initial letter.

## 3. Usage of names
Names like Alexis, Ashley, Monroe, Shannon, Harper, and Paris used to be boys' names in the early 1900s.
- Find names that are mostly used for boys in early 1900s, but now mostly used for girls.
- Find names that are mostly used for girls in early 1900s, but now mostly used for boys.
- Find out which year the gender percentage of these names went across 50%.

## 4. Diversity of names
- What is the total percentage of the top 20 most popular names?
- What is the total number of names used per year?
- Do boys' names have more diversity, or girls'?

In [None]:
# Split the data frame according to years
group_byyear = data.groupby('Year')

In [None]:
total_num = group_byyear.sum()

In [None]:
total_num.plot()

In [None]:
# This does not work since each year's total is different
# data['Percentage'] = data['Count'] / # the total number of names of that year

In [None]:
# Define a function that creates a Percentage column for each subgroup
def add_percentage(group):
    """
    Return a copy of ``group`` with a 'Percentage' column added.

    'Percentage' is each row's 'Count' divided by the sum of 'Count' over the
    whole group, i.e. a fraction of the group total rather than a value out
    of 100.

    The frame passed in is left untouched: pandas does not support functions
    that mutate the object handed to them by ``groupby(...).apply``.
    """
    total = group['Count'].sum()  # the denominator is the group's total number of babies
    return group.assign(Percentage=group['Count'] / total)
# Add each row's share of its year's total. A grouped transform broadcasts the
# yearly 'Count' total back onto the original rows, so the result keeps the
# same index and still has 'Year' as an ordinary column. Running
# add_percentage through groupby.apply gives the same numbers, but newer
# pandas versions prepend the group key to the index (and may drop the
# grouping column), which breaks the later cells that group or filter on 'Year'.
data_byyear = data.groupby('Year')
data = data.assign(Percentage=data['Count'] / data_byyear['Count'].transform('sum'))
data.head()

In [None]:
# Every record for the name "Mary" (all years, both genders), to follow its share over time
data_mary = data.loc[data['Name'].eq('Mary')]
data_mary

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# Share of girls named Mary, year by year
subdata = data_mary.loc[data_mary['Gender'].eq('F')]
plt.plot(subdata.Year, subdata.Percentage)

In [None]:
index = (data['Name'] == "Emma") & (data['Gender'] == "F")
data_emma = data[index]
data_emma.head()

In [None]:
plt.plot(data_emma['Year'], data_emma['Percentage'], 'b.')

## I. Popularity

In [None]:
# 1. The top-5 popular names in 1880, 1950, 2010
# Boys of 1880, ordered from the largest share to the smallest
data1880boy = (
    data.loc[data['Year'].eq(1880) & data['Gender'].eq("M")]
    .sort_values(by='Percentage', ascending=False)
)
data1880boy.iloc[:5]['Name'].values

In [None]:
def get_popular_names(data, year, gender, top_n=20):
    """
    Return the most popular names for one gender in one year.

    Parameters
    ----------
    data : DataFrame with 'Name', 'Year', 'Gender' and 'Percentage' columns.
    year : value matched against the 'Year' column.
    gender : value matched against the 'Gender' column.
    top_n : how many names to return; defaults to 20.

    Returns
    -------
    Array of at most ``top_n`` names, ordered from the highest to the lowest
    'Percentage'. Empty when no row matches ``year`` and ``gender``.

    Raises
    ------
    ValueError
        If ``top_n`` is negative (a negative slice bound would silently
        return "all but the last n" rows instead of a top list).
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")
    subdata = data[(data['Year'] == year) & (data['Gender'] == gender)]
    subdata = subdata.sort_values(by='Percentage', ascending=False)
    return subdata[:top_n]['Name'].values

In [None]:
# Print the most popular boy and girl names for a few sample years
years = [1880, 1950, 1998, 2010]
for year in years:
    popular_boynames = get_popular_names(data, year, gender='M')
    popular_girlnames = get_popular_names(data, year, gender='F')
    print(year, popular_boynames, popular_girlnames, sep='\n')

In [None]:
# Find names that used be popular in 1900, but no longer popular in 2010

# Find top 20 names that are popular in 1900
top20_boynames_1900 = get_popular_names(data, 1900, 'M')
print(top20_boynames_1900)

In [None]:
boyname_percentage_1900 = data[(data['Year'] == 1900) & 
                               (data['Gender'] == 'M') & 
                               (data['Name'].isin(top20_boynames_1900))]
boyname_percentage_1900

In [None]:
boyname_percentage_2010 = data[(data['Year'] == 2010) & 
                               (data['Gender'] == 'M') & 
                               (data['Name'].isin(top20_boynames_1900))]
boyname_percentage_2010

In [None]:
# merge two set of percentages
col_left = ["Name", "Gender", "Percentage"]
col_right = ["Name", "Percentage"]
boyname_percentage = pd.merge(boyname_percentage_1900[col_left],
                              boyname_percentage_2010[col_right],
                             on="Name",
                              suffixes=["1900", "2010"]
                         )
boyname_percentage.head()

In [None]:
# Calculate the drop in popularity
boyname_percentage["Difference"] = boyname_percentage["Percentage1900"] - boyname_percentage['Percentage2010']
boyname_percentage.sort_values(by='Difference', ascending=False, inplace=True)
boyname_percentage

In [None]:
# Find dramatic decreases in girl name popularity between 1900 and 2010
top20_girlnames_1900 = get_popular_names(data, 1900, 'F')

# Rows for those names (girls only) in 1900 and in 2010
girlname_percentage_1900 = data[(data['Year'] == 1900) &
                                (data['Gender'] == 'F') &
                                (data['Name'].isin(top20_girlnames_1900))]
girlname_percentage_2010 = data[(data['Year'] == 2010) &
                                (data['Gender'] == 'F') &
                                (data['Name'].isin(top20_girlnames_1900))]

# Put the 1900 and 2010 shares of each name side by side
col_left = ["Name", "Gender", "Percentage"]
col_right = ["Name", "Percentage"]
girlname_percentage = pd.merge(girlname_percentage_1900[col_left],
                               girlname_percentage_2010[col_right],
                               on="Name",
                               suffixes=["1900", "2010"])

# Drop in share from 1900 to 2010 (positive = less popular in 2010), biggest drop first
girlname_percentage["Difference"] = (girlname_percentage["Percentage1900"]
                                     - girlname_percentage['Percentage2010'])
girlname_percentage = girlname_percentage.sort_values(by='Difference', ascending=False)
girlname_percentage.head()

In [None]:
# Reciprocal of 0.000776, i.e. "one in how many".
# NOTE(review): 0.000776 is presumably a Percentage value read off one of the tables above — confirm which one.
1 / 0.000776

## II. Patterns

In [None]:
# distribution in length
data["NameLength"] = data['Name'].apply(len)
data.head()

In [None]:
# split the data according to year and name length
frequency_bylength = data.groupby(["Year", "NameLength"])['Percentage'].sum().unstack()

In [None]:
data1880length2 = data[(data['Year'] == 1880) & (data['NameLength'] == 11)]
data1880length2

In [None]:
frequency_bylength.head()

In [None]:
years = [1880, 1950, 2010]
frequency_bylength[frequency_bylength.index.isin(years)].T.plot.bar(figsize=(15, 5))

In 2010, names tend to be longer than they were in 1900.

In [None]:
# The longest names people ever gave.
data.sort_values(by="NameLength", ascending=False).head(10)

In [None]:
# Distribution of last letter for boys
data['LastLetter'] = data['Name'].apply(lambda x: x[-1])
data.head()

In [None]:
subdata = data[data['Gender'] == 'M']
subdata = subdata.groupby(["Year", "LastLetter"])['Percentage'].sum().unstack()
subdata = subdata[subdata.index.isin(years)]
# subdata = subdata.transpose()
subdata = subdata.T
subdata.head()

In [None]:
subdata.plot.bar(figsize=(15, 5))

In [None]:
# For each year and each last letter, find the total percentage of girl names in that year with that last letter.
data_girl = data[data['Gender'] == "F"]
data_girl.head()

In [None]:
groups = data_girl.groupby(['Year', 'LastLetter'])

In [None]:
# Find the total percentage for each group
popularity_by_letter = groups['Percentage'].sum()
popularity_by_letter

In [None]:
# Flatten the result
popularity_by_letter = popularity_by_letter.unstack(level=0)

In [None]:
# Extract the three typical years
years

In [None]:
popularity_by_letter.columns

In [None]:
popularity_by_letter = popularity_by_letter[years]
popularity_by_letter.head()

In [None]:
popularity_by_letter.plot.bar(figsize=(15, 5))

In [None]:
# plot 2010 boy distribution with 2010 girl distribution
boy2010 = subdata[[2010]]
boy2010.head()

In [None]:
girl2010 = popularity_by_letter[[2010]]
girl2010.head()

In [None]:
popularity2010 = pd.merge(boy2010, girl2010, left_index=True, right_index=True,
                          suffixes=['boy', 'girl'])
popularity2010.head()

In [None]:
popularity2010.sum()

In [None]:
popularity2010.plot.bar(figsize=(15, 5))

## III. Gender of names

In [None]:
# One row per (name, year) with the boys' and the girls' share side by side;
# the outer join keeps names that appear for only one gender in a year.
data_boy = data.loc[data['Gender'].eq('M')]
data_girl = data.loc[data['Gender'].eq('F')]
cols = ["Name", "Year", "Percentage"]
data_merged = data_boy[cols].merge(
    data_girl[cols],
    on=['Name', 'Year'],
    suffixes=['Boy', 'Girl'],
    how="outer",
)
data_merged.head()

In [None]:
# To identify gender tendency, compute the boys' share of a name's combined
# boy + girl frequency.
# After the outer merge, a name recorded for only one gender in a year has NaN
# on the other side. Treat that missing share as 0 so the index comes out as
# exactly 1 (boys only) or 0 (girls only) instead of NaN.
boy_share = data_merged['PercentageBoy'].fillna(0)
girl_share = data_merged['PercentageGirl'].fillna(0)
data_merged['BoyIndex'] = boy_share / (boy_share + girl_share)
data_merged.sort_values(by='BoyIndex').head()
# If a name is mainly used by boys, its BoyIndex should be close to 1.
# If a name is mainly used by girls, its BoyIndex should be close to 0.

In [None]:
# Find the change in ratio 1880 vs. 2010
ratio1880 = data_merged[data_merged['Year'] == 1880]
ratio2010 = data_merged[data_merged['Year'] == 2010]
ratio1880.head()

In [None]:
cols = ["Name", "BoyIndex"]
ratio18802010 = pd.merge(ratio1880[cols], ratio2010[cols], on="Name",
                         suffixes=["1880", "2010"], how="outer")
ratio18802010.head()

In [None]:
ratio18802010.shape

In [None]:
# If a name is predominately boy in 1880 and predonimately girl in 2010, its 1880 ratio should 
# large and its 2010 ratio should be small
index = (ratio18802010['BoyIndex1880'] > 0.8 ) & (ratio18802010['BoyIndex2010'] < 0.2)
boy_to_girl_names = ratio18802010[index]
boy_to_girl_names

In [None]:
ratio18802010[ratio18802010["Name"] == "Ashley"]

In [None]:
# This is not a perfect approach: the filter above silently skips any name whose BoyIndex is NaN in 1880 or in 2010 (comparisons with NaN are False), e.g. a name with no record in one of those years.

## IV. Diversity of names

In [None]:
# In 1880, how many different boy's name are used?
len(data[(data["Gender"] == "M") & (data['Year'] == 1880)]["Name"].unique())

In [None]:
num_names = data.groupby(["Gender", "Year"]).size().unstack(level=0)
num_names.head()

In [None]:
num_names.plot.bar(figsize=(15, 5))