In [None]:
!pip install fuzzy

In [None]:
import pandas as pd
import seaborn as sns
import plotnine
sns.set_style('whitegrid')
%pylab inline

df = pd.read_csv("../input/us-baby-names/NationalNames.csv")
df[:5]

# Double Metaphone Sounds-like Algorithm
Designed by Lawrence Phillips in 1990, the original [metaphone algorithm](https://en.wikipedia.org/wiki/Metaphone) does its phonetic matching through complex rules for variations in vowel and consonant sounds. Since then, there have been two updates to the algorithm. Fortunately for us, there is a Python port of the C/C++ code, and it implements the first update of the algorithm, Double Metaphone.

In the following code, we first get the fingerprint (a.k.a. hash code) of all the names in the data. It looks something like:

    Mark -> MRK
    Marc -> MRK
    Marck -> MRK
    Marco -> MRK



In [None]:
import fuzzy

# Every distinct first name that occurs in the dataset.
names = df["Name"].unique()
fingerprint_algo = fuzzy.DMetaphone()

# One fingerprint per name, aligned position-by-position with `names`.
# Only the first element of each encoder result is kept.
list_fingerprint = [fingerprint_algo(name)[0] for name in names]


# Our baby's first name: Cassandra

In [None]:
def get_subset(df, df_fp, names, gender="F"):
    """Return the rows of ``df`` whose name sounds like one of ``names``.

    Parameters
    ----------
    df : pandas.DataFrame
        Name records with at least ``Name`` and ``Gender`` columns.
    df_fp : pandas.DataFrame
        Lookup table with a ``name`` column and its ``fingerprint`` column.
    names : list-like of str
        Seed names; every name sharing a fingerprint with any of them is kept.
    gender : str, default "F"
        Value of the ``Gender`` column to keep.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``df`` (empty when no seed name is found in ``df_fp``).
    """
    # Fingerprints of the seed names, gathered in one vectorised lookup
    # instead of scanning the whole table once per seed name.
    seed_rows = df_fp["name"].isin(names)
    fingerprint_candidates = df_fp.loc[seed_rows, "fingerprint"]

    # All names that share any of those fingerprints.
    name_candidates = df_fp.loc[df_fp["fingerprint"].isin(fingerprint_candidates), "name"]

    return df[df["Name"].isin(name_candidates) & (df["Gender"] == gender)]

In [None]:
# Lookup table pairing each distinct name with its phonetic fingerprint.
df_fp_names = pd.DataFrame({"fingerprint": list_fingerprint, "name": names})

# Female records for every name that sounds like "Cass" or "Cassandra".
df_subset = get_subset(df=df, df_fp=df_fp_names, names=["Cass", "Cassandra"])

## Get the top 5 variants in terms of overall popularity and plot
I discovered a nice library, plotnine, that emulates R's ggplot. I use it here because I want to plot something unique, as you will see below.

In [None]:
from plotnine import ggplot, geom_text, labs, aes, geom_line, scale_y_continuous, theme, element_blank, element_text

# How many of the most popular name variants to keep when no explicit list of names is given.
top_n = 5

def get_ts_global_values(df_subset, top_n, names_to_study=None):
    """Aggregate yearly name counts into per-name 10-year totals.

    Parameters
    ----------
    df_subset : pandas.DataFrame
        Records with at least ``Name``, ``Year`` and ``Count`` columns;
        ``Year`` must be parseable with ``format="%Y"``.
    top_n : int
        How many names to keep, ranked by total ``Count``, when
        ``names_to_study`` is not given.
    names_to_study : sized collection of str, optional
        Names to keep. ``None`` or an empty collection falls back to the
        ``top_n`` names with the highest total ``Count``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Name``, ``Year`` (label of each 10-year bin) and ``Count``
        (sum within the bin), limited to bins labelled after 1900-01-01.
    """
    # `None` replaces the former mutable `[]` default; an explicitly passed
    # empty collection still triggers the top-n fallback as before.
    if names_to_study is None or len(names_to_study) == 0:
        totals = df_subset.groupby("Name")["Count"].sum()
        names_to_study = totals.sort_values()[::-1][:top_n].index.tolist()
    df_top_n_global = df_subset[df_subset["Name"].isin(names_to_study)].copy()

    # format year to be date time
    df_top_n_global["Year"] = pd.to_datetime(df_top_n_global["Year"], format="%Y")

    # group into 10-year bins per name and total the counts
    df_top_n_global = (
        df_top_n_global
        .groupby(["Name", pd.Grouper(key="Year", freq="10YS")])["Count"]
        .sum()
        .reset_index()
    )

    # use 1900s only
    # NOTE(review): the comparison is strict, so a bin labelled exactly
    # 1900-01-01 is dropped as well -- confirm this is intended.
    df_top_n_global = df_top_n_global.query("`Year` > '1900-01-01'")
    return df_top_n_global

In [None]:
# Per-decade totals for the top_n most popular variants in the current subset.
df_top_n_global = get_ts_global_values(df_subset=df_subset, top_n=top_n)

In [None]:
# Linear-scale trend: one line per name, with the name printed at every point
# instead of a legend; horizontal grid lines are hidden.
(
    ggplot(df_top_n_global, aes(x="Year", y="Count", colour="Name"))
    + geom_text(aes(label="Name"), show_legend=False)
    + geom_line()
    + labs(y='Number of babies', title='Cass: 1900\'s and beyond')
    + theme(
        panel_grid_minor_y=element_blank(),
        panel_grid_major_y=element_blank(),
    )
)

In [None]:
# Same trend on a log10 y-axis so the less common variants remain readable.
(
    ggplot(df_top_n_global, aes(x="Year", y="Count", colour="Name"))
    + geom_text(aes(label="Name"), show_legend=False)
    + geom_line()
    + labs(y='Number of babies', title='Cass: 1900\'s and beyond')
    + scale_y_continuous(trans='log10')
    + theme(
        panel_grid_minor_y=element_blank(),
        panel_grid_major_y=element_blank(),
    )
)

## Get the fastest growing ones since 1980
- Fit a simple linear least-squares regression of count against year for each name, then look for the names with the largest slopes.

In [None]:
from scipy.stats import linregress
# Records strictly after 1980.
df_subset_1980 = df_subset.loc[df_subset["Year"] > 1980]

# One least-squares fit of Count against Year per name; the five values of each
# fit become the columns named below.
df_linregress = df_subset_1980.groupby(["Name"]).apply(
    lambda grp: pd.Series(linregress(grp["Year"], grp["Count"]))
)
df_linregress.columns = ["slope", "intercept", "rvalue", "pvalue", "stderr"]

# Attach each name's total Count over the same years, drop fits containing NaN,
# rank by slope (steepest growth first) and keep only fits with rvalue >= 0.5.
df_linregress = (
    df_linregress
    .join(df_subset_1980.groupby("Name")["Count"].sum())
    .dropna()
    .sort_values(by="slope", ascending=False)
    .loc[lambda fits: fits["rvalue"] >= 0.5]
)
df_linregress.head(10)

In [None]:
# Names from the regression table with a total Count above 1000,
# plus the base names Cassandra and Cassie.
names_to_study = set(
    df_linregress.loc[df_linregress["Count"] > 1000].index
) | {"Cassandra", "Cassie"}

# Per-decade totals for those names, keeping only the bins labelled after 1960-01-01.
df_top_n_global = get_ts_global_values(df_subset, top_n, names_to_study=names_to_study)
df_top_n_global = df_top_n_global.loc[lambda bins: bins["Year"] > "1960-01-01"]

In [None]:
# Log-scale trend of the selected names; x tick labels are rotated 30 degrees
# and the minor vertical grid lines are hidden as well.
(
    ggplot(df_top_n_global, aes(x="Year", y="Count", colour="Name"))
    + geom_text(aes(label="Name"), show_legend=False)
    + geom_line()
    + labs(y='Number of babies', title='Cass and it\'s newest most popular variants')
    + scale_y_continuous(trans='log10')
    + theme(
        panel_grid_minor_y=element_blank(),
        panel_grid_major_y=element_blank(),
        panel_grid_minor_x=element_blank(),
        axis_text_x=element_text(angle=30, hjust=1),
    )
)

# Our baby's second name: Zoe
Same story!

In [None]:
# Repeat the selection for names that sound like "Zoe".
top_n = 5
df_subset = get_subset(df=df, df_fp=df_fp_names, names=["Zoe"])
df_top_n_global = get_ts_global_values(df_subset=df_subset, top_n=top_n)

In [None]:
# Linear-scale trend: one line per name, with the name printed at every point
# instead of a legend; horizontal grid lines are hidden.
(
    ggplot(df_top_n_global, aes(x="Year", y="Count", colour="Name"))
    + geom_text(aes(label="Name"), show_legend=False)
    + geom_line()
    + labs(y='Number of babies', title='Zoe: 1900\'s and beyond')
    + theme(
        panel_grid_minor_y=element_blank(),
        panel_grid_major_y=element_blank(),
    )
)

In [None]:
# Same trend on a log10 y-axis so the less common variants remain readable.
(
    ggplot(df_top_n_global, aes(x="Year", y="Count", colour="Name"))
    + geom_text(aes(label="Name"), show_legend=False)
    + geom_line()
    + labs(y='Number of babies', title='Zoe: 1900\'s and beyond')
    + scale_y_continuous(trans='log10')
    + theme(
        panel_grid_minor_y=element_blank(),
        panel_grid_major_y=element_blank(),
    )
)

In [None]:
from scipy.stats import linregress
# Records strictly after 1980.
df_subset_1980 = df_subset.loc[df_subset["Year"] > 1980]

# One least-squares fit of Count against Year per name; the five values of each
# fit become the columns named below.
df_linregress = df_subset_1980.groupby(["Name"]).apply(
    lambda grp: pd.Series(linregress(grp["Year"], grp["Count"]))
)
df_linregress.columns = ["slope", "intercept", "rvalue", "pvalue", "stderr"]

# Attach each name's total Count over the same years, drop fits containing NaN,
# rank by slope (steepest growth first) and keep only fits with rvalue >= 0.5.
df_linregress = (
    df_linregress
    .join(df_subset_1980.groupby("Name")["Count"].sum())
    .dropna()
    .sort_values(by="slope", ascending=False)
    .loc[lambda fits: fits["rvalue"] >= 0.5]
)
df_linregress.head(10)

In [None]:
# Names from the regression table with a total Count above 1000, plus the base name Zoe.
names_to_study = set(
    df_linregress.loc[df_linregress["Count"] > 1000].index
) | {"Zoe"}

# Per-decade totals for those names, keeping only the bins labelled after 1960-01-01.
df_top_n_global = get_ts_global_values(df_subset, top_n, names_to_study=names_to_study)
df_top_n_global = df_top_n_global.loc[lambda bins: bins["Year"] > "1960-01-01"]

In [None]:
# Log-scale trend of the selected names; x tick labels are rotated 30 degrees
# and the minor vertical grid lines are hidden as well.
(
    ggplot(df_top_n_global, aes(x="Year", y="Count", colour="Name"))
    + geom_text(aes(label="Name"), show_legend=False)
    + geom_line()
    + labs(y='Number of babies', title='Zoe and it\'s newest most popular variants')
    + scale_y_continuous(trans='log10')
    + theme(
        panel_grid_minor_y=element_blank(),
        panel_grid_major_y=element_blank(),
        panel_grid_minor_x=element_blank(),
        axis_text_x=element_text(angle=30, hjust=1),
    )
)