 # Project 7: Explanatory Data Analysis & Advanced Visualization (Baby Names Dataset)

## First Inspection: The most popular Names in 2018

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
pd.options.display.float_format = '{:.2f}'.format

In [2]:
df = pd.read_csv("us_baby_names.csv")

In [3]:
df

Unnamed: 0,Year,Name,Gender,Count
0,1880,Mary,F,7065
1,1880,Anna,F,2604
2,1880,Emma,F,2003
3,1880,Elizabeth,F,1939
4,1880,Minnie,F,1746
...,...,...,...,...
1957041,2018,Zylas,M,5
1957042,2018,Zyran,M,5
1957043,2018,Zyrie,M,5
1957044,2018,Zyron,M,5


In [4]:
df.info()

TypeError: Cannot interpret '<attribute 'dtype' of 'numpy.generic' objects>' as a data type

In [None]:
df.nunique()

In [None]:
# Store Gender as a pandas categorical column instead of its parsed dtype.
df["Gender"] = df["Gender"].astype("category")

In [None]:
# Independent copy of the rows whose Year equals 2018.
names_2018 = df.loc[df["Year"] == 2018].copy()
names_2018

In [None]:
names_2018[names_2018.Gender == "F"].nlargest(10, "Count")

In [None]:
def most_pop(year, gender, n):
    """Return the ``n`` rows with the largest ``Count`` for one year and gender.

    Reads the notebook-level DataFrame ``df`` (columns ``Year``, ``Gender`` and
    ``Count`` are used) and does not modify it.

    Parameters
    ----------
    year : value compared for equality against ``df.Year``
    gender : value compared for equality against ``df.Gender``
    n : number of rows passed to ``nlargest``

    Returns
    -------
    pandas.DataFrame
        The matching rows, ordered by descending ``Count``.
    """
    in_year = df["Year"] == year
    of_gender = df["Gender"] == gender
    candidates = df[in_year & of_gender]
    return candidates.nlargest(n, "Count")

In [None]:
most_pop(2018, "F", 10)

In [None]:
most_pop(2018, "M", 10)

## Evergreen Names (1880 - 2018)

In [None]:
f_2018 = most_pop(2018, "F", 20)
f_2018

In [None]:
f_1880 = most_pop(1880, "F", 20)
f_1880

In [None]:
f_2018[["Name", "Count"]].merge(f_1880[["Name", "Count"]], how = "inner", on = "Name")

In [None]:
# Names that occur in both f_2018 and f_1880; the two Count columns are kept
# side by side and told apart by the year suffixes.
f_evergreen = pd.merge(
    f_2018[["Name", "Count"]],
    f_1880[["Name", "Count"]],
    how = "inner",
    on = "Name",
    suffixes = ("_2018", "_1880"),
)
f_evergreen

In [None]:
m_2018 = most_pop(2018, "M", 20)
m_2018

In [None]:
m_1880 = most_pop(1880, "M", 20)
m_1880

In [None]:
# Names that occur in both m_2018 and m_1880; the two Count columns are kept
# side by side and told apart by the year suffixes.
m_evergreen = pd.merge(
    m_2018[["Name", "Count"]],
    m_1880[["Name", "Count"]],
    how = "inner",
    on = "Name",
    suffixes = ("_2018", "_1880"),
)
m_evergreen

## Advanced Data Aggregation

In [None]:
# Re-read the CSV and rebind `df` to the freshly parsed frame. The categorical
# conversion applied to `Gender` earlier in the notebook therefore no longer
# applies to `df` from here on.
df = pd.read_csv("us_baby_names.csv")

In [None]:
df

In [None]:
df.groupby(["Name", "Gender"]).Count.sum().reset_index(level = -1)

In [None]:
# One summary row per (Name, Gender) pair, built with named aggregations.
agg = df.groupby(["Name", "Gender"]).agg(
    Total = ("Count", "sum"),        # sum of Count over all rows of the pair
    No_Year = ("Count", "count"),    # number of non-null Count entries of the pair
    First_Year = ("Year", "min"),    # smallest Year in which the pair occurs
    Last_Year = ("Year", "max"),     # largest Year in which the pair occurs
    Max_Count = ("Count", "max"),    # largest single Count value of the pair
)

In [None]:
agg

In [None]:
def best_year(group):
    """Return the ``Year`` of the row with the largest ``Count`` in ``group``.

    Parameters
    ----------
    group : pandas.DataFrame
        Frame with at least the columns ``Count`` and ``Year``.

    Returns
    -------
    pandas.Series
        One-element ``Year`` Series that keeps the row label of the selected row.
    """
    top_row = group.nlargest(n = 1, columns = "Count")
    return top_row["Year"]

In [None]:
# Year of the highest Count for every (Name, Gender) pair.
# A vectorised idxmax lookup replaces the per-group Python call
# `df.groupby(["Name", "Gender"]).apply(best_year)`, which could take minutes.
# idxmax returns the first row label with the maximum Count, the same row that
# nlargest(1, "Count") picks. The result keeps the (Name, Gender, row label)
# index layout, so a later droplevel(-1) still yields a (Name, Gender) index.
best_y = (
    df.loc[df.groupby(["Name", "Gender"]).Count.idxmax(), ["Name", "Gender", "Year"]]
    .set_index(["Name", "Gender"], append = True)["Year"]
    .reorder_levels([1, 2, 0])
)

In [None]:
best_y

In [None]:
best_y.droplevel(-1)

In [None]:
agg["Best_Year"] = best_y.droplevel(-1)

In [None]:
agg

In [None]:
# Move the index levels of agg back into regular columns.
agg = agg.reset_index()

In [None]:
agg

In [None]:
agg[(agg.Name == "Mary") & (agg.Gender == "M")]

## Most Popular Names of All Time

In [None]:
agg.groupby("Gender").apply(lambda x: x.nlargest(10, "Total"))

In [None]:
all_times = agg.groupby("Gender").apply(lambda x: x.nlargest(10, "Total")).reset_index(drop = True)
all_times

In [None]:
import seaborn as sns

In [None]:
plt.figure(figsize = (15, 10))
sns.barplot(x = "Name", y = "Total", data = all_times, hue = "Gender")
plt.show()

## General Trends over time (1880 - 2018)

In [None]:
pd.options.display.float_format = '{:.0f}'.format

In [None]:
babies_per_year = df.groupby("Year").Count.sum()

In [None]:
babies_per_year

In [None]:
# Bar chart of the yearly totals. The `y` argument was dropped because
# babies_per_year is a Series and has no columns to choose from.
babies_per_year.plot(kind = "bar", figsize = (15, 10), fontsize = 15)
# Label every fifth bar with the year stored in the index instead of a hard-coded
# 1880-2018 range, so the labels stay correct if the data covers other years.
plt.xticks(ticks = range(0, len(babies_per_year), 5), labels = babies_per_year.index[::5])
plt.title("Registered Babies per Year", fontsize = 20)
plt.show()

In [None]:
# Number of distinct names per year. nunique() counts a name that appears for both
# genders in a year once; count() tallied every row and so counted it twice.
diff_names = df.groupby("Year").Name.nunique()
diff_names

In [None]:
diff_names.plot(kind = "bar", figsize = (15, 10), fontsize = 15)
# Label every fifth bar with the year stored in the index instead of a hard-coded
# 1880-2018 range, so the labels stay correct if the data covers other years.
plt.xticks(ticks = range(0, len(diff_names), 5), labels = diff_names.index[::5])
plt.title("Unique Names per Year", fontsize = 20)
plt.show()

## Creating the Features "Popularity" and "Rank"

In [None]:
df

__Popularity (babies per million)__

In [None]:
df.groupby(["Year", "Gender"]).Count.transform("sum")

In [None]:
# Manual spot check of the babies-per-million formula: 7065 is the 1880 Count of
# Mary (F); 90994 is presumably the 1880 total Count for gender F - TODO confirm.
7065 / 90994 * 1000000

In [None]:
df.Count.div(df.groupby(["Year", "Gender"]).Count.transform("sum"))*1000000

In [None]:
# Babies per million: each Count divided by the total Count of its
# (Year, Gender) group, scaled by one million.
df["Popularity"] = (
    df["Count"]
    .div(df.groupby(["Year", "Gender"])["Count"].transform("sum"))
    .mul(1_000_000)
)

In [None]:
df

__Rank__

In [None]:
# Rank of each Count within its (Year, Gender) group, 1 = highest Count.
# GroupBy.rank keeps the original row index, whereas apply() with a lambda prepends
# the group keys to the index on recent pandas versions.
df.groupby(["Year", "Gender"]).Count.rank(ascending = False)

In [None]:
# Rank of each Count within its (Year, Gender) group, 1 = highest Count.
# GroupBy.rank returns a Series on the original row index, so it aligns with df;
# apply() with a lambda prepends the group keys on recent pandas versions and the
# assignment then fails to align.
df["Rank"] = df.groupby(["Year", "Gender"]).Count.rank(ascending = False)

In [None]:
df

In [None]:
df[(df.Year == 2018) & (df.Gender == "M")].tail(60)

In [None]:
df[(df.Year == 2018) & (df.Gender == "M")].Popularity.sum()

## Visualizing Name Trends over Time

In [None]:
df

In [None]:
mary = df[(df.Name == "Mary") & (df.Gender == "F")]
mary.tail(60)

In [None]:
# Popularity as bars on the left y-axis, rank as a red line on a twin right y-axis.
fig, ax1 = plt.subplots(figsize = (12, 8))
ax1.bar(x = mary["Year"], height = mary["Popularity"], label = "Popularity")
ax2 = ax1.twinx()
ax2.plot(mary["Year"], mary["Rank"], color = "red", label = "Rank")

# Title and axis labels.
ax1.set_title("Mary", fontsize = 18)
ax1.set_xlabel("Year", fontsize = 15)
ax1.set_ylabel("Babies per million", fontsize = 15)
ax2.set_ylabel("Rank", fontsize = 15)

# Same tick label size on both axes; one legend per axis in opposite upper corners.
for _ax in (ax1, ax2):
    _ax.tick_params(labelsize = 12)
ax1.legend(loc = 2, fontsize = 15)
ax2.legend(loc = 1, fontsize = 15)
plt.show()

In [None]:
def plot_name(name, gender, crit = "Popularity"):
    """Plot one name's yearly ``crit`` values (bars) together with its yearly rank (red line).

    Reads the notebook-level DataFrame ``df``, which must contain the columns
    ``Name``, ``Gender``, ``Year``, ``Rank`` and the column named by ``crit``.
    Shows the figure via ``plt.show()`` and returns None.

    Parameters
    ----------
    name : value compared for equality against ``df.Name``; also used as the title
    gender : value compared for equality against ``df.Gender``
    crit : str, default "Popularity"
        Column drawn as bars on the left y-axis; also used as that axis' label
        and legend entry.
    """
    # Select only the columns that are drawn. Building the list this way avoids
    # requesting "Rank" twice when crit == "Rank"; a duplicated label would turn
    # data.Rank and data[crit] into two-column frames and break the plot calls.
    columns = ["Year", "Rank"]
    if crit not in columns:
        columns.append(crit)
    data = df.loc[(df.Name == name) & (df.Gender == gender), columns]
    data = data.set_index("Year")

    fig, ax1 = plt.subplots(figsize = (12, 8))
    ax1.bar(x = data.index, height = data[crit], label = crit)
    # Rank gets its own y-axis on the right because its scale differs from crit.
    ax2 = ax1.twinx()
    ax2.plot(data.index, data.Rank, color = "red", label = "Rank")
    ax1.set_xlabel("Year", fontsize = 15)
    ax1.set_ylabel(crit, fontsize = 15)
    ax2.set_ylabel("Rank", fontsize = 15)
    ax1.set_title(name, fontsize = 18)
    ax1.tick_params(labelsize = 12)
    ax2.tick_params(labelsize = 12)
    ax1.legend(loc = 2, fontsize = 15)
    ax2.legend(loc = 1, fontsize = 15)
    plt.show()

In [None]:
plot_name("Mary", "F")

In [None]:
plot_name("Mary", "F", crit = "Count")

In [None]:
plot_name("James", "M", crit = "Popularity")

In [None]:
m_evergreen

In [None]:
for name in m_evergreen.Name:
    plot_name(name, "M", crit = "Popularity")

In [None]:
for name in f_evergreen.Name:
    plot_name(name, "F", crit = "Popularity")

## Why does a Name's Popularity suddenly change? (Part 1)

In [None]:
df

In [None]:
df.loc[(df.Name == "Mary") & (df.Gender == "F"), "Popularity"].shift()

In [None]:
# Popularity of the same (Name, Gender) pair in the directly preceding year.
# shift() alone returns the value from the pair's previous *row*, which can lie
# several years back when the pair is missing from intermediate years. The value is
# therefore kept only when that previous row is exactly one year earlier; otherwise
# the lag stays NaN. Assumes rows are ordered by Year within each pair, as the plain
# shift() already did.
df["Pop_lag1"] = (
    df.groupby(["Name", "Gender"]).Popularity.shift()
    .where(df.groupby(["Name", "Gender"]).Year.shift().eq(df.Year.sub(1)))
)

In [None]:
df

In [None]:
df[df.Name == "Ashanti"]

In [None]:
df.loc[(df.Year > 1880) & (df.Pop_lag1.isna()), "Pop_lag1"] = 0

In [None]:
df[df.Name == "Ashanti"]

In [None]:
df["Pop_diff"] = df.Popularity - df.Pop_lag1

In [None]:
df[(df.Name == "Mary") & (df.Gender == "F")]

In [None]:
df[df.Name == "Ashanti"]

In [None]:
df2 = df[df.Year >= 1900].copy()

In [None]:
df2

In [None]:
df2.nlargest(20, "Pop_diff")

In [None]:
plot_name("Linda", "F")

In [None]:
plot_name("Shirley", "F")

In [None]:
plot_name("Jason", "M")

In [None]:
df2.nsmallest(20, "Pop_diff")

## Why does a Name's Popularity suddenly change? (Part 2)

__'From 0 to 100' Names__

In [None]:
df2[(df2.Pop_lag1 < 5)].nlargest(20, "Pop_diff")

In [None]:
df2[df2.Name == "Nakia"]

In [None]:
plot_name("Nakia", "F")

In [None]:
plot_name("Nakia", "M")

In [None]:
plot_name("Kizzy", "F")

In [None]:
plot_name("Rihanna", "F")

In [None]:
plot_name("Whitney", "F")

In [None]:
plot_name("Shaquille", "M")

__Highest percentage decrease among popular names__

In [None]:
df2

In [None]:
# Percentage change of Popularity relative to Pop_lag1.
df2["%change"] = (df2["Popularity"] / df2["Pop_lag1"] - 1) * 100

In [None]:
df2

In [None]:
df2[df2.Pop_lag1 > 1000].nsmallest(20, "%change")

In [None]:
plot_name("Katina", "F")

In [None]:
plot_name("Iesha", "F")

In [None]:
plot_name("Ashanti", "F")

In [None]:
plot_name("Woodrow", "M")

In [None]:
plot_name("Hillary", "F")

## Persistent vs. Spike-Fade Names

In [None]:
agg

In [None]:
# Share (in percent) of a pair's Total that falls into its single largest Count.
agg["Spike_Score"] = agg["Max_Count"] / agg["Total"] * 100

In [None]:
agg

In [None]:
agg.sort_values(by = "Spike_Score", ascending = True).head(20)

In [None]:
pers = agg.sort_values(by = "Spike_Score", ascending = True).head(20)

In [None]:
pers

In [None]:
l = list(zip(pers.Name, pers.Gender))
l

In [None]:
# One Count plot per (Name, Gender) pair collected in l.
for pers_name, pers_gender in l:
    plot_name(name = pers_name, gender = pers_gender, crit = "Count")

In [None]:
spike = agg[(agg.Total > 1000) & (agg.No_Year > 10)].sort_values(by = "Spike_Score",
                                                                 ascending = False).head(20)
spike

In [None]:
l2 = list(zip(spike.Name, spike.Gender))
l2

In [None]:
# One Count plot per (Name, Gender) pair collected in l2.
for spike_name, spike_gender in l2:
    plot_name(name = spike_name, gender = spike_gender, crit = "Count")

## Most Popular Unisex Names

In [None]:
df

In [None]:
df[df.Name == "Mary"].groupby(["Name", "Gender"]).Count.sum()

In [None]:
df.groupby(["Name", "Gender"]).Count.sum().unstack()

In [None]:
unisex = df.groupby(["Name", "Gender"]).Count.sum().unstack()
unisex

In [None]:
unisex.dropna(inplace = True)

In [None]:
unisex

In [None]:
unisex.min(axis = 1).sort_values(ascending = False)

In [None]:
unisex_names = unisex.min(axis = 1).sort_values(ascending = False).index
unisex_names

In [None]:
unisex.loc[unisex_names].head(20)

In [None]:
unisex = unisex[unisex.min(axis = 1) > 10000]
unisex

In [None]:
abs(unisex.F.div(unisex.M).sub(1)).sort_values()

In [None]:
unisex_names2 = abs(unisex.F.div(unisex.M).sub(1)).sort_values().index
unisex_names2

In [None]:
unisex.loc[unisex_names2].head(20)