# SSA Popular Baby Names

### Import Python tools and Jupyter configuration

In [1]:
%load_ext lab_black

In [2]:
import pandas as pd
import altair as alt
import datetime as dt
import glob
import os

In [3]:
pd.options.display.max_columns = 100
pd.options.display.max_rows = 1000
alt.data_transformers.disable_max_rows()
pd.options.display.max_colwidth = None

In [4]:
today = dt.datetime.today().strftime("%Y-%m-%d")

---

### Download latest batch of names data from SSA

In [5]:
!curl -s 'https://www.ssa.gov/oact/babynames/names.zip' --output data/raw/names.zip

In [6]:
!unzip -o -q 'data/raw/names.zip' -d 'data/raw/years'

In [7]:
!cat data/raw/years/*.txt > 'data/raw/all.txt'

---

In [8]:
path = "data/raw/years"
all_files = glob.glob(os.path.join(path, "*.txt"))

df_from_each_file = (
    pd.read_csv(
        f,
        encoding="ISO-8859-1",
        header=None,
        sep=",",
        low_memory=False,
        names=["name", "sex", "count"],
    ).assign(year=f)
    for f in all_files
)
names = pd.concat(df_from_each_file, ignore_index=True)

In [9]:
names["year"] = (
    names["year"]
    .str.replace("data/raw/years/yob", "", regex=True)
    .str.replace(".txt", "", regex=True)
)

In [10]:
names.head(15)

Unnamed: 0,name,sex,count,year
0,Emily,F,25957,2000
1,Hannah,F,23085,2000
2,Madison,F,19968,2000
3,Ashley,F,17997,2000
4,Sarah,F,17708,2000
5,Alexis,F,17631,2000
6,Samantha,F,17265,2000
7,Jessica,F,15710,2000
8,Elizabeth,F,15112,2000
9,Taylor,F,15079,2000


### Limit names since 1950

In [11]:
names[names["year"].astype(int) >= 1950].tail()

Unnamed: 0,name,sex,count,year
2052776,Zyheem,M,5,2019
2052777,Zykel,M,5,2019
2052778,Zyking,M,5,2019
2052779,Zyn,M,5,2019
2052780,Zyran,M,5,2019


---

### Get birth totals for normalization

In [12]:
url = "https://www.ssa.gov/oact/babynames/numberUSbirths.html"

In [13]:
births = pd.read_html(url)[0]

In [14]:
births.rename(
    columns={
        "Year of birth": "year",
        "Male": "male",
        "Female": "female",
        "Total": "total",
    },
    inplace=True,
)

In [15]:
births.year = births.year.astype(str)

In [16]:
births.head(15)

Unnamed: 0,year,male,female,total
0,1880,118399,97606,216005
1,1881,108279,98855,207134
2,1882,122031,115694,237725
3,1883,112475,120059,232534
4,1884,122738,137585,260323
5,1885,115945,141947,257892
6,1886,119041,153734,272775
7,1887,109311,155423,264734
8,1888,129906,189445,319351
9,1889,119032,189218,308250


--- 

### Merge the dataframes together

In [17]:
df = pd.merge(names, births, on="year")

In [18]:
df.head()

Unnamed: 0,name,sex,count,year,male,female,total
0,Emily,F,25957,2000,2088111,1995520,4083631
1,Hannah,F,23085,2000,2088111,1995520,4083631
2,Madison,F,19968,2000,2088111,1995520,4083631
3,Ashley,F,17997,2000,2088111,1995520,4083631
4,Sarah,F,17708,2000,2088111,1995520,4083631


### Calculate rate

In [19]:
df["prop"] = df["count"] * 1.0 / df["total"]
df["fprop"] = df[df["sex"] == "F"]["count"] / df[df["sex"] == "F"]["female"]
df["mprop"] = df[df["sex"] == "M"]["count"] / df[df["sex"] == "M"]["male"]

In [20]:
df.head()

Unnamed: 0,name,sex,count,year,male,female,total,prop,fprop,mprop
0,Emily,F,25957,2000,2088111,1995520,4083631,0.006356,0.013008,
1,Hannah,F,23085,2000,2088111,1995520,4083631,0.005653,0.011568,
2,Madison,F,19968,2000,2088111,1995520,4083631,0.00489,0.010006,
3,Ashley,F,17997,2000,2088111,1995520,4083631,0.004407,0.009019,
4,Sarah,F,17708,2000,2088111,1995520,4083631,0.004336,0.008874,


---

### Export

In [22]:
df.to_csv("data/processed/names_births.csv", index=False)