# SSA Popular Baby Names

### Import Python tools and Jupyter configuration

In [1]:
%load_ext lab_black

In [2]:
import pandas as pd
import altair as alt
import datetime as dt
import glob
import os
import requests

In [3]:
pd.options.display.max_columns = 100
pd.options.display.max_rows = 1000
alt.data_transformers.disable_max_rows()
pd.options.display.max_colwidth = None

In [4]:
today = dt.datetime.today().strftime("%Y-%m-%d")

---

### Download [latest batch](https://www.ssa.gov/oact/babynames/limits.html) of names data from SSA

In [5]:
!curl -s 'https://www.ssa.gov/oact/babynames/names.zip' --output data/raw/names.zip

In [6]:
!unzip -o -q 'data/raw/names.zip' -d 'data/raw/years'

In [7]:
!cat data/raw/years/*.txt > 'data/raw/all.txt'

---

In [29]:
path = "data/raw/years"
all_files = glob.glob(os.path.join(path, "*.txt"))

df_from_each_file = (
    pd.read_csv(
        f,
        encoding="ISO-8859-1",
        header=None,
        sep=",",
        low_memory=False,
        names=["name", "sex", "count"],
    ).assign(year=f)
    for f in all_files
)
names = pd.concat(df_from_each_file, ignore_index=True)

In [30]:
names["year"] = (
    names["year"]
    .str.replace("data/raw/years/yob", "", regex=True)
    .str.replace(".txt", "", regex=True)
)

In [31]:
names.head(15)

Unnamed: 0,name,sex,count,year
0,Emily,F,25959,2000
1,Hannah,F,23086,2000
2,Madison,F,19968,2000
3,Ashley,F,17998,2000
4,Sarah,F,17713,2000
5,Alexis,F,17631,2000
6,Samantha,F,17264,2000
7,Jessica,F,15711,2000
8,Elizabeth,F,15116,2000
9,Taylor,F,15079,2000


### Limit names since 1950

In [35]:
names[names["year"].astype(int) >= 1900].head()

Unnamed: 0,name,sex,count,year
0,Emily,F,25959,2000
1,Hannah,F,23086,2000
2,Madison,F,19968,2000
3,Ashley,F,17998,2000
4,Sarah,F,17713,2000


---

### Get birth totals for normalization

In [33]:
url = "https://www.ssa.gov/oact/babynames/numberUSbirths.html"

In [13]:
births = pd.read_html(url)[0]

In [14]:
births = births.rename(
    columns={
        "Year of  birth": "year",
        "Male": "male",
        "Female": "female",
        "Total": "total",
    },
)

In [15]:
births["year"] = births.year.astype(str)

In [16]:
births

Unnamed: 0,year,male,female,total
0,1880,118399,97605,216004
1,1881,108276,98855,207131
2,1882,122031,115694,237725
3,1883,112475,120060,232535
4,1884,122738,137585,260323
5,1885,115945,141947,257892
6,1886,119041,153734,272775
7,1887,109311,155423,264734
8,1888,129907,189445,319352
9,1889,119032,189218,308250


--- 

### Merge the dataframes together

In [39]:
df = pd.merge(names, births, on="year")

In [40]:
df.head()

Unnamed: 0,name,sex,count,year,male,female,total
0,Emily,F,25959,2000,2088325,1995743,4084068
1,Hannah,F,23086,2000,2088325,1995743,4084068
2,Madison,F,19968,2000,2088325,1995743,4084068
3,Ashley,F,17998,2000,2088325,1995743,4084068
4,Sarah,F,17713,2000,2088325,1995743,4084068


### Calculate rate

In [43]:
# Calculate the percentage of births with the specific name
df["name_percentage"] = df.apply(
    lambda row: (
        round((row["count"] / row["female"] * 100), 2)
        if row["sex"] == "F"
        else round((row["count"] / row["male"] * 100), 2)
    ),
    axis=1,
)

# Calculate the number of births per 100,000
df["name_per_100k"] = df.apply(
    lambda row: (
        round((row["count"] / row["female"] * 100000), 2)
        if row["sex"] == "F"
        else round((row["count"] / row["male"] * 100000), 2)
    ),
    axis=1,
)

In [49]:
df.query('name == "Taylor" and sex == "F"').sort_values("year")

Unnamed: 0,name,sex,count,year,male,female,total,name_percentage,name_per_100k
1198379,Taylor,F,7,1951,1911680,1847272,3758952,0.0,0.38
1350729,Taylor,F,5,1954,2069135,1990843,4059978,0.0,0.25
1621920,Taylor,F,6,1956,2146229,2059534,4205763,0.0,0.29
1499483,Taylor,F,8,1957,2188901,2098135,4287036,0.0,0.38
778747,Taylor,F,6,1958,2154872,2065751,4220623,0.0,0.29
920504,Taylor,F,11,1960,2165354,2079741,4245095,0.0,0.53
856244,Taylor,F,7,1961,2156992,2077143,4234135,0.0,0.34
952184,Taylor,F,6,1962,2101687,2026891,4128578,0.0,0.3
1011598,Taylor,F,5,1963,2064633,1988078,4052711,0.0,0.25
787928,Taylor,F,14,1964,2027035,1957358,3984393,0.0,0.72


---

#### Normalized for TikTok vid

In [50]:
# Load the data
ssa_applications_url = (
    "https://stilesdata.com/babynames/us_ssa_applications_birth_years.json"
)
population_url = "https://stilesdata.com/babynames/us_population_years_census.json"

ssa_applications = pd.read_json(ssa_applications_url)
population = pd.read_json(population_url)

# Convert the 'year' column to integers
ssa_applications["year"] = ssa_applications["year"].astype(int)
population["year"] = population["year"].astype(int)

# Filter the data to start from the year 1900
ssa_applications = ssa_applications[ssa_applications["year"] >= 1900]

# Merge the two dataframes on the 'year' column
merged_data = pd.merge(ssa_applications, population, on="year", how="left")

# Normalize the SSA applications by population per 1 million
merged_data["male_per_million"] = (
    merged_data["male"] / merged_data["population"]
) * 1e6
merged_data["female_per_million"] = (
    merged_data["female"] / merged_data["population"]
) * 1e6

# Fill any NaN values resulting from the merge with 0
merged_data.fillna(0, inplace=True)

# Save the processed data to a JSON file
normalized_data = merged_data[["year", "male_per_million", "female_per_million"]].query(
    "year > 1899 and year < 2023"
)
normalized_data.to_json(
    "data/processed/normalized_ssa_applications_per_million.json", orient="records"
)

---

### Export

In [51]:
df.to_csv("data/processed/names_births.csv", index=False)

In [52]:
births.to_json(
    "data/processed/us_ssa_applications_birth_years.json",
    indent=4,
    orient="records",
    lines=False,
)