In [1]:
import pandas as pd
import plotly.express as px
import requests
import zipfile
import io
import os

In [2]:
# Download and extract the SSA baby names dataset
def download_data(dest_dir="data", timeout=60):
    """Download the SSA baby-names zip archive and unpack it into ``dest_dir``.

    Args:
        dest_dir: Directory the archive members are extracted into. It is
            created if missing. Defaults to ``"data"``, the directory
            ``extract_data`` reads its ``yob<year>.txt`` files from.
        timeout: Seconds to wait on the server before giving up, so a stalled
            connection cannot hang the notebook indefinitely.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        zipfile.BadZipFile: If the response body is not a valid zip archive.
    """
    base_url = "https://www.ssa.gov/oact/babynames/names.zip"
    r = requests.get(base_url, timeout=timeout)
    # Surface HTTP errors here rather than feeding an error page to ZipFile.
    r.raise_for_status()

    os.makedirs(dest_dir, exist_ok=True)
    # The context manager closes the archive handle once extraction finishes.
    with zipfile.ZipFile(io.BytesIO(r.content)) as z:
        z.extractall(dest_dir)


def extract_data(year_range, data_dir="data"):
    """Load the per-year name files for an inclusive range of years.

    Args:
        year_range: Sequence whose first two items are the first and last
            year to load; both ends are included.
        data_dir: Directory containing the ``yob<year>.txt`` files.
            Defaults to ``"data"``.

    Returns:
        One DataFrame with columns ``name``, ``sex``, ``count`` and ``year``
        and a fresh 0..n-1 index. Years whose file does not exist are
        skipped.

    Raises:
        ValueError: If no file exists for any year in the range.
    """
    first_year, last_year = year_range[0], year_range[1]

    frames = []
    for year in range(first_year, last_year + 1):
        path = os.path.join(data_dir, f"yob{year}.txt")
        if not os.path.exists(path):
            continue
        # Column names are supplied explicitly, so the first line of each
        # file is read as data rather than as a header.
        yearly = pd.read_csv(path, names=['name', 'sex', 'count'])
        frames.append(yearly.assign(year=year))

    if not frames:
        # pd.concat([]) would fail with a bare "No objects to concatenate";
        # say what is actually missing instead.
        raise ValueError(
            f"No yob<year>.txt files found in {data_dir!r} for years "
            f"{first_year}-{last_year}; the dataset must be extracted into "
            "that directory first."
        )

    return pd.concat(frames, ignore_index=True)

In [3]:
# Process the data
def process_data(df):
    """Attach each year's total births and every row's share of that total.

    Args:
        df: Frame with at least ``year`` and ``count`` columns.

    Returns:
        A new frame (the input is not modified) with two added columns:
        ``total_births``, the sum of ``count`` over every row of the same
        year irrespective of the other columns, and ``percentage``, the
        row's ``count`` expressed as a percentage of that total.
    """
    # One row per year holding the summed count for that year.
    births_per_year = (
        df.groupby('year')['count']
        .sum()
        .rename('total_births')
        .reset_index()
    )

    # Broadcast the yearly total back onto every original row.
    with_totals = df.merge(births_per_year, on='year')

    share = with_totals['count'] / with_totals['total_births'] * 100
    return with_totals.assign(percentage=share)

In [4]:
# Create interactive line plot
def create_line_plot(df, names, sex='F'):
    """Build a line chart of ``percentage`` over ``year`` for selected names.

    Args:
        df: Frame with ``name``, ``sex``, ``year`` and ``percentage`` columns.
        names: Names to keep; rows whose ``name`` is not among them are
            dropped before plotting.
        sex: Value of the ``sex`` column to keep. Defaults to ``'F'``.

    Returns:
        The figure returned by ``px.line``, with lines colored by name.
    """
    name_matches = df['name'].isin(names)
    sex_matches = df['sex'] == sex
    selection = df[name_matches & sex_matches]

    return px.line(
        selection,
        x='year',
        y='percentage',
        color='name',
        title=f"Popularity of Baby Names ({sex})",
        labels={'percentage': 'Percentage', 'year': 'Year'},
    )

In [7]:
# Main function
def analyze_baby_names(
    year_range=(1950, 2024),
    names=(
        'Riley', 'Anna', 'Camden',
        'Tanner',
        'Cooper',
        'Elliot',
        'Owen',
        'Luke',
        'Nathan',
        'Nathaniel',
        'Everett',
        'Tatum',
        'Reid',
        'Garrett',
        'Emmit',
        'Oliver',
    ),
    sex='F',
):
    """Load, process and plot name popularity, then display the chart.

    The yearly files are read from the ``data/`` directory by
    ``extract_data``; nothing is downloaded here, so those files must
    already be present.

    Args:
        year_range: ``(first_year, last_year)`` pair, both ends included.
        names: Names to plot. May be any iterable of strings, or a single
            string for one name. The default is an immutable tuple so it
            cannot be altered between calls.
        sex: Value of the ``sex`` column to plot. Defaults to ``'F'``, the
            value ``create_line_plot`` uses when none is given.

    Returns:
        None. The figure is displayed via ``fig.show()``.
    """
    # A bare string would otherwise be split into single characters by
    # list(); treat it as one name.
    if isinstance(names, str):
        names = [names]
    else:
        names = list(names)

    df = extract_data(year_range)
    df = process_data(df)
    fig = create_line_plot(df, names, sex=sex)
    fig.show()


In [6]:
# Run the analysis with the default year range and name list; the chart is
# displayed by the fig.show() call inside analyze_baby_names.
analyze_baby_names()