In [None]:
# Import libraries
from functools import lru_cache
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns

# Pandas display: show up to 100 rows and never truncate the column list
pd.options.display.max_rows = 100
pd.options.display.max_columns = None

# Plot defaults: seaborn 'whitegrid' style, then a (12, 6) default figure size
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

## 1. Load the Data

In [None]:
# Load the baby names dataset
data_path = Path('../data/babynames.csv')
df = pd.read_csv(data_path)

# Fail fast if the file lacks the columns every later cell relies on;
# otherwise the first symptom is a KeyError several cells further down.
required_columns = {'Year', 'Name', 'Gender', 'Count'}
missing_columns = required_columns - set(df.columns)
if missing_columns:
    raise ValueError(
        f"{data_path} is missing required columns: {sorted(missing_columns)}"
    )

print(f"Dataset shape: {df.shape}")
print(f"\nColumn names: {df.columns.tolist()}")
print("\nFirst few rows:")
df.head(10)

In [None]:
# Summarise the frame's columns and dtypes via DataFrame.info()
print("Data Info:")
df.info()

# Divider line, then the number of missing values in each column
divider = "=" * 50
print(f"\n{divider}")
print("Missing values:")
print(df.isna().sum())

## 2. Basic Dataset Statistics

In [None]:
# Span of years present in the data
years = df['Year']
print(f"Years covered: {years.min()} to {years.max()}")
print(f"Total years: {years.nunique()}")

# Total recorded births for each gender
births_per_gender = df.groupby('Gender')['Count'].sum()
print("\nGender distribution:")
print(births_per_gender)

# Headline sizes: distinct names, rows in the frame, summed counts
n_unique_names = df['Name'].nunique()
n_records = len(df)
n_babies = df['Count'].sum()
print(f"\nUnique names: {n_unique_names:,}")
print(f"Total records: {n_records:,}")
print(f"Total babies: {n_babies:,}")

## 3. Births Over Time

In [None]:
# Sum the counts within each year and expose the result as a two-column frame
births_by_year = (
    df.groupby('Year')['Count']
    .sum()
    .rename('Total_Births')
    .reset_index()
)

# Line chart of yearly totals
fig = px.line(
    births_by_year,
    x='Year',
    y='Total_Births',
    title='Total U.S. Births Over Time (SSA Data)',
    labels={'Total_Births': 'Total Births'},
)

# Mark the key immigration policy dates with labelled dashed lines
for policy_year, line_color, policy_label in (
    (1924, "red", "1924 Immigration Act"),
    (1965, "green", "1965 Hart-Celler Act"),
):
    fig.add_vline(
        x=policy_year,
        line_dash="dash",
        line_color=line_color,
        annotation_text=policy_label,
        annotation_position="top",
    )

fig.show()

In [None]:
# Yearly totals split by gender, flattened into Year / Gender / Count columns
gender_year_totals = df.groupby(['Year', 'Gender'])['Count'].sum()
births_by_year_gender = gender_year_totals.reset_index()

# One line per gender
fig = px.line(
    births_by_year_gender,
    x='Year',
    y='Count',
    color='Gender',
    title='U.S. Births by Gender Over Time',
    labels={'Count': 'Total Births'},
)

# Unlabelled dashed markers at the 1924 and 1965 policy years
for policy_year, line_color in ((1924, "red"), (1965, "green")):
    fig.add_vline(x=policy_year, line_dash="dash", line_color=line_color)

fig.show()

## 4. Top Names Analysis

In [None]:
# Sum every name's counts across all years, largest totals first
name_totals = df.groupby('Name')['Count'].sum()
top_names_overall = name_totals.sort_values(ascending=False)

print("Top 50 most common names (all time):")
print(top_names_overall.head(50))

# Bar chart of the 20 largest totals
top_20 = top_names_overall.head(20)
fig = px.bar(
    x=top_20.index,
    y=top_20.values,
    title='Top 20 Most Common Names (All Time)',
    labels={'x': 'Name', 'y': 'Total Count'},
)
fig.show()

In [None]:
def _name_totals_for_gender(gender_code):
    """Return all-time counts per name for one gender code, largest first."""
    rows = df[df['Gender'] == gender_code]
    return rows.groupby('Name')['Count'].sum().sort_values(ascending=False)


# Rank names separately for each gender
top_names_female = _name_totals_for_gender('F')
top_names_male = _name_totals_for_gender('M')

print("Top 30 Female Names:")
print(top_names_female.head(30))

print("\nTop 30 Male Names:")
print(top_names_male.head(30))

## 5. Name Diversity Over Time

In [None]:
# Count the distinct names recorded in each year
unique_names_by_year = (
    df.groupby('Year')['Name']
    .nunique()
    .rename('Unique_Names')
    .reset_index()
)

fig = px.line(
    unique_names_by_year,
    x='Year',
    y='Unique_Names',
    title='Name Diversity Over Time',
    labels={'Unique_Names': 'Number of Unique Names'},
)

# Unlabelled dashed markers at the 1924 and 1965 policy years
for policy_year, line_color in ((1924, "red"), (1965, "green")):
    fig.add_vline(x=policy_year, line_dash="dash", line_color=line_color)

fig.show()

## 6. Concentration of Popular Names

In [None]:
# Calculate the share of births going to the top 10 names each year
def top_n_share(group, n=10):
    """Return the percentage of births in ``group`` held by its ``n`` top names.

    Counts are summed per name before ranking, so a name that appears on
    several rows (for example once per gender) is ranked by its combined
    total instead of each row competing separately.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to evaluate; must provide 'Name' and 'Count' columns.
    n : int, default 10
        Number of most popular names to include.

    Returns
    -------
    float
        Share in percent, or NaN when the total count is zero (for example
        an empty group), where the share is undefined.
    """
    total = group['Count'].sum()
    if total == 0:
        # Avoid a 0/0 division; the share is undefined without any births.
        return float('nan')
    # Rank names, not rows: collapse duplicate rows for the same name first.
    per_name = group.groupby('Name')['Count'].sum()
    return per_name.nlargest(n).sum() / total * 100

# Apply top_n_share to each year's rows. Selecting the needed columns
# explicitly keeps the grouping column ('Year') out of the frames passed to
# the function, which avoids the pandas deprecation warning about apply
# operating on grouping columns.
top10_share_by_year = (
    df.groupby('Year')[['Name', 'Count']]
    .apply(top_n_share)
    .rename('Top10_Share')
    .reset_index()
)

fig = px.line(top10_share_by_year, x='Year', y='Top10_Share',
              title='Share of Births with Top 10 Names Over Time',
              labels={'Top10_Share': 'Share (%)'})

# Unlabelled dashed markers at the 1924 and 1965 policy years
fig.add_vline(x=1924, line_dash="dash", line_color="red")
fig.add_vline(x=1965, line_dash="dash", line_color="green")

fig.show()

## 7. Save Top Names for Origin Mapping

We'll extract the top 1000 names overall to create our origin mapping in the next notebook.

In [None]:
# Keep the 1000 largest all-time totals as a Name / Total_Count frame
top_1000_names = (
    top_names_overall
    .head(1000)
    .rename_axis('Name')
    .reset_index(name='Total_Count')
)

# Add gender information (most common gender for each name)
@lru_cache(maxsize=1)
def _dominant_gender_lookup():
    """Build a ``{name: gender}`` dict of the dominant gender for every name in ``df``.

    The result is computed with a single grouped pass over ``df`` and cached,
    so repeated lookups do not rescan the frame. Re-running this cell
    redefines the function and therefore starts with a fresh cache.
    """
    totals = df.groupby(['Name', 'Gender'])['Count'].sum()
    # idxmax over the (Name, Gender) index yields one (name, gender) tuple per
    # name; on a tie the first gender in sort order wins, which matches
    # calling idxmax on that name's per-gender totals.
    winners = totals.groupby(level='Name').idxmax()
    return dict(winners.tolist())


def get_dominant_gender(name):
    """Return the gender with the largest total count for ``name``.

    Parameters
    ----------
    name : str
        Name to look up in ``df``.

    Returns
    -------
    str
        The value of the 'Gender' column with the highest summed 'Count' for
        this name, or 'U' when the name does not occur in ``df``.
    """
    return _dominant_gender_lookup().get(name, 'U')

# Attach the dominant gender for each selected name
top_1000_names['Dominant_Gender'] = [
    get_dominant_gender(candidate) for candidate in top_1000_names['Name']
]

# Write the table out so origins can be annotated by hand
output_path = Path('../data/top_1000_names_for_mapping.csv')
top_1000_names.to_csv(output_path, index=False)

print(f"Saved top 1000 names to {output_path}")
print("\nFirst 20 names:")
print(top_1000_names.head(20))

## Summary

Key findings from this EDA:
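1. The dataset covers multiple decades of U.S. birth records; the exact year range is printed in Section 2.
2. Name diversity (the number of unique names per year) changes significantly over time (Section 5).
3. The share of births going to the top 10 names shows a clear trend over time (Section 6).
4. The top 1000 names were identified and saved for origin mapping (Section 7).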

**Next steps:** In notebook 02, we'll create the name-to-origin mapping.