In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from pathlib import Path

# Notebook-wide pandas display limits: print up to 100 rows, never hide columns
for option_key, option_value in (('display.max_rows', 100),
                                 ('display.max_columns', None)):
    pd.set_option(option_key, option_value)

# Default look for seaborn/matplotlib figures
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

## 1. Load the Data

In [2]:
# Read the baby names CSV (relative path, resolved against the kernel's working directory)
data_path = Path('../data/babynames.csv')
df = pd.read_csv(data_path)

# Quick orientation: dimensions and column names, then a preview of the first rows
print(
    f"Dataset shape: {df.shape}",
    f"\nColumn names: {list(df.columns)}",
    "\nFirst few rows:",
    sep="\n",
)
df.head(10)

Dataset shape: (1825433, 5)

Column names: ['Id', 'Name', 'Year', 'Gender', 'Count']

First few rows:


Unnamed: 0,Id,Name,Year,Gender,Count
0,1,Mary,1880,F,7065
1,2,Anna,1880,F,2604
2,3,Emma,1880,F,2003
3,4,Elizabeth,1880,F,1939
4,5,Minnie,1880,F,1746
5,6,Margaret,1880,F,1578
6,7,Ida,1880,F,1472
7,8,Alice,1880,F,1414
8,9,Bertha,1880,F,1320
9,10,Sarah,1880,F,1288


In [3]:
# Schema overview: column dtypes, entry count, and memory usage
print("Data Info:")
df.info()

# Divider line, then the number of null entries in each column
print("\n" + "=" * 50, "Missing values:", sep="\n")
print(df.isna().sum())

Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1825433 entries, 0 to 1825432
Data columns (total 5 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   Id      int64 
 1   Name    object
 2   Year    int64 
 3   Gender  object
 4   Count   int64 
dtypes: int64(3), object(2)
memory usage: 69.6+ MB

Missing values:
Id        0
Name      0
Year      0
Gender    0
Count     0
dtype: int64


## 2. Basic Dataset Statistics

In [4]:
# Temporal coverage of the dataset
print(
    f"Years covered: {df['Year'].min()} to {df['Year'].max()}",
    f"Total years: {df['Year'].nunique()}",
    sep="\n",
)

# Recorded births summed per gender
print("\nGender distribution:")
print(df['Count'].groupby(df['Gender']).sum())

# Name variety and overall volume
print(
    f"\nUnique names: {df['Name'].nunique():,}",
    f"Total records: {df.shape[0]:,}",
    f"Total babies: {df['Count'].sum():,}",
    sep="\n",
)

Years covered: 1880 to 2014
Total years: 135

Gender distribution:


Gender
F    167070477
M    170064949
Name: Count, dtype: int64

Unique names: 93,889
Total records: 1,825,433
Total babies: 337,135,426


## 3. Births Over Time

In [5]:
# Yearly birth totals as a two-column frame (Year, Total_Births)
births_by_year = (
    df.groupby('Year', as_index=False)['Count']
      .sum()
      .rename(columns={'Count': 'Total_Births'})
)

# Line chart of the yearly totals
fig = px.line(
    births_by_year,
    x='Year',
    y='Total_Births',
    title='Total U.S. Births Over Time (SSA Data)',
    labels={'Total_Births': 'Total Births'},
)

# Mark the two immigration-policy years with labelled dashed lines
for policy_year, line_color, policy_label in (
    (1924, "red", "1924 Immigration Act"),
    (1965, "green", "1965 Hart-Celler Act"),
):
    fig.add_vline(x=policy_year, line_dash="dash", line_color=line_color,
                  annotation_text=policy_label, annotation_position="top")

fig.show()

In [6]:
# Yearly birth totals split by gender (columns: Year, Gender, Count)
births_by_year_gender = df.groupby(['Year', 'Gender'], as_index=False)['Count'].sum()

# One line per gender
fig = px.line(
    births_by_year_gender,
    x='Year',
    y='Count',
    color='Gender',
    title='U.S. Births by Gender Over Time',
    labels={'Count': 'Total Births'},
)

# Same policy-year markers as the total-births chart, without annotations
for policy_year, line_color in ((1924, "red"), (1965, "green")):
    fig.add_vline(x=policy_year, line_dash="dash", line_color=line_color)

fig.show()

## 4. Top Names Analysis

In [7]:
# All-time totals per name (both genders combined), most common first
top_names_overall = (
    df.groupby('Name')['Count']
      .sum()
      .sort_values(ascending=False)
)

print("Top 50 most common names (all time):")
print(top_names_overall.head(50))

# Bar chart of the 20 most common names
fig = px.bar(
    x=top_names_overall.index[:20],
    y=top_names_overall.values[:20],
    title='Top 20 Most Common Names (All Time)',
    labels={'x': 'Name', 'y': 'Total Count'},
)
fig.show()

Top 50 most common names (all time):
Name
James          5129096
John           5106590
Robert         4816785
Michael        4330805
Mary           4130441
William        4071368
David          3590557
Joseph         2580687
Richard        2564867
Charles        2376700
Thomas         2291517
Christopher    2004177
Daniel         1876880
Elizabeth      1606282
Patricia       1575529
Matthew        1558671
Jennifer       1467573
George         1464430
Linda          1454599
Barbara        1437083
Donald         1414511
Anthony        1410142
Paul           1386884
Mark           1348242
Edward         1286568
Steven         1277582
Kenneth        1272151
Andrew         1260738
Margaret       1243750
Joshua         1179654
Brian          1162528
Kevin          1158447
Susan          1123009
Dorothy        1109362
Ronald         1079633
Timothy        1064929
Sarah          1063944
Jessica        1044144
Jason          1019582
Helen          1019006
Nancy          1003902
Betty          

In [8]:
# All-time totals per name within each gender, most common first
top_names_female = (
    df.query("Gender == 'F'")
      .groupby('Name')['Count']
      .sum()
      .sort_values(ascending=False)
)
top_names_male = (
    df.query("Gender == 'M'")
      .groupby('Name')['Count']
      .sum()
      .sort_values(ascending=False)
)

# Print the 30 leaders of each ranking under its heading
for heading, ranking in (
    ("Top 30 Female Names:", top_names_female),
    ("\nTop 30 Male Names:", top_names_male),
):
    print(heading)
    print(ranking.head(30))

Top 30 Female Names:
Name
Mary         4115282
Elizabeth    1601128
Patricia     1570567
Jennifer     1462742
Linda        1450843
Barbara      1432944
Margaret     1240006
Susan        1120469
Dorothy      1105680
Sarah        1060643
Jessica      1040574
Helen        1015902
Nancy        1000955
Betty         998851
Karen         983806
Lisa          963924
Anna          873767
Sandra        872458
Ashley        834720
Donna         830272
Kimberly      828102
Ruth          820068
Carol         815779
Emily         808754
Michelle      804495
Laura         791654
Amanda        781807
Melissa       749449
Rebecca       742063
Deborah       739273
Name: Count, dtype: int64

Top 30 Male Names:
Name
James          5105919
John           5084943
Robert         4796695
Michael        4309198
William        4055473
David          3577704
Joseph         2570095
Richard        2555330
Charles        2364332
Thomas         2283080
Christopher    1994742
Daniel         1868775
Matthew        15

## 5. Name Diversity Over Time

In [9]:
# Count of distinct names recorded in each year (columns: Year, Unique_Names)
unique_names_by_year = (
    df.groupby('Year')['Name']
      .nunique()
      .rename('Unique_Names')
      .reset_index()
)

fig = px.line(
    unique_names_by_year,
    x='Year',
    y='Unique_Names',
    title='Name Diversity Over Time',
    labels={'Unique_Names': 'Number of Unique Names'},
)

# Dashed markers at the two policy years used throughout the notebook
for policy_year, line_color in ((1924, "red"), (1965, "green")):
    fig.add_vline(x=policy_year, line_dash="dash", line_color=line_color)

fig.show()

## 6. Concentration of Popular Names

In [10]:
# Calculate the share of births going to the top 10 names each year
def top_n_share(group, n=10):
    """Return the percentage of births in ``group`` that went to its ``n`` most common names.

    Parameters
    ----------
    group : pandas.DataFrame
        Records for one grouping unit (one year in this notebook). Must contain
        a ``Count`` column. When a ``Name`` column is also present, counts are
        summed per name before ranking, so a name recorded under both genders
        is ranked once with its combined total instead of as two separate rows.
    n : int, default 10
        Number of most common names to include.

    Returns
    -------
    float
        ``100 * (births with the top-n names) / (all births in group)``,
        or NaN when the group contains no births.
    """
    total = group['Count'].sum()
    if total == 0:
        # No births at all: the share is undefined, so avoid dividing by zero
        return float('nan')

    if 'Name' in group.columns:
        # One entry per name: merge the female and male rows of the same name
        counts = group.groupby('Name')['Count'].sum()
    else:
        # No name information available; rank the raw rows
        counts = group['Count']

    return counts.nlargest(n).sum() / total * 100

# Compute the share for every year. The columns are selected explicitly after
# the groupby so the grouping key ('Year') is not passed to top_n_share;
# applying to the whole frame triggers a deprecation warning in pandas >= 2.2.
top10_share_by_year = (
    df.groupby('Year')[['Name', 'Count']]
      .apply(top_n_share)
      .reset_index()
)
top10_share_by_year.columns = ['Year', 'Top10_Share']

fig = px.line(
    top10_share_by_year,
    x='Year',
    y='Top10_Share',
    title='Share of Births with Top 10 Names Over Time',
    labels={'Top10_Share': 'Share (%)'},
)

# Dashed markers at the two policy years used throughout the notebook
fig.add_vline(x=1924, line_dash="dash", line_color="red")
fig.add_vline(x=1965, line_dash="dash", line_color="green")

fig.show()





## 7. Save Top Names for Origin Mapping

We'll extract the top 1000 names overall to create our origin mapping in the next notebook.

In [11]:
# Take the 1000 highest all-time totals and flatten them into a
# two-column frame (Name, Total_Count)
top_1000_names = (
    top_names_overall
    .head(1000)
    .rename_axis('Name')
    .reset_index(name='Total_Count')
)

# Add gender information (most common gender for each name)
# Sum births per (name, gender) once for the whole dataset. Filtering the full
# frame inside the function would rescan every row for each of the 1000 names.
_births_by_name_gender = df.groupby(['Name', 'Gender'])['Count'].sum().unstack()
# idxmax skips the NaN left for a gender a name never appears under and returns
# the first column on ties, which matches a per-name groupby followed by idxmax.
_dominant_gender_by_name = _births_by_name_gender.idxmax(axis=1)


def get_dominant_gender(name):
    """Return the gender with the most recorded births for ``name``.

    Parameters
    ----------
    name : str
        A name as it appears in the ``Name`` column of ``df``.

    Returns
    -------
    str
        The ``Gender`` value with the largest summed ``Count`` for that name,
        or ``'U'`` when the name does not occur in ``df``.

    Notes
    -----
    The answer comes from a lookup table built from ``df`` when this cell
    runs; re-run the cell if ``df`` is reloaded or changed.
    """
    return _dominant_gender_by_name.get(name, 'U')


top_1000_names['Dominant_Gender'] = top_1000_names['Name'].apply(get_dominant_gender)

# Write the table to disk so origins can be annotated by hand
output_path = Path('../data/top_1000_names_for_mapping.csv')
top_1000_names.to_csv(output_path, index=False)

# Confirm the destination and preview the first 20 rows
print(f"Saved top 1000 names to {output_path}", "\nFirst 20 names:", sep="\n")
print(top_1000_names.head(20))

Saved top 1000 names to ../data/top_1000_names_for_mapping.csv

First 20 names:
           Name  Total_Count Dominant_Gender
0         James      5129096               M
1          John      5106590               M
2        Robert      4816785               M
3       Michael      4330805               M
4          Mary      4130441               F
5       William      4071368               M
6         David      3590557               M
7        Joseph      2580687               M
8       Richard      2564867               M
9       Charles      2376700               M
10       Thomas      2291517               M
11  Christopher      2004177               M
12       Daniel      1876880               M
13    Elizabeth      1606282               F
14     Patricia      1575529               F
15      Matthew      1558671               M
16     Jennifer      1467573               F
17       George      1464430               M
18        Linda      1454599               F
19      Barbara     

## Summary

Key findings from this EDA:
1. Dataset covers 135 years (1880–2014) of U.S. baby-name records: 1,825,433 rows, 93,889 unique names, and about 337 million recorded births
2. Significant changes in name diversity over time
3. Clear trends in the concentration of popular names
4. Top 1000 names identified for origin mapping

**Next steps:** In notebook 02, I'll create the name-to-origin mapping, starting from the saved `../data/top_1000_names_for_mapping.csv`.