In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

## 1. Load Top Names

In [None]:
# Load the top 1000 names
top_names_path = Path('../data/top_1000_names_for_mapping.csv')

# Fail fast: every later cell reads names_df, so a missing input file must
# stop "Run All" right here. Merely printing a message would let execution
# continue and fail later with a NameError (or reuse a stale names_df that
# is still alive in the kernel).
if not top_names_path.exists():
    raise FileNotFoundError(
        f"{top_names_path} not found. "
        "Please run notebook 01 first to generate the top names file."
    )

names_df = pd.read_csv(top_names_path)
print(f"Loaded {len(names_df)} names")
display(names_df.head(20))

## 2. Rule-Based Classification

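We'll create a rule-based system to classify names by their likely origin: each name is looked up (exact match) in hand-curated lists, one per region, and any name that is not on a list defaults to Anglo.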

In [None]:
# Define name patterns for each region.
# Each list holds exact, case-sensitive first names. Entries within a list
# are unique; a name may still appear in more than one list, in which case
# the order of checks in classify_name decides the label.

# Latin/Hispanic names
latin_names = [
    'Jose', 'Juan', 'Luis', 'Carlos', 'Jesus', 'Miguel', 'Antonio', 'Francisco',
    'Pedro', 'Jorge', 'Manuel', 'Rafael', 'Ramon', 'Fernando', 'Ricardo', 'Roberto',
    'Eduardo', 'Julio', 'Enrique', 'Pablo', 'Raul', 'Mario', 'Sergio', 'Ruben',
    'Hector', 'Oscar', 'Cesar', 'Diego', 'Javier', 'Angel', 'Marco', 'Alejandro',
    'Maria', 'Carmen', 'Rosa', 'Ana', 'Elena', 'Teresa', 'Lucia', 'Gloria',
    'Isabel', 'Dolores', 'Guadalupe', 'Josefina', 'Beatriz', 'Catalina', 'Margarita',
    'Adriana', 'Alicia', 'Gabriela', 'Isabella', 'Sofia', 'Camila', 'Valentina',
    'Natalia', 'Daniela', 'Victoria', 'Andrea', 'Diana', 'Angelica', 'Selena'
]

# Asian names
asian_names = [
    'Ming', 'Mei', 'Wei', 'Li', 'Chen', 'Wang', 'Zhang', 'Liu', 'Yang',
    'Yuki', 'Akira', 'Kenji', 'Sakura', 'Hiroshi', 'Takashi', 'Yumi',
    'Kim', 'Park', 'Lee', 'Jung', 'Min', 'Jin', 'Hyun', 'Ji', 'Sung',
    'Anh', 'Linh', 'Minh', 'Nguyen', 'Tran', 'Pham',
    'Priya', 'Ravi', 'Amit', 'Raj', 'Kumar', 'Arjun', 'Krishna', 'Deepak'
]

# African/Middle Eastern names
# ('Jamal' used to be listed twice; it is kept once.)
african_middle_eastern_names = [
    'Mohammed', 'Muhammad', 'Ahmad', 'Hassan', 'Omar', 'Ali', 'Ibrahim', 'Khalid',
    'Fatima', 'Aisha', 'Amina', 'Zahra', 'Layla', 'Noor', 'Mariam', 'Yasmin',
    'Jamal', 'Malik', 'Rashid', 'Kareem', 'Tariq', 'Karim',
    'Kwame', 'Kofi', 'Amara', 'Zuri', 'Imani', 'Nia', 'Aaliyah', 'Zara',
    'Tyrone', 'Darnell', 'Latoya', 'Keisha', 'Tanisha', 'Ebony'
]

# Irish/Italian names
# NOTE: 'Antonio', 'Marco', 'Rosa', 'Isabella' and 'Lucia' also appear in
# latin_names. classify_name checks latin_names first, so those five names
# are labelled 'Latin', not 'Irish_Italian'.
irish_italian_names = [
    'Patrick', 'Sean', 'Connor', 'Liam', 'Ryan', 'Brendan', 'Brian', 'Kevin',
    'Colleen', 'Kathleen', 'Maureen', 'Bridget', 'Erin', 'Kelly', 'Shannon',
    'Giovanni', 'Giuseppe', 'Antonio', 'Salvatore', 'Vincenzo', 'Marco', 'Luigi',
    'Carla', 'Gina', 'Rosa', 'Francesca', 'Isabella', 'Lucia', 'Angela',
    'Gianna', 'Alessandra', 'Bianca', 'Chiara'
]

# Anglo/traditional English names (most common baseline)
anglo_names = [
    'John', 'William', 'James', 'Robert', 'Michael', 'David', 'Richard', 'Charles',
    'Joseph', 'Thomas', 'Christopher', 'Daniel', 'Matthew', 'Donald', 'Mark',
    'Paul', 'Steven', 'Andrew', 'Kenneth', 'Joshua', 'George', 'Edward',
    'Mary', 'Patricia', 'Jennifer', 'Linda', 'Barbara', 'Elizabeth', 'Susan',
    'Jessica', 'Sarah', 'Nancy', 'Karen', 'Betty', 'Helen', 'Dorothy', 'Margaret',
    'Emily', 'Emma', 'Olivia', 'Ava', 'Sophia', 'Mia', 'Charlotte', 'Amelia'
]

print("Name patterns defined for each region.")

In [None]:
# Function to classify names
def classify_name(name):
    """Return the origin-region label for a first name.

    The curated name lists are consulted in a fixed order: Latin, Asian,
    African/Middle Eastern, Irish/Italian, Anglo. The first list that
    contains ``name`` (exact, case-sensitive match) decides the label, so a
    name present in several lists takes the earliest one. A name found in
    none of the lists is labelled 'Anglo'.
    """
    lookup_order = (
        ('Latin', latin_names),
        ('Asian', asian_names),
        ('African_MiddleEastern', african_middle_eastern_names),
        ('Irish_Italian', irish_italian_names),
        ('Anglo', anglo_names),
    )
    for label, members in lookup_order:
        if name in members:
            return label
    # Default to Anglo for names that match no list
    return 'Anglo'

# Label every row with the region returned by classify_name
names_df['Origin_Region'] = names_df['Name'].map(classify_name)

# Report how many names fall into each region, as counts and as percentages
region_labels = names_df['Origin_Region']
print("Distribution of names by origin region:")
print(region_labels.value_counts())
print("\nPercentage:")
print(region_labels.value_counts(normalize=True) * 100)

## 3. Review Classifications

In [None]:
# Print the first 20 names (in table order) for every region label present
for region_label in names_df['Origin_Region'].unique():
    in_region = names_df['Origin_Region'] == region_label
    sample_names = names_df[in_region]['Name'].head(20).tolist()
    print(f"\n{region_label} names (top 20):")
    print(sample_names)

## 4. Manual Adjustments

Here we can manually adjust any misclassified names.

In [None]:
# Manual adjustments (add more as needed)
manual_mappings = {
    # Format: 'Name': 'Region'
    'Anthony': 'Irish_Italian',
    'Angelo': 'Irish_Italian',
    'Dante': 'Irish_Italian',
    'Aidan': 'Irish_Italian',
    'Ciara': 'Irish_Italian',
    'Fernando': 'Latin',
    'Alejandro': 'Latin',
    'Santiago': 'Latin',
    'Mohamed': 'African_MiddleEastern',
    'Aaliyah': 'African_MiddleEastern',
}

# Overwrite the region of each listed name that actually occurs in the table;
# names that are absent from names_df are skipped.
for override_name, override_region in manual_mappings.items():
    row_mask = names_df['Name'] == override_name
    if row_mask.any():
        names_df.loc[row_mask, 'Origin_Region'] = override_region

print("Manual adjustments applied.")
print("\nUpdated distribution:")
updated_counts = names_df['Origin_Region'].value_counts()
print(updated_counts)

## 5. Save the Mapping

In [None]:
# Write the name -> region table to CSV (row index omitted)
output_path = Path('../data/name_origin_mapping.csv')
names_df.to_csv(output_path, index=False)

# Confirm what was written and preview the first rows
mapped_total = len(names_df)
preview_rows = names_df.head(30)
print(f"Saved name-origin mapping to {output_path}")
print(f"\nTotal names mapped: {mapped_total}")
print(f"\nFirst 30 rows:")
display(preview_rows)

## 6. Create Summary Statistics

In [None]:
# Sum Total_Count per origin region, largest first, then express each
# region's total as a percentage of the overall sum
region_totals = (
    names_df
    .groupby('Origin_Region')['Total_Count']
    .sum()
    .sort_values(ascending=False)
)
region_share_pct = (region_totals / region_totals.sum() * 100).round(2)

print("Total births by origin region (among top 1000 names):")
print(region_totals)
print("\nPercentage:")
print(region_share_pct)

## Summary

We've created a mapping of the top 1000 names to their regions of origin:
- Anglo (baseline traditional English/Western; also the default label for any name not matched by another list)
- Irish_Italian (early European immigration)
- Latin (Hispanic/Latin American)
- Asian (East/South/Southeast Asian)
- African_MiddleEastern (African, Arabic, Middle Eastern)

**Next step:** In notebook 03, we'll join this mapping back to the full dataset and calculate regional trends over time.