In [17]:
import pandas as pd
import numpy as np
import os
import pyreadr


In [20]:
# Root folder for every input and output file used in this notebook.
# Set the NAMES_DATA_DIR environment variable to run on another machine; when it
# is unset (or empty) the original local path is used, so nothing changes for
# the existing setup.
# os.path.join(..., "") appends a trailing separator only when one is missing,
# which matters because a later cell builds a path by plain string
# concatenation (dir_data + 'first_nameRaceProbs.rData').
dir_data = os.path.join(
    os.environ.get("NAMES_DATA_DIR") or "C:/Users/chris/Documents/Projects/Names/Data/",
    "",
)

In [None]:
# Read the yearly name files (raw/yob<year>.txt) and stack them into one long
# table with one row per (Name, Gender, Year).
raw_columns = ['Name', 'Gender', 'Count']  # the files are read without a header row

dfs = []
for year in range(1900, 2024):
    file_path = os.path.join(dir_data, f"raw/yob{year}.txt")
    # Years without a file are skipped rather than treated as an error.
    if not os.path.exists(file_path):
        continue
    df = pd.read_csv(
        file_path,
        header=None,
        names=raw_columns,
        # Names are plain text: stop pandas from converting tokens such as
        # "NA" or "None" into missing values.
        keep_default_na=False,
    )
    df['Year'] = year
    dfs.append(df)

# pd.concat raises an opaque "No objects to concatenate" on an empty list,
# so report the actual problem (wrong directory / no input files) instead.
if not dfs:
    raise ValueError(f"No yob<year>.txt files found under {os.path.join(dir_data, 'raw')}")

# Stack all years with a fresh 0..n-1 index
all_data = pd.concat(dfs, ignore_index=True)
all_data

Unnamed: 0,Name,Gender,Count,Year
0,Mary,F,16705,1900
1,Helen,F,6342,1900
2,Anna,F,6114,1900
3,Margaret,F,5304,1900
4,Ruth,F,4765,1900
...,...,...,...,...
2064950,Zyell,M,5,2023
2064951,Zyen,M,5,2023
2064952,Zymirr,M,5,2023
2064953,Zyquan,M,5,2023


In [13]:
# One row per (Name, Year): sum the counts and spread Gender into two columns,
# using 0 where a name has no records for a gender in that year.
pivot_data = all_data.pivot_table(
    index=['Name', 'Year'],
    columns='Gender',
    values='Count',
    aggfunc='sum',
    fill_value=0,
).reset_index()
pivot_data.columns = ['Name', 'Year', 'Count_Female', 'Count_Male']

# Combined count across both genders
pivot_data['Count_Total'] = pivot_data['Count_Female'] + pivot_data['Count_Male']

# (count, rank, rate, percent) column names for each gender
sex_columns = [
    ('Count_Female', 'Rank_Female', 'Rate_Female', 'Percent_Female'),
    ('Count_Male', 'Rank_Male', 'Rate_Male', 'Percent_Male'),
]
year_key = pivot_data['Year']

# Within-year rank per gender (1 = highest count; ties share the lowest rank)
for count_col, rank_col, _, _ in sex_columns:
    pivot_data[rank_col] = pivot_data[count_col].groupby(year_key).rank(ascending=False, method='min')

# Each name's fraction of that year's total count for the gender
for count_col, _, rate_col, _ in sex_columns:
    counts = pivot_data[count_col]
    pivot_data[rate_col] = counts / counts.groupby(year_key).transform('sum')

# Fraction of the name's own yearly total that belongs to each gender
for count_col, _, _, percent_col in sex_columns:
    pivot_data[percent_col] = pivot_data[count_col] / pivot_data['Count_Total']

# Chronological order, largest totals first within each year
pivot_data = pivot_data.sort_values(by=['Year', 'Count_Total'], ascending=[True, False])

pivot_data

Unnamed: 0,Name,Year,Count_Female,Count_Male,Count_Total,Rank_Female,Rank_Male,Rate_Female,Rate_Male,Percent_Female,Percent_Male
1223786,Mary,1900,16705,75,16780,1.0,230.0,0.055723,0.000498,0.995530,0.004470
868794,John,1900,46,9829,9875,499.0,1.0,0.000153,0.065320,0.004658,0.995342
1823156,William,1900,44,8579,8623,509.0,2.0,0.000147,0.057013,0.005103,0.994897
790651,James,1900,41,7245,7286,524.0,3.0,0.000137,0.048148,0.005627,0.994373
711149,Helen,1900,6342,18,6360,2.0,578.0,0.021155,0.000120,0.997170,0.002830
...,...,...,...,...,...,...,...,...,...,...,...
1882627,Zyen,2023,0,5,5,17534.0,12190.0,0.000000,0.000003,0.000000,1.000000
1883644,Zymirr,2023,0,5,5,17534.0,12190.0,0.000000,0.000003,0.000000,1.000000
1883650,Zyn,2023,5,0,5,15121.0,14150.0,0.000003,0.000000,1.000000,0.000000
1883855,Zyquan,2023,0,5,5,17534.0,12190.0,0.000000,0.000003,0.000000,1.000000


In [15]:
min_year = 1960

# Restrict to rows from min_year onward
filtered_data = all_data.loc[all_data['Year'].ge(min_year)]

# One row per Name: sum the counts over the retained years and spread Gender
# into two columns, using 0 where a name has no records for a gender.
overall_data = filtered_data.pivot_table(
    index='Name',
    columns='Gender',
    values='Count',
    aggfunc='sum',
    fill_value=0,
).reset_index()
overall_data.columns = ['Name', 'Count_Female', 'Count_Male']

# Combined count across both genders
overall_data['Count_Total'] = overall_data['Count_Female'] + overall_data['Count_Male']

# (count, rank, rate, percent) column names for each gender
sex_columns = [
    ('Count_Female', 'Rank_Female', 'Rate_Female', 'Percent_Female'),
    ('Count_Male', 'Rank_Male', 'Rate_Male', 'Percent_Male'),
]

# Rank per gender over the whole period (1 = highest count; ties share the lowest rank)
for count_col, rank_col, _, _ in sex_columns:
    overall_data[rank_col] = overall_data[count_col].rank(ascending=False, method='min')

# Each name's fraction of the gender's total count
for count_col, _, rate_col, _ in sex_columns:
    counts = overall_data[count_col]
    overall_data[rate_col] = counts / counts.sum()

# Fraction of the name's own total that belongs to each gender
for count_col, _, _, percent_col in sex_columns:
    overall_data[percent_col] = overall_data[count_col] / overall_data['Count_Total']

# Largest totals first
overall_data = overall_data.sort_values(by='Count_Total', ascending=False)

overall_data

Unnamed: 0,Name,Count_Female,Count_Male,Count_Total,Rank_Female,Rank_Male,Rate_Female,Rate_Male,Percent_Female,Percent_Male
61877,Michael,17132,3098647,3115779,909.0,1.0,1.538617e-04,2.595628e-02,0.005498,0.994502
21219,David,8832,2145109,2153941,1433.0,2.0,7.931978e-05,1.796882e-02,0.004100,0.995900
37479,James,10306,2082283,2092589,1294.0,3.0,9.255770e-05,1.744255e-02,0.004925,0.995075
41864,John,8237,1973259,1981496,1496.0,4.0,7.397611e-05,1.652930e-02,0.004157,0.995843
17447,Christopher,8973,1955983,1964956,1416.0,5.0,8.058609e-05,1.638458e-02,0.004567,0.995433
...,...,...,...,...,...,...,...,...,...,...
65546,Necko,0,5,5,65242.0,34931.0,0.000000e+00,4.188324e-08,0.000000,1.000000
82904,Synaia,5,0,5,55998.0,40721.0,4.490477e-08,0.000000e+00,1.000000,0.000000
82907,Synceir,0,5,5,65242.0,34931.0,0.000000e+00,4.188324e-08,0.000000,1.000000
15863,Chakeya,5,0,5,55998.0,40721.0,4.490477e-08,0.000000e+00,1.000000,0.000000


In [30]:
# Load the name -> race probability table stored in the RData file
race_probs_path = dir_data+'first_nameRaceProbs.rData'
name_race_probs = pyreadr.read_r(race_probs_path)['first_nameRaceProbs']

# Normalise the lookup key (trim whitespace, Title Case) before joining on 'Name'
name_race_probs['name'] = name_race_probs['name'].str.strip().str.title()

# Left join keeps every row of overall_data; the indicator column records
# whether a match was found in the probability table.
merged_with_flag = overall_data.merge(
    name_race_probs,
    left_on='Name',
    right_on='name',
    how='left',
    indicator=True,
)

# Report how many names matched ('both') versus had no match ('left_only')
print(merged_with_flag['_merge'].value_counts())

# Remove the helper columns ('_merge', the duplicate key 'name', and 'oth')
# and give the remaining probability columns descriptive names.
merged_data = (
    merged_with_flag
    .drop(columns=['_merge', 'oth', 'name'])
    .rename(columns={'whi': 'pr_white', 'bla': 'pr_black', 'his': 'pr_hispanic', 'asi': 'pr_asian'})
)
merged_data


both          64949
left_only     30733
right_only        0
Name: _merge, dtype: int64


Unnamed: 0,Name,Count_Female,Count_Male,Count_Total,Rank_Female,Rank_Male,Rate_Female,Rate_Male,Percent_Female,Percent_Male,pr_white,pr_black,pr_hispanic,pr_asian
0,Michael,17132,3098647,3115779,909.0,1.0,1.538617e-04,2.595628e-02,0.005498,0.994502,0.817958,0.134655,0.029259,0.006104
1,David,8832,2145109,2153941,1433.0,2.0,7.931978e-05,1.796882e-02,0.004100,0.995900,0.843589,0.093486,0.044733,0.006973
2,James,10306,2082283,2092589,1294.0,3.0,9.255770e-05,1.744255e-02,0.004925,0.995075,0.828457,0.153643,0.006438,0.003470
3,John,8237,1973259,1981496,1496.0,4.0,7.397611e-05,1.652930e-02,0.004157,0.995843,0.867057,0.099637,0.018107,0.005635
4,Christopher,8973,1955983,1964956,1416.0,5.0,8.058609e-05,1.638458e-02,0.004567,0.995433,0.781474,0.160660,0.037080,0.006736
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95677,Necko,0,5,5,65242.0,34931.0,0.000000e+00,4.188324e-08,0.000000,1.000000,0.000000,1.000000,0.000000,0.000000
95678,Synaia,5,0,5,55998.0,40721.0,4.490477e-08,0.000000e+00,1.000000,0.000000,,,,
95679,Synceir,0,5,5,65242.0,34931.0,0.000000e+00,4.188324e-08,0.000000,1.000000,,,,
95680,Chakeya,5,0,5,55998.0,40721.0,4.490477e-08,0.000000e+00,1.000000,0.000000,0.000000,1.000000,0.000000,0.000000


In [37]:
# Persist both result tables under dir_data.

# Per-(Name, Year) table
pivot_data.to_parquet(os.path.join(dir_data, 'name_year_data.parquet'))

# Per-name table, as Parquet and as CSV
merged_data.to_parquet(os.path.join(dir_data, 'name_overall_data.parquet'))
# Fix: this line previously called to_parquet, which wrote Parquet bytes into a
# file named *.csv that no CSV reader could open. Write real CSV instead.
# index=False because the index is only the row number produced by the merge.
merged_data.to_csv(os.path.join(dir_data, 'name_overall_data.csv'), index=False)

In [45]:
# Name to inspect; the per-year lookup cell below reuses this variable
name = "Rowan"

# Row(s) of the per-name table for that name
merged_data[merged_data['Name'].eq(name)]

Unnamed: 0,Name,Count_Female,Count_Male,Count_Total,Rank_Female,Rank_Male,Rate_Female,Rate_Male,Percent_Female,Percent_Male,pr_white,pr_black,pr_hispanic,pr_asian
692,Rowan,19768,40389,60157,809.0,442.0,0.000178,0.000338,0.328607,0.671393,0.663438,0.262712,0.031477,0.013317


In [47]:
# Last 20 per-year rows for the selected name
pivot_data[pivot_data['Name'].eq(name)].tail(20)

Unnamed: 0,Name,Year,Count_Female,Count_Male,Count_Total,Rank_Female,Rank_Male,Rate_Female,Rate_Male,Percent_Female,Percent_Male
1520118,Rowan,2004,379,529,908,707.0,476.0,0.000206,0.000267,0.417401,0.582599
1520119,Rowan,2005,347,603,950,765.0,437.0,0.000188,0.000302,0.365263,0.634737
1520120,Rowan,2006,452,673,1125,638.0,423.0,0.000238,0.000328,0.401778,0.598222
1520121,Rowan,2007,711,869,1580,453.0,365.0,0.00037,0.000419,0.45,0.55
1520122,Rowan,2008,676,972,1648,470.0,339.0,0.000358,0.000477,0.410194,0.589806
1520123,Rowan,2009,616,957,1573,497.0,332.0,0.000335,0.000483,0.391608,0.608392
1520124,Rowan,2010,608,955,1563,493.0,328.0,0.000342,0.000498,0.388996,0.611004
1520125,Rowan,2011,544,1047,1591,533.0,309.0,0.000309,0.000551,0.341923,0.658077
1520126,Rowan,2012,684,1151,1835,453.0,300.0,0.000389,0.000607,0.372752,0.627248
1520127,Rowan,2013,714,1197,1911,427.0,292.0,0.000407,0.000633,0.373626,0.626374


In [54]:
# First 25 names whose pr_asian value exceeds 0.5 (rows without a value are excluded)
merged_data[merged_data['pr_asian'].gt(0.5)].head(25)

Unnamed: 0,Name,Count_Female,Count_Male,Count_Total,Rank_Female,Rank_Male,Rate_Female,Rate_Male,Percent_Female,Percent_Male,pr_white,pr_black,pr_hispanic,pr_asian
1516,Muhammad,0,19015,19015,65242.0,691.0,0.0,0.000159282,0.0,1.0,0.063737,0.196364,0.005859,0.553636
2123,Arjun,0,11070,11070,65242.0,925.0,0.0,9.272949e-05,0.0,1.0,0.048115,0.002406,0.0,0.765036
2953,Priya,6352,0,6352,1753.0,40721.0,5.704701e-05,0.0,1.0,0.0,0.037234,0.038074,0.006159,0.725084
3081,Riya,5950,0,5950,1829.0,40721.0,5.343667e-05,0.0,1.0,0.0,0.02766,0.010638,0.017021,0.831915
3138,Syed,11,5759,5770,47241.0,1345.0,9.879048e-08,4.824112e-05,0.001906,0.998094,0.066026,0.008915,0.004457,0.700237
3173,Anjali,5681,0,5681,1876.0,40721.0,5.102079e-05,0.0,1.0,0.0,0.070858,0.030439,0.016467,0.676148
3307,Nikhil,0,5307,5307,65242.0,1408.0,0.0,4.445487e-05,0.0,1.0,0.044032,0.005794,0.0,0.713789
3332,Aditya,5,5247,5252,55998.0,1421.0,4.490477e-08,4.395227e-05,0.000952,0.999048,0.016158,0.0,0.001795,0.835727
3396,Mai,5075,20,5095,2035.0,25763.0,4.557834e-05,1.67533e-07,0.996075,0.003925,0.077391,0.028889,0.00517,0.765547
3431,Ishaan,0,4970,4970,65242.0,1466.0,0.0,4.163194e-05,0.0,1.0,0.005435,0.0,0.0,0.907609
