# Surname frequency and demographics from U.S. Census Bureau

In [1]:
%load_ext lab_black

In [2]:
import pandas as pd
import geopandas as gpd
import altair as alt
import glob
import os
import numpy as np

In [3]:
# Show up to 50 columns and 1000 rows when a dataframe is displayed.
pd.set_option("display.max_columns", 50)
pd.set_option("display.max_rows", 1000)
# Lift Altair's row limit on chart datasets.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

### Grab data from Census surnames API

In [4]:
# Docs: https://www.census.gov/data/developers/data-sets/surnames.html

In [5]:
# Census years to request; kept as strings because each one is concatenated into the API URL.
years = ["2000", "2010"]

### Construct URLs, read the data, and concatenate into one dataframe

In [6]:
# Fetch one API response per year and tag its rows with that year.
frames = []

for y in years:
    frames.append(
        pd.read_json(
            "https://api.census.gov/data/"
            + y
            + "/surname?get=NAME,COUNT,RANK,PCTAPI,PCTBLACK,PCTAIAN,PCTWHITE,PCT2PRACE,PCTHISPANIC&PROP100K=1:100000"
        ).assign(year=y)
    )

# Concatenate once, after the loop, rather than rebuilding `src` on every iteration.
# Row labels are intentionally left as-is (no ignore_index): the clean-up step
# further down drops label 0, which therefore matches one row per year.
src = pd.concat(frames)

### How many rows were returned for each year?

In [7]:
# Row count per year in the freshly concatenated frame (before any clean-up).
src["year"].value_counts()

2000    10887
2010    10852
Name: year, dtype: int64

### Clean up the dataframe

In [8]:
# Positional rename: the labels must stay in the same order as the frame's columns
# (presumably the nine `get=` fields from the URL, then PROP100K, then the added
# `year` column -- confirm against the API response if the request ever changes).
src.columns = (
    "name count rank pctapi pctblack pctaian "
    "pctwhite pcttworace pcthispanic prop100k year"
).split()

In [9]:
# Remove every row labelled 0. Because the per-year frames were concatenated with
# their original labels, this hits one row per year -- presumably the header row
# that the API returns as the first record (TODO confirm).
src = src.drop(index=0)

In [10]:
# Keep only rows for individual surnames by excluding the "ALL OTHER NAMES" entry.
src = src.loc[src["name"].ne("ALL OTHER NAMES")]

In [11]:
# Blank out cells whose entire value is "(S)" (presumably the Census marker for a
# suppressed figure -- confirm against the API docs), then blank out missing values too.
src = src.replace(to_replace="(S)", value="")
src = src.fillna(value="")

In [12]:
# Cast the columns used for sorting and arithmetic to numeric types,
# then title-case the surnames.
src = src.astype({"count": int, "prop100k": float})
src["name"] = src["name"].str.title()

### Make a copy for analysis

In [13]:
# Working copy for the analysis, ordered from the highest count to the lowest;
# `src` itself is left untouched from here on.
df = src.sort_values("count", ascending=False).copy()

In [26]:
# Display the sorted frame (pandas truncates the middle rows in the rendered output).
df

Unnamed: 0,name,count,rank,pctapi,pctblack,pctaian,pctwhite,pcttworace,pcthispanic,prop100k,year
1,Smith,2442977,1,0.5,23.11,0.89,70.9,2.19,2.4,828.19,2010
1,Smith,2376206,1,0.4,22.22,0.85,73.35,1.63,1.56,880.85,2000
2,Johnson,1932812,2,0.54,34.63,0.94,58.97,2.56,2.36,655.24,2010
2,Johnson,1857160,2,0.42,33.8,0.91,61.55,1.82,1.5,688.44,2000
3,Williams,1625252,3,0.46,47.68,0.82,45.75,2.81,2.49,550.97,2010
...,...,...,...,...,...,...,...,...,...,...,...
10881,Amado,2686,10881,4.43,9.27,0.56,19.4,12.88,53.46,1.00,2000
10882,Bovee,2686,10881,0.71,0.48,0.37,95.09,1.75,1.6,1.00,2000
10883,Muench,2686,10881,0.3,0.26,0.22,97.39,0.74,1.08,1.00,2000
10885,Harger,2685,10885,0.3,2.83,0.63,93.33,1.75,1.15,1.00,2000


---

### Group by name and year

In [14]:
# One row per (name, year) pair holding the summed prop100k value.
name_year = df.groupby(["name", "year"], as_index=False)["prop100k"].sum()

In [25]:
# Spot-check a single surname across both years.
name_year.loc[name_year["name"].eq("Stiles")]

Unnamed: 0,name,year,prop100k
18937,Stiles,2000,7.77
18938,Stiles,2010,7.24


In [16]:
# Reshape to one row per surname with one prop100k column per year.
# The aggregation is given by name ("sum") because passing the NumPy callable
# `np.sum` is deprecated in recent pandas releases and emits a FutureWarning;
# the result is the same.
df_pivot = pd.pivot_table(
    name_year, values="prop100k", index="name", columns="year", aggfunc="sum"
).reset_index()

In [17]:
# Percent change from the 2000 column to the 2010 column, rounded to two decimals.
df_pivot["change"] = (
    df_pivot["2010"]
    .sub(df_pivot["2000"])
    .div(df_pivot["2000"])
    .mul(100)
    .round(2)
)

In [18]:
# Ten surnames with the largest percent increase between 2000 and 2010.
# NOTE(review): extreme values such as "Hail" (+2658%) look implausible and may
# reflect a quirk in the source data -- verify before relying on them.
df_pivot.sort_values("change", ascending=False).head(10)

year,name,2000,2010,change
4264,Hail,1.06,29.24,2658.49
8024,Person,7.09,22.85,222.28
7517,Nilson,1.28,3.39,164.84
11267,Zheng,3.96,8.56,116.16
719,Begum,1.99,4.15,108.54
5010,Hussein,1.17,2.43,107.69
5198,Jiang,3.18,6.58,106.92
3725,Gao,2.0,4.13,106.5
7133,Mohamed,4.63,9.48,104.75
5316,Kaur,8.09,16.53,104.33


---

### Popular Korean surnames

In [19]:
# Surnames treated as Korean for this analysis. Each entry is passed through
# str.title() so its casing matches the title-cased `name` column.
korean_surnames = [
    surname.title()
    for surname in (
        "Kim",
        "Lee",
        "Park",
        "Choi",
        "Chung",
        "Cho",
        "Kang",
        "Chang",
        "Shin",
        "Yoon",
        "Han",
        "Oh",
        "Hong",
        "Song",
        "Kwon",
        "Yoo",
        "Ahn",
        "Hwang",
        "Rhee",
        "Lim",
        "Suh",
        "Yang",
        "Moon",
        "Chun",
        "Ko",
        "Pak",
        "Choe",
        "Yi",
        "Jung",
        "Min",
        "Yu",
        "Yun",
        "Hahn",
        "Nam",
        "Cha",
        "Paik",
        "Ha",
        "Son",
        "Bae",
        "Koh",
        "Kwak",
        "Shim",
        "Yim",
        "Jang",
        "Ryu",
        "Huh",
        "Im",
        "Sohn",
        "Sung",
        "Jun",
    )
]

In [20]:
# Subset of the per-name, per-year table limited to the surnames listed above.
koreans = name_year.loc[name_year["name"].isin(korean_surnames)]

In [21]:
# Preview the first ten rows of the Korean-surname subset.
koreans.head(10)

Unnamed: 0,name,year,prop100k
156,Ahn,2000,3.01
157,Ahn,2010,3.58
840,Bae,2000,1.5
841,Bae,2010,2.1
3506,Cha,2000,2.85
3507,Cha,2010,3.3
3565,Chang,2000,25.86
3566,Chang,2010,27.78
3724,Cho,2000,9.38
3725,Cho,2010,11.35


---

## Exports 

In [22]:
# Create the target folder first so the export also works on a fresh checkout
# where it does not exist yet (to_csv does not create missing directories).
os.makedirs("data/raw", exist_ok=True)
df.to_csv("data/raw/raw_census_surnames_2000_2010.csv", index=False)

In [23]:
# Create the target folder first so the export also works on a fresh checkout
# where it does not exist yet (to_csv does not create missing directories).
os.makedirs("data/processed", exist_ok=True)
koreans.to_csv("data/processed/top_korean_surnames.csv", index=False)

In [24]:
# Create the target folder first so the export also works on a fresh checkout
# where it does not exist yet (to_csv does not create missing directories).
os.makedirs("data/processed", exist_ok=True)
df_pivot.to_csv("data/processed/surnames_change_2000_to_2010.csv", index=False)