# Surnames from the U.S. Census Bureau API

In [1]:
%load_ext lab_black

In [2]:
import pandas as pd
import random

In [3]:
# Main url: https://www.census.gov/data/developers/data-sets/surnames.html

In [4]:
# Data documentation: https://www2.census.gov/topics/genealogy/2010surnames/surnames.pdf

In [5]:
# Base url: https://api.census.gov/data/{YEAR}/surname?
# Parameters: get=NAME,COUNT,RANK,PCTAPI,PCTBLACK,PCTAIAN,PCTWHITE,PCT2PRACE,PCTHISPANIC&PROP100K=1:1000

#### Read the data

In [6]:
# Pull the 2010 surname table straight from the Census API into a frame.
# The URL is assembled from adjacent literals: endpoint, requested fields,
# then the PROP100K range predicate.
surnames = pd.read_json(
    "https://api.census.gov/data/2010/surname"
    "?get=NAME,COUNT,RANK,PCTAPI,PCTBLACK,PCTAIAN,PCTWHITE,PCT2PRACE,PCTHISPANIC"
    "&PROP100K=1:1000"
)

#### Clean up the columns

In [7]:
# Readable snake_case labels, listed in the order the columns come back:
# identifying fields, the percentage breakdowns, then the per-100k rate.
id_cols = ["name", "count", "rank"]
share_cols = [
    "pct_api",
    "pct_black",
    "pct_aian",
    "pct_white",
    "pct_two_race",
    "pct_hispanic",
]
surnames.columns = id_cols + share_cols + ["per_100k"]

In [8]:
# Remove the row labelled 0 from the frame, modifying it in place.
surnames.drop(index=0, inplace=True)

#### First five records

In [9]:
# Preview the top five rows.
surnames.head(n=5)

Unnamed: 0,name,count,rank,pct_api,pct_black,pct_aian,pct_white,pct_two_race,pct_hispanic,per_100k
1,SMITH,2442977,1,0.5,23.11,0.89,70.9,2.19,2.4,828.19
2,JOHNSON,1932812,2,0.54,34.63,0.94,58.97,2.56,2.36,655.24
3,WILLIAMS,1625252,3,0.46,47.68,0.82,45.75,2.81,2.49,550.97
4,BROWN,1437026,4,0.51,35.6,0.87,57.95,2.55,2.52,487.16
5,JONES,1425470,5,0.44,38.48,1.0,55.19,2.61,2.29,483.24


#### Which years do we seek?

In [10]:
# The census years to request. Kept as strings: each value is inserted into
# the request URL and also stored as-is in the `year` column.
years = ["2000", "2010"]

#### Hit the API for 2000, 2010 and retrieve fields we want

In [11]:
# One frame per census year is collected here and combined after the loop.
frames = []

# Loop through the requested years.
for year in years:
    # Same fields for every year; only the year segment of the path changes.
    url = (
        f"https://api.census.gov/data/{year}/surname"
        "?get=NAME,COUNT,RANK,PCTAPI,PCTBLACK,PCTAIAN,PCTWHITE,PCT2PRACE,PCTHISPANIC"
        "&PROP100K=0:1000"
    )
    # Read the JSON returned by the Census API and tag every row with the
    # year it came from, so the decades stay distinguishable once combined.
    frames.append(pd.read_json(url).assign(year=year))

# Concatenate once, after every year has been fetched, rather than rebuilding
# the combined frame on each pass through the loop.
# Each frame keeps its own row labels (no ignore_index) because the header
# rows are removed afterwards by dropping label 0, which must match the
# first row of every per-year frame.
surnames = pd.concat(frames)

### Clean up the columns

In [12]:
# Readable snake_case labels, in the order the columns arrive: identifying
# fields, percentage breakdowns, the per-100k rate, then the year tag.
id_cols = ["name", "count", "rank"]
share_cols = [
    "pct_api",  # asian/pacific islander
    "pct_black",
    "pct_aian",  # american indian/alaska native
    "pct_white",
    "pct_two_race",
    "pct_hispanic",
]
surnames.columns = id_cols + share_cols + ["per_100k", "year"]

### Drop the returned header rows and the "ALL OTHER NAMES" rows, and zero out the "(S)" restriction flags

In [13]:
# Remove every row labelled 0, in place. The combined frame repeats row
# labels across years, so this takes out the label-0 row of each year.
surnames.drop(index=0, inplace=True)

In [14]:
# Keep individual surnames only, leaving out the "ALL OTHER NAMES" rows.
is_real_surname = surnames["name"].ne("ALL OTHER NAMES")

# Swap the "(S)" flag for 0 and fill any remaining missing values with 0.
src = surnames[is_real_surname].replace("(S)", 0).fillna(0)

### How many in each year?

In [15]:
# Number of surname rows per census year.
src["year"].value_counts()

2010    162253
2000    151671
Name: year, dtype: int64

### Convert data types

In [16]:
# Title-case the surnames (e.g. SMITH -> Smith); later lookups by name
# compare against this form.
src["name"] = src["name"].str.title()

# Make `count` an integer column so it can be sorted numerically.
src = src.astype({"count": int})

In [17]:
# Every percentage column plus the per-100k rate, cast to float in one step.
# The list is named once so the selection and the assignment cannot drift apart.
float_cols = [
    "pct_api",
    "pct_black",
    "pct_aian",
    "pct_white",
    "pct_two_race",
    "pct_hispanic",
    "per_100k",
]
src[float_cols] = src[float_cols].astype(float)

In [19]:
# Work from an independent (deep) copy so `src` itself stays untouched.
df = src.copy(deep=True)

---

### Find your last name!

In [20]:
# Names were title-cased earlier, so search for "Stiles" rather than "STILES".
matches = df[df["name"].eq("Stiles")]

# Largest counts first, capped at 20 rows.
matches.sort_values(by="count", ascending=False).head(20)

Unnamed: 0,name,count,rank,pct_api,pct_black,pct_aian,pct_white,pct_two_race,pct_hispanic,per_100k,year
1689,Stiles,21357,1689,0.52,3.71,0.75,90.83,1.83,2.35,7.24,2010
1570,Stiles,20957,1570,0.39,3.58,0.64,92.56,1.33,1.5,7.77,2000


---

### Export

In [24]:
# Write the cleaned table to disk; the row labels are left out of the file.
df.to_csv(path_or_buf="../data/processed/surnames.csv", index=False)