# Surnames from the U.S. Census Bureau

In [1]:
%load_ext lab_black

In [2]:
import pandas as pd
import random

In [3]:
# Main url: https://www.census.gov/data/developers/data-sets/surnames.html

In [4]:
# Data documentation: https://www2.census.gov/topics/genealogy/2010surnames/surnames.pdf

In [5]:
# API call: https://api.census.gov/data/{YEAR}/surname?get=NAME,COUNT,RANK,PCTAPI,PCTBLACK,PCTAIAN,PCTWHITE,PCT2PRACE,PCTHISPANIC&PROP100K=1:1000

### Which years do we seek?

In [6]:
years = ["2000", "2010"]

### Hit the API for 2000, 2010 and retrieve fields we want

In [7]:
# empty list to store the returns
frames = []

# loop though years
for y in years:
    # append the eventual return to our empty list
    frames.append(
        # read the json returned by census from constructed url, and then assigned a year column
        pd.read_json(
            "https://api.census.gov/data/"
            + y
            + "/surname?get=NAME,COUNT,RANK,PCTAPI,PCTBLACK,PCTAIAN,PCTWHITE,PCT2PRACE,PCTHISPANIC&PROP100K=0:1000"
        ).assign(year=y)
    )
    # concat the two frames, one for each decade, into a data frame
    surnames = pd.concat(frames)

### Clean up the columns

In [8]:
surnames.columns = [
    "name",
    "count",
    "rank",
    # pct_api = asian/pacific islander
    "pct_api",
    "pct_black",
    # pct_aian = american indian/alaska native
    "pct_aian",
    "pct_white",
    "pct_two_race",
    "pct_hispanic",
    "per_100k",
    "year",
]

### Drop the returned headers that we don't need, and restriction flags, and "ALL OTHER NAMES"

In [9]:
surnames.drop([0], axis=0, inplace=True)

In [10]:
src = surnames[surnames["name"] != "ALL OTHER NAMES"].replace("(S)", 0).fillna(0)

### How many in each year?

In [11]:
src.year.value_counts()

2010    162253
2000    151671
Name: year, dtype: int64

### Convert data types

In [12]:
src["count"] = src["count"].astype(int)
src["name"] = src["name"].str.title()

In [13]:
src[
    [
        "pct_api",
        "pct_black",
        "pct_aian",
        "pct_white",
        "pct_two_race",
        "pct_hispanic",
        "per_100k",
    ]
] = src[
    [
        "pct_api",
        "pct_black",
        "pct_aian",
        "pct_white",
        "pct_two_race",
        "pct_hispanic",
        "per_100k",
    ]
].astype(
    float
)

### Reduce the size of the dataframe, but save Tannistha

In [14]:
df = (
    src[(src["per_100k"] >= 1) | (src["name"] == "Sinha")]
    .sort_values("count", ascending=False)
    .copy()
)

### How many names? 

In [15]:
len(df)

21737

In [16]:
df.head()

### Export

In [37]:
df.to_csv("data/surnames.csv", index=False)
df.to_json("data/surnames.json", indent=4, orient="records")

### Surnames in our class (the simple versions, at least)

In [17]:
students = [
    "Vergara",
    "Chua",
    "Guo",
    "Harmon",
    "Kang",
    "Martinez",
    "Sinha",
    "Song",
    "Wang",
]

### Select students at random into three teams

In [18]:
random.shuffle(students)
students

['Martinez',
 'Song',
 'Wang',
 'Chua',
 'Sinha',
 'Guo',
 'Vergara',
 'Harmon',
 'Kang']

In [19]:
# Another way to do random selection here: https://observablehq.com/@mattstiles/jour-554-surnames

In [20]:
team_one = students[0:3]
team_one

['Martinez', 'Song', 'Wang']

In [21]:
team_two = students[3:6]
team_two

['Chua', 'Sinha', 'Guo']

In [22]:
team_three = students[6:9]
team_three

['Vergara', 'Harmon', 'Kang']

---

### Find your last name!

In [23]:
df[df["name"] == "Stiles"].sort_values("count", ascending=False).head(20)

Unnamed: 0,name,count,rank,pct_api,pct_black,pct_aian,pct_white,pct_two_race,pct_hispanic,per_100k,year
1689,Stiles,21357,1689,0.52,3.71,0.75,90.83,1.83,2.35,7.24,2010
1570,Stiles,20957,1570,0.39,3.58,0.64,92.56,1.33,1.5,7.77,2000


### Find the surnames of your teammates, and sort ascending by surname

In [24]:
df[df["name"].isin(team_one)].sort_values("name")

Unnamed: 0,name,count,rank,pct_api,pct_black,pct_aian,pct_white,pct_two_race,pct_hispanic,per_100k,year
10,Martinez,1060159,10,0.6,0.49,0.51,5.28,0.22,92.91,359.4,2010
11,Martinez,775072,11,0.6,0.52,0.64,6.04,0.46,91.72,287.32,2000
1415,Song,25110,1415,95.15,0.45,0.04,1.98,1.74,0.65,8.51,2010
1962,Song,16856,1962,95.12,0.39,0.06,2.08,1.91,0.43,6.25,2000
282,Wang,109883,282,95.24,0.29,0.02,2.59,1.5,0.35,37.25,2010
438,Wang,67570,438,94.47,0.19,0.03,3.25,1.73,0.33,25.05,2000


### Who in your team has the most popular names, in terms of per 100,000 rate

In [25]:
df[df["name"].isin(team_one)].sort_values("per_100k", ascending=False)

Unnamed: 0,name,count,rank,pct_api,pct_black,pct_aian,pct_white,pct_two_race,pct_hispanic,per_100k,year
10,Martinez,1060159,10,0.6,0.49,0.51,5.28,0.22,92.91,359.4,2010
11,Martinez,775072,11,0.6,0.52,0.64,6.04,0.46,91.72,287.32,2000
282,Wang,109883,282,95.24,0.29,0.02,2.59,1.5,0.35,37.25,2010
438,Wang,67570,438,94.47,0.19,0.03,3.25,1.73,0.33,25.05,2000
1415,Song,25110,1415,95.15,0.45,0.04,1.98,1.74,0.65,8.51,2010
1962,Song,16856,1962,95.12,0.39,0.06,2.08,1.91,0.43,6.25,2000


### In 2010, find the 5 names that are most Asian

In [26]:
df[df["year"] == "2010"].sort_values("pct_api", ascending=False).head(5)

Unnamed: 0,name,count,rank,pct_api,pct_black,pct_aian,pct_white,pct_two_race,pct_hispanic,per_100k,year
8942,Zhen,3661,8941,98.63,0.0,0.0,0.57,0.46,0.27,1.24,2010
8712,Kuang,3767,8712,98.38,0.27,0.0,0.19,0.64,0.53,1.28,2010
1381,Xu,25622,1381,98.25,0.12,0.02,0.83,0.55,0.24,8.69,2010
1694,Zhu,21265,1694,98.23,0.0,0.0,0.88,0.58,0.16,7.21,2010
7522,Qiu,4424,7520,98.21,0.0,0.0,0.84,0.75,0.0,1.5,2010


### Among majority Hispanic names in 2010, which five were most common?

In [27]:
df[(df["year"] == "2010") & (df["pct_hispanic"] > 50)].sort_values(
    "per_100k", ascending=False
).head(5)

Unnamed: 0,name,count,rank,pct_api,pct_black,pct_aian,pct_white,pct_two_race,pct_hispanic,per_100k,year
6,Garcia,1166120,6,1.41,0.45,0.47,5.38,0.26,92.03,395.32,2010
9,Rodriguez,1094924,9,0.57,0.54,0.18,4.75,0.18,93.77,371.19,2010
10,Martinez,1060159,10,0.6,0.49,0.51,5.28,0.22,92.91,359.4,2010
11,Hernandez,1043281,11,0.6,0.36,0.19,3.79,0.16,94.89,353.68,2010
12,Lopez,874523,12,1.02,0.57,0.38,4.86,0.25,92.92,296.47,2010


### How did the frequency of your name change in pct terms between 2000 and 2010?

In [28]:
### Hint: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.pct_change.html
### Don't forget to sort!

In [29]:
stiles = df[df["name"] == "Stiles"].sort_values("year")

In [30]:
(stiles["count"].pct_change() * 100).round(1)

1570    NaN
1689    1.9
Name: count, dtype: float64

In [31]:
stiles

Unnamed: 0,name,count,rank,pct_api,pct_black,pct_aian,pct_white,pct_two_race,pct_hispanic,per_100k,year
1570,Stiles,20957,1570,0.39,3.58,0.64,92.56,1.33,1.5,7.77,2000
1689,Stiles,21357,1689,0.52,3.71,0.75,90.83,1.83,2.35,7.24,2010


---

### Group by name and year for surnames in our class, aggregating with the mean per_100k value

In [32]:
name_year = (
    df[df["name"].isin(students)]
    .groupby(["name", "year"])
    .agg({"count": "mean"})
    .reset_index()
)

### BONUS: Calculate the % change in surnames for each name from 2000 to 2010

In [33]:
# Hints: pivot_table()
# Hints: % increase = (new number - old number) / original number × 100

In [34]:
name_year_pivot = pd.pivot_table(
    name_year, values="count", index="name", columns="year", fill_value=0
).reset_index()

In [35]:
name_year_pivot["pct_change"] = (
    (name_year_pivot["2010"] - name_year_pivot["2000"]) / name_year_pivot["2000"] * 100
).round(2)

In [36]:
name_year_pivot.sort_values("pct_change", ascending=False)

year,name,2000,2010,pct_change
1,Guo,6058,12048,98.88
5,Sinha,2360,4066,72.29
8,Wang,67570,109883,62.62
6,Song,16856,25110,48.97
0,Chua,2904,4309,48.38
7,Vergara,10844,15618,44.02
4,Martinez,775072,1060159,36.78
3,Kang,23565,32221,36.73
2,Harmon,72414,74737,3.21
