# Surnames from the U.S. Census Bureau

In [1]:
%load_ext lab_black

In [2]:
import pandas as pd
import random

### Read the data

In [3]:
# Main url: https://www.census.gov/data/developers/data-sets/surnames.html
# Data documentation: https://www2.census.gov/topics/genealogy/2010surnames/surnames.pdf

In [4]:
# Load the surnames file. `year` is read as text so it can be compared against
# string labels such as "2010" rather than being parsed as an integer.
df = pd.read_csv(
    "data/surnames.csv",
    dtype={"year": str},
)

### What does the dataframe look like?

In [5]:
# Column names, non-null counts, dtypes and memory footprint of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21737 entries, 0 to 21736
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   name          21737 non-null  object 
 1   count         21737 non-null  int64  
 2   rank          21737 non-null  int64  
 3   pct_api       21737 non-null  float64
 4   pct_black     21737 non-null  float64
 5   pct_aian      21737 non-null  float64
 6   pct_white     21737 non-null  float64
 7   pct_two_race  21737 non-null  float64
 8   pct_hispanic  21737 non-null  float64
 9   per_100k      21737 non-null  float64
 10  year          21737 non-null  object 
dtypes: float64(7), int64(2), object(2)
memory usage: 1.8+ MB


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,count,rank,pct_api,pct_black,pct_aian,pct_white,pct_two_race,pct_hispanic,per_100k
count,21737.0,21737.0,21737.0,21737.0,21737.0,21737.0,21737.0,21737.0,21737.0
mean,17959.02,5434.271335,4.256858,9.425759,0.715929,71.849012,1.620553,12.094523,6.359557
std,61710.97,3137.197749,16.521642,14.450224,2.77998,31.037646,1.324449,27.213926,21.829534
min,2360.0,1.0,0.0,0.0,0.0,0.19,0.0,0.0,0.87
25%,3910.0,2717.0,0.39,0.44,0.24,64.31,1.08,1.5,1.38
50%,6145.0,5435.0,0.53,2.48,0.45,86.26,1.46,2.05,2.18
75%,12744.0,8151.0,0.77,13.1,0.71,94.16,1.89,2.82,4.5
max,2442977.0,12122.0,98.63,96.75,96.23,99.4,40.25,97.95,880.85


In [7]:
# Peek at the first 10 rows.
df.head(10)

Unnamed: 0,name,count,rank,pct_api,pct_black,pct_aian,pct_white,pct_two_race,pct_hispanic,per_100k,year
0,Smith,2442977,1,0.5,23.11,0.89,70.9,2.19,2.4,828.19,2010
1,Smith,2376206,1,0.4,22.22,0.85,73.35,1.63,1.56,880.85,2000
2,Johnson,1932812,2,0.54,34.63,0.94,58.97,2.56,2.36,655.24,2010
3,Johnson,1857160,2,0.42,33.8,0.91,61.55,1.82,1.5,688.44,2000
4,Williams,1625252,3,0.46,47.68,0.82,45.75,2.81,2.49,550.97,2010
5,Williams,1534042,3,0.37,46.72,0.78,48.52,2.01,1.6,568.66,2000
6,Brown,1437026,4,0.51,35.6,0.87,57.95,2.55,2.52,487.16,2010
7,Jones,1425470,5,0.44,38.48,1.0,55.19,2.61,2.29,483.24,2010
8,Brown,1380145,4,0.41,34.54,0.83,60.71,1.86,1.64,511.62,2000
9,Jones,1362755,5,0.35,37.73,0.94,57.69,1.85,1.44,505.17,2000


### Which names are majority "black"?

In [8]:
# Rows where pct_black is strictly above 50; show the first 10.
df.loc[df["pct_black"] > 50].head(10)

Unnamed: 0,name,count,rank,pct_api,pct_black,pct_aian,pct_white,pct_two_race,pct_hispanic,per_100k,year
32,Jackson,708099,19,0.39,53.04,1.06,39.89,3.12,2.5,240.05,2010
39,Jackson,666125,18,0.31,53.02,1.04,41.93,2.18,1.53,246.93,2000
268,Washington,177386,145,0.3,87.53,0.68,5.17,3.78,2.54,60.14,2010
296,Washington,163036,138,0.25,89.87,0.64,5.16,2.64,1.45,60.44,2000
549,Banks,105833,292,0.36,54.51,0.43,39.27,2.98,2.45,35.88,2010
583,Joseph,100959,313,9.76,54.19,0.99,29.59,2.49,2.98,34.23,2010
592,Banks,99294,278,0.3,54.24,0.41,41.3,2.28,1.47,36.81,2000
1038,Charles,61211,548,1.01,52.96,2.07,33.69,2.37,7.9,20.75,2010
1157,Jefferson,55179,615,0.4,74.24,1.9,17.45,3.54,2.47,18.71,2010
1257,Jefferson,51361,594,0.25,75.24,1.85,18.72,2.38,1.57,19.04,2000


### How many unique names? 

In [9]:
# Number of distinct surnames. dropna=False counts a missing value as its own
# entry, matching what len(Series.unique()) would report.
df["name"].nunique(dropna=False)

11292

### How many in each year? 

In [10]:
# Row count per year label.
df["year"].value_counts()

2000    10887
2010    10850
Name: year, dtype: int64

### Surnames in our class (the simple versions, at least)

In [11]:
# Class roster, one surname per student.
# fmt: off
students = [
    "Vergara", "Chua", "Guo",
    "Harmon", "Kang", "Martinez",
    "Sinha", "Song", "Wang",
]
# fmt: on

### Select students at random into three teams

In [12]:
# random.shuffle reorders `students` in place, and no seed is set in this cell,
# so the resulting order can differ every time the cell is run.
random.shuffle(students)
students

['Guo',
 'Kang',
 'Chua',
 'Wang',
 'Harmon',
 'Sinha',
 'Song',
 'Vergara',
 'Martinez']

In [13]:
# Another way to do random selection here: https://observablehq.com/@mattstiles/jour-554-surnames

In [14]:
# First three names of the shuffled list.
team_one = students[0:3]
team_one

['Guo', 'Kang', 'Chua']

In [15]:
# Names at positions 3-5 of the shuffled list.
team_two = students[3:6]
team_two

['Wang', 'Harmon', 'Sinha']

In [16]:
# Names at positions 6-8 of the shuffled list.
team_three = students[6:9]
team_three

['Song', 'Vergara', 'Martinez']

---

### The shuffle above changes each time the notebook runs, so here are the final teams.

In [17]:
# Fixed team assignments; these overwrite the slices taken from the shuffle.
team_one, team_two, team_three = (
    ["Sinha", "Song", "Wang"],
    ["Martinez", "Chua", "Vergara"],
    ["Harmon", "Kang", "Guo"],
)

---

### Find your last name!

In [18]:
# All rows for one surname, largest count first (at most 20 rows shown).
(
    df.loc[df["name"].eq("Stiles")]
    .sort_values(by="count", ascending=False)
    .head(20)
)

Unnamed: 0,name,count,rank,pct_api,pct_black,pct_aian,pct_white,pct_two_race,pct_hispanic,per_100k,year
3228,Stiles,21357,1689,0.52,3.71,0.75,90.83,1.83,2.35,7.24,2010
3284,Stiles,20957,1570,0.39,3.58,0.64,92.56,1.33,1.5,7.77,2000


### Find the surnames of your teammates, and sort ascending by surname

In [19]:
# Rows whose surname belongs to team_one, ordered alphabetically by surname.
df.loc[df["name"].isin(team_one)].sort_values(by="name")

Unnamed: 0,name,count,rank,pct_api,pct_black,pct_aian,pct_white,pct_two_race,pct_hispanic,per_100k,year
15698,Sinha,4066,8155,91.22,0.42,0.3,4.57,2.78,0.71,1.38,2010
21736,Sinha,2360,12122,88.26,0.34,0.21,5.72,4.92,0.55,0.87,2000
2705,Song,25110,1415,95.15,0.45,0.04,1.98,1.74,0.65,8.51,2010
4113,Song,16856,1962,95.12,0.39,0.06,2.08,1.91,0.43,6.25,2000
529,Wang,109883,282,95.24,0.29,0.02,2.59,1.5,0.35,37.25,2010
932,Wang,67570,438,94.47,0.19,0.03,3.25,1.73,0.33,25.05,2000


### Who on your team has the most popular name, in terms of the per-100,000 rate?

In [20]:
# Same team_one rows, ranked from highest to lowest per_100k.
df.loc[df["name"].isin(team_one)].sort_values(by="per_100k", ascending=False)

Unnamed: 0,name,count,rank,pct_api,pct_black,pct_aian,pct_white,pct_two_race,pct_hispanic,per_100k,year
529,Wang,109883,282,95.24,0.29,0.02,2.59,1.5,0.35,37.25,2010
932,Wang,67570,438,94.47,0.19,0.03,3.25,1.73,0.33,25.05,2000
2705,Song,25110,1415,95.15,0.45,0.04,1.98,1.74,0.65,8.51,2010
4113,Song,16856,1962,95.12,0.39,0.06,2.08,1.91,0.43,6.25,2000
15698,Sinha,4066,8155,91.22,0.42,0.3,4.57,2.78,0.71,1.38,2010
21736,Sinha,2360,12122,88.26,0.34,0.21,5.72,4.92,0.55,0.87,2000


### In 2010, find the 5 names with the highest Asian and Pacific Islander share (`pct_api`)

In [21]:
# Restrict to the 2010 rows, rank by pct_api descending, keep the top 5.
(
    df.loc[df["year"].eq("2010")]
    .sort_values(by="pct_api", ascending=False)
    .head(5)
)

Unnamed: 0,name,count,rank,pct_api,pct_black,pct_aian,pct_white,pct_two_race,pct_hispanic,per_100k,year
17259,Zhen,3661,8941,98.63,0.0,0.0,0.57,0.46,0.27,1.24,2010
16821,Kuang,3767,8712,98.38,0.27,0.0,0.19,0.64,0.53,1.28,2010
2639,Xu,25622,1381,98.25,0.12,0.02,0.83,0.55,0.24,8.69,2010
3240,Zhu,21265,1694,98.23,0.0,0.0,0.88,0.58,0.16,7.21,2010
14505,Qiu,4424,7520,98.21,0.0,0.0,0.84,0.75,0.0,1.5,2010


### Among majority Hispanic names in 2010, which five were most common?

In [22]:
# 2010 rows with pct_hispanic strictly above 50, ranked by per_100k; top 5.
(
    df.loc[df["year"].eq("2010") & df["pct_hispanic"].gt(50)]
    .sort_values(by="per_100k", ascending=False)
    .head(5)
)

Unnamed: 0,name,count,rank,pct_api,pct_black,pct_aian,pct_white,pct_two_race,pct_hispanic,per_100k,year
10,Garcia,1166120,6,1.41,0.45,0.47,5.38,0.26,92.03,395.32,2010
14,Rodriguez,1094924,9,0.57,0.54,0.18,4.75,0.18,93.77,371.19,2010
16,Martinez,1060159,10,0.6,0.49,0.51,5.28,0.22,92.91,359.4,2010
17,Hernandez,1043281,11,0.6,0.36,0.19,3.79,0.16,94.89,353.68,2010
18,Lopez,874523,12,1.02,0.57,0.38,4.86,0.25,92.92,296.47,2010


### How did the frequency of your name change, in percentage terms, between 2000 and 2010?

In [23]:
### Hint: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.pct_change.html
### Don't forget to sort!

In [24]:
# Sort by year so the 2000 row precedes the 2010 row; pct_change() compares
# each row with the one before it.
stiles = df.loc[df["name"].eq("Stiles")].sort_values(by="year")

In [25]:
# Row-over-row change in count as a percentage, rounded to one decimal place.
# The first row has no predecessor, so its value is NaN.
stiles["count"].pct_change().mul(100).round(1)

3284    NaN
3228    1.9
Name: count, dtype: float64

In [26]:
# Show the year-sorted Stiles rows that the percentage change was computed from.
stiles

Unnamed: 0,name,count,rank,pct_api,pct_black,pct_aian,pct_white,pct_two_race,pct_hispanic,per_100k,year
3284,Stiles,20957,1570,0.39,3.58,0.64,92.56,1.33,1.5,7.77,2000
3228,Stiles,21357,1689,0.52,3.71,0.75,90.83,1.83,2.35,7.24,2010


---

### BONUS: Group by name and year for surnames in our class, aggregating with the mean count value

In [27]:
# One row per (name, year) for the class surnames, holding the mean of `count`.
class_rows = df["name"].isin(students)
name_year = (
    df.loc[class_rows]
    .groupby(["name", "year"])["count"]
    .mean()
    .reset_index()
)
del class_rows  # temporary mask only; keep the notebook namespace unchanged

### BONUS: Calculate the % change in count for each surname from 2000 to 2010

In [28]:
# Hint: pivot_table()
# Hint: % change = (new number - old number) / old number × 100

In [29]:
# Reshape to one row per surname with a column per year label.
# fill_value=0 puts a 0 wherever a name/year combination is absent.
name_year_pivot = name_year.pivot_table(
    index="name",
    columns="year",
    values="count",
    fill_value=0,
).reset_index()

In [30]:
# Percentage change in count from 2000 to 2010, rounded to two decimals.
# A 0 in the "2000" column means the pivot found no 2000 row for that surname
# (it fills gaps with 0). Dividing by it would give inf, so the zero baseline
# is masked to NaN and the change is reported as "not computable" instead.
name_year_pivot["pct_change"] = (
    (name_year_pivot["2010"] - name_year_pivot["2000"])
    .div(name_year_pivot["2000"].where(name_year_pivot["2000"] != 0))
    .mul(100)
    .round(2)
)

In [31]:
# Rank surnames from largest to smallest percentage change.
name_year_pivot.sort_values(by="pct_change", ascending=False)

year,name,2000,2010,pct_change
1,Guo,6058,12048,98.88
5,Sinha,2360,4066,72.29
8,Wang,67570,109883,62.62
6,Song,16856,25110,48.97
0,Chua,2904,4309,48.38
7,Vergara,10844,15618,44.02
4,Martinez,775072,1060159,36.78
3,Kang,23565,32221,36.73
2,Harmon,72414,74737,3.21
