# Surnames from the U.S. Census Bureau

In [1]:
%load_ext lab_black

In [2]:
import pandas as pd
import random

### Read the data

In [3]:
# Main url: https://www.census.gov/data/developers/data-sets/surnames.html
# Data documentation: https://www2.census.gov/topics/genealogy/2010surnames/surnames.pdf

In [4]:
# Read the Census surname table, keeping `year` as a string column.
df = pd.read_csv(
    filepath_or_buffer="../data/surnames.csv",
    dtype={"year": str},
)

### What does the dataframe look like?

In [5]:
# Preview the first ten rows to see the columns and some typical values.
df.head(10)

Unnamed: 0,name,count,rank,pct_api,pct_black,pct_aian,pct_white,pct_two_race,pct_hispanic,per_100k,year
0,Smith,2442977,1,0.5,23.11,0.89,70.9,2.19,2.4,828.19,2010
1,Smith,2376206,1,0.4,22.22,0.85,73.35,1.63,1.56,880.85,2000
2,Johnson,1932812,2,0.54,34.63,0.94,58.97,2.56,2.36,655.24,2010
3,Johnson,1857160,2,0.42,33.8,0.91,61.55,1.82,1.5,688.44,2000
4,Williams,1625252,3,0.46,47.68,0.82,45.75,2.81,2.49,550.97,2010
5,Williams,1534042,3,0.37,46.72,0.78,48.52,2.01,1.6,568.66,2000
6,Brown,1437026,4,0.51,35.6,0.87,57.95,2.55,2.52,487.16,2010
7,Jones,1425470,5,0.44,38.48,1.0,55.19,2.61,2.29,483.24,2010
8,Brown,1380145,4,0.41,34.54,0.83,60.71,1.86,1.64,511.62,2000
9,Jones,1362755,5,0.35,37.73,0.94,57.69,1.85,1.44,505.17,2000


In [6]:
# Print each column's dtype and non-null count, plus overall memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21737 entries, 0 to 21736
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   name          21737 non-null  object 
 1   count         21737 non-null  int64  
 2   rank          21737 non-null  int64  
 3   pct_api       21737 non-null  float64
 4   pct_black     21737 non-null  float64
 5   pct_aian      21737 non-null  float64
 6   pct_white     21737 non-null  float64
 7   pct_two_race  21737 non-null  float64
 8   pct_hispanic  21737 non-null  float64
 9   per_100k      21737 non-null  float64
 10  year          21737 non-null  object 
dtypes: float64(7), int64(2), object(2)
memory usage: 1.8+ MB


In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,count,rank,pct_api,pct_black,pct_aian,pct_white,pct_two_race,pct_hispanic,per_100k
count,21737.0,21737.0,21737.0,21737.0,21737.0,21737.0,21737.0,21737.0,21737.0
mean,17959.02,5434.271335,4.256858,9.425759,0.715929,71.849012,1.620553,12.094523,6.359557
std,61710.97,3137.197749,16.521642,14.450224,2.77998,31.037646,1.324449,27.213926,21.829534
min,2360.0,1.0,0.0,0.0,0.0,0.19,0.0,0.0,0.87
25%,3910.0,2717.0,0.39,0.44,0.24,64.31,1.08,1.5,1.38
50%,6145.0,5435.0,0.53,2.48,0.45,86.26,1.46,2.05,2.18
75%,12744.0,8151.0,0.77,13.1,0.71,94.16,1.89,2.82,4.5
max,2442977.0,12122.0,98.63,96.75,96.23,99.4,40.25,97.95,880.85


### Which names are majority "black"?

In [8]:
# "Majority" means more than half, so the comparison is strict: a name that is
# exactly 50% Black is not majority Black. No year filter is applied, so rows
# from both 2000 and 2010 appear in the result.
df[df["pct_black"] > 50].head(10)

Unnamed: 0,name,count,rank,pct_api,pct_black,pct_aian,pct_white,pct_two_race,pct_hispanic,per_100k,year
32,Jackson,708099,19,0.39,53.04,1.06,39.89,3.12,2.5,240.05,2010
39,Jackson,666125,18,0.31,53.02,1.04,41.93,2.18,1.53,246.93,2000
268,Washington,177386,145,0.3,87.53,0.68,5.17,3.78,2.54,60.14,2010
296,Washington,163036,138,0.25,89.87,0.64,5.16,2.64,1.45,60.44,2000
549,Banks,105833,292,0.36,54.51,0.43,39.27,2.98,2.45,35.88,2010
583,Joseph,100959,313,9.76,54.19,0.99,29.59,2.49,2.98,34.23,2010
592,Banks,99294,278,0.3,54.24,0.41,41.3,2.28,1.47,36.81,2000
1038,Charles,61211,548,1.01,52.96,2.07,33.69,2.37,7.9,20.75,2010
1157,Jefferson,55179,615,0.4,74.24,1.9,17.45,3.54,2.47,18.71,2010
1257,Jefferson,51361,594,0.25,75.24,1.85,18.72,2.38,1.57,19.04,2000


### How many unique names? 

In [9]:
# Number of distinct surnames across all rows. `dropna=False` keeps the count
# identical to measuring the length of `unique()`, which would include NaN.
df["name"].nunique(dropna=False)

11292

2000    10887
2010    10850
Name: year, dtype: int64

### How many in each year? 

In [12]:
# Count how many rows exist for each value of `year`.
df["year"].value_counts()

2000    10887
2010    10850
Name: year, dtype: int64

### Surnames in our class (the simple versions, at least)

In [13]:
# Class roster: one simplified surname per student, in the order first entered.
students = [
    "Vergara", "Chua", "Guo", "Harmon", "Kang", "Martinez", "Sinha", "Song", "Wang"
]

### Select students at random into three teams

In [14]:
# Shuffle with a dedicated, seeded generator so that "Restart & Run All"
# reproduces the same order (on a given Python version) instead of a new one
# every time. Sorting first makes the cell idempotent: the result depends only
# on the roster and the seed, not on how many times this cell has been run.
TEAM_SEED = 554  # arbitrary fixed value; change it to draw different teams
students = random.Random(TEAM_SEED).sample(sorted(students), k=len(students))
students

['Song',
 'Vergara',
 'Kang',
 'Guo',
 'Harmon',
 'Martinez',
 'Wang',
 'Sinha',
 'Chua']

In [15]:
# Another way to do random selection here: https://observablehq.com/@mattstiles/jour-554-surnames

In [16]:
# Team one: the first three names in the shuffled roster.
team_one = students[:3]
team_one

['Song', 'Vergara', 'Kang']

In [17]:
# Team two: the fourth through sixth names in the shuffled roster.
team_two = students[3:6]
team_two

['Guo', 'Harmon', 'Martinez']

In [18]:
# Team three: the seventh through ninth names in the shuffled roster.
team_three = students[6:9]
team_three

['Wang', 'Sinha', 'Chua']

---

### The teams computed above depend on the shuffle. Here are the teams we are actually using.

In [None]:
# Final team assignments, written out by hand. These replace whatever the
# slices of the shuffled roster produced earlier in the notebook.
team_one, team_two, team_three = (
    ["Sinha", "Song", "Wang"],
    ["Martinez", "Chua", "Vergara"],
    ["Harmon", "Kang", "Guo"],
)

---

### Find your last name!

In [None]:
# Starter for the exercise: keep only the rows whose `name` equals one surname.
# Replace "Smith" with your own last name, capitalized the way the data stores
# it (e.g. "Smith", "Johnson"). The original `df[]` was a SyntaxError, which
# stopped "Run All" at this cell.
my_surname = "Smith"
df[df["name"] == my_surname]

### Find the surnames of your teammates, and sort ascending by surname

### Who on your team has the most popular name, measured by the rate per 100,000 (`per_100k`)?

### In 2010, find the 5 names that are most Asian

### Among majority Hispanic names in 2010, which five were most common?

### How did the frequency of your name change, in percentage terms, between 2000 and 2010?

In [None]:
### Hint: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.pct_change.html
### Don't forget to sort!

---

### BONUS: Group by name and year for surnames in our class, aggregating with the mean per_100k value

### BONUS: Calculate the % change in surnames for each name from 2000 to 2010

In [None]:
# Hint: pivot_table()
# Hint: % change = (new number - old number) / old number × 100