# TEAM ONE: Surnames from the U.S. Census Bureau

In [1]:
%load_ext lab_black

In [2]:
import pandas as pd
import random

### Read the data

In [3]:
# Main url: https://www.census.gov/data/developers/data-sets/surnames.html

In [4]:
# Data documentation: https://www2.census.gov/topics/genealogy/2010surnames/surnames.pdf

In [5]:
df = pd.read_csv("../data/surnames.csv", dtype={"year": str})

---

### Surnames in our class

In [6]:
students = [
    "Vergara",
    "Chua",
    "Guo",
    "Harmon",
    "Kang",
    "Martinez",
    "Sinha",
    "Song",
    "Wang",
]

### Teams

In [7]:
team_one = ["Sinha", "Song", "Wang"]

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21737 entries, 0 to 21736
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   name          21737 non-null  object 
 1   count         21737 non-null  int64  
 2   rank          21737 non-null  int64  
 3   pct_api       21737 non-null  float64
 4   pct_black     21737 non-null  float64
 5   pct_aian      21737 non-null  float64
 6   pct_white     21737 non-null  float64
 7   pct_two_race  21737 non-null  float64
 8   pct_hispanic  21737 non-null  float64
 9   per_100k      21737 non-null  float64
 10  year          21737 non-null  object 
dtypes: float64(7), int64(2), object(2)
memory usage: 1.8+ MB


### Filter the dataframe to just the last name of a team member

In [9]:
df[df["name"] == "Wang"]

Unnamed: 0,name,count,rank,pct_api,pct_black,pct_aian,pct_white,pct_two_race,pct_hispanic,per_100k,year
529,Wang,109883,282,95.24,0.29,0.02,2.59,1.5,0.35,37.25,2010
932,Wang,67570,438,94.47,0.19,0.03,3.25,1.73,0.33,25.05,2000


In [10]:
df[df["name"] == "Sinha"]

Unnamed: 0,name,count,rank,pct_api,pct_black,pct_aian,pct_white,pct_two_race,pct_hispanic,per_100k,year
15698,Sinha,4066,8155,91.22,0.42,0.3,4.57,2.78,0.71,1.38,2010
21736,Sinha,2360,12122,88.26,0.34,0.21,5.72,4.92,0.55,0.87,2000


In [11]:
df[df["name"] == "Song"]

Unnamed: 0,name,count,rank,pct_api,pct_black,pct_aian,pct_white,pct_two_race,pct_hispanic,per_100k,year
2705,Song,25110,1415,95.15,0.45,0.04,1.98,1.74,0.65,8.51,2010
4113,Song,16856,1962,95.12,0.39,0.06,2.08,1.91,0.43,6.25,2000


### Filter the dataframe to just the surnames of your teammates, and sort ascending by surname

In [12]:
df.groupby(["year"]).agg({"per_100k": "mean"}).reset_index().sort_values(
    "year", ascending=False
)

Unnamed: 0,year,per_100k
1,2010,6.381935
0,2000,6.337254


In [13]:
popular = df[df["name"].isin(team_one)].sort_values("name", ascending=True)

In [14]:
popular

Unnamed: 0,name,count,rank,pct_api,pct_black,pct_aian,pct_white,pct_two_race,pct_hispanic,per_100k,year
15698,Sinha,4066,8155,91.22,0.42,0.3,4.57,2.78,0.71,1.38,2010
21736,Sinha,2360,12122,88.26,0.34,0.21,5.72,4.92,0.55,0.87,2000
2705,Song,25110,1415,95.15,0.45,0.04,1.98,1.74,0.65,8.51,2010
4113,Song,16856,1962,95.12,0.39,0.06,2.08,1.91,0.43,6.25,2000
529,Wang,109883,282,95.24,0.29,0.02,2.59,1.5,0.35,37.25,2010
932,Wang,67570,438,94.47,0.19,0.03,3.25,1.73,0.33,25.05,2000


### Who in your team has the most popular names, in terms of per 100,000 rate

In [15]:
popular.sort_values("per_100k", ascending=False)

Unnamed: 0,name,count,rank,pct_api,pct_black,pct_aian,pct_white,pct_two_race,pct_hispanic,per_100k,year
529,Wang,109883,282,95.24,0.29,0.02,2.59,1.5,0.35,37.25,2010
932,Wang,67570,438,94.47,0.19,0.03,3.25,1.73,0.33,25.05,2000
2705,Song,25110,1415,95.15,0.45,0.04,1.98,1.74,0.65,8.51,2010
4113,Song,16856,1962,95.12,0.39,0.06,2.08,1.91,0.43,6.25,2000
15698,Sinha,4066,8155,91.22,0.42,0.3,4.57,2.78,0.71,1.38,2010
21736,Sinha,2360,12122,88.26,0.34,0.21,5.72,4.92,0.55,0.87,2000


### In 2010, find the 5 names among all names that are most Asian

In [16]:
df[df["year"] == "2010"].sort_values("pct_api", ascending=False).head()

Unnamed: 0,name,count,rank,pct_api,pct_black,pct_aian,pct_white,pct_two_race,pct_hispanic,per_100k,year
17259,Zhen,3661,8941,98.63,0.0,0.0,0.57,0.46,0.27,1.24,2010
16821,Kuang,3767,8712,98.38,0.27,0.0,0.19,0.64,0.53,1.28,2010
2639,Xu,25622,1381,98.25,0.12,0.02,0.83,0.55,0.24,8.69,2010
3240,Zhu,21265,1694,98.23,0.0,0.0,0.88,0.58,0.16,7.21,2010
14505,Qiu,4424,7520,98.21,0.0,0.0,0.84,0.75,0.0,1.5,2010


### Among majority Hispanic names in 2010, which five were most common?

In [17]:
df[df["year"] == "2010"].sort_values("pct_hispanic", ascending=False).head()

Unnamed: 0,name,count,rank,pct_api,pct_black,pct_aian,pct_white,pct_two_race,pct_hispanic,per_100k,year
18090,Vences,3470,9375,0.0,0.0,0.14,1.7,0.0,97.95,1.18,2010
6165,Ruvalcaba,11220,3217,0.08,0.08,0.15,1.68,0.06,97.95,3.8,2010
7818,Plascencia,8710,4079,0.1,0.08,0.07,1.99,0.07,97.69,2.95,2010
6864,Bahena,9983,3571,0.24,0.17,0.13,1.73,0.07,97.66,3.38,2010
10810,Chairez,6186,5619,0.13,0.0,0.0,2.07,0.08,97.61,2.1,2010


### How did the frequency of your team member's name above change, in pct terms between 2000 and 2010?

In [18]:
### Hint: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.pct_change.html
### Don't forget to sort!

---

### BONUS: Group by name and year for just surnames in our class, aggregating with the mean per_100k value

In [19]:
studentsdf = df[df["name"].isin(students)].sort_values("name", ascending=True)

In [20]:
studentsdf.groupby(["year"]).agg({"per_100k": "mean"}).reset_index().sort_values(
    "year", ascending=False
)

Unnamed: 0,year,per_100k
1,2010,50.403333
0,2000,40.268889


### BONUS: Calculate the % change in surnames for each name from 2000 to 2010

In [22]:
# Hints: pivot_table()
# Hints: % increase = (new number - old number) / original number Ã— 100

### BONUS: Clean up the notebook, removing empty cells