# 1. Transforming Data with dplyr
Learn verbs you can use to transform your data, including select, filter, arrange, and mutate. You'll use these functions to modify the counties dataset to view particular observations and answer questions about the data.

## The counties dataset
This particular dataset is from the 2015 United States Census.

### Chapter 1 verbs
- `select()`
- `filter()`
- `arrange()`
- `mutate()`

In [1]:
library(dplyr)

"package 'dplyr' was built under R version 3.6.3"
Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union



In [3]:
# Load the county-level census data from the serialized RDS file
counties <- readRDS(file = "counties.rds")

In [4]:
# Preview the first six rows of the dataset
head(counties, n = 6L)

census_id,state,county,region,metro,population,men,women,hispanic,white,...,other_transp,work_at_home,mean_commute,employed,private_work,public_work,self_employed,family_work,unemployment,land_area
1001,Alabama,Autauga,South,Metro,55221,26745,28476,2.6,75.8,...,1.3,1.8,26.5,23986,73.6,20.9,5.5,0.0,7.6,594.44
1003,Alabama,Baldwin,South,Metro,195121,95314,99807,4.5,83.1,...,1.4,3.9,26.4,85953,81.5,12.3,5.8,0.4,7.5,1589.78
1005,Alabama,Barbour,South,Nonmetro,26932,14497,12435,4.6,46.2,...,1.5,1.6,24.1,8597,71.8,20.8,7.3,0.1,17.6,884.88
1007,Alabama,Bibb,South,Metro,22604,12073,10531,2.2,74.5,...,1.5,0.7,28.8,8294,76.8,16.1,6.7,0.4,8.3,622.58
1009,Alabama,Blount,South,Metro,57710,28512,29198,8.6,87.9,...,0.4,2.3,34.9,22189,82.0,13.5,4.2,0.4,7.7,644.78
1011,Alabama,Bullock,South,Nonmetro,10678,5660,5018,4.4,22.2,...,1.7,2.8,27.5,3865,79.5,15.1,5.4,0.0,18.0,622.81


- Select the following four columns from the `counties` variable:
`state`, `county`, `population`, `poverty`

In [5]:
# Keep only the state, county, population, and poverty columns
counties %>%
  select(
    state,
    county,
    population,
    poverty
  )

state,county,population,poverty
Alabama,Autauga,55221,12.9
Alabama,Baldwin,195121,13.4
Alabama,Barbour,26932,26.7
Alabama,Bibb,22604,16.8
Alabama,Blount,57710,16.7
Alabama,Bullock,10678,24.6
Alabama,Butler,20354,25.4
Alabama,Calhoun,116648,20.5
Alabama,Chambers,34079,21.6
Alabama,Cherokee,26008,19.2


---
## The filter and arrange verbs


In [12]:
# Subset reused by the filter/arrange/mutate examples:
# state, county, population, and unemployment
counties_selected_scpu <- counties %>%
  select(state, county, population, unemployment)

### Arrange

In [13]:
# Sort counties from the least to the most populous
counties_selected_scpu %>%
  arrange(population)

state,county,population,unemployment
Hawaii,Kalawao,85,0.0
Texas,King,267,5.1
Nebraska,McPherson,433,0.9
Montana,Petroleum,443,6.6
Nebraska,Arthur,448,4.0
Nebraska,Loup,548,0.7
Nebraska,Blaine,551,0.7
New Mexico,Harding,565,6.0
Texas,Kenedy,565,0.0
Colorado,San Juan,606,13.8


### Arrange: descending

In [14]:
# Sort counties from the most to the least populous
counties_selected_scpu %>%
  arrange(desc(population))

state,county,population,unemployment
California,Los Angeles,10038388,10.0
Illinois,Cook,5236393,10.7
Texas,Harris,4356362,7.5
Arizona,Maricopa,4018143,7.7
California,San Diego,3223096,8.7
California,Orange,3116069,7.6
Florida,Miami-Dade,2639042,10.0
New York,Kings,2595259,10.0
Texas,Dallas,2485003,7.6
New York,Queens,2301139,8.6


### Filter

In [15]:
# Counties in New York, most populous first.
# Filtering before arranging means only the matching rows are sorted rather
# than the whole table; the rows returned and their order are the same.
counties_selected_scpu %>%
  filter(state == "New York") %>%
  arrange(desc(population))

state,county,population,unemployment
New York,Kings,2595259,10.0
New York,Queens,2301139,8.6
New York,New York,1629507,7.5
New York,Suffolk,1501373,6.4
New York,Bronx,1428357,14.0
New York,Nassau,1354612,6.4
New York,Westchester,967315,7.6
New York,Erie,921584,7.0
New York,Monroe,749356,7.7
New York,Richmond,472481,6.9


In [16]:
# Counties with unemployment below 6 percent, most populous first.
# Filtering before arranging means only the matching rows are sorted rather
# than the whole table; the rows returned and their order are the same.
counties_selected_scpu %>%
  filter(unemployment < 6) %>%
  arrange(desc(population))

state,county,population,unemployment
Virginia,Fairfax,1128722,4.9
Utah,Salt Lake,1078958,5.8
Hawaii,Honolulu,984178,5.6
Texas,Collin,862215,4.9
Texas,Denton,731851,5.7
Texas,Fort Bend,658331,5.1
Kansas,Johnson,566814,4.5
Maryland,Anne Arundel,555280,5.9
Colorado,Jefferson,552344,5.9
Utah,Utah,551957,5.5


### Combining conditions

In [17]:
# New York counties with unemployment below 6 percent, most populous first.
# Comma-separated conditions in filter() must all hold for a row to be kept.
# Filtering before arranging means only the matching rows are sorted rather
# than the whole table; the rows returned and their order are the same.
counties_selected_scpu %>%
  filter(state == "New York", unemployment < 6) %>%
  arrange(desc(population))

state,county,population,unemployment
New York,Tompkins,103855,5.9
New York,Chemung,88267,5.4
New York,Madison,72427,5.1
New York,Livingston,64801,5.4
New York,Seneca,35144,5.5


## Exercise

In [18]:
# Arranging observations

# Keep the location, population, and the three employment-sector columns
counties_selected <- counties %>%
  select(
    state, county, population,
    private_work, public_work, self_employed
  )

# Order counties from the highest to the lowest public_work percentage
counties_selected %>%
  arrange(desc(public_work))

state,county,population,private_work,public_work,self_employed
Hawaii,Kalawao,85,25.0,64.1,10.9
Alaska,Yukon-Koyukuk Census Area,5644,33.3,61.7,5.1
Wisconsin,Menominee,4451,36.8,59.1,3.7
North Dakota,Sioux,4380,32.9,56.8,10.2
South Dakota,Todd,9942,34.4,55.0,9.8
Alaska,Lake and Peninsula Borough,1474,42.2,51.6,6.1
California,Lassen,32645,42.6,50.5,6.8
South Dakota,Buffalo,2038,48.4,49.5,1.8
South Dakota,Dewey,5579,34.9,49.2,14.7
Texas,Kenedy,565,51.9,48.1,0.0


In [19]:
# Filtering for conditions

# Narrow the data to the state, county, and population columns
counties_selected <- counties %>%
  select(state, county, population)

# Keep only counties whose population exceeds one million
counties_selected %>%
  filter(population > 1e6)

state,county,population
Arizona,Maricopa,4018143
California,Alameda,1584983
California,Contra Costa,1096068
California,Los Angeles,10038388
California,Orange,3116069
California,Riverside,2298032
California,Sacramento,1465832
California,San Bernardino,2094769
California,San Diego,3223096
California,Santa Clara,1868149


In [20]:
# California counties with more than one million residents
counties_selected %>%
  filter(
    state == "California",
    population > 1e6
  )

state,county,population
California,Alameda,1584983
California,Contra Costa,1096068
California,Los Angeles,10038388
California,Orange,3116069
California,Riverside,2298032
California,Sacramento,1465832
California,San Bernardino,2094769
California,San Diego,3223096
California,Santa Clara,1868149


In [21]:
# Filtering and arranging

# Keep the location, population, and the three employment-sector columns
counties_selected <- counties %>%
  select(
    state, county, population,
    private_work, public_work, self_employed
  )

# Texas counties with more than 10000 people, ordered from the highest to
# the lowest private_work percentage
counties_selected %>%
  filter(
    state == "Texas",
    population > 10000
  ) %>%
  arrange(desc(private_work))

state,county,population,private_work,public_work,self_employed
Texas,Gregg,123178,84.7,9.8,5.4
Texas,Collin,862215,84.1,10.0,5.8
Texas,Dallas,2485003,83.9,9.5,6.4
Texas,Harris,4356362,83.4,10.1,6.3
Texas,Andrews,16775,83.1,9.6,6.8
Texas,Tarrant,1914526,83.1,11.4,5.4
Texas,Titus,32553,82.5,10.0,7.4
Texas,Denton,731851,82.2,11.9,5.7
Texas,Ector,149557,82.0,11.2,6.7
Texas,Moore,22281,82.0,11.7,5.9


---
## Mutate
The `mutate()` verb adds new variables or changes existing variables.

In [24]:
# Convert the unemployment percentage into a head count per county, then
# list the counties with the most unemployed people first
counties_selected_scpu %>%
  mutate(
    unemployed_population = population * unemployment / 100
  ) %>%
  arrange(desc(unemployed_population))

state,county,population,unemployment,unemployed_population
California,Los Angeles,10038388,10.0,1003838.8
Illinois,Cook,5236393,10.7,560294.1
Texas,Harris,4356362,7.5,326727.2
Arizona,Maricopa,4018143,7.7,309397.0
California,Riverside,2298032,12.9,296446.1
California,San Diego,3223096,8.7,280409.4
Michigan,Wayne,1778969,14.9,265066.4
California,San Bernardino,2094769,12.6,263940.9
Florida,Miami-Dade,2639042,10.0,263904.2
New York,Kings,2595259,10.0,259525.9


## Exercise

In [26]:
# Calculating the number of government employees

# Only the location, population, and public_work percentage are needed
counties_selected <- counties %>%
  select(state, county, population, public_work)

# Turn the public_work percentage into a head count (public_workers) and
# show the counties with the most public workers first
counties_selected %>%
  mutate(
    public_workers = population * public_work / 100
  ) %>%
  arrange(desc(public_workers))

state,county,population,public_work,public_workers
California,Los Angeles,10038388,11.5,1154414.6
Illinois,Cook,5236393,11.5,602185.2
California,San Diego,3223096,14.8,477018.2
Arizona,Maricopa,4018143,11.7,470122.7
Texas,Harris,4356362,10.1,439992.6
New York,Kings,2595259,14.4,373717.3
California,San Bernardino,2094769,16.7,349826.4
California,Riverside,2298032,14.9,342406.8
California,Sacramento,1465832,21.8,319551.4
California,Orange,3116069,10.2,317839.0


In [27]:
# Calculating the percentage of women in a county

# Keep the location columns plus the total, male, and female head counts
counties_selected <- counties %>%
  select(state, county, population, men, women)

# proportion_women is the share of each county's population that is female,
# expressed as a fraction between 0 and 1
counties_selected %>%
  mutate(
    proportion_women = women / population
  )

state,county,population,men,women,proportion_women
Alabama,Autauga,55221,26745,28476,0.5156734
Alabama,Baldwin,195121,95314,99807,0.5115134
Alabama,Barbour,26932,14497,12435,0.4617184
Alabama,Bibb,22604,12073,10531,0.4658910
Alabama,Blount,57710,28512,29198,0.5059435
Alabama,Bullock,10678,5660,5018,0.4699382
Alabama,Butler,20354,9502,10852,0.5331630
Alabama,Calhoun,116648,56274,60374,0.5175742
Alabama,Chambers,34079,16258,17821,0.5229320
Alabama,Cherokee,26008,12975,13033,0.5011150


In [28]:
# Select, mutate, filter, and arrange

# Among counties with at least 10,000 residents, rank by the fraction of
# the population that is male (highest first)
counties %>%
  select(state, county, population, men, women) %>%
  mutate(
    proportion_men = men / population
  ) %>%
  filter(population >= 10000) %>%
  arrange(desc(proportion_men))

state,county,population,men,women,proportion_men
Virginia,Sussex,11864,8130,3734,0.6852664
California,Lassen,32645,21818,10827,0.6683412
Georgia,Chattahoochee,11914,7940,3974,0.6664428
Louisiana,West Feliciana,15415,10228,5187,0.6635096
Florida,Union,15191,9830,5361,0.6470937
Texas,Jones,19978,12652,7326,0.6332966
Missouri,DeKalb,12782,8080,4702,0.6321389
Texas,Madison,13838,8648,5190,0.6249458
Virginia,Greensville,11760,7303,4457,0.6210034
Texas,Anderson,57915,35469,22446,0.6124320


*Notice Sussex County in Virginia is more than two thirds male: this is because of two men's prisons in the county.*

---
---
# 2. Aggregating Data
Now that you know how to transform your data, you'll want to know more about how to aggregate your data to make it more interpretable. You'll learn a number of functions you can use to take many observations in your data and summarize them, including count, group_by, summarize, ungroup, and top_n.

## The count verb

In [29]:
# With no grouping variable, count() returns the total number of rows
count(counties)

n
3138


### Count variable

In [30]:
# Number of counties in each state
counties %>%
  count(state)

state,n
Alabama,67
Alaska,28
Arizona,15
Arkansas,75
California,58
Colorado,64
Connecticut,8
Delaware,3
Florida,67
Georgia,159


### Count and sort

In [31]:
# Number of counties in each state, largest count first
counties %>%
  count(
    state,
    sort = TRUE
  )

state,n
Texas,253
Georgia,159
Virginia,133
Kentucky,120
Missouri,115
Kansas,105
Illinois,102
North Carolina,100
Iowa,99
Tennessee,95


### Count population/Add weight
Add the argument `wt`, which stands for 'weight'.

In [32]:
# Weighting by population makes n the total population of each state
# instead of the number of counties; most populous state first
counties %>%
  count(
    state,
    wt = population,
    sort = TRUE
  )

state,n
California,38421464
Texas,26538497
New York,19673174
Florida,19645772
Illinois,12873761
Pennsylvania,12779559
Ohio,11575977
Georgia,10006693
Michigan,9900571
North Carolina,9845333


## Exercise

In [33]:
# Counting by region

# Columns shared by the region and citizen counting exercises
counties_selected <- counties %>%
  select(county, region, state, population, citizens)

# Number of counties in each region, largest count first
counties_selected %>%
  count(
    region,
    sort = TRUE
  )

region,n
South,1420
North Central,1054
West,447
Northeast,217


In [34]:
# Counting citizens by state

# Weighting by citizens makes n the total number of citizens in each state
# (not a count of counties); states with the most citizens come first
counties_selected %>%
  count(
    state,
    wt = citizens,
    sort = TRUE
  )

state,n
California,24280349
Texas,16864864
Florida,13933052
New York,13531404
Pennsylvania,9710416
Illinois,8979999
Ohio,8709050
Michigan,7380136
North Carolina,7107998
Georgia,6978660


*California is the state with the most citizens.*

In [36]:
# Mutating and counting

# Keep the location columns, population, and the walk percentage
counties_selected <- counties %>%
  select(county, region, state, population, walk)

# Convert the walk percentage into a head count per county, then total that
# head count by state and list the states with the most walkers first
counties_selected %>%
  mutate(
    population_walk = population * walk / 100
  ) %>%
  count(
    state,
    wt = population_walk,
    sort = TRUE
  )

state,n
New York,1237938.17
California,1017963.68
Pennsylvania,505397.19
Texas,430783.43
Illinois,400345.6
Massachusetts,316765.03
Florida,284722.87
New Jersey,273047.19
Ohio,266910.98
Washington,239764.32


*While California had the largest total population, New York state has the largest number of people who walk to work.*

---
## The group by, summarize and ungroup verbs
`count` is a special case of a more general set of verbs: group by and summarize.

### Summarize
The summarize verb takes many observations and turns them into one observation.

In [37]:
# Collapse every county into a single row holding the overall population
counties %>%
  summarize(
    total_population = sum(population)
  )

total_population
315845353


### Aggregate and summarize

In [38]:
# Several aggregates can be computed in one summarize() call: here the
# overall population and the unweighted mean of county unemployment rates
counties %>%
  summarize(
    total_population = sum(population),
    average_unemployment = mean(unemployment)
  )

total_population,average_unemployment
315845353,7.798343


### Summary functions
- `sum()`
- `mean()`
- `median()`
- `min()`
- `max()`
- `n()` : the size of the group

###  Aggregate within groups + Arrange

In [40]:
# One row per state: total population and the mean of its counties'
# unemployment rates, with the highest average unemployment first
counties %>%
  group_by(state) %>%
  summarize(
    total_pop = sum(population),
    average_unemployment = mean(unemployment)
  ) %>%
  arrange(desc(average_unemployment))

state,total_pop,average_unemployment
Mississippi,2988081,12.02439
Arizona,6641928,11.966667
South Carolina,4777576,11.330435
Alabama,4830620,11.310448
California,38421464,10.8
Nevada,2798636,10.476471
North Carolina,9845333,10.452
Florida,19645772,10.383582
Georgia,10006693,9.974214
Michigan,9900571,9.962651


### Metro column & Group by

In [42]:
# Total population for every state/metro combination.
# The result remains grouped by state (only the metro level is dropped).
counties %>%
  group_by(state, metro) %>%
  summarize(
    total_pop = sum(population)
  )

`summarise()` has grouped output by 'state'. You can override using the `.groups` argument.


state,metro,total_pop
Alabama,Metro,3671377
Alabama,Nonmetro,1159243
Alaska,Metro,494990
Alaska,Nonmetro,230471
Arizona,Metro,6295145
Arizona,Nonmetro,346783
Arkansas,Metro,1806867
Arkansas,Nonmetro,1151341
California,Metro,37587429
California,Nonmetro,834035


### Ungroup
If you do not want to keep `state` as a group, add another dplyr verb: `ungroup()`.

In [43]:
# Same state/metro totals as before, but with the leftover state grouping
# removed so later verbs operate on the table as a whole
counties %>%
  group_by(state, metro) %>%
  summarize(
    total_pop = sum(population)
  ) %>%
  ungroup()

`summarise()` has grouped output by 'state'. You can override using the `.groups` argument.


state,metro,total_pop
Alabama,Metro,3671377
Alabama,Nonmetro,1159243
Alaska,Metro,494990
Alaska,Nonmetro,230471
Arizona,Metro,6295145
Arizona,Nonmetro,346783
Arkansas,Metro,1806867
Arkansas,Nonmetro,1151341
California,Metro,37587429
California,Nonmetro,834035


## Exercise

In [44]:
# Summarizing

# Columns needed for the three summary statistics below
counties_selected <- counties %>%
  select(county, population, income, unemployment)

# Smallest county population, highest unemployment value, and mean income
# across all counties, returned as a single row
counties_selected %>%
  summarize(
    min_population = min(population),
    max_unemployment = max(unemployment),
    average_income = mean(income)
  )

min_population,max_unemployment,average_income
85,29.4,46832


In [45]:
# Summarizing by state

# Keep the location columns, population, and land area
counties_selected <- counties %>%
  select(state, county, population, land_area)

# Total each state's land area and population, derive population density
# (people per unit of land area), and list the densest states first
counties_selected %>%
  group_by(state) %>%
  summarize(
    total_area = sum(land_area),
    total_population = sum(population)
  ) %>%
  mutate(
    density = total_population / total_area
  ) %>%
  arrange(desc(density))

state,total_area,total_population,density
New Jersey,7354.22,8904413,1210.789587
Rhode Island,1033.82,1053661,1019.191929
Massachusetts,7800.08,6705586,859.681696
Connecticut,4842.36,3593222,742.039419
Maryland,9707.23,5930538,610.940299
Delaware,1948.55,926454,475.458161
New York,47126.43,19673174,417.455216
Florida,53624.78,19645772,366.356226
Pennsylvania,44742.71,12779559,285.623267
Ohio,40860.73,11575977,283.303235


*Looks like New Jersey and Rhode Island are the “most crowded” of the US states, with more than a thousand people per square mile.*

In [46]:
# Summarizing by state and region

# Keep the region, location, and population columns
counties_selected <- counties %>%
  select(region, state, county, population)

counties_selected %>%
  group_by(region, state) %>%
  # First pass: one row per state holding its total population
  summarize(
    total_pop = sum(population)
  ) %>%
  # The result is still grouped by region, so this second pass yields the
  # mean and median state population within each region
  summarize(
    average_pop = mean(total_pop),
    median_pop = median(total_pop)
  )

`summarise()` has grouped output by 'region'. You can override using the `.groups` argument.


region,average_pop,median_pop
North Central,5627687,5580644
Northeast,6221058,3593222
South,7370486,4804098
West,5722755,2798636


*It looks like the South has the highest* `average_pop` *of 7370486, while the North Central region has the highest* `median_pop` *of 5580644.*

---
## The top_n verb
dplyr's `top_n()` is very useful for keeping the most extreme observations from each group.

### top_n
Like `summarize()`, `top_n()` operates on a grouped table. The function takes two arguments: the number of observations you want from each group, and the column you want to weight by.

For example, `group_by(state)` and then `top_n(1, population)` would find the county with the highest population in each state.

In [47]:
# Columns reused by the top_n() examples
counties_selected <- counties %>%
  select(state, county, population, unemployment, income)

# Most populous county in each state; every selected column is kept
counties_selected %>%
  group_by(state) %>%
  top_n(n = 1, wt = population)

state,county,population,unemployment,income
Alabama,Jefferson,659026,9.1,45610
Alaska,Anchorage Municipality,299107,6.7,78326
Arizona,Maricopa,4018143,7.7,54229
Arkansas,Pulaski,390463,7.5,46140
California,Los Angeles,10038388,10.0,56196
Colorado,El Paso,655024,8.4,58206
Connecticut,Fairfield,939983,9.0,84233
Delaware,New Castle,549643,7.4,65476
Florida,Miami-Dade,2639042,10.0,43129
Georgia,Fulton,983903,9.9,57207


Jefferson is the highest population county in Alabama with a population of 659 thousand. Notice that it kept other columns in this table, in this case, unemployment and income.

### Highest unemployment

In [48]:
# County with the highest unemployment value in each state
counties_selected %>%
  group_by(state) %>%
  top_n(n = 1, wt = unemployment)

state,county,population,unemployment,income
Alabama,Conecuh,12865,22.6,24900
Alaska,Northwest Arctic Borough,7732,21.9,63648
Arizona,Navajo,107656,19.8,35921
Arkansas,Phillips,20391,18.1,26844
California,Imperial,178206,17.4,41079
Colorado,Crowley,5551,27.0,31151
Connecticut,New Haven,862224,9.5,61640
Delaware,Kent,169509,8.4,54976
Florida,Hamilton,14395,15.8,35048
Georgia,Taylor,8401,20.6,28143


### Number of observations

In [49]:
# The three counties with the highest unemployment values in each state
counties_selected %>%
  group_by(state) %>%
  top_n(n = 3, wt = unemployment)

state,county,population,unemployment,income
Alabama,Conecuh,12865,22.6,24900
Alabama,Monroe,22217,20.7,27257
Alabama,Wilcox,11235,20.8,23750
Alaska,Bethel Census Area,17776,17.6,51012
Alaska,Northwest Arctic Borough,7732,21.9,63648
Alaska,Yukon-Koyukuk Census Area,5644,18.2,38491
Arizona,Apache,72124,18.2,31757
Arizona,Graham,37407,14.1,45964
Arizona,Navajo,107656,19.8,35921
Arkansas,Desha,12379,17.7,27197


`top_n` is often used when creating graphs.

## Exercise

In [50]:
# Selecting a county from each region

# Keep the region, location, metro status, population, and walk percentage
counties_selected <- counties %>%
  select(region, state, county, metro, population, walk)

# In each region, keep the county with the highest walk value, i.e. the
# largest percentage (not head count) of people who walk to work
counties_selected %>%
  group_by(region) %>%
  top_n(n = 1, wt = walk)

region,state,county,metro,population,walk
West,Alaska,Aleutians East Borough,Nonmetro,3304,71.2
Northeast,New York,New York,Metro,1629507,20.7
North Central,North Dakota,McIntosh,Nonmetro,2759,17.5
South,Virginia,Lexington city,Nonmetro,7071,31.7


*Notice that three of the places where lots of people walk to work are low-population nonmetro counties, but that New York City also pops up.*

In [53]:
# Finding the highest-income state in each region

# Keep the region, location, population, and income columns
counties_selected <- counties %>%
  select(region, state, county, population, income)

counties_selected %>%
  group_by(region, state) %>%
  # One row per state: the unweighted mean of its counties' income values
  summarize(
    average_income = mean(income)
  ) %>%
  # The result is still grouped by region, so this keeps the single
  # highest-average-income state within each region
  top_n(n = 1, wt = average_income)

`summarise()` has grouped output by 'region'. You can override using the `.groups` argument.


region,state,average_income
North Central,North Dakota,55574.87
Northeast,New Jersey,73014.1
South,Maryland,69200.38
West,Alaska,65124.54


*New Jersey in the Northeast is the state with the highest* `average_income` *of 73014.*

In [55]:
# Using summarize, top_n, and count together

# Only the state, metro status, and population are needed
counties_selected <- counties %>%
  select(state, metro, population)

counties_selected %>%
  group_by(state, metro) %>%
  # One row per state/metro combination with its total population
  summarize(
    total_pop = sum(population)
  ) %>%
  # The result is still grouped by state, so this keeps whichever of the
  # Metro/Nonmetro rows has the larger total for each state
  top_n(n = 1, wt = total_pop)

`summarise()` has grouped output by 'state'. You can override using the `.groups` argument.


state,metro,total_pop
Alabama,Metro,3671377
Alaska,Metro,494990
Arizona,Metro,6295145
Arkansas,Metro,1806867
California,Metro,37587429
Colorado,Metro,4590896
Connecticut,Metro,3406918
Delaware,Metro,926454
Florida,Metro,18941821
Georgia,Metro,8233886


In [56]:
# For each state, total the population by metro status, keep the larger of
# the two totals, then tally how many states are mostly Metro vs. Nonmetro
counties_selected %>%
  group_by(state, metro) %>%
  summarize(
    total_pop = sum(population)
  ) %>%
  # The result is still grouped by state, so top_n() picks one row per state
  top_n(n = 1, wt = total_pop) %>%
  # Drop the state grouping so count() tallies across all states
  ungroup() %>%
  count(metro)

`summarise()` has grouped output by 'state'. You can override using the `.groups` argument.


metro,n
Metro,44
Nonmetro,6


*Notice that 44 states have more people living in Metro areas, and 6 states have more people living in Nonmetro areas.*