# Summarizing with `dplyr`

This notebook demonstrates the basic summarizing tools provided by the `dplyr` package.

In [10]:
# Attach the packages used throughout this notebook
library(dplyr)
library(dslabs)
# Load the two example datasets used below in a single call
data(heights, murders)

## Use the `summarize` function

In [6]:
# Keep only the male rows of `heights`, then collapse them into a single row
# holding the mean and standard deviation of `height`
df <- heights %>%
  filter(sex == "Male") %>%
  summarize(
    average = mean(height),
    sd = sd(height)
  )
df # evaluating the bare name displays the resulting data frame as a table

average,sd
69.31475,3.611024


In [9]:
# Two ways to read a column out of the summary table `df`:
#   `$`  extracts the column itself as a plain vector
#   `[`  with a column name keeps the data-frame wrapper (a one-column table)
# `paste` already separates its arguments with a single space, so the label
# carries no trailing space of its own (otherwise the output has two spaces).
paste("Average:", df$average)
df["sd"]

sd
3.611024


## Use the dot operator

In [18]:
# US-wide murder rate: total murders divided by total population,
# scaled by 10,000
rate <- murders %>%
  summarize(mRate = sum(total) / sum(population) * 10000)
rate
# Report the class of the result to show what kind of object `summarize` returned
paste("rate is a", class(rate))

mRate
0.3034555


In [25]:
# `rate` is a data frame, so it cannot be used directly in numeric computations.
# Inside a `%>%` pipe the dot (.) stands for the object being piped in, so
# `.$mRate` reads the `mRate` column of `rate` and yields the bare value.
numRate <- rate %>% .$mRate
numRate
paste("numRate is a", class(numRate))

## Summarize after grouping

In [29]:
# Group `heights` by `sex`; summarizing the grouped table then yields one row
# of statistics per group
group <- heights %>%
  group_by(sex)
group %>%
  summarize(
    average = mean(height),
    sd = sd(height)
  )

sex,average,sd
Female,64.93942,3.760656
Male,69.31475,3.611024


In [33]:
# Same pattern on `murders`: group the rows by `region`, then compute the mean
# and standard deviation of `total` within each region
group1 <- murders %>%
  group_by(region)
# str(group1)  # uncomment to inspect the structure of the grouped table
group1 %>%
  summarize(
    average = mean(total),
    sd = sd(total)
  )

region,average,sd
Northeast,163.2222,200.1935
South,246.7647,212.8622
North Central,152.3333,154.2244
West,147.0,339.1015


## Sort entire tables using the `arrange` function

In [34]:
# Order the rows of `murders` by population (smallest first) and preview the
# leading rows of the result
murders %>%
  arrange(population) %>%
  head()

state,abb,region,population,total
Wyoming,WY,West,563626,5
District of Columbia,DC,South,601723,99
Vermont,VT,Northeast,625741,2
North Dakota,ND,North Central,672591,4
Alaska,AK,West,710231,19
South Dakota,SD,North Central,814180,8


In [35]:
# Wrapping the column in `desc()` reverses the ordering, so the rows with the
# largest murder totals come first
murders %>%
  arrange(desc(total)) %>%
  head()

state,abb,region,population,total
California,CA,West,37253956,1257
Texas,TX,South,25145561,805
Florida,FL,South,19687653,669
New York,NY,Northeast,19378102,517
Pennsylvania,PA,Northeast,12702379,457
Michigan,MI,North Central,9883640,413


In [38]:
# NESTED SORTING to break ties -- sort by total murders first, then order rows
# with equal totals by population size.
# `head(10)` keeps the first 10 rows of the sorted table. (`top_n(10)` is not
# suitable for this: it selects the rows with the largest values of a column
# rather than the leading rows.)
murders %>%
  arrange(total, population) %>%
  head(10)

state,abb,region,population,total
Vermont,VT,Northeast,625741,2
North Dakota,ND,North Central,672591,4
Wyoming,WY,West,563626,5
New Hampshire,NH,Northeast,1316470,5
Hawaii,HI,West,1360301,7
South Dakota,SD,North Central,814180,8
Maine,ME,Northeast,1328361,11
Montana,MT,West,989415,12
Idaho,ID,West,1567582,12
Rhode Island,RI,Northeast,1052567,16
