Skip to content

Commit

Permalink
Data ingest webinar
Browse files Browse the repository at this point in the history
  • Loading branch information
hadley committed Jun 11, 2015
1 parent 7415bac commit 356b167
Show file tree
Hide file tree
Showing 7 changed files with 5,878 additions and 0 deletions.
10 changes: 10 additions & 0 deletions 11-Getting-Data-into-R/1-databasees.R
@@ -0,0 +1,10 @@
# Demo: query the example SQLite database that ships with RSQLite via DBI.
library(DBI)

# mustWork = TRUE turns a missing file into an immediate error. Without it,
# system.file() returns "" and that empty path is handed to dbConnect().
path <- system.file(
  "db", "datasets.sqlite",
  package = "RSQLite",
  mustWork = TRUE
)
db <- dbConnect(RSQLite::SQLite(), path)

# List the tables in the database, then inspect the structure of one of them
dbListTables(db)
str(dbGetQuery(db, "SELECT * FROM mtcars"))

# Polite to disconnect from db when done
dbDisconnect(db)
48 changes: 48 additions & 0 deletions 11-Getting-Data-into-R/2-rvest.R
@@ -0,0 +1,48 @@
# Inspired by
# http://notesofdabbler.github.io/201408_hotelReview/scrapeTripAdvisor.html

library(rvest)
library(dplyr)

# Always start by opening in web browser and experimenting with
# selectorgadget: http://selectorgadget.com

url <- "http://www.tripadvisor.com/Hotel_Review-g37209-d1762915-Reviews-JW_Marriott_Indianapolis-Indianapolis_Indiana.html"
httr::BROWSE(url)

# Select every node matching the review selector on the page
reviews <- url %>%
  read_html() %>%
  html_nodes("#REVIEWS .innerBubble")

length(reviews)
# Namespace-qualified so the call does not depend on xml2 being attached
xml2::xml_structure(reviews[[1]])

# Most important distinction to get the hang of is html_nodes() vs html_node().
# Given n input nodes, html_nodes() returns every match (m nodes in total,
# where m may be smaller or larger than n); html_node() always returns exactly
# n results, one per input node. Using html_node() below keeps each extracted
# variable the same length as `reviews`, so the variables line up correctly.

id <- reviews %>%
  html_node(".quote a") %>%
  html_attr("id")

quote <- reviews %>%
  html_node(".quote span") %>%
  html_text()

# The alt text looks like "4 of 5 stars"; strip the suffix to get the number
rating <- reviews %>%
  html_node(".rating .rating_s_fill") %>%
  html_attr("alt") %>%
  gsub(" of 5 stars", "", .) %>%
  as.integer()

# strptime() returns POSIXlt; convert to POSIXct so it can be stored as a
# data frame column. NOTE(review): %b parses month abbreviations using the
# current locale - confirm the session locale matches the page language.
date <- reviews %>%
  html_node(".rating .ratingDate") %>%
  html_attr("title") %>%
  strptime("%b %d, %Y") %>%
  as.POSIXct()

review <- reviews %>%
  html_node(".entry .partial_entry") %>%
  html_text()

# tibble() replaces the deprecated data_frame(); dplyr re-exports it.
# View() opens the data viewer, so this line is for interactive use.
tibble(id, quote, rating, date, review) %>% View()
34 changes: 34 additions & 0 deletions 11-Getting-Data-into-R/3-tidyr.R
@@ -0,0 +1,34 @@
library(tidyr)
# Spell out warn.conflicts rather than relying on partial argument matching
library(dplyr, warn.conflicts = FALSE)
library(readr)

# Load the data
tb <- read_csv("tb.csv")
tb

# To convert this messy data into tidy data
# we need two verbs. First we need to gather
# together all the columns that aren't variables
# (gather() still works; newer tidyr offers pivot_longer() as its successor)
tb2 <- tb %>%
  gather(demo, n, m04:fu, na.rm = TRUE)
tb2

# Then separate the demographic variable into
# sex and age. sep = 1 splits by position, after the first character.
tb3 <- tb2 %>%
  separate(demo, into = c("sex", "age"), sep = 1)
tb3

tb4 <- tb3 %>%
  rename(country = iso2) %>%
  arrange(country, year, sex, age)
tb4

# Do it in one pipeline
# (as_tibble() replaces the deprecated tbl_df(); dplyr re-exports it)
"tb.csv" %>%
  read.csv(stringsAsFactors = FALSE) %>%
  as_tibble() %>%
  gather(demo, n, -iso2, -year, na.rm = TRUE) %>%
  separate(demo, into = c("sex", "age"), sep = 1) %>%
  arrange(iso2, year, sex, age) %>%
  rename(country = iso2)
16 changes: 16 additions & 0 deletions 11-Getting-Data-into-R/data-ingest.Rproj
@@ -0,0 +1,16 @@
Version: 1.0

RestoreWorkspace: Default
SaveWorkspace: Default
AlwaysSaveHistory: Default

EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 2
Encoding: UTF-8

RnwWeave: Sweave
LaTeX: pdfLaTeX

AutoAppendNewline: Yes
StripTrailingWhitespace: Yes
Binary file added 11-Getting-Data-into-R/data-ingest.key
Binary file not shown.
Binary file added 11-Getting-Data-into-R/data-ingest.pdf
Binary file not shown.
5,770 changes: 5,770 additions & 0 deletions 11-Getting-Data-into-R/tb.csv

Large diffs are not rendered by default.

0 comments on commit 356b167

Please sign in to comment.