Data ingest webinar

rstudio · Jun 11, 2015 · 356b167 · 356b167
1 parent 7415bac
commit 356b167
Show file tree

Hide file tree

Showing 7 changed files with 5,878 additions and 0 deletions.
diff --git a/11-Getting-Data-into-R/1-databasees.R b/11-Getting-Data-into-R/1-databasees.R
@@ -0,0 +1,10 @@
+library(DBI)
+# Locate the example SQLite database file shipped inside the RSQLite package
+path <- system.file("db", "datasets.sqlite", package = "RSQLite")
+db <- dbConnect(RSQLite::SQLite(), path)
+# List the tables in the database, then query one and inspect the result
+dbListTables(db)
+str(dbGetQuery(db, "SELECT * FROM mtcars"))
+
+# Polite to disconnect from db when done
+dbDisconnect(db)
diff --git a/11-Getting-Data-into-R/2-rvest.R b/11-Getting-Data-into-R/2-rvest.R
@@ -0,0 +1,48 @@
+# Inspired by
+# http://notesofdabbler.github.io/201408_hotelReview/scrapeTripAdvisor.html
+
+library(rvest)
+
+# Always start by opening in web browser and experimenting with
+# selectorgadget: http://selectorgadget.com
+
+url <- "http://www.tripadvisor.com/Hotel_Review-g37209-d1762915-Reviews-JW_Marriott_Indianapolis-Indianapolis_Indiana.html"
+httr::BROWSE(url)
+# Parse the page and select the review nodes
+reviews <- url %>%
+  read_html() %>%
+  html_nodes("#REVIEWS .innerBubble")
+# Sanity-check the match count and look at the first review's structure
+length(reviews)
+xml_structure(reviews[[1]])
+
+# Most important distinction to get the hang of is html_nodes() vs html_node().
+# From n input nodes, html_nodes() returns all m matches; html_node() returns
+# exactly n (one per input), which keeps the variables below lined up.
+
+id <- reviews %>%
+  html_node(".quote a") %>%
+  html_attr("id")
+
+quote <- reviews %>%
+  html_node(".quote span") %>%
+  html_text()
+
+rating <- reviews %>%
+  html_node(".rating .rating_s_fill") %>%
+  html_attr("alt") %>%
+  gsub(" of 5 stars", "", .) %>%  # strip suffix before integer conversion
+  as.integer()
+
+date <- reviews %>%
+  html_node(".rating .ratingDate") %>%
+  html_attr("title") %>%
+  strptime("%b %d, %Y") %>%  # format like "Jun 11, 2015"
+  as.POSIXct()  # strptime() gives POSIXlt; POSIXct suits a data frame column
+
+review <- reviews %>%
+  html_node(".entry .partial_entry") %>%
+  html_text()
+# tibble() replaces the deprecated data_frame(); View() opens the data viewer
+library(dplyr)
+tibble(id, quote, rating, date, review) %>% View()
diff --git a/11-Getting-Data-into-R/3-tidyr.R b/11-Getting-Data-into-R/3-tidyr.R
@@ -0,0 +1,34 @@
+library(tidyr)
+library(dplyr, warn.conflicts = FALSE)
+library(readr)
+
+# Load the data
+tb <- read_csv("tb.csv")
+tb
+
+# To convert this messy data into tidy data
+# we need two verbs. First we need to gather
+# together all the columns that aren't variables
+tb2 <- tb %>%
+  gather(demo, n, m04:fu, na.rm = TRUE)
+tb2
+
+# Then separate the demographic variable into
+# sex and age by splitting after the first character
+tb3 <- tb2 %>%
+  separate(demo, c("sex", "age"), sep = 1)
+tb3
+# Finally rename iso2 to country and sort the rows
+tb4 <- tb3 %>%
+  rename(country = iso2) %>%
+  arrange(country, year, sex, age)
+tb4
+
+# Do it in one pipeline
+"tb.csv" %>%
+  read.csv(stringsAsFactors = FALSE) %>%
+  as_tibble() %>%
+  gather(demo, n, -iso2, -year, na.rm = TRUE) %>%
+  separate(demo, c("sex", "age"), sep = 1) %>%
+  arrange(iso2, year, sex, age) %>%
+  rename(country = iso2)
diff --git a/11-Getting-Data-into-R/data-ingest.Rproj b/11-Getting-Data-into-R/data-ingest.Rproj
@@ -0,0 +1,16 @@
+Version: 1.0
+
+RestoreWorkspace: Default
+SaveWorkspace: Default
+AlwaysSaveHistory: Default
+
+EnableCodeIndexing: Yes
+UseSpacesForTab: Yes
+NumSpacesForTab: 2
+Encoding: UTF-8
+
+RnwWeave: Sweave
+LaTeX: pdfLaTeX
+
+AutoAppendNewline: Yes
+StripTrailingWhitespace: Yes
diff --git a/11-Getting-Data-into-R/data-ingest.key b/11-Getting-Data-into-R/data-ingest.key
diff --git a/11-Getting-Data-into-R/data-ingest.pdf b/11-Getting-Data-into-R/data-ingest.pdf
diff --git a/11-Getting-Data-into-R/tb.csv b/11-Getting-Data-into-R/tb.csv