# _chronicle-scrape-A.R
# Scrape opinion articles (title, author, date, abstract, column, URL) from
# The Chronicle (https://www.dukechronicle.com) and write them to a CSV.
# load packages ----------------------------------------------------------------
library(tidyverse)
library(rvest)
library(lubridate)
library(robotstxt)
# check that we can scrape data from the chronicle -----------------------------
# paths_allowed() consults the site's robots.txt. The original call discarded
# the result, so a disallowed site would have been scraped anyway; abort early
# instead of scraping against the site's wishes.
if (!paths_allowed("https://www.dukechronicle.com")) {
  stop("robots.txt disallows scraping https://www.dukechronicle.com", call. = FALSE)
}
# read page --------------------------------------------------------------------
# Request one large page (per_page=500) rather than walking the pagination.
chronicle_url <- "https://www.dukechronicle.com/section/opinion?page=1&per_page=500"
page <- read_html(chronicle_url)
# parse components -------------------------------------------------------------
# Helper: all nodes on `page` matching a CSS selector.
grab <- function(css) html_elements(page, css)

# Article titles and their links share the ".headline a" selector.
titles <- html_text(grab(".headline a"))
urls <- html_attr(grab(".headline a"), name = "href")

# Column (series) name is the second link inside the kicker.
columns <- html_text(grab(".col-md-8 .kicker a+ a"))

# html_text2() collapses whitespace the way a browser renders it.
abstracts <- html_text2(grab(".article-abstract"))

# Dateline looks like "By <author> | <date>"; strip the leading "By ".
authors_dates <- str_remove(html_text2(grab(".col-md-8 .dateline")), "By ")
# create a data frame ----------------------------------------------------------
# One row per article; author_date still holds the combined "<author> | <date>"
# string and is split downstream.
chronicle_raw <- tibble(
  title       = titles,
  author_date = authors_dates,
  abstract    = abstracts,
  column      = columns,
  url         = urls
)
# clean up data ----------------------------------------------------------------
# Split "author | date", then resolve The Chronicle's relative timestamps
# ("hours ago", "Yesterday", "N days ago") into absolute dates before parsing.
# NOTE(review): relative dates are anchored to the day the page was scraped —
# update `scrape_date` whenever the scrape is re-run.
scrape_date <- ymd("2024-02-22")

chronicle <- chronicle_raw |>
  # fill = "left" keeps the date when a dateline has no "author |" part.
  separate(author_date, into = c("author", "date"), sep = "\\| ", fill = "left") |>
  mutate(
    author = str_trim(author),
    # Translate relative phrases into "Month DD, YYYY" strings so everything
    # funnels through one mdy() call. Handling "N days ago" generically fixes
    # the original's silent NA for unseen offsets (e.g. "3 days ago").
    date = case_when(
      str_detect(date, "hours ago") ~ format(scrape_date, "%B %d, %Y"),
      date == "Yesterday" ~ format(scrape_date - 1, "%B %d, %Y"),
      str_detect(date, "^\\d+ days ago$") ~
        format(scrape_date - as.numeric(str_extract(date, "\\d+")), "%B %d, %Y"),
      TRUE ~ date
    ),
    date = mdy(date)
  )
# write data -------------------------------------------------------------------
# Persist the cleaned scrape; assumes a data/ directory already exists.
chronicle |>
  write_csv(file = "data/chronicle.csv")