-
Notifications
You must be signed in to change notification settings - Fork 2
/
get_email_contacts.r
125 lines (91 loc) · 2.99 KB
/
get_email_contacts.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
#' Get contact webpage content
#'
#' @param page_url A webpage URL
#'
#' @return out A dataframe that contains email contact
#' @importFrom stringr str_extract_all
#' @importFrom stringr str_remove_all
#' @importFrom curl curl_fetch_memory
#' @importFrom httr content
#' @importFrom rvest read_html
#' @importFrom purrr is_empty
#' @importFrom glue glue
#' @export
get_email_contact_from_webpage <- function(page_url) {
# Define the three email patterns
email_pattern_1 <- "\\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}\\b"
email_pattern_2 <- "\\b[A-Za-z0-9._%+-]+ at [A-Za-z0-9.-]+ dot [A-Za-z]{2,}\\b"
email_pattern_3 <- "\\b[A-Za-z0-9._%+-]+\\[at\\][A-Za-z0-9.-]+\\[dot\\][A-Za-z]{2,}\\b"
response <- curl::curl_fetch_memory(page_url)
# Check if the request was successful
if (response$status_code != 200) {
out <- data.frame(
"page_url" = page_url,
"email_contact" = NA
)
return(out)
}
# Parse the raw content as HTML
page_content <- read_html(response$content)
# Extract the text from the HTML content
webpage <- html_text(page_content)
# make sure URL exists
if (is_empty(page_content)) {
out <- data.frame(
"page_url" = page_url,
"email_contact" = NA
)
return(out)
}
emails <- unlist(stringr::str_extract_all(webpage, glue("{email_pattern_1}|{email_pattern_2}|{email_pattern_3}")))
# unclutter emails
emails <- tolower(unique(na.omit(emails)))
emails <- emails[emails != ""]
# clean emails
emails <- str_replace_all(emails, "(\\.org|\\.com|\\.net).*", "\\1")
if (is_empty(emails))
{
out <- data.frame(
"page_url" = page_url,
"email_contact" = NA
)
} else {
out <- data.frame(
"page_url" = page_url,
"email_contact" = emails
)
}
return(out)
}
#' Get child links associated with email contacts
#'
#' @param base_url A website URL
#'
#' @return out A character vector that contains child links associated with email contacts (i.e., page_urls)
#' @export
get_contact_links_from_website <- function(base_url) {
# Extract the domain from the base_url
domain_name <- sub("^https?://", "", base_url)
domain_name <- sub("/.*", "", domain_name)
# Create the contact URL
contact_url1 <- paste0(domain_name, "/contact")
contact_url1 <- sub("//contact", "/contact", contact_url1)
# Return the contact_urls
return(contact_url1)
}
#' Get emails from a base url
#'
#' @param base_url A website URL
#' @importFrom memoise memoise
#'
#' @return out A dataframe that contains the page URLs and the email addresses appearing on these URLs
#' @importFrom furrr future_map_dfr
#' @export
get_emails_from_website <- function(base_url) {
get_email_contact_from_webpage <- memoise::memoise(get_email_contact_from_webpage)
get_contact_links_from_website <- memoise::memoise(get_contact_links_from_website)
message(paste("Parsing:", base_url))
page_urls <- get_contact_links_from_website(base_url)
out <- future_map_dfr(page_urls, get_email_contact_from_webpage)
return(out)
}