---
title: "Getting started"
author: "Delmotte Jean"
date: "`r format(Sys.time(), '%d %B, %Y')`"
output:
  html_document:
    code_folding: "hide"
    theme: united
    highlight: tango
    number_sections: true
    fig_caption: yes
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
# Function to install packages (if missing) and then load them
ipak <- function(pkg){
  new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
  if (length(new.pkg))
    install.packages(new.pkg, dependencies = TRUE)
  sapply(pkg, require, character.only = TRUE)
}
# usage
packages <- c("ggplot2", "tidyverse", "plotly", "rentrez", "reshape", "rvest", "XML", "RCurl", "easyPubMed", "conflicted")
#packages <- c(packages, "wdman", "RSelenium", "seleniumPipes")
# wdman for managing the Selenium server download
# RSelenium for getting a connection to the Selenium server
# seleniumPipes for better navigation & scraping idioms
ipak(packages)
```
Resources: http://perso.ens-lyon.fr/lise.vaudor/tuto-texte/
# Gathering information with the rentrez package
```{r}
entrez_dbs() # list all available NCBI databases
entrez_db_summary("pubmed")
search_fields <- entrez_db_searchable("pubmed")
search_fields$GRNT
hox_paper <- entrez_search(db="pubmed", term="10.1038/nature08789[doi]")
hox_paper$file
# Count the papers matching a search term, year by year
papers_by_year <- function(years, search_term){
  return(sapply(years, function(y) entrez_search(db="pubmed", term=search_term, mindate=y, maxdate=y, retmax=0)$count))
}
years <- 1990:2015
total_papers <- papers_by_year(years, "")
omics <- c("genomic", "epigenomic", "metagenomic", "proteomic", "transcriptomic", "pharmacogenomic", "connectomic")
trend_data <- sapply(omics, function(t) papers_by_year(years, t))
trend_props <- trend_data/total_papers
trend_df <- melt(data.frame(years, trend_props), id.vars="years")
p <- ggplot(trend_df, aes(years, value, colour=variable)) +
  geom_line(size=1) + scale_y_log10("proportion of papers")
p
```
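Since plotly is loaded, the trend can also be inspected interactively; a small optional extra:
```{r, eval=FALSE}
# Interactive version of the trend plot, using plotly's ggplotly()
ggplotly(p)
```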
```{r}
virus_oysters <- entrez_search(db="pubmed", term="herpesvirus AND oyster")
summary(virus_oysters)
lsf.str("package:rentrez") # list the functions exported by rentrez
virus_oysters$file
# Check the submitted search strings against a dictionary?
# Use paste0() to build the queries (see the sketch after this chunk)
years <- 1960:2019
#total_papers <- papers_by_year(years, "")
recherche_an <- c("disease AND oyster", "herpesvirus AND oyster", "bacteria AND oyster")
trend_data <- sapply(recherche_an, function(t) papers_by_year(years, t))
#trend_props <- trend_data/total_papers
#trend_df <- melt(data.frame(years, trend_props), id.vars="years")
evolution_biblio <- melt(data.frame(years, trend_data), id.vars="years")
colnames(evolution_biblio) <- c("years", "query", "papers")
p <- ggplot(evolution_biblio, aes(years, papers, colour=query)) +
  geom_line(size=1)
p
```
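As suggested in the comment above, the queries can be built programmatically with `paste0()`. A minimal sketch, crossing subject terms with a fixed host term (the `subjects` vector is illustrative, not data from the analysis):
```{r}
# Sketch: build PubMed queries by crossing subject terms with a host term
subjects <- c("disease", "herpesvirus", "bacteria")
queries <- paste0(subjects, " AND oyster")
queries
```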
# Gathering information by scraping with the rvest package
As a first step, I will try to retrieve the title of an article from the main results page:
![output_to_parse_1](~/Documents/BibliographeR/results/images/first_tittle_to_parse.png)
**Image 1**. Screenshot of the first article on the page.
Then I copied and pasted the XPaths of the first 3 titles to see what changes between them:
```html
/html/body/div[2]/div[1]/form/div[1]/div[5]/div/div[5]/div[1]/div[2]/p/a
/html/body/div[2]/div[1]/form/div[1]/div[5]/div/div[5]/div[2]/div[2]/p/a
/html/body/div[2]/div[1]/form/div[1]/div[5]/div/div[5]/div[3]/div[2]/p/a
```
The second-to-last `div[]` is the only part that varies between the 3 titles, so the XPath of the i-th result can be generated from its index.
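A minimal sketch of that idea, assuming the page layout captured above (`title_xpath()` is a hypothetical helper, not part of rvest):
```{r, eval=FALSE}
# Hypothetical helper: build the XPath of the i-th result title by varying
# the second-to-last div[] index observed above
title_xpath <- function(i) {
  sprintf("/html/body/div[2]/div[1]/form/div[1]/div[5]/div/div[5]/div[%d]/div[2]/p/a", i)
}
page <- read_html("https://www.ncbi.nlm.nih.gov/pubmed/?term=herpesvirus+oyster")
sapply(1:3, function(i) page %>% html_node(xpath = title_xpath(i)) %>% html_text())
```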
```{r}
ncbi <- read_html(
"https://www.ncbi.nlm.nih.gov/pubmed/?term=herpesvirus+oyster",
encoding="UTF-8"
)
mode(ncbi)
ncbi <- read_html(
"https://www.ncbi.nlm.nih.gov/pubmed/?term=herpesvirus+oyster",
encoding="UTF-8"
)
# titre
ncbi %>%
html_nodes("div.rslt p.title") %>%
html_text()
# Auteur
ncbi %>%
html_nodes("div.rslt p.desc") %>%
html_text()
# Revue
ncbi %>%
html_nodes("div.rslt p.details") %>%
xml_child("span") %>%
xml_attr("title")
# DOIs
ncbi %>%
html_nodes("div.supp p.details") %>%
html_text()
# pb pour la récup des DOI, autre possibilité, récup le "/pubmed/30828244" et refaire une requet de l'URL
#10.3389/fmicb.2019.00473.
#https://doi.org/10.3389/fmicb.2019.00473
#il faudra rajouté "#https://doi.org/" et enlever le point
```
Note: each request only returns a single results page, so we must proceed page by page with a loop that iterates until the last page is reached. We can drive this with the number of items per page, but we must handle the edge case where the last page contains exactly the maximum number of items per page (see the sketch below).
![problem pages](~/Documents/BibliographeR/results/images/gestion_pages.png)
A possible solution: https://www.allinonescript.com/questions/52694072/scape-data-from-a-page-that-uses-jsf-search-using-r
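A minimal sketch of the page-by-page loop described above; `fetch_page()` is a hypothetical helper standing in for whatever retrieves results page i (for example a Selenium "Next" click), and `page_size = 20` is an assumption:
```{r, eval=FALSE}
# Illustrative pagination loop; fetch_page(i) must return the xml_document
# of results page i
collect_titles <- function(total_count, page_size = 20, fetch_page) {
  # ceiling() handles the edge case where the last page holds exactly
  # page_size items: no empty extra page is requested
  n_pages <- ceiling(total_count / page_size)
  titles <- character(0)
  for (i in seq_len(n_pages)) {
    page <- fetch_page(i)
    titles <- c(titles, page %>% html_nodes("div.rslt p.title") %>% html_text())
  }
  titles
}
```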
```{r}
# Example (adapted from the link above)
selServ <- selenium() # This should install the jar and start the server
selServ$log()$stderr  # We need the port number, so look for it in the messages
sel <- remoteDr(browserName = "firefox", port = 4567) # Connect using the port found above (4567 in my case)
# Now, go to the main URL
sel %>%
  go("https://jurispub.admin.ch/publiws/pub/search.jsf")
# Start the scraping process by hitting the initial submit button:
# <input class="iceCmdBtn" id="form:searchSubmitButton" name="form:searchSubmitButton" onblur="setFocus('');" onclick="iceSubmitPartial(form, this, event);return false;" onfocus="setFocus(this.id);" type="submit" value="suchen">
sel %>%
  findElement("name", "form:searchSubmitButton") %>% # find the submit button
  elementClick() # click it
sel %>%
  getPageSource() %>% # like read_html()
  html_node("table.iceDatTbl") -> dtbl # this is the data table
```
```{r}
selServ <- selenium() # This should install the jar and start the server
selServ$log()$stderr  # We need the port number, so look for it in the messages
sel <- remoteDr(browserName = "firefox", port = 4567) # Connect using the port found above (4567 in my case)
# Now, go to the main URL
sel %>%
  go("https://www.ncbi.nlm.nih.gov/pubmed/?term=herpesvirus+oyster")
# The "Next" pager link keeps the same id from page to page; only its page attribute changes:
#<a name="EntrezSystem2.PEntrez.PubMed.Pubmed_ResultsPanel.Pubmed_Pager.Page" title="Next page of results" class="active page_link next" href="#" sid="3" page="4" accesskey="k" #id="EntrezSystem2.PEntrez.PubMed.Pubmed_ResultsPanel.Pubmed_Pager.Page">Next ></a>
#<a name="EntrezSystem2.PEntrez.PubMed.Pubmed_ResultsPanel.Pubmed_Pager.Page" title="Next page of results" class="active page_link next" href="#" sid="3" page="5" accesskey="k" #id="EntrezSystem2.PEntrez.PubMed.Pubmed_ResultsPanel.Pubmed_Pager.Page">Next ></a>
# Start the scraping process by clicking that pager link
sel %>%
  findElement(using = "id", "EntrezSystem2.PEntrez.PubMed.Pubmed_ResultsPanel.Pubmed_Pager.Page") %>% # find the "Next" link
  elementClick() # click it
sel %>%
  getPageSource() %>% # like read_html()
  html_node("table.iceDatTbl") -> dtbl # selector left over from the example above; PubMed needs its own (e.g. "div.rslt")
```
Reformatting the files in Bash
```bash
for file in ./Openaccess/* ;
do
    cut -d"," -f2- $file > tmp_file            # drop the first column
    sed -e '1,1d' < tmp_file > $file           # drop the header line
    tac $file | sed "1,2{d}" | tac > tmp_file  # drop the last two lines
    mv tmp_file $file
done
```
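For reference, a hedged R equivalent of the Bash loop above (assuming the same `./Openaccess/` layout):
```{r, eval=FALSE}
# Sketch of an R equivalent: for each CSV, drop the header line, the two
# trailing lines, and the first comma-separated field
files <- list.files("./Openaccess", full.names = TRUE)
for (f in files) {
  lines <- readLines(f)
  lines <- lines[-1]                  # drop the header line
  lines <- head(lines, -2)            # drop the two trailing lines
  lines <- sub("^[^,]*,", "", lines)  # drop the first field (naive, like cut)
  writeLines(lines, f)
}
```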
```{r}
BasePath <- "~/Documents/BibliographeR/raw/"
# Scratch: inspect one file and the file list before writing the function
datas <- read.table(paste0(BasePath, "2014_SCIE_SSCI_InCites_journal_citation_Reports.csv"),
                    header = T, dec = ".", sep = ",", stringsAsFactors=F)
fichiers <- list.files(path=BasePath,
                       pattern="*.csv")
fichier <- gsub("_InCites_journal_citation_Reports.csv","", fichiers[1])
types <- "Total.Cites"
# Merge the yearly JCR CSV files into one data frame, one column per file,
# keeping only the requested metric (types)
Concat_files <- function(directory, types){
  if (missing(directory)) stop("directory is missing")
  fichiers <- list.files(path=directory,
                         pattern="*.csv")
  if (length(c(types)) == 1) {
    for (fichier in fichiers) {
      datas <- read.table(paste0(directory, fichier),
                          header = T, dec = ".", sep = ",", stringsAsFactors=F)
      # Build the vector of column names from the file names
      fichier <- gsub("_InCites_journal_citation_Reports.csv","",fichier)
      if (exists("Names_VC") == FALSE) {
        Names_VC <- c("Full.Journal.Title")
      }
      Names_VC <- append(Names_VC, fichier)
      #colnames(datas) <- c("Full.Journal.Title", "Total.Cites", "Journal.Impact.Factor", "Eigenfactor.Score")
      # Initialise the accumulator data frame on the first iteration
      if (exists("VCallingDataSet") == FALSE) {
        VCallingDataSet <- data.frame(matrix(NA, nrow = 1, ncol = ncol(datas)))
        colnames(VCallingDataSet) <- colnames(datas)
        VCallingDataSet <- VCallingDataSet[c("Full.Journal.Title", types)]
      }
      VCallingDataSet <- Reduce(function(x, y) merge(x = x, y = y, by = c("Full.Journal.Title"), all=TRUE),
                                list(VCallingDataSet,
                                     datas[c("Full.Journal.Title", types)] ) )
      # After the first merge, drop the all-NA seed column and seed row
      if (ncol(VCallingDataSet) == 3 ) {
        if (sum(is.na(VCallingDataSet[[2]])) == nrow(VCallingDataSet)) {
          VCallingDataSet <- VCallingDataSet[,-2]
          VCallingDataSet <- VCallingDataSet[-nrow(VCallingDataSet),]
        }
      }
      colnames(VCallingDataSet) <- Names_VC
      #rm(VCallingDataSet)
    }
    #VCallingDataSet[is.na(VCallingDataSet)] <- as.numeric(0)
    return(VCallingDataSet)
  } else if (length(c(types)) == 2) {
    writeLines("To be developed")
  } else {
    stop("Too many values for types")
  }
}
Concat_files(BasePath, "Total.Cites")
Concat_files(BasePath, "Journal.Impact.Factor")
Concat_files(BasePath, "Eigenfactor.Score")
```
```{r}
#https://stringr.tidyverse.org/articles/regular-expressions.html
BasePath <- "/home/jean/Documents/BibliographeR/"
dataset1 <- read.csv(paste0(BasePath,"raw/oyster_herpesvirus_pubmed_result.csv"), header = F, dec = ".", sep = ",", stringsAsFactors=F)
dataset1 <- dataset1[,-ncol(dataset1)]
colnames(dataset1) <- dataset1[1,]
dataset1 <- dataset1[-1,]
# DOI section
dataset1$Details[1]
dataset1$DOI <- gsub(".$","", str_match(dataset1$Details, "doi: (.*?) ")[,2] )
TMP_vec <- dataset1 %>%
  mutate(col_test = str_extract(Details, "doi: .+$"))
doi_qui_interest <- c()
for (i in str_split(TMP_vec$col_test, " ") ) {
  # keep the token after "doi:" and strip the trailing period
  doi_qui_interest <- append(doi_qui_interest, gsub("\\.$", "", i[2]))
}
dataset1 <- dataset1 %>%
  mutate(DOI=doi_qui_interest)
dataset1$Details[4]
dataset1$Details[1]
dataset1$DOI <- paste0("https://doi.org/",dataset1$DOI)
dataset1[which(dataset1$DOI == "https://doi.org/NA"),]$DOI <- NA
write.table(dataset1, file = paste0(BasePath,"raw/oyster_herpesvirus_pubmed_result_DOI.csv"), sep = ";", quote=FALSE, row.names = FALSE)
```
```{r}
# Query PubMed and fetch many results
my_query <- 'Damiano Fantini[AU] AND '
my_query <- get_pubmed_ids(my_query)
```