---
title: "Getting started"
author: "Delmotte Jean"
date: "`r format(Sys.time(), '%d %B, %Y')`"
output:
  html_document:
    code_folding: "hide"
    theme: united
    highlight: tango
    number_sections: true
    fig_caption: yes
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
# Function to install packages (if missing) and then load them
ipak <- function(pkg){
  new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
  if (length(new.pkg))
    install.packages(new.pkg, dependencies = TRUE)
  sapply(pkg, require, character.only = TRUE)
}
# usage
packages <- c("ggplot2", "tidyverse", "plotly", "rentrez", "reshape", "rvest", "XML", "RCurl", "easyPubMed", "conflicted")
#packages <- c(packages, "wdman", "RSelenium", "seleniumPipes")
# wdman for managing the Selenium server download
# RSelenium for getting a connection to the Selenium server
# seleniumPipes for better navigation & scraping idioms
ipak(packages)
```
Resources: http://perso.ens-lyon.fr/lise.vaudor/tuto-texte/
# Gathering information with the rentrez package
```{r}
entrez_dbs() # list all available NCBI databases
entrez_db_summary("pubmed")
search_fields <- entrez_db_searchable("pubmed")
search_fields$GRNT
hox_paper <- entrez_search(db="pubmed", term="10.1038/nature08789[doi]")
hox_paper$file
# Count the papers matching a search term, year by year
papers_by_year <- function(years, search_term){
  return(sapply(years, function(y) entrez_search(db="pubmed", term=search_term, mindate=y, maxdate=y, retmax=0)$count))
}
years <- 1990:2015
total_papers <- papers_by_year(years, "")
omics <- c("genomic", "epigenomic", "metagenomic", "proteomic", "transcriptomic", "pharmacogenomic", "connectomic")
trend_data <- sapply(omics, function(t) papers_by_year(years, t))
trend_props <- trend_data/total_papers
trend_df <- melt(data.frame(years, trend_props), id.vars="years")
p <- ggplot(trend_df, aes(years, value, colour=variable)) +
  geom_line(size=1) + scale_y_log10("proportion of papers")
p
```
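Since plotly is loaded, the trend can also be inspected interactively; a small optional extra:
```{r, eval=FALSE}
# Interactive version of the trend plot, using plotly's ggplotly()
ggplotly(p)
```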
```{r}
virus_oysters <- entrez_search(db="pubmed", term="herpesvirus AND oyster")
summary(virus_oysters)
lsf.str("package:rentrez") # list the functions exported by rentrez
virus_oysters$file
# Check the submitted search strings against a dictionary?
# Use paste0() to build the queries (see the sketch after this chunk)
years <- 1960:2019
#total_papers <- papers_by_year(years, "")
recherche_an <- c("disease AND oyster", "herpesvirus AND oyster", "bacteria AND oyster")
trend_data <- sapply(recherche_an, function(t) papers_by_year(years, t))
#trend_props <- trend_data/total_papers
#trend_df <- melt(data.frame(years, trend_props), id.vars="years")
evolution_biblio <- melt(data.frame(years, trend_data), id.vars="years")
colnames(evolution_biblio) <- c("years", "query", "papers")
p <- ggplot(evolution_biblio, aes(years, papers, colour=query)) +
  geom_line(size=1)
p
```
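As suggested in the comment above, the queries can be built programmatically with `paste0()`. A minimal sketch, crossing subject terms with a fixed host term (the `subjects` vector is illustrative, not data from the analysis):
```{r}
# Sketch: build PubMed queries by crossing subject terms with a host term
subjects <- c("disease", "herpesvirus", "bacteria")
queries <- paste0(subjects, " AND oyster")
queries
```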
# Gathering information by scraping with the rvest package
As a first step, I will try to retrieve the title of an article from the main results page:
![output_to_parse_1](~/Documents/BibliographeR/results/images/first_tittle_to_parse.png)
**Image 1**. Screenshot of the first article on the page.
Then I copied and pasted the XPaths of the first 3 titles to see what changes between them:
```html
/html/body/div[2]/div[1]/form/div[1]/div[5]/div/div[5]/div[1]/div[2]/p/a
/html/body/div[2]/div[1]/form/div[1]/div[5]/div/div[5]/div[2]/div[2]/p/a
/html/body/div[2]/div[1]/form/div[1]/div[5]/div/div[5]/div[3]/div[2]/p/a
```
The second-to-last `div[]` is the only part that varies between the 3 titles, so the XPath of the i-th result can be generated from its index.
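A minimal sketch of that idea, assuming the page layout captured above (`title_xpath()` is a hypothetical helper, not part of rvest):
```{r, eval=FALSE}
# Hypothetical helper: build the XPath of the i-th result title by varying
# the second-to-last div[] index observed above
title_xpath <- function(i) {
  sprintf("/html/body/div[2]/div[1]/form/div[1]/div[5]/div/div[5]/div[%d]/div[2]/p/a", i)
}
page <- read_html("https://www.ncbi.nlm.nih.gov/pubmed/?term=herpesvirus+oyster")
sapply(1:3, function(i) page %>% html_node(xpath = title_xpath(i)) %>% html_text())
```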
```{r}
ncbi <- read_html(
"https://www.ncbi.nlm.nih.gov/pubmed/?term=herpesvirus+oyster",
encoding="UTF-8"
)
mode(ncbi)
ncbi <- read_html(
"https://www.ncbi.nlm.nih.gov/pubmed/?term=herpesvirus+oyster",
encoding="UTF-8"
)
# titre
ncbi %>%
html_nodes("div.rslt p.title") %>%
html_text()
# Auteur
ncbi %>%
html_nodes("div.rslt p.desc") %>%
html_text()
# Revue
ncbi %>%
html_nodes("div.rslt p.details") %>%
xml_child("span") %>%
xml_attr("title")
# DOIs
ncbi %>%
html_nodes("div.supp p.details") %>%
html_text()
# pb pour la récup des DOI, autre possibilité, récup le "/pubmed/30828244" et refaire une requet de l'URL
#10.3389/fmicb.2019.00473.
#https://doi.org/10.3389/fmicb.2019.00473
#il faudra rajouté "#https://doi.org/" et enlever le point
```
Note: each request only returns a single results page, so we must proceed page by page with a loop that iterates until the last page is reached. We can drive this with the number of items per page, but we must handle the edge case where the last page contains exactly the maximum number of items per page (see the sketch below).
![problem pages](~/Documents/BibliographeR/results/images/gestion_pages.png)
A possible solution: https://www.allinonescript.com/questions/52694072/scape-data-from-a-page-that-uses-jsf-search-using-r
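A minimal sketch of the page-by-page loop described above; `fetch_page()` is a hypothetical helper standing in for whatever retrieves results page i (for example a Selenium "Next" click), and `page_size = 20` is an assumption:
```{r, eval=FALSE}
# Illustrative pagination loop; fetch_page(i) must return the xml_document
# of results page i
collect_titles <- function(total_count, page_size = 20, fetch_page) {
  # ceiling() handles the edge case where the last page holds exactly
  # page_size items: no empty extra page is requested
  n_pages <- ceiling(total_count / page_size)
  titles <- character(0)
  for (i in seq_len(n_pages)) {
    page <- fetch_page(i)
    titles <- c(titles, page %>% html_nodes("div.rslt p.title") %>% html_text())
  }
  titles
}
```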
```{r}
# Example (adapted from the link above)
selServ <- selenium() # This should install the jar and start the server
selServ$log()$stderr  # We need the port number, so look for it in the messages
sel <- remoteDr(browserName = "firefox", port = 4567) # Connect using the port found above (4567 in my case)
# Now, go to the main URL
sel %>%
  go("https://jurispub.admin.ch/publiws/pub/search.jsf")
# Start the scraping process by hitting the initial submit button:
# <input class="iceCmdBtn" id="form:searchSubmitButton" name="form:searchSubmitButton" onblur="setFocus('');" onclick="iceSubmitPartial(form, this, event);return false;" onfocus="setFocus(this.id);" type="submit" value="suchen">
sel %>%
  findElement("name", "form:searchSubmitButton") %>% # find the submit button
  elementClick() # click it
sel %>%
  getPageSource() %>% # like read_html()
  html_node("table.iceDatTbl") -> dtbl # this is the data table
```
```{r}
selServ <- selenium() # This should install the jar and start the server
selServ$log()$stderr  # We need the port number, so look for it in the messages
sel <- remoteDr(browserName = "firefox", port = 4567) # Connect using the port found above (4567 in my case)
# Now, go to the main URL
sel %>%
  go("https://www.ncbi.nlm.nih.gov/pubmed/?term=herpesvirus+oyster")
# The "Next" pager link keeps the same id from page to page; only its page attribute changes:
#<a name="EntrezSystem2.PEntrez.PubMed.Pubmed_ResultsPanel.Pubmed_Pager.Page" title="Next page of results" class="active page_link next" href="#" sid="3" page="4" accesskey="k" #id="EntrezSystem2.PEntrez.PubMed.Pubmed_ResultsPanel.Pubmed_Pager.Page">Next ></a>
#<a name="EntrezSystem2.PEntrez.PubMed.Pubmed_ResultsPanel.Pubmed_Pager.Page" title="Next page of results" class="active page_link next" href="#" sid="3" page="5" accesskey="k" #id="EntrezSystem2.PEntrez.PubMed.Pubmed_ResultsPanel.Pubmed_Pager.Page">Next ></a>
# Start the scraping process by clicking that pager link
sel %>%
  findElement(using = "id", "EntrezSystem2.PEntrez.PubMed.Pubmed_ResultsPanel.Pubmed_Pager.Page") %>% # find the "Next" link
  elementClick() # click it
sel %>%
  getPageSource() %>% # like read_html()
  html_node("table.iceDatTbl") -> dtbl # selector left over from the example above; PubMed needs its own (e.g. "div.rslt")
```
Reformatting the files in Bash
```bash
for file in ./Openaccess/* ;
do
    cut -d"," -f2- $file > tmp_file            # drop the first column
    sed -e '1,1d' < tmp_file > $file           # drop the header line
    tac $file | sed "1,2{d}" | tac > tmp_file  # drop the last two lines
    mv tmp_file $file
done
```
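For reference, a hedged R equivalent of the Bash loop above (assuming the same `./Openaccess/` layout):
```{r, eval=FALSE}
# Sketch of an R equivalent: for each CSV, drop the header line, the two
# trailing lines, and the first comma-separated field
files <- list.files("./Openaccess", full.names = TRUE)
for (f in files) {
  lines <- readLines(f)
  lines <- lines[-1]                  # drop the header line
  lines <- head(lines, -2)            # drop the two trailing lines
  lines <- sub("^[^,]*,", "", lines)  # drop the first field (naive, like cut)
  writeLines(lines, f)
}
```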
```{r}
BasePath <- "~/Documents/BibliographeR/raw/"
# Scratch: inspect one file and the file list before writing the function
datas <- read.table(paste0(BasePath, "2014_SCIE_SSCI_InCites_journal_citation_Reports.csv"),
                    header = T, dec = ".", sep = ",", stringsAsFactors=F)
fichiers <- list.files(path=BasePath,
                       pattern="*.csv")
fichier <- gsub("_InCites_journal_citation_Reports.csv","", fichiers[1])
types <- "Total.Cites"
# Merge the yearly JCR CSV files into one data frame, one column per file,
# keeping only the requested metric (types)
Concat_files <- function(directory, types){
  if (missing(directory)) stop("directory is missing")
  fichiers <- list.files(path=directory,
                         pattern="*.csv")
  if (length(c(types)) == 1) {
    for (fichier in fichiers) {
      datas <- read.table(paste0(directory, fichier),
                          header = T, dec = ".", sep = ",", stringsAsFactors=F)
      # Build the vector of column names from the file names
      fichier <- gsub("_InCites_journal_citation_Reports.csv","",fichier)
      if (exists("Names_VC") == FALSE) {
        Names_VC <- c("Full.Journal.Title")
      }
      Names_VC <- append(Names_VC, fichier)
      #colnames(datas) <- c("Full.Journal.Title", "Total.Cites", "Journal.Impact.Factor", "Eigenfactor.Score")
      # Initialise the accumulator data frame on the first iteration
      if (exists("VCallingDataSet") == FALSE) {
        VCallingDataSet <- data.frame(matrix(NA, nrow = 1, ncol = ncol(datas)))
        colnames(VCallingDataSet) <- colnames(datas)
        VCallingDataSet <- VCallingDataSet[c("Full.Journal.Title", types)]
      }
      VCallingDataSet <- Reduce(function(x, y) merge(x = x, y = y, by = c("Full.Journal.Title"), all=TRUE),
                                list(VCallingDataSet,
                                     datas[c("Full.Journal.Title", types)] ) )
      # After the first merge, drop the all-NA seed column and seed row
      if (ncol(VCallingDataSet) == 3 ) {
        if (sum(is.na(VCallingDataSet[[2]])) == nrow(VCallingDataSet)) {
          VCallingDataSet <- VCallingDataSet[,-2]
          VCallingDataSet <- VCallingDataSet[-nrow(VCallingDataSet),]
        }
      }
      colnames(VCallingDataSet) <- Names_VC
      #rm(VCallingDataSet)
    }
    #VCallingDataSet[is.na(VCallingDataSet)] <- as.numeric(0)
    return(VCallingDataSet)
  } else if (length(c(types)) == 2) {
    writeLines("To be developed")
  } else {
    stop("Too many values for types")
  }
}
Concat_files(BasePath, "Total.Cites")
Concat_files(BasePath, "Journal.Impact.Factor")
Concat_files(BasePath, "Eigenfactor.Score")
```
```{r}
#https://stringr.tidyverse.org/articles/regular-expressions.html
BasePath <- "/home/jean/Documents/BibliographeR/"
dataset1 <- read.csv(paste0(BasePath,"raw/oyster_herpesvirus_pubmed_result.csv"), header = F, dec = ".", sep = ",", stringsAsFactors=F)
dataset1 <- dataset1[,-ncol(dataset1)]
colnames(dataset1) <- dataset1[1,]
dataset1 <- dataset1[-1,]
# DOI section
dataset1$Details[1]
dataset1$DOI <- gsub(".$","", str_match(dataset1$Details, "doi: (.*?) ")[,2] )
TMP_vec <- dataset1 %>%
  mutate(col_test = str_extract(Details, "doi: .+$"))
doi_qui_interest <- c()
for (i in str_split(TMP_vec$col_test, " ") ) {
  # keep the token after "doi:" and strip the trailing period
  doi_qui_interest <- append(doi_qui_interest, gsub("\\.$", "", i[2]))
}
dataset1 <- dataset1 %>%
  mutate(DOI=doi_qui_interest)
dataset1$Details[4]
dataset1$Details[1]
dataset1$DOI <- paste0("https://doi.org/",dataset1$DOI)
dataset1[which(dataset1$DOI == "https://doi.org/NA"),]$DOI <- NA
write.table(dataset1, file = paste0(BasePath,"raw/oyster_herpesvirus_pubmed_result_DOI.csv"), sep = ";", quote=FALSE, row.names = FALSE)
```
```{r}
# Query PubMed and fetch many results
my_query <- 'Damiano Fantini[AU] AND '
my_query <- get_pubmed_ids(my_query)
```