In [1]:
# rmbranto 2021/08 - original
# rmbranto 2021/10/12 - rewrite

# Demonstrate R programming language 'spocc', 'scrubr' and 'ggplot2' packages to retrieve and combine
# selected invasive species occurrence data from multiple sources, see also: 
# https://ocean.si.edu/ocean-life/5-invasive-species-you-should-know 
# https://docs.ropensci.org/spocc/

options(stringsAsFactors = FALSE)

library(spocc)
library(rinat)
library(sparsesvd)
library(scrubr)
library(stringr)
library(plyr)
library(sf)

#load(file='invasives.rda')
#objects()

########################################################################################
########################################################################################
# setup extraction run ...

# base name shared by the species-list input file and the saved outputs
fName <- 'invasives'

# shapefile locations read later with st_read()
fao.loc <- "geobase/World_Fao_Zones.shp"
eez.loc <- "geobase/EEZ_Land_v3_202030.shp"

# species to query: the 'Names' column of <fName>_list.csv
species.list <- read.csv(paste0(fName, '_list.csv'))
qNames <- species.list$Names

# provider codes: the 'prov' column of prov_list.csv
prov.list <- read.csv('prov_list.csv')
pNames <- prov.list$prov

# maximum number of records requested per query
# (9 keeps a test run short; use 9999 for a full extraction)
limit <- 9

# TRUE runs the extraction step below; set to FALSE to skip it
extract <- TRUE

########################################################################################
########################################################################################
# extract data ...

if(extract){
# do.extract: fetch occurrence records for one species from all providers.
#   qName  - species name used as the query string
#   pNames - provider codes; 'UNIQUE', 'DUPS' and 'inat' are removed before the
#            list is handed to spocc (iNaturalist is queried through rinat instead)
#   limit  - maximum number of records requested (spocc 'limit', rinat 'maxresults')
# Returns a data.frame whose first column is qName, holding the spocc records
# followed by the iNaturalist records (columns name, longitude, latitude, prov,
# date, key). A failed or empty iNaturalist query raises a warning and the
# spocc records alone are returned, so one species cannot abort the whole run.
do.extract<-function(qName,pNames,limit){
    spocc.provs<-pNames[!(pNames %in% c('UNIQUE','DUPS','inat'))]
    occ.df<-occ2df(occ(query = qName,
                       from=spocc.provs,
                       limit=limit,
                       has_coords = TRUE))
    # rep() keeps this valid when no record came back: a length-1 qName
    # cannot be combined with a zero-row frame
    df1<-data.frame(qName=rep(qName,nrow(occ.df)),occ.df)

    inat<-tryCatch(
        get_inat_obs(
            query=qName,quality="research",
            geo=TRUE,
            maxresults=limit),
        error=function(e){
            warning('iNaturalist query failed for ',qName,': ',
                    conditionMessage(e),call.=FALSE)
            NULL
        })
    if(is.null(inat) || nrow(inat)==0){
        return(df1)
    }

    df2<-data.frame(qName=qName,
              name=inat$scientific_name,
              longitude=inat$longitude,
              latitude=inat$latitude,
              prov='inat',
              date=inat$datetime,
              key=inat$id)
    rbind(df1,df2)
}

# query every species, then bind the per-species frames once
# (avoids re-copying a growing data.frame on every iteration)
df.raw<-do.call(rbind,lapply(as.character(qNames),function(qName){
    cat(qName,'...\n')
    do.extract(qName,pNames,limit)
}))
df<-df.raw
}

# record counts per species (rows) and provider (columns), with a 'total'
# row and a 'total' column appended
cat('\n# df.raw counts by species ...\n\n')
invisible(local({
    by.prov<-table(df.raw$qName,df.raw$prov)
    with.row<-rbind(by.prov,total=apply(by.prov,2,sum,na.rm=TRUE))
    with.both<-cbind(with.row,total=apply(with.row,1,sum,na.rm=TRUE))
    print(with.both,na.print='.')
}))

########################################################################################
########################################################################################
# scrub data

# work on a copy of the raw extract; its first column becomes 'species'
df<-df.raw
names(df)[1]<-'species'

# fix coordinates: coerce both columns to numeric and round to 12 decimals
df[c('latitude','longitude')]<-lapply(
    df[c('latitude','longitude')],
    function(v) round(as.numeric(v),12))

# scrubr coordinate filters, applied in this order:
# coord_impossible, coord_incomplete, coord_unlikely
df<-coord_unlikely(coord_incomplete(coord_impossible(dframe(df))))

# fix dates ... applied one after another:
# drop missing dates, dates not after '1111-11-11', and dates after today
df<-df[!is.na(df$date),]
df<-df[as.character(df$date)>'1111-11-11',]
df<-df[df$date<=Sys.Date(),]

# keep the scrubbed result and report its size and date span
df.clean<-df
nrow(df.clean)
range(df.clean$date)

# scrubbed record counts per species and provider, with a 'col.totals' row
# and an 'ALL' column
cat('\n# df.clean counts by species & prov ...\n\n')
invisible(local({
    by.prov<-table(df.clean$species,df.clean$prov)
    with.totals<-rbind(by.prov,col.totals=apply(by.prov,2,sum,na.rm=TRUE))
    print(cbind(with.totals,ALL=apply(with.totals,1,sum,na.rm=TRUE)),na.print='.')
}))

# same counts extended with, per species: ALL records, UNIQUE
# species/longitude/latitude/date combinations, and DUPS = ALL - UNIQUE
cat('\n# df.clean occurrrence counts by species & provs and by ALL, UNIQUE & DUPS ...\n\n')
invisible(local({
    counts<-table(df.clean$species,df.clean$prov)
    counts<-cbind(counts,ALL=apply(counts,1,sum,na.rm=TRUE))
    uniq<-table(count(df.clean,c('species','longitude','latitude','date'))$species)
    counts<-cbind(counts,UNIQUE=uniq)
    counts<-cbind(counts,DUPS=counts[,'ALL']-counts[,'UNIQUE'])
    counts<-rbind(counts,col.totals=apply(counts,2,sum,na.rm=TRUE))
    print(counts,na.print='.')
}))

########################################################################################
########################################################################################
# create df.UNIQUE ...
# df.UNIQUE: one row per distinct species/longitude/latitude/date combination.
# DUPS is the number of df.clean rows sharing that combination and id is a
# sequential identifier for the combination.
cat('\n# df.unique...\n\n')
df.UNIQUE<-count(df.clean,c('species','longitude','latitude','date'))
names(df.UNIQUE)[5]<-'DUPS'
# seq_len() stays empty for an empty frame, where seq.int(0) would give c(1,0)
df.UNIQUE$id<-seq_len(nrow(df.UNIQUE))
cat('UNIQUE=',nrow(df.UNIQUE),'\n')
head(df.UNIQUE)

# df.ALL: every scrubbed record, tagged with the DUPS count and id of the
# combination it belongs to
cat('\n# df.ALL...\n\n')
df.ALL<-merge(df.clean,df.UNIQUE,by=c('species','longitude','latitude','date'))
cat('ALL=',nrow(df.ALL),'\n')
cat('UNIQUE=',length(unique(df.ALL$id)),'\n')
# surplus records = rows in duplicated combinations minus one row per combination
cat('DUPS=',sum(df.ALL$DUPS>1)-length(unique(df.ALL$id[df.ALL$DUPS>1])),'\n')
head(df.ALL)

# df.DUPS: one row per combination that occurs more than once, with its
# record count in DUPS. The column is spelled out in full as 'DUPS' so the
# filter does not depend on `$` partial matching.
cat('\n# df.DUPS...\n\n')
df.DUPS<-count(df.ALL[df.ALL$DUPS>1,],c('species','longitude','latitude','date','id'))
names(df.DUPS)[6]<-'DUPS'
df.DUPS<-df.DUPS[,c('species','longitude','latitude','date','DUPS','id')]
cat('nrows=',nrow(df.DUPS),'\n')
cat('DUPS=',sum(df.DUPS$DUPS)-nrow(df.DUPS),'\n')
head(df.DUPS)

########################################################################################
########################################################################################
# eez and fao indexing

# Tag every distinct coordinate pair in df.ALL with an EEZ code (polygon
# intersection) and an FAO zone (nearest feature), then merge both tags back
# onto df.ALL.
cat('\n# unique PTS...\n\n')

fao.shp<-st_read(fao.loc,quiet=TRUE)
eez.shp<-st_read(eez.loc,quiet=TRUE)

# distinct latitude/longitude pairs (the count column is dropped)
prov.pts<-count(df.ALL,c('latitude','longitude'))[,1:2]
nrow(prov.pts)

cat('\n# EEZ...\n\n')

# NOTE(review): the points are given the shapefile's CRS rather than being
# transformed into it; this assumes the shapefile is in longitude/latitude - confirm
eez.pts<-st_as_sf(prov.pts,coords = c('longitude','latitude'), crs = st_crs(eez.shp))
# one row per (point, polygon) hit: row.id indexes prov.pts, col.id indexes eez.shp
eez.intersect<-data.frame(st_intersects(eez.pts, eez.shp))
nrow(eez.intersect)

# points matching no polygon keep 'UNK'. Vectorised assignment is a no-op
# when there are no hits (a 1:nrow() loop would iterate over c(1,0)) and,
# like the loop, lets the last hit win for a point in several polygons.
prov.pts$eez<-'UNK'
prov.pts$eez[eez.intersect$row.id]<-eez.shp$ISO_TER1[eez.intersect$col.id]

cat('\n# FAO...\n\n')

fao.pts<-st_as_sf(prov.pts,coords = c('longitude','latitude'), crs = st_crs(fao.shp))
# index of the nearest FAO feature for every point, in prov.pts row order
fao.nearest<-data.frame(st_nearest_feature(fao.pts, fao.shp))
nrow(fao.nearest)

# '99' remains only where no zone value is assigned
prov.pts$fao<-'99'
prov.pts$fao[seq_len(nrow(fao.nearest))]<-fao.shp$zone[fao.nearest[,1]]

cat('\n# df.ALL...\n\n')

df.ALL<-merge(df.ALL,prov.pts,by=c('longitude','latitude'))#[,c(3:9,1:2,10:11)]

cat('nrows= ',nrow(df.ALL),'\n')
head(df.ALL,5)
# record counts per FAO zone, largest first
table(df.ALL$fao)[rev(order(table(df.ALL$fao)))]
# top 26 EEZ codes; head() does not pad with NA when fewer than 26 exist
head(table(df.ALL$eez)[rev(order(table(df.ALL$eez)))],26)

Registered S3 method overwritten by 'hoardr':
  method           from
  print.cache_info httr
Linking to GEOS 3.9.1, GDAL 3.3.0, PROJ 8.0.0


Carcinus maenas ...
Caulerpa taxifolia ...
Codium fragile ...
Dreissena polymorpha ...


“No records returned in ALA for Dreissena polymorpha”

Mnemiopsis leidyi ...


“No records returned in ALA for Mnemiopsis leidyi”

Pterois volitans ...
Rapana venosa ...


“No records returned in ALA for Rapana venosa”


# df.raw counts by species ...

                     ala bison gbif idigbio inat obis total
Carcinus maenas        9     9    9       9    9    9    54
Caulerpa taxifolia     9     9    9       9    9    9    54
Codium fragile         9     9    9       9    9    9    54
Dreissena polymorpha   0     9    9       9    9    9    45
Mnemiopsis leidyi      0     9    9       9    9    9    45
Pterois volitans       9     9    9       9    9    9    54
Rapana venosa          0     0    9       9    9    9    36
total                 36    54   63      63   63   63   342


Assuming 'latitude' is latitude
Assuming 'longitude' is longitude



# df.clean counts by species & prov ...

                     ala bison gbif idigbio inat obis ALL
Carcinus maenas        7     9    9       5    9    8  47
Caulerpa taxifolia     5     9    9       7    9    7  46
Codium fragile         3     9    9       8    9    7  45
Dreissena polymorpha   0     9    9       5    9    3  35
Mnemiopsis leidyi      0     5    9       9    9    7  39
Pterois volitans       7     9    9       8    9    5  47
Rapana venosa          0     0    9       8    9    6  32
col.totals            22    50   63      50   63   43 291

# df.clean occurrrence counts by species & provs and by ALL, UNIQUE & DUPS ...

                     ala bison gbif idigbio inat obis ALL UNIQUE DUPS
Carcinus maenas        7     9    9       5    9    8  47     39    8
Caulerpa taxifolia     5     9    9       7    9    7  46     42    4
Codium fragile         3     9    9       8    9    7  45     44    1
Dreissena polymorpha   0     9    9       5    9    3  35     35    0
Mnemi

Unnamed: 0_level_0,species,longitude,latitude,date,DUPS,id
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<date>,<int>,<int>
1,Carcinus maenas,-132.1368,53.24564,2021-10-12,1,1
2,Carcinus maenas,-128.2755,50.67302,2021-10-11,1,2
3,Carcinus maenas,-127.7617,51.48631,2017-07-25,1,3
4,Carcinus maenas,-125.967,49.17315,2021-01-06,1,4
5,Carcinus maenas,-122.3161,37.58452,2021-01-01,1,5
6,Carcinus maenas,-121.888,36.7899,2006-05-25,1,6



# df.ALL...

ALL= 291 
UNIQUE= 271 
DUPS= 20

Unnamed: 0_level_0,species,longitude,latitude,date,name,prov,key,DUPS,id
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<date>,<chr>,<chr>,<chr>,<int>,<int>
1,Carcinus maenas,-0.02023197,49.31939,2021-10-12,Carcinus maenas,inat,98006821,1,21
2,Carcinus maenas,-121.888,36.7899,2006-05-25,carcinus maenas,idigbio,08db409c-e534-4d64-9eaa-9279d27b5df7,1,6
3,Carcinus maenas,-122.31613,37.58452,2021-01-01,"Carcinus maenas (Linnaeus, 1758)",gbif,3018141707,1,5
4,Carcinus maenas,-125.967013,49.17315,2021-01-06,"Carcinus maenas (Linnaeus, 1758)",gbif,3018091186,1,4
5,Carcinus maenas,-127.76165,51.48631,2017-07-25,carcinus maenas,idigbio,0226b0e9-ceb2-41d2-bb3c-5e8e95c95c32,1,3
6,Carcinus maenas,-128.27545938,50.67302,2021-10-11,Carcinus maenas,inat,97969889,1,2



# df.DUPS...

nrows= 10 
DUPS= 20 


Unnamed: 0_level_0,species,longitude,latitude,date,DUPS,id
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<date>,<int>,<int>
1,Carcinus maenas,-54.18,47.1,2011-06-01,8,14
2,Carcinus maenas,149.9283,-37.116,1985-03-25,2,39
3,Caulerpa taxifolia,-83.3572,26.7644,1980-10-29,2,54
4,Caulerpa taxifolia,-80.5667,24.9833,1994-12-01,2,55
5,Caulerpa taxifolia,150.47,-35.25,2000-06-07,2,77
6,Caulerpa taxifolia,151.1113,-34.06925,2021-03-01,2,78



# unique PTS...




# EEZ...



although coordinates are longitude/latitude, st_intersects assumes that they are planar
although coordinates are longitude/latitude, st_intersects assumes that they are planar



# FAO...



although coordinates are longitude/latitude, st_nearest_points assumes that they are planar



# df.ALL...

nrows=  291 


Unnamed: 0_level_0,longitude,latitude,species,date,name,prov,key,DUPS,id,eez,fao
Unnamed: 0_level_1,<dbl>,<dbl>,<chr>,<date>,<chr>,<chr>,<chr>,<int>,<int>,<chr>,<chr>
1,-0.02023197,49.31939,Carcinus maenas,2021-10-12,Carcinus maenas,inat,98006821,1,21,FRA,27
2,-117.245147,32.6679,Codium fragile,2021-01-12,Codium fragile (Suringar) Hariot,gbif,3031703955,1,101,USA,77
3,-117.27236,32.85115,Codium fragile,1968-06-06,codium fragile,idigbio,0106de2f-d1a3-45f2-b9dc-ac876bcd2acb,1,100,USA,77
4,-118.333333,33.35,Codium fragile,1948-03-25,Codium fragile,ala,fbfc311c-1ce4-4724-80be-1e80a946e4e0,1,99,USA,77
5,-118.375805,33.7392,Codium fragile,2021-01-15,Codium fragile (Suringar) Hariot,gbif,3031702977,1,98,USA,77



21 31 37 27 77 81 57 67 71 61 51 41 47 
59 47 38 37 22 21 21 16 13  6  6  3  2 


USA CAN AUS ITA UKR NLD NZL GBR IND DEU MEX DNK JPN IDN GEO EST SWE RUS NOR NCL 
 84  36  33  15   9   9   7   7   6   6   4   4   3   3   3   3   2   2   2   2 
NAM MKD LKA FRA CYM CHE 
  2   2   2   2   2   2 

In [7]:
########################################################################################
########################################################################################
# get taxon.ids for each provider and species 
#

# lookup.taxon.key: request a single record for one species from one provider
# and return the taxon identifier found in it.
#   my.species - species name used as the query string
#   my.prov    - one of 'ala','bison','gbif','idigbio','inat','obis'
# Returns a length-1 character vector; NA when the lookup raises an error or
# the field is missing/empty. A failed lookup is reported with message()
# instead of being dropped silently.
lookup.taxon.key<-function(my.species,my.prov){
    result<-tryCatch(
        switch(my.prov,
            ala=as.ala(occ(query=my.species,from='ala',limit=1))[[1]]$processed$classification$left,
            bison=as.bison(occ(query=my.species,from='bison',limit=1))[[1]]$points$ITIStsn,
            gbif=as.gbif(occ(query=my.species,from='gbif',limit=1, has_coords = TRUE))[[1]][[1]]$hierarchy$key[7],
            idigbio=as.idigbio(occ(query=my.species,from='idigbio',limit=1, has_coords = TRUE))[[1]]$indexTerms$taxonid,
            inat=as.inat(occ(query=my.species,from='inat',limit=1, has_coords = TRUE))[[1]]$results$taxon$id,
            obis=as.obis(occ(query=my.species,from='obis',limit=1))[[1]]$results$aphiaID,
            stop('unsupported provider: ',my.prov)),
        error=function(e){
            message('no ',my.prov,' key for ',my.species,': ',conditionMessage(e))
            NULL
        })
    result<-as.character(result)
    # keep the first value only; NULL or empty results become NA
    if(length(result)==0) NA_character_ else result[1]
}

# one row per species, one character column of taxon keys per provider
prov.keys<-data.frame(species=species.list$Names)

for(my.prov in c('ala','bison','gbif','idigbio','inat','obis')){
    prov.keys[[my.prov]]<-vapply(
        as.character(species.list$Names),
        lookup.taxon.key,
        character(1),
        my.prov=my.prov,
        USE.NAMES=FALSE)
}

prov.keys

“Unknown or uninitialised column: `prov`.”

species,ala,bison,gbif,idigbio,inat,obis
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
Carcinus maenas,284294.0,98734.0,5178595,5178595.0,52523,107381
Caulerpa taxifolia,83052.0,6974.0,2643172,2643172.0,50919,144476
Codium fragile,84082.0,6897.0,5272096,5272096.0,67555,145086
Dreissena polymorpha,,81339.0,2287072,2287072.0,116340,181566
Mnemiopsis leidyi,,53917.0,2501248,2501248.0,180788,106401
Pterois volitans,189453.0,166883.0,2334438,2334438.0,47280,159559
Rapana venosa,,,4363583,,370913,140416


In [43]:
# idigbio not showing Rapana venosa !!
# NOTE(review): the query below asks for 'Pterois volitans', not Rapana venosa
# - confirm which species this probe is meant to fetch
x <- occ(
    query = 'Pterois volitans',
    from = 'idigbio',
    limit = 1,
    has_coords = TRUE
)

In [44]:
# display the taxonid held under indexTerms in the first element of the
# idigbio result stored in x
as.idigbio(x)[[1]]$indexTerms$taxonid

In [None]:
########################################################################################
########################################################################################
# save outputs

# list the workspace, then write every object to the file named by fName
# NOTE(review): both files are written without an extension, while the
# commented-out load() near the top of this file reads 'invasives.rda'
# - confirm the intended file names
objects()
save(list = objects(), file = fName)

# smaller bundle holding only the final tables, lookup lists and shapefiles
save(
    list = c('df.ALL', 'species.list', 'fao.shp', 'eez.shp', 'prov.list', 'prov.keys'),
    file = paste0('short-', fName)
)