In [1]:
# rmbranto 2021/08
# Demonstrate the R programming language 'spocc', 'scrubr' and 'ggplot2' packages to retrieve and combine
# selected invasive species occurrence data from multiple sources, see also: 
# https://ocean.si.edu/ocean-life/5-invasive-species-you-should-know 
# https://docs.ropensci.org/spocc/

# .rda file the raw occurrences are loaded from and the workspace is saved to
fName <- 'invasives.rda'
# record cap handed to do.extract() for every species
limit <- 9999

# build data.frames with character columns left as character
options(stringsAsFactors = FALSE)
library(spocc)
library(rinat)
library(sparsesvd)
library(scrubr)
library(stringr)
library(plyr)
library(sf)

Registered S3 method overwritten by 'hoardr':
  method           from
  print.cache_info httr
Linking to GEOS 3.9.1, GDAL 3.3.0, PROJ 8.0.0


In [2]:
# TRUE  -> query the providers again in the extraction cell
# FALSE -> skip the extraction and rely on the data stored in fName
extract <- FALSE

# provider codes passed as `from` to occ()
# NOTE(review): 'eBird' and 'VertNet' are mixed-case; the provider cross-tab printed later shows
# no ebird/vertnet column - confirm that spocc actually matches these two entries
pNames <- c('ala', 'bison', 'gbif', 'eBird', 'idigbio', 'obis', 'VertNet')

# scientific names used as query strings (one entry per species, same order in all vectors below)
qNames <- c('Carcinus maenas',
            'Caulerpa taxifolia',
            'Codium fragile',
            'Dreissena polymorpha',
            'Mnemiopsis leidyi',
            'Pterois volitans',
            'Rapana venosa')

# common names of the invasive species
cNames <- c('European Green Crab',
            'Killer Algae',
            'Dead Mans Fingers',
            'Zebra Mussel',
            'Sea Walnut',
            'Lion Fish',
            'Veined Rapa Whelk')

# one colour per species
sColors <- c('green', 'cyan', 'magenta', 'red', 'orange', 'yellow', 'dodgerblue')

# one image URL per species
fImages <- c(
    'https://upload.wikimedia.org/wikipedia/commons/1/17/Carcinus_maenas.jpg',
    'https://inaturalist-open-data.s3.amazonaws.com/photos/9434207/medium.jpg?1501502101',
    'https://upload.wikimedia.org/wikipedia/commons/e/ed/Codiumfragile.jpg',
    'https://upload.wikimedia.org/wikipedia/commons/thumb/a/a4/Dreissena_polymorpha_(Zebra_mussel),_Arnhem,_the_Netherlands.jpg/500px-Dreissena_polymorpha_(Zebra_mussel),_Arnhem,_the_Netherlands.jpg',
    'https://inaturalist-open-data.s3.amazonaws.com/photos/9566101/medium.jpg?1545697239',
    'https://static.inaturalist.org/photos/13375992/medium.jpg?1518045182',
    'https://inaturalist-open-data.s3.amazonaws.com/photos/5026554/medium.jpg?1474961472'
)

# per-species lookup table: scientific name, common name, colour, image
species.style <- data.frame(
    Names   = qNames,
    cNames  = cNames,
    sColors = sColors,
    fImages = fImages,
    stringsAsFactors = FALSE
)

In [36]:
# Styling table with one row per provider label plus the two derived labels UNIQUE and DUPS:
# label, short id, colour and an ordering value.
prov.style <- local({
    labels  <- c("UNIQUE", "DUPS", "ala", "bison", "bold", "gbif", "idigbio", "inat", "obis")
    ids     <- c('UNI', 'DUP', 'ala', 'bis', 'bol', 'gbi', 'idi', 'ina', 'obi')
    colours <- c("grey", "white", "red", "orange", "yellow", "green", "cyan", "dodgerblue", "magenta")
    # the providers are ranked 1..7, UNIQUE and DUPS come last (8, 9)
    ranks   <- c(8, 9, 1:7)

    data.frame(prov = labels, id = ids, color = colours, order = ranks)
})

In [None]:
if(extract){

# Retrieve the occurrence records of one species from the spocc providers (pNames) and
# from iNaturalist, and stack them into a single data.frame.
#   qName : scientific name used as the query string
#   limit : maximum number of records requested from each source
# Returns a data.frame whose iNaturalist rows carry the columns qName, name, longitude,
# latitude, prov, date, key (the spocc rows must have the same columns for rbind to work),
# or NULL when neither source returned a row.
do.extract<-function(qName=qNames[1],limit=9999){

    # spocc providers, georeferenced records only
    occ.df<-occ2df(occ(query = qName, from=pNames, limit=limit, has_coords = TRUE))
    # data.frame(qName=..., <0-row frame>) stops with "differing number of rows",
    # so a source without results contributes NULL instead
    df1<-if(NROW(occ.df)>0) data.frame(qName=qName,occ.df) else NULL

    # iNaturalist, research-grade georeferenced observations
    inat.df<-get_inat_obs(
            query=qName,quality="research",
            geo=TRUE,
            maxresults=limit)

    df2<-if(NROW(inat.df)>0) data.frame(qName=qName,
              name=inat.df$scientific_name,
              longitude=inat.df$longitude,
              latitude=inat.df$latitude,
              prov='inat',
              date=inat.df$datetime,
              key=inat.df$id) else NULL

    # rbind() ignores NULL arguments
    return(rbind(df1,df2))
}

# fetch every species, then bind once instead of growing the frame inside the loop
df<-do.call(rbind,lapply(qNames,function(qName){
    cat(qName,'...\n')
    do.extract(qName,limit)
}))
if(is.null(df)) stop('no occurrence records were retrieved')

# explicit print(): values inside this if-block are not auto-displayed
print(nrow(df))
print(table(df$qName,df$prov))
df.raw<-df
}

In [3]:
# Restore the cached raw occurrences (df.raw) from fName.
# - Skipped when extract is TRUE: df.raw has just been rebuilt by the extraction cell and
#   loading the cache would overwrite it with the previously saved data.
# - Only df.raw is copied out of the file. The file is written with save(list=objects()),
#   so a plain load() would also replace every setting defined in the cells above with
#   the values that happened to be saved last time.
if(!extract){
    cache<-new.env()
    load(fName,envir=cache)
    if(!exists('df.raw',envir=cache,inherits=FALSE)) stop("'",fName,"' does not contain df.raw")
    df.raw<-get('df.raw',envir=cache)
    rm(cache)
}
nrow(df.raw); names(df.raw)

In [4]:
# data scrubbing

# work on a copy of the raw data; the first column holds the queried species name
df <- df.raw
names(df)[1] <- 'species'

# add prov='bold': rows whose name starts with 'BOLD' are relabelled
df$prov <- replace(df$prov, which(substr(df$name, 1, 4) == 'BOLD'), 'bold')

# fix coordinates: force numeric, then run the coordinate cleaners in sequence
df[['latitude']]  <- as.numeric(df[['latitude']])
df[['longitude']] <- as.numeric(df[['longitude']])

df <- coord_unlikely(coord_incomplete(coord_impossible(dframe(df))))

# fix dates: keep rows with a non-missing date later than '1111-11-11' and not in the future
df <- df[!is.na(df$date) &
         as.character(df$date) > '1111-11-11' &
         df$date <= Sys.Date(), ]

df.clean <- df
nrow(df.clean)
range(df.clean$date)

Assuming 'latitude' is latitude
Assuming 'longitude' is longitude


In [5]:
# dedup ???
# Disabled experiment (the if(FALSE) guard means it never runs): apply dedup() to the
# first 1000 'Pterois volitans' rows and inspect the "dups" attribute of the result.

if (FALSE) {
    smalldf <- df.clean[df.clean$species == 'Pterois volitans', ][1:1000, ]
    NROW(smalldf)
    dp <- dedup(dframe(smalldf))
    NROW(dp)
    attr(dp, "dups")
}

In [6]:
# find UNIQUEs and DUPS

# df.agg: one row per species/longitude/latitude/prov/date;
# OCCS = number of cleaned records collapsed into that row
df.agg<-aggregate(OCCS~species+longitude+latitude+prov+date,data=data.frame(df.clean,OCCS=1),FUN=sum)
# df.unique: one row per species/longitude/latitude/date/OCCS; count() appends the number
# of df.agg rows in each group as its last column, renamed DUPS here
df.unique<-count(df.agg,c('species','longitude','latitude','date','OCCS'))
names(df.unique)[ncol(df.unique)]<-'DUPS'

# save UNIQUE into df.prov

df.agg$DUPS<-1
df.unique$prov<-'UNIQUE'
# line the columns up with df.agg by name rather than by position
df.unique<-df.unique[,names(df.agg)]
df.prov<-rbind(df.agg,df.unique)
df.prov$year<-as.integer(substr(as.character(df.prov$date),1,4))
df.prov<-df.prov[,c('prov','species','year','date','longitude','latitude','OCCS','DUPS')]

# save DUPS into df.prov

# UNIQUE rows shared by more than one df.agg row are repeated under the label DUPS,
# with DUPS reduced by one and OCCS scaled by that reduced count
df.dups<-df.prov[df.prov$prov=='UNIQUE' & df.prov$DUPS>1,]
df.dups$prov<-'DUPS'
df.dups$DUPS<-df.dups$DUPS-1
df.dups$OCCS<-df.dups$DUPS*df.dups$OCCS
df.prov<-rbind(df.prov,df.dups)

# species x provider row counts with a totals row appended
xtab<-table(df.prov$species,df.prov$prov)
# Columns are picked by name: table() sorts the provider labels and that order depends on
# the locale's collation, so positional indices can select the wrong columns.
# Labels not listed here are kept and shown after the listed ones.
local({
    wanted<-c('ala','bison','bold','gbif','idigbio','inat','obis','UNIQUE','DUPS')
    totals<-rbind(xtab,apply(xtab,2,sum))
    totals[,c(intersect(wanted,colnames(totals)),setdiff(colnames(totals),wanted)),drop=FALSE]
})
range(df.prov$date)
names(df.prov)
nrow(df.prov)

Unnamed: 0,ala,bison,bold,gbif,idigbio,inat,obis,UNIQUE,DUPS
Carcinus maenas,203,1327,7,8675,110,6456,7180,23569,366
Caulerpa taxifolia,378,181,0,955,304,53,490,1800,391
Codium fragile,389,1121,0,5156,721,2314,559,9111,1070
Dreissena polymorpha,0,4504,8,8896,504,1270,394,13533,2029
Mnemiopsis leidyi,0,77,0,973,6,258,1812,2773,349
Pterois volitans,496,661,0,3290,221,1462,2159,7579,665
Rapana venosa,0,0,1,242,9,181,140,549,24
,1466,7871,16,28187,1875,11994,12734,58914,4894


In [7]:
# eez and fao indexing

# distinct coordinate pairs present in df.prov (the count column added by count() is dropped)
prov.pts<-count(df.prov,c('latitude','longitude'))[,1:2]

eez.shp<-st_read("/home/bobbranton/geoserver/data_dir/eez/EEZ_Land_v3_202030.shp",quiet=TRUE)
# the points are tagged with the shapefile's CRS; no reprojection is performed
eez.pts<-st_as_sf(prov.pts,coords = c('longitude','latitude'), crs = st_crs(eez.shp))
# one row per (point, polygon) hit: row.id indexes eez.pts, col.id indexes eez.shp
eez.intersect<-data.frame(st_intersects(eez.pts, eez.shp))
nrow(eez.intersect)

# 'UNK' marks points that fall in no polygon
prov.pts$eez<-'UNK'
# Single vectorised assignment instead of a row-by-row loop; it is a no-op when there are
# no hits, and when a point hits several polygons the last hit wins.
# as.character() keeps the codes as text even if the column was read as a factor.
prov.pts$eez[eez.intersect$row.id]<-as.character(eez.shp$ISO_TER1[eez.intersect$col.id])

although coordinates are longitude/latitude, st_intersects assumes that they are planar
although coordinates are longitude/latitude, st_intersects assumes that they are planar


In [8]:
# FAO zone lookup for the distinct points in prov.pts
fao.shp<-st_read("/home/bobbranton/geoserver/data_dir/fao/World_Fao_Zones.shp",quiet=TRUE)
# the points are tagged with the shapefile's CRS; no reprojection is performed
fao.pts<-st_as_sf(prov.pts,coords = c('longitude','latitude'), crs = st_crs(fao.shp))
# one row per (point, polygon) hit: row.id indexes fao.pts, col.id indexes fao.shp
fao.intersect<-data.frame(st_intersects(fao.pts, fao.shp))
nrow(fao.intersect)

# '99' marks points that fall in no FAO polygon
prov.pts$fao<-'99'
# Single vectorised assignment instead of a row-by-row loop; it is a no-op when there are
# no hits, and when a point hits several polygons the last hit wins.
# as.character() keeps the zone codes as text whatever type the column was read as.
prov.pts$fao[fao.intersect$row.id]<-as.character(fao.shp$zone[fao.intersect$col.id])

although coordinates are longitude/latitude, st_intersects assumes that they are planar
“bounding box has potentially an invalid value range for longlat data”although coordinates are longitude/latitude, st_intersects assumes that they are planar


In [9]:
# Attach the per-point eez / fao codes from prov.pts to every df.prov row.
# eez/fao columns left in df.prov by an earlier run of this cell are dropped first, so
# re-running the cell does not create eez.x/eez.y duplicates.
df.prov<-merge(df.prov[,setdiff(names(df.prov),c('eez','fao')),drop=FALSE],
               prov.pts,by=c('longitude','latitude'))
# reorder by name (merge() puts the key columns first)
df.prov<-df.prov[,c('prov','species','year','date','OCCS','DUPS','longitude','latitude','eez','fao')]
nrow(df.prov)
head(df.prov,5)

Unnamed: 0_level_0,prov,species,year,date,OCCS,DUPS,longitude,latitude,eez,fao
Unnamed: 0_level_1,<chr>,<chr>,<int>,<date>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>
1,inat,Carcinus maenas,2016,2016-06-16,1,1,-0.005321,49.32174,FRA,99
2,DUPS,Carcinus maenas,2016,2016-06-16,1,1,-0.005321,49.32174,FRA,99
3,gbif,Carcinus maenas,2016,2016-06-16,1,1,-0.005321,49.32174,FRA,99
4,UNIQUE,Carcinus maenas,2016,2016-06-16,1,2,-0.005321,49.32174,FRA,99
5,gbif,Dreissena polymorpha,2006,2006-03-02,2,1,-0.008887,51.48449,GBR,99


In [44]:
# show the names of all objects in the workspace, then write every one of them to fName
ls()
save(list = ls(), file = fName)

In [45]:
# write only the result table, the two style tables and the two shapefiles
# to a second file named 'short-' followed by fName
save(df.prov, species.style, fao.shp, eez.shp, prov.style,
     file = paste0('short-', fName))