# Data Preparation for SPOC


In [242]:
import pathlib
import pandas as pd
from lxml import etree

papers = pd.read_json("../data/papers.json")
papers_tei = pathlib.Path("/Users/jpnelson/Google Drive/Shared drives/SUL AI 2020-2021/Project - Species Occurrences/papers_tei")

In [None]:
papers.head()

In [None]:
papers.tail()

## Extracting Full-Text from TEI XML
[GROBID](https://grobid.readthedocs.io/en/latest/) extracts the full-text from the PDFs and saves the result in a
[TEI](https://tei-c.org/) XML document. We then use XPath to extract the full-text within the document.

First, we will create a TEI namespace to simplify the construction of the XPath.

In [2]:
TEI = { 'tei': 'http://www.tei-c.org/ns/1.0'}

## WorMS Marine Species Dataframe


In [243]:
import spacy
import pandas as pd
from spacy_lookup import Entity

In [None]:
taxon = pd.read_csv('/Users/jpnelson/Google Drive/Shared drives/SUL AI 2020-2021/Project - Species Occurrences/WoRMS_marineSpecies/taxon.txt', sep="\t")

In [30]:
sci_id_names = {}
for row in taxon.iterrows():
    sci_id_names[row[1]['taxonID']] = [row[1]['scientificName']]

In [27]:
len(sci_id_names)

597914

In [97]:
sci_names_entity = Entity(keywords_dict=sci_id_names, label="SPECIES")

('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x7fd5bcaa6ca0>)

In [37]:
ph789wb4763_doc = nlp(ph789wb4763_text)

Mysinae

In [48]:
sci_names_entity.keyword_processor.get_keyword('Mysinae')

'urn:lsid:marinespecies.org:taxname:148706'

## Location Dataframe

In [49]:
ca_locations = pd.read_csv('/Users/jpnelson/Google Drive/Shared drives/SUL AI 2020-2021/Project - Species Occurrences/data/NamedPlaces_CA.csv')

In [51]:
ca_locations.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31147 entries, 0 to 31146
Data columns (total 20 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   FEATURE_ID       31147 non-null  int64  
 1   FEATURE_NAME     31147 non-null  object 
 2   FEATURE_CLASS    31147 non-null  object 
 3   STATE_ALPHA      31147 non-null  object 
 4   STATE_NUMERIC    31147 non-null  int64  
 5   COUNTY_NAME      31147 non-null  object 
 6   COUNTY_NUMERIC   31147 non-null  int64  
 7   PRIMARY_LAT_DMS  31147 non-null  object 
 8   PRIM_LONG_DMS    31147 non-null  object 
 9   PRIM_LAT_DEC     31147 non-null  float64
 10  PRIM_LONG_DEC    31147 non-null  float64
 11  SOURCE_LAT_DMS   6791 non-null   object 
 12  SOURCE_LONG_DMS  6791 non-null   object 
 13  SOURCE_LAT_DEC   6791 non-null   float64
 14  SOURCE_LONG_DEC  6791 non-null   float64
 15  ELEV_IN_M        30645 non-null  float64
 16  ELEV_IN_FT       30645 non-null  float64
 17  MAP_NAME    

In [53]:
or_locations = pd.read_csv('/Users/jpnelson/Google Drive/Shared drives/SUL AI 2020-2021/Project - Species Occurrences/data/NamedPlaces_OR.csv')

In [56]:
or_locations.head()

Unnamed: 0,FEATURE_ID,FEATURE_NAME,FEATURE_CLASS,STATE_ALPHA,STATE_NUMERIC,COUNTY_NAME,COUNTY_NUMERIC,PRIMARY_LAT_DMS,PRIM_LONG_DMS,PRIM_LAT_DEC,PRIM_LONG_DEC,SOURCE_LAT_DMS,SOURCE_LONG_DMS,SOURCE_LAT_DEC,SOURCE_LONG_DEC,ELEV_IN_M,ELEV_IN_FT,MAP_NAME,DATE_CREATED,DATE_EDITED
0,1116450,Switch Back Creek,Stream,OR,41,Lane,39,440055N,1221714W,44.015401,-122.287268,440138N,1221620W,44.027222,-122.272222,868.0,2848.0,Harvey Mountain,6/1/92,
1,1116452,Dearborn Island,Island,OR,41,Lane,39,440959N,1221434W,44.166514,-122.242839,,,,,364.0,1194.0,McKenzie Bridge,6/1/92,
2,1116453,Gold Basin Springs,Spring,OR,41,Curry,15,421817N,1235344W,42.30474,-123.895584,,,,,1189.0,3901.0,Tincup Peak,6/1/92,1/2/13
3,1116454,Cedar Camp,Locale,OR,41,Curry,15,421611N,1235944W,42.26983,-123.995637,,,,,978.0,3209.0,Tincup Peak,6/1/92,
4,1116473,Jacks Camp,Locale,OR,41,Curry,15,421801N,1240319W,42.300386,-124.055362,,,,,811.0,2661.0,Big Craggies,6/1/92,


In [57]:
wa_locations = pd.read_csv('/Users/jpnelson/Google Drive/Shared drives/SUL AI 2020-2021/Project - Species Occurrences/data/NamedPlaces_WA.csv')

In [82]:
locations = pd.concat([ca_locations, or_locations, wa_locations])

In [90]:
locations_dict = dict(zip(locations.FEATURE_ID, locations.FEATURE_NAME))

In [93]:
for key in locations_dict.keys():
    place_list = [locations_dict[key],]
    locations_dict[key] = place_list

In [112]:
locations_entity.name = 'location_entity'

In [113]:
locations_entity.name

'location_entity'

In [118]:
nlp = spacy.load('en_core_web_md')

In [119]:
nlp.add_pipe(sci_names_entity)

In [120]:
nlp.add_pipe(locations_entity)

In [121]:
nlp.remove_pipe("ner")

('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x7fd62a83ab80>)

In [244]:
pz596wd3318 = papers_tei/"hms_pz596wd3318.tei.xml"
pz596wd3318_xml = etree.XML(pz596wd3318.read_bytes())

In [245]:
pub_date = pz596wd3318_xml.find("tei:teiHeader/tei:fileDesc/tei:publicationStmt/tei:date", namespaces=TEI)

In [255]:
pub_date.attrib.get('when', '')

'1996-06-06'

In [256]:
def xml2records(xml_path: pathlib.Path) -> pd.DataFrame:
    # Create an XML document from xml path
    tei_xml = etree.XML(xml_path.read_bytes())
    body = tei_xml.find("tei:text/tei:body", namespaces=TEI)
    # Iterate and retreive all of the text
    full_text = ''
    for t in body.itertext():
        full_text += f" {t}"
    # Run Species Pipeline 
    doc = species_nlp(full_text)
    # Get list of species entities with IDs
    species = [(ent.text, sci_names_entity.keyword_processor.get_keyword(ent.text)) for ent in doc.ents]
    species = list(set(species))
    # Run Locations Pipeline
    doc = location_nlp(full_text)
    locations = [ent.text for ent in doc.ents]
    locations = list(set(locations))
    # Extracts Publication Date if it exists
    pub_date_element = tei_xml.find("tei:teiHeader/tei:fileDesc/tei:publicationStmt/tei:date", namespaces=TEI)
    if pub_date_element is not None:
        pub_date = pub_date_element.attrib.get('when', '')
    else:
        pub_date = ''
    records = []
    for row in species:
        record = { 'Paper ID': xml_path.name, 
                   'Instance ID': row[1],
                   'Species': row[0],
                   'GBIF': f"https://www.gbif.org/species/search?q={row[0]}&qField=SCIENTIFIC",
                   'Time': pub_date,
                   'Place': ','.join(locations)}
        records.append(record)
    return pd.DataFrame(records)
        

In [162]:
xml_iterator = papers_tei.iterdir()
first25 = []
for _ in range(25):
    first25.append(next(xml_iterator))

In [224]:
number3recs = xml2records(first25[3])

In [225]:
number3recs

Unnamed: 0,Paper ID,Instance ID,Species,GBIF,Time,Place
0,fhl_2011_Phillips_26637.tei.xml,urn:lsid:marinespecies.org:taxname:181428,DIC,https://www.gbif.org/species/search?q=DIC&qFie...,,"Pacific Ocean,Harney Channel,San Juan Channel,..."


In [235]:
df.tail()

Unnamed: 0,Paper ID,Instance ID,Species,GBIF,Time,Place
138,fhl_2011_Andrykovich_26680.tei.xml,urn:lsid:marinespecies.org:taxname:1839,Ascidiacea,https://www.gbif.org/species/search?q=Ascidiac...,,"Richardson,York,National,Bryant,green,Cannon,C..."
139,fhl_2011_Ho_26600.tei.xml,urn:lsid:marinespecies.org:taxname:535826,aquaria,https://www.gbif.org/species/search?q=aquaria&...,,"Eagle Cove,wave,San Juan Island,Friday Harbor,..."
140,fhl_2011_Ho_26600.tei.xml,urn:lsid:marinespecies.org:taxname:106122,Balanus,https://www.gbif.org/species/search?q=Balanus&...,,"Eagle Cove,wave,San Juan Island,Friday Harbor,..."
141,fhl_2011_Ho_26600.tei.xml,urn:lsid:marinespecies.org:taxname:854086,anemone,https://www.gbif.org/species/search?q=anemone&...,,"Eagle Cove,wave,San Juan Island,Friday Harbor,..."
142,fhl_2011_Ho_26600.tei.xml,urn:lsid:marinespecies.org:taxname:283347,Anthopleura elegantissima,https://www.gbif.org/species/search?q=Anthople...,,"Eagle Cove,wave,San Juan Island,Friday Harbor,..."


In [257]:
start = datetime.datetime.utcnow()
all_records = None
print(f"Start conversion at {start}")
for i, tei_path in enumerate(papers_tei.iterdir()):
    records = xml2records(tei_path)
    if all_records is None:
        all_records = records
    else:
        all_records = pd.concat([all_records, records], ignore_index=True)
    if not i%10:
        print(".", end="")
    if not i%25:
        print(f"{i}", end="")
end = datetime.datetime.utcnow()
print(f"Finished at {end}, total time {(end-start).seconds / 60.} minutes")
    
            

Start conversion at 2021-03-04 21:22:36.413013
.0..25...50..75...100..125...150..175...200..225...250..275...300..325...350..375...400..425...450..475...500..525...550..575...600..625...650..675...700..725...750..775...800..825...850..875...900..925...950..975...1000..1025...1050..1075...1100..1125...1150..1175...1200..1225...1250..1275...1300..1325...1350..1375...1400..1425...1450..1475...1500..1525...1550..1575...1600..1625.Finished at 2021-03-04 21:36:14.385860, total time 13.616666666666667 minutes


In [259]:
all_records.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12820 entries, 0 to 12819
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Paper ID     12820 non-null  object
 1   Instance ID  12820 non-null  object
 2   Species      12820 non-null  object
 3   GBIF         12820 non-null  object
 4   Time         12820 non-null  object
 5   Place        12820 non-null  object
dtypes: object(6)
memory usage: 601.1+ KB


In [260]:
all_records.to_json("../data/species-records.json")