# Data Preparation for SPOC


In [2]:
import datetime
import os
import pathlib

import pandas as pd
from lxml import etree

# Root of the shared project drive. Defaults to the original author's local
# mount; set the SPOC_PROJECT_DIR environment variable to run the notebook on
# another machine without editing this cell.
PROJECT_DIR = pathlib.Path(
    os.environ.get(
        "SPOC_PROJECT_DIR",
        "/Users/jpnelson/Google Drive/Shared drives/SUL AI 2020-2021/Project - Species Occurrences",
    )
)

# Paper metadata, read from the repository's data directory.
papers = pd.read_json("../data/papers.json")
# Directory holding one TEI XML file per paper.
papers_tei = PROJECT_DIR / "papers_tei"

In [None]:
# Preview the first five paper records.
papers.head(5)

In [None]:
# Preview the last five paper records.
papers.tail(5)

## Extracting Full-Text from TEI XML
[GROBID](https://grobid.readthedocs.io/en/latest/) extracts the full-text from the PDFs and saves the result in a
[TEI](https://tei-c.org/) XML document. We then use XPath to extract the full-text within the document.

First, we will create a TEI namespace to simplify the construction of the XPath.

In [2]:
# Prefix -> namespace URI mapping passed as `namespaces=` to lxml find/findall
# so XPath expressions can use the short `tei:` prefix.
TEI = dict(tei='http://www.tei-c.org/ns/1.0')

## WoRMS Marine Species DataFrame


In [37]:
import spacy
import pandas as pd
from spacy_lookup import Entity

In [36]:
# Load the tab-separated WoRMS taxon table from the shared project drive.
taxon = pd.read_csv(
    '/Users/jpnelson/Google Drive/Shared drives/SUL AI 2020-2021/Project - Species Occurrences/WoRMS_marineSpecies/taxon.txt',
    sep="\t",
)

In [38]:
# Map each taxonID to a one-element list holding its scientific name (the
# same {id: [name]} shape is later handed to Entity(keywords_dict=...)).
# Zipping the two columns avoids DataFrame.iterrows(), which builds a full
# Series for every row just to read two fields. As before, a repeated
# taxonID keeps the name from its last row.
sci_id_names = {
    taxon_id: [scientific_name]
    for taxon_id, scientific_name in zip(taxon['taxonID'], taxon['scientificName'])
}

In [39]:
import pickle

In [43]:
# Leftover interactive help lookup (IPython `?` syntax): it only displays the
# signature and docstring of pickle.load and defines nothing used later.
pickle.load?

[0;31mSignature:[0m
[0mpickle[0m[0;34m.[0m[0mload[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mfile[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0;34m*[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mfix_imports[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mencoding[0m[0;34m=[0m[0;34m'ASCII'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0merrors[0m[0;34m=[0m[0;34m'strict'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mbuffers[0m[0;34m=[0m[0;34m([0m[0;34m)[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Read and return an object from the pickle data stored in a file.

This is equivalent to ``Unpickler(file).load()``, but may be more
efficient.

The protocol version of the pickle is detected automatically, so no
protocol argument is needed.  Bytes past the pickled object's
representation are ignored.

The argument *file* must have two methods, a read() method that takes
an integer argument, and 

In [42]:
# Persist the taxonID -> [scientificName] lookup so it can be reloaded
# without re-reading the taxon table. Plain write-binary mode is enough:
# the handle is only written to, never read back.
with open("../data/scientific-names.pkl", "wb") as fo:
    pickle.dump(sci_id_names, fo)

In [97]:
# spacy_lookup component that tags matches of the scientific names with the
# SPECIES label; it is added to the spaCy pipeline further down.
sci_names_entity = Entity(
    keywords_dict=sci_id_names,
    label="SPECIES",
)

('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x7fd5bcaa6ca0>)

## Location DataFrame

In [7]:
# Named-places table for California, read from the shared project drive.
ca_locations = pd.read_csv(
    '/Users/jpnelson/Google Drive/Shared drives/SUL AI 2020-2021/Project - Species Occurrences/data/NamedPlaces_CA.csv'
)

In [8]:
# Summarize the California table: column names, non-null counts and dtypes.
ca_locations.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31147 entries, 0 to 31146
Data columns (total 20 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   FEATURE_ID       31147 non-null  int64  
 1   FEATURE_NAME     31147 non-null  object 
 2   FEATURE_CLASS    31147 non-null  object 
 3   STATE_ALPHA      31147 non-null  object 
 4   STATE_NUMERIC    31147 non-null  int64  
 5   COUNTY_NAME      31147 non-null  object 
 6   COUNTY_NUMERIC   31147 non-null  int64  
 7   PRIMARY_LAT_DMS  31147 non-null  object 
 8   PRIM_LONG_DMS    31147 non-null  object 
 9   PRIM_LAT_DEC     31147 non-null  float64
 10  PRIM_LONG_DEC    31147 non-null  float64
 11  SOURCE_LAT_DMS   6791 non-null   object 
 12  SOURCE_LONG_DMS  6791 non-null   object 
 13  SOURCE_LAT_DEC   6791 non-null   float64
 14  SOURCE_LONG_DEC  6791 non-null   float64
 15  ELEV_IN_M        30645 non-null  float64
 16  ELEV_IN_FT       30645 non-null  float64
 17  MAP_NAME    

In [9]:
# Named-places table for Oregon, read from the shared project drive.
or_locations = pd.read_csv(
    '/Users/jpnelson/Google Drive/Shared drives/SUL AI 2020-2021/Project - Species Occurrences/data/NamedPlaces_OR.csv'
)

In [10]:
# Preview the first five Oregon rows.
or_locations.head(5)

Unnamed: 0,FEATURE_ID,FEATURE_NAME,FEATURE_CLASS,STATE_ALPHA,STATE_NUMERIC,COUNTY_NAME,COUNTY_NUMERIC,PRIMARY_LAT_DMS,PRIM_LONG_DMS,PRIM_LAT_DEC,PRIM_LONG_DEC,SOURCE_LAT_DMS,SOURCE_LONG_DMS,SOURCE_LAT_DEC,SOURCE_LONG_DEC,ELEV_IN_M,ELEV_IN_FT,MAP_NAME,DATE_CREATED,DATE_EDITED
0,1116450,Switch Back Creek,Stream,OR,41,Lane,39,440055N,1221714W,44.015401,-122.287268,440138N,1221620W,44.027222,-122.272222,868.0,2848.0,Harvey Mountain,6/1/92,
1,1116452,Dearborn Island,Island,OR,41,Lane,39,440959N,1221434W,44.166514,-122.242839,,,,,364.0,1194.0,McKenzie Bridge,6/1/92,
2,1116453,Gold Basin Springs,Spring,OR,41,Curry,15,421817N,1235344W,42.30474,-123.895584,,,,,1189.0,3901.0,Tincup Peak,6/1/92,1/2/13
3,1116454,Cedar Camp,Locale,OR,41,Curry,15,421611N,1235944W,42.26983,-123.995637,,,,,978.0,3209.0,Tincup Peak,6/1/92,
4,1116473,Jacks Camp,Locale,OR,41,Curry,15,421801N,1240319W,42.300386,-124.055362,,,,,811.0,2661.0,Big Craggies,6/1/92,


In [11]:
# Named-places table for Washington, read from the shared project drive.
wa_locations = pd.read_csv(
    '/Users/jpnelson/Google Drive/Shared drives/SUL AI 2020-2021/Project - Species Occurrences/data/NamedPlaces_WA.csv'
)

In [13]:
# Stack the three state tables (CA, OR, WA) into one frame. Each frame keeps
# its own row labels, so index values repeat across states.
locations = pd.concat(
    [
        ca_locations,
        or_locations,
        wa_locations,
    ]
)

In [14]:
# FEATURE_ID -> FEATURE_NAME lookup; a repeated FEATURE_ID keeps its last name.
locations_dict = {
    feature_id: feature_name
    for feature_id, feature_name in zip(locations.FEATURE_ID, locations.FEATURE_NAME)
}

In [93]:
# Wrap every place name in a one-element list, giving the same
# {id: [name]} shape as the species lookup passed to Entity(keywords_dict=...).
# The isinstance guard makes the cell safe to re-run: without it, a second
# execution would nest the lists again ([[name]]).
for key, value in locations_dict.items():
    if not isinstance(value, list):
        locations_dict[key] = [value]

In [310]:
# spacy_lookup component that tags matches of the place names with the
# LOCATION label.
locations_entity = Entity(
    keywords_dict=locations_dict,
    label="LOCATION",
)

In [311]:
# Give the component its own name before it is added to the pipeline.
# NOTE(review): presumably needed so the Entity components do not collide on a
# shared default name when added to the pipeline; confirm against spacy_lookup.
locations_entity.name = 'location_entity'

In [312]:
# Display the component name to confirm the assignment took effect.
locations_entity.name

'location_entity'

In [45]:
# Serialize the FEATURE_ID -> [FEATURE_NAME] lookup and write the pickle bytes
# to the repository's data directory in one step.
pathlib.Path("../data/locations.pkl").write_bytes(pickle.dumps(locations_dict))

## Habitat DataFrame

In [307]:
import json

# Collect the 'pattern' value from every JSON Lines record in habitats.jsonl.
# The file is streamed line by line instead of being loaded with readlines(),
# and blank lines (e.g. a trailing newline at the end of the file) are skipped
# because json.loads would otherwise raise on them.
habitats = []
with open(
    "/Users/jpnelson/Google Drive/Shared drives/SUL AI 2020-2021/Project - Species Occurrences/data/habitats.jsonl",
    encoding="utf-8",
) as fo:
    for row in fo:
        if not row.strip():
            continue
        line = json.loads(row)
        # .get() yields None for a record without a 'pattern' key, as before.
        habitats.append(line.get('pattern'))

In [314]:
# spacy_lookup component that tags matches of the habitat patterns with the
# HABITAT label, given its own component name.
habitats_entity = Entity(
    keywords_list=habitats,
    label="HABITAT",
)
habitats_entity.name = 'habitat_entity'

## Construct spaCy Pipeline

In [315]:
# Build the pipeline: start from the medium English model, append the three
# lookup components (species, locations, habitats, in that order), then drop
# the model's statistical "ner" component so only lookup entities remain.
# NOTE(review): the lookup components can still propose overlapping spans;
# see the E103 "conflicting doc.ents" error recorded later in this notebook.
nlp = spacy.load('en_core_web_md')
for lookup_component in (sci_names_entity, locations_entity, habitats_entity):
    nlp.add_pipe(lookup_component)
nlp.remove_pipe("ner")

('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x7fd56f8b4640>)

In [244]:
# Parse one sample TEI file to explore its structure.
pz596wd3318 = papers_tei.joinpath("hms_pz596wd3318.tei.xml")
pz596wd3318_bytes = pz596wd3318.read_bytes()
pz596wd3318_xml = etree.XML(pz596wd3318_bytes)

In [245]:
# Locate the publication <date> element in the TEI header of the sample paper.
pub_date = pz596wd3318_xml.find(
    "tei:teiHeader/tei:fileDesc/tei:publicationStmt/tei:date",
    namespaces=TEI,
)

In [255]:
# Read the `when` attribute of the date element, or '' when it is absent.
pub_date.get('when', '')

'1996-06-06'

In [324]:
def xml2records(xml_path: pathlib.Path) -> pd.DataFrame:
    """Extract species-occurrence records from one TEI XML file.

    Every ``tei:div`` directly under the TEI body is flattened to text and run
    through the notebook-level ``nlp`` pipeline. One record is produced for
    each distinct (instance ID, species text) pair found in a div; all records
    of a div share the same comma-joined places and habitats found in it.

    Relies on the notebook-level names ``TEI``, ``nlp`` and
    ``sci_names_entity``.

    Args:
        xml_path: Path to the TEI XML document to process.

    Returns:
        A DataFrame with one row per record and the columns ``Paper ID``,
        ``Instance ID``, ``Species``, ``GBIF``, ``Time``, ``Place``,
        ``Habitats`` and ``div_enum`` (1-based position of the div). The
        frame has no rows and no columns when no species is found.

    Raises:
        Any exception raised while parsing the XML or while running ``nlp``
        on a div propagates to the caller.
    """
    # Create an XML document from the path
    tei_xml = etree.XML(xml_path.read_bytes())
    divs = tei_xml.findall("tei:text/tei:body/tei:div", namespaces=TEI)

    # Publication date from the TEI header, or '' when the element or its
    # `when` attribute is missing
    pub_date_element = tei_xml.find(
        "tei:teiHeader/tei:fileDesc/tei:publicationStmt/tei:date", namespaces=TEI
    )
    if pub_date_element is not None:
        pub_date = pub_date_element.attrib.get('when', '')
    else:
        pub_date = ''

    records = []
    # Iterate over each div, retrieve all of its text, and run NER
    for div_number, div in enumerate(divs, start=1):
        # Each text fragment is prefixed with a single space, built in one pass
        text = "".join(f" {fragment}" for fragment in div.itertext())
        doc = nlp(text)

        species_mentions, places, habitats = [], [], []
        for ent in doc.ents:
            if ent.label_.startswith("SPECIES"):
                # Pair the lookup key registered for this keyword with the
                # matched text
                instance_id = sci_names_entity.keyword_processor.get_keyword(ent.text)
                species_mentions.append((instance_id, ent.text))
            elif ent.label_.startswith("LOCATION"):
                places.append(ent.text)
            elif ent.label_.startswith("HABITAT"):
                habitats.append(ent.text)

        # De-duplicate while keeping first-occurrence order. dict.fromkeys is
        # used instead of set() so the record order and the joined Place and
        # Habitats strings are the same on every run.
        unique_species = list(dict.fromkeys(species_mentions))
        place_text = ','.join(dict.fromkeys(places))
        habitat_text = ','.join(dict.fromkeys(habitats))

        for instance_id, species_text in unique_species:
            records.append(
                {
                    'Paper ID': xml_path.name,
                    'Instance ID': instance_id,
                    'Species': species_text,
                    # NOTE(review): the query uses the instance ID, not the
                    # species text, with qField=SCIENTIFIC, and the value is
                    # not URL-encoded; confirm this is intended.
                    'GBIF': f"https://www.gbif.org/species/search?q={instance_id}&qField=SCIENTIFIC",
                    'Time': pub_date,
                    'Place': place_text,
                    'Habitats': habitat_text,
                    'div_enum': div_number,
                }
            )
    return pd.DataFrame(records)
        

In [328]:
# Run xml2records over every file in the TEI directory and combine the results.
start = datetime.datetime.utcnow()
print(f"Start conversion at {start}")

record_frames = []    # one DataFrame per successfully processed paper
failed_papers = {}    # file name -> exception, for inspection afterwards
for i, tei_path in enumerate(papers_tei.iterdir()):
    try:
        record_frames.append(xml2records(tei_path))
    # Catch Exception rather than using a bare except so that Ctrl-C
    # (KeyboardInterrupt) still stops the run, and report the cause instead
    # of hiding it.
    except Exception as error:
        failed_papers[tei_path.name] = error
        print(f"Error with {tei_path.name}: {error!r}")
        continue
    # Progress: a dot every 10 files, the running count every 25 files
    if not i%10:
        print(".", end="")
    if not i%25 and i > 0:
        print(f"{i}", end="")

# Concatenate once at the end; concatenating inside the loop re-copies all
# accumulated rows on every iteration. An empty frame is used when nothing
# was processed so later cells can still call DataFrame methods.
if record_frames:
    all_records = pd.concat(record_frames, ignore_index=True)
else:
    all_records = pd.DataFrame()

end = datetime.datetime.utcnow()
# total_seconds() covers the whole interval; .seconds would drop whole days
print(f"Finished at {end}, total time {(end-start).total_seconds() / 60.:.1f} minutes")

Start conversion at 2021-03-05 18:19:48.792341
...25.Error with fhl_2011_Clark_26624.tei.xml
..50..75...100..125...150..175.Error with fhl_2012_Townsend_27041.tei.xml
.200..225...250..275.Error with fhl_2014_Cougan_27609.tei.xml
..300..325...350..375...400..425...450..475...500..525...550..575...600..625.Error with hms_cj258ns3486.tei.xml
..650..675...700..725...750..775...800..825...850..875...900..925.Error with hms_tb429nk3829.tei.xml
..950.Error with hms_sf858vn4653.tei.xml
.975..Error with hms_vp112bk1362.tei.xml
.1000..Error with hms_vz980wk8856.tei.xml
...1050..1075Error with hms_zt342rt9331.tei.xml
.Error with hms_zx897dq3818.tei.xml
Error with hms_zz467pc7918.tei.xml
.1100..1125..Error with hms_td830hd7966.tei.xml
..1175...1200..1225...1250Error with hms_zd610ry2635.tei.xml
..1275...1300..1325...1350..1375...1400..1425...1450..1475...1500..1525...1550..1575...1600..1625.Finished at 2021-03-05 18:25:24.880164, total time 5.6 minutes


In [305]:
# Write the combined records to the repository's data directory as JSON.
all_records.to_json(
    "../data/species-records.json",
)

In [329]:
# Re-run one of the papers reported as failing by the batch conversion, this
# time without the try/except, so the underlying exception is shown (the
# recorded output is a spaCy E103 "conflicting doc.ents" ValueError).
fhl = papers_tei.joinpath("fhl_2012_Townsend_27041.tei.xml")
fhl_df = xml2records(fhl)

ValueError: [E103] Trying to set conflicting doc.ents: '(135, 136, 'SPECIES')' and '(134, 136, 'LOCATION')'. A token can only be part of one entity, so make sure the entities you're setting don't overlap.

In [None]:
# Post-mortem debugger session for the exception raised by the previous cell;
# interactive only, it defines nothing used later.
%debug

> [0;32m/Users/jpnelson/02021/sul-dlss/labs/spoc/doc/doc.pyx[0m(578)[0;36mspacy.tokens.doc.Doc.ents.__set__[0;34m()[0m



ipdb>  doc.ents


*** NameError: name 'doc' is not defined


ipdb>  u


> [0;32m/Users/jpnelson/02021/sul-dlss/labs/ml-env/lib/python3.9/site-packages/spacy_lookup/__init__.py[0m(62)[0;36m__call__[0;34m()[0m
[0;32m     60 [0;31m[0;34m[0m[0m
[0m[0;32m     61 [0;31m        [0;31m# Overwrite doc.ents and add entity – be careful not to replace![0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 62 [0;31m        [0mdoc[0m[0;34m.[0m[0ments[0m [0;34m=[0m [0mlist[0m[0;34m([0m[0mdoc[0m[0;34m.[0m[0ments[0m[0;34m)[0m [0;34m+[0m [0mspans[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     63 [0;31m[0;34m[0m[0m
[0m[0;32m     64 [0;31m        [0;32mfor[0m [0mspan[0m [0;32min[0m [0mspans[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  doc.ents


(Clara, Clara, bacteria)


ipdb>  spans


[Elwha, Santa Clara, Santa Clara, delta, Eel River, delta, delta, delta]


In [1]:
# Number of taxon IDs in the species lookup. This needs the cell that builds
# sci_id_names to have run in the current kernel; the recorded output is a
# NameError from running it on a fresh kernel.
len(sci_id_names)

NameError: name 'sci_id_names' is not defined