# Data Preparation for SPOC


In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import datetime
import pathlib
import pandas as pd
from lxml import etree
import spacy
from spacy_lookup import Entity
# Directory whose entries are iterated and fed to etl.process_xml later in this notebook.
# NOTE(review): absolute path into one user's Google Drive mount — presumably will not
# resolve on other machines; consider a configurable data directory.
papers_tei = pathlib.Path("/Users/jpnelson/Google Drive/Shared drives/SUL AI 2020-2021/Project - Species Occurrences/papers_tei")

## Extracting Full-Text from TEI XML
[GROBID](https://grobid.readthedocs.io/en/latest/) extracts the full-text from the PDFs and saves the result in a
[TEI](https://tei-c.org/) XML document.

## WoRMS Marine Species DataFrame


In [6]:
# Read the tab-separated taxon table into a DataFrame.
taxon = pd.read_csv(
    filepath_or_buffer='/Users/jpnelson/Google Drive/Shared drives/SUL AI 2020-2021/Project - Species Occurrences/WoRMS_marineSpecies/taxon.txt',
    sep="\t",
)

In [13]:
# Keep three columns of the taxon table and expose `references` under the name `URL`.
species = (
    taxon[['taxonID', 'scientificName', 'references']]
    .rename(columns={"references": "URL"})
)

In [14]:
# Save the species table; it is read back with pd.read_json further down.
species.to_json(path_or_buf="../data/species.json")

## Location DataFrame

In [17]:
# Named places for California (CSV with default comma separator).
ca_locations = pd.read_csv(
    filepath_or_buffer='/Users/jpnelson/Google Drive/Shared drives/SUL AI 2020-2021/Project - Species Occurrences/data/NamedPlaces_CA.csv',
)

In [18]:
# Named places for Oregon (CSV with default comma separator).
or_locations = pd.read_csv(
    filepath_or_buffer='/Users/jpnelson/Google Drive/Shared drives/SUL AI 2020-2021/Project - Species Occurrences/data/NamedPlaces_OR.csv',
)

In [19]:
# Named places for Washington (CSV with default comma separator).
wa_locations = pd.read_csv(
    filepath_or_buffer='/Users/jpnelson/Google Drive/Shared drives/SUL AI 2020-2021/Project - Species Occurrences/data/NamedPlaces_WA.csv',
)

In [20]:
# Stack the three per-state tables row-wise and renumber the index from zero.
locations = pd.concat(
    objs=[ca_locations, or_locations, wa_locations],
    ignore_index=True,
)

In [21]:
# Narrow the table to id, name, state and coordinates, then give the last three shorter names.
locations = locations[
    ['FEATURE_ID', 'FEATURE_NAME', 'STATE_ALPHA', 'PRIM_LONG_DEC', 'PRIM_LAT_DEC']
].rename(
    columns={
        'STATE_ALPHA': 'STATE',
        'PRIM_LONG_DEC': 'LONGITUDE',
        'PRIM_LAT_DEC': 'LATITUDE',
    }
)

In [25]:
# Save the combined locations table; it is read back with pd.read_json further down.
locations.to_json(path_or_buf="../data/locations.json")

In [85]:
# Map each FEATURE_ID to its FEATURE_NAME (a later duplicate id overwrites an earlier one).
locations_dict = dict(
    zip(locations["FEATURE_ID"], locations["FEATURE_NAME"])
)

## Habitat DataFrame

In [30]:
import json

# Collect the "pattern" value of every JSON Lines record into a one-column DataFrame.
# A separate list name is used so `habitats` only ever refers to the DataFrame.
habitat_terms = []
with open("/Users/jpnelson/Google Drive/Shared drives/SUL AI 2020-2021/Project - Species Occurrences/data/habitats.jsonl", encoding="utf-8") as fo:
    # Iterate the file object directly instead of loading every line with readlines().
    for row in fo:
        if not row.strip():
            # A blank line (e.g. a trailing newline) is not valid JSON; skip it rather than crash.
            continue
        # Records without a "pattern" key contribute None, as before.
        habitat_terms.append(json.loads(row).get('pattern'))
habitats = pd.DataFrame(habitat_terms, columns=['Habitat'])

In [32]:
# Save the habitat table; it is read back with pd.read_json further down.
habitats.to_json(path_or_buf="../data/habitats.json")

## Construct spaCy Pipeline
Create a spaCy `nlp` pipeline from the existing [en_core_web_md](https://spacy.io/models/en#en_core_web_md) English pipeline and then add a new spaCy Entity Ruler to it.

In [2]:
# Load the pretrained English pipeline and attach an entity ruler with pattern validation on.
nlp = spacy.load('en_core_web_md')
ruler = nlp.add_pipe(
    factory_name="entity_ruler",
    config=dict(validate=True),
)

### Create Species, Locations, and Habitat Pattern Dictionaries

In [4]:
# Build one token-level pattern per species: every whitespace-separated word of the
# scientific name becomes a case-insensitive (LOWER) token spec, and taxonID is the pattern id.
species_patterns = []
# Iterate the two needed columns directly; iterrows() builds a Series per row and is much slower.
for taxon_id, scientific_name in zip(species['taxonID'], species['scientificName']):
    # A missing name is not a string (NaN/None) and has no .split(); skip it.
    if not isinstance(scientific_name, str):
        continue
    # split() without an argument collapses repeated, leading and trailing whitespace, so the
    # pattern never contains an empty-string token spec (which no token could match).
    words = scientific_name.lower().split()
    if not words:
        continue
    species_patterns.append({
        'label': 'SPECIES',
        'id': taxon_id,
        'pattern': [{"LOWER": word} for word in words],
    })

In [5]:
# Build one token-level pattern per named place: every whitespace-separated word of
# FEATURE_NAME becomes a case-insensitive (LOWER) token spec; the id is FEATURE_ID as a string.
# NOTE(review): the label here is "LOCATIONS" while the location_entity factory in this
# notebook uses 'LOCATION' — confirm which one the downstream ETL expects.
location_patterns = []
# Iterate the two needed columns directly; iterrows() builds a Series per row and is much slower.
for feature_id, feature_name in zip(locations['FEATURE_ID'], locations['FEATURE_NAME']):
    # A missing name is not a string (NaN/None) and has no .split(); skip it.
    if not isinstance(feature_name, str):
        continue
    # split() without an argument collapses repeated, leading and trailing whitespace, so the
    # pattern never contains an empty-string token spec (which no token could match).
    words = feature_name.lower().split()
    if not words:
        continue
    location_patterns.append({
        "label": "LOCATIONS",
        'id': str(feature_id),
        'pattern': [{"LOWER": word} for word in words],
    })

In [6]:
# Build one token-level pattern per habitat term: every whitespace-separated word becomes a
# case-insensitive (LOWER) token spec. Habitat patterns carry no id.
habitat_patterns = []
# Iterate the column directly; iterrows() builds a Series per row and is much slower.
for habitat in habitats['Habitat']:
    # Rows without a term are not strings (None/NaN) and have no .split(); skip them.
    if not isinstance(habitat, str):
        continue
    # split() without an argument collapses repeated, leading and trailing whitespace, so the
    # pattern never contains an empty-string token spec (which no token could match).
    words = habitat.lower().split()
    if not words:
        continue
    habitat_patterns.append({
        "label": "HABITAT",
        'pattern': [{'LOWER': word} for word in words],
    })

Add the `species`, `locations`, and `habitat` pattern dictionaries to the Entity Ruler.

In [7]:
# Register the pattern lists with the ruler in the order species, locations, habitats.
for pattern_group in (species_patterns, location_patterns, habitat_patterns):
    ruler.add_patterns(pattern_group)

Load helper functions from the `lib/etl.py` module.

In [8]:
import sys, os
# Add ../src to the module search path before importing lib.etl (presumably where the
# `lib` package lives — confirm).
# NOTE(review): "../src" is a relative entry, so this likely depends on the kernel's
# working directory being the notebook's own folder.
sys.path.append("../src")
import lib.etl as etl

In [75]:
# Run etl.process_xml over every entry of `papers_tei` and gather the per-paper frames
# into `all_records` (left as None when nothing could be processed).
start = datetime.datetime.now(datetime.timezone.utc)  # timezone-aware; utcnow() is deprecated
all_records = None
paper_frames = []
print(f"Start conversion at {start}")
for i, tei_path in enumerate(papers_tei.iterdir()):
    try:
        records = etl.process_xml(tei_path.read_bytes(), tei_path.name, nlp)
    except Exception as error:
        # Catch Exception instead of a bare `except:` so a kernel interrupt still stops the
        # run, and report the cause rather than only the file name.
        print(f"Error with {tei_path.name}: {error!r}")
        continue
    paper_frames.append(records)
    # Lightweight progress output: a dot every 10 files, the running index every 25.
    if not i%10 and i > 0:
        print(".", end="")
    if not i%25 and i > 0:
        print(f"{i}", end="")
# Concatenate once; calling pd.concat inside the loop re-copies all earlier rows each time.
if paper_frames:
    all_records = pd.concat(paper_frames, ignore_index=True)
end = datetime.datetime.now(datetime.timezone.utc)
# total_seconds() covers the whole duration; timedelta.seconds silently drops whole days.
print(f"Finished at {end}, total time {(end - start).total_seconds() / 60:.1f} minutes")

Start conversion at 2021-03-15 20:56:10.125722
..25Finished at 2021-03-15 22:01:16.233183, total time 65.1 minutes


In [305]:
# Save the combined records produced by the conversion loop.
all_records.to_json(path_or_buf="../data/records.json")

In [9]:
# Process one paper on its own so its records can be inspected in isolation.
fhl = papers_tei.joinpath("fhl_2012_Townsend_27041.tei.xml")
# fhl.name is the file-name component of the path, i.e. the same string passed before.
fhl_df = etl.process_xml(fhl.read_bytes(), fhl.name, nlp)

In [14]:
# Preview the first rows of the single-paper result.
fhl_df.head()

Unnamed: 0,Paper ID,Instance ID,Species,GBIF,Time,Place,Habitats,div_enum
0,fhl_2012_Townsend_27041.tei.xml,urn:lsid:marinespecies.org:taxname:6,bacteria,https://www.gbif.org/species/search?q=bacteria...,,delta,"river,rivers",4
1,fhl_2012_Townsend_27041.tei.xml,urn:lsid:marinespecies.org:taxname:6,bacteria,https://www.gbif.org/species/search?q=bacteria...,,delta,"marine,river,river mouth",5
2,fhl_2012_Townsend_27041.tei.xml,urn:lsid:marinespecies.org:taxname:6,bacteria,https://www.gbif.org/species/search?q=bacteria...,,"Elwha,case","downstream,river,river mouth",21


In [15]:
# Path of a second paper inside the TEI directory.
fhl_paper1 = papers_tei.joinpath("fhl_2011_Bockmon_26635.tei.xml")

In [71]:
# Display the last species pattern as a spot check of the generated structure.
species_patterns[-1]

{'label': 'SPECIES',
 'id': 'urn:lsid:marinespecies.org:taxname:1338746',
 'pattern': [{'LOWER': 'relicanthus'}, {'LOWER': 'daphneae'}]}

In [79]:
# List the pipeline's component names to confirm the entity ruler was added.
nlp.component_names

['tok2vec',
 'tagger',
 'parser',
 'senter',
 'ner',
 'attribute_ruler',
 'lemmatizer',
 'entity_ruler']

In [3]:
# Reload the three lookup tables from the JSON files written earlier in this notebook.
species, locations, habitats = (
    pd.read_json(json_path)
    for json_path in (
        '../data/species.json',
        '../data/locations.json',
        '../data/habitats.json',
    )
)

In [8]:
# id -> name lookups for species and locations (a later duplicate id overwrites an earlier one).
species_dict = dict(zip(species["taxonID"], species["scientificName"]))
location_dict = dict(zip(locations["FEATURE_ID"], locations["FEATURE_NAME"]))

In [9]:
# Wrap every name in a one-element list; these dicts are later passed to Entity as keywords_dict.
# The isinstance guard keeps the cell idempotent: on a re-run, values that are already lists
# are left alone instead of being nested one level deeper each time.
# Only existing keys are reassigned, so mutating while iterating is safe here.
for key, val in species_dict.items():
    if not isinstance(val, list):
        species_dict[key] = [val]
for key, val in location_dict.items():
    if not isinstance(val, list):
        location_dict[key] = [val]

In [3]:
from spacy.language import Language

@Language.factory(name='species_entity')
def create_species_entity(nlp: Language, name: str):
    """Factory registered as ``species_entity``.

    Builds a spacy_lookup ``Entity`` named ``name`` from the module-level
    ``species_dict`` with the label ``SPECIES``. ``nlp`` is part of the
    factory signature but is not used in the body.
    """
    species_component = Entity(
        keywords_dict=species_dict,
        label='SPECIES',
        name=name,
    )
    return species_component

@Language.factory(name='location_entity')
def create_location_entity(nlp: Language, name: str):
    """Factory registered as ``location_entity``.

    Builds a spacy_lookup ``Entity`` named ``name`` from the module-level
    ``location_dict`` with the label ``LOCATION``. ``nlp`` is part of the
    factory signature but is not used in the body.
    """
    location_component = Entity(
        keywords_dict=location_dict,
        label='LOCATION',
        name=name,
    )
    return location_component

@Language.factory(name='habitat_entity')
def create_habitat_entity(nlp: Language, name: str):
    """Factory registered as ``habitat_entity``.

    Builds a spacy_lookup ``Entity`` named ``name`` from the values of the
    ``Habitat`` column of the module-level ``habitats`` frame, with the label
    ``HABITAT``. ``nlp`` is part of the factory signature but is not used in
    the body.
    """
    habitat_terms = list(habitats["Habitat"])
    habitat_component = Entity(
        keywords_list=habitat_terms,
        label='HABITAT',
        name=name,
    )
    return habitat_component

In [None]:
# Rebuild the pipeline with the three lookup-based entity components registered in this
# notebook, then remove the model's own "ner" component.
nlp = spacy.load('en_core_web_md')
for factory_name in ('species_entity', 'location_entity', 'habitat_entity'):
    nlp.add_pipe(factory_name)
nlp.remove_pipe("ner")

In [9]:
# Run etl.process_xml over every entry of `papers_tei` with the current `nlp` pipeline and
# gather the per-paper frames into `all_records` (left as None when nothing could be processed).
start = datetime.datetime.now(datetime.timezone.utc)  # timezone-aware; utcnow() is deprecated
all_records = None
paper_frames = []
print(f"Start conversion at {start}")
for i, tei_path in enumerate(papers_tei.iterdir()):
    try:
        records = etl.process_xml(tei_path.read_bytes(), tei_path.name, nlp)
    except Exception as error:
        # Catch Exception instead of a bare `except:` so a kernel interrupt still stops the
        # run, and report the cause — a bare file name gives no way to diagnose the failure.
        print(f"Error with {tei_path.name}: {error!r}")
        continue
    paper_frames.append(records)
    # Lightweight progress output: a dot every 10 files, the running index every 25.
    if not i%10 and i > 0:
        print(".", end="")
    if not i%25 and i > 0:
        print(f"{i}", end="")
# Concatenate once; calling pd.concat inside the loop re-copies all earlier rows each time.
if paper_frames:
    all_records = pd.concat(paper_frames, ignore_index=True)
end = datetime.datetime.now(datetime.timezone.utc)
# total_seconds() covers the whole duration; timedelta.seconds silently drops whole days.
print(f"Finished at {end}, total time {(end - start).total_seconds() / 60:.1f} minutes")

Start conversion at 2021-03-16 16:36:40.133033
Error with fhl_2011_Brezicha_25959.tei.xml
Error with fhl_2011_Bockmon_26635.tei.xml
Error with fhl_2011_Broell_26677.tei.xml
Error with fhl_2011_Phillips_26637.tei.xml
Error with fhl_2011_Pierce_26647.tei.xml
Error with fhl_2011_Paxton_25964.tei.xml
Error with fhl_2011_Olmstead_26608.tei.xml
Error with fhl_2011_Paxton_25962.tei.xml
Error with fhl_2011_Lowe_26644.tei.xml
Error with fhl_2011_LobatodeCarvalhoMartins_26645.tei.xml
Error with fhl_2011_Little_26605.tei.xml
Error with fhl_2011_Lipscomb_26604.tei.xml
Error with fhl_2011_Lincoln_26623.tei.xml
Error with fhl_2011_Lavelle_26625.tei.xml
Error with fhl_2011_Lee_25961.tei.xml
Error with fhl_2011_Pratt_26609.tei.xml
Error with fhl_2011_Pietsch_26638.tei.xml
Error with fhl_2011_Krauzser_26603.tei.xml
Error with fhl_2011_Raymond_26989.tei.xml
Error with fhl_2011_Barrera-Martinez_26627.tei.xml
Error with fhl_2011_Hoang_26576.tei.xml
Error with fhl_2011_Hoang_26634.tei.xml
Error with fhl_20

In [11]:
# One of the papers reported as failing in the batch run above; the assignment was
# duplicated verbatim, so a single statement is kept.
fhl_paper1 = papers_tei/"fhl_2011_Bockmon_26635.tei.xml"

In [12]:
# Process the single paper outside the batch loop's try/except so that any exception
# it raises is shown in full.
records = etl.process_xml(fhl_paper1.read_bytes(), fhl_paper1.name, nlp)

In [13]:
# Display the records extracted from the single paper.
records

Unnamed: 0,Paper ID,Instance ID,Species,GBIF,Time,Place,Habitats,div_enum
0,fhl_2011_Bockmon_26635.tei.xml,urn:lsid:marinespecies.org:taxname:603085,trossulus,https://www.gbif.org/species/search?q=trossulu...,,"north end,tide",,3
1,fhl_2011_Bockmon_26635.tei.xml,urn:lsid:marinespecies.org:taxname:112078,ammonia,https://www.gbif.org/species/search?q=ammonia&...,,,,11
2,fhl_2011_Bockmon_26635.tei.xml,urn:lsid:marinespecies.org:taxname:112078,Ammonia,https://www.gbif.org/species/search?q=Ammonia&...,,,,11
3,fhl_2011_Bockmon_26635.tei.xml,urn:lsid:marinespecies.org:taxname:112078,ammonia,https://www.gbif.org/species/search?q=ammonia&...,,,,12


In [16]:
# NOTE(review): puts the single control character "\x04" on the system clipboard; this
# looks like leftover scratch code — confirm whether it is still needed.
from pandas.io.clipboard import copy
copy("\x04")