In [1]:
# Import all necessary libraries
import glob
import pandas as pd
import spacy
from spacy import displacy
from spacy_lookup import Entity

In [2]:
# Read in the taxon data as a dataframe
taxon = pd.read_csv('WoRMS/taxon.txt', sep="\t")

In [3]:
# See what the taxon data looks like
taxon.head

<bound method NDFrame.head of                                            taxonID  \
0             urn:lsid:marinespecies.org:taxname:1   
1             urn:lsid:marinespecies.org:taxname:2   
2             urn:lsid:marinespecies.org:taxname:3   
3             urn:lsid:marinespecies.org:taxname:4   
4             urn:lsid:marinespecies.org:taxname:5   
5             urn:lsid:marinespecies.org:taxname:6   
6             urn:lsid:marinespecies.org:taxname:7   
7             urn:lsid:marinespecies.org:taxname:8   
8             urn:lsid:marinespecies.org:taxname:9   
9            urn:lsid:marinespecies.org:taxname:10   
10           urn:lsid:marinespecies.org:taxname:11   
11           urn:lsid:marinespecies.org:taxname:12   
12           urn:lsid:marinespecies.org:taxname:13   
13           urn:lsid:marinespecies.org:taxname:24   
14           urn:lsid:marinespecies.org:taxname:25   
15           urn:lsid:marinespecies.org:taxname:26   
16           urn:lsid:marinespecies.org:taxname:51  

In [4]:
# Extract just the scientific names to a list
sci_names = taxon["scientificName"].to_list()

In [5]:
# See how many scientific names we have
len(sci_names)

597914

In [6]:
# Load Spacy's medium size English model
nlp = spacy.load("en_core_web_md")

In [7]:
# Create a new pipeline step using spacy_lookup.
# We use the scientific names as the keyword dictionary, and use MARINE as the entity label
entity = Entity(keywords_list=sci_names, label="MARINE")

In [8]:
# Add our new pipeline step to our pipeline
nlp.add_pipe(entity)

In [9]:
# We have to remove the normal NER step since entities can only be in one ent type, and it causes conflicts
nlp.remove_pipe("ner")

('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x1209b2168>)

In [10]:
# Identify the directory containing all the papers we want to process
text_dir = "allpapers"

In [11]:
# Creates lists of filenames and read in texts from each file in the text_dir
fns = []
texts = []
for fn in glob.glob(text_dir + "/*.txt"):
    fns.append(fn.split("/")[1])
    with open(fn, 'r') as f:
        texts.append(f.read())

In [12]:
# Stream the full texts into the NLP model with the additional entity id step and return a list
docs= list(nlp.pipe(texts))

In [14]:
# Create a list of sets of entities for every doc
# We have to get the text representation of each ent for set to identify unique values
ent_sets = [set([ent.text.lower() for ent in doc.ents]) for doc in docs]

In [15]:
# Build a dictionary with the filenames and corresponding entity sets
data = {
    "filename": fns,
    "ents": ent_sets
}

In [16]:
# Create a dataframe with filenames and entities
ent_df = pd.DataFrame(data = data)

In [17]:
ent_df

Unnamed: 0,filename,ents
0,ucla_1989_MarKelvin.txt,"{astropecten brasiliensis armatus, polyacanthu..."
1,usc_1998_Sp_Carr.txt,"{julia, monica, sheila, boca, arbor, bacteria}"
2,ucla_1981_WuestehubeLindaJ.txt,"{spirobranchus spinosus, here, halichoeres}"
3,hms_1989_MillerStephenE.txt,"{anthopleura, clinocottus analis, oxylebius pi..."
4,ucla_1975_ChapmanJohn_Asculpta.txt,"{acanthomysis sculpta, mysis, macrocystis, aca..."
5,usc_1974_SpFa_Lissner.txt,"{centrostephanus coronatus, io, corona, strong..."
6,usc_1978_Butsumyo_etal.txt,"{perca, afer, lo, io, wak, nu, ino, pao, uva}"
7,usc_1991_Sp_Hentschke.txt,"{callianassa, callianassidae, upogebiidae, che..."
8,ucla_1981_HughesBrian.txt,"{dictyota, paralabrax clathratus, io, loligo, ..."
9,usc_1971_Coyer.txt,"{spirobranchus, lo, nereis diversicolor, hydro..."


In [16]:
# Save the dataframe to a csv file
ent_df.to_csv('all_papers_ents.csv')

In [None]:
# make this a pure python script with a cli interface by way of click
# parameters would be the taxon file and the dir name for the directory of full texts in plain text format