In [None]:
# Install all requirements quietly. Uncomment the line below on first run;
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
#%pip install -q -r requirements.txt

# Sample NER Workflow for DigiVol

Read data from a DigiVol CSV file and pass it through the spaCy NER pipeline.

In [None]:
import spacy
import csv
import geocoder
import pandas as pd
import utils
%load_ext autoreload
%autoreload 2

In [None]:
# Name of the spaCy pipeline package used for NER in this notebook.
model = 'en_core_web_md'

# One-off setup: uncomment to fetch the pipeline if it is not installed yet.
# spacy.cli.download(model)

# Load the pipeline once; `nlp` is reused by the visualisation cell below.
nlp = spacy.load(name=model)

We first read the data using our utility function.  This gives us a data frame with the text in one column.

In [None]:
# Load the DigiVol export with the project helper, then discard the columns
# this workflow never looks at. `drop(columns=...)` returns a new frame, so the
# cell gives the same `texts` every time it is run.
texts = utils.read_digivol_csv('data/Project-1536729-DwC.csv').drop(
    columns=[
        'exportComment',
        'transcriberID',
        'validationStatus',
        'validatorID',
        'dateValidated',
        'individualCount',
        'locality',
        'transcriberNotes',
        'recordedBy',
    ]
)
texts.head()

## NER

We now perform NER on the text using the spaCy library.  For each document we generate a list of location entities and, for each entity, record a snippet of text around the occurrence.  The result is a DataFrame containing the placename, the context and the document identifier from the original data frame.

In [None]:
# Run the NER helper over the text column without an entity-type filter,
# tagging each result with the record's external identifier.
entities = utils.apply_ner(
    texts,
    ident='externalIdentifier',
    text='text',
)
entities.head()

In [None]:
# Same NER pass, but restricted to the 'GPE' entity label so that only
# place-name candidates are kept.
locations = utils.apply_ner(
    texts,
    ident='externalIdentifier',
    text='text',
    keep_entities=['GPE'],
)
locations.head()

## Visualisation

spaCy can be used to visualise the NER results in the notebook.  This might not be too useful but illustrates what is possible.

In [None]:
from spacy import displacy
# `IPython.display` is the public import path; importing `display` from
# `IPython.core.display` is deprecated and fails on recent IPython releases.
from IPython.display import HTML, display

# Run the pipeline on the first document. `.iloc[0]` selects the first row by
# position, so this works whatever index labels the frame carries.
doc = nlp(texts['text'].iloc[0])

# jupyter=False makes displacy return the markup string. With auto-detection it
# would display the markup itself inside a notebook and return None, leaving
# the HTML wrapper below with nothing to show.
display(HTML(displacy.render(doc, style='ent', jupyter=False)))

## Geocoding

We can use the `geocoder` module to submit these place names to a geocoding service.  Here we use the Geonames service and make a new table with the results.

In [None]:
# Geocode the extracted place names with the project helper and keep the
# result under the same name, which the CSV export cell below relies on.
# NOTE(review): because the input is overwritten, re-running this cell alone
# passes the already-geocoded table back in -- presumably harmless, but
# confirm against utils.geolocate_locations; re-run the NER cell first if unsure.
locations = utils.geolocate_locations(locations)
locations

In [None]:
# Write the locations table to a CSV file in the current working directory.
locations.to_csv(path_or_buf="digivol-locations.csv")