In [1]:
# install all requirements quietly
!pip install -q -r requirements.txt

# Sample NER Workflow for DigiVol

Read data from a DigiVol CSV file and pass it through the spaCy NER (named entity recognition) pipeline.

In [12]:
import csv
import os

import geocoder
import pandas as pd
import spacy
from IPython.display import HTML, display
from spacy import displacy


In [42]:
# Load the small English spaCy model, downloading it only when it is not
# already installed so that re-running the notebook does not hit the network
# every time.
MODEL_NAME = 'en_core_web_sm'

try:
    nlp = spacy.load(MODEL_NAME)
except OSError:
    # spacy.load raises OSError when the model package cannot be found.
    spacy.cli.download(MODEL_NAME)
    nlp = spacy.load(MODEL_NAME)


[93m    Linking successful[0m
    /opt/conda/lib/python3.6/site-packages/en_core_web_sm -->
    /opt/conda/lib/python3.6/site-packages/spacy/data/en_core_web_sm

    You can now load the model via spacy.load('en_core_web_sm')



We first read the data from the CSV file using the pandas csv reader.  The transcript can be found in the `occurrenceRemarks` column of the resulting data frame.

In [24]:
# Load the DigiVol export into a data frame, then preview the column that
# holds the transcribed text.
dv = pd.read_csv('data/Project-1536729-DwC.csv')
dv.loc[:, 'occurrenceRemarks']

0     Purchased this Diary for which I paid 10/ and ...
1     Went to the Gaol with Requisition for Altering...
2     Paid Cabman 10 shillings for cab to Benevolent...
3     Rather seedy first thing this morning but cont...
4     “Seedy again”. This will not do. I must turn o...
5     Made my Returns out for the past week and sent...
6     My life is proceeding most unsatisfactory each...
7     Went to the Main Gaol the first thing this mor...
8     Got up early noticed some irregularity of the ...
9     Forwarded Letter to the Sheriff suggesting the...
10    Availed myself of Mr Dentist Hearle (sic) by c...
11    My gentle lunatic Sang, howled, blasphemed and...
12    Called at the Gaol and borrowed a razor for th...
13    Went to church this morning. Collection in aid...
14    Made out and sent in my Week’s Returns. The Vi...
15    Called on the Sheriff with reference to the ad...
16    Mr Wintle called this morning, previous I had ...
Name: occurrenceRemarks, dtype: object

In [31]:
# Number of tokens of surrounding context to keep on each side of an entity.
CONTEXT_TOKENS = 2

places = []

# Run every transcript through the spaCy pipeline and keep the entities that
# are labelled GPE, each with a short snippet of surrounding text.
# Rows without a transcript (NaN) are skipped because nlp() needs a string.
for i, text in dv['occurrenceRemarks'].dropna().items():
    doc = nlp(text)
    for ent in doc.ents:
        # Compare by value: `is` tests object identity and only matched
        # before by accident of string interning.
        if ent.label_ != "GPE":
            continue
        # Clamp the window start at 0; a negative start would be read as an
        # offset from the end of the document and give a wrong/empty context.
        window_start = max(ent.start - CONTEXT_TOKENS, 0)
        window = doc[window_start:ent.end + CONTEXT_TOKENS]
        context = " ".join(w.text for w in window)
        places.append({'placename': ent.text, 'context': context, 'doc': i})

# Naming the columns keeps them present even when no place was found, so the
# later cells that read `locations['placename']` still work.
locations = pd.DataFrame(places, columns=['context', 'doc', 'placename'])
locations

Unnamed: 0,context,doc,placename
0,Government at Balaarat .,0,Balaarat
1,. Mr Ireland the Counsel,0,Ireland
2,of the Governt . to,0,Governt
3,"excitement in the Town , and",0,the Town
4,"sister at Geelong , enclosing",0,Geelong
5,passing across Richmond Paddock .,1,Richmond
6,"steered for Melbourne , leaving",1,Melbourne
7,Ball this Evening was an,2,Evening
8,"Newby at Richmond , and",3,Richmond
9,in the Evening and called,5,Evening


In [36]:
# Show the first transcript with its named entities highlighted.
# (`displacy`, `display` and `HTML` are imported in the import cell at the top.)
doc = nlp(dv['occurrenceRemarks'][0])

# jupyter=False makes displacy return the markup as a string instead of
# displaying it itself, so we can hand it to IPython's HTML display here.
entity_html = displacy.render(doc, style='ent', jupyter=False)
display(HTML(entity_html))

In [None]:
## Geocoding

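We can use the `geocoder` module to submit these place names to the GeoNames web service and look up a latitude, longitude, address and country for each one. The lookups are not restricted to a region, so an ambiguous name can resolve to the wrong place (for example, Richmond is matched in the United States below).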

In [41]:

# GeoNames account used for the lookups. It is read from the environment so
# the notebook can be run with another account without editing the code; the
# previous hard-coded value is kept as the default.
geonames_user = os.environ.get('GEONAMES_USERNAME', 'austalk')

# Placeholder row used when GeoNames returns nothing (or the request fails).
NO_MATCH = {'lat': 0, 'lng': 0, 'address': '', 'country': ''}

# Successful lookups are cached by place name so a name that occurs several
# times costs one request. Failures are not cached, so a transient error
# (e.g. a timeout) is retried the next time the same name comes up.
geo_cache = {}
geo_rows = []
for place in locations['placename']:
    if place not in geo_cache:
        g = geocoder.geonames(place, key=geonames_user)
        if g:
            geo_cache[place] = {'lat': g.lat, 'lng': g.lng,
                                'address': g.address, 'country': g.country}
    geo_rows.append(geo_cache.get(place, NO_MATCH))

# One row per entry of `locations`, in the same order.
geo = pd.DataFrame(geo_rows, columns=['address', 'country', 'lat', 'lng'])
geo

Status code Unknown from http://api.geonames.org/searchJSON: ERROR - HTTPConnectionPool(host='api.geonames.org', port=80): Read timed out. (read timeout=5.0)


Unnamed: 0,address,country,lat,lng
0,,,0.0,0.0
1,Ireland,Ireland,53.0,-8.0
2,,,0.0,0.0
3,Cape Town,South Africa,-33.92584,18.42322
4,Geelong,Australia,-38.14711,144.36069
5,Richmond,United States,37.55376,-77.46026
6,Melbourne,Australia,-37.814,144.96332
7,Evening Shade,United States,36.07174,-91.61931
8,Richmond,United States,37.55376,-77.46026
9,Evening Shade,United States,36.07174,-91.61931
