In [1]:
import pandas as pd
import regex
import spacy
from spacy.matcher import Matcher, PhraseMatcher
from spacy.pipeline import EntityRuler

In [2]:
# --- Input files -----------------------------------------------------------
# Two CSV exports live side by side; INPUT_CSV selects the one this run reads.
TRAIN_CSV = 'data/label_babel/label_babel_train_2020-01-02.csv'
VALID_CSV = 'data/label_babel/label_babel_valid_2020-01-02.csv'
INPUT_CSV = VALID_CSV

# --- Column names used from the CSV ----------------------------------------
INPUT_COL = 'predicted_text'        # text fed to the spaCy pipeline
PRED_CAT = 'predicted_category'     # printed next to each result
SUB_ID = 'subject_id'               # printed as the row identifier

In [3]:
# Ask spaCy to use the GPU when one is available, then load the English model.
spacy.prefer_gpu()
nlp = spacy.load('en_core_web_md')

# Read every column as a string and switch off NA detection so that empty
# cells stay as empty strings instead of becoming NaN.
df = pd.read_csv(INPUT_CSV, dtype=str, na_filter=False)

# Plain Python list of the label texts, in file order.
texts = list(df[INPUT_COL])

In [4]:
# Inspect how the pipeline tokenizes and tags the first label only:
# one line per token with part-of-speech, lemma, and the raw token text.
first_doc = next(iter(nlp.pipe(texts)), None)
if first_doc is not None:
    for token in first_doc:
        print(f'{token.pos_}\t{token.lemma_}\t{token.text}')

NOUN	FLORA	FLORA
ADP	of	OF
PROPN	ARK	ARK
PUNCT	:	:
SPACE	

	


PROPN	CORNACEAE	CORNACEAE
PROPN	Cornus	Cornus
PROPN	drummondii	drummondii
PROPN	C	C
PROPN	A.	A.
PROPN	Mey	Mey
PUNCT	.	.
PUNCT	,	,
SPACE	
	

ADJ	rough	rough
PUNCT	-	-
NOUN	leaved	leaved
NOUN	dogwood	dogwood
PUNCT	.	.
SPACE	

	


PROPN	YELL	YELL
PROPN	COUNTY	COUNTY
PUNCT	:	:
PROPN	Casa	Casa
PROPN	AR	AR
NUM	7.5	7.5
PUNCT	'	'
NOUN	topo	Topo
PUNCT	.	.
INTJ	quad	Quad
PUNCT	.	.
SPACE	
	

ADP	along	Along
PROPN	Mill	Mill
PROPN	Creek	Creek
PUNCT	,	,
ADV	just	just
ADV	north	north
ADP	of	of
DET	the	the
NOUN	intersection	intersection
ADP	of	of
SPACE	
	

PROPN	Hwys	Hwys
PUNCT	.	.
NUM	154	154
CCONJ	and	and
NUM	155	155
PUNCT	,	,
ADV	just	just
ADV	west	west
ADP	of	of
PROPN	New	New
PROPN	Neely	Neely
PUNCT	,	,
NOUN	approx	approx
PUNCT	.	.
SPACE	
	

NUM	7.5	7.5
NOUN	mile	miles
ADV	south	south
ADP	of	of
PROPN	AR	AR
PROPN	Hwy	Hwy
PUNCT	.	.
NUM	7	7
PUNCT	;	;
VERB	locate	located
ADP	in	in
NOUN	creekside	creekside
SPACE	
	

NOUN	thicket	thicket
PUN

In [5]:
# Two rule-based components that will receive the same date patterns:
# an EntityRuler (used as a pipeline component) and a stand-alone Matcher.
# validate=True asks spaCy to validate the patterns that get added.
ruler = EntityRuler(nlp, validate=True)
matcher = Matcher(nlp.vocab, validate=True)

In [6]:
# Building blocks (single-token specs) for the date patterns.

# Alternation of month names and abbreviations for a verbose-mode regex.
# Entries are regex fragments, so 'sept?' covers both "sep" and "sept".
month_names = ' | '.join("""
    january february march april may june july
    august september october november december
    jan feb mar apr jun jul aug sept? oct
    nov dec""".split())

# A token whose lowercased text is exactly one of the month names above
# (the ^ ... $ anchors force a whole-token match).
month = {'LOWER': {'REGEX': f'(?x) ^ (?: {month_names} ) $ '}}

# Optional literal "date" token (case-insensitive).
label = {'LOWER': 'date', 'OP': '?'}

# Optional single punctuation token, e.g. the comma in "May 15, 1977".
punct = {'IS_PUNCT': True, 'OP': '?'}

# A token made up only of '/', '_' or '-' characters.
sep = {'TEXT': {'REGEX': r'(?x) ^ [/_-]+ $'}}

# A 4-digit number starting with 1 or 2, or any 1-2 digit number.
# Raw string: '\d' in a normal string literal is an invalid escape sequence
# and triggers a warning; the pattern text itself is unchanged.
digits = {'TEXT': {'REGEX': r'(?x) ^ ( [12]\d{3} | \d{1,2} ) $'}}

In [7]:
# The three token layouts treated as a label date. The examples are taken
# from this notebook's own output.
digits_month_digits = [digits, punct, month, punct, digits]   # "12 May 2009"
month_digits_digits = [month, punct, digits, punct, digits]   # "May 15, 1977"
digits_sep_digits = [digits, sep, digits, sep, digits]        # "10-26-74"

date_patterns = [
    digits_month_digits,
    month_digits_digits,
    digits_sep_digits,
]

# Register every layout under one match key. The second positional argument
# is passed as None (no on-match callback in this spaCy API version --
# NOTE(review): confirm against the installed spaCy release).
matcher.add('LABEL_DATE', None, *date_patterns)

In [8]:
# Run the date matcher over every label and report the results:
#   - one line per match:  "<subject_id> <predicted_category> <matched text>"
#   - or, when nothing matched, a single "<subject_id> <predicted_category>".
#
# The id/category columns are zipped with the parsed docs so rows are paired
# by position. This avoids using a counter as an index label with
# `df.loc[i, ...]`, which is only correct on a default RangeIndex.
label_rows = zip(df[SUB_ID], df[PRED_CAT], nlp.pipe(texts))

for sub_id, pred_cat, doc in label_rows:
    matches = matcher(doc)
    if not matches:
        print(f'{sub_id} {pred_cat}')
        continue
    # Each match is (match_id, start, end); only the token span is needed.
    for _match_id, start, end in matches:
        span = doc[start:end]
        print(f'{sub_id} {pred_cat} {span.text}')


2995202 typewritten 12 May 2009
2995203 typewritten 30 April 2004
2995205 handwritten
2995213 typewritten 10-26-74
2995216 handwritten
2995228 handwritten
2995232 typewritten
2995235 handwritten
2995238 typewritten 3 June 2005
2995242 typewritten 4 August 2002
2995248 typewritten 2 June 2005
2995254 typewritten
2995261 handwritten
2995267 handwritten
2995291 typewritten May 15, 1977
2995295 typewritten September 13, 1978
2995308 typewritten 09 May 2002
2995309 handwritten
2995319 typewritten May 27, 1973
2995329 typewritten May 24, 1974
2995342 typewritten May 30, 1925
2995347 typewritten 7 Oct 1988
2995353 handwritten
2995362 handwritten
2995363 typewritten
2995365 handwritten
2995370 handwritten
2995374 handwritten
2995378 typewritten
2995386 handwritten
2995389 typewritten 6-19-1973
2995407 typewritten
2995416 typewritten 08 May 2002
2995417 typewritten
2995419 typewritten 29 June 2004
2995423 typewritten April 12, 1969
2995432 typewritten
2995434 typewritten 23 March 2001
2995438 t

11782380 typewritten
11782382 typewritten
11782387 typewritten
11782394 typewritten
11782398 typewritten
11782403 handwritten
11782410 typewritten
11782411 handwritten
11782416 typewritten
11782424 typewritten
11782433 typewritten
11782436 typewritten 14 June 2005
11782437 typewritten
11782445 typewritten September 27, 1994
11782451 typewritten 18 October 2002
11782452 handwritten
11782458 typewritten 10 August 2004
11782464 typewritten
11782481 handwritten
11782482 typewritten 10 August 2004
11782488 typewritten August 10, 1989
11782490 typewritten 2 June 2003
11782495 handwritten
11782507 typewritten June 19, 1980
11782508 handwritten
11782512 handwritten
11782515 handwritten
11782523 typewritten
11782526 typewritten 10 August 2004
11782547 typewritten 7 September 2002
11782548 typewritten 16 September 2003
11782551 typewritten 20 October 2002
11782552 typewritten 10 August 2004
11782559 typewritten August 10, 1989
11782568 typewritten
11782570 typewritten June 29, 1985
11782571 type

In [9]:
# EntityRuler rules: three token patterns, each tagged with the LABEL_DATE
# entity label.
# Note: a variant that prefixed every pattern with the optional `label` and
# `punct` tokens (to absorb a leading "date" word) was tried and is not used.
date_rules = [
    {'label': 'LABEL_DATE', 'pattern': [digits, punct, month, punct, digits]},
    {'label': 'LABEL_DATE', 'pattern': [month, punct, digits, punct, digits]},
    {'label': 'LABEL_DATE', 'pattern': [digits, sep, digits, sep, digits]},
]
ruler.add_patterns(date_rules)

# Make the cell safe to re-run: adding a component whose name is already in
# the pipeline raises, so drop any previously added ruler first.
if 'entity_ruler' in nlp.pipe_names:
    nlp.remove_pipe('entity_ruler')

# Insert the ruler ahead of the statistical NER component.
nlp.add_pipe(ruler, before='ner')

In [10]:
# Re-run the pipeline (now including the entity ruler) and print the text of
# every entity labelled LABEL_DATE, one per line.
for doc in nlp.pipe(texts):
    label_dates = (ent.text for ent in doc.ents if ent.label_ == 'LABEL_DATE')
    for date_text in label_dates:
        print(date_text)

12 May 2009
30 April 2004
10-26-74
3 June 2005
4 August 2002
2 June 2005
May 15, 1977
September 13, 1978
09 May 2002
May 27, 1973
May 24, 1974
May 30, 1925
7 Oct 1988
6-19-1973
08 May 2002
29 June 2004
April 12, 1969
23 March 2001
6 April 2002
7 June 2002
5 October 2002
9 October 2005
1 June 2005
April 11, 1975
10 April 2005
15 July 2004
4-15-39
6 August 2003
14 September 2004
May 1, 1985
May 28, 1956
3 June 2003
17 OCT 1989
14 April 1979
May 13, 1970
4-25-73
June 15, 1965
3-30-79
7-21-1985
7 March 1985
8 September 2011
4 September 2008
Aug 30, 1996
28 August 2009
8 July 2009
Sept, 23, 1966
19 July 2010
9 August 1979
6-28-77
April 14, 1985
3 September 2006
17 September 2008
1-4-1987
4 September 2005
9 October 1976
16 June 2010
15 July 2008
21 May 1968
8 July 1987
15 July 2005
July 16, 1966
10-6-67
16 June 2005
9-29-68
20 August 2008
10 September 2005
March 20; 1960
20 July 1983
April 5, 1979
9-23-77
5 Sept 70
3 March 1970
24 April 2008
9-27-77
26 Sept 71
October 18, 1969
May 12, 1967
2