In [1]:
import pandas as pd
import regex
import spacy
from spacy.matcher import Matcher, PhraseMatcher
from spacy.pipeline import EntityRuler

In [26]:
# --- Input files -----------------------------------------------------------
TRAIN_CSV = 'data/label_babel/label_babel_train_2020-01-02.csv'
VALID_CSV = 'data/label_babel/label_babel_valid_2020-01-02.csv'

# The file this notebook actually reads; point it at TRAIN_CSV to switch.
INPUT_CSV = VALID_CSV

# --- Column names used from the input file ---------------------------------
INPUT_COL = 'predicted_text'      # text that is fed to the spaCy pipeline
PRED_CAT = 'predicted_category'   # printed next to each subject when debugging
SUB_ID = 'subject_id'             # used to pick out individual rows

In [3]:
# Read every column as a string; na_filter=False keeps empty cells as empty
# strings instead of converting them to NaN.
df = pd.read_csv(INPUT_CSV, dtype=str, na_filter=False)

# One text per row, in row order, ready to be streamed through nlp.pipe().
texts = df[INPUT_COL].tolist()

# Ask for the GPU before the model is loaded, then load the small English model.
spacy.prefer_gpu()
nlp = spacy.load('en_core_web_sm')

In [4]:
# Inspect how the first text is tokenized: one line per token with its
# part-of-speech tag, lemma and raw text (tab separated).
#
# Only the first text is handed to the pipeline, so no further documents are
# parsed just to be thrown away. An empty `texts` list prints nothing.
for doc in nlp.pipe(texts[:1]):
    for token in doc:
        print(f'{token.pos_}\t{token.lemma_}\t{token.text}')

NOUN	flora	FLORA
ADP	of	OF
PROPN	ARK	ARK
PUNCT	:	:
SPACE	

	


CCONJ	cornaceae	CORNACEAE
PROPN	Cornus	Cornus
VERB	drummondii	drummondii
PROPN	C	C
NOUN	a.	A.
PROPN	Mey	Mey
PROPN	.	.
PUNCT	,	,
SPACE	
	

ADJ	rough	rough
PUNCT	-	-
VERB	leave	leaved
NOUN	dogwood	dogwood
PUNCT	.	.
SPACE	

	


PROPN	YELL	YELL
PROPN	COUNTY	COUNTY
PUNCT	:	:
PROPN	Casa	Casa
PROPN	AR	AR
NUM	7.5	7.5
PUNCT	'	'
PROPN	Topo	Topo
PUNCT	.	.
ADJ	quad	Quad
PUNCT	.	.
SPACE	
	

ADP	along	Along
PROPN	Mill	Mill
PROPN	Creek	Creek
PUNCT	,	,
ADV	just	just
ADV	north	north
ADP	of	of
DET	the	the
NOUN	intersection	intersection
ADP	of	of
SPACE	
	

PROPN	Hwys	Hwys
PUNCT	.	.
NUM	154	154
CCONJ	and	and
NUM	155	155
PUNCT	,	,
ADV	just	just
NOUN	west	west
ADP	of	of
PROPN	New	New
ADV	neely	Neely
PUNCT	,	,
ADJ	approx	approx
PUNCT	.	.
SPACE	
	

NUM	7.5	7.5
NOUN	mile	miles
ADV	south	south
ADP	of	of
PROPN	AR	AR
PROPN	Hwy	Hwy
PUNCT	.	.
NUM	7	7
PUNCT	;	;
VERB	locate	located
ADP	in	in
NOUN	creekside	creekside
SPACE	
	

NOUN	thicket	thicket
PUNCT	;	

In [19]:
# Two rule engines are built on the loaded pipeline, both with pattern
# validation switched on:
#   ruler   - added to the pipeline further down so its hits show up in doc.ents
#   matcher - called directly on a doc for ad-hoc debugging
ruler = EntityRuler(nlp, validate=True)
matcher = Matcher(nlp.vocab, validate=True)

In [20]:
# Building blocks for the date token patterns. Each dict describes one token.

# Alternation of full and abbreviated month names; "sept?" accepts both
# "sep" and "sept". The names are lower case because they are compared
# against the token's LOWER attribute.
month_names = ' | '.join("""
    january february march april may june july
    august september october november december
    jan feb mar apr jun jul aug sept? oct
    nov dec""".split())

# (?x) turns on verbose mode, so the spaces inside the regexes are ignored.
month = {'LOWER': {'REGEX': f'(?x) ^ (?: {month_names} ) $ '}}

# Optional leading "date" token (only referenced by the disabled rules below).
label = {'LOWER': 'date', 'OP': '?'}

# Optional single punctuation token between date parts.
punct = {'IS_PUNCT': True, 'OP': '?'}

# A token made up solely of slash, underscore or hyphen characters.
sep = {'TEXT': {'REGEX': r'(?x) ^ [/_-]+ $'}}

# A four-digit number starting with 1 or 2, or a one- or two-digit number.
# Raw string: "\d" in a normal string literal is an invalid escape sequence
# that Python warns about.
digits = {'TEXT': {'REGEX': r'(?x) ^ ( [12]\d{3} | \d{1,2} ) $'}}

In [21]:
# Three token layouts are recognised as a date; `punct` tokens are optional.
digits_month_digits = [digits, punct, month, punct, digits]   # e.g. "12 May 2009"
month_digits_digits = [month, punct, digits, punct, digits]   # e.g. "May 15, 1977"
all_numeric = [digits, sep, digits, sep, digits]              # e.g. "10-26-74"

date_patterns = [digits_month_digits, month_digits_digits, all_numeric]

# All layouts are registered under one key; None means no on-match callback.
matcher.add('LABEL_DATE', None, *date_patterns)

In [30]:
# Debug view for one subject: print each token with its part-of-speech tag,
# then every span the matcher found, or just the subject line when there
# were no matches.
target_sub_id = '2995254'

# Select the rows first so that only the chosen text goes through the spaCy
# pipeline instead of every text in the file.
target_rows = [row for row in range(len(texts)) if df.loc[row, SUB_ID] == target_sub_id]
target_texts = [texts[row] for row in target_rows]

for i, doc in zip(target_rows, nlp.pipe(target_texts)):
    sub_id = df.loc[i, SUB_ID]
    pred_cat = df.loc[i, PRED_CAT]
    for token in doc:
        print(token.text, token.pos_)
    matches = matcher(doc)
    if matches:
        # Each match is (match_id, start, end); only the token span is needed.
        for _match_id, start, end in matches:
            span = doc[start:end]
            print(f'{sub_id} {pred_cat} {span.text}')
    else:
        print(f'{sub_id} {pred_cat}')


ee ADP

 SPACE
PLANTS NOUN
OF ADP
SQUXH PROPN
ARKANSAS PROPN

 SPACE
CORNACEAE PROPN


 SPACE
Family NOUN
. PUNCT

 SPACE
Genus_COrnus ADV


 SPACE
Species_PUrpusi X
Koehne PROPN

 SPACE
Common ADJ
or CCONJ
Local PROPN
Names PROPN

 SPACE
Narrow PROPN
- PUNCT
leaf NOUN
Doqwood PROPN


 SPACE
Locality NOUN
Gunner PROPN
Pool PROPN
Stone PROPN


 SPACE
MaiorxCounty PROPN

 SPACE
HabitarBanks PROPN
of ADP
No DET
. PUNCT
Sylamore PROPN
Creek PROPN
. PUNCT


 SPACE
Collected VERB
by ADP
D PROPN
, PUNCT
M PROPN
, PUNCT
MOORE PROPN
— PUNCT
Date PROPN
AUG-_11,1968 PROPN


 SPACE
and CCONJ
/ SYM
or_C+K.Moore PROPN
No_68309 PROPN
2995254 typewritten


In [10]:
# Entity-ruler rules for dates. The variants that start with the optional
# "date" label token are currently disabled.
date_rules = [
#     {'label': 'LABEL_DATE', 'pattern': [label, punct, digits, punct, month, punct, digits]},
#     {'label': 'LABEL_DATE', 'pattern': [label, punct, month, punct, digits, punct, digits]},
#     {'label': 'LABEL_DATE', 'pattern': [label, punct, digits, sep, digits, sep, digits]},
    {'label': 'LABEL_DATE', 'pattern': [digits, punct, month, punct, digits]},
    {'label': 'LABEL_DATE', 'pattern': [month, punct, digits, punct, digits]},
    {'label': 'LABEL_DATE', 'pattern': [digits, sep, digits, sep, digits]},
]
ruler.add_patterns(date_rules)

# Make the cell safe to re-run: drop a previously registered ruler before
# adding this one, otherwise add_pipe fails on the duplicate component name.
if 'entity_ruler' in nlp.pipe_names:
    nlp.remove_pipe('entity_ruler')

# Placed before the statistical NER component in the pipeline.
nlp.add_pipe(ruler, before='ner')

In [16]:
# Run every text through the pipeline and print the text of each entity that
# is labelled LABEL_DATE, one per line.
for parsed in nlp.pipe(texts):
    label_dates = (ent for ent in parsed.ents if ent.label_ == 'LABEL_DATE')
    for ent in label_dates:
        print(f'{ent.text}')

12 May 2009
30 April 2004
10-26-74
3 June 2005
4 August 2002
2 June 2005
May 15, 1977
September 13, 1978
09 May 2002
May 27, 1973
May 24, 1974
May 30, 1925
7 Oct 1988
6-19-1973
08 May 2002
29 June 2004
April 12, 1969
23 March 2001
6 April 2002
7 June 2002
5 October 2002
9 October 2005
1 June 2005
April 11, 1975
10 April 2005
15 July 2004
4-15-39
6 August 2003
14 September 2004
May 1, 1985
May 28, 1956
3 June 2003
17 OCT 1989
14 April 1979
May 13, 1970
4-25-73
June 15, 1965
3-30-79
7-21-1985
7 March 1985
8 September 2011
4 September 2008
Aug 30, 1996
28 August 2009
8 July 2009
Sept, 23, 1966
19 July 2010
9 August 1979
6-28-77
April 14, 1985
3 September 2006
17 September 2008
1-4-1987
4 September 2005
9 October 1976
16 June 2010
15 July 2008
21 May 1968
8 July 1987
15 July 2005
July 16, 1966
10-6-67
16 June 2005
9-29-68
20 August 2008
10 September 2005
March 20; 1960
20 July 1983
April 5, 1979
9-23-77
5 Sept 70
3 March 1970
24 April 2008
9-27-77
26 Sept 71
October 18, 1969
May 12, 1967
2