* simple Named Entity Recognition (NER) model with VAR and TYPE tags, built with spaCy
* training data: TeX files from the Stacks Project, annotated using a "Let ... be a ..." rule
* inspired by https://github.com/explosion/spaCy/blob/master/examples/training/train_ner.py

In [3]:
from __future__ import unicode_literals, print_function
import json
import pathlib
import random

import spacy
from spacy.pipeline import EntityRecognizer
from spacy.gold import GoldParse
from spacy.tagger import Tagger

import os
import re
 
# Python 2/3 compatibility shim: Python 3 has no ``unicode`` builtin, so the
# lookup raises NameError and the name is aliased to ``str``. Only NameError is
# caught so unrelated errors (e.g. KeyboardInterrupt) are never swallowed.
try:
    unicode
except NameError:
    unicode = str

In [4]:
# Load the spaCy pipeline registered under the 'en' name. The resulting `nlp`
# object is shared by the cells below (full pipeline call, `make_doc`,
# `tagger`, and `vocab` access).
nlp = spacy.load('en')
# Alternative load kept for reference; presumably skips the parser, the stock
# entity recognizer and the word vectors — confirm against the installed spaCy version.
#nlp = spacy.load('en', parser=False, entity=False, add_vectors=False)

In [5]:
def tex2doc(tex_file, encoding='utf-8'):
    """Read a whole TeX file and run it through the module-level ``nlp`` pipeline.

    Args:
        tex_file: path of the TeX file to read.
        encoding: text encoding used to decode the file. Passed explicitly so
            the result does not depend on the platform's locale default.
            NOTE(review): assumes the corpus is UTF-8 — confirm.

    Returns:
        The object produced by calling ``nlp`` on the file's full contents.
    """
    with open(tex_file, 'r', encoding=encoding) as tex:
        data = tex.read()
    return nlp(data)

In [6]:
# "Let $<var>$ be a/an <type>" rule, matched case-insensitively.
# Group 1 is the math between the dollar signs (one to four space-separated
# chunks); group 3 is the whitespace-free word following the article.
# Written as a raw string so ``\$`` and ``\S`` are not invalid string escapes.
_LET_BE_PATTERN = re.compile(
    r'let \$(\S+( \S+){0,3})\$ be an? (\S+)', re.IGNORECASE)

# Punctuation that ``\S+`` may glue onto the end of the TYPE word ("scheme.").
_TRAILING_PUNCTUATION = '.,;:!?'


def rule_based_annotation(doc):
    """Annotate ``doc.text`` with VAR and TYPE spans using the "Let ... be a ..." rule.

    Args:
        doc: any object exposing its raw text as ``doc.text``.

    Returns:
        A ``(text, annotation)`` tuple. ``annotation`` is a list of
        ``(start_char, end_char, label)`` tuples (end exclusive) holding, for
        every match in order, a 'VAR' span followed by a 'TYPE' span.
    """
    text = doc.text
    annotation = []
    for match in _LET_BE_PATTERN.finditer(text):
        var_start, var_end = match.span(1)
        type_start, type_end = match.span(3)

        # Sentence punctuation directly after the type word would otherwise be
        # labelled as part of the TYPE entity. Trim it, unless that would leave
        # an empty span.
        type_word = text[type_start:type_end].rstrip(_TRAILING_PUNCTUATION)
        if type_word:
            type_end = type_start + len(type_word)

        annotation.append((var_start, var_end, 'VAR'))
        annotation.append((type_start, type_end, 'TYPE'))
    return (text, annotation)

In [23]:
# Build one (text, annotation) example per TeX file found in tex_files/.
annotated_data = []

directory = os.fsencode('tex_files/')
# Decode the directory name once instead of on every loop iteration.
tex_dir = os.fsdecode(directory)
for filename in os.listdir(tex_dir):
    tex_path = os.path.join(tex_dir, filename)
    # Skip sub-directories and stray non-TeX entries (editor backups, OS
    # metadata files, ...) so they neither crash the read nor end up in the
    # training data.
    if not (filename.endswith('.tex') and os.path.isfile(tex_path)):
        continue
    print("file: ", filename)
    doc = tex2doc(tex_path)
    annotated_data.append(rule_based_annotation(doc))
    
    

file:  intersection.tex
file:  spaces-simplicial.tex
file:  stacks-sheaves.tex
file:  cotangent.tex
file:  stacks-more-morphisms.tex
file:  formal-defos.tex
file:  spaces-more-cohomology.tex
file:  divisors.tex
file:  more-morphisms.tex


In [29]:
# Sanity check: number of annotated documents, then the first
# (text, annotation) pair in full.
print(len(annotated_data))
print(annotated_data[0])

9


In [38]:
# Randomize document order in place, then split: everything except the last
# document is used for training and the last TeX file is held out for testing.
random.shuffle(annotated_data)
train_data, test_data = annotated_data[:-1], annotated_data[-1:]

In [35]:
def train_ner(nlp, train_data, entity_types, n_iter=5):
    """Train a fresh ``EntityRecognizer`` on character-offset annotations.

    Args:
        nlp: loaded spaCy pipeline; its ``make_doc`` and ``vocab`` are used.
        train_data: iterable of ``(raw_text, entity_offsets)`` pairs, where
            ``entity_offsets`` is what ``GoldParse(entities=...)`` accepts.
            It is copied internally and never reordered.
        entity_types: entity labels passed to ``EntityRecognizer``.
        n_iter: number of passes over the training data (default 5).

    Returns:
        The ``EntityRecognizer`` after ``n_iter`` passes of updates.
    """
    # Work on a private copy: shuffling the caller's list in place would
    # silently reorder their data between calls.
    examples = list(train_data)

    # Add new words to vocab: looking up each token's orth id in the vocab
    # is done purely for its side effect on ``nlp.vocab``.
    for raw_text, _ in examples:
        for word in nlp.make_doc(raw_text):
            nlp.vocab[word.orth]

    # Train NER.
    ner = EntityRecognizer(nlp.vocab, entity_types=entity_types)
    for _ in range(n_iter):
        random.shuffle(examples)
        for raw_text, entity_offsets in examples:
            doc = nlp.make_doc(raw_text)
            gold = GoldParse(doc, entities=entity_offsets)
            ner.update(doc, gold)
    # NOTE(review): the upstream spaCy example this is adapted from may also
    # tag each doc before updating and finalize the model after training —
    # confirm against the example for the installed spaCy version.
    return ner

In [36]:
# Train the recognizer on the training split with the two labels that the
# rule-based annotator emits.
ner = train_ner(nlp, train_data, ['VAR', 'TYPE'])

In [37]:
# First test on a simple sentence: tokenize it, run the tagger and then the
# trained NER over the doc, and print every token next to its predicted
# entity type (an empty string when no entity was assigned).
doc = nlp.make_doc('Let $S$ be a scheme.')  
nlp.tagger(doc)
ner(doc)
for word in doc:
    print(word.text, word.ent_type_)

Let 
$ 
S$ 
be 
a 
scheme TYPE
. TYPE


In [47]:
# Then test on the held-out TeX file (the text of the single test example).
# No 'VAR' tags are detected — maybe something to do with how dollar signs
# are tokenized...
doc = nlp.make_doc(test_data[0][0])  
nlp.tagger(doc)
ner(doc)
for word in doc:
    print(word.text, "\t" + word.ent_type_)

\input{preamble 	
} 	


 	
% 	
OK 	
, 	
start 	
here 	
. 	

 	
% 	

 	
\begin{document 	
} 	


 	
\title{More 	
on 	
Cohomology 	
of 	
Spaces 	
} 	


 	
\maketitle 	


 	
\phantomsection 	

 	
\label{section 	
- 	
phantom 	
} 	


 	
\tableofcontents 	





 	
\section{Introduction 	
} 	

 	
\label{section 	
- 	
introduction 	
} 	


 	
\noindent 	

 	
In 	
this 	
chapter 	
continues 	
the 	
discussion 	
started 	
in 	

 	
Cohomology 	
of 	
Spaces 	
, 	
Section 	
\ref{spaces 	
- 	
cohomology 	
- 	
section 	
- 	
introduction}. 	

 	
One 	
can 	
also 	
view 	
this 	
chapter 	
as 	
the 	
analogue 	
for 	
algebraic 	
spaces 	

 	
of 	
the 	
chapter 	
on 	
\'etale 	
cohomology 	
for 	
schemes 	
, 	
see 	

 	
\'Etale 	
Cohomology 	
, 	
Section 	
\ref{etale 	
- 	
cohomology 	
- 	
section 	
- 	
introduction}. 	


 	
\medskip\noindent 	

 	
In 	
fact 	
, 	
we 	
intend 	
this 	
chapter 	
to 	
be 	
mainly 	
a 	
translation 	
of 	
the 	

 	
results 	
already 	
proved 	
for 	
schemes 	
into 	
the 	
l