* simple Named Entity Reongition model with VAR and TYPE tags using spaCy
* training data: tex files from the Stacks Project annotated using a "Let ... be a ..." rule
* inspired by https://github.com/explosion/spaCy/blob/master/examples/training/train_ner.py
* use regular expression + noun chunk   rule

In [181]:
from __future__ import unicode_literals, print_function
import json
import pathlib
import random

import spacy
from spacy.pipeline import EntityRecognizer
from spacy.gold import GoldParse
from spacy.tagger import Tagger
from spacy.matcher import Matcher
from spacy.attrs import IS_PUNCT, LOWER, TAG, ORTH

import os
import re
 
try:
    unicode
except:
    unicode = str

In [182]:
nlp = spacy.load('en')

In [183]:
#initialize the triggering rule 'let $' for the rule-based annotation
matcher = Matcher(nlp.vocab)
matcher.add_pattern("let", [{LOWER: "let"}, {ORTH: "$"}])

In [184]:
def tex2doc(tex_file):  #read the whole tex file in the spaCy doc object
    with open(tex_file, 'r') as tex:
        data=tex.read()
    doc = nlp(data)
    return doc

In [185]:
def rule_based_annotation(doc):
    annotation = []
    for m in matcher(doc):
        let_position = m[2]
        dollar_position = m[3]
        annotation += add_letDollarBe_entity(doc, let_position, dollar_position)
    return (doc.text, annotation)


def add_letDollarBe_entity(doc, let_position, dollar_position):
    new_annotation=[]
    sentence = doc[let_position : dollar_position].sent   #stay in the current sentence, so to respect sentence boundaries
    print('===== sent', sentence, '========\n')
    
    part = doc[let_position : sentence.end]  #forget the already-processed first part of the sentence
    
    for regex_match in re.finditer('^let (\$+[^\$]+\$+) be (an?|the) \S+', part.text, re.IGNORECASE): #there is at most one match in this for loop because of ^
        #compute left and right char offset for candidate VAR
        left_offset = part.start_char + regex_match.span(1)[0]
        right_offset = part.start_char + regex_match.span(1)[1]
        
        print('regex_match', regex_match.group(1))
        for nn in part.noun_chunks:
            print('nn candidate', nn)
            if nn.start_char >= right_offset + 3: #we only consider noun chunks after the '$ be '
                
                
                
                new_annotation += ((left_offset, right_offset, 'VAR'),)
                
                #left and right char offset for TYPE are already among the nn attributes
                new_annotation += ((nn.start_char, nn.end_char, 'TYPE'),)
                print('nn', nn.text)
                break  #we only consider the 1st noun chunk after the 'be' 
                
    
    print('new_annotation', new_annotation)
                    
    return new_annotation

In [244]:
annotated_data=[]

directory = os.fsencode('tex_files/')
list_of_texs = [os.fsdecode(file) for file in os.listdir(directory)[0:10]]  #remove [0:10] to get the full data
for filename in list_of_texs:
    print("file: ", filename)
    doc = tex2doc(os.path.join(os.fsdecode(directory), filename))
    annotated_data.append(rule_based_annotation(doc))    

file:  pione.tex
===== sent Before we state the result we introduce the category of $G$-sets for a
topological group $G$.

\begin{definition}
\label{definition-G-set-continuous}
Let $G$ be a topological group.

regex_match $G$
nn candidate a topological group
nn a topological group
new_annotation [(794, 797, 'VAR'), (801, 820, 'TYPE')]
===== sent \medskip\noindent
Recall that if $L/K$ is an infinite Galois extension then the
Galois group $G = \text{Gal}(L/K)$ comes endowed with a canonical
topology, see Fields, Section \ref{fields-section-infinite-galois}.

\begin{lemma}
\label{lemma-sheaves-point}

regex_match $K$
nn candidate a field
nn a field
new_annotation [(1863, 1866, 'VAR'), (1870, 1877, 'TYPE')]
===== sent Let $K^{sep}$ a separable closure of $K$.
Consider the profinite group $G = \text{Gal}(K^{sep}/K)$.

new_annotation []
===== sent In this section we discuss some of the material the reader can
find in \cite[Expos\'e V, Sections 4, 5, and 6]{SGA1}.

\medskip\noindent
Let $F :

===== sent Let $B$ be the integral closure of $A$ in $L$.
Let $\mathfrak m$ be a maximal ideal of $B$.
Let $G = \text{Gal}(L/K)$,
$D = \{\sigma \in G \mid \sigma(\mathfrak m) = \mathfrak m\}$, and
$I = \{\sigma \in D \mid \sigma \bmod \mathfrak m =

regex_match $\mathfrak m$
nn candidate a maximal ideal
nn a maximal ideal
new_annotation [(122557, 122570, 'VAR'), (122574, 122589, 'TYPE')]
===== sent Let $B$ be the integral closure of $A$ in $L$.
Let $\mathfrak m$ be a maximal ideal of $B$.
Let $G = \text{Gal}(L/K)$,
$D = \{\sigma \in G \mid \sigma(\mathfrak m) = \mathfrak m\}$, and
$I = \{\sigma \in D \mid \sigma \bmod \mathfrak m =

new_annotation []
===== sent To construct the second short exact sequence let $\Lambda$ be the set
of finite Galois subextensions, i.e., $\lambda \in \Lambda$ corresponds
to $L/L_\lambda/K$. Set $G_\lambda = \text{Gal}(L_\lambda/K)$.
Recall that $G_\lambda$ is an inverse system of finite groups with surjective
transition maps and that $G = \lim_{\lambda \in

===== sent \medskip\noindent
To get rid of the ramification we are going to choose a further finite
separable extension $K^{sep}/L'/L/K$ such that the ramification
index $e$ of the induced extensions $B'/B$ is divisible by $e_i$.
Consider the normalized base change $Z'$ of $Z$ with respect to
$\Spec(B') \to \Spec(B)$, see discussion in
More on Morphisms, Section \ref{more-morphisms-section-reduced-fibre-theorem}.
Let $\xi_{i, j}$ be the points of $Z'$ mapping to $\xi_{B'}$

regex_match $\xi_{i, j}$
nn candidate the points
nn the points
new_annotation [(212135, 212147, 'VAR'), (212151, 212161, 'TYPE')]
===== sent \end{proof}

\noindent

regex_match $G$
nn candidate a profinite group
nn a profinite group
new_annotation [(213259, 213262, 'VAR'), (213266, 213283, 'TYPE')]
===== sent Let $p$ be a prime number.

regex_match $p$
nn candidate a prime number
nn a prime number
new_annotation [(213289, 213292, 'VAR'), (213296, 213310, 'TYPE')]
===== sent The {\it maximal prime-to-$p$ quotient} is

file:  spaces-simplicial.tex
===== sent \medskip\noindent

regex_match $X$
nn candidate a simplicial space
nn a simplicial space
new_annotation [(1804, 1807, 'VAR'), (1811, 1829, 'TYPE')]
===== sent \begin{lemma}
\label{lemma-simplicial-site}

regex_match $X$
nn candidate a simplicial space
nn a simplicial space
new_annotation [(2935, 2938, 'VAR'), (2942, 2960, 'TYPE')]

regex_match $X$
nn candidate a simplicial space
nn a simplicial space
new_annotation [(3068, 3071, 'VAR'), (3075, 3093, 'TYPE')]
===== sent Let $\mathcal{F}$ be a sheaf on $X_{Zar}$.
It is clear from the definition of coverings, that the restriction
of $\mathcal{F}$ to the opens of $X_n$ defines a sheaf $\mathcal{F}_n$
on the topological space $X_n$. For every $\varphi : [m] \to [n]$ the
restriction maps of $\mathcal{F}$ for pairs $U \subset X_n$, $V \subset X_m$
with $X(\varphi)(U) \subset V$, define an $X(\varphi)$-map
$\mathcal{F}(\varphi) : \mathcal{F}_m \to \mathcal{F}_n$, see
Sheaves, Definition \ref{sheaves-defi

nn candidate a hypercovering
nn a hypercovering
new_annotation [(176026, 176029, 'VAR'), (176033, 176048, 'TYPE')]
===== sent \begin{lemma}
\label{lemma-hypercovering-X-descent-sheaves}
Let $\mathcal{C}$ be a site with fibre products and $X \in \Ob(\mathcal{C})$.
Let $K$ be a hypercovering of $X$. Then
\begin{enumerate}
\item $a^{-1} : \Sh(\mathcal{C}/X) \to \Sh((\mathcal{C}/K)_{total})$
is fully faithful with essential image the cartesian sheaves of sets,
\item $a^{-1} : \textit{Ab}(\mathcal{C}/X) \to
\textit{Ab}((\mathcal{C}/K)_{total})$
is fully faithful with essential image the cartesian sheaves
of abelian groups.

regex_match $\mathcal{C}$
nn candidate a site
nn a site
new_annotation [(177356, 177369, 'VAR'), (177373, 177379, 'TYPE')]
===== sent \begin{lemma}
\label{lemma-hypercovering-X-descent-sheaves}
Let $\mathcal{C}$ be a site with fibre products and $X \in \Ob(\mathcal{C})$.
Let $K$ be a hypercovering of $X$. Then
\begin{enumerate}
\item $a^{-1} : \Sh(\mathcal{C}/X) \to \Sh(

regex_match $X$
nn candidate a simplicial algebraic space
nn a simplicial algebraic space
new_annotation [(257079, 257082, 'VAR'), (257086, 257114, 'TYPE')]
===== sent \to \Sh(Y_\etale)$.
Recall that $h_\etale^{-1}$ and $h_{\etale, *}$ have a simple
description in terms of the components, see
Lemma \ref{lemma-morphism-simplicial-sites}.
Let $\mathcal{O}_X$, resp.\ $\mathcal{O}_Y$ denote the structure
sheaf of $X$, resp.\ $Y$. We define
$h_\etale^\sharp : h_{\etale, *}\mathcal{O}_X \to \mathcal{O}_Y$
to be the map of sheaves of rings on $Y_\etale$ given by
$h_n^\sharp : h_{n, *}\mathcal{O}_{X_n} \to \mathcal{O}_{Y_n}$ on $Y_n$.
We obtain a morphism of ringed topoi
$$
h_\etale :
(\Sh(X_\etale), \mathcal{O}_X)
\longrightarrow
(\Sh(Y_\etale), \mathcal{O}_Y)
$$

\medskip\noindent
Let $X$ be a simplicial algebraic space with structure sheaf $\mathcal{O}$.
Let $X_{-1}$ be an algebraic space over $S$ and let $a_0 : X_0 \to X_{-1}$
be an augmentation of $X$. By
Lemma \ref{lemma-augmentation-sit

new_annotation [(299001, 299016, 'VAR'), (299020, 299032, 'TYPE')]
file:  stacks-sheaves.tex
===== sent \begin{definition}
\label{definition-presheaves}
Let $p : \mathcal{X} \to (\Sch/S)_{fppf}$ be a category fibred in
groupoids.

regex_match $p : \mathcal{X} \to (\Sch/S)_{fppf}$
nn candidate a category
nn a category
new_annotation [(3183, 3220, 'VAR'), (3224, 3234, 'TYPE')]
===== sent The category of {\it abelian presheaves}, i.e., presheaves of abelian
groups, is denoted $\textit{PAb}(\mathcal{X})$.

\medskip\noindent
Let $f : \mathcal{X} \to \mathcal{Y}$ be a $1$-morphism of categories
fibred in groupoids over $(\Sch/S)_{fppf}$. Recall that this
means just that $f$ is a functor over $(\Sch/S)_{fppf}$.
The material in
Sites, Section \ref{sites-section-more-functoriality-PSh}
provides us with a pair of adjoint functors\footnote{These functors
will be denoted $f^{-1}$ and $f_*$ after
Lemma \ref{lemma-functoriality-sheaves}
has been proved.}

regex_match $f : \mathcal{X} \to \mathcal{Y}

===== sent \section{The cotangent complex of a ring map}
\label{section-cotangent-ring-map}

\noindent

regex_match $A$
nn candidate a ring
nn a ring
new_annotation [(2845, 2848, 'VAR'), (2852, 2858, 'TYPE')]
===== sent Let $\textit{Alg}_A$ be the category of $A$-algebras.

regex_match $\textit{Alg}_A$
nn candidate the category
nn the category
new_annotation [(2864, 2880, 'VAR'), (2884, 2896, 'TYPE')]
===== sent Let $X_\bullet$ be the simplicial object of
$\text{Fun}(\textit{Alg}_A, \textit{Alg}_A)$ constructed in
Simplicial, Section \ref{simplicial-section-standard}.

\medskip\noindent
Consider an $A$-algebra $B$. Denote $P_\bullet = X_\bullet(B)$ the resulting

regex_match $X_\bullet$
nn candidate the simplicial object
nn the simplicial object
new_annotation [(3146, 3157, 'VAR'), (3161, 3182, 'TYPE')]
===== sent \begin{definition}
\label{definition-standard-resolution}

regex_match $A \to B$
nn candidate a ring map
nn a ring map
new_annotation [(3740, 3749, 'VAR'), (3753, 3763, 'TYPE

regex_match $\Lambda$
nn candidate a ring
nn a ring
new_annotation [(158234, 158243, 'VAR'), (158247, 158253, 'TYPE')]
===== sent Let $X$ be an algebraic space over $\Lambda$.
Let $\mathcal{C}_{X/\Lambda}$ be the category whose objects are
commutative diagrams
\begin{equation}
\label{equation-object-space}
\vcenter{
\xymatrix{
X \ar[d] & U \ar[l] \ar[d] \\

regex_match $X$
nn candidate an algebraic space
nn an algebraic space
new_annotation [(158259, 158262, 'VAR'), (158266, 158284, 'TYPE')]
===== sent Let $X$ be an algebraic space over $\Lambda$.
Let $\mathcal{C}_{X/\Lambda}$ be the category whose objects are
commutative diagrams
\begin{equation}
\label{equation-object-space}
\vcenter{
\xymatrix{
X \ar[d] & U \ar[l] \ar[d] \\

regex_match $\mathcal{C}_{X/\Lambda}$
nn candidate the category
nn the category
new_annotation [(158305, 158330, 'VAR'), (158334, 158346, 'TYPE')]
===== sent $U \to X$ of $X_\etale$
let $P_{\bullet, U}$ be the standard resolution of $\mathcal{O}_X(U)$

regex_mat

===== sent \medskip\noindent
Let $\Lambda$ be a complete Noetherian local ring with residue field $k$,
and let $\mathcal{C}_\Lambda$ denote the category of Artinian local
$\Lambda$-algebras with residue field $k$. Given a functor
$F : \mathcal{C}_\Lambda \to \textit{Sets}$ such that $F(k)$
is a one element set, Schlessinger's paper introduced conditions
(H1)-(H4) such that:
\begin{enumerate}
\item $F$ has a ``hull'' if and only if (H1)-(H3) hold.
\item $F$ is prorepresentable if and only (H1)-(H4) hold.

regex_match $\Lambda$
nn candidate a complete Noetherian local ring
nn a complete Noetherian local ring
new_annotation [(679, 688, 'VAR'), (692, 724, 'TYPE')]
===== sent \medskip\noindent
Let $\Lambda$ be a complete Noetherian local ring with residue field $k$,
and let $\mathcal{C}_\Lambda$ denote the category of Artinian local
$\Lambda$-algebras with residue field $k$. Given a functor
$F : \mathcal{C}_\Lambda \to \textit{Sets}$ such that $F(k)$
is a one element set, Schlessinger's pap

nn candidate a minimal object
nn a minimal object
new_annotation [(158809, 158819, 'VAR'), (158823, 158839, 'TYPE')]
===== sent \label{lemma-smallest-where-descends-versal}
Let $\mathcal{F}$ be a category cofibred in groupoids over

regex_match $\mathcal{F}$
nn candidate a category
nn a category
new_annotation [(159453, 159466, 'VAR'), (159470, 159480, 'TYPE')]
===== sent Let $\xi$ be a versal formal object

regex_match $\xi$
nn candidate a versal formal object
nn a versal formal object
new_annotation [(159550, 159555, 'VAR'), (159559, 159581, 'TYPE')]
===== sent \end{proof}

\begin{lemma}
\label{lemma-descends-versal}
Let $\mathcal{F}$ be a category cofibred in groupoids over

regex_match $\mathcal{F}$
nn candidate a category
nn a category
new_annotation [(162474, 162487, 'VAR'), (162491, 162501, 'TYPE')]
===== sent Let $\xi$ be a versal formal object
of $\mathcal{F}$ lying over $R$. Let $\xi' \to \xi$ be a morphism
of formal objects lying over $R' \subset R$ as constructed in
Lemma \

regex_match $\Lambda$
nn candidate a Noetherian ring
nn a Noetherian ring
new_annotation [(274161, 274170, 'VAR'), (274174, 274191, 'TYPE')]
===== sent Let $\Lambda$ be a Noetherian ring and

regex_match $\Lambda \to k$
nn candidate a finite ring map
nn a finite ring map
new_annotation [(274200, 274215, 'VAR'), (274219, 274236, 'TYPE')]
===== sent However, since in this section we will discuss what happen when we change
$k$ we will instead use the notation $\mathcal{C}_{\Lambda, k}$ to
indicate the dependence on $k$.

\begin{situation}
\label{situation-change-of-fields}
Let $\Lambda$ be a Noetherian ring and let $\Lambda \to k \to l$ be a finite

regex_match $\Lambda$
nn candidate a Noetherian ring
nn a Noetherian ring
new_annotation [(274700, 274709, 'VAR'), (274713, 274730, 'TYPE')]
===== sent However, since in this section we will discuss what happen when we change
$k$ we will instead use the notation $\mathcal{C}_{\Lambda, k}$ to
indicate the dependence on $k$.

\begin{situation}
\

===== sent A basic reference is \cite{EGA}.



\section{Associated points}
\label{section-associated}

\noindent
Let $R$ be a ring and let $M$ be an $R$-module.

regex_match $R$
nn candidate a ring
nn a ring
new_annotation [(401, 404, 'VAR'), (408, 414, 'TYPE')]
===== sent A basic reference is \cite{EGA}.



\section{Associated points}
\label{section-associated}

\noindent
Let $R$ be a ring and let $M$ be an $R$-module.

regex_match $M$
nn candidate $M$
nn candidate an $R$-module
nn an $R$-module
new_annotation [(423, 426, 'VAR'), (430, 443, 'TYPE')]
===== sent Here is the definition of associated points
for quasi-coherent sheaves on schemes
as given in \cite[IV Definition 3.1.1]{EGA}.

\begin{definition}
\label{definition-associated}
Let $X$ be a scheme.

regex_match $X$
nn candidate a scheme
nn a scheme
new_annotation [(828, 831, 'VAR'), (835, 843, 'TYPE')]
===== sent Let $\mathcal{F}$ be a quasi-coherent sheaf on $X$.
\begin{enumerate}
\item We say $x \in X$ is {\it associated} to $

regex_match $Z \subset S$
nn candidate $Z \subset
nn candidate the scheme theoretic support
nn the scheme theoretic support
new_annotation [(40337, 40350, 'VAR'), (40354, 40382, 'TYPE')]
===== sent \end{proof}

\begin{lemma}
\label{lemma-fitting-ideal-generate-locally}

regex_match $S$
nn candidate a scheme
nn a scheme
new_annotation [(41893, 41896, 'VAR'), (41900, 41908, 'TYPE')]
===== sent Let $\mathcal{F}$ be a finite type, quasi-coherent

regex_match $\mathcal{F}$
nn candidate a finite type
nn a finite type
new_annotation [(41914, 41927, 'VAR'), (41931, 41944, 'TYPE')]

new_annotation []
===== sent Then $\mathcal{F}$ can be
generated by $r$ elements in a neighbourhood of $s$ if and only
if $\text{Fit}_r(\mathcal{F})_s = \mathcal{O}_{S, s}$.
\end{lemma}

\begin{proof}
Follows immediately from
More on Algebra, Lemma \ref{more-algebra-lemma-fitting-ideal-generate-locally}.
\end{proof}

\begin{lemma}
\label{lemma-fitting-ideal-finite-locally-free}

regex_match $S$
nn candidate a scheme

nn a locally Noetherian scheme
new_annotation [(108157, 108160, 'VAR'), (108164, 108191, 'TYPE')]
===== sent Let $U \subset X$ be an open subscheme
such that the inclusion morphism $U \to X$ is affine.

regex_match $U \subset X$
nn candidate $
nn candidate an open subscheme
nn an open subscheme
new_annotation [(108197, 108210, 'VAR'), (108214, 108231, 'TYPE')]
===== sent Then $\eta \in \Spec(\mathcal{O}_{X, \xi})$ and
we see that the dimension cannot be $0$.
\end{proof}

\begin{lemma}
\label{lemma-complement-affine-open}

regex_match $X$
nn candidate a separated locally Noetherian scheme
nn a separated locally Noetherian scheme
new_annotation [(109565, 109568, 'VAR'), (109572, 109609, 'TYPE')]
===== sent Let $U \subset X$ be an

new_annotation []
===== sent \begin{lemma}
\label{lemma-complement-open-affine-effective-cartier-divisor}

regex_match $X$
nn candidate a Noetherian separated scheme
nn a Noetherian separated scheme
new_annotation [(110264, 110267, 'VAR'), (110271, 110300, 'TYP

new_annotation [(215560, 215573, 'VAR'), (215577, 215613, 'TYPE')]
===== sent Let $s$ be a regular meromorphic section of $\mathcal{L}$.
Let us denote $\mathcal{I} \subset \mathcal{O}_X$ the
sheaf of ideals defined by the rule
$$
\mathcal{I}(V)
=
\{f \in \mathcal{O}_X(V) \mid fs \in \mathcal{L}(V)\}.
$$
The formula makes sense since
$\mathcal{L}(V) \subset \mathcal{K}_X(\mathcal{L})(V)$.
Then $\mathcal{I}$ is a quasi-coherent sheaf of ideals and
we have injective maps
$$
1 : \mathcal{I} \longrightarrow \mathcal{O}_X,
\quad
s : \mathcal{I} \longrightarrow \mathcal{L}
$$
whose cokernels are supported on closed nowhere dense subsets of $X$.
\end{lemma}

\begin{proof}
The question is local on $X$.
Hence we may assume that $X = \Spec(A)$,

regex_match $s$
nn candidate a regular meromorphic section
nn a regular meromorphic section
new_annotation [(215619, 215622, 'VAR'), (215626, 215655, 'TYPE')]
===== sent \end{proof}

\begin{definition}
\label{definition-regular-meromorphic-ideal-denominat


regex_match $S$
nn candidate a scheme
nn a scheme
new_annotation [(303023, 303026, 'VAR'), (303030, 303038, 'TYPE')]
===== sent Let $Z \subset S$ be a closed subscheme.

regex_match $Z \subset S$
nn candidate $
nn candidate a closed subscheme
nn a closed subscheme
new_annotation [(303044, 303057, 'VAR'), (303061, 303079, 'TYPE')]
===== sent Let $b : S' \to S$ be the blowing up of $Z$ in $S$. Let
$g : X \to Y$ be an affine morphism of schemes over $S$.
Let $\mathcal{F}$ be a quasi-coherent sheaf on $X$.
Let $g' : X \times_S S' \to Y \times_S S'$ be the base change
of $g$. Let $\mathcal{F}'$ be the strict transform of $\mathcal{F}$
relative to $b$. Then $g'_*\mathcal{F}'$ is the strict transform
of $g_*\mathcal{F}$.
\end{lemma}

\begin{proof}

regex_match $b : S' \to S$
nn candidate S
nn candidate X \to
nn X \to
new_annotation [(303085, 303099, 'VAR'), (303142, 303147, 'TYPE')]
===== sent Let $b : S' \to S$ be the blowing up of $Z$ in $S$. Let
$g : X \to Y$ be an affine morphism of sche


regex_match $S$
nn candidate a scheme
nn a scheme
new_annotation [(1304, 1307, 'VAR'), (1311, 1319, 'TYPE')]
===== sent Let $i_X : X \to X'$ be a thickening.

regex_match $i_X : X \to X'$
nn candidate a thickening
nn a thickening
new_annotation [(1650, 1666, 'VAR'), (1670, 1682, 'TYPE')]
===== sent \begin{lemma}
\label{lemma-first-order-thickening}
Let $X$ be a scheme over a base $S$. Consider a short exact sequence
$$
0 \to \mathcal{I} \to \mathcal{A} \to \mathcal{O}_X \to 0
$$
of sheaves on $X$ where $\mathcal{A}$ is a sheaf of
$f^{-1}\mathcal{O}_S$-algebras,
$\mathcal{A} \to \mathcal{O}_X$ is a surjection
of sheaves of $f^{-1}\mathcal{O}_S$-algebras, and $\mathcal{I}$ is its kernel.

regex_match $X$
nn candidate a scheme
nn a scheme
new_annotation [(2797, 2800, 'VAR'), (2804, 2812, 'TYPE')]
===== sent Let $U = \Spec(B)$
be an affine open of $X$. Set $A = \Gamma(U, \mathcal{A})$. Note that
since $H^1(U, \mathcal{I}) = 0$ (see Cohomology of Schemes, Lemma
\ref{coherent-lemma-quasi-co

nn candidate $
nn candidate X \to
nn candidate a morphism
nn a morphism
new_annotation [(168836, 168849, 'VAR'), (168853, 168863, 'TYPE')]
===== sent Let $f : X \to Y$ be a morphism of schemes over $S$.
Let $\mathcal{F}$ be a quasi-coherent $\mathcal{O}_X$-module.

regex_match $\mathcal{F}$
nn candidate a quasi-coherent $\mathcal{O}_X$-module
nn a quasi-coherent $\mathcal{O}_X$-module
new_annotation [(168889, 168902, 'VAR'), (168906, 168945, 'TYPE')]
===== sent Y'_{s'} \ar[r] & Y_s
}
$$
the horizontal morphisms are flat as they are base changes by the flat
morphism $\Spec(\kappa(s')) \to \Spec(\kappa(s))$.
\end{proof}

\begin{lemma}
\label{lemma-base-change-flatness-fibres}

regex_match $S$
nn candidate a scheme
nn a scheme
new_annotation [(170848, 170851, 'VAR'), (170855, 170863, 'TYPE')]
===== sent Let $f : X \to Y$ be a morphism of schemes over $S$.
Assume
\begin{enumerate}
\item $X$ is locally of finite presentation over $S$,
\item $X$ is flat over $S$, and
\item $Y$ is locally of 


regex_match $f : X \to Y$
nn candidate X \to
nn candidate a morphism
nn a morphism
new_annotation [(256083, 256096, 'VAR'), (256100, 256110, 'TYPE')]
===== sent In particular this implies that $Y$ is integral, see
Properties, Lemma \ref{properties-lemma-characterize-integral}.
Let $X_\eta = X_{1, \eta} \cup \ldots \cup X_{n, \eta}$
be the decomposition of $X_\eta$ into irreducible components.

new_annotation []
===== sent Let $X_i \subset X$ be the reduced closed subscheme whose generic
fibre is $X_{i, \eta}$. Note that $Z_{i, j} = X_i \cap X_j$
is a closed subset of $X_i$ whose generic fibre $Z_{i, j, \eta}$

regex_match $X_i \subset X$
nn candidate the reduced closed subscheme
nn the reduced closed subscheme
new_annotation [(256725, 256740, 'VAR'), (256744, 256772, 'TYPE')]
===== sent \end{proof}

\begin{lemma}
\label{lemma-base-change-fibres-geometrically-irreducible}
Let $f : X \to Y$ be a morphism of schemes.
Let $g : Y' \to Y$ be any morphism, and denote
$f' : X' \to Y'$ the bas

regex_match $X \to S$
nn candidate $X
nn candidate a smooth morphism
nn a smooth morphism
new_annotation [(371915, 371924, 'VAR'), (371928, 371945, 'TYPE')]

new_annotation []
===== sent \begin{lemma}
\label{lemma-slice-smooth-given-element}
Let $f : X \to S$ be a morphism of schemes.

regex_match $f : X \to S$
nn candidate X \to
nn candidate a morphism
nn a morphism
new_annotation [(372959, 372972, 'VAR'), (372976, 372986, 'TYPE')]

regex_match $x \in X$
nn candidate a point
nn a point
new_annotation [(373003, 373012, 'VAR'), (373016, 373023, 'TYPE')]
===== sent S$.
Let $h \in \mathfrak m_x \subset \mathcal{O}_{X, x}$.
Assume
\begin{enumerate}
\item $f$ is smooth at $x$, and
\item the image $\text{d}\overline{h}$ of $\text{d}h$ in
$$
\Omega_{X_s/s, x} \otimes_{\mathcal{O}_{X_s, x}} \kappa(x) =
\Omega_{X/S, x} \otimes_{\mathcal{O}_{X, x}} \kappa(x)
$$
is nonzero.

new_annotation []
===== sent \end{proof}

\begin{lemma}
\label{lemma-slice-smooth-once}
Let $f : X \to S$ be a morphism of 

new_annotation [(506106, 506115, 'VAR'), (506119, 506126, 'TYPE')]
===== sent \end{proof}

\begin{theorem}[Stein factorization; general case]
\label{theorem-stein-factorization-general}
Let $S$ be a scheme.

regex_match $S$
nn candidate a scheme
nn a scheme
new_annotation [(507988, 507991, 'VAR'), (507995, 508003, 'TYPE')]
===== sent Let $f : X \to S$ be a proper morphism.

regex_match $f : X \to S$
nn candidate X \to
nn candidate a proper morphism
nn a proper morphism
new_annotation [(508009, 508022, 'VAR'), (508026, 508043, 'TYPE')]
===== sent Then $S' = \Spec(R')$.
Thus we may replace $S$ by $S'$ and assume that
$S = \Spec(R)$ is affine $R = \Gamma(X, \mathcal{O}_X)$.

regex_match $s \in S$
nn candidate a point
nn a point
new_annotation [(509334, 509343, 'VAR'), (509347, 509354, 'TYPE')]
===== sent Let $U \to S$ be an \'etale morphism
of affine schemes and let $u \in U$ be a point mapping to $s$.
Let $X_U \to U$ be the base change of $X$. By
Lemma \ref{lemma-characterize-geometrical


regex_match $S$
nn candidate a scheme
nn a scheme
new_annotation [(623660, 623663, 'VAR'), (623667, 623675, 'TYPE')]
===== sent Let $f : X \to Y$ be a morphism of schemes over $S$.
If $X$, $Y$ are weakly \'etale over $S$, then $f$ is weakly \'etale.

regex_match $f : X \to Y$
nn candidate $
nn candidate X \to
nn candidate a morphism
nn a morphism
new_annotation [(623681, 623694, 'VAR'), (623698, 623708, 'TYPE')]
===== sent Although the proof of the result is kind of
laborious, in essence it follows in a straightforward manner from
Epp's result on eliminating ramification, see
More on Algebra, Theorem \ref{more-algebra-theorem-epp}.

\medskip\noindent
Let $A$ be a Dedekind domain with fraction field $K$.
Let $X$ be a scheme flat and of finite type over $A$.

regex_match $A$
nn candidate a Dedekind domain
nn a Dedekind domain
new_annotation [(624878, 624881, 'VAR'), (624885, 624902, 'TYPE')]
===== sent Although the proof of the result is kind of
laborious, in essence it follows in a str

In [245]:
annotated_data_with_filenames = list(zip(annotated_data, list_of_texs))
random.shuffle(annotated_data_with_filenames)
train_data = [ann_data[0] for ann_data in annotated_data_with_filenames[:-1]]
test_data = [ann_data[0] for ann_data in annotated_data_with_filenames[-1:]] #we hold out one tex file for testing

In [246]:
def train_ner(nlp, train_data, entity_types):
    # Add new words to vocab.
    for raw_text, _ in train_data:
        doc = nlp.make_doc(raw_text)
        for word in doc:
            _ = nlp.vocab[word.orth]

    # Train NER.
    ner = EntityRecognizer(nlp.vocab, entity_types=entity_types)
    for itn in range(5):
        random.shuffle(train_data)
        for raw_text, entity_offsets in train_data:
            doc = nlp.make_doc(raw_text)
            gold = GoldParse(doc, entities=entity_offsets)
            ner.update(doc, gold)
    return ner

In [247]:
ner = train_ner(nlp, train_data, ['VAR', 'TYPE'])

In [249]:
#first test on a simple sentence
doc = nlp.make_doc('Let $S$ be a scheme and let $something here$ be a great thing you know.')  
nlp.tagger(doc)
ner(doc)

for ent in doc.ents:
    ent.merge()
    
for word in doc:
    if word.ent_type:
        print('(' + word.text + ':' + word.ent_type_ +')', end=word.whitespace_)
    else:
        print(word.text_with_ws, end='')
    


Let ($S$:VAR) be (a scheme:TYPE) and let ($something here$:VAR) be (a great thing:TYPE) you know.

In [250]:
#then test on the hold out tex file and save in a new tex file
doc = nlp.make_doc(annotated_data_with_filenames[-1][0][0])  
nlp.tagger(doc)
ner(doc)

for ent in doc.ents:
    ent.merge()

new_filename = 'a-' + annotated_data_with_filenames[-1][1] 
with open( 'annotated_tex_files/'+ new_filename, 'w') as f:
    for word in doc:
        if word.ent_type:
            f.write('(' + word.text + ':' + word.ent_type_ +')' + word.whitespace_)
        else:
            f.write(word.text_with_ws)