In [1]:
import spacy
from spacy.matcher import Matcher

In [2]:
# _load english language model from spacy library
nlp = spacy.load('en_core_web_sm')
nlp

<spacy.lang.en.English at 0x7f8b6e6b9d10>

In [3]:
plain_txt = """It is a stick. With the fall of the Western Roman Empire, the city was conquered by the Franks and became a royal seat. After the division of Charlemagne's empire, it was included in the lands of his son Louis the Pious (814). In 837, it was assigned to Charles the Bald, and a few years later it was here that Carolingian heirs discussed what was to become the Treaty of Verdun (843), by which the city became part of Lotharingia under Lothair I. In 860 and 922, Koblenz was the scene of ecclesiastical synods. At the first synod, held in the Liebfrauenkirche, the reconciliation of Louis the German with his half-brother Charles the Bald took place. The city was sacked and destroyed by the Norsemen in 882. In the second, slavery was condemned, specifically it was decreed that any man that 'led away a Christian man and then sold him' should be considered guilty of homicide. In 925, it became part of the eastern German Kingdom, later the Holy Roman Empire."""
sample_doc = nlp(plain_txt)  # _tokenization of text

for token in sample_doc[:20]:
    print(f'{token}  {token.pos_}  {token.tag_}') # _parts of speech tagging information

It  PRON  PRP
is  AUX  VBZ
a  DET  DT
stick  NOUN  NN
.  PUNCT  .
With  ADP  IN
the  DET  DT
fall  NOUN  NN
of  ADP  IN
the  DET  DT
Western  PROPN  NNP
Roman  PROPN  NNP
Empire  PROPN  NNP
,  PUNCT  ,
the  DET  DT
city  NOUN  NN
was  AUX  VBD
conquered  VERB  VBN
by  ADP  IN
the  DET  DT


In [4]:
text_2 = 'they are running around the bushes'
doc_2 = nlp(text_2)

for token in doc_2:
    print(f'{token}  {token.lemma_}')

they  they
are  be
running  run
around  around
the  the
bushes  bush


In [5]:
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x7f8b6e723b90>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x7f8b6e736230>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x7f8b6e6968a0>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x7f8b6e753f50>),
 ('lemmatizer',
  <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x7f8b48600550>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x7f8b6e696980>)]

In [12]:
matcher = Matcher(vocab=nlp.vocab)

# _match the words which are pronouns and verbs
match_pattern = [{'POS':'PRON'}, {'POS':'VERB'}]

matcher.add('pronoun+verb', patterns=[match_pattern])

matcher_out = matcher(sample_doc, as_spans=True)
matcher_out

[it was, it became]

In [13]:
pattern_2 = [{'POS':'PRON'}, {'POS':'PRON', 'OP':'+'}, {'POS':'VERB'}]
matcher.add('pronoun+aux+verb', patterns=[pattern_2])

matcher_out = matcher(sample_doc, as_spans=True)
matcher_out

[it was, it became]