In [3]:
# Given two sentences that are known to be paraphrases,
# pick the phrases that are similar and contribute to their overall similarity
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span
from spacy.util import filter_spans                                                                                                                                                                                       

# Load the small English pipeline.
# NOTE(review): the similarity comparison at the bottom of this script relies on
# the vectors this pipeline provides. spaCy's documentation recommends a pipeline
# that ships static word vectors (e.g. en_core_web_md) for meaningful similarity
# scores -- confirm whether the small model is intended here.
nlp = spacy.load('en_core_web_sm')

matcher = Matcher(nlp.vocab)

# Common grammatical patterns in English. This set is incomplete and
# non-comprehensive, but is sufficient for this demonstration.

# A verb, any number of adverbs, optionally one token of any kind,
# optionally followed by another verb.
verb_pattern = [{"POS": "VERB"},
                {"POS": "ADV", "OP": "*"},
                {"OP": "?"},
                {"POS": "VERB", "OP": "?"}]

# Zero or more nouns, a negated VERB token ("!" operator), an optional
# adjective, then one or more nouns.
noun_pattern = [{"POS": "NOUN", "OP": "*"},
                {"POS": "VERB", "OP": "!"},
                {"POS": "ADJ", "OP": "?"},
                {"POS": "NOUN", "OP": "+"}]

# One or more title-cased tokens. IS_TITLE only looks at casing, so this can
# also pick up a capitalised sentence-initial word.
proper_noun_pattern = [{"IS_TITLE": True, "OP": "+"}]

# One or more digit tokens, any tokens in between, then one or more digit
# tokens (e.g. "40 to 55").
number_pattern = [{"IS_DIGIT": True, "OP": "+"},
                  {"OP": "*"},
                  {"IS_DIGIT": True, "OP": "+"}]

# Register the patterns with the matcher. The patterns are passed as a list in
# the second argument; the older `add(key, None, pattern)` call form is
# rejected by spaCy v3, while the list form is accepted by v2.2.2+ and v3.
matcher.add("PHRASE1", [verb_pattern])
matcher.add("PHRASE2", [noun_pattern])
matcher.add("PHRASE3", [proper_noun_pattern])
matcher.add("PHRASE4", [number_pattern])

# Two texts that are paraphrases of each other.
doc1 = nlp("Feelings about current business conditions improved substantially from the first quarter, jumping from 40 to 55.")
doc2 = nlp("Assessment of current business conditions improved substantially, the Conference Board said, jumping to 55 from 40 in the first quarter.")

# Find all pattern matches in the two texts independently. Each match is a
# (match_id, start, end) triple of token offsets into its doc.
matches1 = matcher(doc1)
matches2 = matcher(doc2)

# Build a Span for each match; the match id is not needed, only the offsets.
# The labels only record which text the span came from.
spans1 = [Span(doc1, start1, end1, label="Phrase1") for _, start1, end1 in matches1]
spans2 = [Span(doc2, start2, end2, label="Phrase2") for _, start2, end2 in matches2]

# The matcher reports every overlapping candidate; filter_spans drops
# duplicate or overlapping spans from each list, preferring the longest span.
spans1 = filter_spans(spans1)
spans2 = filter_spans(spans2)

# Minimum similarity score for two phrases to be reported as a pair.
threshold = 0.7

# Compare every phrase of the first text against every phrase of the second;
# when a pair is similar enough, print both phrases (first text, then second).
for item1 in spans1:
    for item2 in spans2:
        if item1.similarity(item2) > threshold:
            print(item1.text)
            print(item2.text)

# Alternative: compare spaCy's built-in noun_chunks instead of the hand-written patterns above
# for chunk1 in doc1.noun_chunks:
#     for chunk2 in doc2.noun_chunks:
#         if chunk1.similarity(chunk2) > threshold:
#             print(chunk1)
#             print(chunk2)

Feelings about current business conditions
Assessment of current business conditions
improved substantially from
improved substantially,
the first quarter
the first quarter
40 to 55
55 from 40


  if item1.similarity(item2) > threshold:
