# Text or Phrase Matching

In [3]:
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span
from spacy import displacy


In [4]:
# Load spaCy's small English pipeline; `nlp.vocab` is reused by the matchers built later.
nlp = spacy.load('en_core_web_sm')

In [5]:
# Short sample with irregular casing and a stand-alone "!" to inspect tokenisation.
doc = nlp("Hello ! worlD")

In [6]:
# Bare expression: the notebook displays the Doc, which renders as its original text.
doc

Hello ! worlD

In [7]:
# Iterating a Doc yields its tokens in order; each one prints as its text.
for token in doc:
    print(token)

Hello
!
worlD


## Token-based Matching

In [67]:
# Parallel lists: `a` holds the token attribute to test for each slot,
# `b` holds the value that attribute must have at the same position.
a = ['LOWER', 'IS_PUNCT', 'LOWER']
b = ['hello',True,'world']

In [68]:
# Pair each attribute name in `a` with the value at the same position in `b`,
# producing one single-key token spec per slot.
p, q, r = ({attr: value} for attr, value in zip(a, b))

In [69]:
# A token pattern is an ordered list with one spec dict per token.
# Note: `pattern` is rebound to a literal list in a later cell before it is used.
pattern = [p, q, r]

In [180]:
# One dict per token slot; 'OP': '?' marks a slot as optional.
pattern = [
    {'LOWER': 'hello', 'OP': '?'},   # optional "hello", compared on the lowercased text
    {'IS_PUNCT': True, 'OP': '?'},   # optional punctuation token
    {'IS_PUNCT': True, 'OP': '?'},   # optional second punctuation token
    {'LOWER': 'world'},              # required "world", compared on the lowercased text
]

In [181]:
# The matcher is built on the pipeline's shared vocabulary.
matcher = Matcher(nlp.vocab)
# Register the rule under the key 'HelloWorld'. Patterns are passed as a list
# in the second argument: the legacy positional form `add(key, None, pattern)`
# is rejected by spaCy v3, while the list form is accepted from spaCy 2.2.2 on.
matcher.add('HelloWorld', [pattern])

In [189]:
# Rebinds `doc` to a new sample with two punctuation tokens between the words,
# so both optional IS_PUNCT slots of the pattern can be exercised.
doc = nlp('Hello , ! World')

In [190]:
# Calling the matcher on a Doc returns a list of (match_id, start, end) tuples,
# where start/end are token offsets into `doc`.
matches = matcher(doc)
matches

[(15578876784678163569, 0, 4),
 (15578876784678163569, 1, 4),
 (15578876784678163569, 2, 4),
 (15578876784678163569, 3, 4)]

In [191]:
# Report each match: numeric ID, the rule name it resolves to, token offsets, matched text.
for match_id, start, end in matches:
    string_id = nlp.vocab.strings[match_id]  # map the numeric ID back to the rule key
    span = doc[start:end]                    # matched tokens; `end` is exclusive
    print(f"{match_id} {string_id} {start} {end} {span.text}")
    

15578876784678163569 HelloWorld 0 4 Hello , ! World
15578876784678163569 HelloWorld 1 4 , ! World
15578876784678163569 HelloWorld 2 4 ! World
15578876784678163569 HelloWorld 3 4 World


In [192]:
# NOTE(review): `start` is the variable left over from the match loop above, so it
# holds the start offset of the *last* match (3 in the recorded run). This cell
# only works after that loop has been executed.
doc[start].text

'World'

In [193]:
# Negative indices count from the end of the Doc: -1 is the last token.
doc[-1].text


'World'

In [194]:
# Token at offset 3, the same token the last match starts on.
doc[3].text

'World'

In [195]:
# Token at offset 1: the comma is tokenised as its own token.
doc[1].text

','

### Analysing Covid Research

In [218]:
# Matches "cause", optionally preceded by "can"; both compared on the lowercased text.
pattern2 = [
    {'LOWER': 'can', 'OP': '?'},   # optional leading "can"
    {'LOWER': 'cause'},            # required "cause"
]

In [219]:
# Separate matcher for the article, built on the same shared vocabulary.
matcher2 = Matcher(nlp.vocab)
# Register the rule under the key 'covid19'. Patterns are passed as a list in
# the second argument: the legacy positional form `add(key, None, pattern)` is
# rejected by spaCy v3, while the list form is accepted from spaCy 2.2.2 on.
matcher2.add('covid19', [pattern2])

In [220]:
# Read the article inside a context manager so the file handle is closed as soon
# as the text has been loaded, rather than being left open until garbage collection.
# NOTE(review): this relies on the platform's default text encoding; the article
# contains curly quotes, so consider passing an explicit `encoding=` once the
# file's encoding has been confirmed.
with open('covid_research.txt') as research_file:
    doc2 = nlp(research_file.read())
doc2

Dr Sonya Babu-Narayan, Associate Medical Director at the British Heart Foundation and Honorary Consultant Cardiologist, said: 

“Every day we learn more about Covid-19. Information to date suggests that people with heart disease, or are at risk of heart disease due to factors such as high blood pressure, diabetes or being severely overweight with a body mass index higher than 40, are at an increased risk of complications caused by the virus.

“If you have one of these conditions you should be taking all precautions possible to reduce your chance of catching the virus.

“Viruses can cause significant inflammation which can injure the heart and can worsen a person’s existing heart condition even if the virus does not enter the heart directly.

“Evidence shows that people with higher levels of a protein used to measure heart injury in their blood are more likely to die after contracting Covid-19. 

“However this kind of observational evidence can’t tell us why some people suffer heart dam

In [226]:
# Run the 'covid19' rule over the article; yields (match_id, start, end) tuples.
matches2 = matcher2(doc2)

In [231]:
# For each match, print its details followed by a 25-token window starting at
# the match, then a blank line to separate entries.
for match_id2, start2, end2 in matches2:
    string_id2 = nlp.vocab.strings[match_id2]  # map the numeric ID back to the rule key
    span2 = doc2[start2:end2]                  # matched tokens; `end2` is exclusive
    print(f"{match_id2} {string_id2} {start2} {end2} {span2.text}")
    print(doc2[start2:start2 + 25], end="\n\n")

8972206676154411618 covid19 111 113 can cause
can cause significant inflammation which can injure the heart and can worsen a person’s existing heart condition even if the virus does not enter

8972206676154411618 covid19 112 113 cause
cause significant inflammation which can injure the heart and can worsen a person’s existing heart condition even if the virus does not enter the

8972206676154411618 covid19 706 707 cause
cause unnecessary concern given the lack of evidence supporting it at this stage. There is however evidence supporting the statement that the risk of

8972206676154411618 covid19 1280 1281 cause
cause inflammation of the lining of the heart (pericarditis), the heart muscle (myocarditis) or trigger worsening of existing heart conditions

8972206676154411618 covid19 1385 1386 cause
cause damage to the heart muscle (detected by measuring a protein called Troponin in the blood). This heart damage is likely to



In [207]:
# 111 is the start offset of the first match reported above; the end offset is
# hard-coded and appears to have been picked by hand to reach the end of the sentence.
doc2[111:140]

can cause significant inflammation which can injure the heart and can worsen a person’s existing heart condition even if the virus does not enter the heart directly.

In [208]:
# 706 is the start offset of a "cause" match reported above; the end offset is
# hard-coded and appears to have been picked by hand to reach the end of the sentence.
doc2[706:720]

cause unnecessary concern given the lack of evidence supporting it at this stage.

In [214]:
# 1280 is the start offset of a "cause" match reported above; the end offset is
# hard-coded and appears to have been picked by hand to reach the end of the sentence.
doc2[1280:1313]

cause inflammation of the lining of the heart (pericarditis), the heart muscle (myocarditis) or trigger worsening of existing heart conditions such as heart attack or heart failure.

In [217]:
# 1385 is the start offset of a "cause" match reported above; the end offset is
# hard-coded and appears to have been picked by hand to reach the end of the sentence.
doc2[1385:1404]

cause damage to the heart muscle (detected by measuring a protein called Troponin in the blood).