# **Rule based Matching**

This implementation handles only basic cases. For more complicated matching, more complicated rules need to be created, which requires in-depth knowledge of English grammar and of the relationships between the different dependencies in a sentence.

In [None]:
!python -m spacy download en_core_web_lg

In [None]:
import spacy
nlp = spacy.load('en_core_web_lg')

In [None]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
from nltk.corpus import wordnet as wn

In [None]:
#text1 = "To simulate the behaviour of portions of the desired software product"
#text2 = "To simulate portions of the desired final product"
#Output: 0.6401728588077857

#text1 = "He has tons of stuff to throw away"
#text2 = "He needs to get rid of a lot of junk"
#Output: 0.5047788208129395

### **Cases which it can handle:**

**Synonyms**

In [None]:
text1 = "they persuaded her to confess her fault"
text2 = "they convinced her to admit her mistake"
#Output: 0.7983233482666816

**Antonyms**

There are 2 ways in which an antonym may affect the meaning of a sentence. Either it completely changes the meaning of the sentence, e.g., "she loves you" vs. "she hates you", or it changes the meaning of only some portion of the sentence, as shown below.

In [None]:
text1 = "tigers are carnivores which hunt in night"
text2 = "tigers are carnivores which hunt in day"
#Output: 0.8

**Simple Active-Passive Voice**

In [None]:
text1 = "children are plucking flowers in the park"
text2 = "flowers are being plucked in the park by children"
#Output: 1.0

In [None]:
text1 = "we are going to watch a movie"
text2 = "a movie is going to be watched by us"
#Output: 1.0

**Negation Sentences**

If the two sentences are opposites in the sense of negation (one of them is negated and the other is not), we assign a low score to that answer.

In [None]:
text1 = "technology is not harmful to human beings"
text2 = "technology is harmful to human beings"
#Output: 0.2

**Conjunctions**

In [None]:
text1 = "it is a form of technology that uses telecommunication and computer systems for study"
text2 = "it is a form of technology that uses computer and telecommunication systems for study"
#Output: 0.9714285714285714

In [None]:
text1 = "we have apples, pears and oranges"
text2 = "we have oranges, apples and pears"
#Output: 0.9

One thing to notice here is that if the order of the items matters, we cannot detect the difference. This is a limitation of the rule-based approach: a rule that works fine for one type of sentence may fail when some other kind of variation is considered.

In [None]:
text1 = "he came home, took a shower and went to bed"
text2 = "he came home, went to bed and took a shower"
#Output: 0.9666666666666667

**It detects some wrong answers of the type given below:**

In [None]:
text1 = "man killed a dog"
text2 = "dog killed a man"
#Output: 0.3333333333333333

Even when the word order is wrong, the approach may give wrong results if most of the words are still in the correct order. However, we handle these edge cases.

In [None]:
text1 = "many people are being taken hostages by the terrorists"
text2 = "many terrorists are being taken hostages by the people"
#Output: 0.3333333333333333

### **Cases for which it fails**

In [None]:
text1 = "the price of a resort vacation includes meals, tips and equipment rentals, which makes your trip more cost-effective"
text2 = "all-inclusive resort vacations can make for an economical trip"
#Output: 0.239099710501727

In [None]:
text1 = "israel was established in 1979"
text2 = "The national institute for psychobiology in israel was established in 1979"
#Output: 0.6666666666666666

### **Code**

In [None]:
# Run the spaCy pipeline on both sentences; text1 is parsed before text2.
doc1, doc2 = nlp(text1), nlp(text2)

In [None]:
# Keep only the tokens that are neither stop words nor punctuation.
doc1_p = [tok for tok in doc1 if not (tok.is_stop or tok.is_punct)]
doc2_p = [tok for tok in doc2 if not (tok.is_stop or tok.is_punct)]
print(doc1_p)
print(doc2_p)

[simulate, behaviour, portions, desired, software, product]
[simulate, portions, desired, final, product]


In [None]:
# Pair every filtered token of doc1 with the filtered token of doc2 whose
# lemma is most similar to its own lemma.
#   v_similarities: doc1 token -> similarity of its best match
#   matches:        doc1 token -> the best-matching doc2 token
v_similarities = {}
matches = {}

# Each doc2 lemma is parsed once up front instead of once per (token1, token2)
# pair; the similarity values themselves are unchanged.
lemma_docs2 = [(token2, nlp(token2.lemma_)) for token2 in doc2_p]

for token1 in doc1_p:
    lemma1 = nlp(token1.lemma_)
    # Start below any possible similarity so that the first candidate is
    # always accepted. Starting at 0 left tokens whose similarities were all
    # <= 0 without an entry, which later failed with KeyError on
    # matches[token]. Ties still go to the earliest candidate (strict ">").
    max_sim = float("-inf")
    for token2, lemma2 in lemma_docs2:
        sim = lemma1.similarity(lemma2)
        if sim > max_sim:
            max_sim = sim
            v_similarities[token1] = sim
            matches[token1] = token2

print(v_similarities)
print(matches)

{simulate: 1.0, behaviour: 0.3336149453173265, portions: 1.0, desired: 1.0, software: 0.4676814957410669, product: 1.0}
{simulate: simulate, behaviour: desired, portions: portions, desired: desired, software: product, product: product}


In [None]:
def synoFunc(word):
    """Return the lemma names of every WordNet synset of *word*.

    Names are collected synset by synset in the order WordNet returns them;
    duplicates are kept.
    """
    synonyms = []
    for synset in wn.synsets(word):
        synonyms.extend(lemma.name() for lemma in synset.lemmas())
    return synonyms

In [None]:
def antoFunc(word):
    """Return antonym names for *word* gathered from all of its WordNet synsets.

    For every lemma that has at least one antonym, only the name of its
    first antonym is recorded.
    """
    antonyms = []
    for synset in wn.synsets(word):
        for lemma in synset.lemmas():
            opposites = lemma.antonyms()
            if not opposites:
                continue
            antonyms.append(opposites[0].name())
    return antonyms

In [None]:
def hyperFunc(word):
    """Return the hypernym names of *word*'s first WordNet synset.

    Only the first synset returned by ``wn.synsets(word)`` is consulted, and
    each hypernym contributes the name of its first lemma.

    Returns an empty list when WordNet has no synset for *word*, instead of
    raising ``IndexError`` on the unconditional ``[0]`` lookup.
    """
    synsets = wn.synsets(word)
    if not synsets:
        return []
    return [hyper.lemmas()[0].name() for hyper in synsets[0].hypernyms()]

In [None]:
def hypoFunc(word):
    """Return the hyponym names of *word*'s first WordNet synset.

    Only the first synset returned by ``wn.synsets(word)`` is consulted, and
    each hyponym contributes the name of its first lemma.

    Returns an empty list when WordNet has no synset for *word*, instead of
    raising ``IndexError`` on the unconditional ``[0]`` lookup.
    """
    synsets = wn.synsets(word)
    if not synsets:
        return []
    return [hypo.lemmas()[0].name() for hypo in synsets[0].hyponyms()]

In [None]:
def holoFunc(word):
    """Return the holonym names of *word*'s first WordNet synset.

    Part holonyms are listed first, followed by substance holonyms; each
    contributes the name of its first lemma. Only the first synset returned
    by ``wn.synsets(word)`` is consulted.

    Returns an empty list when WordNet has no synset for *word*, instead of
    raising ``IndexError`` on the unconditional ``[0]`` lookup.
    """
    synsets = wn.synsets(word)
    if not synsets:
        return []
    synset = synsets[0]
    holonyms = [holo.lemmas()[0].name() for holo in synset.part_holonyms()]
    holonyms.extend(holo.lemmas()[0].name() for holo in synset.substance_holonyms())
    return holonyms

In [None]:
def meroFunc(word):
    """Return the meronym names of *word*'s first WordNet synset.

    Part meronyms are listed first, followed by substance meronyms; each
    contributes the name of its first lemma. Only the first synset returned
    by ``wn.synsets(word)`` is consulted.

    Returns an empty list when WordNet has no synset for *word*, instead of
    raising ``IndexError`` on the unconditional ``[0]`` lookup.
    """
    synsets = wn.synsets(word)
    if not synsets:
        return []
    synset = synsets[0]
    meronyms = [mero.lemmas()[0].name() for mero in synset.part_meronyms()]
    meronyms.extend(mero.lemmas()[0].name() for mero in synset.substance_meronyms())
    return meronyms

In [None]:
# Scores given to a token pair according to the WordNet relation found
# between the two lemmas. When several of the first five relations hold at
# once, the scoring loop averages their scores; an antonym match overrides
# everything else and sets the pair's score to ANTONYM.
SYNONYM = 0.8
HYPERNYM = 0.7
HYPONYM = 0.6
HOLONYM = 0.5
MERONYM = 0.4
ANTONYM = 0.2

In [None]:
# WordNet relation lookups in the order they are tried, each paired with the
# score it contributes when the matched lemma is found in its result.
relation_scores = (
    (synoFunc, SYNONYM),
    (hyperFunc, HYPERNYM),
    (hypoFunc, HYPONYM),
    (holoFunc, HOLONYM),
    (meroFunc, MERONYM),
)

# Refine the vector-based similarity of each matched pair with WordNet.
for word in doc1_p:
    word1 = word.lemma_
    word2 = matches[word].lemma_

    # A similarity of exactly 1.0 is left untouched.
    if v_similarities[word] == 1.0:
        continue

    # An antonym match replaces the score outright; no other relation is checked.
    if word2 in antoFunc(word1):
        v_similarities[word] = ANTONYM
        continue

    # Average the scores of every relation that holds. If none holds, the
    # vector-based similarity is kept as it is.
    total = 0
    hit_count = 0
    for lookup, relation_score in relation_scores:
        if word2 in lookup(word1):
            total += relation_score
            hit_count += 1

    if hit_count:
        v_similarities[word] = total / hit_count

# Vocabulary score: mean of the per-token scores.
v_score = sum(v_similarities.values()) / len(v_similarities)
print(v_similarities)

{simulate: 1.0, behaviour: 0.3336149453173265, portions: 1.0, desired: 1.0, software: 0.4676814957410669, product: 1.0}


## **Visualization**

In [None]:
from spacy import displacy

# Draw the dependency parse of each sentence, doc1 first.
for parsed in (doc1, doc2):
    displacy.render(parsed, style='dep', jupyter=True, options={'distance': 115})

In [None]:
# Print one line per token of doc1 as dep(head-headIndex, token-tokenIndex),
# with 1-based token positions.
for token in doc1:
    head = token.head
    fields = (token.text, token.tag_, token.dep_, head.text, head.tag_, token.i + 1, head.i + 1)
    print("{2}({3}-{6}, {0}-{5})".format(*fields))

aux(simulate-2, To-1)
ROOT(simulate-2, simulate-2)
det(behaviour-4, the-3)
dobj(simulate-2, behaviour-4)
prep(behaviour-4, of-5)
pobj(of-5, portions-6)
prep(portions-6, of-7)
det(product-11, the-8)
amod(product-11, desired-9)
compound(product-11, software-10)
pobj(of-7, product-11)


In [None]:
# Print one line per token of doc2 as dep(head-headIndex, token-tokenIndex),
# with 1-based token positions.
for token in doc2:
    head = token.head
    fields = (token.text, token.tag_, token.dep_, head.text, head.tag_, token.i + 1, head.i + 1)
    print("{2}({3}-{6}, {0}-{5})".format(*fields))

aux(simulate-2, To-1)
ROOT(simulate-2, simulate-2)
dobj(simulate-2, portions-3)
prep(portions-3, of-4)
det(product-8, the-5)
amod(product-8, desired-6)
amod(product-8, final-7)
pobj(of-4, product-8)


In [None]:
# Print every token of doc1 followed by its tag_ value, as text-TAG.
for token in doc1:
    text, tag = token.text, token.tag_
    print("{0}-{1}".format(text, tag))

To-TO
simulate-VB
the-DT
behaviour-NN
of-IN
portions-NNS
of-IN
the-DT
desired-VBN
software-NN
product-NN


In [None]:
# Print every token of doc2 followed by its tag_ value, as text-TAG.
for token in doc2:
    text, tag = token.text, token.tag_
    print("{0}-{1}".format(text, tag))

To-TO
simulate-VB
portions-NNS
of-IN
the-DT
desired-VBN
final-JJ
product-NN


## **Checking Word Order**

In [None]:
from spacy.matcher import Matcher

# A sentence is treated as passive when any of its tokens carries the
# "nsubjpass" or "auxpass" dependency label.
matcher = Matcher(nlp.vocab)

pattern1 = [{"DEP": "nsubjpass"}]
pattern2 = [{"DEP": "auxpass"}]

# Patterns are passed as a list in second position. The older
# add(name, None, pattern) call form is rejected by spaCy 3; the list form
# is also accepted by late spaCy 2 releases (see the Matcher.add docs).
matcher.add("IS_PASSIVE_1", [pattern1])
matcher.add("IS_PASSIVE_2", [pattern2])

found_1 = matcher(doc1)
found_2 = matcher(doc2)
is_doc1_passive = 1 if found_1 else 0
is_doc2_passive = 1 if found_2 else 0

# 1 only when exactly one of the two sentences is passive, i.e. their voices
# differ.
is_passive = is_doc1_passive ^ is_doc2_passive
print(is_passive)

0


In [None]:
# Drop both passive-voice rules from the shared matcher.
for rule_name in ("IS_PASSIVE_1", "IS_PASSIVE_2"):
    matcher.remove(rule_name)

In [None]:
# A sentence is treated as negated when any of its tokens carries the "neg"
# dependency label.
pattern3 = [{"DEP": "neg"}]

# Patterns are passed as a list in second position. The older
# add(name, None, pattern) call form is rejected by spaCy 3; the list form
# is also accepted by late spaCy 2 releases (see the Matcher.add docs).
matcher.add("IS_NEGATIVE", [pattern3])

found_3 = matcher(doc1)
found_4 = matcher(doc2)
is_doc1_negative = 1 if found_3 else 0
is_doc2_negative = 1 if found_4 else 0

# 1 only when exactly one of the two sentences is negated.
is_negative = is_doc1_negative ^ is_doc2_negative
print(is_negative)

0


In [None]:
# Structural ("edge") score: for every filtered doc1 token, compare its
# dependency label with the label of its best match in doc2 and assign a
# weight between 0 and 1. The weights are averaged into e_score, and the
# final score is v_score * e_score.
e_similarities = {}
for token in doc1_p:
    token_dep = token.dep_
    match_dep = matches[token].dep_
    token_head = token.head
    match_head = matches[token].head
    # Default weight: tokens whose label is handled by none of the branches
    # below keep a weight of 1.
    weight = 1

    # Both tokens are conjuncts: count how many successive heads are also
    # labelled "conj" on each side. Equal counts keep the full weight,
    # different counts are slightly penalised.
    if token_dep == "conj" and match_dep == "conj":
        count_t_position = 0
        count_m_position = 0
        while token_head.dep_ == "conj":
            token_head = token_head.head
            count_t_position += 1
        while match_head.dep_ == "conj":
            match_head = match_head.head
            count_m_position += 1
        if count_t_position == count_m_position:
            weight = 1
        else:
            weight = 0.9


    # Only the match is a conjunct: climb to the first head that is not a
    # "conj" and compare that head's label with the token's label.
    elif match_dep == "conj":
        while match_head.dep_ == "conj":
            match_head = match_head.head
        if token_dep == match_head.dep_:
            weight = 0.9
        else:
            weight = 0


    # Only the doc1 token is a conjunct: same comparison in the other direction.
    elif token_dep == "conj":
        while token_head.dep_ == "conj":
            token_head = token_head.head
        if match_dep == token_head.dep_:
            weight = 0.9
        else:
            weight = 0


    # The root must be matched with the other sentence's root.
    elif token_dep == "ROOT":
        if match_dep == "ROOT":
            weight = 1
        else:
            weight = 0


    # Active subject: when the voices differ (is_passive), it may be matched
    # with a direct object or an object of a preposition; otherwise it must
    # be matched with a subject.
    elif token_dep == "nsubj":
        if is_passive:
            if match_dep == "dobj":
                weight = 1
            elif match_dep == "pobj":
                weight = 1
            else:
                weight = 0
        else:
            if match_dep == "nsubj":
                weight = 1
            else:
                weight = 0


    # Passive subject: when the voices differ it must be matched with a
    # direct object; otherwise with another passive subject.
    elif token_dep == "nsubjpass":
        if is_passive:
            if match_dep == "dobj":
                weight = 1
            else:
                weight = 0
        else:
            if match_dep == "nsubjpass":
                weight = 1
            else:
                weight = 0


    # Direct object: when the voices differ it may be matched with a direct
    # object or with either kind of subject; otherwise a direct object
    # scores 1 and an object of a preposition scores 0.8.
    elif token_dep == "dobj":
        if is_passive:
            if match_dep == "dobj":
                weight = 1
            elif match_dep == "nsubjpass":
                weight = 1
            elif match_dep == "nsubj":
                weight = 1
            else:
                weight = 0
        else:
            if match_dep == "dobj":
                weight = 1
            elif match_dep == "pobj":
                weight = 0.8
            else:
                weight = 0


    # Object of a preposition.
    elif token_dep == "pobj":
        if is_passive:
            # NOTE(review): every path through this is_passive branch leaves
            # weight at 1. The nested ifs only ever assign 1 and have no
            # else, so a non-matching label keeps the default of 1. The final
            # "else: weight = 0" looks unreachable, because is_passive is the
            # XOR of is_doc1_passive and is_doc2_passive, so one of the two
            # elif conditions is always true here. Presumably a mismatch was
            # meant to score 0 - confirm the intended rule before changing it.
            if match_dep == "nsubj":
                weight = 1
            elif is_doc1_passive:
                if token.head.dep_ != "agent":
                    if match_dep == "pobj":
                        weight = 1
            elif is_doc2_passive:
                if matches[token].head.dep_ != "agent":
                    if match_dep == "pobj":
                        weight = 1
            else:
                weight = 0
        else:
            if match_dep == "pobj":
                weight = 1
            elif match_dep == "dobj":
                weight = 0.8
            else:
                weight = 0




    e_similarities[token] = weight

# Mean structural weight over all filtered doc1 tokens.
# NOTE(review): this divides by zero when doc1_p is empty.
e_score = sum(e_similarities.values()) / len(e_similarities)
# A negation mismatch overrides the structural score with a fixed low value.
if is_negative:
    e_score = 0.2
# Final similarity: vocabulary score scaled by the structural score.
score = v_score * e_score
print(score)
#print(e_score)
print(e_similarities)

0.6401728588077857
{simulate: 1, behaviour: 0, portions: 0.8, desired: 1, software: 1, product: 1}
