# Sentence match- different approachs
### Tanzir Hasan

In [11]:
import difflib
import nltk

In [2]:
target_sentence = "Ovarian tumors of low malignant potential (borderline tumors)"
sentences = ["Ovarian tumors of low malignant potential (borderline tumors)",
            "ovarian tumors of low malignant potential (borderline tumors)",
            "Ovarian tumors,  of low malignant potential (borderline tumors)",
            "Ovarian tumors, that are of low malignant potential (borderline tumors)", 
            "Ovarian tumors that are of low malignant potential (borderline tumors)",
             "Ovarian tumors that are of lowest malignant potential (borderline tumors)",
            "Ovarian tumors of low malignant potential or borderline tumors",
            "Borderline tumors is the  ovarian tumors of low malignant potential",
            "Borderline tumors,ovarian tumors of low malignant potential ",
            "Patients suffering from breast tumors for 6 yers will be excluded"]


## Exact match

In [3]:
def is_exact_match(a,b):
    """ Check if a and b are matches"""
    return a== b

for sentence in sentences:
    print(is_exact_match(target_sentence, sentence), sentence)

(True, 'Ovarian tumors of low malignant potential (borderline tumors)')
(False, 'ovarian tumors of low malignant potential (borderline tumors)')
(False, 'Ovarian tumors,  of low malignant potential (borderline tumors)')
(False, 'Ovarian tumors, that are of low malignant potential (borderline tumors)')
(False, 'Ovarian tumors that are of low malignant potential (borderline tumors)')
(False, 'Ovarian tumors that are of lowest malignant potential (borderline tumors)')
(False, 'Ovarian tumors of low malignant potential or borderline tumors')
(False, 'Borderline tumors is the  ovarian tumors of low malignant potential')
(False, 'Borderline tumors,ovarian tumors of low malignant potential ')
(False, 'Patients suffering from breast tumors for 6 yers will be excluded')


## Exact Case_insensitive Token Match

In [4]:
import nltk.corpus
import nltk.tokenize
import string

from nltk.tokenize import TreebankWordTokenizer
# Get default English stopwords and extend with punctuation
stopwords = nltk.corpus.stopwords.words('english')
stopwords.extend(string.punctuation)
stopwords.append('')

# Create tokenizer
tokenizer = TreebankWordTokenizer()

def is_ci_token_match(a,b):
    tokens_a =[token.lower().strip(string.punctuation) for token in tokenizer.tokenize(a)]
    tokens_b =[token.lower().strip(string.punctuation) for token in tokenizer.tokenize(b)]
    return (tokens_a == tokens_b)

for sentence in sentences:
    print (is_ci_token_match(target_sentence, sentence), sentence)



(True, 'Ovarian tumors of low malignant potential (borderline tumors)')
(True, 'ovarian tumors of low malignant potential (borderline tumors)')
(False, 'Ovarian tumors,  of low malignant potential (borderline tumors)')
(False, 'Ovarian tumors, that are of low malignant potential (borderline tumors)')
(False, 'Ovarian tumors that are of low malignant potential (borderline tumors)')
(False, 'Ovarian tumors that are of lowest malignant potential (borderline tumors)')
(False, 'Ovarian tumors of low malignant potential or borderline tumors')
(False, 'Borderline tumors is the  ovarian tumors of low malignant potential')
(False, 'Borderline tumors,ovarian tumors of low malignant potential ')
(False, 'Patients suffering from breast tumors for 6 yers will be excluded')


In [5]:
help(nltk.word_tokenize)

Help on function word_tokenize in module nltk.tokenize:

word_tokenize(text, language='english')
    Return a tokenized copy of *text*,
    using NLTK's recommended word tokenizer
    (currently :class:`.TreebankWordTokenizer`
    along with :class:`.PunktSentenceTokenizer`
    for the specified language).
    
    :param text: text to split into sentences
    :param language: the model name in the Punkt corpus



# Exact Case-Insensitive Token Match after Stopwording

In [6]:
import nltk.corpus
import nltk.tokenize
import string
# Get default English stopwords and extende with punctuation
stopwords = nltk.corpus.stopwords.words('english')
stopwords.extend(string.punctuation)
stopwords.append('')

# Create tokenizer

tokenizer = TreebankWordTokenizer()

def is_ci_token_stopword_match(a,b):
    token_a =[token.lower().strip(string.punctuation) for token in tokenizer.tokenize(a) \
              if token.lower().strip(string.punctuation) not in stopwords]
    token_b =[token.lower().strip(string.punctuation) for token in tokenizer.tokenize(b) \
              if token.lower().strip(string.punctuation) not in stopwords]
    return (token_a == token_b)

for sentence in sentences:
    print( is_ci_token_stopword_match(target_sentence,sentence), sentence)

(True, 'Ovarian tumors of low malignant potential (borderline tumors)')
(True, 'ovarian tumors of low malignant potential (borderline tumors)')
(True, 'Ovarian tumors,  of low malignant potential (borderline tumors)')
(True, 'Ovarian tumors, that are of low malignant potential (borderline tumors)')
(True, 'Ovarian tumors that are of low malignant potential (borderline tumors)')
(False, 'Ovarian tumors that are of lowest malignant potential (borderline tumors)')
(True, 'Ovarian tumors of low malignant potential or borderline tumors')
(False, 'Borderline tumors is the  ovarian tumors of low malignant potential')
(False, 'Borderline tumors,ovarian tumors of low malignant potential ')
(False, 'Patients suffering from breast tumors for 6 yers will be excluded')


## Exact token Match after stopwording and stemming

In [7]:
import nltk.corpus
import nltk.tokenize
import string
import nltk.stem.snowball

# Get default English stopwords and extend with punctuation
stopwords = nltk.corpus.stopwords.words('english')
stopwords.extend(string.punctuation)
stopwords.append('')

# Create tokenizer and stemmer
tokenizer = nltk.tokenize.TreebankWordTokenizer()
stemmer = nltk.stem.snowball.SnowballStemmer('english')

def is_ci_token_stopword_stem_match(a,b):
    token_a = [token.lower().strip(string.punctuation) for token in tokenizer.tokenize(a) \
               if token.lower().strip(string.punctuation) not in stopwords]
    token_b = [token.lower().strip(string.punctuation) for token in tokenizer.tokenize(b) \
               if token.lower().strip(string.punctuation) not in stopwords]
    stem_a = [stemmer.stem(token) for token in token_a]
    stem_b = [stemmer.stem(token) for token in token_b]
    return (stem_a == stem_b)
for sentence in sentences:
    print(is_ci_token_stopword_stem_match(target_sentence,sentence), sentence) 

(True, 'Ovarian tumors of low malignant potential (borderline tumors)')
(True, 'ovarian tumors of low malignant potential (borderline tumors)')
(True, 'Ovarian tumors,  of low malignant potential (borderline tumors)')
(True, 'Ovarian tumors, that are of low malignant potential (borderline tumors)')
(True, 'Ovarian tumors that are of low malignant potential (borderline tumors)')
(False, 'Ovarian tumors that are of lowest malignant potential (borderline tumors)')
(True, 'Ovarian tumors of low malignant potential or borderline tumors')
(False, 'Borderline tumors is the  ovarian tumors of low malignant potential')
(False, 'Borderline tumors,ovarian tumors of low malignant potential ')
(False, 'Patients suffering from breast tumors for 6 yers will be excluded')


## Exact token Match after stopwording and Lementing

In [8]:
import nltk.corpus
import nltk.tokenize
import nltk.stem.snowball
from nltk.corpus import wordnet
import string

# Get default English stopwords and extend with punctuation
stopwords = nltk.corpus.stopwords.words('english')
stopwords.extend(string.punctuation)
stopwords.append('')

def get_wordnet_pos(pos_tag):
    if pos_tag[1].startswith('J'):
        return(pos_tag[0], wordnet.ADJ)
    elif pos_tag[1].startswith('V'):
        return(pos_tag[0], wordnet.VERB)
    elif pos_tag[1].startswith('N'):
        return(pos_tag[0], wordnet.NOUN)
    elif pos_tag[1].startswith('R'):
        return(pos_tag[0],wordnet.ADV)
    else:
        return( pos_tag[0],wordnet.NOUN)
    
# Create tokenizer and lemmantizer
tokenizer = nltk.tokenize.TreebankWordTokenizer()
lemmantizer = nltk.stem.wordnet.WordNetLemmatizer()

def is_ci_token_stopword_lemma_match(a,b):
    pos_a = map(get_wordnet_pos,nltk.pos_tag(tokenizer.tokenize(a)))
    pos_b = map(get_wordnet_pos,nltk.pos_tag(tokenizer.tokenize(b)))
    lemmae_a = [lemmantizer.lemmatize(token.lower().strip(string.punctuation),pos) for token,pos in pos_a\
               if token.lower().strip(string.punctuation) not in stopwords]
    lemmae_b = [lemmantizer.lemmatize(token.lower().strip(string.punctuation),pos) for token,pos in pos_b\
               if token.lower().strip(string.punctuation) not in stopwords]
    return(lemmae_a == lemmae_b)

for sentence in sentences:
    print(is_ci_token_stopword_lemma_match(target_sentence, sentence), sentence)
    
    
        


(True, 'Ovarian tumors of low malignant potential (borderline tumors)')
(True, 'ovarian tumors of low malignant potential (borderline tumors)')
(True, 'Ovarian tumors,  of low malignant potential (borderline tumors)')
(True, 'Ovarian tumors, that are of low malignant potential (borderline tumors)')
(True, 'Ovarian tumors that are of low malignant potential (borderline tumors)')
(True, 'Ovarian tumors that are of lowest malignant potential (borderline tumors)')
(True, 'Ovarian tumors of low malignant potential or borderline tumors')
(False, 'Borderline tumors is the  ovarian tumors of low malignant potential')
(False, 'Borderline tumors,ovarian tumors of low malignant potential ')
(False, 'Patients suffering from breast tumors for 6 yers will be excluded')


##  Partial Sequence Match after Stopwording and Lemmatizing

In [9]:
import nltk.corpus
import nltk.tokenize
import nltk.stem.snowball
import string
from nltk.corpus import wordnet
# Get default English stopwords and extend with punctuation
stopwords = nltk.corpus.stopwords.words('english')
stopwords.extend(string.punctuation)
stopwords.append('')


def get_wordnet_pos(pos_tag):
    if pos_tag[1].startswith('J'):
        return (pos_tag[0], wordnet.ADJ)
    elif pos_tag[1].startswith('V'):
        return(pos_tag[0], wordnet.VERB)
    elif pos_tag[1].startswith('N'):
        return(pos_tag[0], wordnet.NOUN)
    elif pos_tag[1].startswith('R'):
        return(pos_tag[0], wordnet.ADV)
    else:
        return(pos_tag[0], wordnet.NOUN)

# Create tokenizer and lemmatizer
tokenizer = nltk.tokenize.TreebankWordTokenizer()
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()

def is_ci_partial_seq_tken_stopword_lemma_match(a,b):
    pos_a = map(get_wordnet_pos, nltk.pos_tag(tokenizer.tokenize(a)))
    pos_b = map(get_wordnet_pos, nltk.pos_tag(tokenizer.tokenize(b)))
    lemmae_a = [lemmatizer.lemmatize(token.lower().strip(string.punctuation),pos) for token, pos in pos_a \
               if token.lower().strip(string.punctuation) not in stopwords]
    lemmae_b = [lemmatizer.lemmatize(token.lower().strip(string.punctuation),pos) for token,pos in pos_a \
               if token.lower().strip(string.punctuation) not in stopwords]
    # Create sequence matcher
    s = difflib.SequenceMatcher(None, lemmae_a, lemmae_b)
    return(s.ratio() > 0.66)

for sentence in sentences:
    print(is_ci_partial_seq_tken_stopword_lemma_match(target_sentence, sentence), sentence)
    

(True, 'Ovarian tumors of low malignant potential (borderline tumors)')
(True, 'ovarian tumors of low malignant potential (borderline tumors)')
(True, 'Ovarian tumors,  of low malignant potential (borderline tumors)')
(True, 'Ovarian tumors, that are of low malignant potential (borderline tumors)')
(True, 'Ovarian tumors that are of low malignant potential (borderline tumors)')
(True, 'Ovarian tumors that are of lowest malignant potential (borderline tumors)')
(True, 'Ovarian tumors of low malignant potential or borderline tumors')
(True, 'Borderline tumors is the  ovarian tumors of low malignant potential')
(True, 'Borderline tumors,ovarian tumors of low malignant potential ')
(True, 'Patients suffering from breast tumors for 6 yers will be excluded')


## Partial Noun set Match after stopword and lemmatizing

In [10]:
import nltk.corpus
import nltk.tokenize
import nltk.stem.snowball
import string
from nltk.corpus import wordnet

## Get English stopwords  and extend with puctuation
stopwords = nltk.corpus.stopwords.words('english')
stopwords.extend(string.punctuation)
stopwords.append('')

def get_wordnet_pos(pos_tag):
    if pos_tag[1].startswith('J'):
        return(pos_tag[0],wordnet.ADJ)
    elif pos_tag[1].startswith('V'):
        return(pos_tag[0],wordnet.VERB)
    elif pos_tag[1].startswith('N'):
        return(pos_tag[0],wordnet.NOUN)
    elif pos_tag[1].startswith('R'):
        return(pos_tag[0], wordnet.ADV)
    else:
        return(pos_tag[0],wordnet.NOUN)
    
    
# create tokenizer and lemmatizer
tokenizer = nltk.tokenize.TreebankWordTokenizer()
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()

def is_ci_partial_set_token_stopword_lemma_match(a,b):
    pos_a = map(get_wordnet_pos,nltk.pos_tag(tokenizer.tokenize(a)))
    pos_b = map(get_wordnet_pos, nltk.pos_tag(tokenizer.tokenize(b)))
    lemmae_a = [lemmatizer.lemmatize(token.lower().strip(string.punctuation),pos) for token, pos in pos_a\
                if pos == wordnet.NOUN and token.lower().strip(string.punctuation) not in stopwords]
    lemmae_b = [lemmatizer.lemmatize(token.lower().strip(string.punctuation),pos) for token, pos in pos_b\
                if pos == wordnet.NOUN and token.lower().strip(string.punctuation) not in stopwords]
    # Calculate Jaccard similarity 
    ratio = len(set(lemmae_a).intersection(lemmae_b))/ float(len(set(lemmae_a).union(lemmae_b)))
    return(ratio > 0.66)

for sentence in sentences:
    print(is_ci_partial_set_token_stopword_lemma_match(target_sentence, sentence),sentence) 

    

(True, 'Ovarian tumors of low malignant potential (borderline tumors)')
(True, 'ovarian tumors of low malignant potential (borderline tumors)')
(True, 'Ovarian tumors,  of low malignant potential (borderline tumors)')
(True, 'Ovarian tumors, that are of low malignant potential (borderline tumors)')
(True, 'Ovarian tumors that are of low malignant potential (borderline tumors)')
(True, 'Ovarian tumors that are of lowest malignant potential (borderline tumors)')
(True, 'Ovarian tumors of low malignant potential or borderline tumors')
(True, 'Borderline tumors is the  ovarian tumors of low malignant potential')
(True, 'Borderline tumors,ovarian tumors of low malignant potential ')
(False, 'Patients suffering from breast tumors for 6 yers will be excluded')
