In [2]:
# A collection of useful functions built during the course
import string
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet, stopwords
from nltk.data import find

In [10]:
# Strip punctuation characters from a string (letter case is left untouched)
def remove_punctuation(line):
    """Return ``line`` with every ``string.punctuation`` character deleted.

    No case conversion is performed; callers that need case-insensitive
    text must lower-case it themselves.
    """
    deletion_table = str.maketrans('', '', string.punctuation)
    return line.translate(deletion_table)

# Extract the non-stopword tokens from a sentence
def words_from_sent(sent):
    """Tokenize a sentence and drop English stopwords.

    Args:
        sent: The sentence text to split with ``word_tokenize``.

    Returns:
        list: The tokens in their original order, minus any that appear in
        ``stopwords.words('english')``. Matching is an exact string
        comparison, so letter case matters.
    """
    # Fetch the stopword list once per call (not once per token) and keep it
    # in a set so each membership test is constant-time.
    english_stopwords = set(stopwords.words('english'))
    return [word for word in word_tokenize(sent) if word not in english_stopwords]

# Part-of-speech tag a list of words
def tokens_from_words(words):
    """Run NLTK's ``pos_tag`` over ``words`` and return its result.

    The other helpers in this file read each resulting token as
    ``token[0]`` (the word) and ``token[1]`` (its POS tag).
    """
    tagged_words = pos_tag(words)
    return tagged_words

# Map a POS tag onto the matching WordNet part-of-speech constant
def wordnet_pos_code(tag):
    """Translate a POS tag into a WordNet POS constant.

    Tags beginning with ``NN``, ``VB``, ``JJ`` and ``RB`` map to
    ``wordnet.NOUN``, ``wordnet.VERB``, ``wordnet.ADJ`` and ``wordnet.ADV``
    respectively. Any other tag yields ``None``.
    """
    if tag.startswith('NN'):
        return wordnet.NOUN
    if tag.startswith('VB'):
        return wordnet.VERB
    if tag.startswith('JJ'):
        return wordnet.ADJ
    if tag.startswith('RB'):
        return wordnet.ADV
    # No WordNet category corresponds to this tag.
    return None
# Lemmatize every (word, tag) token with the WordNet lemmatizer
def tokens_to_lemmas(tokens):
    """Return a list holding ``token_to_lemmas(token)`` for each token, in order."""
    return [token_to_lemmas(token) for token in tokens]

def token_to_lemmas(token):
    """Lemmatize a single POS-tagged token.

    ``token[1]`` is converted to a WordNet POS via ``wordnet_pos_code``.
    When that yields a POS, ``token[0]`` is lemmatized with it; otherwise
    ``token[0]`` is returned unchanged.
    """
    wn_pos = wordnet_pos_code(token[1])
    if not wn_pos:
        # Tag has no WordNet counterpart: hand the word back as-is.
        return token[0]
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(token[0], pos=wn_pos)

# Convert tokens to senses; a sense is whatever token_to_sense returns for a token
def tokens_to_senses(tokens):
    """Collect the senses of all tokens, skipping tokens without one.

    Each token is passed to ``token_to_sense``; falsy results (``None`` or
    an empty lookup) are dropped and the remaining results are returned as
    a list in token order.
    """
    candidate_senses = (token_to_sense(token) for token in tokens)
    return [sense for sense in candidate_senses if sense]

def token_to_sense(token):
    """Look up the WordNet synsets for one POS-tagged token.

    Args:
        token: An indexable pair where ``token[0]`` is the word and
            ``token[1]`` is its POS tag.

    Returns:
        The result of ``wordnet.synsets`` for the lemmatized word,
        restricted to the tag's WordNet POS, or ``None`` when the tag has
        no WordNet POS.
    """
    pos = wordnet_pos_code(token[1])
    if not pos:
        # Nothing can be looked up without a WordNet POS, so skip the
        # lemmatizer entirely instead of computing a lemma that is discarded.
        return None
    # The lemmatizer is called without a POS argument here, so it applies
    # its own default rather than the POS derived from the tag.
    lemma = WordNetLemmatizer().lemmatize(token[0])
    return wordnet.synsets(lemma, pos=pos)

# Load the lines of a tab-separated text file as lists of stripped fields
def text_to_sentences(filename):
    """Read a UTF-8 text file and split every line on tab characters.

    Args:
        filename: Path of the file to read.

    Returns:
        list[list[str]]: One inner list per line of the file, holding that
        line's tab-separated fields with surrounding whitespace (including
        the trailing newline) stripped. A blank line yields ``['']``.
    """
    # The context manager closes the file handle even if reading fails.
    with open(filename, encoding="UTF8") as handle:
        return [[field.strip() for field in line.split("\t")] for line in handle]

def parse_pos(sentence):
    """Return ``bllip.parse_one(sentence)``.

    NOTE(review): ``bllip`` is not defined in the visible cells; it is
    presumably a parser object created elsewhere in the notebook. Confirm
    it exists before this function is called on a fresh kernel.
    """
    parse_result = bllip.parse_one(sentence)
    return parse_result

In [3]:
def compare_synsets(synset_a, synset_b):
    """Compute several relatedness measures between two synsets.

    Returns:
        tuple: ``(lowest common hypernyms, path similarity, Wu-Palmer
        similarity, Lin similarity, Leacock-Chodorow similarity)``, each
        obtained by calling the same-named method on ``synset_a`` with
        ``synset_b``.

    NOTE(review): the Lin measure reads a global ``brown_ic`` that is not
    defined in the visible cells. It is presumably an information-content
    object loaded elsewhere; confirm.
    """
    common_hypernyms = synset_a.lowest_common_hypernyms(synset_b)
    path_score = synset_a.path_similarity(synset_b)
    wup_score = synset_a.wup_similarity(synset_b)
    lin_score = synset_a.lin_similarity(synset_b, brown_ic)
    lch_score = synset_a.lch_similarity(synset_b)
    return (common_hypernyms, path_score, wup_score, lin_score, lch_score)

In [2]:
def count(g, s):
    """Tally confusion-matrix cells for binary labels.

    ``g`` holds the gold labels and ``s`` the system labels; positions
    ``0 .. len(g) - 1`` of both are compared. Only system labels equal to
    1 or 0 are counted.

    Returns:
        list: ``[TP, TN, FP, FN]``.
    """
    true_pos = true_neg = false_pos = false_neg = 0
    for idx in range(len(g)):
        gold, predicted = g[idx], s[idx]
        if predicted == 1:
            # Predicted positive: correct if it matches gold, else a false alarm.
            if gold == predicted:
                true_pos += 1
            else:
                false_pos += 1
        elif predicted == 0:
            # Predicted negative: correct if it matches gold, else a miss.
            if gold == predicted:
                true_neg += 1
            else:
                false_neg += 1
    return [true_pos, true_neg, false_pos, false_neg]
    
def MSRP_eval(gs, sys):
    """Print accuracy, specificity and sensitivity of binary predictions.

    Args:
        gs: Gold-standard labels (1 or 0), indexable.
        sys: System labels (1 or 0) aligned with ``gs``. The name shadows
            the standard ``sys`` module inside this function only; it is
            kept so existing keyword callers continue to work.

    Returns:
        None. The three rates are printed. A rate whose denominator is
        zero (empty input, or a gold class with no examples) is reported
        as NaN instead of raising ``ZeroDivisionError``.
    """
    def ratio(numerator, denominator):
        # A rate over zero examples is undefined; report NaN rather than crash.
        return numerator / float(denominator) if denominator else float('nan')

    TP, TN, FP, FN = count(gs, sys)
    acc = ratio(TP + TN, TP + TN + FP + FN)  # accuracy
    reject = ratio(TN, TN + FP)  # specificity: share of gold negatives predicted negative
    accept = ratio(TP, TP + FN)  # sensitivity: share of gold positives predicted positive
    print("acc=",acc," reject=",reject," accept=",accept)