In [1]:
import string
import numpy as np
import pandas as pd

from nltk import word_tokenize, pos_tag
from nltk.parse import BllipParser
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet, stopwords
from nltk.chunk import ne_chunk
from nltk.data import find
from sklearn.linear_model import LogisticRegression
from nltk.metrics import *
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import Scatter, Figure, Layout
init_notebook_mode(connected=True)

In [12]:
'''
    A collection of useful functions for nlp
'''
# Removing the punctuation and lowering the case of a string
def remove_punctuation(line):
    return line.translate(str.maketrans('', '', string.punctuation))

# extract the words from the sentence
def words_from_sent(sent):
    # tokenized
    tokenized = word_tokenize(sent)
    # remove stopwords and return
    return [word for word in tokenized if word not in stopwords.words('english')]

# convert words to tokens
def tokens_from_words(words):
    return pos_tag(words)

# Function to get wordnet pos code
def wordnet_pos_code(tag):
    if tag.startswith('NN'):
        return wordnet.NOUN
    elif tag.startswith('VB'):
        return wordnet.VERB
    elif tag.startswith('JJ'):
        return wordnet.ADJ
    elif tag.startswith('RB'):
        return wordnet.ADV
    else:
        return None

# Tokens to lemmas using wordnet lemmatizer    
def tokens_to_lemmas(tokens):
    return list(map(token_to_lemmas, tokens))

def token_to_lemmas(token):    
    pos = wordnet_pos_code(token[1])
    if pos:
        return WordNetLemmatizer().lemmatize(token[0], pos=pos)
    return token[0]

# Tokens to lemmas and senses (top_synset + pos)
def extract_lemmas_senses(tokens):
    lemmas, top_synsets = [], []
    for token in tokens:
        pos = token[1]
        wn_pos = wordnet_pos_code(pos)
        lemma = WordNetLemmatizer().lemmatize(token[0])
        lemmas.append(lemma)
        if wn_pos: 
            synsets = wordnet.synsets(lemma, pos=wn_pos)
            if len(synsets) > 0:
                top_synsets.append([synsets[0], pos])
    return lemmas, top_synsets

# Load the lines of training text as sentences
def text_to_sentences(filename):
    sentence_pair_array = []
    for line in open(filename, encoding="UTF8").readlines():
        sentence_pair_array.append([s.strip() for s in line.split("\t")])
    return sentence_pair_array

def compare_synsets(synset_a, synset_b):
    lcs = synset_a.lowest_common_hypernyms(synset_b)
    similarity = synset_a.path_similarity(synset_b)
    wup_similarity = synset_a.wup_similarity(synset_b)
    lin_similarity = synset_a.lin_similarity(synset_b, brown_ic)                        
    lch_similarity = synset_a.lch_similarity(synset_b)
    return lcs, similarity, wup_similarity, lin_similarity, lch_similarity

def count(g,s):
    TP = TN = FP = FN = 0
    for i in range(0,len(g)):
        if (g[i]==s[i] and s[i]==1): TP+=1
        if (g[i]==s[i] and s[i]==0): TN+=1
        if (g[i]!=s[i] and s[i]==1): FP+=1
        if (g[i]!=s[i] and s[i]==0): FN+=1
    return [TP,TN,FP,FN]
    
def MSRP_eval(gs, sys):
    [TP,TN,FP,FN] = count(gs,sys)
    acc = (TP+TN)/float(TP+TN+FP+FN) # ACCURACY
    reject = TN/float(TN+FP) # precision on negative SPECIFICITY
    accept = TP/float(TP+FN) # precision on positive SENSITIVITY
    print("acc=",acc," reject=",reject," accept=",accept)

In [3]:
'''
    The paraphrase data that will be analysed
'''
train_input = text_to_sentences('IHLT-eval-framework/train/msr_paraphrase_train_input.txt')
test_input = text_to_sentences('IHLT-eval-framework/test/msr_paraphrase_test_input.txt')
train_classes = open('IHLT-eval-framework/train/msr_paraphrase_train_gs.txt', encoding="utf-8-sig").readlines()
test_classes = open('IHLT-eval-framework/test/msr_paraphrase_test_gs.txt', encoding="UTF8").readlines()

In [5]:
'''
    First Paraphrase detector approach, jaccard distance of lemmas    
'''
def lemma_jaccard(sent_0, sent_1):
    print('.', end='')
    sent_0, sent_1 = remove_punctuation(sent_0.lower()), remove_punctuation(sent_1.lower())
    words_0, words_1 = words_from_sent(sent_0), words_from_sent(sent_1)
    tokens_0, tokens_1 = tokens_from_words(words_0), tokens_from_words(words_1)
    lemmas_0, lemmas_1 = tokens_to_lemmas(tokens_0), tokens_to_lemmas(tokens_1)
    return jaccard_distance(set(lemmas_0), set(lemmas_1))

print('Training')
X_train = [lemma_jaccard(data[0], data[1]) for data in train_input]
y_train = [int(line.strip()) for line in train_classes]
print('Testing')
X_test = [lemma_jaccard(data[0], data[1])for data in test_input]
y_test = [int(line.strip()) for line in test_classes]
print('Results')
regression = LogisticRegression()
regression.fit(np.array(X_train).reshape(-1,1), y_train)
prediction = regression.predict(np.array(X_test).reshape(-1,1))
MSRP_eval(prediction, y_test)

Training
...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

In [8]:
'''
    Second Paraphrase detector approach, lemma jaccard + top synset pos jaccard 
'''
def lemma_pos_jaccard(sent_0, sent_1):
    print('.', end='')
    sent_0, sent_1 = remove_punctuation(sent_0.lower()), remove_punctuation(sent_1.lower())
    words_0, words_1 = words_from_sent(sent_0), words_from_sent(sent_1)
    tokens_0, tokens_1 = tokens_from_words(words_0), tokens_from_words(words_1)
    lemmas_0, senses_0 = extract_lemmas_senses(tokens_0)
    lemmas_1, senses_1 = extract_lemmas_senses(tokens_1)                        
    top_synsets_pos_0 = [sense[1] for sense in senses_0]
    top_synsets_pos_1 = [sense[1] for sense in senses_1]
    return [
        jaccard_distance(set(lemmas_0), set(lemmas_1)),
        jaccard_distance(set(top_synsets_pos_0), set(top_synsets_pos_1))
    ]

print('Training')
X_train = [lemma_pos_jaccard(data[0], data[1]) for data in train_input]
y_train = [int(line.strip()) for line in train_classes]
print('Testing')
X_test = [lemma_pos_jaccard(data[0], data[1])for data in test_input]
y_test = [int(line.strip()) for line in test_classes]

Training
...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

In [26]:
print('Results')
regression = LogisticRegression()
regression.fit(np.array(X_train), y_train)
prediction = regression.predict(np.array(X_test))
MSRP_eval(prediction, y_test)

Results
acc= 0.712463768115942  reject= 0.6171428571428571  accept= 0.7367272727272727


['n', 'v', 'a', 'r']

In [None]:
'''
    Third Paraphrase detector approach, lemma jaccard + top synset pos jaccard + synset distance
'''
def lemma_pos_jaccard(sent_0, sent_1):
    print('.', end='')
    sent_0, sent_1 = remove_punctuation(sent_0.lower()), remove_punctuation(sent_1.lower())
    words_0, words_1 = words_from_sent(sent_0), words_from_sent(sent_1)    
    tokens_0, tokens_1 = tokens_from_words(words_0), tokens_from_words(words_1)
    lemmas_0, senses_0 = extract_lemmas_senses(tokens_0)
    lemmas_1, senses_1 = extract_lemmas_senses(tokens_1)                        
    top_synsets_pos_0, top_synsets_pos_1 = [], []
    noun_distance, verb_distance, adj_distance, verb_distance = [], [], [], []
    for sense_0 in senses_0:        
        # add the pos tag (NOT the wordnet one) to the top synset pos list)         
        top_synsets_pos_0.append(sense_0[1])
        for sense_1 in senses_1:
            # add the pos tag (NOT the wordnet one) to the top synset pos list)         
            top_synsets_pos_1.append(sense_1[1])
            #  compare wordnet pos tags and calculate path similarity grouped by pos
            if ((sense_0[0].pos() == 'n') & (sense_1[0].pos() == 'n')):
                noun_distance.append(sense_0[0].path_similarity(sense_1[0]))
            if ((sense_0[0].pos() == 'v') & (sense_1[0].pos() == 'v')):
                verb_distance.append(sense_0[0].path_similarity(sense_1[0]))
            if ((sense_0[0].pos() == 'a') & (sense_1[0].pos() == 'a')):
                adj_distance.append(sense_0[0].path_similarity(sense_1[0]))
            if ((sense_0[0].pos() == 'r') & (sense_1[0].pos() == 'r')):
                verb_distance.append(sense_0[0].path_similarity(sense_1[0]))
    return [
        jaccard_distance(set(lemmas_0), set(lemmas_1)),
        jaccard_distance(set(top_synsets_pos_0), set(top_synsets_pos_1)),
        np.sum(list(filter(None, noun_distance))), 
        np.sum(list(filter(None, verb_distance))),
        np.sum(list(filter(None, adj_distance))), 
        np.sum(list(filter(None, verb_distance)))
    ]

print('Training')
X_train = [lemma_pos_jaccard(data[0], data[1]) for data in train_input]
y_train = [int(line.strip()) for line in train_classes]
print('Testing')
X_test = [lemma_pos_jaccard(data[0], data[1])for data in test_input]
y_test = [int(line.strip()) for line in test_classes]

Training
...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

In [46]:
print('Results')
# regression = LogisticRegression()
# regression.fit(np.array(X_train), y_train)
# prediction = regression.predict(np.array(X_test))
# MSRP_eval(prediction, y_test)

Results


[1, 2, 3]