In [22]:
import string
import numpy as np
import pandas as pd

from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet, stopwords
from nltk.data import find
from sklearn.linear_model import LogisticRegression
from nltk.metrics import *
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import Scatter, Figure, Layout
init_notebook_mode(connected=True)

In [8]:
'''
    The paraphrase data that will be analysed
'''
train_input = text_to_sentences('IHLT-eval-framework/train/msr_paraphrase_train_input.txt')
test_input = text_to_sentences('IHLT-eval-framework/test/msr_paraphrase_test_input.txt')
train_classes = open('IHLT-eval-framework/train/msr_paraphrase_train_gs.txt', encoding="utf-8-sig").readlines()
test_classes = open('IHLT-eval-framework/test/msr_paraphrase_test_gs.txt', encoding="UTF8").readlines()

In [11]:
'''
    A collection of useful functions for nlp
'''
# Removing the punctuation and lowering the case of a string
def remove_punctuation(line):
    return line.translate(str.maketrans('', '', string.punctuation))

# extract the words from the sentence
def words_from_sent(sent):
    # tokenized
    tokenized = word_tokenize(sent)
    # remove stopwords and return
    return [word for word in tokenized if word not in stopwords.words('english')]

# convert words to tokens
def tokens_from_words(words):
    return pos_tag(words)

# Function to get wordnet pos code
def wordnet_pos_code(tag):
    if tag.startswith('NN'):
        return wordnet.NOUN
    elif tag.startswith('VB'):
        return wordnet.VERB
    elif tag.startswith('JJ'):
        return wordnet.ADJ
    elif tag.startswith('RB'):
        return wordnet.ADV
    else:
        return None
# Tokens to lemmas using wordnet lemmatizer    
def tokens_to_lemmas(tokens):
    return list(map(token_to_lemmas, tokens))

def token_to_lemmas(token):    
    pos = wordnet_pos_code(token[1])
    if pos:
        return WordNetLemmatizer().lemmatize(token[0], pos=pos)
    return token[0]

# Convert tokens to senses where sense is the first synset + POS
def tokens_to_senses(tokens):
    return list(filter(None, map(token_to_sense, tokens)))

def token_to_sense(token):
    lemma = WordNetLemmatizer().lemmatize(token[0])
    pos = wordnet_pos_code(token[1])
    synsets = []
    if pos: 
        return wordnet.synsets(lemma, pos=pos)

# Load the lines of training text as sentences
def text_to_sentences(filename):
    sentence_pair_array = []
    for line in open(filename, encoding="UTF8").readlines():
        sentence_pair_array.append([s.strip() for s in line.split("\t")])
    return sentence_pair_array

def parse_pos(sentence):
    return bllip.parse_one(sentence)

def compare_synsets(synset_a, synset_b):
    lcs = synset_a.lowest_common_hypernyms(synset_b)
    similarity = synset_a.path_similarity(synset_b)
    wup_similarity = synset_a.wup_similarity(synset_b)
    lin_similarity = synset_a.lin_similarity(synset_b, brown_ic)                        
    lch_similarity = synset_a.lch_similarity(synset_b)
    return lcs, similarity, wup_similarity, lin_similarity, lch_similarity

def count(g,s):
    TP = TN = FP = FN = 0
    for i in range(0,len(g)):
        if (g[i]==s[i] and s[i]==1): TP+=1
        if (g[i]==s[i] and s[i]==0): TN+=1
        if (g[i]!=s[i] and s[i]==1): FP+=1
        if (g[i]!=s[i] and s[i]==0): FN+=1
    return [TP,TN,FP,FN]
    
def MSRP_eval(gs, sys):
    [TP,TN,FP,FN] = count(gs,sys)
    acc = (TP+TN)/float(TP+TN+FP+FN) # ACCURACY
    reject = TN/float(TN+FP) # precision on negative SPECIFICITY
    accept = TP/float(TP+FN) # precision on positive SENSITIVITY
    print("acc=",acc," reject=",reject," accept=",accept)

In [12]:
'''
    First Paraphrase detector approach, jaccard distance of lemmas    
'''
def lemma_jaccard(sent_0, sent_1):
    print('.', end='')
    sent_0, sent_1 = remove_punctuation(sent_0.lower()), remove_punctuation(sent_1.lower())
    words_0, words_1 = words_from_sent(sent_0), words_from_sent(sent_1)
    tokens_0, tokens_1 = tokens_from_words(words_0), tokens_from_words(words_1)
    lemmas_0, lemmas_1 = tokens_to_lemmas(tokens_0), tokens_to_lemmas(tokens_1)
    return jaccard_distance(set(lemmas_0), set(lemmas_1))

print('Training')
X_train = [lemma_jaccard(data[0], data[1]) for data in train_input]
y_train = [int(line.strip()) for line in train_classes]
print('Testing')
X_test = [lemma_jaccard(data[0], data[1])for data in test_input]
y_test = [int(line.strip()) for line in test_classes]
print('Results')
regression = LogisticRegression()
regression.fit(np.array(X_train).reshape(-1,1), y_train)
prediction = regression.predict(np.array(X_test).reshape(-1,1))
MSRP_eval(prediction, y_test)

Training
...............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

In [28]:
'''
    Analysis of results
'''
# The specificity (reject) is observed to be lower than the other accuracies.
# The incorrect values are now selected for analysis:
false_negative_idx = np.where((prediction != y_test) & (prediction == 0))[0]
false_negatives = []
remaining_input = []
for idx, el in enumerate(test_input):
    if idx in false_negative_idx:
        false_negatives.append(el)
    else:
        remaining_input.append(el)
        
pd.DataFrame(data = {
    'false_negatives': [1],
    'remaining_input': [1]
}, index=[1,2])

Unnamed: 0,false_negatives,remaining_input
1,1,1
2,1,1


range(0, 10)