In [1]:
#import stuff
import spacy
import wmd
import nltk
from nltk import pos_tag
from nltk.wsd import lesk
from nltk.stem import WordNetLemmatizer
from nltk.metrics import jaccard_distance
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import pearsonr

#download other stuff

#nltk.download('stopwords')
#python3 -m spacy download en_core_web_md

In [2]:
#init stuff
wnl = WordNetLemmatizer()
stop_words = set(stopwords.words('english')) 
nlp = spacy.load('en_core_web_lg')
nlp.add_pipe(wmd.WMD.SpacySimilarityHook(nlp), last=True)


In [4]:
#general methods
def lemmatize(p):
    if p[1][0] in {'N', 'V'}:
        return wnl.lemmatize(p[0].lower(), pos=p[1][0].lower())
    return p[0]

def penn2morphy(penntag, returnNone=False):
    morphy_tag = {'NN':wn.NOUN, 'JJ':wn.ADJ,
                  'VB':wn.VERB, 'RB':wn.ADV}
    try:
        return morphy_tag[penntag[:2]]
    except:
        return None if returnNone else ''
    
def get_cosine_sim(*strs): 
    vectors = [t for t in get_vectors(*strs)]
    return cosine_similarity(vectors)
    
def get_vectors(*strs):
    text = [t for t in strs]
    vectorizer = CountVectorizer(text)
    vectorizer.fit(text)
    return vectorizer.transform(text).toarray()

In [5]:
def distances(input_file):  
    with open(input_file) as f:
        input_data = f.readlines()
    
    lesks_distances = []
    morphology_distances = []
    cosine_distances = []
    wmd_distances = []
    for i in input_data:
        #sentences = nltk.sent_tokenize(i)
        #gets sentences
        sentences = i.replace('\n','').split('\t')
        
        #tokenize
        words = [word_tokenize(sent) for sent in sentences]
        
        #filtering stop words
        filtered = [[w for w in s if not w in stop_words ]for s in words]
        
        #pos tag
        pairs = [pos_tag(w) for w in filtered]
        
        #lemma words
        l_words = [[lemmatize(p) for p in pair] for pair in pairs]
        
        #synsets of the tokenized
        synsets = [[[lesk(w, p[0], pos=penn2morphy(p[1][0])) for p in pair] for pair in pairs] for w in filtered]

        #lemmatized sentences
        lemmasent = [' '.join(l) for l in l_words]
        
        morphology_distances.append(jaccard_distance(set(l_words[0]),set(l_words[1])))
        lesks_distances.append(jaccard_distance(set(synsets[0][0]), set(synsets[1][1])))
        cosine_distances.append(get_cosine_sim(lemmasent[0],lemmasent[1])[0][1])
        wmd_distances.append(nlp(sentences[0]).similarity(nlp(sentences[1])))
    
    #normalizing
    norm_wmd_distance = [float(i)/max(wmd_distances) for i in wmd_distances]
                             
    #print("Cosine distances: " + str(cosine_distances))
    #print("Lesk distance: " + str(lesks_distances))
    #print("Lemma distances: " + str(morphology_distances))
    #print("WMD distances: " + str(norm_wmd_distance))
    
    gold_file = 'data/train/STS.gs.MSRpar.txt'
    with open(gold_file) as f:
        gold_data = f.readlines()
    gold = [float(g.replace('\n', '')) for g in gold_data]
        
    cos_pearson = pearsonr(cosine_distances, gold)[0]
    mor_pearson = pearsonr(morphology_distances, gold)[0]
    lesk_pearson = pearsonr(lesks_distances, gold)[0]
    wmd_pearson = pearsonr(norm_wmd_distance, gold)[0]

    print('Cosine correlation: ' + str(cos_pearson))
    print('Lemmatized words correlation: ' + str(mor_pearson))
    print('Lesk correlation: ' + str(lesk_pearson))
    print("WMD correlation: " + str(wmd_pearson))

In [50]:
train_1 = 'data/train/STS.input.MSRpar.txt'
distances(train_1)

Cosine correlation: 0.5624454928407743
Lemmatized words correlation: -0.5374512978162805
Lesk correlation: -0.43219148526405204
WMD correlation: -0.5061061956812528


In [43]:
with open(train_1) as f:
    input_data = f.readlines()

i = input_data[247]
#sentences = nltk.sent_tokenize(i)
sentences = i.replace('\n','').split('\t')
words = [nltk.word_tokenize(sent) for sent in sentences]
pairs = [pos_tag(w) for w in words]
l_words = [[lemmatize(p) for p in pair] for pair in pairs]
synsets = [[[lesk(w, p[0], pos=penn2morphy(p[1][0])) for p in pair] for pair in pairs] for w in words]
lemmasent = [' '.join(l) for l in l_words]
get_cosine_sim(sentences[0],sentences[1])[0][1]


gold_file = 'data/train/STS.gs.MSRpar.txt'
with open(gold_file) as f:
    gold_data = f.readlines()
        
gold = [float(g.replace('\n', '')) for g in gold_data]
gold

[4.0,
 3.75,
 2.8,
 3.4,
 2.4,
 1.333,
 4.6,
 3.8,
 4.2,
 2.6,
 4.4,
 4.2,
 5.0,
 5.0,
 5.0,
 3.4,
 5.0,
 4.0,
 3.2,
 5.0,
 4.4,
 3.6,
 3.6,
 0.8,
 1.4,
 3.2,
 3.8,
 4.2,
 3.0,
 3.6,
 3.0,
 3.0,
 3.0,
 4.8,
 2.5,
 2.25,
 3.5,
 3.5,
 3.0,
 3.0,
 3.667,
 1.667,
 3.6,
 3.8,
 3.4,
 3.4,
 3.8,
 2.8,
 3.75,
 4.4,
 3.75,
 2.75,
 3.75,
 2.25,
 3.8,
 3.2,
 3.8,
 4.8,
 4.0,
 1.4,
 4.2,
 1.8,
 4.2,
 2.0,
 2.6,
 3.25,
 5.0,
 3.0,
 3.75,
 4.25,
 4.75,
 3.25,
 3.75,
 3.25,
 3.8,
 5.0,
 3.6,
 3.8,
 2.0,
 3.4,
 4.2,
 3.4,
 3.0,
 3.0,
 3.25,
 2.75,
 2.6,
 3.4,
 3.6,
 3.6,
 4.8,
 4.0,
 3.6,
 3.4,
 2.4,
 4.222,
 3.0,
 3.0,
 2.75,
 4.0,
 1.75,
 1.75,
 4.5,
 3.25,
 5.0,
 3.0,
 2.8,
 2.667,
 4.0,
 4.4,
 4.25,
 4.176,
 3.0,
 2.5,
 3.8,
 4.6,
 1.4,
 2.8,
 2.8,
 4.2,
 1.0,
 4.2,
 2.75,
 2.5,
 3.5,
 4.75,
 3.0,
 3.5,
 4.0,
 2.0,
 3.2,
 3.6,
 3.8,
 5.0,
 4.0,
 3.0,
 2.5,
 3.0,
 4.0,
 2.0,
 3.0,
 2.625,
 3.2,
 4.0,
 3.2,
 2.4,
 4.4,
 3.6,
 2.4,
 3.6,
 4.6,
 4.4,
 4.4,
 4.6,
 4.4,
 2.6,
 3.6,
 4.2,
 3.2,
 2.0,
 2.

In [104]:
input_file = 'data/trial/STS.input.txt'
with open(input_file) as f:
    input_data = f.readlines()

input_data[0]
sentences = nltk.sent_tokenize(input_data[0])
words = [nltk.word_tokenize(sent) for sent in sentences]
pairs = [pos_tag(w) for w in words]
l_words = [[lemmatize(p) for p in pair] for pair in pairs]
synsets = [[[lesk(w, p[0], pos=penn2morphy(p[1][0])) for p in pair] for pair in pairs] for w in words]
lemmasent = [' '.join(l) for l in l_words]
get_cosine_sim(sentences[0],sentences[1])[0][1]

0.4472135954999579

https://medium.com/@adriensieg/text-similarities-da019229c894
https://towardsdatascience.com/overview-of-text-similarity-metrics-3397c4601f50