<a href="https://colab.research.google.com/github/rauan-assabayev/NLP/blob/master/lab7/HypernymTitles.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### Lab 7 
#### Rouge Metrics

In [4]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
import numpy as np
from fuzzywuzzy import fuzz
import ipywidgets as widgets
import pprint
from ipywidgets import interact, interact_manual
import re
__PATH__ = "https://raw.githubusercontent.com/rauan-assabayev/NLP/master/lab7/data.csv"



In [5]:
import nltk
# Fetch the two NLTK corpora the rest of the notebook reads:
# the stop-word lists (used when filtering title tokens) ...
nltk.download('stopwords')
# ... and WordNet (used for the synset and hypernym lookups).
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
# Load the article metadata: fields are separated by ';' and the first
# row of the file supplies the column names.
df = pd.read_csv(
    __PATH__,
    header=0,
    sep=";",
)

# Preview the first five records (head() defaults to five rows).
df.head()

Unnamed: 0,id,updatedDate,publishedDate,title,summary,authors,category,metaData,downloadLink,filePath
0,http://arxiv.org/abs/1407.6950v1,2014-07-24T16:56:39Z,2014-07-24T16:56:39Z,"How,whenAndHowMuchACardDeckIsWellShuffled.pdf",The Thesis Consider The Mixing Of Few 3 4 ...,Benjamin Isac Fargion,cs.DM,"Italian Thesis In Engeenering Computer, 26 Feb...",http://arxiv.org/pdf/1407.6950v1.pdf,"./files/How,whenAndHowMuchACardDeckIsWellShuff..."
1,http://arxiv.org/abs/0907.0618v1,2009-07-03T12:35:10Z,2009-07-03T12:35:10Z,QuantumIsometryGroups.pdf,This Thesis Contains The Formulation And Com...,Jyotishman Bhowmick,math.OA,Thesis,http://arxiv.org/pdf/0907.0618v1.pdf,./files/QuantumIsometryGroups.pdf
2,http://arxiv.org/abs/1806.09601v2,2018-07-14T17:06:27Z,2018-06-25T17:55:59Z,ComputationAndBoundingOfFolkmanNumbers.pdf,Phd Thesis Under The Supervision Of Professo...,Aleksandar Bikov,math.CO,PhD Thesis,http://arxiv.org/pdf/1806.09601v2.pdf,./files/ComputationAndBoundingOfFolkmanNumbers...
3,http://arxiv.org/abs/1905.03014v1,2019-05-08T11:47:34Z,2019-05-08T11:47:34Z,OnChurch'sThesisInCubicalAssemblies.pdf,"We Show That Church's Thesis, The Axiom Stat...","Andrew Swan, Taichi Uemura,",math.LO,0,http://arxiv.org/pdf/1905.03014v1.pdf,./files/OnChurch'sThesisInCubicalAssemblies.pdf
4,http://arxiv.org/abs/1901.04911v1,2019-01-15T16:24:07Z,2019-01-15T16:24:07Z,UnconstrainedChurchTuringThesisCannotPossiblyB...,The Church Turing Thesis Asserts That If A P...,Yuri Gurevich,cs.LO,0,http://arxiv.org/pdf/1901.04911v1.pdf,./files/UnconstrainedChurchTuringThesisCannotP...


**Preprocessing each title into a tuple of lowercase tokens (stop words removed)**

In [0]:
# Build the stop-word lookup once. The previous version called
# stopwords.words('english') for every single token and then scanned the
# returned list linearly; a frozenset gives the same membership answers
# without repeating that work.
english_stopwords = frozenset(stopwords.words('english'))

# Each title is a CamelCase file name such as "QuantumIsometryGroups.pdf".
# Strip the ".pdf" marker, pull out every capitalised word (one upper-case
# letter followed by one or more lower-case letters), lower-case it, and
# drop English stop words. Every title becomes a tuple so it can be used
# as a dictionary key later on.
titles = [
    tuple(
        token
        for token in (
            word.lower()
            for word in re.findall('([A-Z]{1}[a-z]+)', t.replace('.pdf', ''))
        )
        if token not in english_stopwords
    )
    for t in df['title']
]

In [0]:
# For every tokenised title, record the WordNet synsets of each of its
# words: {title_tuple: {word: [synset, ...]}}.
res = {
    title_tokens: {
        token: list(wn.synsets(token))
        for token in title_tokens
    }
    for title_tokens in titles
}

In [0]:
def get_hyper1(data):
    """Collect the direct (first-level) hypernyms of a word.

    Args:
        data: The word to look up with ``wn.synsets``.

    Returns:
        set: The union of ``synset.hypernyms()`` over every synset returned
        for ``data``. Empty when the word has no synsets or none of them
        has a hypernym.
    """
    # Query WordNet once and walk the result directly; the previous version
    # repeated the wn.synsets(data) lookup on every loop iteration.
    return {
        hypernym
        for synset in wn.synsets(data)
        for hypernym in synset.hypernyms()
    }

In [0]:
def get_hyper2(data):
    """Collect the second-level hypernyms of a word.

    Args:
        data: The word whose first-level hypernyms are obtained through
            ``get_hyper1``.

    Returns:
        set: The union of ``hypernyms()`` over every first-level hypernym
        of ``data``.
    """
    second_level = set()
    for parent in get_hyper1(data):
        # Step one level further up from each first-level hypernym.
        second_level.update(parent.hypernyms())
    return second_level

In [0]:
def f_score(y_true, y_pred):
    """Return the overlap of two sets together with their F1 score.

    Args:
        y_true: Set of reference items.
        y_pred: Set of candidate items.

    Returns:
        tuple: ``(shared, score)`` where ``shared`` is the intersection of
        the two sets and ``score`` is ``2 * p * r / (p + r)`` with
        ``p = |shared| / |y_pred|`` and ``r = |shared| / |y_true|``.
        When nothing is shared the score is the integer ``0``.
    """
    shared = y_true & y_pred
    n_shared = len(shared)

    # No overlap: precision and recall are both zero, so skip the division.
    if not n_shared:
        return shared, 0

    precision = float(n_shared) / len(y_pred)
    recall = float(n_shared) / len(y_true)
    return shared, 2 * precision * recall / (precision + recall)

In [0]:
def distance(a,b):
    """Hypernym-aware distance between two collections of words.

    The score blends the plain F1 overlap of the two word sets with a
    "penalty" F-score that rewards pairs of non-shared words whose WordNet
    hypernyms overlap.

    Args:
        a: Iterable of words (converted to a set).
        b: Iterable of words (converted to a set).

    Returns:
        A number: ``1 - f1`` when, after removing the shared words, either
        side has no words left; otherwise
        ``1 - (2 * f1 + fscore_penalty) / 3``.
    """
    # Credit handed out per word pair, depending on how close the shared
    # hypernym is.
    first_layer_credit = 0.6
    second_layer_credit = 0.4

    a = set(a)
    b = set(b)
    inter, f1_score = f_score(a,b)

    # Words that literally match are already rewarded by f1_score; only the
    # remaining words are compared through their hypernyms.
    a = a - inter
    b = b - inter

    if len(a) == 0 or len(b) == 0:
        return 1 - f1_score

    # Look every word up once instead of once per (word_a, word_b) pair.
    first_layer = {word: get_hyper1(word) for word in a | b}
    second_layer = {}

    def second_layer_of(word):
        # Second-level hypernyms are only needed when the first level does
        # not match, so compute them lazily and remember the result.
        if word not in second_layer:
            second_layer[word] = get_hyper2(word)
        return second_layer[word]

    penalty = 0
    for word_a in a:
        hyp1_a = first_layer[word_a]
        for word_b in b:
            hyp1_b = first_layer[word_b]

            # First layer: the two words share a direct hypernym.
            if hyp1_a & hyp1_b:
                penalty = penalty + first_layer_credit
                continue

            # Otherwise look one level further up, including the two
            # cross-layer combinations. The original code wrote the sum
            # across three lines without parentheses, so the second and
            # third terms were standalone statements and never counted.
            hyp2_a = second_layer_of(word_a)
            hyp2_b = second_layer_of(word_b)
            num_hyper2 = (
                len(hyp2_a.intersection(hyp2_b))
                + len(hyp2_a.intersection(hyp1_b))
                + len(hyp1_a.intersection(hyp2_b))
            )
            if num_hyper2 > 0:
                penalty = penalty + second_layer_credit

    if penalty > 0:
        # NOTE(review): penalty is summed over all |a| x |b| pairs, so these
        # two ratios can exceed 1 and the final result can fall below 0.
        prec_penalty = penalty/len(a)
        recall_penalty = penalty/len(b)
        fscore_penalty = 2 * (prec_penalty * recall_penalty) / (prec_penalty + recall_penalty)
    else:
        fscore_penalty = 0

    # Exact-match F1 carries twice the weight of the hypernym-based score.
    return 1 - (2 * f1_score + fscore_penalty)/3

In [0]:
# Freeze the (title, synsets) pairs in a list so that positions in `buff`
# line up with the rows and columns of the distance matrix.
buff = list(res.items())
n_titles = len(buff)

# dist[i, j] holds distance(title_i, title_j) for every ordered pair.
dist = np.zeros((n_titles, n_titles))
for row in range(n_titles):
    left_title = buff[row][0]
    for col in range(n_titles):
        dist[row, col] = distance(left_title, buff[col][0])


**Top ten closest articles by hypernym-aware title distance**

In [14]:
@interact(ind=(0,len(buff)-1,1))
def h(ind=0):
    """Print the title at position `ind` and its ten nearest titles.

    Args:
        ind: Index into `buff` / row of `dist`, chosen with the slider.
    """
    pp = pprint.PrettyPrinter(indent=4)
    print(' '.join(buff[ind][0]))
    # Exclude the queried title by index rather than assuming it sorts
    # first: when another title has the same distance to `ind` as `ind` has
    # to itself, argsort may not place `ind` at position 0, and slicing
    # [1:11] would then list the title itself and drop a real neighbour.
    nearest = [i for i in dist[ind].argsort() if i != ind][:10]
    pp.pprint([buff[i][0] for i in nearest])

interactive(children=(IntSlider(value=0, description='ind', max=995), Output()), _dom_classes=('widget-interac…

In [15]:
@interact(ind=(0,len(buff)-1,1))
def hypernyms(ind=0):
    """Print the title at position `ind` and the synsets stored for its words.

    Args:
        ind: Index into `buff`, chosen with the slider.
    """
    title_tokens, word_synsets = buff[ind]
    print(' '.join(title_tokens))
    # Same formatting as a PrettyPrinter configured with indent=4.
    pprint.pprint(word_synsets, indent=4)

interactive(children=(IntSlider(value=0, description='ind', max=995), Output()), _dom_classes=('widget-interac…