# Mapping between PWN 1.6 and 3.0

In [1]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.append("../src")

In [2]:
import mesh
import pandas as pd
from tqdm.autonotebook import tqdm

  from tqdm.autonotebook import tqdm


In [3]:
from nltk.corpus import wordnet as wn

In [4]:
# Show which WordNet release NLTK has loaded; this notebook maps onto 3.0.
wn.get_version()

'3.0'

## Load PWN 1.6 data

In [5]:
import json

In [10]:
import numpy as np

In [13]:
# Load the CWN -> WordNet 1.6 sense map. The identifier columns are read as
# plain Python objects (strings) so zero-padded ids/offsets such as "00001740"
# are not parsed as integers.
sense_dir = mesh.get_data_dir() / "sense_data"
wn16_map = pd.read_csv(
    sense_dir / "wn16_cwn_map.csv",
    encoding="UTF-8",
    index_col=0,
    # `np.object` was deprecated in NumPy 1.20 and removed in 1.24; the
    # builtin `object` is the exact equivalent dtype.
    dtype={"cwn_id": object, "wn16_offset": object},
)

In [15]:
# One row per WordNet 1.6 offset: for every offset keep the first non-null
# value of each column, then turn the offset back into an ordinary column.
offset_groups = wn16_map.groupby("wn16_offset")
wn16_uniq = offset_groups.first().reset_index()

In [16]:
from fuzzywuzzy import fuzz
import re

def map_to_wn30(wn16_entry):
    """Pick the WordNet 3.0 synset whose gloss best matches a WordNet 1.6 entry.

    Candidate synsets are every WN 3.0 synset returned for the entry's first
    lemma; each candidate is scored by `fuzz.ratio` between the WN 1.6 gloss
    and the candidate's definition, and the highest-scoring one wins (the
    earliest candidate is kept on ties).

    Parameters
    ----------
    wn16_entry
        Row-like object exposing `wn16_lemmas` (comma-separated lemma string)
        and `wn16_def` (gloss string) as attributes.

    Returns
    -------
    tuple
        `(synset, score)` for the best-matching WN 3.0 synset.

    Raises
    ------
    ValueError
        If the head lemma has no synset in the loaded WordNet.
    """
    wn16_lemmas = wn16_entry.wn16_lemmas.split(",")
    # Compare on the text before the first ';' only (in the sample data the
    # remainder holds quoted usage examples, which WN 3.0 definitions omit).
    wn16_def = wn16_entry.wn16_def.split(";")[0]
    # Drop single-character parenthesised markers, e.g. "(a)", from the lemma.
    head = re.sub(r"\(\w\)", "", wn16_lemmas[0])
    wn30_syns = wn.synsets(head)
    if not wn30_syns:
        raise ValueError("Cannot find %s in WN30" % (head,))

    scored = [
        (syn_x, fuzz.ratio(wn16_def, syn_x.definition()))
        for syn_x in wn30_syns
    ]
    # max() returns the first maximal element, i.e. the same winner a stable
    # descending sort would put at index 0.
    best_syn, best_score = max(scored, key=lambda pair: pair[1])
    return best_syn, best_score

In [25]:
# Map every unique WN 1.6 entry to its best WN 3.0 synset. Entries that cannot
# be mapped are still recorded, with empty WN 3.0 fields and score -1, so the
# output table keeps one row per input entry.
map_data = []
for _, entry in tqdm(wn16_uniq.iterrows(), total=wn16_uniq.shape[0]):
    # Fields shared by successful and failed mappings. `wn16_pos` is included
    # here so that unmapped rows keep their part of speech too.
    record = dict(
        cwn_id=entry.cwn_id,
        wn16_def=entry.wn16_def,
        wn16_lemmas=entry.wn16_lemmas,
        wn16_offset=entry.wn16_offset,
        wn16_pos=entry.wn16_pos,
    )
    try:
        wn30_syn, score = map_to_wn30(entry)
        record.update(
            wn30_syn_name=wn30_syn.name(),
            wn30_lemmas=",".join([x.name() for x in wn30_syn.lemmas()]),
            wn30_def=wn30_syn.definition(),
            score=score,
        )
    except Exception:
        # Deliberately best-effort: any failure (e.g. the ValueError raised
        # when the head lemma is absent from WN 3.0) marks the row unmapped.
        record.update(
            wn30_syn_name=None,
            wn30_lemmas=None,
            wn30_def=None,
            score=-1,
        )
    map_data.append(record)


HBox(children=(FloatProgress(value=0.0, max=4993.0), HTML(value='')))




In [26]:
# Assemble the per-entry mapping records into a single table.
cwn_wn16_wn30_map = pd.DataFrame.from_records(data=map_data)

In [27]:
# Inspect the mapping table (rich display; pandas truncates long frames).
cwn_wn16_wn30_map

Unnamed: 0,cwn_id,wn16_def,wn16_lemmas,wn16_offset,wn16_pos,wn30_syn_name,wn30_lemmas,wn30_def,score
0,05237701,"draw air into, and expel out of, the lungs; ""I...","breathe,take_a_breath,respire",00001740,v,breathe.v.01,"breathe,take_a_breath,respire,suspire","draw air into, and expel out of, the lungs",100
1,04069402,(usually followed by `to') not having the nece...,"unable,not_able",00002062,a,unable.a.01,unable,(usually followed by `to') not having the nece...,100
2,04008202,heave or utter a sigh; breathe deeply and heav...,sigh,00003011,v,sigh.v.02,sigh,utter with a sigh,63
3,03043301,"expel air; ""Exhale when you lift the weight""","exhale,expire,breathe_out",00003142,v,exhale.v.01,"exhale,expire,breathe_out",expel air,100
4,05229171,"exhale spasmodically, as when an irritant ente...",sneeze,00003595,v,sneeze.v.01,sneeze,"exhale spasmodically, as when an irritant ente...",100
...,...,...,...,...,...,...,...,...,...
4988,07063202,distance travelled per unit time,"speed,velocity",10978183,n,speed.n.01,"speed,velocity",distance travelled per unit time,100
4989,03003505,any distinct time period in a sequence of even...,"phase,stage",10983365,n,phase.n.01,"phase,stage",any distinct time period in a sequence of events,100
4990,03035802,the term during which some position is held,"tenure,term_of_office,incumbency",10984112,n,tenure.n.01,"tenure,term_of_office,incumbency",the term during which some position is held,100
4991,05145601,the time period during which you are at work,"shift,work_shift,duty_period",10984256,n,shift.n.03,"shift,work_shift,duty_period",the time period during which you are at work,100


In [28]:
# Keep only mappings whose gloss-similarity score reaches the cutoff; rows
# recorded with score -1 (no WN 3.0 match) fall below it and are dropped.
MIN_MATCH_SCORE = 62  # cutoff used by this notebook; rationale not recorded here
is_accepted = cwn_wn16_wn30_map.score >= MIN_MATCH_SCORE
cwn_wn30_map_checked = cwn_wn16_wn30_map.loc[is_accepted, :]

In [29]:
# Persist the complete mapping table, unmapped entries included.
full_map_path = sense_dir / "cwn_wn16_wn30_map.csv"
cwn_wn16_wn30_map.to_csv(full_map_path, encoding="UTF-8")

In [30]:
# Persist the score-filtered subset alongside the full table.
checked_map_path = sense_dir / "cwn_wn16_wn30_map.checked.csv"
cwn_wn30_map_checked.to_csv(checked_map_path, encoding="UTF-8")

In [31]:
# Report how many rows survive the score filter: (full shape) (filtered shape).
full_shape = cwn_wn16_wn30_map.shape
checked_shape = cwn_wn30_map_checked.shape
print(full_shape, checked_shape)

(4993, 9) (4145, 9)
