# OntoAlignment between PWN and CWN

* Which data structure is needed to ontologically align PWN (Princeton WordNet) and CWN (Chinese WordNet)?

In [1]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.append("../src")

In [2]:
import mesh
import pandas as pd
from tqdm.autonotebook import tqdm

  from tqdm.autonotebook import tqdm


In [3]:
# Locate (and create if needed) the sense-data directory, then read the sense map
# with its first CSV column as the index.
sense_dir = mesh.get_data_dir() / "sense_data"
mesh.ensure_dir(sense_dir)
sense_map_path = sense_dir / "sense_map_bn_pwn.csv"
sense_map_frame = pd.read_csv(sense_map_path, index_col=0)

In [4]:
entry = sense_map_frame.iloc[8, :].to_dict()

In [5]:
entry

{'word': '霧',
 'pwn_synset': '11458314n',
 'en_word': 'fog',
 'pwn_sense_offset': 'fog%1:19:00::',
 'pos': 'NOUN',
 'bn_offset': 'bn:00035570n',
 'bn_key': True}

In [6]:
from CwnGraph import CwnBase
cwn = CwnBase()

In [7]:
from nltk.corpus import wordnet as wn
def get_synset(syn_id):
    """Look up the WordNet synset for an id such as ``'11458314n'``.

    The last character of ``syn_id`` is used as the part-of-speech tag and
    everything before it as the integer offset; both are handed to
    ``wn.synset_from_pos_and_offset`` and its result is returned.
    """
    offset_digits, pos_tag = syn_id[:-1], syn_id[-1]
    return wn.synset_from_pos_and_offset(pos_tag, int(offset_digits))

In [8]:
cwn_senses = cwn.find_all_senses(entry["word"])

In [9]:
syn = get_synset(entry["pwn_synset"])

In [10]:
mesh.get_pwn_relations(entry["pwn_synset"], depth=2)

[(Synset('aerosol.n.01'), 'hypernyms', 0),
 (Synset('cloud.n.01'), 'hypernyms', 1),
 (Synset('fogbank.n.01'), 'hyponyms', 0),
 (Synset('ice_fog.n.01'), 'hyponyms', 0),
 (Synset('mist.n.01'), 'hyponyms', 0),
 (Synset('pea_soup.n.02'), 'hyponyms', 0)]

In [22]:
wn.synsets("cat")[0].name()

'cat.n.01'

In [11]:
# Hypernym relations of the first "cat" synset, requested with depth=2.
# (The former `dir(...)` call was leftover debugging: its result was never displayed or used.)
mesh.get_pwn_relations(wn.synsets("cat")[0], ["hypernyms"], depth=2)

[(Synset('feline.n.01'), 'hypernyms', 0),
 (Synset('carnivore.n.01'), 'hypernyms', 1)]

In [12]:
from mesh import cwn_onto_align as onto_align

In [13]:
adata = onto_align.get_alignment_structure(entry["word"], entry["pwn_synset"], cwn)

In [27]:
# Build one alignment structure per row of the sense map, in row order.
align_data = []
n_rows = len(sense_map_frame)
for _, sense_row in tqdm(sense_map_frame.iterrows(), total=n_rows):
    align_data.append(
        onto_align.get_alignment_structure(sense_row["word"], sense_row["pwn_synset"], cwn)
    )

HBox(children=(FloatProgress(value=0.0, max=17400.0), HTML(value='')))




## Generate mapping JSON

In [31]:
import json

# Serialise every alignment structure through its to_dict() and write the list
# as indented, non-ASCII-escaped JSON.
alignment_json_path = mesh.get_data_dir() / "sense_data/bn_alignment_data.json"
with open(alignment_json_path, "w", encoding="UTF-8") as fout:
    alignment_records = [item.to_dict() for item in align_data]
    json.dump(alignment_records, fout, indent=2, ensure_ascii=False)

In [34]:
import pickle
# Pickle the raw alignment objects next to the JSON export.
# NOTE(review): the last recorded run of this cell failed with
# "PicklingError: args[0] from __newobj__ args has the wrong class" (see the
# recorded output), so bn_alignment_data.pkl may be missing or incomplete.
# Presumably %autoreload re-imported the class of the align_data items after
# they were created — TODO confirm by re-running on a fresh kernel.
with open(mesh.get_data_dir()/"sense_data/bn_alignment_data.pkl", "wb") as fout:
    pickle.dump(align_data, fout)

PicklingError: args[0] from __newobj__ args has the wrong class

In [33]:
import nltk
# One-off setup: download the 'omw' corpus package into the local NLTK data directory.
nltk.download('omw')

[nltk_data] Downloading package omw to E:\nltk_data...
[nltk_data]   Unzipping corpora\omw.zip.


True

## Existing PWN alignment in CWN

In [52]:
# Lazily select the (node_id, attributes) pairs of cwn.V whose node_type is "pwn_synset".
niter = (node for node in cwn.V.items() if node[1]['node_type'] == "pwn_synset")

In [53]:
synsets=list(niter)

In [54]:
synsets[0]

('pwn_09553033N',
 {'node_type': 'pwn_synset', 'synset_sno': '01', 'synset_word1': 'expense'})

In [59]:
cwn.find_edges(synsets[1][0], is_directed=False)

[<CwnRelation> generic(rev): pwn_04355268N <- 07057003,
 <CwnRelation> hyponym(rev): pwn_04355268N <- 05172801,
 <CwnRelation> generic(rev): pwn_04355268N <- 0517280101,
 <CwnRelation> hyponym(rev): pwn_04355268N <- 07077208,
 <CwnRelation> generic(rev): pwn_04355268N <- 0707720801]

In [61]:
# NOTE(review): the recorded run of this call raised AssertionError. Presumably the
# offsets stored on CWN's pwn_synset nodes belong to a different WordNet release
# than the one mesh.get_synset looks up — TODO confirm.
mesh.get_synset("04355268n")

AssertionError: 

In [69]:
wn.synsets("endowment")[0].lemmas()

[Lemma('endowment.n.01.endowment'),
 Lemma('endowment.n.01.gift'),
 Lemma('endowment.n.01.talent'),
 Lemma('endowment.n.01.natural_endowment')]

In [71]:
from CwnGraph import CwnSense
CwnSense("07057003", cwn)

<CwnSense[07057003](資): 天賦的能力。>

## Read PWN-1.6

In [131]:
# Paths of the four per-POS WordNet 1.6 data files under <data_dir>/wordnet-1.6/dict.
wn16_dir = mesh.get_data_dir() / "wordnet-1.6"
wn16_dict_dir = wn16_dir / "dict"
noun_path = wn16_dict_dir / "data.noun"
verb_path = wn16_dict_dir / "data.verb"
adj_path = wn16_dict_dir / "data.adj"
adv_path = wn16_dict_dir / "data.adv"

In [129]:
def parse_synset(ln_txt):
    """Parse one synset line of a WordNet ``data.<pos>`` file.

    The text before the first ``|`` is split on whitespace: field 0 is the
    offset, field 2 the part-of-speech tag and field 3 the lemma count written
    in hexadecimal. The lemmas are every second field starting at field 4.
    Everything after the first ``|`` is the gloss.

    Args:
        ln_txt: a single line of the data file (a trailing newline is fine).

    Returns:
        A tuple ``(offset, pos, lemmas, gloss)`` where ``lemmas`` is a list of
        strings and ``gloss`` is stripped of surrounding whitespace. The gloss
        is ``""`` when the line contains no ``|``.

    Raises:
        IndexError: if the line has fewer than four whitespace-separated fields.
        ValueError: if the lemma-count field is not a hexadecimal number.
    """
    # Split on the first bar only, so a gloss that itself contains "|" stays intact
    # and a line without any gloss does not fail on tuple unpacking.
    syn_data, _, gloss = ln_txt.partition("|")
    toks = syn_data.split()
    offset = toks[0]
    pos = toks[2]
    n_lemma = int(toks[3], 16)
    # Each lemma is followed by one extra field, hence the stride of 2.
    lemmas = toks[4:4 + 2 * n_lemma:2]
    return (offset, pos, lemmas, gloss.strip())

In [123]:
# NOTE(review): `noun_data` is not assigned in any visible cell, so this fails on a
# fresh kernel. Presumably it held the lines of data.noun read in a since-deleted
# cell — TODO restore that cell (or drop this probe) before re-running.
parse_synset(noun_data[15226])

('02536246',
 'n',
 ['curtain', 'drape', 'drapery', 'mantle', 'pall'],
 'hanging cloth used as a blind')

In [124]:
toks = "00001740 03 n 02 entity 0 something 0 014".split()

In [125]:
def load_wn16_data(data_path, map_data):
    """Parse every synset line of one WordNet data file into ``map_data``.

    Args:
        data_path: path object of the data file; it must provide ``open()``
            and ``name`` (the name is used as the progress-bar label).
        map_data: dict updated in place. Each parsed line is stored under the
            key ``offset + pos`` with the tuple returned by ``parse_synset``
            as value.

    Returns:
        None; the result is accumulated in ``map_data``.
    """
    # Explicit encoding keeps decoding independent of the platform locale,
    # matching the UTF-8 used for every other file in this notebook.
    with data_path.open(encoding="UTF-8") as fin:
        # Lines beginning with a space are skipped (presumably the file's header
        # block); blank lines are skipped too because they carry no synset.
        data = [ln for ln in fin if ln.strip() and not ln.startswith(" ")]
    for ln in tqdm(data, desc=data_path.name):
        syn_data = parse_synset(ln)
        map_data[syn_data[0] + syn_data[1]] = syn_data

In [132]:
# Accumulate all four part-of-speech files into one offset+pos -> record mapping.
map_data = {}
for wn16_path in (noun_path, verb_path, adj_path, adv_path):
    load_wn16_data(wn16_path, map_data)

HBox(children=(FloatProgress(value=0.0, description='data.noun', max=66025.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='data.verb', max=12127.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='data.adj', max=17915.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='data.adv', max=3575.0, style=ProgressStyle(description_wi…




In [133]:
len(map_data)

99642

In [136]:
# Persist the parsed WordNet 1.6 mapping as indented, non-ASCII-escaped JSON.
sense_dir = mesh.get_data_dir() / "sense_data"
wn16_synpath = sense_dir / "wn16_synsets.json"
with open(wn16_synpath, "w", encoding="UTF-8") as fout:
    json.dump(map_data, fout, indent=2, ensure_ascii=False)

## Load PWN synset data

In [3]:
from CwnGraph import PwnSynset, CwnBase
import json

In [4]:
cwn = CwnBase()

In [5]:
# Lazy stream of (node_id, attributes) pairs from cwn.V restricted to node_type "pwn_synset".
niter = (pair for pair in cwn.V.items() if pair[1]['node_type'] == "pwn_synset")

In [6]:
pwn_synsets=list(niter)

In [7]:
len(pwn_synsets)

5012

In [8]:
cwn.find_edges(pwn_synsets[0][0], False)

[<CwnRelation> generic(rev): pwn_09553033N <- 07057002,
 <CwnRelation> generic(rev): pwn_09553033N <- 03003302,
 <CwnRelation> generic(rev): pwn_09553033N <- 06526301]

In [9]:
syn=PwnSynset("pwn_09553033N", cwn)

In [10]:
syn.relations

[('generic', <CwnSense[07057002](資): 用於換取等值特定對象的金錢。>, 'reversed'),
 ('generic', <CwnSense[03003302](撥款): 支付或調配的金錢。>, 'reversed'),
 ('generic', <CwnSense[06526301](費用): 用於換取等值特定對象的金錢。>, 'reversed')]

### Load WN16 data

In [11]:
# Read back the WordNet 1.6 synset mapping written to wn16_synsets.json.
sense_dir = mesh.get_data_dir() / "sense_data"
wn16_synpath = sense_dir / "wn16_synsets.json"
wn16_data = json.loads(wn16_synpath.read_text(encoding="UTF-8"))

In [12]:
wn16_data["09553033n"]

['09553033',
 'n',
 ['expense', 'disbursal', 'disbursement'],
 'amounts paid for goods and services that may be currently tax deductible (as opposed to capital expenditures)']

In [24]:
# Join every pwn_synset node of CWN with its WordNet 1.6 record and emit one
# row per relation attached to that node.
entries = []
for node_id, _node_attrs in tqdm(pwn_synsets):
    # Node ids look like "pwn_09553033N"; wn16_data keys look like "09553033n".
    syn_offset = node_id.split("_")[1].lower()
    # Variant key with every "a" replaced by "s" (presumably to reach adjective
    # records stored under the "s" tag — verify against the data files).
    alt_offset = syn_offset.replace("a", "s")

    # The variant key takes precedence when present; otherwise use the exact key.
    wn16_syn = wn16_data.get(alt_offset, wn16_data.get(syn_offset))
    if not wn16_syn:
        print("Cannot find " + syn_offset)
        continue

    for rel_type, cwn_node, _ in PwnSynset(node_id, cwn).relations:
        try:
            record = {
                "cwn_id": cwn_node.id,
                "cwn_pos": cwn_node.pos,
                "lemmas": cwn_node.lemmas[0].lemma,
                "cwn_def": cwn_node.definition,
                "rel_type": rel_type,
                "wn16_offset": wn16_syn[0],
                "wn16_pos": wn16_syn[1],
                "wn16_lemmas": ','.join(wn16_syn[2]),
                "wn16_def": wn16_syn[3],
            }
        except Exception as ex:
            # Best effort: report the offending node and carry on with the next relation.
            print(node_id, ex)
        else:
            entries.append(record)

HBox(children=(FloatProgress(value=0.0, max=5012.0), HTML(value='')))

Cannot find 04360492n
Cannot find ------
pwn_07579443N list index out of range
pwn_00817247V list index out of range
pwn_00427308V list index out of range
Cannot find 
Cannot find 001364494v
Cannot find 00871598a
Cannot find 01237414a
Cannot find 06211120n
Cannot find 0082064v
pwn_00454769V list index out of range
Cannot find 01914685v
Cannot find 05113619n
pwn_00547039V list index out of range
Cannot find 017393308v
Cannot find 01211326v
Cannot find 00136112v
Cannot find 00847824v
Cannot find 02301996v
Cannot find 02385846v
Cannot find 01565884n
Cannot find 01412424



In [25]:
import pandas as pd

In [26]:
wn16_map = pd.DataFrame.from_records(entries)

In [27]:
wn16_map.to_csv(sense_dir / "wn16_cwn_map.csv", encoding="UTF-8")