In [1]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.append("../src")

In [2]:
import mesh
from tqdm.autonotebook import tqdm
from CwnGraph import CwnBase
import json, pickle

  from tqdm.autonotebook import tqdm


In [3]:
import requests

In [4]:
with (mesh.get_data_dir()/"babelnet_key.txt").open("r") as fin:
    bn_key = fin.read().strip()
bn_url = "https://babelnet.io/v5/"

In [5]:
bn_key

'baba90e0-0321-47de-a4e3-b1ae08aea79a'

In [6]:
bnapi = mesh.BabelNetAPI()

In [7]:
bnapi.get_version()

'V4_0'

In [25]:
cwn = CwnBase()

In [32]:
lemmas = []
for nid, ndata in cwn.V.items():
    if ndata["node_type"] == "lemma":        
        lemmas.append(ndata["lemma"])

In [42]:
from itertools import chain
def find_cwn_senses(lemma):
    try:
        sense_iter = (x.senses for x in cwn.find_lemma(f"^{lemma}$"))
        sense_iter = chain.from_iterable(sense_iter)
        return list(sense_iter)
    except Exception as ex:
        print(lemma)
        print(ex)
        return []

In [58]:
target_lemmas = []
for lemma_x in tqdm(set(lemmas)):
    if not lemma_x:
        continue    
    try:        
        sense_list = find_cwn_senses(lemma_x)        
        if len(sense_list) > 2:
            target_lemmas.append(lemma_x)
    except Exception as ex:
        print(ex)        

HBox(children=(FloatProgress(value=0.0, max=26550.0), HTML(value='')))




In [63]:
target_lemma_path = mesh.get_data_dir()/"bn_target_lemmas.txt"
with target_lemma_path.open("w", encoding="UTF-8") as fout:
    for lemma in target_lemmas:
        fout.write(lemma + "\n")

In [None]:
target_lemma_path = mesh.get_data_dir()/"bn_target_lemmas.txt"
with target_lemma_path.open("r", encoding="UTF-8") as fin:
    taget_lemmas = fin.readlines()

## Continue to retrieve data from BabelNet

In [8]:
target_lemma_path = mesh.get_data_dir()/"bn_target_lemmas.txt"
with target_lemma_path.open("r", encoding="UTF-8") as fin:
    target_lemmas = [x.strip() for x in fin.readlines()]
    
bn_sense_path = mesh.get_data_dir()/"bn_sense_data.pkl"
if bn_sense_path.exists():
    with bn_sense_path.open("rb") as fin:
        sense_data = pickle.load(fin)
else:
    sense_data = {}

In [12]:
len(sense_data)

1948

In [19]:
target_lemmas[0]

'先發'

In [13]:
quota = 900
for lemma_x in tqdm(target_lemmas):
    if quota == 0:
        break
    if lemma_x in sense_data:
        continue
    data = bnapi.get_senses(lemma_x)
    if 'message' in data:        
        print(data["message"])
        break
    quota -= 1
    sense_data[lemma_x] = data    

HBox(children=(FloatProgress(value=0.0, max=3396.0), HTML(value='')))




In [14]:
len(sense_data)

2848

In [15]:
with bn_sense_path.open("wb") as fout:
    pickle.dump(sense_data, fout)

In [16]:
s1 = list(sense_data.items())[1]

In [17]:
to_deleted = []
for lemma, senses in sense_data.items():
    if 'message' in senses:
        to_deleted.append(lemma)

In [18]:
for lemma in to_deleted:
    sense_data.pop(lemma)

In [19]:
len(sense_data)

2848

In [20]:
wn_map = {}
for lemma, senses in sense_data.items():    
    if not senses:
        continue
    
    wn_senses = filter(lambda x: x["type"] == "WordNetSense", senses)    
    slist = []
    for s in wn_senses:
        
        p = s["properties"]
        slist.append((p.get("wordNetOffset"), 
                     p.get("fullLemma"), 
                     p.get("senseKey"), p.get("pos"), p.get("synsetID").get("id"), p.get("bKeySense")))
    wn_map[lemma] = slist

In [21]:
print(f"target_lemmas (>3 in CWN): {len(target_lemmas)}\n"
      f"sense_data (requested from BN so far): {len(sense_data)}\n"
      f"with WN senses: {len(wn_map)}")

target_lemmas (>3 in CWN): 3396
sense_data (requested from BN so far): 2848
with WN senses: 2149


## A word sense sample

In [22]:
from nltk.corpus import wordnet as wn

In [23]:
wn_map["犯"]

[('02582615v',
  'perpetrate',
  'perpetrate%2:41:00::',
  'VERB',
  'bn:00085426v',
  False),
 ('02582615v', 'commit', 'commit%2:41:00::', 'VERB', 'bn:00085426v', False),
 ('02582615v', 'pull', 'pull%2:41:00::', 'VERB', 'bn:00085426v', False),
 ('02582921v', 'make', 'make%2:41:13::', 'VERB', 'bn:00090561v', False)]

In [51]:
wn.synset_from_pos_and_offset("v", 2582615).definition()

'perform an act, usually with a negative connotation'

In [106]:
wn.synset_from_pos_and_offset("v", 2582921).definition()

'carry out or commit'

In [107]:
find_cwn_senses("犯")

[<CwnSense[06700001](犯): 越過本身應有的範圍，而侵占到後述對象的權益。>,
 <CwnSense[06700002](犯): 不遵守約定或上級規定的規則。>,
 <CwnSense[06700003](犯): 違反競賽的規則，通常由裁判所判定，並帶有一定的罰則。>,
 <CwnSense[06700004](犯): 做出不應該或不正確的事情。>,
 <CwnSense[06700005](犯): 做出違反法律、應受處罰的事情。>,
 <CwnSense[06700006](犯): 做出違反法律的事情，而應受處罰的人。>,
 <CwnSense[06700007](犯): 做出特定被禁止的行為的事情。>,
 <CwnSense[06700008](犯): 疾病病徵再次發生。>,
 <CwnSense[06700009](犯): 侵犯到後述對象的權益而遭到報復。>]

## Make a dataframe

In [33]:
import pandas as pd

In [29]:
data = [(lemma, *map_entry) for lemma, sense_map in wn_map.items() for map_entry in sense_map]
    

In [35]:
sense_map_frame = pd.DataFrame.from_records(data)

In [36]:
sense_map_frame

Unnamed: 0,0,1,2,3,4,5,6
0,犯,02582615v,perpetrate,perpetrate%2:41:00::,VERB,bn:00085426v,False
1,犯,02582615v,commit,commit%2:41:00::,VERB,bn:00085426v,False
2,犯,02582615v,pull,pull%2:41:00::,VERB,bn:00085426v,False
3,犯,02582921v,make,make%2:41:13::,VERB,bn:00090561v,False
4,淺,09433134n,shoal,shoal%1:17:01::,NOUN,bn:00070877n,False
...,...,...,...,...,...,...,...
14497,值,05856388n,value,value%1:09:00::,NOUN,bn:00079508n,False
14498,值,05954894n,values,values%1:09:00::,NOUN,bn:00079516n,False
14499,值,04979425n,value,value%1:07:01::,NOUN,bn:00079510n,False
14500,值,02702508v,cost,cost%2:42:00::,VERB,bn:00083193v,False


In [41]:
sense_dir = mesh.get_data_dir()/"sense_data"
mesh.ensure_dir(sense_dir)
sense_map_frame.to_csv(sense_dir/"sense_map_bn_pwn.csv")