In [1]:
import json
import os

import pandas as pd
from gensim.models import KeyedVectors
from nodevectors import ProNE, Glove, GGVec


In [75]:
# Read the text-format (binary=False) word2vec file into a gensim KeyedVectors
# object; the path is relative to the kernel's working directory.
word_vectors = KeyedVectors.load_word2vec_format("data/colex_embeddings/clics_node2vec_embeddings", binary=False)

In [77]:
# Spot-check: show the vector stored under the string key "0".
word_vectors["0"]

array([-0.02047994,  0.05916424,  0.20519084, -0.3105172 ,  0.16588277,
       -1.0919931 ,  0.2553862 ,  0.666562  , -0.46283624,  0.43598193,
       -0.05534175, -0.56868917, -0.13260768, -0.26992664,  0.05205116,
       -0.3607125 ,  0.13472432, -0.50320816, -0.20322326, -0.419984  ,
        0.14175324,  0.2873209 , -0.03548029, -0.68048465,  0.45879516,
        0.92322433, -0.4175724 , -0.4252096 , -0.671687  ,  0.07446837,
        0.18522488,  0.50527734,  0.25406852, -0.13428748, -0.40946656,
       -0.2687254 , -0.16756059, -0.66990787, -0.67280316, -0.5126048 ,
       -0.46065453, -0.04142022, -0.32780153, -0.12089379,  0.07880938,
       -0.17406897,  0.4083    , -0.7078555 , -0.25423497,  0.5606548 ,
        0.5153306 , -0.42795762, -0.4156445 ,  0.70552844,  0.0909841 ,
       -0.5457586 , -0.18857387, -0.3339893 , -0.33302993,  0.02607498,
        0.6861821 ,  0.37163094, -0.23359749, -0.02251102, -1.121223  ,
        0.30505878,  0.02470319, -0.00456789, -0.46862364,  0.45

In [5]:
def load_model(model_name, dataset, input_folder="data/node_embeddings"):
    """Load a saved node-embedding model for one dataset.

    The file is looked up at ``<input_folder>/<dataset>/<model_name>.<ext>``,
    where ``<ext>`` is ``bin`` for ``"node2vec"`` and ``zip`` for every other
    supported name.

    Args:
        model_name: One of ``"node2vec"``, ``"prone"``, ``"ggvc"`` or ``"glove"``.
        dataset: Name of the sub-folder of ``input_folder`` holding the files.
        input_folder: Root folder of the saved embeddings.

    Returns:
        For ``"node2vec"``, the object returned by ``KeyedVectors.load``.
        For the other names, the ``.model`` attribute of the object loaded by
        the matching nodevectors class (``ProNE``, ``GGVec`` or ``Glove``).

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
            Previously this case fell through every branch and silently
            returned ``None``, which only failed later at first use.
    """
    if model_name == "node2vec":
        filepath = os.path.join(input_folder, dataset, f"{model_name}.bin")
        return KeyedVectors.load(filepath)

    # Pick the loader class first so an unknown name is rejected up front.
    if model_name == "prone":
        loader = ProNE
    elif model_name == "ggvc":
        loader = GGVec
    elif model_name == "glove":
        loader = Glove
    else:
        raise ValueError(
            f"Unknown model_name {model_name!r}; "
            "expected one of 'node2vec', 'prone', 'ggvc', 'glove'."
        )

    filepath = os.path.join(input_folder, dataset, f"{model_name}.zip")
    return loader.load(filepath).model


In [6]:
# Print the kernel's working directory; the relative data/ paths used in this
# notebook resolve against it.
# NOTE(review): the saved output exposes an absolute local path — clear it before sharing.
!pwd

/Users/yiyichen/Documents/experiments/ColexGraph


In [66]:
# Load the saved node2vec model for the "clics" dataset; other cells query it
# through `model.wv`.
model = load_model("node2vec", "clics")

In [67]:
# Read the edge-list CSV; the similarity loop in this notebook uses its
# `target_id` and `source_id` columns.
df = pd.read_csv("data/edgelists/edgelists_clics.csv")

In [None]:
# NOTE(review): `model_ggvc` is not assigned in any cell visible here and this
# cell was never executed (In [None]) — define it or delete the cell.
model_ggvc

In [68]:
# Similarity between the two endpoint embeddings of every edge in `df`.
# Each pair is passed as (target_id, source_id), in the same order as before.
# The loop variables are named after the columns they come from (the earlier
# `src`/`tgt` names were swapped), and the comprehension keeps them out of the
# notebook's global namespace.
node2vec_sim = [
    model.wv.similarity(target_id, source_id)
    for target_id, source_id in zip(df["target_id"], df["source_id"])
]

In [69]:
# Attach the per-edge similarities as a new column; the list was built by
# iterating over `df`'s own columns, so it lines up with `df`'s rows.
df["node2vec_sim"] = node2vec_sim

In [74]:
# Show the edges whose endpoint similarity exceeds 0.9.
# NOTE(review): 0.9 is an ad-hoc cut-off — consider naming it as a constant.
df[df["node2vec_sim"]>0.9]

Unnamed: 0,source,target,weight,target_id,source_id,source_concept,target_concept,node2vec_sim
96,568,705,110,430,327,RISE (MOVE UPWARDS),GO UP (ASCEND),0.98799
135,1574,1439,93,967,1078,FLOAT,SWIM,0.972762
379,1155,1667,46,1150,728,CAVE,HOLE,0.999459
684,2113,2112,27,1395,1396,OLD (USED),OLD (AGED),0.936774
1496,1172,1455,12,983,739,SHOOT,PULL,0.936992
2587,1061,1064,6,663,660,ARMOUR,SHIELD,0.998261
2607,1215,78,6,45,773,THOU,THAT,0.901669
3485,729,679,4,409,452,DAWN,BRIGHT,0.987455
4202,1489,960,3,601,1011,CLOUD,WIND,0.903854


In [31]:
# Look up the value mapped to the concept name "EARTH (SOIL)".
# NOTE(review): `name2id` is not defined in any cell visible here.
name2id["EARTH (SOIL)"]

'1228'

In [53]:
# Similarity of DUST vs EARTH (SOIL): each concept name goes through `name2id`
# and then `node2id` to obtain the key passed to the model.
# NOTE(review): `name2id` and `node2id` are not defined in any cell visible here.
model.wv.similarity(node2id[name2id["DUST"]], node2id[name2id["EARTH (SOIL)"]])

0.25389394

In [55]:
# Similarity of HARD vs BRAVE: each concept name goes through `name2id` and
# then `node2id` to obtain the key passed to the model.
# NOTE(review): `name2id` and `node2id` are not defined in any cell visible here.
model.wv.similarity(node2id[name2id["HARD"]], node2id[name2id["BRAVE"]])

0.42228952

In [58]:
# NOTE(review): 1232 and 565 are bare integer keys; which concepts they stand
# for is not shown anywhere in the visible cells.
model.wv.similarity(1232, 565)

0.36112124

In [38]:
# Look up the value mapped to the concept name "HARD".
# NOTE(review): `name2id` is not defined in any cell visible here.
name2id["HARD"]

'1884'

In [None]:
# NOTE(review): this only references the `distance` attribute without calling
# it, and the cell was never executed (In [None]) — finish it or delete it.
word_vectors.distance

In [59]:
# Read the edge-list CSV. By execution count this cell (In [59]) ran before the
# In [67] read of the same file that appears earlier in the notebook.
df = pd.read_csv("data/edgelists/edgelists_clics.csv")

In [61]:
# Display the whole `id2name` mapping (string id -> concept name, per the output).
# NOTE(review): `id2name` is not defined in any cell visible here, and dumping
# the full dict produces a very long output — consider showing a small sample.
id2name

{'0': nan,
 '1': 'CONTEMPTIBLE',
 '2': 'DUST',
 '3': 'BRAVE',
 '4': 'COURTYARD',
 '5': 'GAZELLE',
 '6': 'EARTHQUAKE',
 '7': 'GATHER',
 '8': 'CURSE',
 '9': 'ANNOUNCE',
 '10': 'FIREWOOD',
 '11': 'DARKNESS',
 '12': 'MIDDAY',
 '13': 'DECEIT',
 '14': 'YOKE',
 '15': 'OTTER',
 '16': 'SLED',
 '17': 'EYELID',
 '18': 'EARLOBE',
 '19': 'FISHING LINE',
 '20': 'SCYTHE',
 '21': 'TASTE (SOMETHING)',
 '22': 'INTEND',
 '23': 'RYE',
 '24': 'PRAY',
 '25': 'DIRT',
 '26': 'UNPLEASANT',
 '27': 'AIR',
 '28': 'MOISTEN',
 '29': 'GELD',
 '30': 'DISPEL',
 '31': 'PAN',
 '32': 'EARWAX',
 '33': 'NETTLE',
 '34': 'STAIRS',
 '35': 'TAMARIND',
 '36': 'PUBIC HAIR',
 '37': 'LIBERATE',
 '38': 'SPREAD OUT',
 '39': 'KEEN',
 '40': 'ENVY',
 '41': 'IGNORE',
 '42': 'STINKING',
 '43': 'LEAD (GUIDE)',
 '44': 'INJURE',
 '45': 'EVIL',
 '46': 'HOLLOW OUT',
 '47': 'EXCRETA',
 '48': 'BE SILENT',
 '49': 'REFUSE',
 '50': 'JACK',
 '51': 'SORE',
 '52': 'SHIVER',
 '53': 'SPIRIT',
 '54': 'SHRINK',
 '55': 'WHISPER',
 '56': 'BAD LUCK',
 '57':

In [63]:
# Add concept-name columns for both edge endpoints. Each id is stringified
# before the lookup because `id2name` is keyed by strings.
df["source_concept"] = df["source"].map(lambda node_id: id2name[str(node_id)])
df["target_concept"] = df["target"].map(lambda node_id: id2name[str(node_id)])

In [65]:
# Save the frame without the index column.
# NOTE(review): this writes to the same path the edge list was read from, so the
# input file is overwritten in place with the added concept columns.
df.to_csv("data/edgelists/edgelists_clics.csv", index=False)