In [2]:
import pathlib

import numpy as np
import pandas as pd

In [3]:
from conceptnet_text_to_uri import standardized_uri
from sklearn.preprocessing import normalize

In [4]:
# Show which files are present in the local ./conceptnet directory.
!ls ./conceptnet

mini.h5


In [5]:
# Load the embedding table from the local HDF5 file; rows are labelled by
# ConceptNet-style URIs such as '/c/en/<term>'.
conceptnet = pd.read_hdf("./conceptnet/mini.h5")

In [6]:
def normalize_vec(vec):
    """
    L2-normalize a single vector, given as a 1-D ndarray or a Series.

    Parameters
    ----------
    vec : pandas.Series or numpy.ndarray
        The vector to scale. Missing entries of a Series are replaced
        with 0 before normalizing; an ndarray is passed through unchanged.

    Returns
    -------
    numpy.ndarray
        The first (and only) row of the result of ``normalize``.

    Raises
    ------
    TypeError
        If ``vec`` is neither a Series nor an ndarray.
    """
    if isinstance(vec, pd.Series):
        values = vec.fillna(0).values
    elif isinstance(vec, np.ndarray):
        values = vec
    else:
        raise TypeError(
            f"expected a pandas Series or numpy ndarray, got {type(vec).__name__}"
        )
    # normalize() works on 2-D input, so present the vector as a single row
    # and unwrap that row again afterwards.
    return normalize(values.reshape(1, -1))[0]

In [7]:
# Scale every row of the embedding table to unit L2 length (the defaults of
# normalize, spelled out explicitly).
normalized_values = normalize(conceptnet.to_numpy(), norm="l2", axis=1)

In [9]:
# Dimensions of the normalized matrix as (rows, columns).
normalized_values.shape

(1635499, 300)

In [12]:
# Row labels of the embedding table.
conceptnet.index

Index(['/c/de/####er', '/c/de/###er', '/c/de/##cm', '/c/de/##er',
       '/c/de/##jahre', '/c/de/##k', '/c/de/##m', '/c/de/##rd', '/c/de/##t',
       '/c/de/##tel',
       ...
       '/c/zh/龟毛', '/c/zh/龟甲', '/c/zh/龟缩', '/c/zh/龟背竹', '/c/zh/龟船', '/c/zh/龟裂',
       '/c/zh/龠', '/c/zh/龢', '/c/zh/𫓧', '/c/zh/𫚉鱼'],
      dtype='object', length=1635499)

In [17]:
# Read the word-list file (one entry per line) into a set so that
# is_valid_prefix can do fast membership tests.
wordlist = set(pathlib.Path("codenames_ai/wordlist-eng.txt").read_text().splitlines())

In [40]:
def prefix_to_word_stub(prefix: str):
    """
    Turn an English URI such as '/c/en/ice_cream' into 'ICE CREAM'.

    Everything after the first '/c/en/' is upper-cased and each underscore
    becomes a single space. Raises IndexError when '/c/en/' does not occur
    in ``prefix``.
    """
    term = prefix.split('/c/en/', 1)[1]
    return term.upper().replace("_", " ")

In [41]:
def is_valid_prefix(prefix: str) -> bool:
    """
    Tell whether ``prefix`` is an English URI whose word is in ``wordlist``.

    URIs outside '/c/en/' and URIs whose term starts with '#' are rejected
    outright; otherwise the URI is converted with prefix_to_word_stub and
    looked up in the module-level ``wordlist``.
    """
    is_english = prefix.startswith('/c/en/')
    starts_with_hash = prefix.startswith('/c/en/#')
    if not is_english or starts_with_hash:
        return False
    return prefix_to_word_stub(prefix) in wordlist

In [26]:
# Scratch check: the text after '/c/en/' contains an underscore, so
# str.isalpha() reports False for it.
'/c/en/sd_fsdf'.split('/c/en/', 1)[1].isalpha()

False

In [42]:
# Keep only the row labels accepted by is_valid_prefix, in index order.
valid_prefixes = list(filter(is_valid_prefix, conceptnet.index))

In [45]:
# valid_prefixes was already filtered with is_valid_prefix, so every entry is
# converted here; this keeps the result aligned one-to-one with valid_prefixes.
normalized_prefixes = [prefix_to_word_stub(p) for p in valid_prefixes]

In [43]:
# Display the URIs that passed is_valid_prefix.
valid_prefixes

['/c/en/africa',
 '/c/en/agent',
 '/c/en/air',
 '/c/en/alien',
 '/c/en/alps',
 '/c/en/amazon',
 '/c/en/ambulance',
 '/c/en/america',
 '/c/en/angel',
 '/c/en/antarctica',
 '/c/en/apple',
 '/c/en/arm',
 '/c/en/atlantis',
 '/c/en/australia',
 '/c/en/aztec',
 '/c/en/back',
 '/c/en/ball',
 '/c/en/band',
 '/c/en/bank',
 '/c/en/bar',
 '/c/en/bark',
 '/c/en/bat',
 '/c/en/battery',
 '/c/en/beach',
 '/c/en/bear',
 '/c/en/beat',
 '/c/en/bed',
 '/c/en/beijing',
 '/c/en/bell',
 '/c/en/belt',
 '/c/en/berlin',
 '/c/en/bermuda',
 '/c/en/berry',
 '/c/en/bill',
 '/c/en/block',
 '/c/en/board',
 '/c/en/bolt',
 '/c/en/bomb',
 '/c/en/bond',
 '/c/en/boom',
 '/c/en/boot',
 '/c/en/bottle',
 '/c/en/bow',
 '/c/en/box',
 '/c/en/bridge',
 '/c/en/brush',
 '/c/en/buck',
 '/c/en/buffalo',
 '/c/en/bug',
 '/c/en/bugle',
 '/c/en/button',
 '/c/en/calf',
 '/c/en/canada',
 '/c/en/cap',
 '/c/en/capital',
 '/c/en/car',
 '/c/en/card',
 '/c/en/carrot',
 '/c/en/casino',
 '/c/en/cast',
 '/c/en/cat',
 '/c/en/cell',
 '/c/en/centau

In [37]:
# Raw embedding rows for the selected URIs as a NumPy array.
# DataFrame.to_numpy() is called directly: chaining it after .values fails,
# because .values is already an ndarray and ndarrays have no to_numpy().
conceptnet.loc[valid_prefixes].to_numpy()

array([[-2,  0,  0, ...,  0,  3, -1],
       [ 0, -2, -4, ..., -4, -2,  0],
       [ 0,  0,  2, ...,  0,  0, -3],
       ...,
       [-1,  1,  0, ...,  3,  1, -7],
       [-1,  0,  4, ...,  0,  2,  9],
       [ 0,  1,  0, ..., -5, -7,  6]], dtype=int8)

In [63]:
# Re-attach the URI row labels to the normalized matrix so that rows can be
# looked up by URI.
conceptnet_normalized = pd.DataFrame(data=normalized_values, index=conceptnet.index)

In [44]:
# Scratch check: the raw 'twitter' row divided by the sum of its components
# (sum scaling, not the L2 scaling used for normalized_values).
conceptnet.loc["/c/en/twitter"].div(conceptnet.loc["/c/en/twitter"].sum())

0     -0.018182
1     -0.018182
2     -0.000000
3      0.018182
4     -0.018182
         ...   
295    0.018182
296    0.036364
297   -0.018182
298   -0.109091
299   -0.072727
Name: /c/en/twitter, Length: 300, dtype: float64

In [10]:
# Raw (un-normalized) row stored for the URI '/c/en/twitter'.
conceptnet.loc["/c/en/twitter"]

0      1
1      1
2      0
3     -1
4      1
      ..
295   -1
296   -2
297    1
298    6
299    4
Name: /c/en/twitter, Length: 300, dtype: int8

In [18]:
# Same word-list file as a list, preserving the order of the lines.
codenames_wordlist = pathlib.Path("./codenames_ai/wordlist-eng.txt").read_text().splitlines()

In [20]:
# Map every word-list entry to the URI that standardized_uri produces for
# language "en".
wordlist_codes = [standardized_uri("en", word) for word in codenames_wordlist]

In [22]:
# One flag per URI: True when the URI is a row label of the embedding table.
has_vector = [uri in conceptnet.index for uri in wordlist_codes]

In [24]:
# True only if every word-list URI was found in conceptnet.index.
all(has_vector)

True

In [65]:
def get_word_vectors(words):
    """
    Look up the rows of ``conceptnet_normalized`` for the given words.

    Each word is first converted with ``standardized_uri("en", word)``; the
    resulting URIs are used as row labels. Returns a DataFrame with one row
    per word, in the order given.
    """
    return conceptnet_normalized.loc[[standardized_uri("en", word) for word in words]]

In [66]:
def get_similarity_to(words_a, words_b):
    """
    Return the matrix of dot products between two groups of word vectors.

    Entry (i, j) is the dot product of the vector for ``words_a[i]`` with
    the vector for ``words_b[j]``, both fetched through get_word_vectors.
    """
    vectors_a = get_word_vectors(words_a).to_numpy()
    vectors_b = get_word_vectors(words_b).to_numpy()
    return vectors_a @ vectors_b.T

In [72]:
# Rows follow the first list, columns follow the second list.
get_similarity_to(
    ["fall", "iron", "witch", "ham", "note", "cat", "beijing", "bear", "ambulance"],
    ["pig", "commodities", "wok"]
)

array([[-0.00152121,  0.12974121, -0.04465161],
       [-0.01646635,  0.06788395,  0.1405661 ],
       [ 0.06603686,  0.02517621, -0.07160979],
       [ 0.30535771, -0.04175501,  0.02672226],
       [ 0.02616878, -0.01625773, -0.04615694],
       [ 0.09370663,  0.04936789,  0.0278698 ],
       [ 0.03529743,  0.03522508,  0.18972301],
       [ 0.03847977,  0.18190285, -0.01654458],
       [-0.01255556, -0.01666464,  0.02443253]])

In [None]:
# NOTE(review): no cell visible here reads p_threshold — confirm where it is used.
p_threshold = 0.1  # 0.1 or 0.05

In [70]:
# Similarity of "bank" to three candidate words: one row, one column per candidate.
get_similarity_to(
    ["bank"],
    ["beach", "shore", "money"]
)

array([[0.10181325, 0.26793814, 0.18515725]])

In [37]:
# Sanity check: one row per requested word, one column per vector component.
get_word_vectors(["hello", "world"]).shape

(2, 300)

In [38]:
# Single-word lookup, shown as a one-row DataFrame.
# NOTE(review): the saved output lists integer components, which presumably
# comes from an earlier definition of get_word_vectors — re-run to refresh.
get_word_vectors(["hello"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
/c/en/hello,0,3,-2,0,-1,2,4,-2,-7,6,...,3,-4,-4,10,7,0,3,2,1,-5
