In [3]:
import numpy as np
import json

# Input locations. The first two are relative to the directory the notebook runs in.
VEC_PATH = "../public/recommend/vec.bytes"
WORD_PATH = "../public/recommend/word.json"
# NOTE(review): machine-specific absolute path -- point this at your own copy
# of the word list before running the notebook anywhere else.
COMMON_WORDS_PATH = "/Users/szymon/Downloads/google-10000-english-usa.txt"
# Number of float32 values per embedding row in VEC_PATH.
EMBEDDING_DIM = 300

# Raw float32 values, reshaped so that each row is one EMBEDDING_DIM-long vector.
# Row i is looked up through the position of a word in `word` (see word_to_idx).
with open(VEC_PATH, 'rb') as f:
    vec = np.frombuffer(f.read(), dtype=np.float32).reshape([-1, EMBEDDING_DIM])

# JSON is UTF-8 by specification; an explicit encoding keeps the load independent
# of the platform's default locale encoding.
with open(WORD_PATH, "rt", encoding="utf-8") as f:
    word = json.load(f)

# One word per line. Assumes the list is ASCII/UTF-8 text -- TODO confirm.
# splitlines() yields an empty list (not ['']) for an empty file.
with open(COMMON_WORDS_PATH, encoding="utf-8") as f:
    common_words = f.read().strip().splitlines()

# Maps each vocabulary word to its position in `word` (and so to its row in `vec`).
# If a word occurs more than once, the last position wins.
word_to_idx = {w: idx for idx, w in enumerate(word)}
def word_to_vec(w):
    """Return the row of `vec` that belongs to the vocabulary word `w`.

    Raises KeyError when `w` is not a key of `word_to_idx`.
    """
    row = word_to_idx[w]
    return vec[row]

In [4]:
def metric(word):
    """Count the rows of `vec` whose distance to `word` is below 0.5.

    Distance here is one minus the dot product of two embedding rows (this is
    the cosine distance only if the rows are unit-normalised -- TODO confirm).
    Note that the parameter shadows the module-level `word` list inside this
    function.
    """
    dots = (vec * word_to_vec(word)[None, :]).sum(-1)
    is_close = (1 - dots) < 0.5
    return is_close.sum()

In [5]:
codenames_words = ["africa", "agent", "air", "alien", "alps", "amazon", "ambulance", "america", "angel", "antarctica", "apple", "arm", "atlantis", "australia", "aztec", "back", "ball", "band", "bank", "bar", "bark", "bat", "battery", "beach", "bear", "beat", "bed", "beijing", "bell", "belt", "berlin", "bermuda", "berry", "bill", "block", "board", "bolt", "bomb", "bond", "boom", "boot", "bottle", "bow", "box", "bridge", "brush", "buck", "buffalo", "bug", "bugle", "button", "calf", "canada", "cap", "capital", "car", "card", "carrot", "casino", "cast", "cat", "cell", "centaur", "center", "chair", "change", "charge", "check", "chest", "chick", "china", "chocolate", "church", "circle", "cliff", "cloak", "club", "code", "cold", "comic", "compound", "concert", "conductor", "contract", "cook", "copper", "cotton", "court", "cover", "crane", "crash", "cricket", "cross", "crown", "cycle", "czech", "dance", "date", "day", "death", "deck", "degree", "diamond", "dice", "dinosaur", "disease", "doctor", "dog", "draft", "dragon", "dress", "drill", "drop", "duck", "dwarf", "eagle", "egypt", "embassy", "engine", "england", "europe", "eye", "face", "fair", "fall", "fan", "fence", "field", "fighter", "figure", "file", "film", "fire", "fish", "flute", "fly", "foot", "force", "forest", "fork", "france", "game", "gas", "genius", "germany", "ghost", "giant", "glass", "glove", "gold", "grace", "grass", "greece", "green", "ground", "ham", "hand", "hawk", "head", "heart", "helicopter", "himalayas", "hole", "hollywood", "honey", "hood", "hook", "horn", "horse", "horseshoe", "hospital", "hotel", "ice", "icecream", "india", "iron", "ivory", "jack", "jam", "jet", "jupiter", "kangaroo", "ketchup", "key", "kid", "king", "kiwi", "knife", "knight", "lab", "lap", "laser", "lawyer", "lead", "lemon", "leprechaun", "life", "light", "limousine", "line", "link", "lion", "litter", "loch ness", "lock", "log", "london", "luck", "mail", "mammoth", "maple", "marble", "march", "mass", "match", "mercury", 
"mexico", "microscope", "millionaire", "mine", "mint", "missile", "model", "mole", "moon", "moscow", "mount", "mouse", "mouth", "mug", "nail", "needle", "net", "new york", "night", "ninja", "note", "novel", "nurse", "nut", "octopus", "oil", "olive", "olympus", "opera", "orange", "organ", "palm", "pan", "pants", "paper", "parachute", "park", "part", "pass", "paste", "penguin", "phoenix", "piano", "pie", "pilot", "pin", "pipe", "pirate", "pistol", "pit", "pitch", "plane", "plastic", "plate", "platypus", "play", "plot", "point", "poison", "pole", "police", "pool", "port", "post", "pound", "press", "princess", "pumpkin", "pupil", "pyramid", "queen", "rabbit", "racket", "ray", "revolution", "ring", "robin", "robot", "rock", "rome", "root", "rose", "roulette", "round", "row", "ruler", "satellite", "saturn", "scale", "school", "scientist", "scorpion", "screen", "scuba diver", "seal", "server", "shadow", "shakespeare", "shark", "ship", "shoe", "shop", "shot", "sink", "skyscraper", "slip", "slug", "smuggler", "snow", "snowman", "sock", "soldier", "soul", "sound", "space", "spell", "spider", "spike", "spine", "spot", "spring", "spy", "square", "stadium", "staff", "star", "state", "stick", "stock", "straw", "stream", "strike", "string", "sub", "suit", "superhero", "swing", "switch", "table", "tablet", "tag", "tail", "tap", "teacher", "telescope", "temple", "theater", "thief", "thumb", "tick", "tie", "time", "tokyo", "tooth", "torch", "tower", "track", "train", "triangle", "trip", "trunk", "tube", "turkey", "undertaker", "unicorn", "vacuum", "van", "vet", "wake", "wall", "war", "washer", "washington", "watch", "water", "wave", "web", "well", "whale", "whip", "wind", "witch", "worm", "yard"];

# Embedding matrix for the Codenames list: row i is the vector of codenames_words[i].
# A word missing from the vocabulary raises KeyError inside word_to_vec.
codenames_vecs = np.stack(list(map(word_to_vec, codenames_words)), axis=0)

In [6]:
from sklearn.cluster import KMeans

# Number of clusters to split the Codenames words into.
N_CLUSTERS = 40
# Fixed seed: without it every run produces different cluster numbering and
# membership, so the per-cluster listings further down cannot be reproduced.
KMEANS_SEED = 0

# n_init is pinned explicitly because its default differs between
# scikit-learn releases.
kmeans = KMeans(n_clusters=N_CLUSTERS, n_init=10, random_state=KMEANS_SEED)
# fit_transform returns one row per input word holding its distance to every
# cluster centre.
distances_to_clusters = kmeans.fit_transform(codenames_vecs)
clusters = kmeans.cluster_centers_

In [7]:
def synonym_score(target_word, threshold=0.5):
    """Count the rows of `vec` that lie closer than `threshold` to `target_word`.

    Distance is one minus the dot product of two embedding rows (cosine
    distance only if the rows are unit-normalised -- TODO confirm).

    Parameters
    ----------
    target_word : str
        Vocabulary word to score; a word missing from the vocabulary raises
        KeyError inside word_to_vec.
    threshold : float, default 0.5
        Distance below which a vocabulary entry is counted.

    Returns
    -------
    NumPy integer scalar with the number of entries below the threshold.
    """
    dists = 1 - (vec * word_to_vec(target_word)[None, :]).sum(-1)
    # Vectorised count. The builtin sum() would walk the whole vocabulary
    # element by element in Python on every call.
    return (dists < threshold).sum()
synonym_score('address'), synonym_score('location'), synonym_score('post'), synonym_score('send')

(116, 67, 434, 298)

In [8]:
import random

# How many of the vocabulary entries closest to a cluster centre are considered
# as replacement candidates.
N_CANDIDATES = 300
# Seed for the randomly sampled part of each suggestion list, so that re-running
# the cell prints the same words.
SAMPLE_SEED = 0


def _pick_diverse(candidates, count):
    """Greedily move words from `candidates` into a new, spread-out list.

    The first candidate is always taken. After that, the candidate whose mean
    distance (one minus dot product of embedding rows) to the words picked so
    far is largest is taken next, until `count` words are picked or
    `candidates` runs out. Picked words are removed from `candidates` in place.

    Returns the picked words in the order they were picked.
    """
    if not candidates:
        return []
    picked = [candidates.pop(0)]
    picked_vecs = word_to_vec(picked[0]).reshape(1, -1)
    while len(picked) < count and candidates:
        scores = [
            (1 - (picked_vecs * word_to_vec(w)[None, :]).sum(-1)).mean()  # TODO: try min
            for w in candidates
        ]
        farthest = int(np.argmax(scores))
        picked.append(candidates.pop(farthest))
        picked_vecs = np.concatenate([picked_vecs, word_to_vec(picked[-1]).reshape(1, -1)], 0)
    return picked


# Group every Codenames word under its nearest cluster centre, together with
# its distance to that centre.
cluster_to_words = {}
for w, dists in zip(codenames_words, distances_to_clusters):
    cluster_to_words.setdefault(int(dists.argmin()), []).append((dists.min(), w))

# Not used below; kept as a module-level name in case other cells rely on it.
codenames_words_set = set(codenames_words)
sample_rng = random.Random(SAMPLE_SEED)
for cluster_idx, words in sorted(cluster_to_words.items()):
    # Members of the cluster, closest to the centre first.
    print(f'cluster {cluster_idx}')
    for dist, w in sorted(words):
        print('   ', dist, w)

    used_words = {w for _, w in words}
    # Vocabulary entries ordered by one minus their dot product with the centre.
    # Only this cluster's own words are filtered out, so Codenames words that
    # belong to other clusters can still appear as candidates.
    distances_to_center = 1 - (vec * clusters[cluster_idx][None, :]).sum(-1)
    replacement_words = [word[widx] for widx in distances_to_center.argsort()[:N_CANDIDATES]
                         if word[widx] not in used_words]

    # Two diverse picks per cluster word, plus one random pick per cluster word
    # from whatever candidates are left (capped by the size of the remaining pool).
    new_list = _pick_diverse(replacement_words, 2 * len(used_words))
    n_random = min(len(used_words), len(replacement_words))
    new_list += sample_rng.sample(replacement_words, n_random)

    # Words with the highest synonym_score are printed first.
    print()
    for _, w in sorted((-synonym_score(w), w) for w in new_list):
        print('   ', w)

cluster 0
    0.5885835 icecream
    0.5885835 ketchup

    mayonnaise
    peaches
    spice
    poop
    starbucks
    cracker
cluster 1
    0.5930111 round
    0.6620897 ring
    0.70236754 gold
    0.7188445 diamond
    0.7270119 circle
    0.7277977 spot
    0.7378455 square
    0.75292313 crown
    0.7617385 match
    0.7775448 row
    0.7913465 shot
    0.85391164 lap
    0.88755274 pistol
    1.011895 bugle

    could
    one
    too
    then
    also
    before
    all
    ever
    find
    down
    today
    great
    with
    week
    turns
    night
    area
    man
    ended
    play
    setting
    ten
    final
    floor
    corner
    purple
    glass
    band
    tie
    crystal
    rose
    pearl
    star
    bar
    loose
    hole
    oval
    rings
    belt
    pin
    rounds
    triple
cluster 2
    0.5818435 moscow
    0.61331534 berlin
    0.61561793 beijing
    0.61733323 tokyo
    0.6324515 london

    earlier
    arriving
    czech
    baghdad
    station
    j


    although
    better
    before
    someone
    more
    choice
    nice
    used
    hands
    butt
    underneath
    lock
    toe
    ankle
    gloves
    nose
    inch
    fist
    forearm
    swing
    pen
cluster 13
    0.6359175 web
    0.6531923 check
    0.66956156 link
    0.7110927 code
    0.7247746 server
    0.7269964 mail
    0.73105747 tag
    0.7347824 file
    0.7522285 log
    0.7600057 net
    0.79137415 bug
    0.811503 engine
    0.81583714 agent
    0.8499449 amazon
    0.8687283 spy

    when
    only
    means
    will
    take
    out
    give
    own
    i
    my
    ask
    goes
    full
    works
    website
    add
    features
    drop
    sent
    type
    added
    report
    blog
    customer
    reference
    enter
    checking
    box
    network
    facebook
    updates
    ie
    tool
    listing
    security
    files
    setup
    download
    html
    browser
    pages
    register
    via
    directory
    script
cluster 14
    0.64602077 h


    though
    one
    something
    bottle
    australian
    monsters
    frozen
    ice
    aquatic
    reptiles
    yacht
    peacock
    mermaid
    expedition
    spider
    dolphin
    owl
    panda
    inflatable
    snail
    herring
    galapagos
    cod
    pelican
cluster 28
    0.63666964 eye
    0.6696365 face
    0.67729527 head
    0.70686746 mouth
    0.73221 heart
    0.74382645 tooth
    0.7614785 chest
    0.78158194 nail
    0.7996356 brush
    0.8036145 disease
    0.81017756 laser
    0.8190626 spine
    0.90597 mole

    because
    would
    we
    once
    going
    want
    simply
    where
    take
    very
    usually
    best
    known
    problems
    turning
    area
    care
    caused
    spot
    straight
    eyes
    causing
    blue
    black
    surgery
    surface
    smile
    stomach
    tip
    tissue
    breath
    cheek
    forehead
    lip
    jaw
    bones
    facial
    bite
    skull
cluster 29
    0.59879166 play
    0.6271088 game
    

In [82]:
len(codenames_words)

400

In [33]:
chosen_words="""    spice
    cafe
    cracker
    night
    play
    floor
    corner
    purple
    glass
    band
    tie
    crystal
    rose
    pearl
    star
    bar
    hole
    oval
    belt
    czech
    japan
    jerusalem
    kolkata
    seoul
    zurich
    circus
    summit
    course
    current
    note
    order
    hand
    step
    left
    opening
    system
    drop
    access
    key
    single
    store
    connection
    ocean
    lake
    river
    plane
    round
    moon
    frog
    eagle
    knight
    alien
    lion
    spiderman
    pirate
    reindeer
    corpse
    hamster
    serpent
    raccoon
    automaton
    wizard
    elf
    clone
    martian
    maze
    bigfoot
    jellyfish
    break
    shoe
    gown
    pack
    sandals
    fly
    seat
    underwear
    mesh
    fur
    glove
    hood
    course
    city
    room
    park
    ball
    stone
    grill
    golf
    lodge
    bench
    house
    boy
    wolf
    truck
    bird
    chicken
    box
    pink
    black
    bag
    farm
    puppy
    kitten
    dragon
    cow
    fox
    elephant
    cross
    hunt
    frog
    moose
    goat
    paws
    tail
    bull
    rat
    paw
    cage
    opening
    handle
    surface
    crystal
    lid
    toilet
    mirror
    hose
    freezer
    jug
    barrel
    hell
    drop
    purple
    soldier
    storm
    frog
    rainbow
    demon
    vampire
    shaman
    assassin
    dagger
    trap
    rune
    present
    addition
    mind
    course
    future
    week
    year
    current
    second
    time
    world
    month
    third
    lock
    toe
    ankle
    nose
    inch
    fist
    forearm
    swing
    pen
    website
    drop
    report
    blog
    customer
    reference
    box
    network
    update
    security
    file
    browser
    directory
    script
    officer
    unit
    jump
    drive
    boat
    motorcycle
    bomb
    airline
    airport
    carrier
    crew
    wing
    pepper
    field
    hill
    rice
    sugar
    corn
    silk
    cotton
    peach
    gold
    sea
    forest
    sky
    plant
    tea
    sand
    venus
    dolphin
    astronaut
    cosmos
    ares
    equator
    prometheus
    writer
    advisor
    physician
    technician
    pharmacist
    scientist
    bartender
    waiter
    war
    market
    syria
    london
    italy
    latvia
    wales
    nepal
    usa
    lithuania
    western
    french
    paris
    scandinavia
    fight
    vehicle
    star
    sniper
    feather
    kite
    radar
    hammer
    archer
    raptor
    butter
    soup
    box
    pepper
    black
    ginger
    bacon
    cake
    pizza
    coffee
    rose
    brownie
    mango
    cereal
    cupcake
    peanut
    jar
    set
    story
    death
    action
    american
    fan
    movie
    musical
    television
    romance
    ghost
    fantasy
    fiction
    episode
    prince
    turtle
    scuba
    reef
    salt
    arm
    oven
    grill
    bolt
    cross
    bronze
    spoon
    brush
    bell
    drain
    tube
    government
    news
    police
    pittsburgh
    jersey
    manhattan
    lincoln
    capitol
    handle
    screen
    pocket
    blade
    cap
    chain
    bow
    hook
    arrow
    spring
    cable
    screw
    beam
    bullet
    pit
    ice
    yacht
    mermaid
    spider
    dolphin
    owl
    panda
    snail
    cod
    salmon
    octopus
    crab
    shrimp
    care
    surgery
    surface
    smile
    stomach
    tissue
    breath
    cheek
    forehead
    lip
    jaw
    bone
    face
    bite
    skull
    night
    play
    drop
    team
    luck
    track
    bet
    throw
    game
    club
    tournament
    poker
    bowling
    hall
    violin
    singer
    hymn
    soprano
    swan
    voice
    tower
    puzzle
    egypt
    banner
    crown
    apollo
    tower
    inverse
    student
    director
    mathematics
    trained
    hospital
    class
    winter
    temperature
    ice
    faith
    prayer
    priest
    devil
    angel
    ceremony
    wedding
    funeral
    lamp
    canon
    meter
    camera
    window
    mountain
    garden
    bear
    wind
    fire
    gate
    city
    park
    guitar
    tune
    radio
    performer
    cinema
    ballet
    ticket
    circus
    arena
    spaniel
terrier
poodle
shepherd
kennel
monopoly
magic
author
adventure
twilight
comic
poem
character
universe
well
kiss
dinner""".split()

In [34]:
# Collapse repeated picks into one entry each and order the result alphabetically.
unique_chosen = set(chosen_words)
chosen_words = sorted(unique_chosen)

In [36]:
repr(chosen_words)

"['access', 'action', 'addition', 'adventure', 'advisor', 'airline', 'airport', 'alien', 'american', 'angel', 'ankle', 'apollo', 'archer', 'arena', 'ares', 'arm', 'arrow', 'assassin', 'astronaut', 'author', 'automaton', 'bacon', 'bag', 'ball', 'ballet', 'band', 'banner', 'bar', 'barrel', 'bartender', 'beam', 'bear', 'bell', 'belt', 'bench', 'bet', 'bigfoot', 'bird', 'bite', 'black', 'blade', 'blog', 'boat', 'bolt', 'bomb', 'bone', 'bow', 'bowling', 'box', 'boy', 'break', 'breath', 'bronze', 'brownie', 'browser', 'brush', 'bull', 'bullet', 'butter', 'cable', 'cafe', 'cage', 'cake', 'camera', 'canon', 'cap', 'capitol', 'care', 'carrier', 'cereal', 'ceremony', 'chain', 'character', 'cheek', 'chicken', 'cinema', 'circus', 'city', 'class', 'clone', 'club', 'cod', 'coffee', 'comic', 'connection', 'corn', 'corner', 'corpse', 'cosmos', 'cotton', 'course', 'cow', 'crab', 'cracker', 'crew', 'cross', 'crown', 'crystal', 'cupcake', 'current', 'customer', 'czech', 'dagger', 'death', 'demon', 'devil

In [31]:
# Words of the new list that are also in the original Codenames list.
set(codenames_words) & set(chosen_words)

{'alien',
 'angel',
 'arm',
 'ball',
 'band',
 'bar',
 'bear',
 'bell',
 'belt',
 'bolt',
 'bomb',
 'bow',
 'box',
 'brush',
 'cap',
 'club',
 'comic',
 'cotton',
 'cross',
 'crown',
 'czech',
 'death',
 'dragon',
 'drop',
 'eagle',
 'egypt',
 'face',
 'fan',
 'field',
 'file',
 'fire',
 'fly',
 'forest',
 'game',
 'ghost',
 'glass',
 'glove',
 'gold',
 'hand',
 'hole',
 'hood',
 'hook',
 'hospital',
 'ice',
 'key',
 'knight',
 'lion',
 'lock',
 'london',
 'luck',
 'moon',
 'night',
 'note',
 'octopus',
 'park',
 'pirate',
 'pit',
 'plane',
 'play',
 'police',
 'rose',
 'round',
 'scientist',
 'screen',
 'shoe',
 'soldier',
 'spider',
 'spring',
 'star',
 'swing',
 'tail',
 'tie',
 'time',
 'tower',
 'track',
 'tube',
 'war',
 'well',
 'wind'}

In [28]:
# Seed words for the neighbourhood lookup below.
reference = ['dating', 'kiss', 'dinner', 'affection', 'partner']

# One column per reference word: one minus the dot product between every row of
# `vec` and that word's vector.
per_reference = [1 - (vec * word_to_vec(w)[None, :]).sum(-1) for w in reference]
dists = np.stack(per_reference, axis=1)
# Print the 100 vocabulary entries with the smallest mean distance to the seeds.
nearest = np.argsort(dists.mean(-1))[:100]
for widx in nearest:
    print(word[widx])

partner
friends
kiss
friend
love
relationship
couple
boyfriend
friendship
dating
husband
her
she
loving
lover
loved
always
you
never
girlfriend
wife
affection
dinner
want
him
me
know
wanted
girl
someone
sister
everyone
even
wanting
once
wish
together
thing
affair
kind
again
wants
one
but
moment
ever
we
tell
relationships
happy
when
woman
having
others
desire
remember
bring
give
both
enjoy
just
because
giving
while
meet
i
well
met
good
with
would
so
it
let
couples
my
feel
going
partners
sure
lovers
only
really
another
get
dad
fun
way
guy
that
then
thought
much
surprise
our
them
romantic
looking
they
marriage


array([[0.52985203, 0.46490002, 0.41684628, 0.91122746, 0.6148555 ],
       [0.56132746, 0.47562057, 0.4222868 , 0.8862601 , 0.6265718 ],
       [0.55051565, 0.4426825 , 0.4435364 , 0.9476344 , 0.6528337 ],
       ...,
       [0.6023398 , 0.52694017, 0.5112357 , 0.9279846 , 0.6813105 ],
       [0.8296769 , 0.8187469 , 0.8189982 , 0.9180113 , 0.8902627 ],
       [0.7278642 , 0.7976171 , 0.7970578 , 0.99744195, 0.85186654]],
      dtype=float32)