In [1]:
import numpy as np
import six
import sys
import os
import traceback
import re
import pickle
from copy import deepcopy

from chainer import cuda
from context2vec.common.context_models import Toks
from context2vec.common.model_reader import ModelReader
import sklearn
import pandas as pd
import logging
from scipy.stats import spearmanr
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import gensim
import math




[nltk_data] Downloading package stopwords to /home/ql261/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
def produce_top_n_simwords(w_filter,context_embed,n_result,index2word):
    """Print and return the n_result rows of w_filter most similar to a context vector.

    The context vector is scaled to unit length here; the rows of w_filter are
    used as given (the caller is expected to have normalized them already).
    Similarity is (dot product + 1) / 2, and entries whose similarity is NaN
    are skipped. Relies on the module-level array module `xp`.

    Returns:
        (top_vec, similarity_scores, top_words): the selected rows of w_filter,
        their similarities as a numpy array, and the matching words, all in
        descending order of similarity.
    """
    # only the context side is normalized; w_filter is trusted to be unit length
    context_embed = context_embed / xp.sqrt((context_embed * context_embed).sum())
    print('producing top {0} simwords'.format(n_result))
    similarity = (w_filter.dot(context_embed)+1.0)/2

    kept_rows=[]
    kept_words=[]
    kept_scores=[]
    # argsort on the negated scores walks the vocabulary from best to worst
    for row_id in (-similarity).argsort():
        row_sim=similarity[row_id]
        if xp.isnan(row_sim):
            continue
        word=index2word[row_id]
        print('{0}: {1}'.format(str(word), str(row_sim)))
        kept_rows.append(row_id)
        kept_words.append(word)
        kept_scores.append(row_sim)
        if len(kept_rows) == n_result:
            break

    return w_filter[kept_rows,:],np.array(kept_scores),kept_words
    
def top_mutual_sim(top_vec,similarity_scores):
    """Score a context by the weighted mutual similarity of its top substitutes.

    Args:
        top_vec: 2-D array with one substitute vector per row. It is NOT
            modified; a normalized copy is used internally.
        similarity_scores: one score per row of top_vec; the first entry is
            taken as the maximum score.

    Returns:
        The sum over all row pairs of (dot product of the unit-length rows)
        times a pair weight. Pair weights are the mean of the two rows'
        scores, rescaled so that they sum to the first score.
    """
    # Normalize into a new array. The previous in-place `/=` overwrote the
    # caller's data and raised a casting error for integer arrays.
    top_vec = np.asarray(top_vec)
    s = np.sqrt((top_vec * top_vec).sum(1))
    s[s==0.] = 1.  # leave all-zero rows as zeros instead of dividing by zero
    unit_vec = top_vec / s.reshape((s.shape[0], 1))

    # substitutes' similarity to the sentence is used as the weight matrix for mutual similarity
    max_score=similarity_scores[0]
    similarity_scores=np.array(similarity_scores)
    sim_weights=(similarity_scores+similarity_scores.reshape(len(similarity_scores),1))/2.0
    # weighted by the maximum score among the substitutes (a higher max score
    # means the context is more certain about its substitutes)
    sim_weights=(sim_weights/float(sim_weights.sum()))*max_score
    # pairwise dot products weighted by substitute probability (sim_weights)
    inf_score=(unit_vec.dot(unit_vec.T)*sim_weights).sum()
    return inf_score

def top_cluster_density(top_vec,similarity_scores):
    """Score a context by how tightly its top substitutes cluster around their centroid.

    Rows of top_vec are scaled to unit length (all-zero rows are left as is).
    The centroid is the sum of those rows weighted by similarity_scores
    normalized to sum to one; the centroid itself is not re-normalized. The
    result is the mean dot product between each row and the centroid,
    multiplied by the first entry of similarity_scores.
    """
    row_norms = np.sqrt((top_vec * top_vec).sum(1))
    row_norms[row_norms==0.] = 1.
    unit_rows = top_vec/ row_norms.reshape((row_norms.shape[0], 1))

    # the first score is used as the maximum score
    best_score=similarity_scores[0]
    n_scores=len(similarity_scores)
    mixing=np.array(similarity_scores).reshape(n_scores,1)/sum(similarity_scores)
    centroid_vector=sum(unit_rows*mixing)

    # mean projection onto the centroid, then scaled by the best score
    mean_projection=sum(unit_rows.dot(centroid_vector))/len(unit_rows)
    return mean_projection*best_score

In [3]:
# Sanity check: both rows point in the same direction, so every row projects
# fully onto the centroid and the result equals the first score (0.3).
top_cluster_density(np.array([[1.,1.,1.],[4.,4.,4.]]),np.array([0.3,0.1]))
# np.linalg.norm([0.57735027,0.57735027, 0.57735027])

0.3

In [27]:
def load_w2salience(w2salience_f,weight_type,total_s_count=84755431):
    """Read a tab-separated vocabulary file and turn its counts into word weights.

    Each non-blank line must hold exactly three tab-separated fields:
    word, w_count, s_count.

    Args:
        w2salience_f: path of the vocabulary file.
        weight_type: INVERSE_W_FREQ gives 1 / w_count;
            INVERSE_S_FREQ gives log(1 + total_s_count / s_count).
            Any other value yields an empty dict.
        total_s_count: numerator of the INVERSE_S_FREQ ratio. The default is
            the value that used to be hard-coded here (presumably the number
            of sentences in the training corpus -- TODO confirm).

    Returns:
        dict mapping word -> weight.
    """
    w2salience={}
    with open(w2salience_f) as f:
        for line in f:
            line=line.strip()
            if line=='':
                continue
            w,w_count,s_count=line.split('\t')
            if weight_type==INVERSE_W_FREQ:
                w2salience[w]=1/float(w_count)
            elif weight_type==INVERSE_S_FREQ:
                w2salience[w]=math.log(1+total_s_count/float(s_count))
    return w2salience

def skipgram_context(model,words,pos,weight=None,w2entropy=None):
    """Build a weighted sum of the vectors of the words surrounding position pos.

    Weighting per context word:
      * weight == LDA: 1 / (w2entropy[word] + 1)
      * weight in (INVERSE_W_FREQ, INVERSE_S_FREQ): w2entropy[word]
      * anything else: 1.0
    In the first two modes a word is used only if it is in both w2entropy and
    model. In the equal-weight mode a word missing from model is reported and
    skipped.

    Returns:
        (sum of the weights used, unnormalized weighted sum of the vectors).
    """
    vectors=[]
    coeffs=[]
    for idx,token in enumerate(words):
        if idx == pos:
            # the target slot itself is not part of its own context
            continue
        try:
            if weight ==LDA or weight in [INVERSE_W_FREQ,INVERSE_S_FREQ]:
                if token in w2entropy and token in model:
                    print (token,w2entropy[token])
                    if weight ==LDA:
                        coeffs.append(1/(w2entropy[token]+1.0))
                    else:
                        coeffs.append(w2entropy[token])
                    vectors.append(model[token])
            else:
                # equal weight per word; look the vector up first so that an
                # unknown word adds neither a vector nor a weight
                vectors.append(model[token])
                coeffs.append(1.0)
        except KeyError as e:
            print ('==warning==: key error in context {0}'.format(e))
    column_weights=np.array(coeffs).reshape(len(coeffs),1)
    context_embed=sum(np.array(vectors)*column_weights)
    # the caller divides by the returned weight total later
    return sum(coeffs),context_embed

def lg_model_out_w2v(top_words,w_target,word2index_target):
    """Look up the substitute words in the target (skip-gram) embedding matrix.

    Words missing from word2index_target are printed and skipped.

    Returns:
        (array of the rows found, list of the positions in top_words that
        were found), so the caller can keep per-word scores aligned.
    """
    found_rows=[]
    found_positions=[]
    for position,token in enumerate(top_words):
        try:
            row=w_target[word2index_target[token]]
        except KeyError as e:
            print (e)
            continue
        found_rows.append(row)
        found_positions.append(position)
    return np.array(found_rows),found_positions
    
def context_inform(test_s,test_w, model,model_type,n_result,w_filter,index2word,weight,w2entropy=None,w_target=None,word2index_target=None,index2word_target=None):
    """Produce the representation and the informativeness score of one context.

    Args:
        test_s: the context sentence containing the placeholder test_w.
        test_w: the placeholder token marking the target position.
        model: object with a context2vec(words, pos) method, or (for
            'skipgram') the word -> vector lookup passed to skipgram_context.
        model_type: 'context2vec', 'skipgram' or 'context2vec-skipgram';
            any other value prints a message and exits.
        n_result: number of substitutes to retrieve.
        w_filter, index2word: embedding matrix and index -> word lookup used
            to retrieve substitutes.
        weight: per-context weighting mode (see the weight constants).
        w2entropy: per-word weights forwarded to skipgram_context.
        w_target, word2index_target: target-space matrix and lookup, used
            for 'context2vec-skipgram'.
        index2word_target: accepted but not used here.

    Returns:
        (score, context_embed_out). score stays 1.0 unless the skipgram model
        or a TOP_* weighting mode replaces it.
    """
    # pad the placeholder with spaces so it comes out of split() as its own token
    test_s=test_s.replace(test_w, ' '+test_w+' ')
    print(test_s)
    words=test_s.split()
    pos=words.index(test_w)

    score=1.0 #default score

    # build the context representation for the requested model
    if model_type=='skipgram':
        score,context_embed=skipgram_context(model,words,pos,weight,w2entropy)
        context_embed_out=context_embed
    elif model_type=='context2vec':
        context_embed= model.context2vec(words, pos)
        context_embed_out=context_embed
    elif model_type=='context2vec-skipgram':
        # substitutes are predicted by context2vec and then averaged in skip-gram space
        context_embed= model.context2vec(words, pos)
        top_vec,sim_scores,top_words=produce_top_n_simwords(w_filter,context_embed,n_result,index2word)
        top_vec,index_list=lg_model_out_w2v(top_words,w_target,word2index_target)
        # keep only the scores of substitutes that exist in the target space
        sim_scores=sim_scores[index_list]
        mixing=(sim_scores/sum(sim_scores)).reshape(len(sim_scores),1)
        context_embed_out=sum(top_vec*mixing)
    else:
        print ('model type {0} not recognized'.format(model_type))
        sys.exit(1)

    # derive the per-sentence weight
    if weight==TOP_MUTUAL_SIM or weight==TOP_CLUSTER_DENSITY:
        print (weight)
        # both modes score the substitutes retrieved from w_filter for context_embed
        top_vec,sim_scores,top_words=produce_top_n_simwords(w_filter,context_embed,n_result,index2word)
        scorer=top_mutual_sim if weight==TOP_MUTUAL_SIM else top_cluster_density
        score=scorer(top_vec,sim_scores)
        print (score)
    elif weight=='learned':
        print ('learned not implemented')
    elif weight=='gaussian':
        print ('gaussian not implemented')
    elif not (weight ==False or weight in [LDA,INVERSE_S_FREQ,INVERSE_W_FREQ]):
        # the modes excluded above keep the score computed so far
        print ('weight mode {0} not recognized'.format(weight))
    return score,context_embed_out

def additive_model(f_w,test_ss,test_w, model_type,model,n_result,w_filter,index2word,weight=False,w2entropy=None,w_target=None,word2index_target=None,index2word_target=None):
    """Combine several contexts of one target into a single weighted representation.

    Args:
        f_w: writable file object; one comma-separated line with the
            normalized per-context weights is appended to it.
        test_ss: the contexts, separated by '@@'.
        test_w: placeholder token marking the target position.
        model_type, model, n_result, w_filter, index2word, weight, w2entropy,
        w_target, word2index_target, index2word_target: forwarded unchanged
            to context_inform.

    Returns:
        The combined context vector. For 'skipgram' it is the sum of the
        per-context vectors divided by the sum of the per-context weights;
        otherwise it is the sum of the per-context vectors, each multiplied
        by its normalized weight.
    """
    context_out=[]
    context_weights=[]
    for test_s in test_ss.split('@@'):
        test_s=test_s.strip()
        # representation and weight of this single context
        score,context_embed=context_inform(test_s,test_w, model,model_type,n_result,w_filter,index2word,weight,w2entropy,w_target,word2index_target,index2word_target)
        print ('weight is {0}'.format(score))
        context_out.append(context_embed)
        context_weights.append(score)

    # combine the representations across contexts
    context_out=np.array(context_out)
    norm_weights=np.array(context_weights).reshape(len(context_weights),1)/float(sum(context_weights))
    f_w.write(','.join([str(i[0]) for i in norm_weights])+'\n')
    print ('normalized weight: \n  {0}'.format(norm_weights))

    if model_type=='skipgram':
        # each skipgram context vector is already a weighted sum of its words,
        # so divide the grand total by the total weight
        context_avg=sum(context_out)/sum(context_weights)
    else:
        # weighted sum of the per-context representations
        context_avg=sum(norm_weights*context_out)

    # print the neighbours of the new embedding (diagnostic output only)
    print('producing top {0} words for new embedding'.format(n_result))
    if index2word_target is None:
        # use the matrix passed in as w_filter; this previously read a
        # module-level `w`, which only worked when that global happened to exist
        produce_top_n_simwords(w_filter,context_avg,n_result,index2word)
    else:
        # target-space neighbours for context2vec-skipgram
        print (w_target.shape)
        produce_top_n_simwords(w_target,context_avg,n_result,index2word_target)

    return context_avg




In [5]:
def filter_w(w,word2index,index2word):
    """Drop English stopwords from an embedding matrix and rebuild its lookups.

    Only stopwords are removed; tokens without letters (punctuation, numbers)
    are kept.

    Args:
        w: embedding matrix indexed by the values of word2index.
        word2index: word -> row index in w.
        index2word: accepted for symmetry with the return value; not used.

    Returns:
        (w_filter, word2index_filter, index2word_filter): the remaining rows
        of w and new dense word <-> index lookups for them.
    """
    raw_stopw=stopwords.words('english')
    # Hold both the native and the utf-8 encoded spelling so the membership
    # test works whether the vocabulary keys are text or byte strings. A set
    # makes each lookup O(1) instead of scanning the whole list.
    stopw=set(raw_stopw)
    stopw.update(word.encode('utf-8') for word in raw_stopw)

    index2word_filter={}
    word2index_filter={}
    index_filter2index=[]
    counter=0
    for word in word2index:
        if word not in stopw:
            index_filter2index.append(word2index[word])
            word2index_filter[word]=counter
            index2word_filter[counter]=word
            counter+=1
    w_filter= w[index_filter2index,:]
    return w_filter,word2index_filter,index2word_filter

def rm_stopw_context(model):
    """Return a plain dict word -> vector for every non-stopword in model.wv.vocab.

    The result replaces the model object for context lookups, so English
    stopwords no longer contribute to a context representation.
    """
    raw_stopw=stopwords.words('english')
    # Match stopwords in both native and utf-8 encoded form (vocabulary keys
    # may be text or byte strings); a set keeps the lookup O(1).
    stopw=set(raw_stopw)
    stopw.update(word.encode('utf-8') for word in raw_stopw)

    return {word:model.wv[word] for word in model.wv.vocab if word not in stopw}




In [6]:
def eval_chimera(chimeras_data_f,context_model,model_type,n_result,w,index2word,weight=False,w2entropy=None,w_target=None,word2index_target=None,index2word_target=None):
    """Evaluate context representations on a Chimera data file.

    Each row of the tab-separated file (no header) is read as: column 1 the
    contexts (joined by '@@', target marked '___'), column 2 the
    comma-separated probe words, column 3 the comma-separated gold ratings.
    For every row the combined context vector from additive_model is compared
    (cosine) with each probe embedding, and the Spearman correlation between
    the cosines and the gold ratings is printed.

    A file 'weights_<n>_<model_type>_<weight>' is written next to the data
    file, where <n> is the second character of the second dot-separated part
    of the file name (e.g. '4' for 'dataset.l4...').

    NOTE(review): reads the module-level names `xp` and, when
    index2word_target is None, `word2index`; they are not parameters.

    Returns:
        The average of the non-NaN per-row correlations (NaN if there are
        none). Earlier versions returned None.

    Raises:
        NotImplementedError: if weight == 'learned'.
    """
    if weight=='learned':
        # this mode never built context_avg and then failed on the unbound name
        raise NotImplementedError("weight mode 'learned' is not implemented")

    # os.path keeps a bare file name relative; '/'-joining sent it to the filesystem root
    chimeras_data_dir=os.path.dirname(chimeras_data_f)
    num_sent=os.path.basename(chimeras_data_f).split('.')[1][1]
    print (chimeras_data_dir)
    print (num_sent)
    weights_path=os.path.join(chimeras_data_dir,'weights_{0}_{1}_{2}'.format(num_sent,model_type,str(weight)))

    spearmans=[]
    with open(weights_path,'w') as f_w:
        data=pd.read_csv(chimeras_data_f,delimiter='\t',header=None)

        for index, row in data.iterrows():
            golds=[]
            model_predict=[]
            probes=[]
            # combined representation of all contexts in this row, scaled to unit length
            context_avg=additive_model(f_w,row[1].lower(),'___', model_type,context_model,n_result,w,index2word,weight,w2entropy,w_target,word2index_target,index2word_target)
            context_avg = context_avg / xp.sqrt((context_avg * context_avg).sum())

            # cosine similarity with each probe embedding
            for gold,probe in zip(row[3].split(','),row[2].split(',')):
                try:
                    if index2word_target is None:
                        probe_w_vec=xp.array(w[word2index[probe]])
                    else:
                        probe_w_vec=xp.array(w_target[word2index_target[probe]])
                except KeyError as e:
                    print ("====warning key error for probe=====: {0}".format(e))
                    continue
                probe_w_vec=probe_w_vec/xp.sqrt((probe_w_vec*probe_w_vec).sum())
                cos=probe_w_vec.dot(context_avg)
                if xp.isnan(cos):
                    continue
                model_predict.append(cos)
                # rank the ratings numerically; as strings '10' sorts before '2'
                golds.append(float(gold))
                probes.append(probe)
            print ('probes',probes)
            print ('gold',golds)
            print ('model_predict',model_predict)
            sp=spearmanr(golds,model_predict)[0]
            print ('spearman correlation is {0}'.format(sp))
            if not math.isnan(sp):
                spearmans.append(sp)

    if not spearmans:
        # nothing to average; report NaN instead of dividing by zero
        print ('==warning==: no valid spearman correlation was computed')
        return float('nan')
    average_rho=float(sum(spearmans))/float(len(spearmans))
    print ("AVERAGE RHO:",average_rho)
    return average_rho

In [14]:
# Names of the supported context-weighting modes; the functions in this
# notebook compare their `weight` argument against these constants.
TOP_MUTUAL_SIM='top_mutual_sim'
TOP_CLUSTER_DENSITY='top_cluster_density'
LDA='lda'
INVERSE_S_FREQ='inverse_s_freq'
# NOTE(review): the value is 'inverse_w_q' rather than 'inverse_w_freq' --
# possibly a typo. It also ends up in the weights file name written by
# eval_chimera (via str(weight)), so confirm before changing it.
INVERSE_W_FREQ='inverse_w_q'
# Maps the integer weight code given on the command line to a weighting mode
# (code 0 maps to False).
WEIGHT_DICT={0:False,1:TOP_MUTUAL_SIM,2:LDA,3:INVERSE_S_FREQ,4:INVERSE_W_FREQ,5:TOP_CLUSTER_DENSITY}


if __name__=="__main__":
    # Driver: read the run parameters (notebook defaults or command line),
    # load the requested model(s), strip stopwords from the embedding
    # matrices and load the optional word-salience table. It leaves model,
    # w, index2word, word2index, the *_target variants, weight, n_result,
    # data, w2salience and xp defined for the evaluation cell.

    #params read in
    # Notebook mode is detected from the kernel launcher's file name rather
    # than from one absolute python2.7 install path, so the cell also runs
    # in other environments.
    if os.path.basename(sys.argv[0])=='ipykernel_launcher.py':

        data='./eval_data/data-chimeras/dataset.l4.fixed.test.txt.punct'

        weight=WEIGHT_DICT[5]
        # No salience file is needed for this weight mode. The name must
        # still exist because it is printed at the end of this cell.
        w2salience_f=None

        # Alternative notebook configurations, kept for reference:
        #   context2vec
        #     model_param_file='../models/context2vec/model_dir/MODEL-wiki.params.14'
        #       (or '../models/context2vec/model_dir/context2vec.ukwac.model.params')
        #     model_type='context2vec'
        #   skipgram
        #     model_param_file='../models/wiki_all.model/wiki_all.sent.split.model'
        #     model_type='skipgram'
        #     weight=WEIGHT_DICT[4]
        #     w2salience_f='../corpora/corpora/wiki.all.utf8.sent.split.tokenized.vocab'
        #       (or '../models/lda/w2entropy' for the LDA weight)
        #   context2vec-skipgram with the ukwac context2vec model
        #     model_param_file='../models/context2vec/model_dir/context2vec.ukwac.model.params?../models/wiki_all.model/wiki_all.sent.split.model'

        ####context2vec-skipgram
        # two model paths joined by '?': context2vec params, then the gensim word2vec model
        model_param_file='../models/context2vec/model_dir/MODEL-wiki.params.14?../models/wiki_all.model/wiki_all.sent.split.model'
        model_type='context2vec-skipgram'
        n_result=20

    else:
        if len(sys.argv) < 5:
            # sys.stderr.write works on both Python 2 and 3, unlike `print >> sys.stderr`
            sys.stderr.write("Usage: {0} <model_param_file> <model_type> <weight:{1}> <eval_data> <w2salience>\n".format(sys.argv[0],WEIGHT_DICT.items()))
            sys.exit(1)

        model_param_file = sys.argv[1]
        model_type=sys.argv[2]

        # the weight argument is either '<code>' or '<code>-<n_result>'
        if '-' in sys.argv[3]:
            weight,n_result=sys.argv[3].split('-')
            weight=WEIGHT_DICT[int(weight)]
            n_result=int(n_result)
        else:
            weight=WEIGHT_DICT[int(sys.argv[3])]
            n_result=20 #default is 20 top

        data =sys.argv[4]

        if len(sys.argv)>5:
            # was `argv[5]`, a NameError whenever a salience file was supplied
            w2salience_f=sys.argv[5]
        else:
            w2salience_f=None

    #gpu setup
    gpu = -1 # todo: make this work with gpu

    if gpu >= 0:
        cuda.check_cuda_available()
        cuda.get_device(gpu).use()
    xp = cuda.cupy if gpu >= 0 else np

    # logging
    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)

    #choose model type
    print ('read model....')
    if model_type=='context2vec':
        model_reader = ModelReader(model_param_file)
        w = model_reader.w
        index2word = model_reader.index2word
        word2index=model_reader.word2index
        model = model_reader.model
        w_target=None
        word2index_target=None
        index2word_target=None
        context_rm_stopw=0
    elif model_type=='skipgram':
        model = gensim.models.Word2Vec.load(model_param_file)
        w=deepcopy(model.wv.vectors)
        # unit-normalize the copy used for probe embeddings; the model's own vectors stay untouched
        s = np.sqrt((w * w).sum(1))
        s[s==0.] = 1.
        w /= s.reshape((s.shape[0], 1))

        index2word=model.wv.index2word
        word2index={key: model.wv.vocab[key].index for key in model.wv.vocab}
        w_target=None
        word2index_target=None
        index2word_target=None
        context_rm_stopw=1

    elif model_type=='context2vec-skipgram':
        model_param_context,model_param_w2v=model_param_file.split('?')
        model_reader = ModelReader(model_param_context)
        w = model_reader.w
        index2word = model_reader.index2word
        word2index=model_reader.word2index
        model = model_reader.model

        model_w2v = gensim.models.Word2Vec.load(model_param_w2v)
        # NOTE(review): unlike the 'skipgram' branch, these vectors are not
        # unit-normalized, while produce_top_n_simwords assumes normalized
        # rows -- confirm whether that is intended.
        w_target=model_w2v.wv.vectors
        index2word_target=model_w2v.wv.index2word
        word2index_target={key: model_w2v.wv.vocab[key].index for key in model_w2v.wv.vocab}
        context_rm_stopw=0
    else:
        # fail with a clear message instead of a NameError further down
        print ('model type {0} not recognized'.format(model_type))
        sys.exit(1)

    w2salience=None

    #remove stop words in target word space
    print ('filter words for target....')
    w,word2index,index2word=filter_w(w,word2index,index2word)
    if index2word_target is not None:
        w_target,word2index_target,index2word_target=filter_w(w_target,word2index_target,index2word_target)

    #weight
    if weight in (LDA,INVERSE_W_FREQ,INVERSE_S_FREQ) and w2salience_f is None:
        sys.stderr.write('weight mode {0} needs a <w2salience> file\n'.format(weight))
        sys.exit(1)

    if weight==LDA:
        print ('load vectors and entropy')
        # NOTE: unpickling can execute arbitrary code; only load trusted files
        with open(w2salience_f,'rb') as f_salience:
            w2salience=pickle.load(f_salience)
    elif weight in (INVERSE_W_FREQ,INVERSE_S_FREQ):
        print ('load w2freq')
        w2salience=load_w2salience(w2salience_f,weight)

    # remove context stop words
    if int(context_rm_stopw)==1:
        print ('filter words for context....')
        # from here on `model` is a plain dict word -> vector
        model=rm_stopw_context(model)

    print (model_param_file,model_type,weight,context_rm_stopw,data,w2salience_f)


read model....
Reading config file: ../models/context2vec/model_dir/MODEL-wiki.params.14
Config:  {'config_path': '../models/context2vec/model_dir/', 'model_file': 'MODEL-wiki.14', 'deep': 'yes', 'drop_ratio': '0.0', 'words_file': 'WORDS-wiki.targets.14', 'unit': '400'}
filter words for target....


In [26]:
#read in data
# Run the Chimera evaluation when the data file sits directly inside a
# 'data-chimeras' directory. basename(dirname(...)) yields '' for a bare file
# name, where data.split('/')[-2] raised IndexError.
if os.path.basename(os.path.dirname(data))== 'data-chimeras':
    eval_chimera(data,model,model_type,n_result,w,index2word,weight,w2salience,w_target,word2index_target,index2word_target)
        

top_cluster_density
./eval_data/data-chimeras
4
canned sardines and  ___  between two slices of wholemeal bread and thinly spread flora original .
producing top 20 simwords
meatloaf: 0.5400038
sandwiches: 0.5396633
tamales: 0.538706
chicken: 0.53835166
spreads: 0.53812134
popcorn: 0.5368529
sliced: 0.53673893
mash: 0.53662497
squeezed: 0.5363477
sorbet: 0.53589183
sandwiched: 0.53515226
cooked: 0.5348494
stuffed: 0.5340092
couscous: 0.53323156
fritters: 0.5331776
sauerkraut: 0.5330202
mixtures: 0.532765
alternates: 0.5324085
pickles: 0.5319029
smothered: 0.5318118
top_cluster_density
0.1994159702048819
weight is 0.199415970205
erm ,  ___  , low fat dairy products , incidents of heart disease for those who have an olive oil rich diet .
producing top 20 simwords
dairies: 0.54627275
pharmaceuticals: 0.53947484
pharma: 0.53916913
probiotics: 0.5363117
medications: 0.5351843
freezers: 0.5344019
stocks: 0.53410417
toothpastes: 0.53363115
cigarettes: 0.53224295
antibiotics: 0.5318707
fertiliz

dryers: 2.410021397384447
heaters: 2.287674014874611
refrigerators: 2.2283185651140682
conditioners: 2.2060273912187327
dishwashers: 2.187360237146278
countertops: 2.177482950370545
washers: 2.1536678703113035
faucets: 2.143473133835979
stoves: 2.129653007026908
coolers: 2.1215162733073725
hvac: 2.0962491647500583
humidifiers: 2.0947642153382473
bathtubs: 2.08657882975143
cookers: 2.0821765788009454
freezers: 2.0783120229254655
exchangers: 2.076196342531757
lotions: 2.0741902278133315
refrigeration: 2.070755790352465
desiccant: 2.0455872291707946
heater: 2.0366521819376397
('probes', ['stove', 'microwave', 'kettle', 'cage', 'wastebin', 'leopard'])
('gold', ['2', '2.86', '1.71', '1.29', '1.29', '1.43'])
('model_predict', [0.6540423440345664, 0.4766230471553823, 0.39453497243949587, 0.28707659862069773, 0.26131489519003703, 0.17873685779156184])
spearman correlation is 0.753702346348
what a gorgeous man to work for , anna confided , as she and merrill stacked the  ___  later .
producing 

communication: 0.53766054
biomolecular: 0.5359215
telecommunication: 0.53123266
vulnerable: 0.5306935
hydronic: 0.52950984
signalling: 0.52887285
musculoskeletal: 0.52825975
redundant: 0.5273194
excretory: 0.52708215
unmanned: 0.5270074
sewage: 0.5266811
avionics: 0.52606964
telematic: 0.52559805
electronic: 0.52552056
microelectronic: 0.5251757
microelectromechanical: 0.52517474
electromechanical: 0.52451944
electrical: 0.5245029
fastening: 0.5240151
water: 0.52390796
top_cluster_density
0.16080082413759475
weight is 0.160800824138
and in  ___  went off in towns and cities nationwide almost simultaneously , leaving two people dead .
producing top 20 simwords
sloviansk: 0.5494989
pristina: 0.53434443
euromaidan: 0.5333552
sanatoriums: 0.53307927
desperation: 0.5323595
kramatorsk: 0.5313507
gjakova: 0.52725536
flames: 0.5268635
luhansk: 0.5267165
retaliation: 0.5264642
rioting: 0.52608454
retribution: 0.5260176
protest: 0.5254646
vain: 0.52530324
donbass: 0.5252157
riots: 0.524915
feriz

editorgiven1: 2.4424752150890514
editorsurname4: 2.404290819870354
laysummary: 2.3492642787893785
laydate: 2.28362183116889
publicationdate: 2.2274024553277636
editorsurname3: 2.2082984889022312
editorsurname1: 2.1871701043986844
editorsurname2: 2.18013616700515
bassoons: 2.0880302271351687
pyrams: 2.009511341965295
52-69: 1.9847324079739548
oboes: 1.982482501475976
muricidae: 1.9819544189069038
passeriformesfamily: 1.9807943009900322
florets: 1.962339309640857
pelecaniformesfamily: 1.9204965524550717
hindwings: 1.9036331262381307
authorsep: 1.9000158019482913
ugauga: 1.8902426761592022
officership: 1.8866460823467839
('probes', ['bagpipe', 'harmonica', 'whistle', 'shotgun', 'bear', 'bouquet'])
('gold', ['2.29', '2', '1.86', '1.57', '2.14', '1.86'])
('model_predict', [0.37552022762078674, 0.48450547774096503, 0.41249973568604076, 0.44769258268405965, 0.22866329515823347, 0.3077054648694734])
spearman correlation is -0.376851173174
the new corps of mcneillstown scored their first succes

producing top 20 simwords
men: 0.5306778
maidens: 0.5286087
buddhas: 0.52651554
rathas: 0.5248934
apostles: 0.5248148
sisters: 0.5244507
thieves: 0.52372116
englishmen: 0.52333766
lads: 0.52293724
lovers: 0.522708
families: 0.5226182
graces: 0.5220536
dwarves: 0.52196985
brothers: 0.5216378
gallants: 0.5212962
sages: 0.5210994
trees: 0.5205441
survivors: 0.5202488
elders: 0.51997113
strangers: 0.51983863
top_cluster_density
0.1447151748082831
weight is 0.144715174808
normalized weight: 
  [[0.25657269]
 [0.29555368]
 [0.26324719]
 [0.18462644]]
producing top 20 words for new embedding
(259235, 400)
producing top 20 simwords
passeriformesfamily: 2.3761747620931724
egrets: 2.3590897226275516
opossums: 2.3447148716473434
avocets: 2.3303920002670604
ibises: 2.315282252119032
quails: 2.260791403015749
pelecaniformesfamily: 2.2592394044058417
cormorants: 2.2567748903919447
woodpeckers: 2.255940439850131
spoonbills: 2.253546383564284
partridges: 2.2531385554554415
porcupines: 2.24027202535102

producing top 20 simwords
shark: 0.5409703
walrus: 0.53790665
dinosaur: 0.5372292
molluscan: 0.5358135
whale: 0.5347538
colossal: 0.53358984
cetacean: 0.5330972
fossilized: 0.53081346
missing: 0.5308083
giraffe: 0.53080523
seabird: 0.5291618
pointy: 0.5285864
dogtooth: 0.5284912
theropod: 0.5278289
dromaeosaurid: 0.52715206
camel: 0.52617013
tyrannosaur: 0.5259305
elephant: 0.5259155
gigantic: 0.5259083
avian: 0.52487564
top_cluster_density
0.17332945227513508
weight is 0.173329452275
here a  ___  with human legs strides across the gallery , there an antlered man dances the other animals to life .
producing top 20 simwords
man: 0.53334796
woman: 0.530745
quadruped: 0.5283987
worm: 0.5276877
broom: 0.5274533
rope: 0.5271437
creature: 0.5268686
figure: 0.52677125
monkey: 0.5261051
satyr: 0.5256517
millipede: 0.52543247
mannequin: 0.52526635
cleaver: 0.5247842
snake: 0.52470446
boy: 0.52453786
biped: 0.5243091
girl: 0.5242321
giraffe: 0.5240158
scythe: 0.5238202
clown: 0.523718
top_cluste

porcupines: 2.3356319700371646
vulpes: 2.3018358673322608
opossums: 2.2952764438777535
pronghorn: 2.255663100532687
avocets: 2.24971785287735
muntjac: 2.2463498713590786
chital: 2.234528207375371
tragopans: 2.2201569367310574
nilgai: 2.218852903774974
raccoons: 2.21562852790262
pelecaniformesfamily: 2.2150247852750216
partridges: 2.2119194209981505
peafowls: 2.203567030269009
egrets: 2.196854132618716
tapirs: 2.1924030593369856
monals: 2.188047632146362
mustela: 2.179815421757198
peafowl: 2.1723887638025365
civet: 2.1702616769725744
latrans: 2.1513787291329107
('probes', ['lion', 'horse', 'chipmunk', 'bayonet', 'raisin', 'dishwasher'])
('gold', ['3.14', '2.43', '3.14', '1.57', '2.14', '1.14'])
('model_predict', [0.5476353302830819, 0.44965660514233247, 0.4701177232109202, 0.24117667271673635, 0.30377498681675613, 0.26055143132414904])
spearman correlation is 0.927633657044
' crest ' is absolutely fantastic , a totally together  ___  groove with clear , beautiful vocals .
producing top 

clavichord: 0.5528551
spinet: 0.54128385
theorbo: 0.5401555
amati: 0.5336909
violoncello: 0.5327866
bandoneon: 0.5324573
violin: 0.53138554
pianoforte: 0.5312793
organ: 0.5308833
harpsichord: 0.52934587
lute: 0.52891797
stradivari: 0.52889144
bach: 0.52877295
cornetto: 0.5280739
vihuela: 0.5278173
violone: 0.5275036
viols: 0.52708554
manuscript: 0.526075
harpsichords: 0.5253175
piano: 0.5253172
top_cluster_density
0.28599557746845766
weight is 0.285995577468
before we provoke any lawsuits , let ' s go back to what turned rich on to playing  ___  in the first place .
producing top 20 simwords
backgammon: 0.54875344
football: 0.54602164
craps: 0.5444898
baseball: 0.54150987
poker: 0.5413007
basketball: 0.5412633
streetball: 0.54089105
soccer: 0.5407711
pinochle: 0.5391764
cards: 0.535995
games: 0.53566086
gigs: 0.53545105
hooky: 0.5352999
matches: 0.53287643
videogames: 0.5327251
billiards: 0.53242475
dominoes: 0.5318981
stickball: 0.53087705
volleyball: 0.530175
shit: 0.5299919
top_clus

oboes: 2.531372054100214
bassoons: 2.489322735944043
dulcimer: 2.458721773955629
mandolin: 2.433800483228939
clarinet: 2.418402716940932
clarinets: 2.4034574024655986
xylophone: 2.3947891630904685
12-string: 2.3783461714125576
vibraphone: 2.3773585240915462
zither: 2.3771964638541245
continuo: 2.3720768284847464
harpsichord: 2.3697555384095237
tabla: 2.353700088750832
glockenspiel: 2.350871939856319
timpani: 2.3505266433648826
sitar: 2.3428559294547537
bouzouki: 2.3347832038770893
stringed: 2.322478645008465
trombones: 2.3106458076001095
accordion: 2.3101332231608716
('probes', ['harpsichord', 'harmonica', 'peg', 'sled', 'shed', 'bedroom'])
('gold', ['3.71', '2.29', '1.71', '2.71', '1.86', '1.43'])
('model_predict', [0.7224300144681497, 0.7579631710447707, 0.28472928933712804, 0.29616972074571074, 0.17275403358225339, 0.3073269940788801])
spearman correlation is 0.371428571429
you switch the  ___  on because you have faith it will boil the water for your tea .
producing top 20 simwords

producing top 20 simwords
milkshake: 0.53799033
doughnut: 0.53342235
splash: 0.5328983
compost: 0.52775425
drip: 0.5258548
toothbrush: 0.52578723
sponge: 0.52577245
tarpaulin: 0.5255126
manger: 0.52533823
stove: 0.5252009
saucepan: 0.5244172
crock: 0.5241811
teapot: 0.52373284
tap: 0.52304214
terrarium: 0.52269244
fridge: 0.52254647
bouquet: 0.52194595
leek: 0.52173245
banana: 0.5213072
bottle: 0.52096415
top_cluster_density
0.15892774700687795
weight is 0.158927747007
normalized weight: 
  [[0.3449084 ]
 [0.22484211]
 [0.21490413]
 [0.21534536]]
producing top 20 words for new embedding
(259235, 400)
producing top 20 simwords
marinated: 2.340688703742164
caramelized: 2.3113204118507142
starches: 2.296714409582595
thickener: 2.2877735865079782
broth: 2.2829205836265096
rinsed: 2.274032451727532
yolks: 2.2683109386711093
hydrogenated: 2.267957115327923
shallots: 2.2555655505555525
sauteed: 2.2455453865148924
cornstarch: 2.240449782821498
marinade: 2.2403384072993204
kneaded: 2.2369749862

producing top 20 simwords
braids: 0.54516995
festoons: 0.54344755
tassels: 0.5374455
buckles: 0.53676164
handkerchiefs: 0.53366935
parasols: 0.5327432
bows: 0.5326219
bonnets: 0.5317746
leggings: 0.5315585
incisions: 0.53093725
ruffles: 0.52997005
horseshoes: 0.52985734
epaulets: 0.5294918
drapery: 0.5287507
toggles: 0.5282592
cuffs: 0.5274453
beads: 0.5265498
bandanas: 0.5264929
breastplates: 0.5262997
pleats: 0.5262542
top_cluster_density
0.2416123025841026
weight is 0.241612302584
" you ' d better come back in the house and dry your shoes  ___  ," said betty .
producing top 20 simwords
": 0.51874864
--: 0.51505446
/: 0.5142584
,: 0.5123305
?: 0.5093929
<: 0.507694
o'er: 0.5074109
til: 0.5069648
fro: 0.50643224
;: 0.506034
afore: 0.50506765
': 0.5050249
mr: 0.50501823
...: 0.50467914
maw: 0.5033746
mrs.: 0.50318575
ye: 0.5031509
sinn: 0.50312537
w/: 0.5030024
a-: 0.5022748
'"'
'/'
','
'?'
'<'
"o'er"
';'
"'"
'...'
'mrs.'
'w/'
top_cluster_density
0.13894250416505038
weight is 0.1389425

producing top 20 simwords
biped: 0.53723687
godsend: 0.5364061
quadruped: 0.53562254
mouthful: 0.53492534
hallucination: 0.5349221
hermaphrodite: 0.5340009
cocoon: 0.529384
deformity: 0.5291218
trifle: 0.52766174
corpse: 0.5267489
sleepwalker: 0.52667856
mirage: 0.52640396
parallelogram: 0.52551156
weakling: 0.5247217
crock: 0.52468413
birthmark: 0.524062
chameleon: 0.52371466
blemish: 0.5237118
vase: 0.52362764
raindrop: 0.5235328
top_cluster_density
0.14077597198570782
weight is 0.140775971986
the owner will be summonsed to court for illegal possession of a  ___  without a permit .
producing top 20 simwords
firearm: 0.5486134
handgun: 0.5409034
property: 0.5347436
license: 0.53364676
parcel: 0.5327471
lease: 0.53164345
conveyance: 0.53013253
stamp: 0.5290181
permit: 0.5278925
dwelling: 0.5272455
pawnshop: 0.52714735
pistol: 0.526424
licence: 0.52641803
fishpond: 0.5255298
casino: 0.52540857
bottle: 0.52448446
padlock: 0.52238405
motel: 0.522357
fence: 0.5216662
vehicle: 0.5215521
top

passeriformesfamily: 2.251264243584883
pelecaniformesfamily: 2.2154979195611855
editorgiven1: 2.1853712803795795
ulidiid: 2.1842248586076263
editorsurname4: 2.1648215240439943
langued: 2.1379011490870594
coraciiformesfamily: 2.125760598622776
issuant: 2.1022981947255146
gules: 2.095227771226434
laysummary: 2.0930573707041904
mordellidae: 2.070484832154288
accipitridae: 2.068786797424574
outstretched: 2.059907928728872
ifeq: 2.0580130060643227
hindwings: 2.0422532411233005
gastropod: 2.0375579146733074
mollusk: 2.033159240793382
charadriiformesfamily: 2.0266710201990406
beaks: 2.018393284642305
pterostichinae: 2.0139057975717445
('probes', ['duck', 'swan', 'jet', 'platypus', 'pig', 'armour'])
('gold', ['3.86', '3.29', '1.57', '2.29', '2.57', '2.14'])
('model_predict', [0.3647170970634007, 0.330947857552631, 0.28169567700022946, 0.34591170914971764, 0.40913316476234396, 0.32610581054453913])
spearman correlation is 0.714285714286
the voice stopped its feeble calling and all that could be

producing top 20 simwords
mound: 0.53586787
manger: 0.5339325
canopy: 0.5309436
roof: 0.5301752
worker: 0.5301042
awning: 0.5279408
shelter: 0.5275435
keeper: 0.5272198
dweller: 0.5260268
picker: 0.5255936
termite: 0.52497625
beetle: 0.52347076
fire: 0.52301663
warmer: 0.52265877
climber: 0.52234626
borer: 0.5221836
baby: 0.5219254
harvester: 0.5215212
structure: 0.52133584
sparrow: 0.521101
top_cluster_density
0.11968234913649217
weight is 0.119682349136
i might find the odd  ___  fight surviving in lahore or somewhere in pakistan , they thought , but not in delhi .
producing top 20 simwords
gang: 0.5282021
brotherly: 0.5275843
guy: 0.5272943
man: 0.52712077
boy: 0.52710783
pillow: 0.52284175
girl: 0.5226517
parsi: 0.5216736
pencil: 0.52112305
dog: 0.5204239
lucky: 0.51961035
triad: 0.5189848
gangster: 0.5189351
warrior: 0.5188205
tiger: 0.51865375
guys: 0.51839954
bloody: 0.5183175
little: 0.51776725
numbers: 0.51760334
dalit: 0.5173768
top_cluster_density
0.13320104203002925
weight 

2014: 0.53673404
2013-14: 0.53262246
2012/13: 0.5310311
2015: 0.52825135
2013: 0.52631307
2014/15: 0.5253913
sightseers: 0.5248404
2016: 0.52291745
2013-2014: 0.521359
filming: 0.5211616
2011-12: 0.5206322
2012-2013: 0.5204453
2010/11: 0.5192107
2018: 0.51899374
shipping: 0.51898444
2012: 0.517897
angling: 0.5177409
fishing: 0.517508
mooring: 0.5163975
paragliding: 0.5162269
'2012/13'
'2014/15'
'2010/11'
top_cluster_density
0.17476082131546236
weight is 0.174760821315
designed by dave thomas , close to the sea , surrounded by pine trees , and home  ___  .
producing top 20 simwords
grown: 0.5357546
waters: 0.53547376
turf: 0.5335696
gardens: 0.5331803
shrubbery: 0.5321987
driftwood: 0.53144306
furnishings: 0.5314018
greenery: 0.52993333
decor: 0.5295919
breezes: 0.52818006
dews: 0.5281557
improvement: 0.5280017
thereto: 0.5257597
clearings: 0.52469474
safely: 0.5237305
soil: 0.5229867
ground: 0.52047586
wood: 0.5204436
green: 0.52013254
hay: 0.51956534
top_cluster_density
0.126551346552

publicationdate: 2.216468799288913
editorsurname4: 2.1890576533221244
editorsurname1: 2.1873541778096675
editorgiven1: 2.1870836621440946
passeriformesfamily: 2.1789369981787003
editorsurname2: 2.1238148065349804
editorsurname3: 2.083790078745666
ifeq: 1.9978972287000234
laysummary: 1.9946268612855333
pyrams: 1.9715636629873194
mollusk: 1.9624669219539557
pelecaniformesfamily: 1.9403285599147464
muricidae: 1.9324410953078213
coraciiformesfamily: 1.9292923603243832
shawls: 1.9280264115458814
laydate: 1.9262930639306508
publicationplace: 1.9154697389203532
gruiformesfamily: 1.892795781300926
avocets: 1.8874936978286718
fasciolariidae: 1.8854998679635642
('probes', ['harpsichord', 'trumpet', 'typewriter', 'penguin', 'olive', 'lettuce'])
('gold', ['2.57', '3.43', '2.71', '1.43', '2.57', '1.43'])
('model_predict', [0.3790380043162751, 0.36920759147912624, 0.4601407690488438, 0.2909464533054912, 0.3590490006249183, 0.3885553443079017])
spearman correlation is 0.264820448851
such players , wh

producing top 20 simwords
moments: 0.53292984
awhile: 0.53284234
smiling: 0.53216445
brushing: 0.5282487
shaving: 0.52745426
bloodstains: 0.5260167
hours: 0.5259084
eons: 0.5257551
handshakes: 0.5255906
exercise: 0.52481544
gunshots: 0.5239085
keepsakes: 0.5232896
defecating: 0.52312124
wrinkles: 0.5229239
months: 0.5225495
fingerprints: 0.522404
snoring: 0.5214543
pencils: 0.5214176
breath: 0.5210001
prick: 0.5204974
top_cluster_density
0.13028529363848132
weight is 0.130285293638
teddy disassociates himself from her and eats lots of  ___  almost with his mouth open , just to show how broad - minded he is .
producing top 20 simwords
cupcakes: 0.5549164
confetti: 0.5516241
bubblegum: 0.54959536
candy: 0.54756606
pastries: 0.5451172
pancakes: 0.5450439
oranges: 0.54461503
bananas: 0.54429996
sandwiches: 0.5442359
waffles: 0.54211307
cake: 0.5408388
raspberries: 0.5407064
toothpaste: 0.5401691
pills: 0.5397608
sweets: 0.5397154
strawberries: 0.5397145
candies: 0.5394251
stuff: 0.5391239


motorboat: 2.245396125954585
monoplane: 2.170039325230026
sloop: 2.1384615938711913
biplane: 2.1332136438423985
floatplane: 2.1324364502295525
hulled: 2.130332547481453
seaplane: 2.103196773903081
viic: 2.0964219167784934
sloops: 2.072632758835079
schooner: 2.068450706060528
seaplanes: 2.0666971622057764
masted: 2.0589867517440963
trawler: 2.057014321850693
tailwheel: 2.0391319760134
dinghy: 2.0376359039477663
lifeboats: 2.029526994480715
sternwheeler: 2.0264868703471537
seats-in-tandem: 2.0211584445303328
pby: 2.013729166349572
dc-3: 2.0127765210566326
('probes', ['jet', 'yacht', 'hawk', 'stork', 'corkscrew', 'nightgown'])
('gold', ['4', '2.86', '3.29', '2.29', '1.71', '3'])
('model_predict', [0.4559584107086978, 0.6026869170090967, 0.3241692745782489, 0.3354811535416165, 0.32457654405290226, 0.36421704541878736])
spearman correlation is 0.2
i just caught the last bit of a message at the tube: no hand luggage allowed  ___  .
producing top 20 simwords
egress: 0.5309149
us: 0.52817214
u

j: 0.5259543
r: 0.52469385
,: 0.5236489
8d: 0.52313584
beare: 0.52075845
2/6: 0.51880944
': 0.5181411
ridgeway: 0.5180383
kev: 0.51774156
northey: 0.5170627
n: 0.5169792
3/5: 0.51647544
shilling: 0.5158302
xxii: 0.51488054
w: 0.51480913
c: 0.5146456
tre: 0.51406264
4/5: 0.5138696
pritchett: 0.5136812
chas: 0.513232
','
'2/6'
"'"
'3/5'
'4/5'
top_cluster_density
0.13190228200721907
weight is 0.131902282007
certain foods cause more problems than others , for example , dry crumbly consistencies may cause more trouble for someone than mashed  ___  .
producing top 20 simwords
potatoes: 0.55949223
potato: 0.5507527
almonds: 0.5449818
watermelons: 0.5449456
onions: 0.5423523
spinach: 0.540713
vegetables: 0.5386165
tomatoes: 0.53860205
apricots: 0.53837025
pumpkin: 0.53771913
chickpeas: 0.5376939
bananas: 0.5357532
onion: 0.5356982
plantains: 0.5350492
peanuts: 0.53478175
tomato: 0.53447104
cornmeal: 0.53440005
marshmallows: 0.5342846
chicken: 0.53407115
shallots: 0.5337418
top_cluster_density


madoguchi: 2.1418392522350884
fireplace: 2.130359914470548
smokehouse: 2.1276737847849163
publicationdate: 2.1039956797083423
kitchenette: 2.088363553875952
cupboards: 2.066502611473717
editorgiven1: 2.0605272762202516
editorsurname4: 2.051317516739811
pantry: 2.0422041168667273
cupboard: 2.0045608534708554
editorsurname1: 2.0034703517161576
editorsurname2: 1.9987138835113347
fireplaces: 1.9983303698386425
bedrooms: 1.9910183420670018
restrooms: 1.9840527070750245
porch: 1.9654378146664238
kitchen: 1.9634752337468493
outbuilding: 1.960008665216317
cafeteria: 1.9584683728769672
bunks: 1.9470487633507227
('probes', ['cupboard', 'basement', 'mixer', 'dishwasher', 'ladle', 'boat'])
('gold', ['2.43', '2', '2.14', '2.14', '1.43', '1.43'])
('model_predict', [0.6726585179196212, 0.615988455201036, 0.4373667853061449, 0.5601501209736163, 0.4184029870157195, 0.3135685262027469])
spearman correlation is 0.794461346554
once broached , the contents of a pot should be stored in the  ___  and quickly

producing top 20 simwords
refrigerant: 0.53703487
ducting: 0.52469814
sorbent: 0.52412164
metering: 0.5240125
refrigeration: 0.5230675
hydrocarbon: 0.5230194
treatment: 0.5220217
air-: 0.5218441
chiller: 0.52157795
cooling: 0.51991683
gases: 0.51940393
radiators: 0.5192813
water-: 0.51907617
transmission: 0.5189163
compressors: 0.518701
cogeneration: 0.5185624
scrubber: 0.5183299
cleaning: 0.51785207
refrigerants: 0.5175572
reprocessing: 0.5173808
top_cluster_density
0.20806241762985067
weight is 0.20806241763
dolce e gabbana were caught raking through marie antoinette ' s  ___  for their collection presented in milan .
producing top 20 simwords
boutiques: 0.5467185
bra: 0.5384395
d'oro: 0.5367244
fashions: 0.5322758
boxes: 0.53224576
chandelier: 0.5321732
cabinets: 0.5281826
gallery: 0.5277793
knitwear: 0.5277682
cookies: 0.52764976
hats: 0.52739483
wardrobe: 0.5270737
rezzonico: 0.52705073
outfits: 0.52662873
jewellery: 0.52627873
missoni: 0.52626204
roses: 0.5259873
': 0.5259259
wat

melody: 0.5426544
music: 0.54092616
vibe: 0.53682905
songwriting: 0.536544
sound: 0.53465
instrumentation: 0.53428805
ambiance: 0.5340725
feeling: 0.53354484
atmosphere: 0.53145784
groove: 0.5314048
song: 0.5308088
voice: 0.53029156
chorus: 0.5298525
rhythm: 0.52868956
piano: 0.52815384
riff: 0.52806544
album: 0.5272809
beat: 0.5271719
craziness: 0.5267798
tune: 0.5266911
top_cluster_density
0.19871075334294375
weight is 0.198710753343
we have three orchestras , several choirs , bands , an  ___  quartet and a pipe band .
producing top 20 simwords
accordion: 0.5304456
organ: 0.52761143
satb: 0.5255333
orchestral: 0.52434516
acoustic: 0.52389675
improvisational: 0.52292717
acoustical: 0.51956207
oboe: 0.5194894
overtone: 0.519471
ensemble: 0.5186383
acrobatic: 0.5173138
improvisatory: 0.5170337
electric: 0.51638484
opera: 0.51637673
improvising: 0.51574576
instrument: 0.5152881
astonishing: 0.5145552
eclectic: 0.51262915
art: 0.5122502
improv: 0.5120924
top_cluster_density
0.175963679289

bassoons: 2.6134975186809863
clarinet: 2.57451584952482
oboes: 2.538609948000839
trombone: 2.471234776780502
clarinets: 2.4617279081437458
saxophone: 2.4358674816759875
flugelhorn: 2.3862454278188068
trombones: 2.385534792417602
bassoon: 2.3723369658258644
mandolin: 2.361525205710671
cello: 2.334897351455667
violin: 2.3346460727825598
vibraphone: 2.332665361905505
harpsichord: 2.3249121617181707
continuo: 2.3206125255017187
oboe: 2.313657184727103
contrabass: 2.300408501417827
harmonica: 2.297965255108444
flute: 2.2920347885298846
contrabassoon: 2.290433840237538
('probes', ['clarinet', 'harp', 'buckle', 'wrench', 'urn', 'mackerel'])
('gold', ['2.14', '2.57', '1.43', '1.86', '1.43', '1.29'])
('model_predict', [0.8146235868623317, 0.6780158617661582, 0.2985719179281991, 0.3532449433363837, 0.24064757123879227, 0.3088110703417242])
spearman correlation is 0.753702346348
i doubt if we ' ll ever hear a man play a  ___  like that again .
producing top 20 simwords
song: 0.5454898
ballad: 0.5

producing top 20 simwords
kerchief: 0.5623294
scarf: 0.55413365
shawl: 0.5470062
headband: 0.54660535
bandana: 0.544902
bandanna: 0.5437385
skirt: 0.5429467
handkerchief: 0.5420273
bandage: 0.54029715
cravat: 0.5401544
bandolier: 0.53972304
loincloth: 0.5396775
scalpel: 0.5384173
gorget: 0.53735316
wrap: 0.53709066
ring: 0.53694904
collar: 0.536693
blanket: 0.5350109
towel: 0.5349316
mortarboard: 0.5348043
top_cluster_density
0.24666467860423644
weight is 0.246664678604
it was like the skirt just unraveled and the hole got longer and wider till my  ___  could no longer hide it .
producing top 20 simwords
feet: 0.54042995
knees: 0.5404205
fingers: 0.5401324
armpits: 0.539904
legs: 0.53902566
bosom: 0.5380939
waist: 0.537932
lips: 0.5371786
toes: 0.5353347
shoulders: 0.5346929
nose: 0.53427637
finger: 0.5330684
body: 0.53250766
eyelashes: 0.532233
hands: 0.5322243
head: 0.5321494
haunches: 0.5305663
wrists: 0.52951086
neck: 0.5294994
navel: 0.5292811
top_cluster_density
0.232921357657216

producing top 20 simwords
car: 0.5465626
bike: 0.54331213
stroller: 0.53664285
suitcase: 0.53623617
slippers: 0.53616256
belongings: 0.53583777
earring: 0.5339724
passports: 0.5325579
bags: 0.5321564
money: 0.5319383
passport: 0.5311762
lorries: 0.53110814
bicycle: 0.5308502
vehicle: 0.53078926
dentures: 0.53074896
shoes: 0.53069603
tractor: 0.5303219
horse: 0.5302454
towel: 0.5297857
paperwork: 0.529688
top_cluster_density
0.15982281124178122
weight is 0.159822811242
the programme is extensive and includes the development of turf pitches ,  ___  parks , sports facilities and a learning centre .
producing top 20 simwords
recreational: 0.53038645
car: 0.5299021
landscaped: 0.5289235
indoor: 0.528188
skate: 0.52673733
sports: 0.5267297
sporting: 0.52644026
i.t.: 0.5250486
floodlit: 0.5247052
astroturf: 0.52395743
outdoor: 0.5232236
amusement: 0.52184504
recreation: 0.5217928
athletic: 0.52173615
grassed: 0.5211752
sport: 0.5204319
pitch: 0.52036405
industrial: 0.5195584
jogging: 0.518445

producing top 20 simwords
communal: 0.52991784
conveyance: 0.52674097
policing: 0.52613753
photocopying: 0.52574337
recreational: 0.52551985
detention: 0.52515996
educational: 0.5249629
accommodation: 0.5246975
lodging: 0.5235676
daycare: 0.5231817
social: 0.52163184
administrative: 0.5214623
residential: 0.52142197
hostel: 0.5212533
polling: 0.5207461
transport: 0.5206977
inspection: 0.5202408
maintenance: 0.5195734
centralised: 0.51952153
ambulance: 0.5193909
top_cluster_density
0.13754792033829555
weight is 0.137547920338
" and every time the pose was over they ' d be off on their  ___  ," broods miss stetson .
producing top 20 simwords
foreheads: 0.5399443
knees: 0.5384797
shoulders: 0.53779274
lips: 0.53729516
gazes: 0.53483087
honeymoon: 0.5345844
respective: 0.5303446
neck: 0.5302547
sister: 0.52936214
backside: 0.5286687
bellies: 0.52808124
ankles: 0.5280273
toes: 0.52730095
unwrapped: 0.5264899
noses: 0.5262379
scent: 0.5260882
nipples: 0.52534366
thighs: 0.5252578
whining: 0.

coriander: 2.896593044269893
marinated: 2.8637902176657732
shallots: 2.8548075934936623
cilantro: 2.8500124839019576
chickpeas: 2.8033334934698586
stewed: 2.7964237755626415
salads: 2.788069798409824
soups: 2.7172074355416664
braised: 2.713181609705281
sauces: 2.6780727180319825
lentils: 2.6559431225944685
garlic: 2.6553580442795273
minced: 2.652805085514433
grated: 2.6359622829066325
chilies: 2.632853930131586
onions: 2.6296900293698853
seasonings: 2.6184614043716383
pickled: 2.616967872927814
diced: 2.6165867949439985
toppings: 2.6135940014301955
('probes', ['celery', 'radish', 'grape', 'salamander', 'budgie', 'pot'])
('gold', ['2.86', '1.71', '2.14', '1.29', '1.57', '1.71'])
('model_predict', [0.7622439788747, 0.6971799718780716, 0.5202367810867687, 0.2465826533349386, 0.25040046030473173, 0.47301887555427446])
spearman correlation is 0.898645105261
motor cycles  ___  , clothes , records , and football expenses accounted fur most of the new disposable income .
producing top 20 simwo

producing top 20 simwords
garlic: 0.53850776
mushrooms: 0.53731406
carrots: 0.536721
leeks: 0.5345755
lemongrass: 0.53417045
tarragon: 0.5338888
carrot: 0.5328887
vegetables: 0.53265184
nuts: 0.5320344
juices: 0.53117996
apricots: 0.52973217
fruits: 0.529469
spinach: 0.5283152
chives: 0.5282141
tarts: 0.52798086
berries: 0.5277956
cucumbers: 0.52775115
peanuts: 0.5270896
curds: 0.52689195
savory: 0.52671677
top_cluster_density
0.28557490194585944
weight is 0.285574901946
she spoke for the first time since luncheon began: " darling , your  ___  , please ."
producing top 20 simwords
missus: 0.5496322
grandma: 0.54244524
grandmother: 0.540969
lady: 0.5402515
friend: 0.5394533
neighbour: 0.5392449
mom: 0.5387982
excellency: 0.5379468
sister: 0.5376678
darling: 0.53714675
mother: 0.53697574
comrade: 0.5353193
highness: 0.53504443
housekeeper: 0.5346267
aunt: 0.5342091
mum: 0.5341094
bride: 0.5337798
mistress: 0.53364956
granny: 0.5333984
baby: 0.533073
top_cluster_density
0.2134828691628120

shoe: 0.54450274
head: 0.542749
kerchief: 0.5426387
tongue: 0.54138213
tail: 0.5398369
arm: 0.53819895
finger: 0.5376176
towel: 0.5375366
hands: 0.53691983
foot: 0.5366882
sword: 0.5366115
trousers: 0.53546596
hair: 0.53484875
shirt: 0.53481364
hat: 0.53474545
truncheon: 0.5344806
legs: 0.53431773
hand: 0.53418046
neck: 0.5341212
cloak: 0.5341148
top_cluster_density
0.18583677143557864
weight is 0.185836771436
so be a good boy and remove the masking tape from your hamster ' s mouth and zip up your  ___  ."
producing top 20 simwords
mouth: 0.5474518
pet: 0.5418029
pacifier: 0.5401725
ear: 0.53959334
doggy: 0.53738785
head: 0.53692204
penis: 0.5366676
dog: 0.53621835
toothpaste: 0.53560036
hair: 0.53545976
nose: 0.5353277
fave: 0.53510785
smelly: 0.5349054
tummy: 0.53419185
pants: 0.53392047
crotch: 0.5320001
neck: 0.5312207
arse: 0.5311342
toothbrush: 0.5307515
popcorn: 0.5307455
top_cluster_density
0.1766956349359449
weight is 0.176695634936
" and do you usually put your lunch on the f

editorgiven1: 2.472670260738928
publicationdate: 2.4540108771768083
editorsurname4: 2.4407585799847795
editorsurname1: 2.3361219010963694
laysummary: 2.307763667855143
editorsurname3: 2.288980741514702
editorsurname2: 2.272580420099499
laydate: 2.190375763220705
pelecaniformesfamily: 2.1511713304465894
publicationplace: 2.1488177953396694
pyrams: 2.1114361078135757
2014-01-01: 2.0652565341359885
hindwings: 2.0351817701911243
passeriformesfamily: 2.015131444400168
blouses: 2.0057370205792213
upperparts: 1.9931738192455648
underparts: 1.9853196971578864
includedworkurl: 1.9813313649128312
waistcoat: 1.9730596584053912
ifeq: 1.9706417016765052
('probes', ['pants', 'shawl', 'cape', 'curtain', 'pajama', 'cart'])
('gold', ['1.86', '1.43', '2.71', '1.86', '2.86', '1.86'])
('model_predict', [0.5216887483543514, 0.4596732657633624, 0.19114367890765951, 0.3495397341392875, 0.39317760092370124, 0.2989514681627772])
spearman correlation is -0.394664881467
protective clothing such as heavy boots , 

ailerons: 0.5391577
strakes: 0.53001994
apertures: 0.52800786
ribs: 0.52455306
embrasures: 0.523977
diagonals: 0.5233969
spars: 0.5231733
struts: 0.5230566
shutters: 0.52249545
radials: 0.5217509
screws: 0.52114266
airbrakes: 0.5211122
rotors: 0.5209364
prongs: 0.5208318
cantilevers: 0.52082056
gaps: 0.52047867
hemispheres: 0.52026075
trousers: 0.520245
scallops: 0.5196915
planks: 0.5196799
top_cluster_density
0.20095583273340467
weight is 0.200955832733
when i play a note on one of them , the corresponding string on the other  ___  will vibrate as well .
producing top 20 simwords
side: 0.53909445
string: 0.5334441
finger: 0.53089654
strings: 0.5254597
button: 0.5234871
chords: 0.521268
keyboard: 0.52095103
limb: 0.5202771
surface: 0.52025986
piano: 0.51973045
piece: 0.5190938
organ: 0.5185512
edge: 0.5185395
hand: 0.5177841
element: 0.51770985
plate: 0.51726425
instrument: 0.5169064
wall: 0.51670027
end: 0.51630074
frets: 0.51549745
top_cluster_density
0.14107210945453055
weight is 0.

oboes: 2.916058624968583
bassoons: 2.7982954182182884
continuo: 2.661226388314362
arpeggios: 2.600638692694604
timpani: 2.5745151450308663
octaves: 2.5671849893604937
ostinato: 2.566152197577976
clarinets: 2.5185203274432943
trombones: 2.5176247450191735
contrabassoon: 2.4743032012989
violas: 2.461998259891959
arpeggiated: 2.4591294239709307
semitone: 2.451495093418353
diatonic: 2.435783664160542
semitones: 2.4311553257707574
cellos: 2.425580336292363
pizzicato: 2.388004782589022
pentatonic: 2.385594278893097
trills: 2.380971020148643
glockenspiel: 2.3610524243740687
('probes', ['harp', 'drum', 'racquet', 'colander', 'rocket', 'radish'])
('gold', ['2.57', '2.43', '1.57', '1.57', '1.29', '2.43'])
('model_predict', [0.6121466703074128, 0.5592479809790303, 0.32305149442572556, 0.32682428870225066, 0.18396914789720195, 0.27144051234392563])
spearman correlation is 0.735612357921
in 1762, when the fairground op�ra - comique merged with the com�die - ltalienne , there were  ___  and six wind

dumpsters: 0.50452495
parcels: 0.50402856
piles: 0.50055945
funds: 0.50044936
<EOS>: 0.5
increments: 0.49993455
boxes: 0.4997438
whatever: 0.49895996
supplies: 0.49878496
amounts: 0.49853414
balloons: 0.49736387
monies: 0.49709052
takings: 0.4970781
orders: 0.49687576
flags: 0.49615863
tempers: 0.4957739
sandbags: 0.49568504
rates: 0.49561462
torches: 0.49557295
'<EOS>'
top_cluster_density
0.11701458489115309
weight is 0.117014584891
kit contains double planked hull ,  ___  , four sheets of detailed brass etched parts .
producing top 20 simwords
unpainted: 0.5407104
openwork: 0.5372412
mahogany: 0.5357572
fretwork: 0.53511804
weatherboard: 0.5340117
weatherboards: 0.53399026
brass: 0.53356045
copper: 0.53352505
metal: 0.5325704
rectangular: 0.5324907
tin: 0.53246915
rivets: 0.53227764
vellum: 0.53209054
ironstone: 0.5315451
slate: 0.5315092
riveted: 0.5312549
varnished: 0.53122056
timber: 0.53121346
planked: 0.5307219
enamel: 0.53070617
top_cluster_density
0.21190714112918574
weight is

wheel-drive: 2.0918545672774993
tailskid: 2.0889236917978047
axles: 2.0341348202052183
bogies: 2.029094355496218
tailwheel: 2.028264296461519
5-speed: 2.0218104379876465
and-ride: 2.0167092244259854
4-door: 2.0043995654435287
seats-in-side-by-side: 2.004370183673987
jeepneys: 1.9956084043174327
axle: 1.9897768243858016
wheelbase: 1.987651202117811
0px: 1.9845399183649728
4-speed: 1.9778765003346654
seats-in-tandem: 1.9655473534773877
2-door: 1.9639869919495312
4-wheel: 1.9589978418126783
trainlink: 1.9543434528578485
rickshaws: 1.9522317747453424
undercarriage: 1.9521165009515842
('probes', ['skateboard', 'jeep', 'train', 'fridge', 'shed', 'parakeet'])
('gold', ['2.43', '2.14', '2.43', '2', '2.29', '2.57'])
('model_predict', [0.4448011387320102, 0.5982218591322264, 0.5274982880371895, 0.3893691505843725, 0.4079000477942212, 0.19249520983110993])
spearman correlation is -0.231908414261
purpleloans - interest only mortgages purpleloan loans can be used for almost anything - home improvem

embers: 0.5387981
sandbags: 0.53448206
icicles: 0.53165686
flames: 0.5300545
smoke: 0.52854997
bricks: 0.52739805
boulders: 0.5265109
cinders: 0.5259305
rain: 0.52569735
karma: 0.5251545
snow: 0.52514744
fires: 0.52469194
shards: 0.52350086
boils: 0.5233945
debris: 0.5232757
chaos: 0.52244717
candles: 0.5221176
manholes: 0.5219688
blinds: 0.5214318
cracks: 0.5211663
top_cluster_density
0.1662294923551677
weight is 0.166229492355
the larvae , many of which resemble  ___  , are herbivorous , feeding on leaves and other plant matter .
producing top 20 simwords
termites: 0.55269736
weevils: 0.5474207
isopods: 0.5472116
mantids: 0.5469859
mealybugs: 0.54663485
beetles: 0.5460508
insects: 0.54591495
sawflies: 0.5450718
ladybirds: 0.5440427
earwigs: 0.5419098
aphids: 0.5412015
katydids: 0.5405034
cicadas: 0.54014367
ants: 0.5398071
slugs: 0.53868556
woodlice: 0.53867406
caddisfly: 0.53846186
millipedes: 0.5374893
frogs: 0.53719664
dandelions: 0.53619164
top_cluster_density
0.32755574274893995

editorgiven1: 2.0972191474951125
editorsurname4: 2.071436699471288
pelecaniformesfamily: 2.0371710837909163
passeriformesfamily: 2.035309720875369
editorsurname3: 1.9863238810133397
laysummary: 1.9647507777574722
mollusk: 1.9624867569759386
editorsurname2: 1.9085987523860934
editorsurname1: 1.9053615839290066
coraciiformesfamily: 1.888048573719637
laydate: 1.88603116747923
nudibranch: 1.8852950996219735
strigiformesfamily: 1.88419063202977
publicationdate: 1.883591716332119
gastropod: 1.8758926646747383
charadriiformesfamily: 1.8706887820978153
2014-01-01: 1.860980800462308
ulidiid: 1.8515194027290724
orangutan: 1.8498021640511741
gruiformesfamily: 1.841627526162161
('probes', ['crocodile', 'iguana', 'gorilla', 'banner', 'buzzard', 'shovel'])
('gold', ['2.71', '3.71', '2', '1.71', '2.14', '2.43'])
('model_predict', [0.5367838475909263, 0.4318572062738054, 0.4903225218969131, 0.2564149854888257, 0.35918495510300985, 0.3850169414341177])
spearman correlation is 0.542857142857
it is one o

brass: 0.57184285
metal: 0.5586899
pipe: 0.54414415
steel: 0.54290366
rock: 0.5423645
samba: 0.53745437
steelpan: 0.53691113
string: 0.5348301
rubber: 0.53319496
horn: 0.53206617
tambura: 0.5313151
marimba: 0.53011614
charanga: 0.52965456
mathcore: 0.5264964
jam: 0.5259941
reggae: 0.52525973
ragga: 0.52424985
dhol: 0.5241828
fado: 0.5232167
afrobeat: 0.52285767
top_cluster_density
0.19262728806290352
weight is 0.192627288063
consequently , the  ___  pupil has a high musical intelligence and others have a facility with numbers and dates .
producing top 20 simwords
matric: 0.5346823
blind: 0.528029
prospective: 0.52619344
sslc: 0.5248936
galileo: 0.52459264
dart: 0.5242757
k5: 0.5227051
minim: 0.52259636
topmost: 0.52259207
sighted: 0.52228576
apt: 0.5209631
boarding: 0.52089816
entrance: 0.5207595
p7: 0.52075934
brighter: 0.51994157
deaf: 0.5193602
d6: 0.5190447
examination: 0.51882035
s1: 0.5184947
vce: 0.517994
top_cluster_density
0.10070387048047635
weight is 0.10070387048
it is a tu

thickets: 2.286656848712693
passeriformesfamily: 2.268417443276901
sclerophyll: 2.237521266493241
shrubland: 2.2269777935514723
seagrass: 2.21515325807111
pelecaniformesfamily: 2.2137063542417548
sedges: 2.211349775198402
avocets: 2.183851048720441
shrubby: 2.1789287443739354
shrublands: 2.1742060975143205
mangroves: 2.1619996489325946
eucalypt: 2.15758532353022
roadsides: 2.1573143603014038
whelks: 2.157020654679256
dipterocarp: 2.14420988160901
montane: 2.1421670260647185
understory: 2.1261307213338814
savannas: 2.1234397732179033
vaccinium: 2.1217933498980615
crappie: 2.1168811390690587
('probes', ['crocodile', 'iguana', 'gorilla', 'banner', 'buzzard', 'shovel'])
('gold', ['4', '2.57', '3.57', '1.71', '2.29', '1.43'])
('model_predict', [0.5152629947274957, 0.42824046909417984, 0.4081201152331042, 0.2045151224930325, 0.3761158143231757, 0.39410909840468006])
spearman correlation is 0.771428571429
he never got himself an a  ___  so that he would often have to transpose down a semitone

producing top 20 simwords
instrument: 0.5304002
organ: 0.5298997
instrumentation: 0.527622
pedal: 0.5263692
instruments: 0.5262388
training: 0.5244481
functioning: 0.5234674
music: 0.52337235
work: 0.52111053
flute: 0.52031195
anatomy: 0.52008253
operation: 0.5187718
technique: 0.5180018
workshop: 0.5179768
tuning: 0.5170567
equipment: 0.5167454
rudiments: 0.5160417
timpani: 0.5157255
choir: 0.5154243
transposition: 0.5153846
top_cluster_density
0.14045128233231427
weight is 0.140451282332
here  ___  are placed above the oboes and would thus give a clear brilliant " tang " to the chord .
producing top 20 simwords
trumpets: 0.5333894
strings: 0.5248232
chords: 0.52438587
notes: 0.5241466
volutes: 0.5232438
sixths: 0.522915
trills: 0.522349
horns: 0.52225304
cymbals: 0.519197
cadenzas: 0.5179724
arabesques: 0.517601
modulations: 0.51726645
variations: 0.516425
tempers: 0.5164192
bows: 0.51637644
triads: 0.5153483
couplets: 0.51462954
recorders: 0.51400024
dampers: 0.5136838
cadences: 0.5

oboes: 2.9150509094179613
bassoons: 2.8604519231752903
clarinets: 2.6258993969009286
timpani: 2.6028902228582886
continuo: 2.5783664960162147
arpeggios: 2.559257619700105
trombones: 2.54797954735505
violas: 2.527228693496569
contrabassoon: 2.5037110833844816
cellos: 2.501483355874346
12-string: 2.4278634421261445
glockenspiel: 2.4137916019642756
violins: 2.399888194164981
woodwinds: 2.393876723019968
octaves: 2.3915451918128667
trills: 2.3816676239152166
arpeggiated: 2.377483284640152
bassoon: 2.375167926643345
pizzicato: 2.3623870831769977
vibraphone: 2.3548593911035587
('probes', ['banjo', 'gorilla', 'whistle', 'worm', 'dress', 'pine'])
('gold', ['3.14', '2.57', '2.43', '2.14', '2', '2'])
('model_predict', [0.6133986675617449, 0.24877943704372651, 0.4395322087535497, 0.25822179548296176, 0.27426255159090535, 0.20855427133703996])
spearman correlation is 0.521793932087
the highly effective  ___  is satin finished lever - arm with a geared action and soft touch panels inset into the ar

cocoon: 0.5462168
container: 0.5424089
husk: 0.54221326
pot: 0.5367699
dough: 0.53444505
compost: 0.5331894
tart: 0.53148574
drupe: 0.52958775
batter: 0.5286207
receptacle: 0.52824605
leaf: 0.52677256
salad: 0.5266242
skillet: 0.52563107
casserole: 0.52527815
pulp: 0.5251222
omelette: 0.5250881
bun: 0.52495676
puree: 0.5247391
crumb: 0.52408457
oven: 0.523987
top_cluster_density
0.18343948474637273
weight is 0.183439484746
normalized weight: 
  [[0.23022655]
 [0.29898799]
 [0.23318821]
 [0.23759725]]
producing top 20 words for new embedding
(259235, 400)
producing top 20 simwords
cilantro: 2.4159868851961144
marinated: 2.4142038671665493
flavorings: 2.4134882181852104
coriander: 2.389239951117246
shallots: 2.3831453849178166
flavouring: 2.3498843107801686
flavoring: 2.3421656946066385
toppings: 2.3402707321738836
sauces: 2.322282070914658
syrups: 2.3097850340686135
caramelized: 2.3018194779779195
salads: 2.296249702290449
stewed: 2.2931772430752972
flavourings: 2.288771607779458
goreng

flouring: 0.5669532
fulling: 0.56203866
grist: 0.5525673
flour: 0.54787165
forge: 0.5446891
woolen: 0.54272336
flax: 0.5423984
puddling: 0.54228467
woollen: 0.54156166
cooperage: 0.5376648
carding: 0.5370254
fireclay: 0.5332699
ironstone: 0.5331704
grinding: 0.5330743
corn: 0.53304815
loom: 0.5326759
timber: 0.5323607
flint: 0.5321514
spinning: 0.53181225
brickmaking: 0.53095627
top_cluster_density
0.23537654352091125
weight is 0.235376543521
carbendazim , a hormone disrupting pesticide , was found above legal limits [ 2 ] in apricots , green beans  ___  .
producing top 20 simwords
3: 0.54926217
1-5: 0.544183
3-4: 0.5434875
2: 0.5420016
6: 0.54010135
5: 0.5383905
etc: 0.53798604
4: 0.5369412
03: 0.53652936
12-17: 0.5345086
22-24: 0.5344567
1-2: 0.534272
2-3: 0.53266644
7: 0.5326009
respectively: 0.53239584
5-7: 0.53222877
1-3: 0.531649
6-7: 0.5313092
ripened: 0.52987885
12-20: 0.52919996
top_cluster_density
0.23219465495491365
weight is 0.232194654955
what happened to the brown bowls w

producing top 20 simwords
kid: 0.53518254
book: 0.5323258
rug: 0.5293166
gem: 0.529003
patch: 0.52823204
vibe: 0.52822435
shawl: 0.528121
girl: 0.5279498
thing: 0.52750593
blanket: 0.52710855
bump: 0.527075
vid: 0.5267277
card: 0.52612984
twist: 0.52590054
costume: 0.52576846
dojo: 0.52554816
monster: 0.52540123
groove: 0.5247941
pen: 0.52366185
carpet: 0.52347374
top_cluster_density
0.12343127824221974
weight is 0.123431278242
normalized weight: 
  [[0.21166014]
 [0.33025929]
 [0.28704249]
 [0.17103808]]
producing top 20 words for new embedding
(259235, 400)
producing top 20 simwords
opossums: 2.25928755581345
egrets: 2.2528001321428124
avocets: 2.2445274065240923
insectivores: 2.235643437908537
skinks: 2.231625944945526
pelecaniformesfamily: 2.223605006065518
lizards: 2.2196028242146437
passeriformesfamily: 2.2158608961565833
tapirs: 2.200952121058342
gymnures: 2.1931448404381113
geckos: 2.1920909174358503
ibises: 2.1859947746696973
porcupines: 2.1830183496266518
mongooses: 2.1821134