# Distances

In [1]:
from ipynb.fs.full.koselleck import *

[Koselleck] (16:43:46) Alles bereit (+0.0s)


In [2]:
# Notebook-level settings.
# NOTE(review): neither name is referenced in the cells visible in this notebook;
# presumably they are consumed elsewhere in the project - confirm before removing.
FORCE = False
GBY_LOCAL_O = [
    'corpus1', 'corpus2',
    'period1', 'period2',
    'word1', 'word2',
    'qstr',
]

In [3]:

def do_cdist(objd):
    """
    Single-argument adapter around cdist() for use with pmap.

    `objd` is a dict of keyword arguments that is unpacked into cdist().
    A None result is replaced by an empty DataFrame so that the caller can
    always concatenate the returned values.
    """
    out = cdist(**objd)
    return pd.DataFrame() if out is None else out

def cdist(word,period=None,run=None,prefix='cdist',neighbors=None,max_num=10000,num_runs=10,num_proc=4,force=False,progress=True,cache_only=False,cache=False):
    """
    Cosine distances from `word` to the other words of the embedding model(s).

    The function dispatches on which arguments are missing and fans out to
    itself (through do_cdist/pmap) until word, period and run are all fixed:

      * `word` is not a str -> treated as an iterable of words, one call per word
      * `period` is None    -> one call per period from get_default_periods()
      * `run` is None       -> one call per run, numbered 1..num_runs
      * otherwise           -> distances are computed inside that one model

    Parameters
    ----------
    word : str or iterable of str
    period, run : model selectors passed to vecs(); None triggers the fan-out above
    prefix : name of the get_veclib() store used for caching
    neighbors : optional collection of words; when given, only these words
        (plus `word` itself) are kept as comparison targets
    max_num : keep at most this many rows of the vector table (falsy = no limit);
        `word` itself is always retained
    num_runs : number of runs to fan out over when `run` is None
    num_proc : worker count handed to pmap for the current fan-out level
    force : recompute even when a cached result exists
    progress : forwarded to pmap for the current fan-out level
    cache_only : do the work (and caching) but return an empty DataFrame
    cache : read from / write to the get_veclib() store

    Returns
    -------
    pd.DataFrame indexed by (word, neighbor, period, run) with one 'cdist'
    column, sorted ascending within each single-model result. An empty
    DataFrame is returned when `cache_only` is set, when vecs() yields None,
    when `word` is not in the model's index, or when nothing is left to
    compare against.
    """
    index_cols=['word','neighbor','period','run']
    argd=dict(
        word=word,period=period,run=run,prefix=prefix,
        neighbors=neighbors,max_num=max_num,num_runs=num_runs,
        num_proc=num_proc,force=force,progress=progress,
        cache_only=cache_only,cache=cache
    )
    odf=None

    def _fan_out(objs, desc):
        # Run the sub-calls and stack their results. pd.concat raises
        # ValueError on an empty sequence, so that case is guarded.
        frames=list(pmap(do_cdist, objs, num_proc=num_proc, desc=desc, progress=progress))
        return pd.concat(frames) if frames else pd.DataFrame()

    # several words: one sub-call per word
    if not isinstance(word,str):
        objs=[{**argd, 'word':w, 'progress':False, 'num_proc':1} for w in word]
        odf=_fan_out(objs,'Measuring cosine distances across words')
        return odf if not cache_only else pd.DataFrame()

    # all periods for one word
    if period is None:
        qstr=f'{word},ymin={YMIN},ymax={YMAX},ybin={YEARBIN}'
        # Honour `force`, `cache_only` and `prefix` here exactly like the
        # run-level cache below does (the default prefix is still 'cdist').
        if cache and not force:
            with get_veclib(prefix,autocommit=False) as vl:
                if qstr in vl:
                    return vl[qstr] if not cache_only else pd.DataFrame()

        objs=[{**argd, 'period':prd, 'progress':False, 'num_proc':1} for prd in get_default_periods()]
        odf=_fan_out(objs,'Measuring cosine distances across periods')
        if cache:
            with get_veclib(prefix,autocommit=True) as vl:
                vl[qstr]=odf
        return odf if not cache_only else pd.DataFrame()

    # all runs for one word and period
    if run is None:
        # NOTE(review): this key does not include `neighbors` or `max_num`, so
        # calls that differ only in those arguments share one cache entry.
        qstr=f'{word}_{period}'
        if cache and not force:
            with get_veclib(prefix) as vl: odf=vl.get(qstr)
        if odf is None:
            objs=[{**argd, 'run':run_num, 'progress':False, 'num_proc':1} for run_num in range(1,num_runs+1)]
            odf=_fan_out(objs,'Measuring cosine distances across runs')
            if cache:
                with get_veclib(prefix,autocommit=True) as vl:
                    vl[qstr]=odf
        return odf if not cache_only else pd.DataFrame()

    # single model: load vectors
    dfvecs=vecs(period=period, run=run)
    if dfvecs is None: return pd.DataFrame()

    # word not in vocab?
    if word not in dfvecs.index: return pd.DataFrame()

    # filter by neighbors? (build the lookup set once, not once per word)
    if neighbors:
        keep=set(neighbors)
        keep.add(word)
        dfvecs=dfvecs.loc[[w for w in dfvecs.index if w in keep]]

    # filter by total? Truncating blindly could cut the query word out of the
    # table and make the .loc lookup below raise KeyError, so when the word
    # falls outside the first max_num rows it takes the last slot instead.
    if max_num and len(dfvecs)>max_num:
        top=dfvecs.iloc[:max_num]
        if word not in top.index:
            top=pd.concat([dfvecs.iloc[:max_num-1], dfvecs.loc[[word]]])
        dfvecs=top

    # split into the query vector and the matrix of comparison vectors
    dfu=dfvecs.loc[word]
    dfm=dfvecs.drop(word)
    if not len(dfm): return pd.DataFrame()

    res=fastdist.cosine_vector_to_matrix(
        dfu.values.astype(float),
        dfm.values.astype(float),
    )
    # the fastdist result is subtracted from 1 to turn it into a distance
    wdx=dict(zip(dfm.index, res))
    wds=1-pd.Series(wdx)
    wddf=pd.DataFrame(wds,columns=['cdist']).rename_axis('neighbor').sort_values('cdist')
    wddf=wddf.reset_index()
    wddf['word']=word
    wddf['period']=period
    wddf['run']=run
    wddf=wddf.set_index(index_cols)
    return wddf


In [4]:
# cdist('reason')

## Generating distances en masse

In [5]:
def _distvecs(objd): return distvecs(**objd)

def distvecs(period=None,run=None,prefix='dvecs',max_num=10000,num_runs=10,
             num_proc=1,force=False,progress=True,cache_only=False,cache=True):
    """
    Pairwise cosine-distance matrix between the first `max_num` words of a model.

    Dispatches on which selectors are missing:

      * `period` is None -> one sub-call per period from get_default_periods()
      * `run` is None    -> one sub-call per run, numbered 1..num_runs
      * otherwise        -> the matrix for that single (period, run) model

    Sub-calls are always issued with cache_only=True, so the fan-out levels
    only populate the cache; what they return is a concatenation of empty
    frames.

    Parameters
    ----------
    period, run : model selectors passed to vecs(); None triggers the fan-out
    prefix : name of the get_veclib() store used for caching
    max_num : number of leading rows of the vector table to use (falsy = all)
    num_runs : number of runs to fan out over when `run` is None
    num_proc : worker count handed to pmap for the current fan-out level
    force : recompute even when a cached matrix exists
    progress : print progress messages / forward to pmap
    cache_only : compute and cache, but return an empty DataFrame
    cache : read from / write to the get_veclib() store

    Returns
    -------
    pd.DataFrame, square, with the words as both index and columns; an empty
    DataFrame when `cache_only` is set or no vectors are available.
    """
    argd=dict(
        period=period,run=run,prefix=prefix,
        max_num=max_num,num_runs=num_runs,
        num_proc=num_proc,force=force,progress=progress,
        cache_only=cache_only,cache=cache
    )

    def _fan_out(objs, desc):
        # pd.concat raises ValueError on an empty sequence, so guard it.
        frames=list(pmap(_distvecs, objs, num_proc=num_proc, desc=desc, progress=progress))
        return pd.concat(frames) if frames else pd.DataFrame()

    if period is None:
        objs=[{**argd, 'period':prd, 'progress':False, 'num_proc':1, 'cache_only':True} for prd in get_default_periods()]
        odf=_fan_out(objs,'Measuring cosine distances across periods')
        return odf if not cache_only else pd.DataFrame()

    if run is None:
        objs=[{**argd, 'run':run_num, 'progress':False, 'num_proc':1, 'cache_only':True} for run_num in range(1,num_runs+1)]
        odf=_fan_out(objs,'Measuring cosine distances across runs')
        return odf if not cache_only else pd.DataFrame()

    # Look in the cache before loading any vectors: a hit needs no vectors,
    # so there is no reason to pay for loading them first.
    qstr=f'{period}_{run},max_num={max_num}'
    if cache and not force:
        with get_veclib(prefix,autocommit=False) as vl:
            if qstr in vl: return vl[qstr] if not cache_only else pd.DataFrame()

    # load vecs (vecs() may hand back None; cdist() guards for that)
    dfvecs=vecs(period=period, run=run)
    if dfvecs is None or not len(dfvecs): return pd.DataFrame()

    # filter: keep exactly max_num rows, matching the cache key and the
    # message below (the previous slice kept max_num+1 rows and raised
    # TypeError for max_num=None)
    if max_num: dfvecs=dfvecs.iloc[:max_num]

    #distmat!
    if progress: print(f'Computing distances for {max_num} words')
    dfdist=pd.DataFrame(
        fastdist.cosine_pairwise_distance(
            dfvecs.values.astype(float),
            return_matrix=True
        ),
        index=dfvecs.index,
        columns=dfvecs.index
    )
    # csim to cdist: the previous `dfdist - 2` had its operands reversed, which
    # produced non-positive values whose ordering was inverted (the most
    # similar pairs received the smallest value's opposite end).
    # NOTE(review): the constant 2 presumes fastdist's matrix form reports a
    # word's similarity to itself as 2 (i.e. similarity offset by +1) - confirm
    # against the installed fastdist version; if the diagonal is 1, this should
    # be `1 - dfdist`, as cdist() does for the vector-to-matrix form.
    dfdist=2 - dfdist
    if progress: print('Done')

    if cache:
        with get_veclib(prefix,autocommit=True) as vl:
            vl[qstr]=dfdist
    return dfdist if not cache_only else pd.DataFrame()

In [6]:
# g=nx.from_edgelist([(0, 1, {'cdist':.5})])
# g.size(),g.order()

In [7]:
# od=distvecs('1750-1755',1,max_num=10000,cache=True,force=True)
# len(od)

In [8]:
# random.sample(od.keys(),10)

In [9]:
# w2='culture'
# matches = {
#     **dict((w1,od[w2][w1]) for w1 in tqdm(od[w2])),
#     **dict((w1,od[w1][w2]) for w1 in tqdm(od) if w2 in od[w1]),
# }
# len(matches)

In [10]:
# res=distvecs('1750-1755',cache=True,cache_only=True,num_proc=4)#,force=True)
# res

In [27]:
# res=distvecs(cache=True,cache_only=True,num_proc=1)#,force=True)
# res

In [24]:
# Pools to draw a (period, run) pair from: every default period, runs 1..10.
prds = get_default_periods()
runs = [*range(1, 11)]

In [25]:
# Draw one period and one run at random. No seed is set, so the pair differs
# between kernel sessions.
period, run = random.choice(prds), random.choice(runs)
(period, run)



('1750-1755', 1)

In [28]:
%%timeit
# Time one complete distvecs() call for the sampled (period, run), followed by
# slicing out the 2x2 sub-matrix for 'culture' and 'virtue'.
distvecs(period,run).loc[['culture','virtue']][['culture','virtue']]

933 ms ± 6.36 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [13]:
# dfvecs=vecs('1750-1755',1)

In [14]:
# `dfvecs` was only ever assigned in a commented-out cell, so this lookup failed
# on a fresh kernel; load the vectors explicitly (same model as that cell used).
dfvecs=vecs('1750-1755',1)
# position of 'value' in the model's index
list(dfvecs.index).index('value')

745