# Neighbs

In [1]:
from koselleck import *

In [3]:
def to_nbr(dfc,progress=True):
    """Rank each word's neighbors, per period, from a frame of distance rows.

    Parameters
    ----------
    dfc : pandas.DataFrame
        Must provide the columns ``word``, ``period``, ``neighbor`` and
        ``cdist``; these are the only columns this function reads by name.
    progress : bool
        When False the tqdm progress bar is disabled.

    Returns
    -------
    pandas.DataFrame
        One frame per (word, period) group, concatenated and sorted by
        ``score`` in descending order. An empty DataFrame is returned when
        ``dfc`` yields no groups.

    Notes
    -----
    Within a (word, period) group:

    * ``count`` is the number of rows in which a neighbor appears,
    * ``score`` is ``count - cdist/10``,
    * rows are averaged per (period, neighbor),
    * ``rank`` orders the averaged scores from highest to lowest, ties
      sharing the lowest rank (``method='min'``).

    Results are cached under the key ``nbr(<word>_<period>)``.
    """
    o=[]
    # Open the cache the same way the other cache writers in this file do
    # (context manager + autocommit) so the handle is released on exit.
    # NOTE(review): presumably writes through a bare get_veclib('nbr') handle
    # were never committed -- confirm against get_veclib.
    with get_veclib('nbr',autocommit=True) as vl:
        for (wrd,prd),dfprd in tqdm(dfc.groupby(['word','period']),desc='Computing neighborhood',disable=not progress):
            qstr=f'nbr({wrd}_{prd})'
            if qstr in vl:
                odf=vl[qstr]
            else:
                # group sizes must be taken before the frame is re-indexed
                dfprdg=dfprd.groupby('neighbor')
                dfprd=dfprd.reset_index().set_index('neighbor')
                # assignment aligns the per-neighbor sizes on the neighbor index
                dfprd['count']=dfprdg.size()
                # frequent neighbors score high; distance only breaks ties
                dfprd['score']=dfprd['count'] - dfprd['cdist']/10
                odf=dfprd.groupby(['period','neighbor']).mean().reset_index()
                odf['rank']=odf['score'].rank(ascending=False,method='min').apply(int)
                odf=odf.sort_values('rank')
                vl[qstr]=odf
            o.append(odf)
    if not o: return pd.DataFrame()
    return pd.concat(o).sort_values('score',ascending=False)

In [4]:
def to_nbr(word,period,max_rank=1000,force=False,cache_only=False,num_proc=1,progress=True):
    """Compute (or load from cache) the ranked neighborhood of one word in one period.

    Parameters
    ----------
    word, period
        Forwarded to ``cdist`` and combined into the cache key
        ``<word>_<period>``.
    max_rank : int or None
        When truthy, only rows with ``rank <= max_rank`` are kept (and cached).
    force : bool
        When True any cached entry is ignored, the neighborhood is recomputed
        and the cache entry is overwritten. (Previously this flag was accepted
        but had no effect.)
    cache_only : bool
        When True the cache is populated as needed but an empty DataFrame is
        returned instead of the result.
    num_proc, progress
        Forwarded to ``cdist``.

    Returns
    -------
    pandas.DataFrame
        The neighborhood as stored in the cache. When freshly computed it is
        indexed by (word, neighbor, period) and sorted by ``rank``. An empty
        DataFrame is returned when ``cache_only`` is True.
    """
    qstr=f'{word}_{period}'
    with get_veclib('nbr',autocommit=True) as vl:
        if not force and qstr in vl:
            # nothing to compute; avoid loading the entry if it would be discarded
            odf=pd.DataFrame() if cache_only else vl.get(qstr)
        else:
            gby=['word','neighbor','period']
            dfprd = cdist(word,period,num_proc=num_proc,progress=progress)
            # group sizes must be taken before the frame is re-indexed
            dfprdg=dfprd.groupby(gby)
            dfprd=dfprd.reset_index().set_index(gby)
            # assignment aligns the per-group sizes on the (word, neighbor, period) index
            dfprd['count']=dfprdg.size()
            # frequent neighbors score high; distance only breaks ties
            dfprd['score']=dfprd['count'] - dfprd['cdist']/10
            odf=dfprd.groupby(gby).mean()
            odf['rank']=odf['score'].rank(ascending=False,method='min').apply(int)
            odf=odf.sort_values('rank')
            if max_rank: odf=odf[odf['rank']<=max_rank]
            vl[qstr]=odf
    return odf if not cache_only else pd.DataFrame()

In [5]:
def nbr_(argd):
    """Call ``to_nbr`` with the mapping ``argd`` unpacked as keyword arguments.

    Exists so that a single-argument callable can be handed to ``pmap``.
    """
    result=to_nbr(**argd)
    return result

def nbr(word_or_words,period_or_periods=None,prefix='nbr',neighbors=None,
        max_rank=1000,force=False,cache_only=False,num_proc=1):
    """Compute neighborhoods for every combination of the given words and periods.

    Parameters
    ----------
    word_or_words : str or iterable
        A string is split with ``tokenize_fast``; any other iterable is
        materialised with ``list``.
    period_or_periods : str, iterable or None
        ``None`` selects ``get_default_periods()``; a string is split with
        ``tokenize_fast``; any other iterable is materialised with ``list``.
    prefix, neighbors
        Accepted for interface compatibility; currently not used by this
        function.
    max_rank, force, cache_only
        Forwarded to ``to_nbr`` for every word/period pair.
    num_proc : int
        Parallelism handed to ``pmap``. Each individual ``to_nbr`` call is
        run with ``num_proc=1`` and ``progress=False``.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the per-pair results, or an empty DataFrame when
        there are no word/period pairs to process.
    """
    # preproc input
    words=tokenize_fast(word_or_words) if isinstance(word_or_words,str) else list(word_or_words)
    if period_or_periods is None:
        periods=get_default_periods()
    elif isinstance(period_or_periods,str):
        periods=tokenize_fast(period_or_periods)
    else:
        periods=list(period_or_periods)
    # get objs
    objs = [
        dict(word=word,period=period,max_rank=max_rank,force=force,cache_only=cache_only,num_proc=1,progress=False)
        for word in words
        for period in periods
    ]
    # pd.concat raises ValueError on an empty sequence, so bail out early
    if not objs: return pd.DataFrame()
    # map
    return pd.concat(pmap(
        nbr_,
        objs,
        num_proc=num_proc,
        desc='Computing neighborhoods across word-periods'
    ))
    
    

In [None]:
# Example call: the string argument is split by tokenize_fast inside nbr();
# no periods are given, so nbr() falls back to get_default_periods().
nbr('virtue,progress',num_proc=1)

Computing neighborhoods across word-periods [x1]:  48%|████▊     | 46/96 [00:00<00:00, 90.07it/s]

In [None]:
# def do_nbr(objd):
#     res=nbr(**objd)
#     if res is None: res=pd.DataFrame()
#     return res


# def nbr(word,period=None,run=None,prefix='nbr',neighbors=None,max_num=None,num_runs=10,num_proc=4,force=False,progress=True,cache_only=False):
#     index_cols=['word','neighbor','period','run']
#     argd=dict(
#         word=word,period=period,run=run,prefix=prefix,
#         neighbors=neighbors,max_num=max_num,num_runs=num_runs,
#         num_proc=num_proc,force=force,progress=progress,
#         cache_only=cache_only
#     )
#     odf=None
    
#     if type(word)!=str:
#         objs=[{**argd, **{'word':w, 'progress':False, 'num_proc':1}} for w in word]
#         odf=pd.concat(pmap(do_nbr, objs, num_proc=num_proc, desc='Computing neighborhoods across words', progress=progress))
#         return odf if not cache_only else pd.DataFrame()
        
#     if period is None:
#         objs=[{**argd, **{'period':prd, 'progress':False, 'num_proc':1}} for prd in get_default_periods()]
#         odf=pd.concat(pmap(do_nbr, objs, num_proc=num_proc, desc='Computing neighborhoods across periods', progress=progress))
#         return odf if not cache_only else pd.DataFrame()

#     if run is None:
#         qstr=f'{word}_{period}'
#         if not force:
#             with get_veclib(prefix) as vl: odf=vl.get(qstr)
#         if odf is None:
#             objs=[{**argd, **{'run':run+1, 'progress':False, 'num_proc':1}} for run in range(num_runs)]
#             odf=pd.concat(pmap(do_nbr, objs, num_proc=num_proc, desc='Computing neighborhoods across runs', progress=progress))
#             with get_veclib(prefix,autocommit=True) as vl: vl[qstr]=odf
#         if not cache_only and neighbors:
#             neighbors=set(tokenize_fast(neighbors)) if type(neighbors)==str else set(neighbors)
#             odf=odf.reset_index()
#             odf=odf[odf.neighbor.isin(neighbors)]
#             odf=odf.set_index(index_cols)
        
#         return odf if not cache_only else pd.DataFrame()
            
#     # otherwise

#     # get?
#     if type(run)==int: run=str(run).zfill(2)
#     dfvecs=vecs(period=period, run=run)
#     if dfvecs is None:
#         print(wqstr,'!?')
#         return pd.DataFrame()
#     if not words: words=dfvecs.index
#     words=set(words)
#     if not word in words: return pd.DataFrame()    
#     dfu=dfvecs.loc[word]
#     if max_num and len(dfvecs)>max_num: dfvecs=dfvecs.iloc[:max_num]
#     dfm=dfvecs.drop(word)
#     res=fastdist.cosine_vector_to_matrix(
#         dfu.values.astype(float),
#         dfm.values.astype(float),
#     )
#     wdx=dict(
#         (x,1-y)
#         for x,y in zip(dfm.index, res)
#     )
#     wds=pd.Series(wdx)#.sort_values()
#     wddf=pd.DataFrame(wds,columns=['nbr']).rename_axis('neighbor').sort_values('nbr')

#     wddf=wddf.reset_index()
#     wddf['word']=word
#     wddf['period']=period
#     wddf['run']=run
#     wddf=wddf.set_index(index_cols)
#     return wddf
#     return pd.DataFrame()
   





In [19]:
# NOTE(review): a DataFrame is passed here, which matches the
# to_nbr(dfc, progress) signature. The name to_nbr is also defined with a
# (word, period, ...) signature in this notebook, so this cell only works
# while the DataFrame-based definition is the active one.
dfnbr=to_nbr(dfc)

Computing neighborhood: 100%|██████████| 48/48 [00:00<00:00, 54.81it/s]


In [25]:
# Show only the rows whose rank is at most 100.
dfnbr[dfnbr['rank']<=100]

Unnamed: 0,period,neighbor,cdist,count,score,rank
41199,1840-1845,wisdom,0.226419,10,9.977358,1
40004,1825-1830,wisdom,0.226866,10,9.977313,1
36938,1800-1805,wisdom,0.227134,10,9.977287,1
35893,1800-1805,virtuous,0.229836,10,9.977016,2
34127,1795-1800,virtuous,0.230758,10,9.976924,1
...,...,...,...,...,...,...
528,1725-1730,advantage,0.662187,10,9.933781,96
18385,1725-1730,lover,0.662554,10,9.933745,97
22595,1725-1730,pernicious,0.662954,10,9.933705,98
7422,1725-1730,cultivated,0.663067,10,9.933693,99


In [18]:
# def nbr() # @TODO

In [32]:
# Distance rows for 'culture' in 1780-1785 (to_nbr calls cdist(word, period, ...)
# with the same first two arguments).
# NOTE(review): the meaning of the third positional argument (1) is not
# visible in this notebook -- confirm against cdist's signature.
dfc=cdist('culture','1780-1785',1)

In [None]:
get_vecs('1740-1745_01')

In [None]:
# dbget('vecs(1720-1725_02)')

In [None]:
# Handle to the 'vecs' store, bound at notebook level (no with-block, so
# nothing in this cell closes it).
vl=get_veclib('vecs')

In [None]:
# Peek at the store: print the first key that iteration yields, then stop.
for k in vl:
    print(k)
    break

In [None]:
f

## Gen data

In [None]:
gen_neighbors('virtue,value,station,culture,integrity')

In [None]:
dbget('nbr(culture)')

In [None]:
# dbget('lsn(culture_1925-1930_bpo,culture_1940_1945_bpo,k=25)')

In [None]:
# Keep model rows with run <= "run_10" and period_start >= 1720.
# NOTE(review): 'run' is compared against the string "run_10"; if the column
# holds strings this is a lexicographic comparison -- confirm that is intended.
dfmodels = get_pathdf_models(period_len=5).query('run<="run_10" & period_start>=1720')
# Exhaust the groupby so that `grp` is left bound to the last (corpus, period)
# group; `grp` stays undefined if there are no groups.
for i,grp in dfmodels.groupby(['corpus','period']): pass
# Display that last group as the cell output.
grp

In [None]:
# Run do_gen_neighbs on `grp` (presumably the last group left bound by the
# groupby loop in the preceding cell -- depends on execution order).
odf=do_gen_neighbs(grp,progress=True)
# Show only the rows for the word 'value'.
odf[odf.word=='value']

In [None]:
# NOTE(review): force=1 is truthy and lim=None presumably means "no limit";
# both are interpreted by gen_all_neighbors, which is not visible here -- confirm.
dfneighbs = gen_all_neighbors(lim=None,force=1)
# Display the result as the cell output.
dfneighbs

## Load data

In [None]:
dfneighbs = get_all_neighbors()
dfneighbs

In [None]:
dfneighbstr=get_all_neighbors_strsummary(force=True)
dfneighbstr

In [None]:
dfneighbstr.loc['culture']

In [None]:
dfneighbstr.loc['history']