# Distances

In [1]:
from ipynb.fs.full.koselleck import *

In [2]:
# Notebook-level switches.
# FORCE is not referenced by any cell visible in this notebook section
# (the functions below take their own `force` argument) -- TODO confirm usage.
FORCE = False

# Presumably the group-by key columns for "local" distance tables; not used in
# the visible cells -- verify against the shared koselleck module.
GBY_LOCAL_O = [
    'corpus1', 'corpus2',
    'period1', 'period2',
    'word1', 'word2',
    'qstr',
]

## Generating distances en masse

In [3]:
def _distvecs(objd):
    """Single-argument adapter: unpack the dict `objd` as keyword arguments for `distvecs`.

    Lets `distvecs` be handed to `pmap`, which is given one object per job.
    """
    result = distvecs(**objd)
    return result

def distvecs(period=None,run=None,prefix='dvecs',max_num=10000,num_runs=10,
             num_proc=1,force=False,progress=True,cache_only=True,cache=True):
    """Compute pairwise cosine distances between word vectors and cache them per word.

    Dispatch:
      * `period is None`  -> one child job per period from `get_default_periods()`.
      * `run is None`     -> one child job per run in 1..num_runs.
      * both given        -> load vectors with `vecs(period=..., run=...)`, keep the
                             first `max_num` rows, build the square distance matrix
                             and (if `cache`) store every word's row in that word's
                             vector library under the key '<period>_<run:02>'.

    Parameters
    ----------
    period : str or None
        Period label; None fans out over all default periods.
    run : int or None
        Run number (formatted with `:02`, so it must be an integer); None fans
        out over runs 1..num_runs.
    prefix : str
        Forwarded to child jobs; not otherwise used by the current code path.
    max_num : int
        Number of leading rows of the vector table to keep.
    num_runs : int
        Number of runs to fan out over when `run` is None.
    num_proc : int
        Passed to `pmap` for the fan-out; child jobs always run with num_proc=1.
    force : bool
        Overwrite rows that are already present in a word's vector library.
    progress : bool
        Print status lines / show progress bars.
    cache_only : bool
        If true, return an empty DataFrame (the caller only wants the cache filled).
    cache : bool
        Write the per-word rows to the vector libraries.

    Returns
    -------
    pd.DataFrame
        Empty when `cache_only` is true, when no vectors are found, or when a
        fan-out has nothing to concatenate. Child jobs of a fan-out are always run
        with cache_only=True, so a fan-out only ever concatenates empty frames.
        Otherwise the square word-by-word cosine-distance matrix.

    Notes
    -----
    `fastdist.cosine_pairwise_distance` yields cosine *similarity*, so distance is
    `1 - similarity` (matching `distvecs_word_period`). Rows cached by an earlier
    version of this function were stored as `2 - similarity`; rerun with
    force=True to refresh them.
    """
    argd=dict(
        period=period,run=run,prefix=prefix,
        max_num=max_num,num_runs=num_runs,
        num_proc=num_proc,force=force,progress=progress,
        cache_only=cache_only,cache=cache
    )

    def fan_out(key, values, desc):
        # Child jobs are serial, show their own progress and only fill the cache.
        objs=[{**argd, key:val, 'progress':True, 'num_proc':1, 'cache_only':True} for val in values]
        frames=list(pmap(_distvecs, objs, num_proc=num_proc, desc=desc, progress=progress))
        # pd.concat raises on an empty list, so guard the "nothing to do" case.
        if cache_only or not frames: return pd.DataFrame()
        return pd.concat(frames)

    if period is None:
        return fan_out('period', get_default_periods(), 'Measuring cosine distances across periods')

    if run is None:
        return fan_out('run', range(1, num_runs+1), 'Measuring cosine distances across runs')

    # load vecs
    dfvecs=vecs(period=period, run=run)
    if not len(dfvecs): return pd.DataFrame()

    # keep only the first max_num rows
    dfvecs=dfvecs.iloc[:max_num]

    # pairwise similarity matrix -> distance matrix
    if progress: print(f'Computing distances for {len(dfvecs)} words')
    dfsim=pd.DataFrame(
        fastdist.cosine_pairwise_distance(
            dfvecs.values.astype(float),
            return_matrix=True
        ),
        index=dfvecs.index,
        columns=dfvecs.index
    )
    dfdist=1-dfsim
    if progress: print('Done')

    if cache:
        # Same key for every word of this (period, run); build it once.
        qstr=f'{period}_{run:02}'

        def tryword(w1):
            """Store w1's distance row in its vector library; return True on success."""
            try:
                with get_veclib(prefix=w1,folders=['wvecs'],autocommit=False) as vl:
                    if force or qstr not in vl:
                        vl[qstr]=dfdist.loc[w1]
                        vl.commit()
            except Exception as e:
                # Best effort: report and let the caller retry this word.
                print('!!',e)
                return False
            return True

        tryagain=[
            w1 for w1 in tqdm(dfdist.index,position=0,disable=not progress)
            if not tryword(w1)
        ]
        if tryagain:
            # One retry pass for words whose library could not be written.
            failed=[
                w1 for w1 in tqdm(tryagain,position=0,disable=not progress)
                if not tryword(w1)
            ]
            if failed:
                print(f'!! could not cache {len(failed)} word(s) for {qstr}: {failed[:10]}')

    return dfdist if not cache_only else pd.DataFrame()

In [4]:
# res=distvecs('1700-1740',1)#.loc['culture'][['culture','represent']]
# res

In [5]:
# res=distvecs('1780-1785',num_proc=4)#.loc['culture'][['culture','represent']]
# res

In [6]:
# for prd in get_default_periods():
#     if prd<"1750": continue
#     print(prd)
#     distvecs(prd,num_proc=1,cache=True,cache_only=True,force=False)

## By word

In [107]:
def distvecs_word_period(word,period,run=1,words=None,**y):
    """Cosine distance from `word` to every other word in one (period, run) model.

    Parameters
    ----------
    word : str
        The word whose vector is compared against the others.
    period, run :
        Passed positionally to `vecs(period, run)` to load the vector table
        (rows are indexed by word).
    words : collection of str, optional
        If given and non-empty, restrict the table to these words first.
    **y :
        Ignored; lets callers pass job dicts that carry extra bookkeeping keys.

    Returns
    -------
    pd.Series
        `1 - cosine similarity` for each remaining word, indexed by word, in the
        row order of the vector table. Empty (float dtype) when `word` is not in
        the (filtered) table or when no other word remains to compare against.
    """
    try:
        dfvecs = vecs(period,run)
        if words:
            # Boolean mask rather than `.loc[set]`: set indexers are rejected by
            # pandas >= 2.0, and the mask keeps the table's row order.
            dfvecs=dfvecs.loc[dfvecs.index.isin(set(words))]
        s=dfvecs.loc[word]
        dfvecs=dfvecs.drop(word)
    except KeyError:
        return pd.Series(dtype=float)
    if not len(dfvecs):
        # Nothing left to compare against.
        return pd.Series(dtype=float)
    u=s.values.astype('float')
    m=dfvecs.values.astype('float')
    # fastdist returns similarities; convert to distances.
    res=fastdist.cosine_vector_to_matrix(u,m)
    return 1-pd.Series(res, index=dfvecs.index)

def _distvecs_word_period_(objd):
    """Single-argument adapter: unpack the job dict `objd` into `distvecs_word_period`.

    Used as the worker function handed to `pmap_iter`.
    """
    series = distvecs_word_period(**objd)
    return series

In [110]:
# Probe word used by the example cells below. Kept fixed so the notebook is
# reproducible; the earlier random draw was overwritten on the next line anyway.
# (For a random probe, assign random.choice(get_valid_words()) instead.)
w='culture'

In [111]:
# Distances from `w` to the other valid words in the 1700-1740 vectors (default run=1),
# sorted ascending so the closest words come first.
distvecs_word_period(w,'1700-1740',words=get_valid_words()).sort_values()

vegetation     0.238368
cultivating    0.247090
planting       0.270857
plants         0.279436
gardening      0.284771
                 ...   
sentence       1.411540
protest        1.426899
prisoner       1.431536
traitor        1.452999
swore          1.490939
Length: 13480, dtype: float64

In [105]:

def distvecs_word(word,
                  only_valid_words=True,reformat=False,
                  periods=None,ymin=YMIN,ymax=YMAX,ybin=YEARBIN,num_runs=1,
                  num_proc=1,force=False,cache=True,vl_tbl='distvecs_word'):
    """Distances from `word` to other words across periods and runs, with caching.

    One column is computed per (period, run) via `distvecs_word_period` and keyed
    '<period>_<run:02>'. Columns already stored for `word` in the `vl_tbl` vector
    library are reused; only the missing ones are computed.

    Parameters
    ----------
    word : str
        Target word; also the key under which its columns are cached.
    only_valid_words : bool
        Restrict comparison words to `get_valid_words()`.
    reformat : bool
        If true, return the transposed table indexed by ('word_', 'period_', 'run_').
    periods : list of str, optional
        Period labels; defaults to `get_periods_bystep(ymin, ymax, ybin)`.
    ymin, ymax, ybin :
        Forwarded to `get_periods_bystep` when `periods` is not given.
    num_runs : int
        Runs 1..num_runs are included.
    num_proc : int
        Forwarded to `pmap_iter`.
    force : bool
        Recompute the requested columns even if they are cached.
    cache : bool
        Read previously stored columns and write new ones back; if false the
        vector library is neither consulted nor modified.
    vl_tbl : str
        Name passed to `get_veclib`.

    Returns
    -------
    pd.DataFrame
        Rows are words and columns are '<period>_<run:02>' keys, or the transposed,
        multi-indexed form when `reformat` is true.
    """
    qstr=word
    valid_words=set(get_valid_words()) if only_valid_words else None
    periods = periods if periods else get_periods_bystep(ymin=ymin, ymax=ymax, ybin=ybin)
    objs = [
        dict(word=word, period=period, run=run, key=f'{period}_{run:02}', words=valid_words)
        for period in periods
        for run in range(1,num_runs+1)
    ]

    with get_veclib(vl_tbl) as vl:
        od=vl.get(qstr,{}) if cache else {}
        cols_done=set() if force else set(od.keys())
        objs_todo = [obj for obj in objs if obj['key'] not in cols_done]
        if objs_todo:
            # Only the missing columns are computed.
            # NOTE(review): results are paired with jobs by position, which assumes
            # pmap_iter yields in submission order -- confirm for num_proc > 1.
            iterr = pmap_iter(
                _distvecs_word_period_,
                objs_todo,
                num_proc=num_proc,
            )
            for obj,res_s in zip(objs_todo, iterr):
                od[obj['key']]=res_s
            if cache:
                vl[qstr]=od
                vl.commit()

    # Keep only the columns requested by this call (the cache may hold more).
    keys=set(obj['key'] for obj in objs)
    od_inp = dict((k,v) for k,v in od.items() if k in keys)
    odf=pd.DataFrame(od_inp)

    if reformat:
        odf2=odf.T
        odf2['word_']=word
        odf2['period_']=[x.split('_')[0] for x in odf2.index]
        odf2['run_']=[int(x.split('_')[1]) for x in odf2.index]
        odf=odf2.set_index(['word_','period_','run_'])

    return odf

In [103]:
# Distances from `w` to the other words, binned into 40-year periods over 1700-1900.
odf = distvecs_word(
    w,
    ymin=1700,
    ymax=1900,
    ybin=40,
)
# Finer-grained alternative (5-year bins starting at 1720):
# distvecs_word(w, ymin=1720, ymax=1900, ybin=5)

In [100]:
# Display the table returned by distvecs_word (one column per '<period>_<run>' key).
odf

Unnamed: 0,1700-1740_01,1740-1780_01,1780-1820_01,1820-1860_01,1860-1900_01
abandon,0.330221,0.330432,0.349275,0.312593,0.307720
abandoned,0.142146,0.273295,0.284448,0.275133,0.252011
abandoning,0.249230,0.411530,0.479452,0.430369,0.440708
abandonment,,,0.444241,0.424689,0.478530
abate,0.491136,0.411512,0.396168,0.325673,0.353313
...,...,...,...,...,...
zone,0.226193,0.115579,-0.072329,-0.116534,-0.037059
zoo,-0.106325,-0.179965,-0.116447,-0.186264,-0.177936
zoological,,-0.154577,-0.185991,-0.264371,-0.231505
zoology,,-0.017019,-0.153323,-0.194718,-0.163341


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,abandon,abandoned,abandoning,abandonment,abate,abated,abatement,abbe,abbess,abbey,...,zealous,zenith,zephyr,zest,zinc,zone,zoo,zoological,zoology,zulu
word,period,run,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
weakening,1700-1740,1,0.330221,0.142146,0.24923,,0.491136,0.244381,0.247195,-0.120381,-0.13531,-0.13649,...,0.155407,0.212332,0.069789,-0.079247,-0.226511,0.226193,-0.106325,,,
weakening,1740-1780,1,0.330432,0.273295,0.41153,,0.411512,0.296222,0.136512,-0.091485,-0.06232,-0.255566,...,0.197535,0.261516,-0.014148,-0.019612,-0.082049,0.115579,-0.179965,-0.154577,-0.017019,
weakening,1780-1820,1,0.349275,0.284448,0.479452,0.444241,0.396168,0.240492,0.259471,-0.173795,-0.170173,-0.228862,...,0.169031,0.095546,-0.141186,-0.004601,0.000198,-0.072329,-0.116447,-0.185991,-0.153323,-0.219812
weakening,1820-1860,1,0.312593,0.275133,0.430369,0.424689,0.325673,0.314274,0.218541,-0.114231,-0.151464,-0.140796,...,0.069138,0.108633,-0.062683,0.031038,0.090149,-0.116534,-0.186264,-0.264371,-0.194718,-0.068619
weakening,1860-1900,1,0.30772,0.252011,0.440708,0.47853,0.353313,0.310518,0.291648,-0.112584,-0.046244,-0.113532,...,0.108007,0.128412,-0.084911,-0.04029,0.084281,-0.037059,-0.177936,-0.231505,-0.163341,0.046471


In [82]:
# Wide -> long: one row per (word, '<period>_<run>' column) with the value in `cdist`.
odf2 = (
    odf
    .rename_axis('word')
    .reset_index()
    .melt(id_vars=['word'], value_name='cdist')
)
# Split the '<period>_<run>' column label back into its two parts.
odf2['period'] = [label.split('_')[0] for label in odf2['variable']]
odf2['run'] = [int(label.split('_')[1]) for label in odf2['variable']]
odf2

Unnamed: 0,index,variable,cdist,period,run
0,abandon,1700-1740_01,0.330221,1700-1740,1
1,abandoned,1700-1740_01,0.142146,1700-1740,1
2,abandoning,1700-1740_01,0.249230,1700-1740,1
3,abandonment,1700-1740_01,,1700-1740,1
4,abate,1700-1740_01,0.491136,1700-1740,1
...,...,...,...,...,...
636684,zone,1895-1900_01,0.281473,1895-1900,1
636685,zoo,1895-1900_01,0.205820,1895-1900,1
636686,zoological,1895-1900_01,0.202006,1895-1900,1
636687,zoology,1895-1900_01,0.432930,1895-1900,1
