# Distances

In [1]:
from koselleck import *

## Generating data

In [2]:
FORCE=False
YMIN=1720
YMAX=1960

### Within-model Distance matrices

In [3]:
# gen_within_model_dists(
#     words=get_valid_words(),
#     num_proc=1,
#     period_len=5,
#     ymin=YMIN,
#     ymax=YMAX,
#     force=FORCE
# )

### Across-model Distance matrices

In [4]:
def get_cross_model_dists_paths(ofn='data.all_local_dists.paths.csv',force=False,period_len=5,ymin=1720,ymax=1960):
    ofnfn=os.path.join(PATH_DATA,ofn)
    if not force and os.path.exists(ofnfn): 
        odf=read_df(ofnfn)
    else:
        dfpaths=get_model_paths_df(PATH_MODELS_BPO).query(
            f'(period_end-period_start)==5 & period_start>={ymin} & period_end<={ymax}'
        ).sort_values('period_start')
        dfpaths['period']=[f'{x}-{y}' for x,y in zip(dfpaths.period_start, dfpaths.period_end)]
#         display(dfpaths)
        o=[]
        for i1,row1 in tqdm(dfpaths.iterrows(), total=len(dfpaths)):
            for i2,row2 in dfpaths.iterrows():
                if row1.run!=row2.run: continue
                if i1>=i2: continue
                o+=[{
                    **dict((k+'1',v) for k,v in row1.items()),
                    **dict((k+'2',v) for k,v in row2.items())
                }]
        odf=pd.DataFrame(o)
        odf.to_csv(ofnfn,index=False)
    return odf.sort_values(['period_start1','period_start2','run1','run2'])


In [5]:
dfpaths_cmp=get_cross_model_dists_paths(
    period_len=5,
    ymin=YMIN,
    ymax=YMAX,
    force=FORCE,
)
dfpaths_cmp[['period1','period2','run1','run2','path1','path2']]

Unnamed: 0,period1,period2,run1,run2,path1,path2
305,1720-1725,1725-1730,run_01,run_01,/home/ryan/github/koselleck/data/models/bpo/1720-1725/run_01/model.bin,/home/ryan/github/koselleck/data/models/bpo/1725-1730/run_01/model.bin
69,1720-1725,1725-1730,run_02,run_02,/home/ryan/github/koselleck/data/models/bpo/1720-1725/run_02/model.bin,/home/ryan/github/koselleck/data/models/bpo/1725-1730/run_02/model.bin
726,1720-1725,1725-1730,run_03,run_03,/home/ryan/github/koselleck/data/models/bpo/1720-1725/run_03/model.bin,/home/ryan/github/koselleck/data/models/bpo/1725-1730/run_03/model.bin
403,1720-1725,1725-1730,run_04,run_04,/home/ryan/github/koselleck/data/models/bpo/1720-1725/run_04/model.bin,/home/ryan/github/koselleck/data/models/bpo/1725-1730/run_04/model.bin
236,1720-1725,1725-1730,run_05,run_05,/home/ryan/github/koselleck/data/models/bpo/1720-1725/run_05/model.bin,/home/ryan/github/koselleck/data/models/bpo/1725-1730/run_05/model.bin
...,...,...,...,...,...,...
20677,1955-1960,1950-1955,run_06,run_06,/home/ryan/github/koselleck/data/models/bpo/1955-1960/run_06/model.bin,/home/ryan/github/koselleck/data/models/bpo/1950-1955/run_06/model.bin
20729,1955-1960,1950-1955,run_07,run_07,/home/ryan/github/koselleck/data/models/bpo/1955-1960/run_07/model.bin,/home/ryan/github/koselleck/data/models/bpo/1950-1955/run_07/model.bin
20521,1955-1960,1950-1955,run_08,run_08,/home/ryan/github/koselleck/data/models/bpo/1955-1960/run_08/model.bin,/home/ryan/github/koselleck/data/models/bpo/1950-1955/run_08/model.bin
20703,1955-1960,1950-1955,run_09,run_09,/home/ryan/github/koselleck/data/models/bpo/1955-1960/run_09/model.bin,/home/ryan/github/koselleck/data/models/bpo/1950-1955/run_09/model.bin


In [57]:
def do_gen_cross_model_dists(pathdf,words=None,progress=False,lim_runs=10,ks=[10,25,50],progress_words=False):
    dfneighb=get_all_neighbors()
    if not words: words=set(dfneighb.reset_index().word)
    
    pathdf=pathdf.iloc[:lim_runs]
    iterr=pathdf.iterrows()
    if progress: iterr=tqdm(iterr,total=len(pathdf))
    o=[]
    for i,row in iterr:
        m1=load_model(row.path1)
        m2=load_model(row.path2)
        period1=row.period1
        period2=row.period2
        mwords=set(m1.wv.key_to_index.keys())&set(m2.wv.key_to_index.keys())

        iter2 = tqdm(words,position=0) if progress_words else words
        for w in iter2:
            neighb1={w.split()[0] for w in dfneighb.loc[w,period1].neighb.split(', ')}
            neighb2={w.split()[0] for w in dfneighb.loc[w,period2].neighb.split(', ')}
            metaneighb=list((set(neighb1)|set(neighb2)) & mwords)
            
            vector1=[(1-fastdist.cosine(m1.wv.vectors[m1.wv.key_to_index[w]], m1.wv.vectors[m1.wv.key_to_index[w2]])) for w2 in metaneighb]
            vector2=[(1-fastdist.cosine(m2.wv.vectors[m2.wv.key_to_index[w]], m2.wv.vectors[m2.wv.key_to_index[w2]])) for w2 in metaneighb]
            csim=fastdist.cosine(vector1,vector2) # returns similarity not distane!!
            dist=1-csim
            o+=[{
#                 'corpus2':row.corpus2,
#                 'period2':row.period2,
#                 'run2':row.run2,
                'word':w,
                'dist_local':dist,
                'k':25
            }]
            break
    return pd.DataFrame(o).groupby(['word']).mean().reset_index()

In [58]:
# dfpaths_cmp_grps = dfpaths_cmp.groupby(['period1','period2'])
# for i,grp in dfpaths_cmp_grps: break
# grp

In [59]:
do_gen_cross_model_dists(grp,progress=True)

100%|██████████| 10/10 [00:03<00:00,  2.67it/s]


Unnamed: 0,word,dist_local,k
0,value,0.040927,25


In [61]:
def gen_cross_model_dists(
        dfpaths_cmp=None,
        lim=None,
        num_proc=4,
        num_runs=1,
        ofnfn=FN_ALL_LOCALDISTS_V2,
        force=False,
        **y):
    if not force and os.path.exists(ofnfn):
        odf=read_df(ofnfn)
    else:
        if dfpaths_cmp is None: dfpaths_cmp=get_cross_model_dists_paths()
        odf=pmap_groups(
            do_gen_cross_model_dists,
#             dfpaths_cmp.iloc[:lim].groupby(['corpus1','period1','run1']),
            dfpaths_cmp.iloc[:lim].groupby(['period1','period2']),
            num_proc=num_proc,
            desc='Calculating Local Neighborhood Distance Measure over periods',
            **y
        )
        odf.to_pickle(ofnfn)
    return odf


In [67]:
odf=gen_cross_model_dists(
    dfpaths=dfpaths_cmp,
    lim=None,
    num_proc=4,
    num_runs=4,
    force=FORCE
)
odf

Unnamed: 0_level_0,Unnamed: 1_level_0,word,dist_local,k
period1,period2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1720-1725,1725-1730,value,0.040927,25
1720-1725,1730-1735,value,0.057139,25
1720-1725,1735-1740,value,0.040190,25
1720-1725,1740-1745,value,0.045690,25
1720-1725,1750-1755,value,0.048366,25
...,...,...,...,...
1955-1960,1920-1925,value,0.042139,25
1955-1960,1925-1930,value,0.034009,25
1955-1960,1940-1945,value,0.031635,25
1955-1960,1945-1950,value,0.028362,25


In [66]:
odf

Unnamed: 0_level_0,Unnamed: 1_level_0,word,dist_local,k
period1,period2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1720-1725,1725-1730,value,0.040927,25
1720-1725,1730-1735,value,0.057139,25
1720-1725,1735-1740,value,0.040190,25
1720-1725,1740-1745,value,0.045690,25
1720-1725,1750-1755,value,0.048366,25
...,...,...,...,...
1955-1960,1920-1925,value,0.042139,25
1955-1960,1925-1930,value,0.034009,25
1955-1960,1940-1945,value,0.031635,25
1955-1960,1945-1950,value,0.028362,25


In [6]:
# odf.query('word=="station"').groupby(['period1','period2']).mean().sort_index()

## Postprocessing

In [9]:
# This averages down
alldist_df = get_cross_model_dists(force=FORCE)
alldist_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,dist_local,dist_local_z,dist_local_perc
corpus1,corpus2,period1,period2,word,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
bpo,bpo,1890-1895,1895-1900,day,0.007251,-2.229570,0.003468
bpo,bpo,1865-1870,1870-1875,time,0.007274,-2.232159,0.003162
bpo,bpo,1930-1935,1925-1930,day,0.007437,-2.227280,0.006026
bpo,bpo,1865-1870,1875-1880,time,0.007450,-2.230666,0.007257
bpo,bpo,1880-1885,1875-1880,time,0.007642,-2.226431,0.006189
bpo,bpo,...,...,...,...,...,...
bpo,bpo,1810-1815,1925-1930,factor,0.578083,8.208947,99.999679
bpo,bpo,1805-1810,1930-1935,factor,0.578762,8.230830,99.999763
bpo,bpo,1810-1815,1910-1915,factor,0.579372,8.227624,99.999791
bpo,bpo,1810-1815,1895-1900,factor,0.580439,8.245224,99.999755


In [14]:
dfm_word = alldist_df.groupby('word').mean().sort_values('dist_local_perc').tail(25)
# dfm_word

## Plotting data

In [None]:
# get_all_localdists()

In [None]:
# source(get_historical_semantic_distance_matrix)

In [12]:
# odf=get_historical_semantic_distance_matrix('station')
# # odf[(odf.period1=='1720-1725') | (odf.period2=='1720-1725')]
# odf

In [15]:
# @interact
# def doplot(w='station'):
#     return plot_historical_semantic_distance_matrix(w)

In [None]:
plot_historical_semantic_distance_matrix('station')

In [None]:
plot_historical_semantic_distance_matrix('interest')

In [None]:
plot_historical_semantic_distance_matrix('commerce')