# Neighbs

In [6]:
from ipynb.fs.full.koselleck import *

In [14]:
def do_nbr_word(dfgrp,max_rank=NBR_MAX_RANK,num_proc=1,progress=True):
    """
    Rank the nearest neighbors of a word across the model runs found in `dfgrp`.

    `dfgrp` is a distance frame that can be grouped by 'word_', 'period_' and
    'run_' and has one column per candidate neighbor (nbr_word passes one
    (word, period) slice per call, with those three names as index levels).

    Steps:
      1. per (word, period, run): take the first row, drop missing values, sort
         ascending by distance, remove the word itself and keep the closest
         `max_rank` entries;
      2. per (word, period, neighbor): `count` = number of per-run lists the
         neighbor appears in, `score` = count - cdist/10 (so count dominates and
         distance breaks ties);
      3. aggregate (median count, mean score, mean cdist) and rank by score,
         highest first, ties sharing the lowest rank (method='min').

    The final rank is computed over everything in `dfgrp`, so callers should
    pass a single (word, period) at a time.

    :param dfgrp: distance frame as described above
    :param max_rank: number of neighbors to keep per run and in the result;
        a falsy value (None/0) disables both limits
    :param num_proc: unused here; accepted so the pmap kwargs can be shared
    :param progress: unused here; accepted so the pmap kwargs can be shared
    :return: DataFrame with columns neighbor, count, cdist, rank (sorted by
        rank); empty with those columns if no neighbor was found
    """
    gby=['word','period','neighbor']
    records=[]
    for (word,period,run),wprdf in dfgrp.groupby(['word_','period_','run_']):
        row=wprdf.iloc[0].dropna().sort_values()
        # a word is not its own neighbor; tolerate it being absent from the row
        row=row.drop(word,errors='ignore')
        # honor "falsy max_rank == no limit", matching the final filter below
        if max_rank: row=row.iloc[:max_rank]
        records.extend(
            {
                'word':word,
                'period':period,
                'run':run,
                'neighbor':nb,
                'cdist':val,
                'rank':i+1
            } for i,(nb,val) in enumerate(row.items())
        )
    # nothing to rank: return the same columns the normal path produces
    if not records: return pd.DataFrame(columns=['neighbor','count','cdist','rank'])

    dfprd=pd.DataFrame(records)
    dfprdg=dfprd.groupby(gby)
    dfprd=dfprd.reset_index().set_index(gby)
    # aligned on the (word, period, neighbor) index: how many runs list this neighbor
    dfprd['count']=dfprdg.size()
    dfprd['score']=dfprd['count'] - dfprd['cdist']/10
    # string aggregator names: same results as np.median/np.mean without the
    # deprecation warning newer pandas emits for numpy callables
    odf=dfprd.groupby(gby).agg(dict(
        count='median',
        score='mean',
        cdist='mean'
    ))
    odf['rank']=odf['score'].rank(ascending=False,method='min').astype(int)
    # keyword form: the positional axis argument was removed in pandas 2.0
    odf=odf.drop(columns='score').sort_values('rank')
    if max_rank: odf=odf[odf['rank']<=max_rank]
    return odf.reset_index().drop(['word','period'],axis=1)

def nbr_word(word,period_or_periods=None,run_or_runs=None,num_runs=10,num_proc=1,progress=True,
             max_rank=NBR_MAX_RANK,cache=True,cache_only=False,force=False,pmap_use_cache=False):
    """
    Compute, or fetch from the 'nbr' veclib cache, the neighbor table of one word.

    :param word: the word whose neighbors are wanted
    :param period_or_periods: forwarded to cdist_word; None means no restriction
    :param run_or_runs: accepted for interface compatibility; not used here
    :param num_runs: accepted for interface compatibility; not used here
    :param num_proc: workers for cdist_word and for the per-(word, period) map
    :param progress: log start/finish and show progress bars
    :param max_rank: neighbors kept per period (also part of the cache key)
    :param cache: read from / write to the 'nbr' veclib
    :param cache_only: make sure the result is cached but return an empty frame
    :param force: skip the cache read and recompute (the result is still
        written when `cache` is true)
    :param pmap_use_cache: forwarded to pmap_groups as `use_cache`
    :return: DataFrame indexed by (word, period, neighbor), or an empty
        DataFrame when there is no data or when `cache_only` is set
    """
    if progress: print(f'Running nbr_word({word})')
    qstr=f'{word},max_rank={max_rank},ymin={YMIN},ymax={YMAX},ybin={YEARBIN}'
    if period_or_periods is not None:
        # A period-restricted request must not share a cache slot with the
        # unrestricted one, otherwise a partial table would later be served as
        # the full result (assumes cdist_word filters on this argument).
        # Keys for the default period_or_periods=None are unchanged.
        periods=[period_or_periods] if isinstance(period_or_periods,str) else list(period_or_periods)
        qstr+=',periods='+'|'.join(sorted(str(p) for p in periods))
    if cache and not force:
        with get_veclib('nbr') as vl:
            if qstr in vl:
                if progress: print(f'Finished running nbr_word({word})')
                # cache_only: the entry exists, no need to load it
                return vl[qstr] if not cache_only else pd.DataFrame()
    
    dfc = cdist_word(word,period_or_periods,num_proc=num_proc,progress=progress)
    if not len(dfc): return pd.DataFrame()
    # one do_nbr_word call per (word, period) group; each call runs single-process
    odf=pmap_groups(
        do_nbr_word,
        dfc.rename_axis(['word_','period_','run_']).groupby(['word_','period_']),
        kwargs=dict(max_rank=max_rank,num_proc=1,progress=False),
        num_proc=num_proc,
        progress=progress,
        use_cache=pmap_use_cache
    )
    if not len(odf): return pd.DataFrame()
    odf=odf.rename_axis(['word','period']).reset_index().set_index(['word','period','neighbor'])
    if cache:
        with get_veclib('nbr',autocommit=True) as vl:
            vl[qstr]=odf
    
    if progress: print(f'Finished running nbr_word({word})')
    return odf if not cache_only else pd.DataFrame()

In [15]:
# Recompute the neighbors of "station" in a single process; force=True skips the cache read.
dfnbr=nbr_word('station',force=True,progress=1,num_proc=1)#,max_rank=25)

[Koselleck] (10:15:34) Running nbr_word(station) (+102.4s)
[Koselleck] (10:15:34) Loading cdist data for "station" (+0.0s)
[Koselleck] (10:15:34) Finished loading cdist data from pkl for "station" (+0.4s)
[Koselleck] (10:15:34) Finished running cdist_word(station) (+0.2s)
Mapping do_nbr_word [x1]: 100%|██████████| 48/48 [00:03<00:00, 12.94it/s]
[Koselleck] (10:15:38) Finished running nbr_word(station) (+3.9s)


In [17]:
# dfnbr.groupby('period').size()

In [4]:
# force defaults to False, so an existing cache entry for "despotism" is returned as-is.
nbr_word('despotism',num_proc=4,cache=True)

[Koselleck] (19:35:17) Running nbr_word(despotism) (+0.0s)
[Koselleck] (19:35:17) Finished running nbr_word(despotism) (+0.0s)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count,cdist,rank
word,period,neighbor,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
despotism,1765-1770,regal,8,0.769257,1
despotism,1765-1770,tyranny,7,0.736510,2
despotism,1765-1770,despotic,7,0.788323,3
despotism,1765-1770,slavery,6,0.812143,4
despotism,1765-1770,tyrannical,5,0.754684,5
despotism,...,...,...,...,...
despotism,1885-1890,emancipation,1,0.692456,21
despotism,1885-1890,prussian,1,0.696914,22
despotism,1885-1890,degradation,1,0.697513,23
despotism,1885-1890,catholics,1,0.699194,24


In [5]:
def _nbr_(objd):
    """Call nbr_word with the keyword arguments held in the dict `objd` (the per-item function nbr hands to pmap)."""
    return nbr_word(**objd)

def nbr(word_or_words,period_or_periods=None,run_or_runs=None,max_rank=NBR_MAX_RANK,num_runs=10,
        num_proc=1,force=False,progress=True,cache_only=False,cache=True,use_threads=False,
        pmap_use_cache=False):
    """
    Gather neighbor tables for one or more words by mapping nbr_word over them.

    With several words the parallelism and progress bar are applied across
    words (each nbr_word call then runs single-process and silent); with a
    single word they are handed down to nbr_word instead.

    :param word_or_words: a string (split with tokenize_fast) or an iterable of words
    :param period_or_periods: forwarded to nbr_word
    :param run_or_runs: forwarded to nbr_word
    :param max_rank: forwarded to nbr_word
    :param num_runs: forwarded to nbr_word
    :param num_proc: number of workers (across words, or inside the single word)
    :param force: forwarded to nbr_word
    :param progress: show progress output
    :param cache_only: forwarded to nbr_word (results come back as empty frames)
    :param cache: forwarded to nbr_word
    :param use_threads: forwarded to pmap
    :param pmap_use_cache: forwarded to nbr_word
    :return: concatenation of the per-word frames, or an empty DataFrame
    """
    words=tokenize_fast(word_or_words) if isinstance(word_or_words,str) else list(word_or_words)
    many_words=len(words)>1
    objs=[
        dict(
            word=word,
            period_or_periods=period_or_periods,
            run_or_runs=run_or_runs,
            num_runs=num_runs,
            num_proc=1 if many_words else num_proc,
            progress=False if many_words else progress,
            # pass the caller's value through (was hard-coded to NBR_MAX_RANK)
            max_rank=max_rank,
            cache=cache,
            cache_only=cache_only,
            force=force,
            pmap_use_cache=pmap_use_cache
        ) for word in words
    ]
    o=pmap(
        _nbr_,
        objs,
        num_proc=num_proc if many_words else 1,
        progress=progress if many_words else False,
        desc='Gathering word neighborhoods',
        use_threads=use_threads
    )
    # pd.concat raises if every item is None; drop such placeholders first
    # (presumably what pmap yields for a failed job -- TODO confirm)
    o=[df for df in o if df is not None]
    return pd.concat(o) if len(o) else pd.DataFrame()


In [6]:
# Several words at once: nbr spreads the words over 4 workers and runs each nbr_word call single-process.
nbr(['virtues','representations','virtue','territories','kingdoms'],num_proc=4)

Gathering word neighborhoods [x4]: 100%|██████████| 5/5 [00:00<00:00, 253.33it/s]


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count,cdist,rank
word,period,neighbor,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
virtues,1720-1725,wisdom,10,0.923790,1
virtues,1720-1725,esteem,9,0.940699,2
virtues,1720-1725,piety,9,0.953068,3
virtues,1720-1725,glorious,9,1.003024,4
virtues,1720-1725,qualities,8,0.882740,5
...,...,...,...,...,...
kingdoms,1915-1920,treaties,4,0.809415,21
kingdoms,1915-1920,scotland,4,0.812085,22
kingdoms,1915-1920,groups,3,0.729139,23
kingdoms,1915-1920,province,3,0.740160,24


## Speed tests

The fastest configuration in the timing runs below was:

```python
%%timeit
nbr(random.sample(get_valid_words(),10),num_proc=6,cache_only=True)
```

It averaged roughly 2.5 seconds per word.

In [7]:
# @interact
# def spw(spw=2.5,numword=len(get_valid_words())):
#     print(f'ETA: {round( (spw * numword) / 60 / 60,1)} hours')

In [8]:
# %%timeit
# nbr(random.sample(get_valid_words(),10),num_proc=6,cache_only=True)

In [9]:
# %%timeit
# nbr(random.sample(get_valid_words(),10),num_proc=6,cache_only=True,pmap_use_cache=True)

In [10]:
# %%timeit
# nbr(random.sample(get_valid_words(),10),num_proc=4,cache_only=True)

In [11]:
# %%timeit
# nbr(random.sample(get_valid_words(),10),num_proc=1,cache_only=True)

In [12]:
# %%timeit
# nbr(random.sample(get_valid_words(),10),num_proc=4,cache_only=True,use_threads=True)

In [13]:
# %%timeit
# nbr(random.sample(get_valid_words(),10),num_proc=8,cache_only=True,use_threads=True)

## Importing older data

In [14]:
def load_older_data_neighbs(ifn=FN_ALL_NEIGHBS):
    """
    Load a previously generated neighbor table and reshape it.

    Rows are kept when their `period` string satisfies
    "YMIN" <= period < "YMAX" (a string comparison inside the query).
    `cdist` is derived as 1 - csim, `num_runs` is set to 10 and `count` is
    converted to int.

    :param ifn: path handed to read_df
    :return: DataFrame indexed by (word, neighbor, period) with columns
        count, num_runs, cdist, rank
    """
    df=read_df(ifn).query(f'"{YMIN}"<=period<"{YMAX}"')
    # .assign builds a new frame instead of writing into the filtered slice,
    # which avoids pandas' SettingWithCopyWarning
    df=df.assign(
        cdist=1-df['csim'],
        num_runs=10,
        count=df['count'].apply(int)
    )
    return df.set_index(['word','neighbor','period'])[['count','num_runs','cdist','rank']]

In [15]:
# dfold=load_older_data_neighbs()
# dfold

In [16]:
def import_older_data_neighbs(df):
    """
    Copy an older neighbor table into the 'nbr' veclib, one entry per word.

    Entries that already exist are left untouched. Writes are committed
    manually: after every 100th word and once more at the end.

    NOTE(review): the key built here has no `max_rank=` component, unlike the
    key nbr_word looks up, so nbr_word will not find these entries -- confirm
    whether that is intended.

    :param df: frame that can be grouped by 'word'
    """
    with get_veclib('nbr',autocommit=False) as vl:
        per_word=df.groupby('word')
        for n,(word,wdf) in enumerate(tqdm(per_word)):
            key=f'{word},ymin={YMIN},ymax={YMAX},ybin={YEARBIN}'
            if key not in vl:
                vl[key]=wdf
            # periodic commit (skipping n == 0)
            if n>0 and n%100==0:
                vl.commit()
        vl.commit()

In [17]:
# import_older_data_neighbs(dfold)

## Regenerating data

In [18]:
# Shuffle the valid words in place, then show how many there are and the first ten.
words=get_valid_words()
random.shuffle(words)
len(words),words[:10]

(5911,
 ['assassin',
  'action',
  'chancellor',
  'enthusiast',
  'glen',
  'cur',
  'versification',
  'hammer',
  'salt',
  'epidemic'])

In [19]:
# Fill the cache for every word; with cache_only=True each nbr_word call returns an empty frame instead of its data.
nbr(words,num_proc=4,cache_only=True,pmap_use_cache=False)

Gathering word neighborhoods [x4]:   9%|▉         | 560/5911 [17:36<7:08:58,  4.81s/it] [Koselleck] (19:52:59) !! Ran out of input (+1061.4s)
Gathering word neighborhoods [x4]:  10%|▉         | 577/5911 [17:53<3:41:47,  2.49s/it][Koselleck] (19:53:26) !! not enough values to unpack (expected 2, got 0) (+27.1s)
Gathering word neighborhoods [x4]:  16%|█▌        | 951/5911 [30:16<2:36:23,  1.89s/it] [Koselleck] (20:05:38) !! not enough values to unpack (expected 2, got 0) (+1820.9s)
Gathering word neighborhoods [x4]:  26%|██▌       | 1510/5911 [49:09<2:58:36,  2.44s/it] [Koselleck] (20:24:35) !! not enough values to unpack (expected 2, got 0) (+2957.7s)
Gathering word neighborhoods [x4]:  28%|██▊       | 1643/5911 [52:18<2:59:39,  2.53s/it][Koselleck] (20:27:36) !! not enough values to unpack (expected 2, got 0) (+3139.0s)
Gathering word neighborhoods [x4]:  28%|██▊       | 1674/5911 [53:15<3:44:29,  3.18s/it][Koselleck] (20:28:38) !! not enough values to unpack (expected 2, got 0) (+243.

In [1]:
## Plot neighbors

In [2]:
def plot_nbrs(
        word='station',
        vnum='v18-newnbr-20y-5-v-20',
        min_periods=2,
        max_rank=20,
        max_rank1=5,
        width=8,
        height=6,
        save=False,
        showdata=False,
        year_min=YMIN,
        year_max=YMAX,
        ybin=20,
        num_proc=2):
    """
    Plot how the ranked neighbors of `word` change over time.

    Neighbor ranks from nbr() are averaged within `ybin`-wide periods. A
    neighbor is drawn if it reaches an (averaged) rank <= `max_rank1` in at
    least one binned period and has a rank in at least `min_periods` of them.
    Within each period the remaining neighbors are re-ranked 1..k (`rank3`),
    which becomes the reversed y axis.

    :param word: word to plot
    :param vnum: tag embedded in the saved file name
    :param min_periods: minimum number of binned periods a neighbor must appear in
    :param max_rank: upper bound applied to the original rank and to `rank3`
    :param max_rank1: rank a neighbor must reach at least once to be shown
    :param width: figure width
    :param height: figure height
    :param save: write the figure as PNG (see the paths at the end)
    :param showdata: display the plotted frame
    :param year_min: lower bound; compared as a string against `period`
    :param year_max: upper bound (exclusive); compared as a string against `period`
    :param ybin: bin width passed to periodize
    :param num_proc: passed to nbr
    :return: the plotnine figure, or the string '!! <word> not in data' when
        nbr() returns nothing
    """
    w=word
    dfres=nbr(w,num_proc=num_proc,progress=False)
    if not len(dfres):
        return f'!! {w} not in data'

    wdf=dfres.loc[w].reset_index()
    wdf=wdf.query(f'period>="{year_min}"')
    wdf=wdf.query(f'period<"{year_max}"')
    # copy so the column assignments below do not write into a filtered slice
    wdf=wdf[wdf['rank']<=max_rank].copy()

    # re-bin: the start year of each original period decides its coarser bin
    wdf['period_orig']=wdf['period']
    wdf['period']=[periodize(int(y.split('-')[0]),ybin) for y in wdf.period_orig]
    # numeric_only: period_orig is a string column and cannot be averaged
    wdf=wdf.groupby(['period','neighbor']).mean(numeric_only=True).reset_index()
    words_ever_high=set(wdf[wdf['rank']<=max_rank1].neighbor)

    figdf=wdf[wdf.neighbor.isin(words_ever_high)]
    figdf=figdf.groupby(['neighbor','period']).mean(numeric_only=True).reset_index()
    # pivot + melt yields one row for every neighbor x period combination
    # (rank is NaN where the neighbor is absent); keyword arguments are
    # required by pandas >= 2.0
    wdfp=figdf.pivot(index='neighbor',columns='period',values='rank')
    figdf=wdfp.reset_index().melt(id_vars=['neighbor'],value_name='rank')

    figdf=figdf.groupby('neighbor').filter(lambda gdf: len(gdf.dropna())>=min_periods)

    # re-rank within each period (ties broken by row order); NaN ranks stay
    # NaN and are removed by the comparison on the next line
    figdf=figdf.assign(rank3=figdf.groupby('period')['rank'].rank(method='first'))
    figdf=figdf[figdf.rank3<=max_rank]
    figdf=figdf.sort_values('rank')
    if showdata: display(figdf)

    # start fig
    fig=start_fig(
        figdf,
        x='period',
        y='rank3',
        label='neighbor',
        figure_size=(width,height)
    ) + p9.geom_line(p9.aes(group='neighbor'),size=0.5) + p9.scale_y_reverse()

    # the earliest period of each neighbor gets a highlighted label
    first_appearances=figdf.sort_values('period').drop_duplicates('neighbor')
    # drop-by-label replaces .loc[set(...)], which pandas >= 2.0 rejects
    other_appearances=figdf.drop(first_appearances.index)

    fig+=p9.geom_label(size=6.5,data=other_appearances)
    fig+=p9.geom_label(size=6.5,fill='#BBBBBB',fontweight='bold',data=first_appearances)

    fig+=p9.scale_fill_gradient(low='white',high='#999999')
    fig+=p9.labs(
        title=f'Changing associations of "{w}"',
        x='Date of semantic model',
        y=f'Proximity to "{w}" →',
        fill='Concreteness'
    )
    if save:
        ofn=f'fig.changing_simple.{w}.{vnum}.png'
        fig.save(f'figures/{ofn}')
        # NOTE(review): machine-specific absolute path; fails on other machines
        fig.save(f'/home/ryan/Markdown/Drafts/TheGreatAbstraction/figures/{ofn}')
    return fig
    
    
# res=interact_manual(plot_nbrs)
# # res()

NameError: name 'YMIN' is not defined