In [1]:
from ipynb.fs.full.koselleck import *

In [2]:



def get_historical_semantic_distance_matrix(
        words=None,
        df_dists=None,
        dist_key='lnm',
        ymin=YMIN_DISTMAT,
        ymax=YMAX_DISTMAT,
        ybin=YBIN_DISTMAT,
        k=K,
        interpolate=False,
        normalize=False,
        num_proc=1,
        progress=True,
        add_missing_periods=True,
        num_runs=NUM_RUNS_LNM,
        force=False,
        **lnm_opts):
    """Build a symmetric period-by-period matrix of percentile-ranked distances.

    Parameters
    ----------
    words :
        Forwarded to ``lnm`` as its first argument when ``df_dists`` is not
        supplied.
    df_dists : pandas.DataFrame, optional
        Precomputed pairwise distances. Must provide ``period1``, ``period2``
        and a ``dist_key`` column. When omitted, the frame is obtained from
        ``lnm(words, ...)``.
    dist_key : str
        Name of the distance column to average and rank. The matrix holds the
        derived ``f'{dist_key}_perc'`` values.
    ymin, ymax, ybin :
        Forwarded to ``lnm`` and to ``get_periods_bystep`` (the latter is used
        to enumerate the periods that should appear in the matrix).
    k, num_proc, num_runs, force, **lnm_opts :
        Forwarded unchanged to ``lnm``.
    interpolate : bool
        Fill gaps by interpolating along every row, then along every column
        (``limit_direction='both'``); exact zeros are then turned into NaN.
    normalize : bool
        Divide every column by its column sum.
    progress :
        Accepted for interface compatibility; not used by this function.
    add_missing_periods : bool
        Make sure every period returned by ``get_periods_bystep`` has a row
        and a column (filled with NaN when there is no data).

    Returns
    -------
    pandas.DataFrame
        Matrix indexed by ``period1`` with ``period2`` as columns. An empty
        DataFrame is returned if a ``KeyError`` is raised at any point of the
        computation (e.g. ``dist_key`` is not a column of the distances).
    """
    perc_key = f'{dist_key}_perc'
    try:
        # Honour a caller-supplied distance frame instead of always recomputing.
        if df_dists is None:
            df_dists = lnm(words, ymin=ymin, ymax=ymax, ybin=ybin, k=k, num_proc=num_proc, force=force,
                           num_runs=num_runs, **lnm_opts)

        # Average only the distance column: a frame-wide mean() raises on
        # non-numeric columns in recent pandas versions.
        odfi = df_dists.groupby(['period1', 'period2'])[dist_key].mean().reset_index()
        odfi[perc_key] = odfi[dist_key].rank(ascending=True) / len(odfi) * 100

        # Mirror each (a, b) pair as (b, a) so the matrix is symmetric. Pairs
        # that end up duplicated (e.g. a == b) are collapsed, otherwise
        # pivot() would refuse the duplicate entries.
        mirrored = odfi.assign(period1=odfi.period2, period2=odfi.period1)
        odfi = pd.concat([odfi, mirrored]).drop_duplicates(subset=['period1', 'period2'])

        odfp = odfi.pivot(index='period1', columns='period2', values=perc_key)

        if add_missing_periods:
            # Reindex on the full set of periods so that a period without any
            # data still gets a NaN row/column, even when it is the only one
            # missing.
            all_periods = pd.Index(list(set(get_periods_bystep(ymin=ymin, ymax=ymax, ybin=ybin))))
            full = odfp.index.union(odfp.columns).union(all_periods)
            odfp = odfp.reindex(index=full.rename('period1'), columns=full.rename('period2'))

        if interpolate:
            # Rows first, then columns; order matters because the second pass
            # sees the values filled in by the first.
            for idx in odfp.index:
                odfp.loc[idx] = odfp.loc[idx].interpolate(limit_direction='both')
            for col in odfp.columns:
                odfp[col] = odfp[col].interpolate(limit_direction='both')
            odfp = odfp.replace({0: np.nan})

        if normalize:
            # Column-wise share: each column is divided by its own sum.
            odfp = odfp / odfp.sum()

        return odfp
    except KeyError:
        # Best-effort contract kept from the original: signal "no data" with
        # an empty frame rather than propagating the lookup failure.
        return pd.DataFrame()


In [3]:
#get_historical_semantic_distance_matrix('culture',ymin=1700,ymax=1900,ybin=20)

In [4]:
# get_historical_semantic_distance_matrix('station').sort_values('1890-1895')

## Plot

In [5]:
def plot_distmat(distdf,xcol='period1',ycol='period2',value_name='lnm_perc',
                 use_color=False,xlim=None,ylim=None,title='Distance matrix',ofn=None,force=False,
                 invert=False,**y):
    """Draw a distance matrix as a tile heatmap and optionally save it.

    Parameters
    ----------
    distdf : pandas.DataFrame
        Wide matrix whose index is named ``xcol``; its columns become the
        ``ycol`` categories.
    xcol, ycol : str
        Names of the long-format columns mapped to the x and y axes.
    value_name : str
        Name given to the melted value column, mapped to the tile fill.
    use_color : bool
        If true use a diverging distiller palette, otherwise a grey gradient.
    invert : bool
        Swap the light and dark ends of the grey gradient (ignored when
        ``use_color`` is true).
    xlim, ylim :
        Accepted for interface compatibility; not used by this function.
    title : str
        Figure title.
    ofn : str, optional
        File name (joined onto ``PATH_FIGS``) to save the figure to.
    force : bool
        Re-render even if the output file already exists.
    **y :
        Extra keyword arguments forwarded to ``start_fig``.

    Returns
    -------
    The output path when ``ofn`` is given, otherwise the figure object
    returned by ``start_fig`` with the layers added.
    """
    ofnfn = None
    if ofn:
        ofnfn = os.path.join(PATH_FIGS, ofn)
        # Cached render: skip all work when the file is already on disk.
        if not force and os.path.exists(ofnfn):
            return ofnfn

    # Name the melted variable column explicitly so that `ycol` is honoured
    # even when the columns axis of `distdf` carries no (or another) name.
    distdfm = distdf.reset_index().melt(
        id_vars=[xcol],
        var_name=ycol,
        value_name=value_name,
    )

    fig = start_fig(
        distdfm,
        x=f'factor({xcol})',
        y=f'factor({ycol})',
        fill=value_name,
        **y
    )
    fig += p9.geom_tile()

    if use_color:
        fig += p9.scale_fill_distiller(type='div', palette=5)
    else:
        # Default: larger values are darker; `invert` flips the gradient.
        if invert:
            low, high = '#111111', '#FFFFFF'
        else:
            low, high = '#FFFFFF', '#111111'
        fig += p9.scale_fill_gradient(high=high, low=low)

    fig += p9.theme(
        axis_text_x=p9.element_text(angle=90)
    )
    fig += p9.labs(
        x='Date of semantic model',
        y='Date of semantic model',
        fill='Semantic distance\n(LNM percentile)',
        title=title
    )

    if ofn:
        fig.save(ofnfn)
        # Optional second copy of the figure in a mirror directory.
        if PATH_FIGS2:
            fig.save(os.path.join(PATH_FIGS2, ofn))
        return ofnfn

    return fig


def plot_historical_semantic_distance_matrix(words,save=False,force=False,force_data=False,dist_key='lnm',interpolate=False,vnum='v35',**y):
    """Compute the distance matrix for ``words`` and plot it.

    ``words`` may be a single string or an iterable of strings. Extra keyword
    arguments (``**y``) are forwarded to
    ``get_historical_semantic_distance_matrix``. ``force`` controls
    re-rendering of the figure, ``force_data`` is passed on as the matrix
    builder's ``force`` argument.

    When ``save`` is true the figure is written to a file whose name embeds
    the words and ``vnum``, and the result of ``print_img`` on the saved path
    is returned; otherwise the value returned by ``plot_distmat`` is returned.
    """
    # One label for the file name, one (more readable) for the title.
    if type(words)==str:
        file_label = words.strip()
        title_label = words.strip()
    else:
        file_label = '-'.join(words)
        title_label = ', '.join(words)

    matrix = get_historical_semantic_distance_matrix(
        words,
        dist_key=dist_key,
        interpolate=interpolate,
        force=force_data,
        **y
    )

    if save:
        out_name = f'fig.{file_label}.distmat.{vnum}.png'
    else:
        out_name = None

    plotted = plot_distmat(
        matrix,
        figure_size=(8,8),
        ofn=out_name,
        title=f'Historical-semantic distance matrix for ‘{title_label}’',
        force=force
    )

    if save:
        return print_img(plotted)
    return plotted



In [6]:
# plot_historical_semantic_distance_matrix('culture',force=True,k=10,ymin=1700,ymax=1900,ybin=5,num_proc=4)

In [7]:
def gettodaysdate():
    """Return the current local date formatted as ``YYYY-MM-DD``."""
    now = time.localtime()
    return time.strftime('%Y-%m-%d', now)

In [8]:
def iplot_historical_semantic_distance_matrix(w='culture', **attrs):
    """Open an interactive front-end for ``plot_historical_semantic_distance_matrix``.

    A set of default arguments is assembled (with ``w`` as the words and
    today's date as ``vnum``), overridden by any caller-supplied ``attrs``,
    and handed to ``interact_manual_plot``, whose result is returned.
    """
    # Defaults first so that caller overrides replace values in place while
    # keeping this key order.
    widget_args = dict(
        words=w,
        force=True,
        dist_key=fixed('lnm'),
        k=10,
        ymin=1720,
        ymax=1900,
        ybin=5,
        force_data=False,
        vnum=gettodaysdate(),
        save=False,
        num_runs=3,
        num_proc=3,
    )
    widget_args.update(attrs)
    return interact_manual_plot(plot_historical_semantic_distance_matrix, **widget_args)

In [9]:
# Launch the interactive distance-matrix controls, pre-filled with the word 'liberty'.
res=iplot_historical_semantic_distance_matrix('liberty')

interactive(children=(Text(value='liberty', description='words'), Checkbox(value=False, description='save'), C…