## Distmat

In [83]:
from ipynb.fs.full.koselleck import *

In [95]:
def distmat(word,
            ymin=YMIN_DISTMAT,#_BIG,
            ymax=YMAX_DISTMAT,#_BIG,
            ybin=YBIN_DISTMAT,#_BIG,
            k=K,
            force=False,
            cache=True,
            **ldist_opts):
    """Return the period-by-period distance matrix for ``word``.

    The pairwise distances come from ``ldist``; each (period1, period2) row
    is mirrored so that the pivoted matrix is filled on both sides of the
    diagonal.

    Parameters
    ----------
    word : the word to look up (passed straight to ``ldist``).
    ymin, ymax, ybin, k : forwarded to ``ldist`` and used to build the
        cache key.
    force : when true, skip the cache lookup and recompute.
    cache : when true, read from / write to the ``'distmat'`` store.
    **ldist_opts : extra keyword arguments forwarded to ``ldist``.

    Returns
    -------
    A DataFrame indexed by ``period1`` with one column per ``period2`` and
    ``dist`` as cell values (or whatever object is stored under the cache
    key when a cached entry is found).

    NOTE(review): the cache key only contains word/ymin/ymax/ybin/k, so two
    calls that differ only in ``ldist_opts`` share one cache entry.
    """
    qstr=f'{word}/{ymin}-{ymax}_{ybin}/k={k}'
    if cache and not force:
        with get_db('distmat','r') as db:
            if qstr in db:
                return db[qstr]
    dfldist=ldist(word, ymin=ymin,ymax=ymax,ybin=ybin,k=k, **ldist_opts)
    idf=dfldist.reset_index()
    # Mirror every pair so both (a, b) and (b, a) are present before pivoting.
    # pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x.
    mirrored=idf.assign(period1=idf.period2, period2=idf.period1)
    idf=pd.concat([idf, mirrored])
    # Keyword arguments: positional arguments to pivot were dropped in pandas 2.x.
    dfdistmat=idf.pivot(index='period1',columns='period2',values='dist')
    if cache:
        with get_db('distmat','w') as db: db[qstr]=dfdistmat
    return dfdistmat
get_historical_semantic_distance_matrix = distmat

In [96]:
# distmat??

In [98]:
# distmat('value',force=True)

In [49]:
# vl=get_valid_words()[:10]
# vl

In [50]:
# distmat('long',ymin=1680,ymax=1960)

In [51]:
def gen_distmats_(objd):
    """Worker wrapper around ``distmat`` that never raises.

    ``objd`` is a mapping of keyword arguments for ``distmat``. If the call
    fails with any ``Exception``, an empty DataFrame is returned instead.
    """
    result = pd.DataFrame()
    try:
        result = distmat(**objd)
    except Exception:
        # Deliberate best-effort: a failing word yields the empty frame above.
        pass
    return result
    

def gen_distmats(
        words=None,
        ymin=YMIN_DISTMAT,
        ymax=YMAX_DISTMAT,
        ybin=YBIN_DISTMAT,
        k=K,
        num_proc=1,
        force=False,
        shuffle=True,
        lim=None,
        **ldist_opts):
    """Compute distance matrices for many words and store them in the 'distmat' db.

    Parameters
    ----------
    words : words to process; when falsy, ``get_valid_words()`` is used.
        The value is normalised through ``to_words``.
    ymin, ymax, ybin, k : forwarded to ``distmat`` and used in the store key.
    num_proc : forwarded to ``pmap_iter``.
    force : when true, words already present in the store are recomputed too.
    shuffle : when true, the word list is shuffled in place before processing.
    lim : keep only the first ``lim`` pending jobs (``None`` keeps all).
    **ldist_opts : extra keyword arguments forwarded to ``distmat``.

    Returns nothing; results are written to the store. Whatever
    ``gen_distmats_`` returns for a word (including its empty frame on
    failure) is what gets stored.
    """
    if not words:
        words = get_valid_words()
    words = to_words(words)
    if shuffle:
        random.shuffle(words)

    def cache_key(w):
        return f'{w}/{ymin}-{ymax}_{ybin}/k={k}'

    with get_db('distmat',mode='c') as db:
        pending = []
        for word in words:
            if force or cache_key(word) not in db:
                # Workers compute only; this function does the writing.
                pending.append(dict(
                    word=word,
                    ymin=ymin, ymax=ymax, ybin=ybin, k=k,
                    cache=False,force=True,
                    **ldist_opts
                ))
        jobs = pending[:lim]
        results = pmap_iter(gen_distmats_,jobs, num_proc=num_proc)
        # presumably pmap_iter yields results in job order — zip relies on it
        for count,(job,frame) in enumerate(zip(jobs,results)):
            db[cache_key(job['word'])] = frame
            # Commit every tenth stored result (never on the very first one).
            if count and count % 10 == 0:
                db.commit()
        db.commit()

In [52]:
# YMIN_DISTMAT_BIG = 1680
# YMAX_DISTMAT_BIG = 1960
# YBIN_DISTMAT_BIG = 5

In [48]:
# Recompute and store the distance matrices of every valid word using the
# *_BIG year-range settings. force=True means words already in the store are
# recomputed as well; num_proc=7 is forwarded to pmap_iter.
gen_distmats(
    get_valid_words(),
    ymin=YMIN_DISTMAT_BIG,
    ymax=YMAX_DISTMAT_BIG,
    ybin=YBIN_DISTMAT_BIG,
    shuffle=False,
    num_proc=7,
    force=True
)

In [None]:
# NOTE(review): `goodjob` is not defined anywhere in the visible source; if it
# is not provided by the star import, evaluating it raises NameError, which
# presumably serves as a deliberate stop for "Run All" — confirm.
goodjob

In [7]:
def distmat_(objd):
    """Call ``distmat`` with the keyword arguments packed in the mapping ``objd``."""
    return distmat(**objd)

def distmats(words,num_proc=1,progress=True,**distmat_opts):
    """Return the cell-wise mean of the distance matrices of several words.

    Parameters
    ----------
    words : words to average over; normalised through ``to_words``.
    num_proc, progress : forwarded to ``pmap_iter``.
    **distmat_opts : extra keyword arguments forwarded to ``distmat`` for
        every word (each call additionally receives ``progress=False``).

    Returns
    -------
    ``None`` when there are no words, the single matrix unchanged when there
    is exactly one, otherwise a DataFrame holding, for every index label and
    column, the mean over all matrices that have a value there.

    The matrices are stacked once and averaged in a single pass. The previous
    incremental version averaged the running mean with each new matrix, which
    gave the last word a weight of 1/2, the one before 1/4, and so on, and
    made the result depend on word order.
    """
    objs=[dict(word=w, progress=False, **distmat_opts) for w in to_words(words)]
    frames=list(pmap_iter(distmat_, objs, progress=progress, num_proc=num_proc))
    if not frames:
        return None
    if len(frames)==1:
        return frames[0]
    stacked=pd.concat(frames)
    # Rows sharing an index label come from different words; average them.
    return stacked.groupby(stacked.index).mean()

## Plot

In [8]:
def plot_distmat(distdf,xcol='period1',ycol='period2',value_name='lnm_perc',
                 use_color=False,xlim=None,ylim=None,title='Distance matrix',ofn=None,force=False,
                 invert=False,**y):
    """Draw a distance matrix as a tile plot.

    Parameters
    ----------
    distdf : wide matrix; after ``reset_index()`` it must expose ``xcol`` as
        a column. # assumes the melted variable column is named ``ycol`` — TODO confirm
    xcol, ycol : names used for the x and y factors.
    value_name : name given to the melted value column and used as fill.
    use_color : use the diverging distiller scale instead of greyscale.
    xlim, ylim : accepted but not used by this function.
    title : plot title.
    ofn : optional file name; when given the figure is saved under
        ``PATH_FIGS`` (and ``PATH_FIGS2`` if set) and the path is returned.
    force : when false and the target file already exists, return its path
        without redrawing.
    invert : flip the greyscale gradient (only when ``use_color`` is false).
    **y : forwarded to ``start_fig``.

    Returns the saved file path when ``ofn`` is given, otherwise the figure.
    """
    out_path = os.path.join(PATH_FIGS,ofn) if ofn else None
    if out_path is not None and not force and os.path.exists(out_path):
        return out_path

    # Wide -> long: one row per (xcol, variable) cell.
    long_df = distdf.reset_index().melt(id_vars=[xcol],value_name=value_name)

    fig = start_fig(
        long_df,
        x=f'factor({xcol})',
        y=f'factor({ycol})',
        fill=value_name,
        **y
    )
    fig += p9.geom_tile()

    if use_color:
        fig += p9.scale_fill_distiller(type='div',palette=5)
    elif invert:
        fig += p9.scale_fill_gradient(low='#111111',high='#FFFFFF')
    else:
        fig += p9.scale_fill_gradient(high='#111111',low='#FFFFFF')

    fig += p9.theme(axis_text_x=p9.element_text(angle=90))
    fig += p9.labs(
        x='Date of semantic model',
        y='Date of semantic model',
        fill='Semantic distance\n(LNM percentile)',
        title=title
    )

    if out_path is None:
        return fig

    fig.save(out_path)
    if PATH_FIGS2:
        fig.save(os.path.join(PATH_FIGS2,ofn))
    return out_path


def plot_historical_semantic_distance_matrix(words,save=False,force=False,force_data=False,dist_key='lnm',interpolate=False,vnum='v35',**y):
    """Plot the historical-semantic distance matrix for ``words``.

    Parameters
    ----------
    words : a single word (string) or an iterable of words; passed unchanged
        to ``distmat``.
    save : when true, the figure is written to
        ``fig.<words>.distmat.<vnum>.png`` and shown via ``print_img``.
    force : forwarded to ``plot_distmat`` (redraw even if the file exists).
    force_data : forwarded to ``distmat`` as ``force`` (recompute the data).
    dist_key, interpolate : forwarded to ``distmat``.
    vnum : version tag embedded in the output file name.
    **y : extra keyword arguments forwarded to ``distmat``.

    Returns ``print_img(path)`` when ``save`` is true, otherwise the figure.
    """
    # isinstance (not type()==str) so that str subclasses such as numpy.str_
    # are treated as one word rather than joined character by character.
    if isinstance(words, str):
        wstr = wstr2 = words.strip()
    else:
        wstr = '-'.join(words)
        wstr2 = ', '.join(words)
    res=plot_distmat(
        distmat(
            words,
            dist_key=dist_key,
            interpolate=interpolate,
            force=force_data,
            **y
        ),
        figure_size=(8,8),
        ofn=f'fig.{wstr}.distmat.{vnum}.png' if save else None,
        title=f'Historical-semantic distance matrix for ‘{wstr2}’',
        force=force
    )
    return print_img(res) if save else res



In [9]:
# plot_historical_semantic_distance_matrix('culture',force=True,k=10,ymin=1700,ymax=1900,ybin=5,num_proc=4)

In [10]:
def gettodaysdate():
    """Return the current local date formatted as ``YYYY-MM-DD``."""
    date_format = '%Y-%m-%d'
    return time.strftime(date_format)

In [11]:
def iplot_historical_semantic_distance_matrix(w='culture', **attrs):
    """Open the interactive (manual) version of the distance-matrix plot.

    Builds a set of default controls for
    ``plot_historical_semantic_distance_matrix`` with ``w`` as the word, lets
    ``attrs`` override any of them, and hands everything to
    ``interact_manual_plot``, whose result is returned.
    """
    settings = dict(
        words=w,
        force=True,
        dist_key=fixed('lnm'),
        k=10,
        ymin=1720,
        ymax=1900,
        ybin=5,
        force_data=False,
        vnum=gettodaysdate(),
        save=False,
        num_runs=3,
        num_proc=3,
    )
    # Caller-supplied values win over the defaults above.
    settings.update(attrs)
    return interact_manual_plot(plot_historical_semantic_distance_matrix, **settings)

In [12]:
# res=iplot_historical_semantic_distance_matrix('liberty')

## Correlating distance matrices

In [13]:
from skbio.stats.distance import mantel

In [14]:
# %%timeit
# x=distmat('culture').fillna(0)
# y=distmat('station').fillna(0)
# z=distmat('virtue').fillna(0)

In [15]:
# %%timeit
# corr_x2y = mantel(x, y, method='pearson', permutations=100, alternative='two-sided', strict=False, lookup=None)

In [16]:
# corr_x2z = mantel(x, z, method='pearson', permutations=999, alternative='two-sided', strict=False, lookup=None)
# corr_y2z = mantel(y, z, method='pearson', permutations=999, alternative='two-sided', strict=False, lookup=None)

In [17]:
# xs,ys,zs='culture','station','virtue'

# print(xs,ys,corr_x2y)
# print(xs,zs,corr_x2z)
# print(ys,zs,corr_y2z)

In [18]:
def corr_distmats(dist1,dist2):
    """Correlate two distance matrices with ``mantel``.

    Missing cells in either matrix are replaced by 0 before the test, which
    runs with the Pearson method, 100 permutations and ``strict=False``.

    Returns a dict with keys ``corr``, ``p_value`` and ``n`` taken from the
    three values ``mantel`` returns, or an empty dict if a ``ValueError`` is
    raised along the way.
    """
    try:
        coef, pval, size = mantel(
            dist1.fillna(0),
            dist2.fillna(0),
            method='pearson',
            permutations=100,
            strict=False
        )
    except ValueError:
        return {}
    return {'corr': coef, 'p_value': pval, 'n': size}

In [19]:
# corr_distmats(
#     distmat('culture'),
#     distmat('slave')
# )

In [20]:
# distmat('culture')

In [21]:
def do_corr_distmat_words(obj):
    """Worker: correlate a precomputed matrix with the matrix of another word.

    ``obj`` is ``(distmat1, word1, word2)`` or
    ``(distmat1, word1, word2, distmat_opts)``; the optional fourth item is a
    dict of keyword arguments used when computing ``word2``'s matrix, so that
    both matrices are built with the same settings.

    Returns a dict that always has ``word1`` and ``word2`` and, when the
    correlation succeeds, the keys produced by ``corr_distmats``.
    """
    distmat1,word1,word2,*extra = obj
    opts = extra[0] if extra else {}
    odx={'word1':word1, 'word2':word2}
    try:
        odx={**odx, **corr_distmats(distmat1,distmat(word2,**opts).fillna(0))}
    except Exception:
        # Deliberate best-effort: a failing pair keeps only its two words.
        pass
    return odx

def corr_distmat_words(word,words=None,num_proc=1,
                       ymin=YMIN_DISTMAT, ymax=YMAX_DISTMAT, ybin=YBIN_DISTMAT, k=K,
                       force=False,progress=True,
                       **distmat_opts):
    """Correlate ``word``'s distance matrix with those of other words.

    Results are kept in the ``'corrs'`` store under a key built from the
    sorted word pair and ymin/ymax/ybin/k; pairs already present are not
    recomputed unless ``force`` is true.

    Parameters
    ----------
    word : the reference word.
    words : words to compare against (``get_keywords()`` when ``None``);
        ``word`` itself is removed from the list.
    num_proc, progress : forwarded to ``pmap_iter``.
    ymin, ymax, ybin, k, **distmat_opts : settings used for BOTH distance
        matrices. Previously they only appeared in the store key while the
        matrices were always computed with ``distmat``'s defaults, so
        non-default calls stored default-range results under the wrong key.
    force : recompute pairs even if they are already stored.

    Returns
    -------
    A DataFrame with one row per stored pair, or an empty DataFrame when the
    reference matrix cannot be computed.
    """
    words=[w for w in (get_keywords() if words is None else words) if w!=word]
    words_todo=words
    def to_qstr(w):
        # Sorted so that (a, b) and (b, a) share one entry.
        w1,w2=sorted([word,w])
        return f'distmat/{w1}/{w2}/{ymin}-{ymax}_{ybin}/k={k}'

    if not force:
        with get_db('corrs') as db:
            words_todo=[w for w in words if to_qstr(w) not in db]

    if words_todo:
        opts=dict(ymin=ymin, ymax=ymax, ybin=ybin, k=k, **distmat_opts)
        try:
            distmat1=distmat(word,progress=False,**opts).fillna(0)
        except Exception:
            # Without the reference matrix nothing can be correlated.
            return pd.DataFrame()

        objs=[(distmat1,word,w,opts) for w in words_todo]
        iterr=pmap_iter(do_corr_distmat_words, objs, num_proc=num_proc, progress=progress)
        with get_db('corrs','c') as db:
            # presumably pmap_iter yields results in input order — zip relies on it
            for w,res in zip(words_todo,iterr):
                db[to_qstr(w)]=res
            db.commit()

    with get_db('corrs','r') as db:
        corr_df=pd.DataFrame([db[to_qstr(x)] for x in words if to_qstr(x) in db])
    return corr_df

In [22]:
# corr_distmat_words('polish',['vice','honesty','door','virtue'],force=False)

In [23]:
# cordf_slv=corr_distmat_words('slave',words=get_keywords(),num_proc=2)
# cordf_slv

In [24]:
# cordf_sta=corr_distmat_words('station',num_proc=4)
# cordf_sta

In [25]:
# cordf_dem=corr_distmat_words('demand',num_proc=4)
# cordf_dem

In [26]:
def corr_distmat_pairwise(words=None,progress=True,**opts):
    """Correlate the distance matrices of every unordered pair of ``words``.

    ``words`` defaults to ``get_keywords()`` when falsy. The list is copied
    and shuffled, then each word is compared (via ``corr_distmat_words``)
    with the words that sort after it, so each pair is handled once.
    ``progress`` toggles the outer tqdm bar; ``opts`` are forwarded to
    ``corr_distmat_words``.

    Returns the concatenation of the per-word result frames, or an empty
    DataFrame when there are no words.
    """
    if not words:
        words = get_keywords()
    words = list(words)
    random.shuffle(words)

    frames = []
    bar = tqdm(words,position=0,desc='Computing pairwise comparisons',disable=not progress)
    for w1 in bar:
        partners = [w2 for w2 in words if w1<w2]
        frames.append(corr_distmat_words(w1,partners,progress=False,**opts))

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)

In [130]:
# Keep only the significant-novelty words whose tag in lltk's word->POS
# mapping is 'nn1'.
w2pos = lltk.get_word2pos()
wl = [
    word
    for word in get_signif_novelty_words()
    if w2pos.get(word) == 'nn1'
]
#len(wl),'culture' in set(wl)

In [131]:
#wl=random.sample(list(get_signif_novelty_words()),10)
# wl=get_keywords()
# df=corr_distmat_pairwise(wl,num_proc=4)

In [132]:
# stop

In [133]:
# Rank the pairwise correlations and build a symmetric word-by-word matrix.
# NOTE(review): `df` is expected to come from corr_distmat_pairwise; the cell
# that assigns it is commented out in this view.
df=df.dropna().sort_values('corr',ascending=False)
df['corr_rank']=df['corr'].rank(ascending=False)   # 1 = highest correlation
df['corr_perc']=df['corr'].rank(ascending=True) / len(df) * 100
# Mirror every pair so each word appears as word1 as well as word2.
df2=df.assign(word1=df.word2, word2=df.word1)
# pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x.
dfboth=pd.concat([df,df2])
# Keyword arguments: positional arguments to pivot were dropped in pandas 2.x.
dfpiv=dfboth.pivot(index='word1',columns='word2',values='corr')

In [65]:

def draw_bokeh(G,
    title='Networkx Graph', 
    save_to=None,
    color_by=None,
    size_by=None,
    default_color='skyblue',
    default_size=15,
    min_size=5,
    max_size=30,
    show_labels=True,
    notebook=True,
):
    """Render a networkx graph as an interactive Bokeh plot (spring layout).

    Parameters
    ----------
    G : networkx graph. When ``size_by`` is used, a ``'_size'`` attribute is
        written onto every node of ``G`` (the caller's graph is modified).
    title : plot title.
    save_to : optional file name passed to bokeh's ``save``.
    color_by : value used as the node ``fill_color``; ``default_color`` is
        used when it is ``None``. # presumably a node-attribute name — confirm
    size_by : node attribute whose values (missing -> 0) are rescaled
        linearly into ``[min_size, max_size]`` to size the nodes.
    default_color, default_size : used when no ``color_by`` / ``size_by`` is
        given; ``default_size`` is also used for every node when all
        ``size_by`` values are equal (no spread to rescale).
    min_size, max_size : bounds of the rescaled node size.
    show_labels : draw each node's name next to it.
    notebook : call ``output_notebook()`` before plotting.

    Returns nothing; the plot is shown and optionally saved.
    """
    import networkx as nx
    from bokeh.io import output_notebook, show, save
    from bokeh.models import Range1d, Circle, ColumnDataSource, MultiLine, LabelSet
    from bokeh.plotting import figure, from_networkx
    if notebook: output_notebook()

    # Categories shown when hovering over a node.
    hover_tooltips = [("ID", "@index")]

    plot = figure(
        tooltips=hover_tooltips,
        tools="pan,wheel_zoom,save,reset,point_draw",
        active_scroll='wheel_zoom',
        x_range=Range1d(-10.1, 10.1),
        y_range=Range1d(-10.1, 10.1),
        title=title
    )

    # Node size: either one constant or a per-node '_size' attribute.
    size_opt = default_size
    if size_by is not None and G.number_of_nodes():
        size_opt = '_size'
        raw = np.array([d.get(size_by, 0) for _, d in G.nodes(data=True)], dtype=float)
        lowest = raw.min()
        spread = raw.max() - lowest
        if spread > 0:
            # Map [min, max] of the attribute onto [min_size, max_size].
            # (Multiplying by max_size alone overshot up to min_size+max_size.)
            scaled = min_size + (max_size - min_size) * (raw - lowest) / spread
        else:
            # All values equal (or not comparable): avoid dividing by zero.
            scaled = np.full(len(raw), float(default_size))
        for node, node_size in zip(G.nodes(), scaled):
            G.nodes[node]['_size'] = float(node_size)

    # Lay the graph out with networkx's spring layout, scaled to the plot range.
    network_graph = from_networkx(G, nx.spring_layout, scale=10, center=(0, 0))

    network_graph.node_renderer.glyph = Circle(
        size=size_opt, 
        fill_color=color_by if color_by is not None else default_color
    )
    network_graph.edge_renderer.glyph = MultiLine(line_alpha=0.5, line_width=1)
    plot.renderers.append(network_graph)

    if show_labels:
        layout = network_graph.layout_provider.graph_layout
        if layout:
            # Take names and positions from the same mapping so each label is
            # guaranteed to sit on its own node.
            names = list(layout.keys())
            xs, ys = zip(*layout.values())
            source = ColumnDataSource({'x': xs, 'y': ys, 'name': names})
            labels = LabelSet(x='x', y='y', text='name', source=source, background_fill_color='white', text_font_size='10px', background_fill_alpha=.7)
            plot.renderers.append(labels)

    show(plot)
    if save_to: save(plot, filename=save_to)

In [134]:
def plot_distmat_nx(df):
    """Return an 800x800 plotly heatmap of the matrix ``df`` in reversed greyscale.

    Rows of ``z`` correspond to ``y`` and columns to ``x`` in a plotly
    Heatmap, so the frame's index labels the y axis and its columns label the
    x axis. (For a symmetric matrix whose index equals its columns this looks
    the same as before; for any other frame the axes were mislabelled.)
    """
    heatmap = go.Heatmap(
        z=df,
        x=df.columns,
        y=df.index,
        colorscale='gray',
        reversescale=True,
    )
    fig = go.Figure(data=heatmap)
    fig.update_layout(width=800,height=800)
    return fig

In [142]:
# fig = ff.create_dendrogram(dfpiv.fillna(0), orientation='left', labels=dfpiv.index)
# fig.update_layout(width=800, height=1600)
# fig.show()

In [158]:
# For every word1 keep its single best-ranked partner among the pairs with
# p_value <= 0.05, then list the result alphabetically by word1.
gdf = (
    dfboth[dfboth.p_value<=0.05]
    .sort_values('corr_rank')
    .groupby('word1')
    .head(1)
    .sort_values('word1')
)
signw=set(get_signif_novelty_words())
def decide_if_sign(w):
    """Return ``w`` with a trailing '*' when it is in ``signw``, else unchanged."""
    if w in signw:
        return f'{w}*'
    return w
gdf['word1']=gdf.word1.apply(decide_if_sign)
gdf['word2']=gdf.word2.apply(decide_if_sign)
gdf

Unnamed: 0,word1,word2,corr,p_value,n,corr_rank,corr_perc
38,administration,constitution,0.837786,0.009901,36.0,23.0,99.630003
55,anarchy,mechanical,0.770644,0.009901,36.0,313.0,94.752775
92,aristocracy,creative,0.718545,0.009901,33.0,994.0,83.299697
14,authority,private,0.777652,0.009901,36.0,266.0,95.543222
117,balance,criticism,0.808694,0.009901,36.0,90.0,98.503195
...,...,...,...,...,...,...,...
143,violence,experience,0.756198,0.009901,36.0,452.0,92.415069
18,war,reputation,0.705535,0.009901,36.0,1215.0,79.582913
100,wealth,intellectual,0.788804,0.009901,36.0,188.0,96.855029
107,welfare,individual,0.790931,0.009901,36.0,172.0,97.124117


In [161]:
# gdf[gdf.word2.str.endswith('*')]

In [162]:
# Build a graph with one edge per row of gdf (word1 -> word2), carrying the
# correlation statistics as edge attributes.
g=nx.from_pandas_edgelist(
    gdf,
    source='word1',
    target='word2',
    edge_attr=['corr','p_value','n','corr_rank','corr_perc']
)


In [163]:
# Show the best-partner graph with draw_bokeh's default settings.
draw_bokeh(g)

In [147]:
# Display the row(s) of gdf whose word1 is exactly 'culture'.
gdf[gdf.word1=='culture']

Unnamed: 0,word1,word2,corr,p_value,n,corr_rank,corr_perc
155,culture,liberal,0.781376,0.009901,36.0,234.0,96.081399
