# Semantic shifts

In [1]:
from ipynb.fs.full.koselleck import *

[Koselleck] (14:15:11) Alles bereit


## Generating data

### Local neighborhood measure

In [20]:
def to_lnm(word1_period1,word2_period2,k=25,num_proc=1,cache=True,progress=True,qstr=None,dfnbr1=None,dfnbr2=None):
    """Measure the local-neighborhood distance between two word-period pairs.

    The first `k` rows of each neighbor table are pooled into one
    "meta-neighborhood"; both word-periods are then compared (through `cdist`
    and `cdist_dfpivs`) on their distances to those pooled neighbors.

    Parameters
    ----------
    word1_period1, word2_period2 : str or (word, period) pair
        Either a 'word_period' string (e.g. 'culture_1750-1755') or an
        already-split pair.
    k : int
        Number of leading rows taken from each neighbor table.
    num_proc, progress
        Forwarded to `to_nbr` and `cdist`.
    cache : bool
        When true, the result is looked up in / written to the 'lnm' veclib
        under `qstr`.
    qstr : str, optional
        Cache key; built from the words, periods and `k` when not supplied.
    dfnbr1, dfnbr2 : DataFrame, optional
        Precomputed neighbor tables; fetched with `to_nbr` when None.

    Returns
    -------
    DataFrame
        Indexed by (word1, word2, period1, period2, run1, run2) with columns
        lnm, mneighb_size, neighb1_size, neighb2_size. An empty DataFrame is
        returned (and nothing is cached) when a lookup raises KeyError or when
        no comparison rows are produced.
    """
    # rsplit on the last '_' so that a word containing '_' still unpacks into two parts
    word1,period1=word1_period1.rsplit('_',1) if isinstance(word1_period1,str) else word1_period1
    word2,period2=word2_period2.rsplit('_',1) if isinstance(word2_period2,str) else word2_period2

    if not qstr: qstr=f'{word1}_{period1},{word2}_{period2},k={k}'
    if cache:
        with get_veclib('lnm') as vl:
            if qstr in vl:
                print("Found in lnm db:",qstr)
                return vl[qstr]

    try:
        if dfnbr1 is None: dfnbr1=to_nbr(word1,period1,num_proc=num_proc,progress=progress)
        if dfnbr2 is None: dfnbr2=to_nbr(word2,period2,num_proc=num_proc,progress=progress)

        # Pool the leading k neighbors of both sides (pd.concat replaces the
        # DataFrame.append call that pandas 2.0 removed).
        # NOTE(review): lnm_word passes the same nbr_word(...) frame for every
        # period pair, and iloc[:k] takes the first k rows regardless of period.
        # Confirm the frames are expected to be limited to (word, period) here.
        dfmetanbr=pd.concat([dfnbr1.iloc[:k],dfnbr2.iloc[:k]])
        metaneighb=list(set(dfmetanbr.index.get_level_values('neighbor')))

        # dists
        dfcdist1=cdist(word1,period1,neighbors=metaneighb,progress=progress,num_proc=num_proc).reset_index()
        dfcdist2=cdist(word2,period2,neighbors=metaneighb,progress=progress,num_proc=num_proc).reset_index()
    except KeyError:
        # best-effort: a missing word/period yields an empty result
        return pd.DataFrame()

    # one pivot row per (word, period, run), one column per neighbor
    dfcdist1['word_period_run']=list(zip(dfcdist1.word,dfcdist1.period,dfcdist1.run))
    dfcdist2['word_period_run']=list(zip(dfcdist2.word,dfcdist2.period,dfcdist2.run))

    # keyword arguments: positional pivot arguments are not accepted by pandas >= 2.0
    dfpiv1=dfcdist1.pivot(index='word_period_run',columns='neighbor',values='cdist')
    dfpiv2=dfcdist2.pivot(index='word_period_run',columns='neighbor',values='cdist')

    dfpiv_dists = cdist_dfpivs(dfpiv1,dfpiv2)
    o=[
        {
            'word1':row.i1[0],
            'word2':row.i2[0],
            'period1':row.i1[1],
            'period2':row.i2[1],
            'run1':row.i1[2],
            'run2':row.i2[2],
            'lnm':row.cdist,
            'mneighb_size':row['size'],
            'neighb1_size':row.size1,
            'neighb2_size':row.size2,
        }
        for _,row in dfpiv_dists.iterrows()
    ]
    # nothing to compare: return empty instead of failing in set_index below
    if not o: return pd.DataFrame()

    index_cols=['word1','word2','period1','period2','run1','run2']
    odf=pd.DataFrame(o).set_index(index_cols)

    if cache:
        with get_veclib('lnm',autocommit=True) as vl:
            vl[qstr]=odf

    return odf

In [21]:
def cdist_dfpivs(dfpiv1,dfpiv2):
    """Compare two pivot tables row by row on the columns both rows have values for.

    Rows are paired positionally (`zip` over the two indexes, so extra rows in
    the longer table are ignored). For each pair, columns with a missing value
    on either side are dropped and `fastdist.cosine` is applied to what is left.

    Returns
    -------
    DataFrame
        One row per pair with columns:
        i1, i2  - the index labels of the paired rows
        cdist   - 1 minus the value returned by `fastdist.cosine`
        size1   - number of non-null columns in the row from `dfpiv1`
        size2   - number of non-null columns in the row from `dfpiv2`
        size    - number of columns non-null in both rows
    """
    o=[]
    for i1,i2 in zip(dfpiv1.index,dfpiv2.index):
        # stack the two rows (pd.concat replaces DataFrame.append, removed in pandas 2.0)
        dfcmp=pd.concat([dfpiv1.loc[[i1]],dfpiv2.loc[[i2]]])
        # keep only the columns present on both sides
        dfcmpd=dfcmp.dropna(axis=1)
        csim=fastdist.cosine(
            dfcmpd.iloc[0].values.astype(float),
            dfcmpd.iloc[1].values.astype(float)
        )
        odx={
            'i1':i1,
            'i2':i2,
            'cdist':1-csim,
            'size1':len(dfcmp.iloc[0].dropna()),
            'size2':len(dfcmp.iloc[1].dropna()),
            'size':len(dfcmpd.columns)
        }
        o.append(odx)
    return pd.DataFrame(o)
            

In [22]:
# words=get_all_nouns()
# word=random.choice(words)
# word

In [23]:
# odf=to_lnm(f'{word}_1750-1755', f'{word}_1850-1855')
# odf

In [24]:
# odf=to_lnm('culture_1750-1755','culture_1850-1855')
# odf

In [25]:
# odf=to_lnm('station_1750-1755','station_1850-1855')
# odf.groupby(['word1','word2','period1','period2']).mean()

In [26]:
# odf=to_lnm('god_1750-1755','god_1850-1855')
# odf.groupby(['word1','word2','period1','period2']).mean()

### Scaling up

In [27]:
def lnm_postproc(odf):
    """Collapse the per-run rows of one comparison into a single record.

    Parameters
    ----------
    odf : DataFrame
        Table with word1, word2, period1, period2, run1, run2 available after
        `reset_index()`, plus numeric value columns.

    Returns
    -------
    dict
        word1/word2/period1/period2 taken from the first row, plus the mean of
        every numeric column other than run1/run2. An empty dict is returned
        for an empty input.
    """
    # empty input: return a falsy record (the caller checks `if odx`)
    if not len(odf): return {}

    # drop(columns=...) and numeric_only=True keep this working on pandas >= 2.0,
    # where the positional axis argument and the implicit dropping of
    # non-numeric columns in mean() are no longer available
    odf_ac=odf.reset_index().drop(columns=['run1','run2'])
    first=odf_ac.iloc[0]
    odx={
        'word1':first.word1,
        'word2':first.word2,
        'period1':first.period1,
        'period2':first.period2,
        **dict(odf_ac.mean(numeric_only=True)),
    }
    return odx

In [28]:
# lnm_postproc(odf)

In [29]:
# Show the neighbor table that nbr_word returns for 'virtue' (see the output below).
nbr_word('virtue')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count,num_runs,cdist,rank
word,neighbor,period,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
virtue,authority,1720-1725,10,10,0.460938,1
virtue,integrity,1720-1725,10,10,0.502169,2
virtue,popish,1720-1725,10,10,0.527994,3
virtue,shewing,1720-1725,10,10,0.543816,4
virtue,privileges,1720-1725,10,10,0.546869,5
virtue,...,...,...,...,...,...
virtue,outwardly,1955-1960,2,7,0.460964,96
virtue,tragedy,1955-1960,2,7,0.462616,97
virtue,creation,1955-1960,2,9,0.466674,98
virtue,attitude,1955-1960,2,7,0.466790,99


In [30]:
def _lnm_word(objd):
    """Single-argument adapter: expand the dict `objd` into keyword arguments for `lnm_word`."""
    return lnm_word(**objd)
def _to_lnm(objd):
    """Single-argument adapter: expand the dict `objd` into keyword arguments for `to_lnm`."""
    return to_lnm(**objd)
    
def lnm_word(word1,word2=None,periods=None,ybin=YEARBIN,ymin=YMIN,ymax=YMAX,k=K,
             num_proc=1,cache=True,cache_periods=False,progress=True,force=False):
    """Compute run-averaged LNM values for a word pair across pairs of periods.

    Parameters
    ----------
    word1, word2 : str
        Words to compare; `word2` defaults to `word1`.
    periods : sequence, optional
        Periods to compare; `get_default_periods(ymin, ymax, ybin)` when empty.
    ybin, ymin, ymax
        Forwarded to `get_default_periods` and `nbr_word`; also part of the cache key.
    k : int
        Forwarded to `to_lnm`; also part of the cache key.
    num_proc, progress
        Parallelism / progress settings for `nbr_word` and `pmap_iter`.
    cache : bool
        Read from and write to the 'lnm' veclib at word level.
    cache_periods : bool
        Passed to `to_lnm` as its `cache` flag (per period-pair caching).
    force : bool
        Skip the word-level cache read; the result is still written when `cache` is true.

    Returns
    -------
    DataFrame
        Indexed by (word1, word2, period1, period2), one row per period pair,
        holding the records produced by `lnm_postproc`; an empty DataFrame when
        no pair produced a result.
    """
    if not word2: word2=word1
    # Build the key from the arguments actually used for this call. It was
    # previously built from the module-level YEARBIN/YMIN/YMAX/K, so a call with
    # e.g. a different k read and overwrote the entry for the default settings.
    # NOTE(review): `periods` is not part of the key, so a call with a custom
    # period list shares its cache entry with the default-period call.
    wqstr=f'{word1},{word2},ybin={ybin},ymin={ymin},ymax={ymax},k={k}'
    if cache and not force:
        with get_veclib('lnm',autocommit=False) as vl:
            if wqstr in vl: return vl[wqstr]

    ## preproc
    if not periods: periods=get_default_periods(ymin=ymin,ymax=ymax,ybin=ybin)

    def _getnbr(w):
        """Neighbor table for `w` over all periods, or None when it is empty."""
        odf=nbr_word(
            w,
            periods=periods,
            ymin=ymin,ymax=ymax,ybin=ybin,
            num_proc=num_proc,progress=progress,
            cache=True,cache_only=False
        )
        return odf if len(odf) else None

    dfnbr1 = _getnbr(word1)
    dfnbr2 = dfnbr1 if word1==word2 else _getnbr(word2)

    # One job per period pair: strictly increasing pairs for a word against
    # itself; same-period pairs are added when the two words differ.
    objs = [
        dict(
            word1_period1=(word1,p1),
            word2_period2=(word2,p2),
            k=k,
            num_proc=1,
            cache=cache_periods,
            progress=False,
            dfnbr1=dfnbr1,
            dfnbr2=dfnbr2
        )
        for p1 in periods
        for p2 in periods
        if p1<p2 or (p1<=p2 and word1!=word2)
    ]

    iterr=pmap_iter(
        _to_lnm,
        objs,
        num_proc=num_proc,
        progress=progress,
        desc='Measuring LNM across word-periods'
    )
    o=[]
    for odf in iterr:
        if len(odf):
            odx=lnm_postproc(odf)
            if odx: o.append(odx)
    odf_word=pd.DataFrame(o).set_index(['word1','word2','period1','period2']) if len(o) else pd.DataFrame()
    if cache:
        with get_veclib('lnm',autocommit=True) as vl:
            vl[wqstr]=odf_word
    return odf_word

In [32]:
# Example: LNM table for a single word compared with itself across period pairs.
lnm_word('histories',num_proc=4)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,lnm,mneighb_size,neighb1_size,neighb2_size
word1,word2,period1,period2,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
histories,histories,1720-1725,1725-1730,0.032169,17.300000,23.000000,18.200000
histories,histories,1720-1725,1730-1735,0.033491,20.000000,23.000000,21.300000
histories,histories,1720-1725,1735-1740,0.036281,19.300000,23.000000,20.400000
histories,histories,1720-1725,1740-1745,0.031337,18.200000,23.000000,19.100000
histories,histories,1720-1725,1745-1750,0.028120,16.700000,23.000000,17.800000
histories,histories,...,...,...,...,...,...
histories,histories,1940-1945,1950-1955,0.019162,8.222222,11.444444,10.111111
histories,histories,1940-1945,1955-1960,0.015453,7.444444,11.444444,10.222222
histories,histories,1945-1950,1950-1955,0.010950,7.800000,11.600000,10.100000
histories,histories,1945-1950,1955-1960,0.019158,8.200000,11.600000,10.200000


In [33]:
def lnm(word_or_words1,word_or_words2=None,period_or_periods=None,k=25,num_proc=1,cache=True,progress=True):
    """Run `lnm_word` for one or more word pairs and concatenate the results.

    Parameters
    ----------
    word_or_words1 : str or iterable of str
        First word(s); a string is split with `tokenize_fast`.
    word_or_words2 : str or iterable of str, optional
        Second word(s); defaults to the first list, so each word is compared
        with itself. The two lists are paired positionally with `zip`, which
        stops at the shorter one.
    period_or_periods : str or iterable, optional
        Periods to compare; `get_default_periods()` when None, and a string is
        split with `tokenize_fast`.
    k : int
        NOTE(review): accepted but not forwarded. `lnm_word` defaults to the
        module-level `K`, which is not visible here; forwarding `k` would
        change results and cache keys if `K` differs from 25. Confirm before wiring.
    num_proc : int
        Worker count handed to each `lnm_word` call (the outer `pmap` runs with one).
    cache : bool
        Forwarded to `lnm_word` as its word-level cache flag.
    progress : bool
        Shown on the inner call when there is a single word, otherwise on the
        outer loop over words.

    Returns
    -------
    DataFrame
        Concatenation of the per-pair `lnm_word` results; empty when nothing ran.
    """
    ## input
    # words1?
    words1=tokenize_fast(word_or_words1) if type(word_or_words1)==str else list(word_or_words1)
    # words2?
    if word_or_words2 is None:
        words2=words1
    elif type(word_or_words2)==str:
        words2=tokenize_fast(word_or_words2)
    else:
        words2=list(word_or_words2)
    # periods?
    if period_or_periods is None:
        periods=get_default_periods()
    elif type(period_or_periods)==str:
        periods=tokenize_fast(period_or_periods)
    else:
        periods=list(period_or_periods)

    single_word=len(words1)==1
    objs = [
        dict(
            word1=w1,
            word2=w2,
            periods=periods,
            num_proc=num_proc,
            # `cache` used to be silently ignored; hand it on so cache=False takes effect
            cache=cache,
            progress=progress if single_word else False,
        )
        for w1,w2 in zip(words1,words2)
    ]
    o=pmap(
        _lnm_word,
        objs,
        desc='Measuring LNM distance across words',
        num_proc=1,
        progress=progress if len(words1)>1 else False
    )
    odf=pd.concat(o) if len(o) else pd.DataFrame()
    return odf

In [34]:
# Small run: each listed word is compared with itself, because lnm pairs the
# word list with itself when no second list is given.
dflnm=lnm(['histories','history','value','values'],num_proc=4)
dflnm

Measuring LNM distance across words [x1]: 100%|██████████| 4/4 [00:00<00:00, 373.62it/s]


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,lnm,mneighb_size,neighb1_size,neighb2_size
word1,word2,period1,period2,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
histories,histories,1720-1725,1725-1730,0.032169,17.3,23.0,18.2
histories,histories,1720-1725,1730-1735,0.033491,20.0,23.0,21.3
histories,histories,1720-1725,1735-1740,0.036281,19.3,23.0,20.4
histories,histories,1720-1725,1740-1745,0.031337,18.2,23.0,19.1
histories,histories,1720-1725,1745-1750,0.028120,16.7,23.0,17.8
...,...,...,...,...,...,...,...
values,values,1940-1945,1950-1955,0.034502,45.8,45.8,46.0
values,values,1940-1945,1955-1960,0.045121,45.0,45.0,45.0
values,values,1945-1950,1950-1955,0.034384,45.9,45.9,46.0
values,values,1945-1950,1955-1960,0.033806,42.0,42.0,42.0


In [None]:
# Full run over every word returned by get_valid_words(). Long-running: the
# captured progress bar below shows 4770/5911 words after 30 minutes.
dflnm=lnm(get_valid_words(),num_proc=4)
dflnm

Measuring LNM distance across words [x1]:  81%|████████  | 4770/5911 [30:00<5:37:30, 17.75s/it] 

In [None]:
# dflnm = lnm('station',num_proc=4)

In [None]:
# dflnm.sort_values('lnm')

## Importing old data (to save time)

In [None]:
# Index levels identifying one comparison: the two words and the two periods.
gcols=['word1','word2','period1','period2']
# The same levels plus the two run columns; do_add_db uses these as the stored index.
icols=gcols+['run1','run2']
# Value columns that do_add_db keeps when writing to the 'lnm' db.
valcols=['lnm','mneighb_size','neighb1_size','neighb2_size']

def import_into_shift_db(ifn=FN_ALL_LOCALDISTS_ORIGDATA,k=K):
    """Reformat previously computed local-distance data and store it in the 'lnm' db.

    Parameters
    ----------
    ifn
        Passed to `read_df` to load the old data.
    k
        Only rows whose `k` equals this value are imported, and the same value
        is used in the cache key written by `do_add_db`.

    Side effects: prints progress messages, displays the reformatted frame, and
    writes one db entry per (word1, word2) group through `do_add_db`.
    """
    from functools import partial

    print('Reformatting old data')
    # Filter on the `k` argument; the query previously used the module-level K,
    # which made the parameter a no-op.
    DF=read_df(ifn).query(f'k=={k}').reset_index()
    # map run labels 'run_01'..'run_10' to the integers 1..10
    rund=dict((f'run_{i:02}',i) for i in range(1,11))
    DF['run1']=DF.run1.replace(rund)
    DF['run2']=DF.run2.replace(rund)
    df_done=DF
    # every imported row compares a word with itself
    df_done['word2']=df_done.word
    df_done=df_done.rename({'word':'word1','metaneighb_size':'mneighb_size', 'dist_local':'lnm'},axis=1)
    # NOTE(review): this overwrites the column just renamed from
    # 'metaneighb_size' with NaN. Confirm the old sizes are meant to be discarded.
    df_done['mneighb_size']=np.nan
    df_done['neighb1_size']=np.nan
    df_done['neighb2_size']=np.nan
    print('Done')
    display(df_done)
    pmap_groups(
        # bind k so each group is stored under the key matching the filter above
        partial(do_add_db,k=k),
        df_done.groupby(['word1','word2']),
        num_proc=1,
        desc='Importing...'
    )

def do_add_db(dfgrp,k=K):
    """Store one (word1, word2) group in the 'lnm' db under the word-level key.

    Parameters
    ----------
    dfgrp : DataFrame
        Rows for a single word pair; must contain the `icols` and `valcols` columns.
    k
        Value written into the cache key (defaults to the module-level K).

    Returns
    -------
    DataFrame
        A one-row frame `{'success': True}`.
    """
    word1,word2=dfgrp.iloc[0].word1, dfgrp.iloc[0].word2
    wqstr=f'{word1},{word2},ybin={YEARBIN},ymin={YMIN},ymax={YMAX},k={k}'
    with get_veclib('lnm',autocommit=True) as vl:
        # NOTE(review): the stored frame keeps run1/run2 in its index, whereas
        # lnm_word stores run-averaged frames under the same key format.
        # Confirm that readers of this key handle both shapes.
        odf=dfgrp.set_index(icols)[valcols]
        vl[wqstr]=odf
        vl.commit()
    return pd.DataFrame([{'success':True}])


In [None]:
# import_into_shift_db(DF)