# Semantic shifts

In [3]:
from ipynb.fs.full.koselleck import *

[Koselleck] (12:37:30) Alles bereit (+0.0s)


## Generating data

### Local neighborhood measure

In [4]:
# Neighbor table for 'virtue'; .loc['virtue'] selects the rows stored under that word.
nbr_word('virtue').loc['virtue']

[Koselleck] (12:37:30) Running nbr_word(virtue) (+0.0s)
[Koselleck] (12:37:30) Finished running nbr_word(virtue) (+0.0s)


Unnamed: 0_level_0,Unnamed: 1_level_0,count,cdist,rank
period,neighbor,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1720-1725,authority,10,0.921875,1
1720-1725,integrity,10,1.004337,2
1720-1725,popish,10,1.055988,3
1720-1725,derogatory,10,1.063466,4
1720-1725,mankind,10,1.076029,5
...,...,...,...,...
1895-1900,encouragement,4,1.162844,996
1895-1900,lofty,4,1.163750,997
1895-1900,warfare,4,1.163813,998
1895-1900,pose,4,1.167160,999


In [5]:

def lnm_neighborhoods(dfnbr1,dfnbr2,dfcdist1,dfcdist2,k=K,run_all_pairs=False,keep_runs=False,**attrs):
    """Score how far apart two neighborhoods are over their pooled top-k neighbors.

    The top-`k` neighbors (lowest `rank`) of each side are pooled into one
    "meta-neighborhood". For every compared pair of runs, the two vectors of
    distances to those pooled neighbors are passed to `fastdist.cosine`, and
    `lnm` is recorded as one minus that value.

    Parameters
    ----------
    dfnbr1, dfnbr2 : DataFrame
        Neighbor tables indexed by neighbor word, with a `rank` column.
    dfcdist1, dfcdist2 : DataFrame
        Distance tables whose columns are words and whose index labels are
        treated as runs.
    k : int
        Number of top-ranked neighbors kept from each side.
    run_all_pairs : bool
        If False, a run is only compared with itself. If True, every pair
        (run1, run2) of shared runs with run1 <= run2 is compared.
    keep_runs : bool
        If True, return one row per run comparison instead of the average.
    **attrs
        Extra key/value pairs copied onto every row; they become the index of
        the averaged result.

    Returns
    -------
    DataFrame
        Per-run rows when `keep_runs` is True or nothing could be computed;
        otherwise the mean of `lnm` and the neighborhood sizes, indexed by the
        `attrs` keys (a single unindexed row when no `attrs` were given).
    """
    # Keep only words present in both neighbor tables and both distance tables.
    shared_words=list(set(dfnbr1.index) & set(dfnbr2.index) & set(dfcdist1.columns) & set(dfcdist2.columns))
    dfnbr1=dfnbr1.loc[shared_words].dropna().sort_values('rank').iloc[:k]
    dfnbr2=dfnbr2.loc[shared_words].dropna().sort_values('rank').iloc[:k]

    # Pooled neighborhood: union of both top-k lists.
    metaneighb=list(set(dfnbr1.index) | set(dfnbr2.index))
    dfcdist1=dfcdist1[metaneighb].dropna()
    dfcdist2=dfcdist2[metaneighb].dropna()
    runs=set(dfcdist1.index) & set(dfcdist2.index)

    # Build the run pairs up front rather than walking the full cross product
    # only to discard most of it.
    if run_all_pairs:
        run_pairs=[(run1,run2) for run1 in runs for run2 in runs if not run1>run2]
    else:
        run_pairs=[(run,run) for run in runs]

    o=[]
    for run1,run2 in run_pairs:
        dists1=dfcdist1.loc[run1]
        dists2=dfcdist2.loc[run2]
        try:
            distdists = 1-fastdist.cosine(dists1.values.astype(float), dists2.values.astype(float))
        except ZeroDivisionError as e:
            # Best effort: report the degenerate vector pair and move on.
            print('!!',e)
            continue
        o.append({
            **attrs,
            'run1':run1,
            'run2':run2,
            'lnm':distdists,
            'mneighb_size':len(metaneighb),
            'neighb1_size':len(dfnbr1),
            'neighb2_size':len(dfnbr2),
        })
    odf=pd.DataFrame(o)
    if keep_runs or not len(o): return odf

    # Drop the run bookkeeping *before* averaging: the keyword form works on
    # every pandas version (positional `axis` was removed in pandas 2.0) and
    # non-numeric run labels never reach `.mean()`.
    odf=odf.drop(columns=['run1','run2'])
    if not attrs:
        # groupby([]) raises "No group keys passed!"; average everything instead.
        return odf.mean().to_frame().T
    return odf.groupby(list(attrs.keys())).mean()

In [6]:
def _lnm_neighborhoods_(objd):
    """Single-argument adapter: expand the dict `objd` into keyword arguments for `lnm_neighborhoods`."""
    return lnm_neighborhoods(**objd)


In [7]:
def lnm_word(
        word,
        period_or_periods=None,
        run_or_runs=None,
        word2=None,
        cache=True,
        force=False,
        k=K,
        cache_only=False,
        progress=True,
        progress_nbr=False,
        num_proc=1):
    """Compute `lnm_neighborhoods` for every ordered pair of periods of a word.

    Parameters
    ----------
    word : str
        Word whose neighborhoods are compared across periods.
    period_or_periods, run_or_runs
        Forwarded unchanged to `nbr_word` and `cdist_word`.
    word2 : str, optional
        Second word to compare against; defaults to `word` itself.
    cache : bool
        Read from / write to the 'lnm' store opened with `get_veclib`.
    force : bool
        Skip the cache lookup (and is forwarded to `nbr_word`).
    k : int
        Neighborhood size forwarded to `lnm_neighborhoods`.
    cache_only : bool
        If True, still compute and store the result but return an empty
        DataFrame instead of the result.
    progress, progress_nbr : bool
        Progress reporting for this function and for the `nbr_word` /
        `cdist_word` calls respectively.
    num_proc : int
        Forwarded to `pmap`.

    Returns
    -------
    DataFrame
        Concatenated output of `lnm_neighborhoods` for all period pairs with
        period1 < period2; empty when the word lookup raises KeyError, when
        nothing was computed, or when `cache_only` is True.
    """
    word1 = word
    if not word2: word2 = word1
    qstr=f'{word1},{word2},ymin={YMIN},ymax={YMAX},ybin={YEARBIN},k={k}'
    # A query restricted to some periods/runs must not share a cache slot with
    # the unrestricted query. The suffix is only added when a restriction is
    # given, so keys written for the default call are unchanged.
    if period_or_periods is not None: qstr+=f',periods={period_or_periods}'
    if run_or_runs is not None: qstr+=f',runs={run_or_runs}'
    if cache and not force:
        with get_veclib('lnm') as vl:
            if qstr in vl: return vl[qstr] if not cache_only else pd.DataFrame()

    try:
        dfnbr1 = nbr_word(word1,period_or_periods,run_or_runs,progress=progress_nbr,force=force).loc[word1]
        dfnbr2 = dfnbr1 if word2==word1 else nbr_word(word2,period_or_periods,run_or_runs,progress=progress_nbr,force=force).loc[word2]
        dfcdist1 = cdist_word(word1,period_or_periods,run_or_runs,progress=progress_nbr).loc[word1]
        dfcdist2 = dfcdist1 if word2==word1 else cdist_word(word2,period_or_periods,run_or_runs,progress=progress_nbr).loc[word2]
    except KeyError:
        # The word is missing from the neighbor or distance data.
        return pd.DataFrame()

    # Each side may only contribute periods for which it has BOTH a neighbor
    # table and a distance table. Pairing over the union of all periods would
    # index a missing period with .loc below and raise KeyError.
    periods1=sorted(
        set(dfnbr1.index.get_level_values('period')) & set(dfcdist1.index.get_level_values(0))
    )
    periods2=sorted(
        set(dfnbr2.index.get_level_values('period')) & set(dfcdist2.index.get_level_values(0))
    )
    objs = [
        dict(
            word1=word1,word2=word2,
            period1=prd1,period2=prd2,
            dfnbr1=dfnbr1.loc[prd1],
            dfnbr2=dfnbr2.loc[prd2],
            dfcdist1=dfcdist1.loc[prd1],
            dfcdist2=dfcdist2.loc[prd2],
            k=k
        )
        for prd1 in periods1
        for prd2 in periods2
        if prd1<prd2
    ]
    if progress: print(f'# of objects: {len(objs)}')
    o=pmap(
        _lnm_neighborhoods_,
        objs,
        num_proc=num_proc,
        progress=progress,
        desc='Measuring LNM across period comparisons'
    )
    odf=pd.concat(o) if len(o) else pd.DataFrame()
    if cache:
        with get_veclib('lnm',autocommit=True) as vl:
            vl[qstr]=odf

    return odf if not cache_only else pd.DataFrame()

In [8]:
# odf=lnm_word('station',num_proc=1,cache_only=False,force=True,progress_nbr=True)
# odf

## lnm()

In [9]:
def _lnm_(objd):
    """Single-argument adapter: expand the dict `objd` into keyword arguments for `lnm_word`."""
    return lnm_word(**objd)

def lnm(
        word_or_words,
        period_or_periods=None,
        run_or_runs=None,
        cache=True,
        force=False,
        cache_only=False,
        progress=True,
        progress_nbr=False,
        progress_word=None,
        num_proc=1,
        k=K
        ):
    """Run `lnm_word` for one or several words and stack the results.

    A string argument is split into words with `tokenize_fast`; any other
    iterable is taken as the word list. With more than one word the work is
    spread over words (`num_proc` goes to the outer `pmap`, each word runs with
    one process and no per-word progress unless `progress_word` says
    otherwise). With a single word, `num_proc` and `progress` are handed down
    to `lnm_word` instead.

    Returns the concatenated per-word frames, or an empty DataFrame when there
    were no results.
    """
    if type(word_or_words)==str:
        words=tokenize_fast(word_or_words)
    else:
        words=list(word_or_words)
    several=len(words)>1

    # Per-word progress: an explicit `progress_word` wins; otherwise it is only
    # shown when a single word is processed.
    if progress_word is not None:
        inner_progress=progress_word
    elif several:
        inner_progress=False
    else:
        inner_progress=progress

    jobs=[]
    for word in words:
        jobs.append(dict(
            word=word,
            period_or_periods=period_or_periods,
            run_or_runs=run_or_runs,
            word2=None,
            cache=cache,
            force=force,
            k=k,
            cache_only=cache_only,
            progress=inner_progress,
            progress_nbr=progress_nbr,
            num_proc=1 if several else num_proc,
        ))

    results=pmap(
        _lnm_,
        jobs,
        num_proc=num_proc if several else 1,
        progress=progress if several else False,
        desc='Measuring LNM across words',
    )
    if not len(results):
        return pd.DataFrame()
    return pd.concat(results)


In [10]:
# lnm('virtue,vice,virtues,vices,values')

In [11]:
def lnm_precache_words(words=None,**y):
    """Fill the 'lnm' store for every word that does not have an entry yet.

    A word counts as done when some stored key starts with it (the text before
    the first comma). `words` defaults to `get_valid_words()`; extra keyword
    arguments are forwarded to `lnm`, which is called with `cache_only=True`.
    """
    with get_veclib('lnm') as vl:
        already_cached={key.split(',')[0] for key in vl.keys()}
    candidates=words if words else get_valid_words()
    pending=[w for w in candidates if w not in already_cached]
    lnm(pending, cache_only=True,**y)

In [12]:
# words=get_valid_words()
# random.shuffle(words)
# res=pmap(do_word, words, num_proc=1)

In [13]:
# lnm_precache_words(get_all_nouns_adjs(), num_proc=3)

## Comparisons of magnitudes

In [14]:
# Words returned by get_words_with_lnm(); the cell displays how many there are.
words=get_words_with_lnm()
len(words)

10324

In [20]:
def ttest_lnm_word(word,valkey='lnm',min_n=20):
    """Mann-Whitney U comparison of a word's within-era LNM scores.

    Both periods of every row returned by `lnm_word(word)` are mapped to a
    label with `periodize_sattelzeit`. Only comparisons whose two periods share
    a label are used. For each such label the scores of the *other*
    same-label comparisons (sample 1) are tested against the scores of that
    label (sample 2) with `mannwhitneyu`. Despite the function name, no t-test
    is run.

    Parameters
    ----------
    word : str
        Word to look up.
    valkey : str
        Column holding the score to compare.
    min_n : int
        Minimum size required of both samples; smaller comparisons are skipped.

    Returns
    -------
    DataFrame
        One row per tested label with sample sizes, U statistic, p-value and
        the two sample means; empty when there is no data or nothing passes
        `min_n`.
    """
    scores=lnm_word(word)
    if len(scores)==0:
        return pd.DataFrame()
    scores=scores.loc[word,word].reset_index()

    def _label(ystr):
        # The first four characters of the period string are its start year.
        return periodize_sattelzeit(int(ystr[:4]), use_dates=False)

    scores['period1']=scores.period1.apply(_label)
    scores['period2']=scores.period2.apply(_label)
    scores['period_cmp']=[f'{x}-v-{y}' for x,y in zip(scores.period1,scores.period2)]
    same_label_cmps={f'{x}-v-{y}' for x,y in zip(scores.period1,scores.period2) if x==y}
    is_same_label=scores.period_cmp.isin(same_label_cmps)

    rows=[]
    for period_cmp in same_label_cmps:
        target=scores[scores.period_cmp==period_cmp]
        others=scores[(scores.period_cmp!=period_cmp) & is_same_label]
        a=others[valkey]
        b=target[valkey]
        if len(a)<min_n or len(b)<min_n:
            continue
        mw,mw_p=mannwhitneyu(a,b)
        rows.append(dict(
            word=word,
            vector='LNM',
            period_cmp=period_cmp,
            n1=len(a),
            n2=len(b),
            mw=mw,
            mw_p=mw_p,
            avg1=a.mean(),
            avg2=b.mean(),
        ))
    return pd.DataFrame(rows)

In [22]:
# Example: run the per-word comparison for 'sympathy'.
ttest_lnm_word('sympathy')

Unnamed: 0,word,vector,period_cmp,n1,n2,mw,mw_p,avg1,avg2
0,sympathy,LNM,During-v-During,93,66,738.0,1.874428e-16,0.025309,0.045749
1,sympathy,LNM,After-v-After,68,91,705.0,4.573953e-17,0.045641,0.02494


In [23]:
def ttest_lnm_words(force=False,num_proc=1):
    """Run `ttest_lnm_word` over all words and add rank/standardized columns.

    Unless `force` is set, a result already saved at `FN_LNM_TTEST` is read
    back with `read_df` and returned as is. Otherwise the per-word frames are
    combined, indexed by (word, vector, period_cmp) and extended with:

    * `mw_perc`, `mw_perc_vec`, `mw_perc_vec_cmp`: descending percentile rank of
      the U statistic overall, within each vector, and within each
      vector/period_cmp group.
    * `avg1_perc_vec`, `avg2_perc_vec`: descending percentile rank of the raw
      sample means within each vector.
    * `avg1`, `avg2`: overwritten with z-scores computed over the pooled
      avg1+avg2 values.
    * `avg_diff`, `avg_div`: difference and ratio of the *z-scored* means (so
      `avg_div` can be negative).

    The frame is pickled to `FN_LNM_TTEST` and stored word by word in the
    'ttest_lnm' store.

    Parameters
    ----------
    force : bool
        Recompute even if `FN_LNM_TTEST` exists.
    num_proc : int
        Forwarded to `pmap`.

    Returns
    -------
    DataFrame
        The combined table, or an empty DataFrame when no word produced rows.
    """
    if not force and os.path.exists(FN_LNM_TTEST): return read_df(FN_LNM_TTEST)

    def _pct_rank(values):
        # Descending percentile rank: the largest value gets the smallest percentage.
        return values.rank(ascending=False) / len(values) * 100

    words=get_words_with_lnm()
    o=pmap(
        ttest_lnm_word,
        words,
        num_proc=num_proc
    )
    # Words without a usable comparison come back as empty frames. Skip them
    # (and any None) so an all-empty result returns early instead of failing
    # on the missing `mw` column.
    frames=[wdf for wdf in o if wdf is not None and len(wdf)]
    if not frames: return pd.DataFrame()
    odf=pd.concat(frames)
    odf['mw_perc']=_pct_rank(odf.mw)
    odf=odf.set_index(['word','vector','period_cmp']).sort_values('mw')
    odf=pd.concat([
        vdf.assign(mw_perc_vec=_pct_rank(vdf.mw))
        for _,vdf in odf.groupby('vector')
    ])
    odf=pd.concat([
        vdf.assign(mw_perc_vec_cmp=_pct_rank(vdf.mw))
        for _,vdf in odf.groupby(['vector','period_cmp'])
    ])
    odf=pd.concat([
        vdf.assign(
            avg1_perc_vec=_pct_rank(vdf.avg1),
            avg2_perc_vec=_pct_rank(vdf.avg2),
        )
        for _,vdf in odf.groupby('vector')
    ])

    odf=odf.sort_index()
    # Standardize both mean columns against their pooled distribution.
    # pd.concat replaces Series.append, which was removed in pandas 2.0.
    s=pd.concat([odf.avg1, odf.avg2])
    pooled_mean,pooled_std=s.mean(),s.std()
    odf.avg1 = (odf.avg1 - pooled_mean)/pooled_std
    odf.avg2 = (odf.avg2 - pooled_mean)/pooled_std
    odf['avg_diff']=odf.avg2 - odf.avg1
    # NOTE(review): this is a ratio of z-scores, so its sign and magnitude are
    # not a ratio of the raw means. Kept as is for downstream compatibility.
    odf['avg_div']=odf.avg2/odf.avg1

    odf.to_pickle(FN_LNM_TTEST)

    # add to db
    with get_veclib('ttest_lnm') as vl:
        for w,wdf in tqdm(odf.groupby('word'),desc='Adding to db'):
            vl[w]=wdf
        print('Committing')
        vl.commit()
        print('Done')

    return odf

In [24]:
# force=True skips the saved FN_LNM_TTEST file and recomputes everything with num_proc=4.
odf=ttest_lnm_words(num_proc=4,force=True)
odf

Mapping ttest_lnm_word() [x4]: 100%|██████████| 10324/10324 [00:24<00:00, 425.30it/s]
Adding to db: 100%|██████████| 5347/5347 [00:05<00:00, 949.86it/s] 
[Koselleck] (12:47:02) Committing (+349.8s)
[Koselleck] (12:47:02) Done (+0.3s)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,n1,n2,mw,mw_p,avg1,avg2,mw_perc,mw_perc_vec,mw_perc_vec_cmp,avg1_perc_vec,avg2_perc_vec,avg_diff,avg_div
word,vector,period_cmp,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
abbe,LNM,Before-v-Before,61,43,1203.0,2.379579e-01,0.910807,0.969635,73.653576,73.653576,61.223292,12.952314,15.701262,0.058828,1.064589
abbe,LNM,During-v-During,50,54,1321.0,4.264494e-01,0.895216,0.972088,69.814165,69.814165,78.477842,13.267882,15.659187,0.076872,1.085870
abbey,LNM,After-v-After,111,91,3702.0,5.551074e-04,1.209284,0.593787,12.338710,12.338710,13.302848,9.011220,22.706872,-0.615497,0.491023
abbey,LNM,Before-v-Before,157,45,1845.0,5.309801e-07,0.651875,1.909350,54.330295,54.330295,33.916412,17.847125,6.100982,1.257475,2.929012
abbey,LNM,During-v-During,136,66,4149.0,1.925111e-01,1.029084,0.731967,5.599579,5.599579,8.256262,11.241234,19.775596,-0.297117,0.711280
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zealous,LNM,After-v-After,111,87,4091.0,3.276214e-02,-0.116929,-0.386408,6.367461,6.367461,8.522137,45.140252,59.438990,-0.269479,3.304645
zealous,LNM,Before-v-Before,153,45,1802.0,6.064750e-07,-0.416172,0.379506,55.504909,55.504909,35.407373,62.005610,28.022440,0.795678,-0.911897
zealous,LNM,During-v-During,132,66,3453.0,8.788946e-03,-0.125301,-0.455407,16.746143,16.746143,29.431599,45.476858,63.288920,-0.330106,3.634509
zoo,LNM,Before-v-Before,23,45,516.0,4.948288e-01,1.430636,1.619306,93.425666,93.425666,91.195116,6.612903,8.239832,0.188671,1.131879
