# Semantic shifts

In [1]:
from ipynb.fs.full.koselleck import *

[Koselleck] (11:29:06) Alles bereit


## Generating data

### Local neighborhood measure

In [2]:
def to_lnm(word1_period1,word2_period2,k=25,num_proc=1,cache=True,progress=True):
    word1,period1=word1_period1.split('_') if type(word1_period1)==str else word1_period1
    word2,period2=word2_period2.split('_') if type(word2_period2)==str else word2_period2

    qstr=f'{word1}_{period1},{word2}_{period2},k={k}'
    if cache:
        with get_veclib('lnm') as vl:
            if qstr in vl:
                print("Found in lnm db:",qstr)
                return vl[qstr]
    
    dfnbr1=to_nbr(word1,period1,num_proc=num_proc,progress=progress)
    dfnbr2=to_nbr(word2,period2,num_proc=num_proc,progress=progress)
    
    # filter words in both models
    dfmetanbr=dfnbr1.iloc[:k].append(dfnbr2.iloc[:k])
    metaneighb=list(set(dfmetanbr.index.get_level_values('neighbor')))
    
    # dists
    dfcdist1=cdist(word1,period1,neighbors=metaneighb,progress=progress,num_proc=num_proc).reset_index()
    dfcdist2=cdist(word2,period2,neighbors=metaneighb,progress=progress,num_proc=num_proc).reset_index()
    
    dfcdist1['word_period_run']=list(zip(dfcdist1.word,dfcdist1.period,dfcdist1.run))
    dfcdist2['word_period_run']=list(zip(dfcdist2.word,dfcdist2.period,dfcdist2.run))
    
    dfpiv1=dfcdist1.pivot('word_period_run','neighbor','cdist')
    dfpiv2=dfcdist2.pivot('word_period_run','neighbor','cdist')
    
    dfpiv_dists = cdist_dfpivs(dfpiv1,dfpiv2)
    o=[]
    for i,row in dfpiv_dists.iterrows():
        odx={
            'word1':row.i1[0],
            'word2':row.i2[0],
            'period1':row.i1[1],
            'period2':row.i2[1],
            'run1':row.i1[2],
            'run2':row.i2[2],
            'lnm':row.cdist,
            'mneighb_size':row['size'],
            'neighb1_size':row.size1,
            'neighb2_size':row.size2,
        }
        o+=[odx]
    odf=pd.DataFrame(o)
    index_cols=['word1','word2','period1','period2','run1','run2']
    odf=odf.set_index(index_cols)
    
    if cache:
        with get_veclib('lnm',autocommit=True) as vl:
            vl[qstr]=odf
    
    return odf

In [3]:
def cdist_dfpivs(dfpiv1,dfpiv2):
    o=[]
    for i1,i2 in zip(dfpiv1.index,dfpiv2.index):
        dfcmp=dfpiv1.loc[[i1]].append(dfpiv2.loc[[i2]])
        dfcmpd=dfcmp.dropna(axis=1)
        csim=fastdist.cosine(
            dfcmpd.iloc[0].values.astype(float),
            dfcmpd.iloc[1].values.astype(float)
        )
        odx={
            'i1':i1,
            'i2':i2,
            'cdist':1-csim,
            'size1':len(dfcmp.iloc[0].dropna()),
            'size2':len(dfcmp.iloc[1].dropna()),
            'size':len(dfcmpd.columns)
        }
        o.append(odx)
    return pd.DataFrame(o)
            

In [4]:
# words=get_all_nouns()
# word=random.choice(words)
# word

In [5]:
# odf=to_lnm(f'{word}_1750-1755', f'{word}_1850-1855')
# odf

In [6]:
# odf=to_lnm('culture_1750-1755','culture_1850-1855')
# odf

In [7]:
# odf=to_lnm('station_1750-1755','station_1850-1855')
# odf.groupby(['word1','word2','period1','period2']).mean()

In [8]:
# odf=to_lnm('god_1750-1755','god_1850-1855')
# odf.groupby(['word1','word2','period1','period2']).mean()

### Scaling up

In [9]:
def lnm_postproc(odf):
    odf_ac=odf.reset_index().drop('run1',1).drop('run2',1)
    odx={
        'word1':odf_ac.iloc[0].word1,
        'word2':odf_ac.iloc[0].word2,
        'period1':odf_ac.iloc[0].period1,
        'period2':odf_ac.iloc[0].period2,
        **dict(odf_ac.mean()),
    }
    return odx

In [10]:
# lnm_postproc(odf)

In [11]:
def lnm_(objd):
    res=to_lnm(**objd)
    if res is None: res=pd.DataFrame()
    return res

def _lnm_ww_(obj):
    word1,word2,ybin,ymin,ymax,k=obj
    wqstr=f'{word1},{word2},ybin={YEARBIN},ymin={YMIN},ymax={YMAX},k={K}'
    with get_veclib('lnm',autocommit=False) as vl:
        if wqstr in vl: return vl[wqstr]
        
    o=[]
    for odf in iterr:
        odx=lnm_postproc(odf)
        if odx: o.append(odx)
    return pd.DataFrame(o).set_index(['word1','word2','period1','period2'])
        

def lnm(word_or_words1,word_or_words2=None,period_or_periods=None,k=25,num_proc=1,cache=True):
    
    ## input
    # words1?
    words1=tokenize_fast(word_or_words1) if type(word_or_words1)==str else list(word_or_words1)
    # words2?
    if word_or_words2 is None:
        words2=words1
    elif type(word_or_words2)==str:
        words2=tokenize_fast(word_or_words2)
    else:
        words2=list(word_or_words2)
    # periods?
    if period_or_periods is None:
        periods=get_default_periods()
    elif type(period_or_periods)==str:
        periods=tokenize_fast(period_or_periods)
    else:
        periods=list(period_or_periods)
        
    ## preproc
    objs = [
        dict(
            word1_period1=(w1,p1),
            word2_period2=(w2,p2),
            k=k,
            num_proc=1,
            cache=cache,
            progress=False
        )
        for p1 in periods
        for p2 in periods
        for w1,w2 in zip(words1,words2)
        if p1<p2 or (p1<=p2 and w1!=w2)
    ]
#     objs=objs[:10]#random.sample(objs,10)
    
    ## proc
    iterr=pmap_iter(
        lnm_,
        objs,
        num_proc=num_proc,
        desc='Measuring local neighb. distance across word-periods'
    )
    
    ## postproc
    o=[]
    for odf in iterr:
        odx=lnm_postproc(odf)
        if odx: o.append(odx)
    return pd.DataFrame(o).set_index(['word1','word2','period1','period2'])

In [12]:
# dflnm = lnm('station',num_proc=4)

In [13]:
# dflnm.sort_values('lnm')

## Importing old data (to save time)

In [14]:
K=25

In [15]:
print('Reformatting old data')
ifn='/home/ryan/github/koselleck/data1/data.all_local_dists.v4.pkl'
DF=read_df(ifn).query(f'k=={K}').reset_index()
rund=dict((f'run_{i:02}',i) for i in range(1,11))
DF.run1=DF.run1.replace(rund)
DF.run2=DF.run2.replace(rund)
df_done=DF
df_done['word2']=df_done.word
df_done=df_done.rename({'word':'word1','metaneighb_size':'mneighb_size', 'dist_local':'lnm'},axis=1)
df_done['mneighb_size']=np.nan
df_done['neighb1_size']=np.nan
df_done['neighb2_size']=np.nan
print('Done')
df_done

[Koselleck] (11:29:06) Reformatting old data
[Koselleck] (11:29:34) Done


Unnamed: 0,corpus1,period1,run1,corpus2,period2,run2,word1,lnm,k,word2,mneighb_size,neighb1_size,neighb2_size
0,bpo,1720-1725,1,bpo,1730-1735,1,statute,0.100421,25.0,statute,,,
1,bpo,1720-1725,1,bpo,1730-1735,1,heron,0.189011,25.0,heron,,,
2,bpo,1720-1725,1,bpo,1730-1735,1,inconvenience,0.125592,25.0,inconvenience,,,
3,bpo,1720-1725,1,bpo,1730-1735,1,root,0.065087,25.0,root,,,
4,bpo,1720-1725,1,bpo,1730-1735,1,application,0.072644,25.0,application,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20410826,bpo,1955-1960,4,bpo,1950-1955,4,rug,0.061721,25.0,rug,,,
20410827,bpo,1955-1960,4,bpo,1950-1955,4,stride,0.086377,25.0,stride,,,
20410828,bpo,1955-1960,4,bpo,1950-1955,4,uncle,0.041325,25.0,uncle,,,
20410829,bpo,1955-1960,4,bpo,1950-1955,4,variable,0.090544,25.0,variable,,,


In [16]:
# print('Grouping')
gcols=['word1','word2','period1','period2']
# df_done_g = df_done.groupby(gcols)
# len(df_done_g)
# print('Done')
# df_done_g.size()

In [17]:
icols=gcols+['run1','run2']
valcols=['lnm','mneighb_size','neighb1_size','neighb2_size']

# with get_veclib('lnm',autocommit=False) as vl:
#     for i,((word1,word2,period1,period2),grp) in enumerate(tqdm(df_done_g,desc="Adding to lnm db")):
#         qstr=f'{word1}_{period1},{word2}_{period2},k={K}'
#         dfgq=grp.set_index(icols)[valcols]
#         vl[qstr]=dfgq
#         if i and not i%100:
#             vl.commit()
#     vl.commit()

In [22]:
icols=gcols+['run1','run2']
valcols=['lnm','mneighb_size','neighb1_size','neighb2_size']

def do_add_db(dfgrp):
    word1,word2=dfgrp.iloc[0].word1, dfgrp.iloc[0].word2
    wqstr=f'{word1},{word2},ybin={YEARBIN},ymin={YMIN},ymax={YMAX},k={K}'
    with get_veclib('lnm',autocommit=False) as vl:
        odf=dfgrp.set_index(icols)[valcols]
#         display(odf)
#         stop
        vl[wqstr]=odf
        vl.commit()
        
#         for (word1,word2,period1,period2),grp in dfgrp.groupby(gcols):
#             qstr=f'{word1}_{period1},{word2}_{period2},k={K}'
#             dfgq=grp.set_index(icols)[valcols]
#             vl[qstr]=dfgq
#         vl.commit()
    return pd.DataFrame([{'success':True}])


In [None]:
pmap_groups(
    do_add_db,
    df_done.groupby(['word1','word2']),
    num_proc=1
)

Mapping do_add_db [x1]:   0%|          | 4/6010 [00:09<3:51:22,  2.31s/it]

In [20]:

# def import_into_shift_db(df_done=None,
#         ifn=FN_ALL_LOCALDISTS_ORIGDATA,
#         icols=['word1','word2','period1','period2','run1','run2'],
#         k=25
#         ):
#     df_done=read_df(ifn).reset_index() if df_done is None else df_done
#     df_done.run1=pmap(_fmtrun, df_done.run1)
#     df_done.run2=pmap(_fmtrun, df_done.run2)
#     df_done['word2']=df_done.word
#     df_done=df_done.rename({'word':'word1','metaneighb_size':'mneighb_size', 'dist_local':'lnm'},axis=1)
#     return df_done
#     df_done=df_done.groupby(icols).mean()[
#         ['lnm','mneighb_size','neighb1_size','neighb2_size']
#     ]
#     print("Original data reformatted:")
#     return df_done
#     display(df_done)
    
#     gcols=['word1','word2','period1','period2']
#     with get_veclib('lnm',autocommit=False) as vl:
#         for i,((word1,word2,period1,period2),grp) in enumerate(tqdm(df_done.groupby(gcols),desc="Adding to lnm db")):
#             qstr=f'{word1}_{period1},{word2}_{period2},k={k}'
#             vl[qstr]=grp
#             if i and not i%100:
#                 vl.commit()
#         vl.commit()
    

In [21]:
import_into_shift_db(DF)

NameError: name 'import_into_shift_db' is not defined

In [None]:
to_lnm('administration_1720-1725','administration_1725-1730')

In [None]:
lnm('value')