# Computing key vectors across models

In [2]:
from ipynb.fs.full.koselleck import *

[Koselleck] (17:27:00) Alles bereit (+3089.2s)


In [3]:
#dfmodels = get_default_models()
# dfmodels

In [4]:
# row=dfmodels.sample(n=1).iloc[0]
# m=load_model_row(row)
# row.period, row.run, m

In [5]:
# Defining key vectors

# field dictionary
fieldd=get_fields()

def get_misccontrasts(fieldd=None):
    if fieldd is None: fieldd=get_fields()
    return [{
        'contrast':'Woman-Man',
        'source':'VG',
        'pos':fieldd['VG.Human.Female'],
        'neg':fieldd['VG.Human.Male'],
    },
    {
        'contrast':'Human-Object',
        'source':'VG',
        'pos':fieldd['VG.Human'],
        'neg':fieldd['VG.Object'],
    },
    {
        'contrast':'Pleasure-Pain',
        'source':'HGI',
        'pos':fieldd['HGI.Pleasur'],
        'neg':fieldd['HGI.Pain'],
    },
    {
        'contrast':'Interp-Desc',
        'source':'HGI',
        'pos':fieldd['HGI.IAV'],
        'neg':fieldd['HGI.DAV'],
    },
    {
        'contrast':'Polit-Acad',
        'source':'HGI',
        'pos':fieldd['HGI.Polit@'],
        'neg':fieldd['HGI.Academ'],
    },
    {
        'contrast':'Qual-Quant',
        'source':'HGI',
        'pos':fieldd['HGI.Quality'],
        'neg':fieldd['HGI.Quan'],
    },
#     {
#         'contrast':'Time',
#         'source':'HGI',
#         'pos':fieldd['HGI.Time@'],
#     },
    {
        'contrast':'Time-Space',
        'source':'HGI',
        'pos':fieldd['HGI.Time@'],
        'neg':fieldd['HGI.Space'],
    },
    {
        'contrast':'Pos-Neg',
        'source':'HGI',
        'period':'na',
        'pos':fieldd['HGI.Pstv'],
        'neg':fieldd['HGI.Ngtv'],
        'neither':{}
    },
    {
        'contrast':'Virtue-Vice',
        'source':'HGI',
        'period':'na',
        'pos':fieldd['HGI.Virtue'],
        'neg':fieldd['HGI.Vice'],
        'neither':{}
    },
    {
        'contrast':'Collective-Indiv',
        'source':'RH',
        'period':'na',
        'pos':{'society','world','public','publicity','polity'},
        'neg':{'individual','person','private','privacy'},
        'neither':{}
    },
    {
        'contrast':'Judg-Perc',
        'source':'HGI',
        'period':'na',
        'pos':fieldd['HGI.Eval@'],
        'neg':fieldd['HGI.Perceiv'],
        'neither':{}
    },
    {
        'contrast':'Strong-Weak',
        'source':'HGI',
        'period':'na',
        'pos':fieldd['HGI.Strong'],
        'neg':fieldd['HGI.Weak'],
        'neither':{}
    },
    {
        'contrast':'Active-Passive',
        'source':'HGI',
        'period':'na',
        'pos':fieldd['HGI.Active'],
        'neg':fieldd['HGI.Passive'],
        'neither':{}
    },
    ]


In [6]:
def get_key_contrasts():
    origc=[cd for cd in get_origcontrasts().to_dict('records') if cd['source']=='Median']
    for dx in origc: dx['pos'],dx['neg'] = dx['neg'],dx['pos']
    miscc=get_misccontrasts()
    return origc + miscc

In [7]:
# pd.DataFrame(get_key_contrasts())

In [8]:

def get_centroid(model,words):
    words=[words] if type(words)==str else list(words)
    vectors=[]
    for w in words:
        if w in model.wv.key_to_index:
            vectors+=[model.wv[w]]
    if not vectors: return None
    return np.mean(vectors,0)

def compute_vector(model,words_pos=[],words_neg=[]):
    centroid_pos=get_centroid(model,words_pos)
    if not words_neg: return centroid_pos
    centroid_neg=get_centroid(model,words_neg)
    if centroid_neg is not None:
        return centroid_pos - centroid_neg
    else:
        return centroid_pos

def compute_vector_scores(m,pos,neg=None,z=True):
    vec=np.array(compute_vector(m,pos,neg), dtype=np.float64)
    matrix=np.array(m.wv.get_normed_vectors(),dtype=np.float64)
    res=fastdist.vector_to_matrix_distance(vec,matrix,fastdist.cosine,'cosine')
    resd=dict((m.wv.index_to_key[i],x)for i,x in enumerate(res))
    s=pd.Series(resd)
    if z: s=(s - s.mean())/s.std()
    return s.sort_values()



In [9]:
def compute_key_vector_scores(m,fieldd=None,words=None):
    # get fields
    if fieldd is None: fieldd=get_fields()
    abswordseed=fieldd['Abs-Conc.Median.Abs.orig']
    concwordseed=fieldd['Abs-Conc.Median.Conc.orig']
    odf=pd.DataFrame()
    for cd in get_key_contrasts():
        odf[cd['contrast']+'.'+cd['source']] = compute_vector_scores(m, cd.get('pos',{}), cd.get('neg',{}))
    odf=odf.rename_axis('word').reset_index()
    if words: odf=odf[odf.word.isin(set(words))]
    return odf.set_index('word')

In [10]:
# dfvec=compute_key_vector_scores(m, words=get_valid_words()).sort_values('Collective-Indiv.RH')
# dfvec

In [11]:
# for col in dfvec.columns:
#     printm('### '+col)
#     printm('* Positive: '+ ', '.join(dfvec[col].sort_values(ascending=False).iloc[:10].index))
#     printm('* Negative: '+ ', '.join(dfvec[col].sort_values(ascending=True).iloc[:10].index))
    

In [12]:
# dfvec.corr().sort_values('Collective-Indiv.RH')

In [13]:
def compute_key_vector_scores_across_models(pathdf,words=None,num_proc=1,**attrs):
    #gby=[c for c in pathdf.columns if not c.startswith('path')]
    gby=['corpus','period','run']
    return pmap_groups(
        do_compute_all_vectors,
        pathdf.groupby(gby),
        num_proc=num_proc,
        kwargs=dict(words=words),
        desc='Computing key vectors across model runs'
    )
def do_compute_all_vectors(df,words=None,add_ambig=True, add_freq=True, add_sing=True):
    m=load_model_row(df.iloc[0])
    odf=compute_key_vector_scores(m,words=words)#.reset_index()
    if add_ambig: odf['Ambig.NX']=measure_ambiguity(m,words=words,z=True)
    if add_freq: odf['Freq.M']=measure_freq(m,words=words, z=True)
    if add_sing: odf['Sing-Plural.M']=measure_singularism(m,words=words,z=True)['freq_diff']
    return odf.reset_index()

In [14]:
# for i,gdf in dfmodels.groupby(['corpus','period','run']): pass

In [15]:
# words=get_all_nouns_adjs()

In [16]:
# ogdf=do_compute_all_vectors(gdf,words=words, add_ambig=True).sort_values('Sing-Plural.M')
# ogdf

In [17]:
# odf=compute_key_vector_scores_across_models(
#     dfmodels,#.query('run<="run_10"'),
#     words=words,
#     num_proc=4,
# )
# odf.to_pickle(FN_VECTOR_SCORES_RUNS)
# odf

In [18]:
# odf.groupby('word').mean().sort_values('Qual-Quant.HGI')

## Utilities

In [19]:
def get_words_ever_abs(fn=FN_VECTOR_SCORES_RUNS):
    df=read_df(fn)
    wea=df.groupby(['period','word']).mean()
    wea=wea[wea['Abs-Conc.Median']>=1].reset_index()
    wea=set(wea.word)
    return wea
#     len(wea),random.sample(wea,10)

In [20]:
def get_vector_scores(force=False):
    global DFVECSCORES
    if DFVECSCORES is None or force:
        df = read_df(FN_VECTOR_SCORES_RUNS).reset_index()
        df['period_int']=df.period.apply(lambda x: int(x[:4]))
        df['period_str']=df['period']
        df['period']=df.period_int.apply(periodize_sattelzeit)
        DFVECSCORES=df
    return DFVECSCORES#.groupby(['word','period']).mean()


## Comparisons

In [21]:
# %%timeit
df=get_vector_scores(force=False)

In [22]:
df[df.word=='virtue']

Unnamed: 0,corpus,period,run,word,Abs-Conc.Median,Woman-Man.VG,Human-Object.VG,Pleasure-Pain.HGI,Interp-Desc.HGI,Polit-Acad.HGI,...,Virtue-Vice.HGI,Collective-Indiv.RH,Judg-Perc.HGI,Strong-Weak.HGI,Active-Passive.HGI,Ambig.NX,Freq.M,Sing-Plural.M,period_int,period_str
6890,bpo,1700-1770,run_01,virtue,3.214739,-0.055065,1.381495,2.572173,2.845677,1.558998,...,2.805773,1.113559,0.046696,2.234454,-2.549791,0.298927,1.268937,0.460143,1720,1720-1725
13793,bpo,1700-1770,run_02,virtue,3.426751,-0.575663,2.026422,3.071403,3.055524,1.234054,...,3.345770,0.401270,-0.324064,2.097842,-1.850180,-1.078081,1.375073,0.494944,1720,1720-1725
20768,bpo,1700-1770,run_03,virtue,3.285548,1.083244,1.714781,2.420538,2.791909,1.790631,...,2.765761,1.978974,0.215037,1.647343,-2.477496,0.372658,1.417830,0.536341,1720,1720-1725
27674,bpo,1700-1770,run_04,virtue,3.090393,1.627666,1.881941,1.783067,2.609544,1.351938,...,2.420377,0.445426,-0.509606,1.502378,-2.437469,-0.285504,1.345429,0.533547,1720,1720-1725
34645,bpo,1700-1770,run_05,virtue,2.864174,0.091354,1.674049,2.071734,2.518239,1.389316,...,1.980953,1.054147,-0.099164,1.516220,-2.173876,-0.402512,1.379214,0.519481,1720,1720-1725
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3489011,bpo,1830-1900,run_06,virtue,2.075567,0.081599,1.128639,1.710345,1.406778,-1.219324,...,2.402779,-0.545846,-2.313037,0.976088,-1.817453,-0.962849,0.467582,-0.095407,1895,1895-1900
3500193,bpo,1830-1900,run_07,virtue,2.040826,-0.015448,1.149327,1.871244,1.653688,-0.984135,...,2.663170,0.068328,-1.312321,0.828359,-2.137828,-0.625897,0.377683,-0.229678,1895,1895-1900
3511774,bpo,1830-1900,run_08,virtue,2.446604,0.236667,1.871441,1.457747,1.879064,-0.553039,...,2.196886,-0.368364,-1.410475,1.099873,-2.078233,-0.301605,0.480107,-0.136159,1895,1895-1900
3522818,bpo,1830-1900,run_09,virtue,2.116359,0.783226,1.321273,1.838180,1.898465,-0.730975,...,2.270594,0.367582,-1.046473,0.510220,-2.027397,-0.343166,0.397047,-0.251059,1895,1895-1900


In [23]:
for i,dfword in tqdm(df.groupby('word')):
    if i=='virtue': break
dfword

 97%|█████████▋| 11312/11678 [00:02<00:00, 5351.80it/s]


Unnamed: 0,corpus,period,run,word,Abs-Conc.Median,Woman-Man.VG,Human-Object.VG,Pleasure-Pain.HGI,Interp-Desc.HGI,Polit-Acad.HGI,...,Virtue-Vice.HGI,Collective-Indiv.RH,Judg-Perc.HGI,Strong-Weak.HGI,Active-Passive.HGI,Ambig.NX,Freq.M,Sing-Plural.M,period_int,period_str
6890,bpo,1700-1770,run_01,virtue,3.214739,-0.055065,1.381495,2.572173,2.845677,1.558998,...,2.805773,1.113559,0.046696,2.234454,-2.549791,0.298927,1.268937,0.460143,1720,1720-1725
13793,bpo,1700-1770,run_02,virtue,3.426751,-0.575663,2.026422,3.071403,3.055524,1.234054,...,3.345770,0.401270,-0.324064,2.097842,-1.850180,-1.078081,1.375073,0.494944,1720,1720-1725
20768,bpo,1700-1770,run_03,virtue,3.285548,1.083244,1.714781,2.420538,2.791909,1.790631,...,2.765761,1.978974,0.215037,1.647343,-2.477496,0.372658,1.417830,0.536341,1720,1720-1725
27674,bpo,1700-1770,run_04,virtue,3.090393,1.627666,1.881941,1.783067,2.609544,1.351938,...,2.420377,0.445426,-0.509606,1.502378,-2.437469,-0.285504,1.345429,0.533547,1720,1720-1725
34645,bpo,1700-1770,run_05,virtue,2.864174,0.091354,1.674049,2.071734,2.518239,1.389316,...,1.980953,1.054147,-0.099164,1.516220,-2.173876,-0.402512,1.379214,0.519481,1720,1720-1725
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3489011,bpo,1830-1900,run_06,virtue,2.075567,0.081599,1.128639,1.710345,1.406778,-1.219324,...,2.402779,-0.545846,-2.313037,0.976088,-1.817453,-0.962849,0.467582,-0.095407,1895,1895-1900
3500193,bpo,1830-1900,run_07,virtue,2.040826,-0.015448,1.149327,1.871244,1.653688,-0.984135,...,2.663170,0.068328,-1.312321,0.828359,-2.137828,-0.625897,0.377683,-0.229678,1895,1895-1900
3511774,bpo,1830-1900,run_08,virtue,2.446604,0.236667,1.871441,1.457747,1.879064,-0.553039,...,2.196886,-0.368364,-1.410475,1.099873,-2.078233,-0.301605,0.480107,-0.136159,1895,1895-1900
3522818,bpo,1830-1900,run_09,virtue,2.116359,0.783226,1.321273,1.838180,1.898465,-0.730975,...,2.270594,0.367582,-1.046473,0.510220,-2.027397,-0.343166,0.397047,-0.251059,1895,1895-1900


In [24]:
from scipy.stats import mannwhitneyu

In [25]:
period1='1700-1770'
period2='1770-1830'
period3='1830-1900'

def ttest_word_df(dfword,min_n=20):
    grp1=dfword.query(f'period=="{period1}"')
    grp2=dfword.query(f'period=="{period2}"')
    grp3=dfword.query(f'period=="{period3}"')
    
    cmp={
        'Before-v-After':(grp1,grp3),
        'Before-v-During':(grp1,grp2),
        'During-v-After':(grp2,grp3)
    }
    o=[]
    for cmpname,(g1,g2) in cmp.items():
        if not len(g1) or not len(g2): continue
        vecnames = {c for c in g1.columns if c[0]==c[0].upper()} & {c for c in g2.columns if c[0]==c[0].upper()}
        for vecname in vecnames:
            a=g1[vecname]
            b=g2[vecname]
            if len(a)<min_n or len(b)<min_n: continue
            t,p = ttest_ind(a,b)
            #nstat1,np1 = normaltest(a)
            #nstat2,np2 = normaltest(b)
            mw,mw_p = mannwhitneyu(a,b)
            
            odx=dict(
                period_cmp=cmpname,
                vector=vecname,
#                 t=t,
#                 p=p,
                n1=len(a),
                n2=len(b),
                avg1=a.mean(),
                avg2=b.mean(),
                avg_diff=b.mean() - a.mean(),
                mw=mw,
                mw_p=mw_p
#                 normality1=nstat1,
#                 normality1_p=np1,
#                 normality2=nstat2,
#                 normality2_p=np2,
            )
            o.append(odx)
    odf=pd.DataFrame(o)
#     if len(odf):
#         display(dfword)
#         display(odf)
#         stop
#     odf['avg_diff_abs']=odf.avg_diff.apply(abs)
#     odf['is_signif']=odf.mw_p<=0.01
    return odf

In [26]:
dfword=ttest_word_df(df[df.word=='station']).sort_values('avg_diff')
dfword

Unnamed: 0,period_cmp,vector,n1,n2,avg1,avg2,avg_diff,mw,mw_p
49,During-v-After,Virtue-Vice.HGI,120,140,1.871378,-0.294399,-2.165776,1534.0,3.397985e-30
15,Before-v-After,Virtue-Vice.HGI,100,140,1.80914,-0.294399,-2.103538,1507.0,1.918766e-25
37,During-v-After,Pos-Neg.HGI,120,140,1.547652,-0.509102,-2.056754,1533.0,3.334253e-30
13,Before-v-After,Pleasure-Pain.HGI,100,140,1.731376,-0.291203,-2.02258,843.0,1.8217950000000001e-31
3,Before-v-After,Pos-Neg.HGI,100,140,1.509316,-0.509102,-2.018418,1510.0,2.035634e-25
47,During-v-After,Pleasure-Pain.HGI,120,140,1.605966,-0.291203,-1.897169,1153.0,2.055364e-33
4,Before-v-After,Abs-Conc.Median,100,140,1.075546,-0.800612,-1.876157,491.0,6.218272e-35
38,During-v-After,Abs-Conc.Median,120,140,0.881545,-0.800612,-1.682156,887.0,9.212645999999999e-36
43,During-v-After,Interp-Desc.HGI,120,140,0.818561,-0.752247,-1.570808,1208.0,6.138449e-33
9,Before-v-After,Interp-Desc.HGI,100,140,0.81353,-0.752247,-1.565777,1270.0,1.6253570000000003e-27


In [32]:
def ttest_vector_scores(force=False,num_proc=1,**attrs):
    if not force and os.path.exists(FN_VECTOR_SCORES_TTEST): return read_df(FN_VECTOR_SCORES_TTEST)
    
    df=get_vector_scores()
    ttests=pmap_groups(
        ttest_word_df,
        df.groupby('word'),
        num_proc=num_proc
    )
    ttests['mw_perc']=ttests.mw.rank(ascending=False) / len(ttests) * 100
    odf=ttests.reset_index().set_index(['word','vector','period_cmp']).sort_values('mw')
    odf=pd.concat(
        vdf.assign(mw_perc_vec=vdf.mw.rank(ascending=False) / len(vdf) * 100)
        for i,vdf in odf.groupby('vector')
    )
    odf=pd.concat(
        vdf.assign(mw_perc_vec_cmp=vdf.mw.rank(ascending=False) / len(vdf) * 100)
        for i,vdf in odf.groupby(['vector','period_cmp'])
    )
    odf=pd.concat(
        vdf.assign(
            avg1_perc_vec=(vdf.avg1.rank(ascending=False) / len(vdf) * 100),
            avg2_perc_vec=(vdf.avg2.rank(ascending=False) / len(vdf) * 100),
        )
        for i,vdf in odf.groupby('vector')
    )
    odf=odf.sort_index()
    odf.to_pickle(FN_VECTOR_SCORES_TTEST)
    
    # add to db
    with get_veclib('ttest') as vl:
        for w,wdf in tqdm(odf.groupby('word'),desc='Adding to db'):
            vl[w]=wdf
        print('Committing')
        vl.commit()
        print('Done')
    
    return odf
    

In [33]:
ttests = ttest_vector_scores(num_proc=4,force=True)
ttests

Mapping ttest_word_df [x4]: 100%|██████████| 11678/11678 [02:35<00:00, 75.08it/s]
Adding to db: 100%|██████████| 10801/10801 [00:16<00:00, 664.98it/s]
[Koselleck] (17:49:31) Committing (+831.4s)
[Koselleck] (17:49:32) Done (+0.9s)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,n1,n2,avg1,avg2,avg_diff,mw,mw_p,mw_perc,mw_perc_vec,mw_perc_vec_cmp,avg1_perc_vec,avg2_perc_vec
word,vector,period_cmp,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
abandonment,Abs-Conc.Median,During-v-After,57.0,140.0,1.298035,1.471056,0.173021,3074.0,5.818143e-03,61.483485,62.735264,73.795125,13.060690,10.685409
abandonment,Active-Passive.HGI,During-v-After,57.0,140.0,-0.076074,-0.412245,-0.336172,2726.0,2.488190e-04,66.536779,65.326937,76.446781,49.904385,66.226054
abandonment,Ambig.NX,During-v-After,57.0,140.0,0.656615,0.624658,-0.031957,3981.0,4.906557e-01,47.435068,68.753984,82.768887,2.086758,4.153387
abandonment,Collective-Indiv.RH,During-v-After,57.0,140.0,0.090856,0.183857,0.093001,3776.0,2.781402e-01,50.706802,59.187439,79.907890,48.210152,39.319623
abandonment,Freq.M,During-v-After,57.0,140.0,-0.374268,-0.350042,0.024226,2410.0,6.718183e-06,70.882185,46.960446,54.168217,75.223941,63.381085
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zoology,Sing-Plural.M,During-v-After,92.0,139.0,,,,1566.0,1.382250e-22,82.768852,49.183078,40.858764,,
zoology,Strong-Weak.HGI,During-v-After,92.0,139.0,1.074550,1.542408,0.467858,3260.0,1.469611e-10,58.663184,60.177140,72.948456,16.271346,10.527728
zoology,Time-Space.HGI,During-v-After,92.0,139.0,0.230915,0.346584,0.115669,5465.0,3.092591e-02,23.148135,28.132653,52.242278,32.012614,30.865233
zoology,Virtue-Vice.HGI,During-v-After,92.0,139.0,1.000916,1.288035,0.287120,4247.0,7.910287e-06,43.133571,42.807059,54.791589,19.857752,16.073406


In [None]:
odf1