# Word embedding models

In [1]:
from koselleck import *
# Bin size handed to periodize() when years are grouped into periods
# (the recorded output below shows labels such as 1680-1685).
YEARBIN=5

## Generate data

### Create models

In [19]:
def get_skipgrams(idir=PATH_SKIPGRAMS_YR,skipgram_n=25, calc_numlines=False):
    """Index the per-year skipgram files found in a directory.

    Only files whose names start with ``data.skipgrams`` are considered. The
    corpus is read from the third dot-separated field of the filename and the
    year from the first all-digit field.

    Args:
        idir: Directory to scan.
        skipgram_n: Multiplier applied to ``num_lines`` to obtain
            ``num_words`` (only used when ``calc_numlines`` is true).
        calc_numlines: If true, add a ``num_lines`` column (``get_numlines``
            applied to each path) and a ``num_words`` column.

    Returns:
        DataFrame with columns ``corpus``, ``year`` and ``path``, sorted by
        corpus then year, plus the two count columns when requested. When no
        file matches, an empty frame with those columns is returned.

    Raises:
        IndexError: if a matching filename has fewer than three dot-separated
            fields or no all-digit field.
    """
    rows=[]
    for fn in os.listdir(idir):
        if not fn.startswith('data.skipgrams'):
            continue
        fields=fn.split('.')
        rows.append({
            'corpus':fields[2],
            'year':int([x for x in fields if x.isdigit()][0]),
            'path':os.path.join(idir,fn)
        })
    # Naming the columns explicitly keeps sort_values from raising KeyError
    # when the directory holds no skipgram files at all.
    odf=pd.DataFrame(rows,columns=['corpus','year','path']).sort_values(['corpus','year'])
    if calc_numlines:
        odf['num_lines']=odf.path.progress_apply(get_numlines)
        odf['num_words']=odf['num_lines']*skipgram_n
    return odf

In [20]:
# get_skipgrams()

In [21]:
# Index every yearly skipgram file (with line and word counts), then label
# each year with the period returned by periodize(year, YEARBIN).
dfskip=get_skipgrams(calc_numlines=True)
# dfskip=dfskip.query('1720<=year<1900')
dfskip=dfskip.assign(period=dfskip.year.apply(lambda yr: periodize(yr,YEARBIN)))
dfskip

100%|██████████| 289/289 [00:06<00:00, 46.01it/s] 


Unnamed: 0,corpus,year,path,num_lines,num_words,period
182,bpo,1681,/home/ryan/github/koselleck/data/skipgrams/yea...,4189,104725,1680-1685
56,bpo,1682,/home/ryan/github/koselleck/data/skipgrams/yea...,5451,136275,1680-1685
51,bpo,1683,/home/ryan/github/koselleck/data/skipgrams/yea...,3435,85875,1680-1685
60,bpo,1684,/home/ryan/github/koselleck/data/skipgrams/yea...,3156,78900,1680-1685
200,bpo,1685,/home/ryan/github/koselleck/data/skipgrams/yea...,3054,76350,1685-1690
...,...,...,...,...,...,...
116,bpo,1967,/home/ryan/github/koselleck/data/skipgrams/yea...,5316,132900,1965-1970
20,bpo,1968,/home/ryan/github/koselleck/data/skipgrams/yea...,6281,157025,1965-1970
34,bpo,1969,/home/ryan/github/koselleck/data/skipgrams/yea...,5992,149800,1965-1970
87,bpo,1970,/home/ryan/github/koselleck/data/skipgrams/yea...,5811,145275,1970-1975


In [22]:
# dfskip.groupby('period').num_words.sum()

In [23]:
def get_dfskipruns(dfskip,num_runs=10,incl_existing=False):
    """Replicate the skipgram index once per run and attach model output paths.

    Args:
        dfskip: Skipgram index; its ``corpus`` and ``period`` columns are read.
        num_runs: Number of copies to make, labelled ``run_01`` .. ``run_NN``.
        incl_existing: If false, rows whose ``opath`` already exists on disk
            are dropped from the result.

    Returns:
        The concatenated copies with added ``run``, ``opath`` and
        ``opath_exists`` columns.
    """
    run_frames=[]
    for run_no in range(1,num_runs+1):
        run_frames.append(dfskip.assign(run=f'run_{str(run_no).zfill(2)}'))
    df_runs=pd.concat(run_frames)

    def _model_path(row):
        # one model file per corpus / period / run
        return os.path.join(PATH_MODELS_NEW,row.corpus,row.period,row.run,'model.bin')

    df_runs['opath']=df_runs.apply(_model_path,axis=1)
    df_runs['opath_exists']=df_runs['opath'].apply(os.path.exists)
    if incl_existing:
        return df_runs
    return df_runs[df_runs['opath_exists']==False]

In [24]:
# get_dfskipruns(dfskip,num_runs=2)

In [25]:
def gen_and_save_model(dfskip,nskip=DEFAULT_NUM_SKIP,force=False,vector_size=100,window=10,min_count=5,epochs=10,workers=8,verbose=False):
    """Train a Word2Vec model for one group of skipgram files and save it.

    The output location is derived from the first row of ``dfskip``
    (``corpus``, ``period``, ``run``). Training is skipped when the model file
    already exists, unless ``force`` is true.

    Args:
        dfskip: Rows of the skipgram index for one model; ``path`` is passed
            to ``SkipgramsSamplers``.
        nskip: Second argument to ``SkipgramsSamplers`` (presumably the number
            of skipgrams sampled per file -- confirm against its definition).
        force: Retrain and overwrite even if the model file exists.
        vector_size, window, min_count, epochs, workers: Forwarded unchanged
            to ``Word2Vec``.
        verbose: Enable gensim logging during training instead of disabling it.

    Returns:
        One-row DataFrame whose ``fnfn`` column holds the model path.
    """
    first_row=dfskip.iloc[0]
    model_dir=os.path.join(PATH_MODELS_NEW,first_row.corpus,first_row.period,first_row.run)
    model_path=os.path.join(model_dir,'model.bin')
    result=pd.DataFrame([{'fnfn':model_path}])

    # Nothing to do when the model is already on disk and no retrain was asked for.
    if not force and os.path.exists(model_path):
        return result

    ensure_dir_exists(model_dir)
    sampler=SkipgramsSamplers(dfskip.path, nskip)
    if verbose:
        enable_gensim_logging()
    else:
        disable_gensim_logging()
    w2v = Word2Vec(
        sentences=sampler,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        epochs=epochs,
        workers=workers,
    )
    w2v.save(model_path)
    return result


In [26]:
# fnfn=gen_and_save_model(get_dfskipruns(dfskip).iloc[:1], force=True).fnfn.iloc[0]
# load_model(fnfn).wv.most_similar('value')

In [27]:
# res=pmap_groups(
#     gen_and_save_model,
#     dfskipruns.groupby(['period','run']),
#     num_proc=4,
#     kwargs=dict(force=True, nskip=NSKIP_PER_YR)
# )

In [32]:
def gen_models(
        ybin=5,
        ymin=1680,
        ymax=1970,
        num_runs=1,
        force=False,
        nskip_per_yr=NSKIP_PER_YR,
        num_proc=4
    ):
    """Train one model per (period, run) over the yearly skipgram files.

    Args:
        ybin: Bin size passed to ``periodize`` when labelling each year.
        ymin: First year to include (inclusive).
        ymax: Year at which to stop (exclusive).
        num_runs: Number of runs to schedule per period.
        force: If true, runs whose model file already exists are scheduled
            again and retrained; otherwise they are left out.
        nskip_per_yr: Forwarded to ``gen_and_save_model`` as ``nskip``.
        num_proc: Forwarded to ``pmap_groups`` as ``num_proc``.

    Returns:
        Whatever ``pmap_groups`` returns for ``gen_and_save_model``; in the
        recorded run this is a frame indexed by (period, run) with an
        ``fnfn`` column.
    """
    dfskip=get_skipgrams(calc_numlines=False).query(f'{ymin}<=year<{ymax}')
    # assign() builds a new frame; setting a column on the query() result is
    # chained assignment and can trigger SettingWithCopyWarning.
    dfskip=dfskip.assign(period=dfskip.year.apply(lambda y: periodize(y,ybin)))
    dfskipruns=get_dfskipruns(dfskip, num_runs=num_runs, incl_existing=force)
    # NOTE(review): groups are keyed on period and run only, so rows from
    # several corpora would share one group -- confirm this is intended.
    dfgrps=dfskipruns.groupby(['period','run'])
    print(f'Generating {len(dfgrps)} new models over {dfskipruns.period.nunique()} periods and {dfskipruns.run.nunique()} runs')
    return pmap_groups(
        gen_and_save_model,
        dfgrps,
        num_proc=num_proc,
        kwargs=dict(force=force, nskip=nskip_per_yr)
    )

In [33]:
# Schedule 10 runs per period. force defaults to False, so runs whose
# model.bin already exists are left out and only the missing ones are trained.
gen_models(num_runs=10)

#### Computing over runs

Unnamed: 0,corpus,year,path,period,run,opath,opath_exists
200,bpo,1685,/home/ryan/github/koselleck/data/skipgrams/yea...,1685-1690,run_02,/home/ryan/github/koselleck/data/models/bpo/16...,False
214,bpo,1686,/home/ryan/github/koselleck/data/skipgrams/yea...,1685-1690,run_02,/home/ryan/github/koselleck/data/models/bpo/16...,False
271,bpo,1687,/home/ryan/github/koselleck/data/skipgrams/yea...,1685-1690,run_02,/home/ryan/github/koselleck/data/models/bpo/16...,False
243,bpo,1690,/home/ryan/github/koselleck/data/skipgrams/yea...,1690-1695,run_02,/home/ryan/github/koselleck/data/models/bpo/16...,False
155,bpo,1691,/home/ryan/github/koselleck/data/skipgrams/yea...,1690-1695,run_02,/home/ryan/github/koselleck/data/models/bpo/16...,False
...,...,...,...,...,...,...,...
106,bpo,1965,/home/ryan/github/koselleck/data/skipgrams/yea...,1965-1970,run_10,/home/ryan/github/koselleck/data/models/bpo/19...,False
132,bpo,1966,/home/ryan/github/koselleck/data/skipgrams/yea...,1965-1970,run_10,/home/ryan/github/koselleck/data/models/bpo/19...,False
116,bpo,1967,/home/ryan/github/koselleck/data/skipgrams/yea...,1965-1970,run_10,/home/ryan/github/koselleck/data/models/bpo/19...,False
20,bpo,1968,/home/ryan/github/koselleck/data/skipgrams/yea...,1965-1970,run_10,/home/ryan/github/koselleck/data/models/bpo/19...,False


Mapping gen_and_save_model [x4]: 100%|██████████| 196/196 [40:57<00:00, 12.54s/it]


Unnamed: 0_level_0,Unnamed: 1_level_0,fnfn
period,run,Unnamed: 2_level_1
1680-1685,run_04,/home/ryan/github/koselleck/data/models/bpo/16...
1680-1685,run_05,/home/ryan/github/koselleck/data/models/bpo/16...
1680-1685,run_06,/home/ryan/github/koselleck/data/models/bpo/16...
1680-1685,run_07,/home/ryan/github/koselleck/data/models/bpo/16...
1680-1685,run_08,/home/ryan/github/koselleck/data/models/bpo/16...
...,...,...
1965-1970,run_06,/home/ryan/github/koselleck/data/models/bpo/19...
1965-1970,run_07,/home/ryan/github/koselleck/data/models/bpo/19...
1965-1970,run_08,/home/ryan/github/koselleck/data/models/bpo/19...
1965-1970,run_09,/home/ryan/github/koselleck/data/models/bpo/19...


## Test models

In [35]:
# Spot-check one saved model: nearest neighbours of 'virtue' for 1805-1810.
# The path is built from PATH_MODELS_NEW (same corpus/period/run layout that
# gen_and_save_model writes) instead of a machine-specific absolute path.
m=load_model(os.path.join(PATH_MODELS_NEW,'bpo','1805-1810','run_25','model.bin'))
m.wv.most_similar('virtue')

[('folly', 0.7436553835868835),
 ('humanity', 0.7396243810653687),
 ('virtuous', 0.7302984595298767),
 ('social', 0.7297175526618958),
 ('virtues', 0.7252563834190369),
 ('wisdom', 0.7209689021110535),
 ('pride', 0.7133151292800903),
 ('passions', 0.70967036485672),
 ('freedom', 0.7070727944374084),
 ('benevolence', 0.704535186290741)]

In [45]:
# Analogy check (king - man + woman) on one 1965-1970 model.
# The path is built from PATH_MODELS_NEW (same corpus/period/run layout that
# gen_and_save_model writes) instead of a machine-specific absolute path.
m=load_model(os.path.join(PATH_MODELS_NEW,'bpo','1965-1970','run_07','model.bin'))
m.wv.most_similar(['king','woman'],['man'])

[('ursula', 0.6152636408805847),
 ('colonel', 0.6055870652198792),
 ('peggy', 0.5735412836074829),
 ('parker', 0.5717411041259766),
 ('kim', 0.5693525075912476),
 ('anne', 0.5633865594863892),
 ('prison', 0.5611194372177124),
 ('blonde', 0.5601671934127808),
 ('eleanor', 0.559158980846405),
 ('figaro', 0.5566076040267944)]

In [74]:
def test_models(dfmodels,gby=['period','run']):
    """Run the king - man + woman analogy against one model per group.

    For every group of ``dfmodels`` the model at the group's last ``path`` is
    loaded and queried for its 25 nearest neighbours of
    ``king + woman - man``.

    Args:
        dfmodels: Frame with a ``path`` column and the grouping column(s).
        gby: Column name, or list of column names, to group by.

    Returns:
        DataFrame with one row per tested group: the grouping columns,
        ``has_queen`` (whether 'queen' is among the neighbours),
        ``rank_queen`` (0-based position of 'queen', NaN if absent) and
        ``neighborhood`` (the neighbours joined with ', '). Groups whose
        query raises KeyError produce no row.
    """
    # Accept a bare column name as well as a list: zipping a string against
    # the group key would pair the string's characters with the key's.
    gby_cols=[gby] if isinstance(gby,str) else list(gby)
    o=[]
    dfgrp=dfmodels.groupby(gby_cols)
    # Sort on the group key only, so DataFrames are never compared.
    for key,dfg in tqdm(sorted(dfgrp,key=lambda kv: kv[0])):
        # A one-column grouping may yield a scalar key rather than a 1-tuple.
        key_vals=key if isinstance(key,tuple) else (key,)
        path=dfg.iloc[-1].path
        m=load_model(path)
        try:
            testvec=m.wv.most_similar(['king','woman'],['man'],topn=25)
        except KeyError:
            # Presumably a query word is missing from this model's vocabulary;
            # the group is deliberately skipped rather than failing the run.
            continue
        testvec_wl=[x for x,y in testvec]
        has_queen='queen' in testvec_wl
        o.append({
            **dict(zip(gby_cols,key_vals)),
            'has_queen':has_queen,
            'rank_queen':testvec_wl.index('queen') if has_queen else np.nan,
            'neighborhood':', '.join(testvec_wl),
        })
    return pd.DataFrame(o)

In [75]:
# Keep only the models whose period_len is 5, then run the analogy test on
# each (period, run) group.
dfmodels = get_pathdf_models().query('period_len==5')
dftests  = test_models(dfmodels)
dftests

100%|██████████| 1120/1120 [06:10<00:00,  3.02it/s]


Unnamed: 0,period,run,has_queen,rank_queen,neighborhood
0,1680-1685,run_01,False,,"likewise, mannage, conversation, nemine, subsc..."
1,1680-1685,run_02,False,,"officers, likewise, continu'd, giving, princel..."
2,1680-1685,run_03,False,,"intending, thimbles, frippery, revivall, lady,..."
3,1680-1685,run_04,False,,"murther, reunited, gemma, geographer, revivall..."
4,1680-1685,run_05,False,,"murther, sieur, reunited, cardinal, depriv'd, ..."
...,...,...,...,...,...
1113,1965-1970,run_06,False,,"eldest, princess, edward, countess, emma, pete..."
1114,1965-1970,run_07,False,,"ursula, colonel, peggy, parker, kim, anne, pri..."
1115,1965-1970,run_08,False,,"lee, emma, mitchell, ellington, jenny, cahn, j..."
1116,1965-1970,run_09,False,,"von, karajan, smith, mag, jane, cbs, boyer, da..."


In [76]:
# Persist the analogy-test results; the path is relative to the notebook's
# working directory.
dftests.to_csv('../../data/data.model.tests.csv')

In [79]:
# Per period, count the runs whose top-25 analogy neighbours include 'queen'.
dftests.loc[dftests['has_queen']==True].groupby('period').size()

period
1700-1705     2
1710-1715     4
1715-1720     4
1720-1725    25
1725-1730    18
1730-1735    25
1735-1740    25
1740-1745    25
1745-1750    25
1750-1755    25
1755-1760    25
1760-1765    25
1765-1770    25
1770-1775    25
1775-1780    25
1780-1785    25
1785-1790    25
1790-1795    25
1795-1800    25
1800-1805    25
1805-1810    25
1810-1815    25
1815-1820    25
1820-1825    25
1825-1830    25
1830-1835    25
1835-1840    25
1840-1845    25
1845-1850    25
1850-1855    25
1855-1860    25
1860-1865    25
1865-1870    25
1870-1875    25
1875-1880    25
1880-1885    25
1885-1890    25
1890-1895    25
1895-1900    25
1900-1905    10
1905-1910    10
1910-1915    10
1915-1920    10
1920-1925    10
1925-1930    10
1930-1935    10
1935-1940    10
1940-1945    10
1945-1950    10
1950-1955    10
1955-1960    10
1960-1965     7
dtype: int64