# Word embedding models

In [1]:
from ipynb.fs.full.koselleck import *

[Koselleck] (11:05:07) Alles bereit (+0.0s)


## Skipgrams

### Generated elsewhere...

### Loading

In [2]:
def get_skipgrams(idir=PATH_SKIPGRAMS_YR,skipgram_n=25, calc_numlines=False):
    """
    Build an index of the per-year skipgram files found in `idir`.

    Only files whose name starts with 'data.skipgrams' are considered. The
    corpus is the third dot-separated token of the filename and the year is
    the first all-digit token.

    Args:
        idir: directory to list.
        skipgram_n: multiplier used to turn a line count into a word count.
        calc_numlines: when true, add 'num_lines' (via `get_numlines`, applied
            to each path) and 'num_words' (= num_lines * skipgram_n).

    Returns:
        DataFrame with columns corpus, year, path (plus num_lines/num_words if
        requested), sorted by corpus then year.
    """
    records=[]
    for fname in os.listdir(idir):
        if not fname.startswith('data.skipgrams'):
            continue
        tokens=fname.split('.')
        corpus=tokens[2]
        numeric_tokens=[tok for tok in tokens if tok.isdigit()]
        records.append({
            'corpus':corpus,
            'year':int(numeric_tokens[0]),
            'path':os.path.join(idir,fname),
        })
    skipdf=pd.DataFrame(records).sort_values(['corpus','year'])
    if not calc_numlines:
        return skipdf
    skipdf['num_lines']=skipdf.path.progress_apply(get_numlines)
    skipdf['num_words']=skipdf['num_lines']*skipgram_n
    return skipdf

In [3]:
# get_skipgrams()

In [4]:
# dfskip=get_skipgrams(calc_numlines=True)
# dfskip['period']=dfskip.year.apply(lambda y: periodize(y,YEARBIN))
# dfskip

In [5]:
# dfskip.groupby('period').num_words.sum()

In [6]:
def get_dfskipruns(dfskip,num_runs=10,incl_existing=False):
    """
    Replicate a skipgram index once per training run and attach model paths.

    Args:
        dfskip: skipgram index with (at least) 'corpus' and 'period' columns.
        num_runs: number of runs; labels are 'run_01', 'run_02', ...
        incl_existing: keep rows whose output model file already exists.

    Returns:
        DataFrame with added columns 'run', 'opath'
        (PATH_MODELS_NEW/corpus/period/run/model.bin) and 'opath_exists';
        rows with an existing model are dropped unless `incl_existing`.
    """
    per_run_frames=[]
    for run_idx in range(num_runs):
        run_label=f'run_{str(run_idx+1).zfill(2)}'
        per_run_frames.append(dfskip.assign(run=run_label))
    runsdf=pd.concat(per_run_frames)

    def _model_path(row):
        return os.path.join(PATH_MODELS_NEW,row.corpus,row.period,row.run,'model.bin')

    runsdf['opath']=runsdf.apply(_model_path,axis=1)
    runsdf['opath_exists']=runsdf.opath.apply(os.path.exists)
    if incl_existing:
        return runsdf
    return runsdf[~runsdf.opath_exists]

In [7]:
# get_dfskipruns(dfskip,num_runs=2)

## Generate models

In [8]:
def gen_and_save_model(dfskip,nskip=DEFAULT_NUM_SKIP,force=False,vector_size=100,window=10,min_count=5,epochs=10,workers=8,verbose=False):
    """
    Train one Word2Vec model from a group of skipgram files and save it.

    The output location is taken from the FIRST row of `dfskip`:
    PATH_MODELS_NEW/<corpus>/<period>/<run>/model.bin. Training is skipped
    when that file already exists, unless `force` is set.

    Args:
        dfskip: rows describing the skipgram files for one model; needs
            'corpus', 'period', 'run' and 'path' columns.
        nskip: passed to SkipgramsSamplers together with the paths.
        force: retrain even if the model file exists.
        vector_size, window, min_count, epochs, workers: forwarded to Word2Vec.
        verbose: enable gensim logging during training (disabled otherwise).

    Returns:
        One-row DataFrame with column 'fnfn' holding the model path.
    """
    first=dfskip.iloc[0]
    model_dir=os.path.join(PATH_MODELS_NEW,first.corpus,first.period,first.run)
    model_path=os.path.join(model_dir,'model.bin')

    if force or not os.path.exists(model_path):
        ensure_dir_exists(model_dir)
        sampler=SkipgramsSamplers(dfskip.path, nskip)
        if verbose:
            enable_gensim_logging()
        else:
            disable_gensim_logging()
        w2v_params=dict(
            vector_size=vector_size,
            window=window,
            min_count=min_count,
            epochs=epochs,
            workers=workers,
        )
        trained=Word2Vec(sentences=sampler,**w2v_params)
        trained.save(model_path)

    return pd.DataFrame([{'fnfn':model_path}])


In [9]:
# fnfn=gen_and_save_model(get_dfskipruns(dfskip).iloc[:1], force=True).fnfn.iloc[0]
# load_model(fnfn).wv.most_similar('value')

In [10]:
# res=pmap_groups(
#     gen_and_save_model,
#     dfskipruns.groupby(['period','run']),
#     num_proc=4,
#     kwargs=dict(force=True, nskip=NSKIP_PER_YR)
# )

In [11]:
def gen_models(
        ybin=5,
        ymin=1680,
        ymax=1970,
        num_runs=1,
        force=False,
        nskip_per_yr=NSKIP_PER_YR,
        num_proc=4
    ):
    """
    Train every missing (period, run) model in parallel.

    Args:
        ybin: period width handed to `periodize` for each skipgram year.
        ymin, ymax: keep skipgram files with ymin <= year < ymax.
        num_runs: number of runs per period.
        force: retrain models that already exist on disk.
        nskip_per_yr: forwarded to `gen_and_save_model` as `nskip`.
        num_proc: forwarded to `pmap_groups` as `num_proc` (defaults to 4,
            the value that used to be hard-coded here).

    Returns:
        Whatever `pmap_groups` returns for the per-group results.
    """
    # .assign builds a new frame, so we never write a column into the
    # (possibly view-backed) result of .query() -- avoids pandas'
    # chained-assignment / SettingWithCopy hazard.
    dfskip=(
        get_skipgrams(calc_numlines=False)
        .query(f'{ymin}<=year<{ymax}')
        .assign(period=lambda df: df.year.apply(lambda y: periodize(y,ybin)))
    )
    dfskipruns=get_dfskipruns(dfskip, num_runs=num_runs, incl_existing=force)
    # NOTE(review): groups are keyed by (period, run) only; if several corpora
    # are present their files land in the same group -- confirm this is intended.
    dfgrps=dfskipruns.groupby(['period','run'])
    print(f'Generating {len(dfgrps)} new models over {dfskipruns.period.nunique()} periods and {dfskipruns.run.nunique()} runs')
    return pmap_groups(
        gen_and_save_model,
        dfgrps,
        num_proc=num_proc,
        kwargs=dict(force=force, nskip=nskip_per_yr)
    )

In [12]:
# gen_models(num_runs=10)

## Test models

### Getting model paths

In [13]:

            
def get_model_paths(model_dir=PATH_MODELS,model_fn='model.bin',vocab_fn='vocab.txt',period_len=None):
    """
    Get all models' paths

    Walks `model_dir` and records every directory that contains `model_fn`.
    Two layouts are recognised, judged by the directory's own name:
      .../<corpus>/<period>/<run>   when that name contains 'run_'
      .../<corpus>/<period>         otherwise (no 'run' key is emitted)
    `<period>` must look like '<start>-<end>' with integer bounds.

    Args:
        model_dir: root directory to scan.
        model_fn: model filename that marks a model directory.
        vocab_fn: vocabulary filename; its path is recorded, not checked.
        period_len: if truthy, keep only models with end-start == period_len.

    Returns:
        list of dicts with keys corpus, period_start, period_end, path,
        path_vocab and (for run directories) run.

    Raises:
        ValueError: if a model directory has too few path components to
            identify corpus/period, or the period bounds are not integers.
    """
    ld=[]
    for root,dirs,fns in tqdm(os.walk(model_dir),desc='Scanning directory for models'):
        if model_fn not in fns:
            continue
        # Split on the platform separator instead of a hard-coded '/'.
        parts=os.path.normpath(root).split(os.sep)
        # Decide the layout from the last component BEFORE unpacking, so a
        # two-component 'corpus/period' root no longer fails the 3-way unpack.
        if 'run_' in parts[-1]:
            if len(parts)<3:
                raise ValueError(f'Cannot infer corpus/period/run from model directory: {root}')
            corpus,period,run=parts[-3:]
        else:
            if len(parts)<2:
                raise ValueError(f'Cannot infer corpus/period from model directory: {root}')
            corpus,period=parts[-2:]
            run=None
        period_start=int(period.split('-')[0])
        period_end=int(period.split('-')[-1])
        if period_len and period_end-period_start!=period_len:
            continue
        dx={
            'corpus':corpus,
            'period_start':period_start,
            'period_end':period_end,
            'path':os.path.join(root,model_fn),
            'path_vocab':os.path.join(root,vocab_fn)
        }
        if run is not None: dx['run']=run
        ld.append(dx)
    return ld

In [14]:
YMIN,YMAX,YEARBIN

(1720, 1900, 5)

In [15]:
def get_pathdf_models(period_len=YEARBIN,ymin=YMIN,ymax=YMAX):
    """
    Tabulate the models found under PATH_MODELS_BPO.

    Adds three derived columns to the records from `get_model_paths`:
    'period' ('<start>-<end>'), 'period_len' (end - start) and 'qstr'
    ('vecs(<period>_<suffix>)', where suffix is the text after the last
    underscore of the run label).

    Args:
        period_len: if truthy, keep only rows with this period length.
        ymin: if truthy, keep only rows with period_start >= ymin.
        ymax: if truthy, keep only rows with period_end <= ymax.

    Returns:
        The filtered DataFrame sorted by period_start.
    """
    models=pd.DataFrame(get_model_paths(PATH_MODELS_BPO, 'model.bin'))
    models['period']=[
        '-'.join((str(start),str(end)))
        for start,end in zip(models.period_start, models.period_end)
    ]
    models['period_len']=models.period_end - models.period_start

    qstrs=[]
    for period,run in zip(models.period, models.run):
        run_suffix=run.rsplit('_',1)[-1]
        qstrs.append(f'vecs({period}_{run_suffix})')
    models['qstr']=qstrs

    if period_len:
        models=models[models.period_len==period_len]
    if ymin:
        models=models[models.period_start>=ymin]
    if ymax:
        models=models[models.period_end<=ymax]
    has_period=models.period.notnull()
    return models[has_period].sort_values('period_start')

In [16]:
# get_pathdf_models()

In [17]:
def get_default_models(ymin=YMIN,ymax=YMAX,ybin=YEARBIN,num_runs=10):
    """
    Return the table of default models, using an on-disk cache.

    If FN_DEFAULT_MODEL_PATHS exists it is read with `read_df`; otherwise the
    table is built with `get_pathdf_models` and pickled to that path.

    Args:
        ymin: keep rows with period_start >= ymin.
        ymax: keep rows with period_end <= ymax.
        ybin: period length used when the cache has to be (re)built.
        num_runs: keep rows whose run label is <= 'run_<num_runs, 2 digits>'
            (a string comparison, so it assumes zero-padded two-digit labels).

    Returns:
        The filtered DataFrame.
    """
    if os.path.exists(FN_DEFAULT_MODEL_PATHS):
        odf=read_df(FN_DEFAULT_MODEL_PATHS)
    else:
        # Build the cache WITHOUT a year filter. Previously the defaults of
        # get_pathdf_models (YMIN/YMAX) were baked into the cached file, so a
        # later call asking for a wider ymin/ymax silently got a capped table.
        # The year range is applied by the query below instead.
        # NOTE(review): a cache file written before this change is still
        # capped, and the cache is specific to the `ybin` of the call that
        # created it -- delete the file to rebuild.
        odf=get_pathdf_models(period_len=ybin,ymin=None,ymax=None)
        odf.to_pickle(FN_DEFAULT_MODEL_PATHS)
    return odf.query(f'{ymin}<=period_start & period_end<={ymax} & run<="run_{num_runs:02}"')
    

def get_default_periods(**y):
    """
    Return the distinct period labels of the default models, sorted.

    All keyword arguments are forwarded unchanged to `get_default_models`.
    """
    unique_periods=set(get_default_models(**y).period)
    return sorted(unique_periods)

In [18]:
get_default_periods()

['1720-1725',
 '1725-1730',
 '1730-1735',
 '1735-1740',
 '1740-1745',
 '1745-1750',
 '1750-1755',
 '1755-1760',
 '1760-1765',
 '1765-1770',
 '1770-1775',
 '1775-1780',
 '1780-1785',
 '1785-1790',
 '1790-1795',
 '1795-1800',
 '1800-1805',
 '1805-1810',
 '1810-1815',
 '1815-1820',
 '1820-1825',
 '1825-1830',
 '1830-1835',
 '1835-1840',
 '1840-1845',
 '1845-1850',
 '1850-1855',
 '1855-1860',
 '1860-1865',
 '1865-1870',
 '1870-1875',
 '1875-1880',
 '1880-1885',
 '1885-1890',
 '1890-1895',
 '1895-1900']

## Input prep

In [19]:
def get_periods_runs(period_or_periods=None,run_or_runs=None,num_runs=10):
    """
    Normalise period/run selectors into a pair of sets.

    Args:
        period_or_periods: None (use `get_default_periods()`), a string
            (split with `tokenize_fast`), or any iterable of period labels.
        run_or_runs: None (use 1..num_runs), a single int, a single numeric
            string, or any iterable of run numbers.
        num_runs: upper bound (inclusive) used when `run_or_runs` is None.

    Returns:
        (periods, runs) as two sets.
    """
    if period_or_periods is None:
        period_set=set(get_default_periods())
    elif type(period_or_periods)==str:
        period_set=set(tokenize_fast(period_or_periods))
    else:
        period_set=set(period_or_periods)

    if run_or_runs is None:
        run_set=set(range(1,num_runs+1))
    elif type(run_or_runs)==int:
        run_set={run_or_runs}
    elif type(run_or_runs)==str:
        run_set={int(run_or_runs)}
    else:
        run_set=set(run_or_runs)

    return period_set,run_set

In [20]:
get_periods_runs('1770-1775,1780-1785')

({'1770-1775', '1780-1785'}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10})

In [21]:
# get_default_models()

In [22]:
# get_default_periods(ymin=1800)

### Loading models

In [23]:

def load_model(path_model,path_vocab=None,min_count=None,cache_bin=True,cache=True):
    """
    Return the model stored at `path_model`, preferring MODEL_CACHE.

    Args:
        path_model: path of the model file.
        path_vocab, min_count, cache_bin: forwarded to `do_load_model`.
        cache: when true, return the MODEL_CACHE entry for `path_model` if
            one exists.

    Returns:
        The cached object, or the result of `do_load_model`.
    """
    global MODEL_CACHE

    # NOTE(review): this function only reads MODEL_CACHE; nothing here stores
    # the freshly loaded model back into it.
    cache_hit=cache and path_model in MODEL_CACHE
    if cache_hit:
        return MODEL_CACHE[path_model]

    print('Loading',path_model)
    return do_load_model(
        path_model,
        path_vocab=path_vocab,
        min_count=min_count,
        cache_bin=cache_bin,
    )
    
def do_load_model(path_model,path_vocab=None,min_count=None,cache_bin=True):
    """
    Load a model from disk, preferring its binary (.bin) form.

    Lookup order:
      1. the '.bin' sibling of `path_model` (or `path_model` itself when it
         already ends in '.bin'), loaded with `KeyedVectors.load(mmap='r')`;
      2. `path_model` in word2vec text format, optionally with a vocab file.

    Args:
        path_model: path to the model ('.bin', or a '.txt' word2vec file).
        path_vocab: vocabulary file for the text format; defaults to
            'vocab.txt' next to `path_model`.
        min_count: if truthy and a vocab file was used, the model is passed
            through `filter_model(model, min_count=...)`.
        cache_bin: after loading from text, save a '.bin' copy for next time.

    Returns:
        The loaded model object.

    Raises:
        FileNotFoundError: if neither the binary nor the text file exists.
    """
    if path_model.endswith('.bin'):
        path_model_bin=path_model
    else:
        path_model_bin=path_model.split('.txt')[0]+'.bin'

    if os.path.exists(path_model_bin):
        return gensim.models.KeyedVectors.load(path_model_bin,mmap='r')

    if not os.path.exists(path_model):
        # Previously this branch aborted through an undefined name (NameError);
        # raise an explicit, catchable error instead.
        raise FileNotFoundError(
            f'No model found at {path_model!r} (binary form looked for at {path_model_bin!r})'
        )

    if not path_vocab:
        # Fixed misplaced parenthesis: dirname() takes exactly one argument.
        path_vocab=os.path.join(os.path.dirname(path_model),'vocab.txt')
    if os.path.exists(path_vocab):
        model=gensim.models.KeyedVectors.load_word2vec_format(path_model,path_vocab)
        if min_count: filter_model(model,min_count=min_count)
    else:
        model=gensim.models.KeyedVectors.load_word2vec_format(path_model)
    if cache_bin:
        model.save(path_model_bin)
    return model
    

In [24]:
m=load_model('/home/ryan/github/koselleck/data1/models/bpo/1805-1810/run_25/model.bin')
m.wv.most_similar('virtue')

[Koselleck] (11:05:07) Loading /home/ryan/github/koselleck/data1/models/bpo/1805-1810/run_25/model.bin (+0.1s)


[('folly', 0.7436553835868835),
 ('humanity', 0.7396243810653687),
 ('virtuous', 0.7302984595298767),
 ('social', 0.7297175526618958),
 ('virtues', 0.7252563834190369),
 ('wisdom', 0.7209689021110535),
 ('pride', 0.7133151292800903),
 ('passions', 0.70967036485672),
 ('freedom', 0.7070727944374084),
 ('benevolence', 0.704535186290741)]

In [25]:
m=load_model('/home/ryan/github/koselleck/data1/models/bpo/1945-1950/run_07/model.bin')
m.wv.most_similar(['king','woman'],['man'])

[Koselleck] (11:05:07) Loading /home/ryan/github/koselleck/data1/models/bpo/1945-1950/run_07/model.bin (+0.4s)


[('elizabeth', 0.719698965549469),
 ('queen', 0.6907009482383728),
 ('princess', 0.6652333736419678),
 ("victoria's", 0.574531614780426),
 ('alexandra', 0.5180464386940002),
 ('princesses', 0.5009782910346985),
 ('mary', 0.4933466911315918),
 ('duke', 0.4914858043193817),
 ('crown', 0.49019908905029297),
 ('anne', 0.488145649433136)]

In [26]:
def test_models(dfmodels,gby=['period','run']):
    """
    Run the king - man + woman analogy against one model per group.

    For each group of `dfmodels` (grouped by `gby`, visited in sorted key
    order) the model at the group's LAST row's 'path' is loaded and its 25
    nearest neighbours to (+king, +woman, -man) are collected. Groups whose
    model raises KeyError for that query are skipped.

    Args:
        dfmodels: model table with a 'path' column and the `gby` columns.
        gby: column names to group by.

    Returns:
        DataFrame with one row per tested group: the group key columns,
        'has_queen', 'rank_queen' (0-based position, NaN if absent) and
        'neighborhood' (comma-separated neighbour words).
    """
    rows=[]
    grouped=dfmodels.groupby(gby)
    for group_key,group_df in tqdm(sorted(grouped)):
        model=load_model(group_df.iloc[-1].path)
        try:
            neighbors=model.wv.most_similar(['king','woman'],['man'],topn=25)
        except KeyError:
            continue
        words=[word for word,_score in neighbors]
        found_queen='queen' in words
        row=dict(zip(gby,group_key))
        row['has_queen']=found_queen
        row['rank_queen']=words.index('queen') if found_queen else np.nan
        row['neighborhood']=', '.join(words)
        rows.append(row)
    return pd.DataFrame(rows)

In [27]:
# dfmodels = get_pathdf_models().query('period_len==5')
# dftests  = test_models(dfmodels)
# dftests.to_csv('../../data/data.model.tests.csv')
# dftests.query('has_queen==True').groupby('period').size()
# dftests

In [28]:
def get_new_veclib_word_data_path(word):
    """
    Return the pickle path for a word's cdist data, creating its directory.

    The path is PATH_DB/cdists/data.cdists.<word>.pkl.gz.

    Args:
        word: the word whose data file path is wanted.

    Returns:
        The file path (the file itself is not created).
    """
    ofn=os.path.join(PATH_DB,'cdists',f'data.cdists.{word}.pkl.gz')
    # exist_ok=True replaces the check-then-create pair, which could raise
    # FileExistsError when several worker processes hit this at the same time.
    os.makedirs(os.path.dirname(ofn),exist_ok=True)
    return ofn

In [29]:
get_new_veclib_word_data_path('virtue')

'/home/ryan/github/koselleck/db/cdists/data.cdists.virtue.pkl.gz'

In [30]:
def get_veclib_word_data(word,progress=True,cache=True,cache_only=False,force=False,remove_old=True,
                        periods=None):
    """
    Load the cdist table for `word`, converting legacy data on the way.

    Steps:
      1. If `cache` and not `force` and the pickle exists, read it with
         `read_df` (a read error is printed and treated as a cache miss).
      2. Otherwise, if the legacy file exists, read it through
         `get_veclib_word(word)`, split each '<period>_<run>' key into a
         (period_, run_) index, and -- if `cache` -- pickle the result and
         optionally delete the legacy file.
      3. Tag rows with the word, keep only the requested periods and index
         the result by (word, period, run).

    Args:
        word: word to load.
        progress: print status messages.
        cache: read from / write to the pickle file.
        cache_only: accepted for compatibility; not used by this function.
        force: ignore an existing pickle and rebuild from the legacy file.
        remove_old: delete the legacy file after a successful conversion.
        periods: iterable of period labels to keep; defaults to
            `get_default_periods()`.

    Returns:
        DataFrame indexed by (word, period, run), or an empty DataFrame when
        no data could be loaded.
    """
    if progress: print(f'Loading cdist data for "{word}"')
    odf=pd.DataFrame()
    fnfn=get_new_veclib_word_data_path(word)
    oldfnfn=get_old_veclib_word_data_path(word)

    if cache and not force and os.path.exists(fnfn):
        try:
            odf=read_df(fnfn)
            if progress: print(f'Finished loading cdist data from pkl for "{word}"')
        except Exception as e:
            # Best effort: an unreadable pickle falls through to the legacy file.
            print('!!',e)

    if not len(odf):
        if not os.path.exists(oldfnfn):
            if progress: print(f'No file found at {oldfnfn}')
        else:
            with get_veclib_word(word) as vl:
                dfdist=pd.DataFrame(dict(vl.items())).T.rename_axis('period_run_')
                # An empty store would make the zip(*...) unpack below fail,
                # so only convert (and cache) when there is something to convert.
                if len(dfdist):
                    dfdist['period_'],dfdist['run_']=zip(*[x.split('_') for x in dfdist.index])
                    dfdist['run_']=dfdist['run_'].apply(int)
                    # reset_index(drop=True) discards the 'period_run_' index
                    # directly; the former .drop('period_run_',1) passed the
                    # axis positionally, which pandas 2.x no longer accepts.
                    odf=dfdist.reset_index(drop=True).set_index(['period_','run_'])
                    if cache:
                        if progress: print(f'Saving dfdist to "{fnfn}"')
                        odf.to_pickle(fnfn)
                        if remove_old and os.path.exists(oldfnfn):
                            if progress: print(f'Removing old data from "{oldfnfn}"')
                            os.remove(oldfnfn)
                if progress: print(f'Finished loading cdist data from sqlite for "{word}"')

    if not len(odf): return odf
    odf['word_']=word
    odf=odf.reset_index()

    if periods is None: periods=set(get_default_periods())
    odf=odf[odf.period_.isin(periods)]
    odf=odf.set_index(['word_','period_','run_']).rename_axis(['word','period','run'])
    return odf

In [31]:
dfdist=get_veclib_word_data('histories',force=False,remove_old=False,cache_only=False)
dfdist

[Koselleck] (11:05:08) Loading cdist data for "histories" (+0.4s)
[Koselleck] (11:05:08) Finished loading cdist data from pkl for "histories" (+0.3s)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,a',a's,aa,aaa,aad,aal,aam,aaron,aas,ab,...,zoom,zoroaster,zos,zr,zs,zu,zulu,zulus,zurich,zz
word,period,run,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
histories,1780-1785,2,1.728527,1.517937,1.706469,1.505782,1.620336,1.681973,,,1.318497,1.733148,...,,,,1.640306,1.122436,,,,,
histories,1780-1785,4,2.347848,1.828671,1.905335,1.601725,2.012108,2.415577,,,1.514773,2.028202,...,,,,1.897767,1.622288,,,,,
histories,1780-1785,3,2.062550,1.714686,2.258954,,1.807324,1.941981,,,1.489314,1.899359,...,,,,1.497988,1.410675,,,,,
histories,1780-1785,5,1.736573,1.800321,1.911046,1.746098,1.874663,2.108638,,,1.873124,1.882235,...,,,,1.793693,1.223120,,,,,
histories,1780-1785,6,2.178246,1.901612,2.201326,1.856099,2.087203,1.823636,,,1.535926,2.307930,...,,,,1.878862,1.641602,,,,,
histories,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
histories,1895-1900,6,1.896813,,1.792559,,1.201605,,,,,2.084911,...,,,,,,1.725056,,,,
histories,1895-1900,7,2.270421,,1.721304,,1.545514,,,,,2.223810,...,,,,,1.656098,2.077523,,,,
histories,1895-1900,8,2.382183,,1.914793,,1.539141,,,,,2.028859,...,,,,,,1.965047,,,,
histories,1895-1900,9,2.133486,,1.548652,,1.615873,,,,,2.159049,...,,,,,,1.880041,,,,


In [32]:
def get_all_words_in_sqlite_data():
    """
    List the words that have a file under PATH_DB/wvecs.

    For every directory entry, the word is the last dot-separated token of
    whatever precedes the first '.sqlite' in the filename. Entries are not
    filtered by extension or de-duplicated.

    Returns:
        list of words, in `os.listdir` order.
    """
    wvecs_dir=os.path.join(PATH_DB,'wvecs')
    words=[]
    for fname in os.listdir(wvecs_dir):
        stem=fname.split('.sqlite')[0]
        words.append(stem.rpartition('.')[2])
    return words

In [33]:
# words=get_all_words_in_sqlite_data()
# len(words),random.sample(words,10)

In [34]:
def _get_veclib_word_data_(objd):
    """Single-argument adapter: call `get_veclib_word_data` with `objd` as keyword arguments."""
    kwargs=objd
    return get_veclib_word_data(**kwargs)
def reformat_all_sqlite_data(words=None,lim=None,num_proc=1,remove_old=True):
    """
    Convert legacy per-word data into pickle files for many words.

    Each word is processed by `get_veclib_word_data` with progress output
    off, caching on and `force=True` (so an existing pickle is rebuilt).

    Args:
        words: words to convert; defaults to `get_all_words_in_sqlite_data()`.
        lim: keep only the first `lim` words (None keeps all).
        num_proc: forwarded to `pmap`.
        remove_old: forwarded to `get_veclib_word_data`.

    Returns:
        Whatever `pmap` returns for the per-word results.
    """
    if words is None:
        todo=get_all_words_in_sqlite_data()[:lim]
    else:
        todo=list(words)[:lim]

    jobs=[]
    for word in todo:
        jobs.append(dict(
            word=word,
            progress=False,
            cache=True,
            force=True,
            remove_old=remove_old,
            cache_only=True,
        ))

    return pmap(
        _get_veclib_word_data_,
        jobs,
        num_proc=num_proc,
        desc='Reformatting old sqlite data into pkl files',
        use_threads=False
    )

In [35]:
# words=get_valid_words()
# random.shuffle(words)
# res=reformat_all_sqlite_data(words,lim=None,num_proc=4)