# Word embedding models

In [4]:
from ipynb.fs.full.koselleck import *

[Koselleck] (17:38:42) Alles bereit (+0.0s)


## Skipgrams

### Generated elsewhere...

### Loading

In [5]:
def get_skipgrams(idir=PATH_SKIPGRAMS_YR,skipgram_n=25, calc_numlines=False):
    """
    List the skipgram files in `idir` as a DataFrame sorted by corpus and year.

    Only file names starting with 'data.skipgrams' are used. For each file:
      * corpus -- the third dot-separated component of the file name
      * year   -- the first all-digit dot-separated component, as an int
      * path   -- the file name joined onto `idir`

    With `calc_numlines`, a `num_lines` column is computed with get_numlines
    and `num_words` is set to num_lines * skipgram_n.
    """
    records=[]
    for fname in os.listdir(idir):
        if not fname.startswith('data.skipgrams'):
            continue
        parts=fname.split('.')
        corpus=parts[2]
        digit_parts=[p for p in parts if p.isdigit()]
        records.append({
            'corpus':corpus,
            'year':int(digit_parts[0]),
            'path':os.path.join(idir,fname),
        })
    table=pd.DataFrame(records).sort_values(['corpus','year'])
    if not calc_numlines:
        return table
    table['num_lines']=table.path.progress_apply(get_numlines)
    table['num_words']=table['num_lines']*skipgram_n
    return table

In [6]:
# get_skipgrams()

In [7]:
# dfskip=get_skipgrams(calc_numlines=True)
# dfskip['period']=dfskip.year.apply(lambda y: periodize(y,YEARBIN))
# dfskip

In [8]:
# dfskip.groupby('period').num_words.sum()

In [9]:
def get_dfskipruns(dfskip,num_runs=10,incl_existing=False):
    """
    Repeat the skipgram table once per run and attach each row's model path.

    Every copy of `dfskip` gets a `run` label ('run_01', 'run_02', ...).
    `opath` is PATH_MODELS_NEW/<corpus>/<period>/<run>/model.bin and
    `opath_exists` records whether that file is on disk. Unless
    `incl_existing` is set, rows whose model file already exists are dropped.
    """
    def _model_path(row):
        return os.path.join(PATH_MODELS_NEW,row.corpus,row.period,row.run,'model.bin')

    copies=[]
    for run_num in range(1,num_runs+1):
        label='run_'+str(run_num).zfill(2)
        copies.append(dfskip.assign(run=label))
    runs_df=pd.concat(copies)
    runs_df['opath']=runs_df.apply(_model_path,axis=1)
    runs_df['opath_exists']=runs_df.opath.apply(os.path.exists)
    if incl_existing:
        return runs_df
    return runs_df[runs_df.opath_exists==False]

In [10]:
# get_dfskipruns(dfskip,num_runs=2)

## Generate models

In [11]:
def gen_and_save_model(dfskip,nskip=DEFAULT_NUM_SKIP,force=False,vector_size=100,window=10,min_count=5,epochs=10,workers=8,verbose=False):
    """
    Build and save one Word2Vec model for a group of skipgram files.

    The output location is PATH_MODELS_NEW/<corpus>/<period>/<run>/model.bin,
    with corpus, period and run read from the FIRST row of `dfskip`. The model
    is only (re)built when `force` is set or the file does not exist yet.

    Returns a one-row DataFrame whose `fnfn` column holds the model path.
    """
    first=dfskip.iloc[0]
    out_dir=os.path.join(PATH_MODELS_NEW,first.corpus,first.period,first.run)
    out_path=os.path.join(out_dir,'model.bin')
    result=pd.DataFrame([{'fnfn':out_path}])

    # nothing to do when the model is already on disk and no rebuild was asked for
    if not force and os.path.exists(out_path):
        return result

    ensure_dir_exists(out_dir)
    sampler=SkipgramsSamplers(dfskip.path, nskip)
    if verbose:
        enable_gensim_logging()
    else:
        disable_gensim_logging()
    w2v=Word2Vec(
        sentences=sampler,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        epochs=epochs,
        workers=workers,
    )
    w2v.save(out_path)
    return result


In [12]:
# fnfn=gen_and_save_model(get_dfskipruns(dfskip).iloc[:1], force=True).fnfn.iloc[0]
# load_model(fnfn).wv.most_similar('value')

In [13]:
# res=pmap_groups(
#     gen_and_save_model,
#     dfskipruns.groupby(['period','run']),
#     num_proc=4,
#     kwargs=dict(force=True, nskip=NSKIP_PER_YR)
# )

In [14]:
def gen_models(
        ybin=5,
        ymin=1680,
        ymax=1970,
        num_runs=1,
        force=False,
        nskip_per_yr=NSKIP_PER_YR,
        num_proc=4
    ):
    """
    Generate the models for every (period, run) group of skipgram files.

    ybin         -- bin width passed to periodize() to turn a year into a period
    ymin, ymax   -- keep skipgram files with ymin <= year < ymax
    num_runs     -- number of runs per period
    force        -- also regenerate models whose file already exists
    nskip_per_yr -- passed to gen_and_save_model as `nskip`
    num_proc     -- number of processes handed to pmap_groups

    Returns whatever pmap_groups returns for gen_and_save_model.
    """
    dfskip=get_skipgrams(calc_numlines=False).query(f'{ymin}<=year<{ymax}')
    # assign() builds a new frame, so no column is written into the result of query()
    dfskip=dfskip.assign(period=dfskip.year.apply(lambda y: periodize(y,ybin)))
    dfskipruns=get_dfskipruns(dfskip, num_runs=num_runs, incl_existing=force)
    dfgrps=dfskipruns.groupby(['period','run'])
    print(f'Generating {len(dfgrps)} new models over {dfskipruns.period.nunique()} periods and {dfskipruns.run.nunique()} runs')
    return pmap_groups(
        gen_and_save_model,
        dfgrps,
        num_proc=num_proc,
        kwargs=dict(force=force, nskip=nskip_per_yr)
    )

In [15]:
# gen_models(num_runs=10)

## Test models

### Getting model paths

In [16]:

            
def get_model_paths(model_dir=PATH_MODELS,model_fn='model.bin',vocab_fn='vocab.txt',period_len=None):
    """
    Get all models' paths.

    Walks `model_dir` and, for every directory containing `model_fn`, emits a
    dict with keys corpus, period_start, period_end, path, path_vocab and
    (only when present) run.

    Two layouts are recognised from the directory path:
      .../<corpus>/<period>/<run>   when the last component contains 'run_'
      .../<corpus>/<period>         otherwise
    <period> is expected to look like '<start>-<end>' with integer parts.

    If `period_len` is given, models whose end - start differs from it are
    skipped.
    """
    ld=[]
    for root,dirs,fns in tqdm(os.walk(model_dir),desc='Scanning directory for models'):
        if model_fn not in fns:
            continue
        # normalise the platform separator so the split also works where os.sep is not '/'
        parts=root.replace(os.sep,'/').split('/')
        # only read three components when there really is a run directory;
        # a two-component path used to fail on the 3-way unpack
        if len(parts)>=3 and 'run_' in parts[-1]:
            corpus,period,run=parts[-3:]
        else:
            corpus,period=parts[-2:]
            run=None
        period_start=int(period.split('-')[0])
        period_end=int(period.split('-')[-1])
        dx={
            'corpus':corpus,
            'period_start':period_start,
            'period_end':period_end,
            'path':os.path.join(root,model_fn),
            'path_vocab':os.path.join(root,vocab_fn)
        }
        if run is not None: dx['run']=run
        if period_len and period_end-period_start!=period_len:
            continue
        ld.append(dx)
    return ld

In [17]:
YMIN,YMAX,YEARBIN

(1720, 1900, 5)

In [18]:
def get_pathdf_models(period_len=YEARBIN,ymin=YMIN,ymax=YMAX):
    """
    Table of the models found under PATH_MODELS_BPO, sorted by period_start.

    Adds three columns to the records from get_model_paths:
      period     -- '<period_start>-<period_end>'
      period_len -- period_end - period_start
      qstr       -- 'vecs(<period>_<NN>)', NN being the part of `run` after
                    its last underscore

    Each of period_len / ymin / ymax filters the table only when truthy.
    """
    table=pd.DataFrame(get_model_paths(PATH_MODELS_BPO, 'model.bin'))
    starts,ends=table.period_start,table.period_end
    table['period']=[f'{start}-{end}' for start,end in zip(starts,ends)]
    table['period_len']=ends-starts

    query_strings=[]
    for period,run in zip(table.period,table.run):
        run_id=run.split("_")[-1]
        query_strings.append(f'vecs({period}_{run_id})')
    table['qstr']=query_strings

    if period_len:
        table=table[table.period_len==period_len]
    if ymin:
        table=table[table.period_start>=ymin]
    if ymax:
        table=table[table.period_end<=ymax]
    has_period=table.period.notnull()
    return table[has_period].sort_values('period_start')

In [19]:
# get_pathdf_models()

In [20]:
def get_default_models(ymin=YMIN,ymax=YMAX,ybin=YEARBIN,num_runs=10):
    """
    Return the default model table, restricted to a year range and run count.

    The table is read from FN_DEFAULT_MODEL_PATHS when that file exists;
    otherwise it is built with get_pathdf_models(period_len=ybin) and pickled
    there. Note that `ybin` is only consulted when the file has to be built.

    Rows are kept when ymin <= period_start, period_end <= ymax and the `run`
    label compares (as a string) <= 'run_<num_runs, zero-padded to 2 digits>'.
    """
    if not os.path.exists(FN_DEFAULT_MODEL_PATHS):
        models=get_pathdf_models(period_len=ybin)
        models.to_pickle(FN_DEFAULT_MODEL_PATHS)
    else:
        models=read_df(FN_DEFAULT_MODEL_PATHS)
    last_run=f'run_{num_runs:02}'
    return models.query(f'{ymin}<=period_start & period_end<={ymax} & run<="{last_run}"')
    

def get_default_periods(**y):
    """Sorted list of the distinct periods among the default models.

    Keyword arguments are forwarded unchanged to get_default_models.
    """
    models=get_default_models(**y)
    distinct_periods=set(models.period)
    return sorted(distinct_periods)

In [1]:
# get_default_periods()

## Input prep

In [22]:
def get_periods_runs(period_or_periods=None,run_or_runs=None,num_runs=10):
    """
    Normalise period and run selections into two sets.

    period_or_periods -- None (use get_default_periods()), a single string
                         (split with tokenize_fast) or an iterable of periods
    run_or_runs       -- None (use runs 1..num_runs), a single integer, a
                         single string holding an integer, or an iterable
    num_runs          -- number of runs used when run_or_runs is None

    Returns (periods, runs), both as sets. A single run is always stored as a
    plain int, including integer types that are not `int` itself (e.g. a numpy
    integer read from a DataFrame), which used to fail in set().
    """
    import numbers

    periods=period_or_periods
    if periods is None: periods=get_default_periods()
    if isinstance(periods,str): periods=tokenize_fast(periods)
    periods=set(periods)

    runs=run_or_runs
    if runs is None:
        runs=range(1,num_runs+1)
    elif isinstance(runs,(str,numbers.Integral)):
        # a lone run number, given either as an integer or as its string form
        runs=[int(runs)]
    runs=set(runs)
    return periods,runs

In [2]:
# get_periods_runs('1770-1775,1780-1785')

In [24]:
# get_default_models()

In [25]:
# get_default_periods(ymin=1800)

### Loading models

In [4]:

def load_model(path_model,path_vocab=None,min_count=None,cache_bin=True,cache=False):
    """
    Load a model from `path_model`, optionally through the in-memory cache.

    path_vocab, min_count, cache_bin -- forwarded to do_load_model
    cache -- when True, return the entry for `path_model` from MODEL_CACHE if
             present and otherwise store the freshly loaded model there.
             The cache key is the model path only; path_vocab and min_count
             are not part of it.
    """
    global MODEL_CACHE

    if cache and path_model in MODEL_CACHE: return MODEL_CACHE[path_model]
    model=do_load_model(path_model,path_vocab=path_vocab,min_count=min_count,cache_bin=cache_bin)
    # previously nothing was ever written to the cache, so cache=True never hit
    if cache and model is not None: MODEL_CACHE[path_model]=model
    return model
    
def do_load_model(path_model,path_vocab=None,min_count=None,cache_bin=True):
    """
    Load a model from disk, preferring a saved '.bin' next to a text model.

    path_model -- path to the model; if it does not end in '.bin', the binary
                  path is everything before '.txt' plus '.bin'
    path_vocab -- vocabulary file for the word2vec text format; defaults to
                  'vocab.txt' in the directory of `path_model`
    min_count  -- when truthy and a vocab file exists, filter_model is applied
                  with this value
    cache_bin  -- after loading from the text format, save the model to the
                  binary path

    Raises FileNotFoundError when neither the binary nor the text model exists.
    """
    if path_model.endswith('.bin'):
        path_model_bin=path_model
    else:
        path_model_bin=path_model.split('.txt')[0]+'.bin'

    if os.path.exists(path_model_bin):
        return gensim.models.KeyedVectors.load(path_model_bin,mmap='r')

    if not os.path.exists(path_model):
        print('!!??',path_model)
        raise FileNotFoundError(f'No model found at {path_model!r} or {path_model_bin!r}')

    # the default vocab file sits beside the model file
    if not path_vocab: path_vocab=os.path.join(os.path.dirname(path_model),'vocab.txt')
    if os.path.exists(path_vocab):
        model = gensim.models.KeyedVectors.load_word2vec_format(path_model,path_vocab)
        if min_count: filter_model(model,min_count=min_count)
    else:
        model = gensim.models.KeyedVectors.load_word2vec_format(path_model)
    if cache_bin:
        model.save(path_model_bin)
    return model
    

In [5]:
# row = get_default_models().sample(n=1).iloc[0]
# load_model_row(row)

In [42]:
# m=load_model('/home/ryan/github/koselleck/data1/models/bpo/1805-1810/run_25/model.bin')
# m.wv.most_similar('virtue')

In [41]:
# m=load_model('/home/ryan/github/koselleck/data1/models/bpo/1945-1950/run_07/model.bin')
# m.wv.most_similar(['king','woman'],['man'])

In [30]:
def test_models(dfmodels,gby=['period','run']):
    """
    Run the king - man + woman analogy on one model per group.

    `dfmodels` is grouped by `gby`; for each group (in sorted key order) the
    model at the LAST row's `path` is loaded and asked for the 25 nearest
    words to positive ['king','woman'] / negative ['man']. Groups whose model
    raises KeyError on that query are skipped.

    Returns a DataFrame with the group key columns plus:
      has_queen    -- whether 'queen' is among the 25 words
      rank_queen   -- its 0-based position in that list, NaN if absent
      neighborhood -- the 25 words joined by ', '
    """
    rows=[]
    groups=dfmodels.groupby(gby)
    for key,group in tqdm(sorted(groups)):
        model=load_model(group.iloc[-1].path)
        try:
            neighbors=model.wv.most_similar(['king','woman'],['man'],topn=25)
        except KeyError:
            continue
        words=[word for word,_ in neighbors]
        found='queen' in words
        rank=words.index('queen') if found else np.nan
        rows.append({
            **dict(zip(gby,key)),
            'has_queen':found,
            'rank_queen':rank,
            'neighborhood':', '.join(words),
        })
    return pd.DataFrame(rows)

In [31]:
# dfmodels = get_pathdf_models().query('period_len==5')
# dftests  = test_models(dfmodels)
# dftests.to_csv('../../data/data.model.tests.csv')
# dftests.query('has_queen==True').groupby('period').size()
# dftests

In [32]:
def get_new_veclib_word_data_path(word):
    """
    Return the pickle path for a word's cdist data, creating its directory.

    The path is PATH_DB/cdists/data.cdists.<word>.pkl.gz.
    """
    ofn=os.path.join(PATH_DB,'cdists',f'data.cdists.{word}.pkl.gz')
    # exist_ok avoids the check-then-create race when several workers call this at once
    os.makedirs(os.path.dirname(ofn),exist_ok=True)
    return ofn

In [33]:
get_new_veclib_word_data_path('virtue')

'/home/ryan/github/koselleck/db/cdists/data.cdists.virtue.pkl.gz'

In [34]:
def get_veclib_word_data(word,progress=True,cache=True,cache_only=False,force=False,remove_old=True,
                        periods=None):
    """
    Load the cdist data for `word`, indexed by (word, period, run).

    The pickle from get_new_veclib_word_data_path is tried first (when `cache`
    is on and `force` is off). If that yields nothing, the old store at
    get_old_veclib_word_data_path is read via get_veclib_word, reshaped, and,
    with `cache`, written to the pickle path.

    progress   -- print status messages
    cache      -- read from / write to the pickle file
    cache_only -- accepted for callers that pass it; not used in this function
    force      -- skip the pickle and rebuild from the old store
    remove_old -- after writing the pickle, delete the old store file
    periods    -- periods to keep; defaults to get_default_periods()

    Returns an empty DataFrame when no data could be loaded.
    """
    if progress: print(f'Loading cdist data for "{word}"')
    odf=pd.DataFrame()
    fnfn=get_new_veclib_word_data_path(word)
    oldfnfn=get_old_veclib_word_data_path(word)
    if cache and not force and os.path.exists(fnfn):
        try:
            odf=read_df(fnfn)
            if progress: print(f'Finished loading cdist data from pkl for "{word}"')
        except Exception as e:
            # best effort: an unreadable pickle falls through to the old store below
            print('!!',e)
    if not len(odf):
        if not os.path.exists(oldfnfn):
            if progress: print(f'No file found at {oldfnfn}')
        else:
            with get_veclib_word(word) as vl:
                dfdist=pd.DataFrame(dict(vl.items())).T.rename_axis('period_run_')
                # keys look like '<period>_<run>'; split them into two columns
                dfdist['period_'],dfdist['run_']=zip(*[x.split('_') for x in dfdist.index])
                dfdist['run_']=dfdist['run_'].apply(int)
                # keyword form: pandas 2.x no longer accepts the axis positionally
                odf=dfdist.reset_index().drop(columns='period_run_').set_index(['period_','run_'])
                if cache:
                    if progress: print(f'Saving dfdist to "{fnfn}"')
                    odf.to_pickle(fnfn)
                    if remove_old and os.path.exists(oldfnfn):
                        if progress: print(f'Removing old data from "{oldfnfn}"')
                        os.remove(oldfnfn)
                if progress: print(f'Finished loading cdist data from sqlite for "{word}"')
    if not len(odf): return odf
    odf['word_']=word
    odf=odf.reset_index()

    if periods is None: periods=set(get_default_periods())
    odf=odf[odf.period_.isin(periods)]
    odf=odf.set_index(['word_','period_','run_']).rename_axis(['word','period','run'])
    return odf

In [35]:
# dfdist=get_veclib_word_data('histories',force=False,remove_old=False,cache_only=False)
# dfdist

In [36]:
def get_all_words_in_sqlite_data():
    """
    Words derived from the file names in PATH_DB/wvecs.

    For each file name, the text before '.sqlite' is taken and its last
    dot-separated component is used as the word.
    """
    wvecs_dir=os.path.join(PATH_DB,'wvecs')
    words=[]
    for fname in os.listdir(wvecs_dir):
        stem=fname.split('.sqlite')[0]
        words.append(stem.split('.')[-1])
    return words

In [37]:
# words=get_all_words_in_sqlite_data()
# len(words),random.sample(words,10)

In [38]:
def _get_veclib_word_data_(objd):
    """Call get_veclib_word_data with the keyword arguments held in dict `objd`."""
    return get_veclib_word_data(**objd)
def reformat_all_sqlite_data(words=None,lim=None,num_proc=1,remove_old=True):
    """
    Re-save the old per-word data as pkl files, one pmap job per word.

    words      -- words to process; defaults to get_all_words_in_sqlite_data()
    lim        -- process at most this many words (None = all)
    num_proc   -- passed to pmap
    remove_old -- forwarded to get_veclib_word_data

    Each job calls get_veclib_word_data with progress=False, cache=True,
    force=True and cache_only=True. Returns pmap's result.
    """
    if words is None:
        selected=get_all_words_in_sqlite_data()
    else:
        selected=list(words)
    selected=selected[:lim]

    jobs=[]
    for word in selected:
        jobs.append(dict(
            word=word,
            progress=False,
            cache=True,
            force=True,
            remove_old=remove_old,
            cache_only=True,
        ))
    return pmap(
        _get_veclib_word_data_,
        jobs,
        num_proc=num_proc,
        desc='Reformatting old sqlite data into pkl files',
        use_threads=False
    )

In [39]:
# words=get_valid_words()
# random.shuffle(words)
# res=reformat_all_sqlite_data(words,lim=None,num_proc=4)

## Misc. functions

In [40]:


def measure_ambiguity(model,words=None,topn=10):
    """
    Per-word ambiguity: 1 minus the clustering coefficient in the semantic net.

    NOTE: a second definition of measure_ambiguity further down in this file
    (adding a `z` option) rebinds the name, so this version is shadowed once
    the whole cell has run.
    """
    graph=to_semnet_from_dist(to_dist(model,words=words),topn=topn)
    clustering=pd.Series(nx.clustering(graph)).sort_values()
    return 1-clustering

def to_dist(m,words=None,z=1,norm=1,maxwords=10000):
    """
    Pairwise word-by-word matrix built from the model's vectors.

    m        -- model exposing `wv` (index_to_key, key_to_index, item lookup)
    words    -- words to include; those missing from the model are dropped.
                None means the first `maxwords` entries of the vocabulary.
    z        -- when truthy, standardise the matrix (see NOTE below)
    norm     -- when truthy, replace each value v by (max value - v)
    maxwords -- vocabulary cap used only when `words` is None

    Rows and columns follow the model's own vocabulary order, so the result
    is the same from run to run.

    Returns a square DataFrame rounded to 6 decimals, in which every cell
    equal to the matrix maximum has been replaced by NaN.
    """
    vocab=m.wv.index_to_key
    if words is None:
        # slicing (rather than indexing 0..maxwords-1) tolerates small vocabularies
        words=list(vocab[:maxwords])
    else:
        wanted=set(words)
        words=[w for w in vocab if w in wanted]
    vecs = np.array([m.wv[w] for w in words], dtype=np.float64)
    omatrix = fastdist.cosine_pairwise_distance(vecs, return_matrix=True)
    odf = round(pd.DataFrame(omatrix, index=words, columns=words),6)
    maxv=odf.max().max()
    odf=odf.replace({maxv:np.nan})
    if norm: odf=maxv - odf
    # NOTE(review): the divisor is the std of the per-column stds, not the std
    # of all cells -- kept as is because downstream numbers depend on it; confirm intent
    if z: odf=(odf - odf.mean().mean())/odf.std().std()
    return odf

def to_semnet_from_dist(dfdist,cutoff=3,topn=10):
    """
    Build an undirected graph linking each word to its `topn` lowest-valued columns.

    For every column of `dfdist`, the `topn` entries with the smallest values
    are connected to that column's word; the edge weight is the negated value.
    `cutoff` is accepted but not used.
    """
    graph=nx.Graph()
    for source in dfdist.columns:
        nearest=dfdist[source].sort_values(ascending=True).iloc[:topn]
        for target,score in nearest.items():
            graph.add_edge(source,target,weight=-score)
    return graph

def measure_ambiguity(model,words=None,topn=10,z=False):
    """
    Per-word ambiguity: 1 minus the word's clustering coefficient.

    The coefficient is computed on the graph from to_semnet_from_dist, built
    over to_dist(model, words). With `z`, the scores are standardised
    (mean subtracted, divided by the standard deviation).
    """
    dist=to_dist(model,words=words)
    net=to_semnet_from_dist(dist,topn=topn)
    coefs=pd.Series(nx.clustering(net)).sort_values()
    ambiguity=1-coefs
    if not z:
        return ambiguity
    return (ambiguity - ambiguity.mean())/ambiguity.std()

def get_any_model(dfpath=None):
    """
    Load the model of one randomly sampled row of `dfpath`.

    `dfpath` defaults to get_pathdf_models_bydecade(); the sampled row is
    passed to load_model_row.
    """
    if dfpath is None:
        dfpath=get_pathdf_models_bydecade()
    picked=dfpath.sample(n=1)
    return load_model_row(picked.iloc[0])

def measure_freq(model,words=None,tf=True,z=False):
    """
    Word counts from the model's vocabulary, as a Series indexed by word.

    words -- restrict to these words (those absent from the model are
             dropped); a falsy value means the whole vocabulary
    tf    -- divide each count by the sum of the selected counts
    z     -- standardise the (possibly tf-scaled) values
    """
    vocab=set(model.wv.key_to_index.keys())
    if words:
        selected=vocab&set(words)
    else:
        selected=vocab

    counts={}
    for word in selected:
        counts[word]=model.wv.get_vecattr(word,'count')

    freqs=pd.Series(counts)
    if tf:
        freqs=freqs/freqs.sum()
    if z:
        freqs=(freqs-freqs.mean())/freqs.std()
    return freqs

# Lazily created inflect engine, shared across calls to measure_singularism.
INFLECTER=None
def measure_singularism(m,words=None,z=True):
    """
    Compare how often each word occurs in the singular vs. its plural form.

    m     -- model exposing `wv.key_to_index` (and whatever measure_freq needs)
    words -- candidate singular words; defaults to get_valid_words(). Only
             words present in the model are used.
    z     -- passed to measure_freq, and additionally every numeric column of
             the result is standardised

    Plurals come from inflect's plural_noun; words whose plural equals the
    word itself are left out. The result is indexed by `word`, sorted by
    `rank_diff`, with rows containing missing values dropped. The plural
    column is named 'word_pural' (spelling kept as the existing column name).
    """
    global INFLECTER
    if INFLECTER is None:
        import inflect
        INFLECTER=inflect.engine()
    engine=INFLECTER

    if not words:
        words=get_valid_words()
    vocab=set(m.wv.key_to_index.keys())
    singulars=list(set(words) & vocab)
    plurals = pmap(engine.plural_noun, singulars, num_proc=1, progress=False)

    freq_series=measure_freq(m,words=set(singulars+plurals),z=z)
    freq=dict(freq_series)

    rows=[]
    for sing,plur in zip(singulars,plurals):
        if sing==plur:
            continue
        rows.append({
            'word':sing,
            'word_pural':plur,
            'freq_sing':freq.get(sing),
            'freq_plural':freq.get(plur),
        })
    table=pd.DataFrame(rows)

    table['freq_diff']=table['freq_sing']-table['freq_plural']
    table['rank_sing']=table['freq_sing'].rank(ascending=True)
    table['rank_plural']=table['freq_plural'].rank(ascending=True)
    table['rank_diff']=table['rank_sing']-table['rank_plural']
    if z:
        for col in table.select_dtypes('number').columns:
            column=table[col]
            table[col]=(column - column.mean())/column.std()
    return table.set_index('word').sort_values('rank_diff').dropna()
