# Simplified functions

In [1]:
from ipynb.fs.full.koselleck import *
# Sample word referenced by the commented-out example cells (e.g. `nbr(w)`).
w='culture'
# Default year range and bin width for word-distance queries (`wdist`),
# aliased from the *_NBR settings (presumably provided by the star import
# from `koselleck` -- not defined in this file).
YMIN_WDIST=YMIN_NBR
YMAX_WDIST=YMAX_NBR
YBIN_WDIST=YBIN_NBR

In [2]:
def get_words_corpora_periods(
        word_or_words, period_or_periods=None, corpus_or_corpora=DEFAULT_CORPUS,
        ybin=YBIN_WDIST, ymin=YMIN_WDIST, ymax=YMAX_WDIST):
    """Normalize the word / corpus / period arguments of a query.

    Words and corpora are each passed through `to_words`. Periods are
    passed through `to_words` when given (truthy); otherwise they are
    generated by `get_periods_bystep(ymin=..., ymax=..., ybin=...)`.

    Returns:
        A `(words, corpora, periods)` tuple.
    """
    word_list = to_words(word_or_words)
    corpus_list = to_words(corpus_or_corpora)
    if period_or_periods:
        period_list = to_words(period_or_periods)
    else:
        # No explicit periods: fall back to the stepped year range.
        period_list = get_periods_bystep(ymin=ymin, ymax=ymax, ybin=ybin)
    return word_list, corpus_list, period_list

In [3]:
def wdist(word_or_words, period_or_periods=None, corpus_or_corpora=DEFAULT_CORPUS,
          ybin=YBIN_WDIST, ymin=YMIN_WDIST, ymax=YMAX_WDIST, **attrs):
    """Load stored word-distance vectors for every word/corpus/period combo.

    Each combination is looked up in the 'wdists' database under the key
    ``'{word}/{corpus}/{period}'``; missing or empty entries are skipped.

    Args:
        word_or_words, period_or_periods, corpus_or_corpora: normalized via
            `get_words_corpora_periods`.
        ybin, ymin, ymax: used to generate periods when none are given.
        **attrs: accepted and ignored (lets callers pass extra options through).

    Returns:
        DataFrame with one column per hit, named ``'{word}_{corpus}_{period}'``,
        rows sorted ascending by their mean across columns.
    """
    words,corpora,periods = get_words_corpora_periods(
        word_or_words,period_or_periods,corpus_or_corpora,
        ybin=ybin,ymin=ymin,ymax=ymax
    )
    queries = [f'{w}/{c}/{p}' for w in words for c in corpora for p in periods]
    with get_db('wdists',mode='r') as db:
        found = {}
        for qstr in queries:
            if qstr not in db:
                continue
            # Read each entry once instead of once for the length check and
            # again for the value.
            vals = db[qstr]
            if len(vals):
                found[qstr.replace('/','_')] = vals
        odf = pd.DataFrame(found)
        # Temporary helper column used only to order the rows.
        odf['_avg_'] = odf.mean(axis=1)
        # `columns=` keyword: the positional axis argument to drop() was
        # removed in pandas 2.0.
        return odf.sort_values('_avg_').drop(columns='_avg_')

In [4]:
# wdist(random.choice(get_valid_words()))

In [5]:
# wdist('value',ybin=5)

In [6]:
# Default number of nearest neighbors kept per word/corpus/period column.
K_NBR=100

def nbr(word_or_words,ntop=K_NBR,**wdist_opts):
    """Return the `ntop` nearest neighbors for each word/corpus/period.

    Args:
        word_or_words: forwarded to `wdist`.
        ntop: how many smallest-distance neighbors to keep per column.
        **wdist_opts: forwarded to `wdist`.

    Returns:
        DataFrame indexed by (word, corpus, period, neighbor) with columns
        `dist` and `rank` (1 = closest; tied ranks are averaged, then
        truncated to int). Empty, but with the same index levels and
        columns, when `wdist` yields no data.
    """
    dfdists = wdist(word_or_words,**wdist_opts)
    fields = ['word','corpus','period','neighbor','dist','rank']
    rows = []
    for col in dfdists.columns:
        # Column names are '{word}_{corpus}_{period}'; parse once per column.
        word,corpus,period = col.split('_')
        # Drop NaN distances first: words absent from this column would
        # otherwise get a NaN rank and crash int() below.
        nearest = dfdists[col].dropna().sort_values().iloc[:ntop]
        ranks = nearest.rank(ascending=True)
        for neighbor,dist,rankval in zip(nearest.index,nearest,ranks):
            rows.append(dict(
                word=word,
                corpus=corpus,
                period=period,
                neighbor=neighbor,
                dist=dist,
                rank=int(rankval)
            ))
    # Explicit columns keep set_index working when there are no rows.
    return pd.DataFrame(rows,columns=fields).set_index(fields[:4])

In [7]:
# nbr(w)

## LNM

In [8]:
# def ldist(word_or_words, k=K, incl_words=False, force=False,progress=True,num_proc=1,
#           ybin=YBIN_DISTMAT, ymin=YMIN_DISTMAT, ymax=YMAX_DISTMAT, **wdist_opts):
#     dfdist=wdist(word_or_words=word_or_words, ybin=ybin, ymin=ymin, ymax=ymax, **wdist_opts)
#     l=[]
#     qcols=[(col1,col2,f"{col1}/{col2}/k={k}/incl_words={incl_words}") for col1 in dfdist.columns for col2 in dfdist.columns if col1<col2]

#     dbdone,dbqueue={},{}
#     if force:
#         ql=qcols
#     else:
#         with get_db('ldist',mode='r') as db:
#             for q in qcols:
#                 if q[-1] in db:
#                     dbdone[q[-1]]=db[q[-1]]
#             ql=[q for q in qcols if not q[-1] in dbdone]
    
#     if len(ql):
#         objs = [
#             (dfdist[col1],dfdist[col2],k,incl_words)
#             for col1,col2,qstr in ql
#         ]
#         iterr = pmap_iter(do_ldist, objs, num_proc=num_proc, progress=progress)
#         for (col1,col2,qstr),odx in zip(ql,iterr):
#             odx['word1'],odx['corpus1'],odx['period1']=col1.split('_')
#             odx['word2'],odx['corpus2'],odx['period2']=col2.split('_')
#             dbqueue[qstr]=odx
        
#         with get_db('ldist',mode='c') as db:
#             for qstr,odx in dbqueue.items():
#                 db[qstr]=odx
#             db.commit()
        
#     odf=pd.DataFrame(list(dbdone.values()) + list(dbqueue.values()))
#     if len(odf): odf=odf.set_index(['word1','word2','corpus1','corpus2','period1','period2'])
#     return odf

def ldist_qstr(col1,col2,k,incl_words):
    """Build the 'ldist' cache key for a pair of distance columns.

    The key has the form ``'{col1}/{col2}/k={k}/incl_words={incl_words}'``.
    """
    segments = (f'{col1}', f'{col2}', f'k={k}', f'incl_words={incl_words}')
    return '/'.join(segments)

def ldist_word_qstr(word,ymin=YMIN_DISTMAT,ymax=YMAX_DISTMAT,ybin=YBIN_DISTMAT,k=K,
                    corpus=DEFAULT_CORPUS):
    """Build the word-level cache key ``'{word}/{corpus}/{ymin}-{ymax}_{ybin}'``.

    NOTE(review): `k` is accepted but is not part of the key, so results
    computed with different `k` share one key -- confirm this is intended.
    """
    year_span = f'{ymin}-{ymax}_{ybin}'
    return '/'.join((f'{word}', f'{corpus}', year_span))

def ldist(word_or_words, **ldist_iter_opts):
    """Collect everything `ldist_iter` yields into a single DataFrame.

    Returns an empty DataFrame when the iterator yields nothing.
    """
    frames = list(ldist_iter(word_or_words, **ldist_iter_opts))
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)

def ldist_iter(word_or_words, force=False, num_proc=1,
               ymin=YMIN_DISTMAT,ymax=YMAX_DISTMAT,ybin=YBIN_DISTMAT,k=K,
               corpus=DEFAULT_CORPUS,commit_byword=False,
               **ldist_word_opts):
    """Yield the `ldist_word` result for each word (generator).

    Parallelism is applied at exactly one level:

    * ``commit_byword=False``: words are processed one at a time, each
      `ldist_word` call uses `num_proc` workers and commits its own
      pairwise results.
    * ``commit_byword=True``: words are spread over `num_proc` workers,
      each `ldist_word` call runs single-process without committing, and
      every non-empty per-word frame is stored here in the 'ldist' database
      under its `ldist_word_qstr` key (committed every 10 words and at the
      end). Empty or None results are neither stored nor yielded.

    Args:
        word_or_words: normalized via `to_words`.
        force: forwarded to `ldist_word`.
        num_proc: worker count (see above).
        ymin, ymax, ybin, k: forwarded to `ldist_word` and used in the key.
        corpus: forwarded to `wdist` as `corpus_or_corpora` (unless the
            caller already supplies that keyword) and used in the key.
        commit_byword: see above.
        **ldist_word_opts: extra keyword arguments for `ldist_word`.
    """
    # Previously ymin/ymax/ybin/k/corpus only shaped the cache key and were
    # never handed to ldist_word, so non-default values were silently
    # ignored in the computation. Forward them explicitly.
    word_opts = dict(ldist_word_opts)
    word_opts.setdefault('corpus_or_corpora', corpus)
    objs=[
        dict(word=w, force=force, progress=False,
             num_proc=1 if commit_byword else num_proc,
             commit=not commit_byword, return_dict=False,
             ymin=ymin, ymax=ymax, ybin=ybin, k=k,
             # `qstr` travels through ldist_word's **wdist_opts into wdist's
             # **attrs, where it is ignored; it is read back below.
             qstr=ldist_word_qstr(
                w,ymin=ymin,ymax=ymax,ybin=ybin,k=k,corpus=corpus
             ), **word_opts)
        for w in to_words(word_or_words)
    ]

    iterr=pmap_iter(
        ldist_word_,
        objs,
        num_proc=num_proc if commit_byword else 1
    )

    if not commit_byword:
        yield from iterr
        return

    with get_db('ldist',mode='c') as db:
        for i,(obj,odf) in enumerate(zip(objs,iterr)):
            if odf is not None and len(odf):
                db[obj['qstr']]=odf
                yield odf
            # Periodic commit so a long run does not lose everything.
            if i and not i%10: db.commit()
        db.commit()
    
def ldist_word_(objd):
    """Single-argument adapter: call `ldist_word` with `objd` as keyword args."""
    return ldist_word(**objd)
def ldist_word(word,
               k=K,
               incl_words=False,
               force=False,
               progress=True,
               num_proc=1,
               ybin=YBIN_DISTMAT,
               ymin=YMIN_DISTMAT,
               ymax=YMAX_DISTMAT,
               return_dict=False,
               commit=True,
               **wdist_opts):
    """Compute `do_ldist` for every pair of `wdist` columns of `word`.

    Column pairs are taken once each (``col1 < col2``). Unless `force` is
    set or `commit` is off, pairs already present in the 'ldist' database
    are reused; the remaining pairs are computed via `pmap_iter(do_ldist)`
    and, when `commit` is true, written back to that database.

    Args:
        word: forwarded to `wdist`.
        k, incl_words: passed to `do_ldist` and included in the cache key.
        force: recompute every pair instead of consulting the cache.
        progress, num_proc: forwarded to `pmap_iter`.
        ybin, ymin, ymax, **wdist_opts: forwarded to `wdist`.
        return_dict: return the raw ``{key: result dict}`` mapping instead
            of a DataFrame.
        commit: store newly computed pairs; when false the cache is also
            not read.

    Returns:
        The key->result dict if `return_dict`, else a DataFrame indexed by
        (word1, word2, corpus1, corpus2, period1, period2) -- left without
        an index when there are no rows.
    """
    dfdist = wdist(word, ybin=ybin, ymin=ymin, ymax=ymax, **wdist_opts)
    columns = dfdist.columns
    pairs = [
        (left, right, ldist_qstr(left, right, k, incl_words))
        for left in columns for right in columns if left < right
    ]

    cached, computed = {}, {}
    pending = pairs
    if not force and commit:
        with get_db('ldist', mode='r') as db:
            for pair in pairs:
                key = pair[-1]
                if key in db:
                    cached[key] = db[key]
            pending = [pair for pair in pairs if pair[-1] not in cached]

    if len(pending):
        jobs = [
            (dfdist[left], dfdist[right], k, incl_words)
            for left, right, _key in pending
        ]
        results = pmap_iter(do_ldist, jobs, num_proc=num_proc, progress=progress)
        for (left, right, key), result in zip(pending, results):
            # Column names are '{word}_{corpus}_{period}'.
            result['word1'], result['corpus1'], result['period1'] = left.split('_')
            result['word2'], result['corpus2'], result['period2'] = right.split('_')
            computed[key] = result

        if commit:
            with get_db('ldist', mode='c') as db:
                for key, result in computed.items():
                    db[key] = result
                db.commit()

    merged = {**cached, **computed}
    if return_dict:
        return merged
    odf = pd.DataFrame(merged.values())
    if len(odf):
        odf = odf.set_index(['word1','word2','corpus1','corpus2','period1','period2'])
    return odf
    
    
    
    
    
def do_ldist(obj):
    """Compare two distance vectors over their joint top-`k` neighborhood.

    Args:
        obj: tuple ``(s1, s2, k, incl_words)`` where `s1`/`s2` are Series of
            distances indexed by word (packed in one tuple so the function
            can be mapped by `pmap_iter`).

    Returns:
        dict with keys `dist` (``1 - fastdist.cosine`` of the two vectors
        restricted to the union of both top-`k` neighbor sets, or NaN on
        ZeroDivisionError), `mneighb_size`, `neighb1_size`, `neighb2_size`,
        and `neighb1`/`neighb2` (comma-joined neighbor words when
        `incl_words`, else empty strings).
    """
    s1,s2,k,incl_words=obj
    s1,s2=s1.dropna(),s2.dropna()
    # Restrict both vectors to the words present in each. Use a list:
    # pandas >= 2.0 no longer accepts a set as a .loc indexer.
    shared_words=list(set(s1.index) & set(s2.index))
    s1=s1.loc[shared_words].sort_values()
    s2=s2.loc[shared_words].sort_values()

    # k closest words in each vector
    nb1=s1.iloc[:k].index
    nb2=s2.iloc[:k].index

    # "meta neighborhood": union of both neighbor sets, looked up in the
    # same order for both vectors so the values line up
    mnb=list(set(nb1)|set(nb2))
    nb1s=s1.loc[mnb]
    nb2s=s2.loc[mnb]

    try:
        distdists = 1-fastdist.cosine(nb1s.values.astype(float), nb2s.values.astype(float))
    except ZeroDivisionError:
        distdists=np.nan

    return {
        'dist':distdists,
        'mneighb_size':len(mnb),
        'neighb1_size':len(nb1),
        'neighb2_size':len(nb2),
        'neighb1':', '.join(nb1) if incl_words else '',
        'neighb2':', '.join(nb2) if incl_words else '',
    }

In [9]:
# wl=get_valid_words()
# for df in ldist_iter(wl, num_proc=4, commit_byword=False): pass

## Distmat

In [10]:
def distmat(word, **ldist_opts):
    """Build a symmetric period-by-period distance matrix for `word`.

    `ldist_word` yields each period pair once; the rows are mirrored
    (period1 <-> period2) so both triangles of the matrix are filled.

    Args:
        word: the word to analyze.
        **ldist_opts: forwarded to `ldist_word`.

    Returns:
        DataFrame indexed by `period1` with `period2` columns and `dist`
        values.
    """
    pairs=ldist_word(word, **ldist_opts).reset_index()
    mirrored=pairs.assign(period1=pairs.period2, period2=pairs.period1)
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    both=pd.concat([pairs,mirrored])
    # Keyword arguments: pivot() no longer accepts them positionally.
    return both.pivot(index='period1',columns='period2',values='dist')

In [11]:
# pmap_iter??

In [12]:
def gen_distmats_(objd):
    """Best-effort `distmat(**objd)`: any exception yields an empty DataFrame."""
    try:
        result = distmat(**objd)
    except Exception:
        # Deliberately swallowed so one failing word does not abort a batch.
        result = pd.DataFrame()
    return result
def gen_distmats(words=None,
                 ymin=YMIN_DISTMAT, ymax=YMAX_DISTMAT, ybin=YBIN_DISTMAT, k=K,
                 num_proc=1,force=False,shuffle=True,lim=None,
                 **ldist_opts):
    """Compute and store distance matrices for many words.

    Results are written to the 'distmat' database under the key
    ``'{word}/{ymin}-{ymax}_{ybin}/k={k}'``, committed every 10 words and
    once at the end. Nothing is returned.

    Args:
        words: words to process; `get_valid_words()` when empty/None.
        ymin, ymax, ybin, k, **ldist_opts: forwarded to `distmat`.
        num_proc: worker count for `pmap_iter`.
        force: recompute words whose key is already stored.
        shuffle: process the words in random order.
        lim: cap on the number of words processed (after filtering).

    NOTE(review): `gen_distmats_` returns an empty frame on failure and that
    frame is stored too, so failed words are skipped on later runs unless
    `force=True` -- confirm this is intended.
    """
    try:
        no_words = not words
    except ValueError:
        # numpy arrays / pandas Index objects refuse truth-testing.
        no_words = len(words) == 0
    if no_words: words=get_valid_words()
    # Copy into a fresh list so shuffling never reorders the caller's
    # sequence (and works for immutable inputs such as a pandas Index).
    words=list(to_words(words))
    if shuffle: random.shuffle(words)
    def to_qstr(w): return f'{w}/{ymin}-{ymax}_{ybin}/k={k}'
    with get_db('distmat',mode='c') as db:
        objs=[
            dict(word=word, ymin=ymin, ymax=ymax, ybin=ybin, k=k, **ldist_opts)
            for word in words
            if force or to_qstr(word) not in db
        ][:lim]
        iterr=pmap_iter(gen_distmats_,objs, num_proc=num_proc)
        for i,(obj,dfdist) in enumerate(zip(objs,iterr)):
            db[to_qstr(obj['word'])] = dfdist
            if i and not i%10: db.commit()
        db.commit()

In [13]:
# %%timeit
# gen_distmats(lim=10,num_proc=1)

In [14]:
# %%timeit
# gen_distmats(lim=10,num_proc=2)

In [15]:
# %%timeit
# gen_distmats(lim=None,num_proc=3)

In [16]:
# # %%timeit
# gen_distmats(lim=None,num_proc=4)

In [17]:
# stop

In [18]:
# %%timeit
# distmat(random.choice(get_valid_words()))

In [19]:
# stop

In [20]:
# %%timeit
# dbget('/distmat/vanishing/1720-1900_5/k=10')

In [21]:
# distmat('virtue',k=25)

In [22]:
def distmat_(objd):
    """Single-argument adapter: call `distmat` with `objd` as keyword args."""
    return distmat(**objd)

def distmats(words,num_proc=1,progress=True,**distmat_opts):
    """Average the distance matrices of several words.

    Each word's matrix is computed via `distmat` (through `pmap_iter`), then
    all matrices are stacked and averaged cell-wise by row label, so every
    word carries equal weight. (Averaging pairwise as results arrive would
    weight later words exponentially more: a/4 + b/4 + c/2 for three words.)

    Args:
        words: normalized via `to_words`.
        num_proc, progress: forwarded to `pmap_iter`.
        **distmat_opts: forwarded to `distmat`.

    Returns:
        The averaged DataFrame; the single matrix unchanged when only one
        word produced a result; None when there were no results.
    """
    objs=[dict(word=w, progress=False, **distmat_opts) for w in to_words(words)]
    iterr=pmap_iter(distmat_, objs, progress=progress, num_proc=num_proc)
    frames=[df for df in iterr if df is not None]
    if not frames:
        return None
    if len(frames)==1:
        return frames[0]
    stacked=pd.concat(frames)
    return stacked.groupby(stacked.index).mean()

In [23]:
def get_abs_words(vecname='Abs-Conc.Median',cutoff=2):
    """Return the words whose mean `vecname` score exceeds `cutoff`.

    Scores come from `get_vector_scores()` and are averaged per word.

    Returns:
        Index of words, ordered by descending mean score.
    """
    df=get_vector_scores()
    # Select the column before averaging: avoids computing means for every
    # other column, and avoids the TypeError pandas >= 2.0 raises when
    # non-numeric columns are averaged.
    s=df.groupby('word')[vecname].mean().sort_values(ascending=False)
    return s[s>cutoff].index

In [24]:
# dfdist = distmats(get_abs_words(), num_proc=2)

In [25]:
# dfdist

## Novelty

In [26]:
# w=random.choice(get_valid_words())
# test_novelty(distmat(w,num_proc=4))

In [45]:
def nov(word_or_words,
        num_proc=1, progress=True,
        ybin=YBIN_DISTMAT,ymin=YMIN_DISTMAT,ymax=YMAX_DISTMAT,k=K,
        force=False,cache_only=False,
        **distmat_opts):
    """Compute (or load cached) novelty results for one or more words.

    Results are cached in the 'nov' database under the key
    ``'{word}/{ymin}-{ymax}_by{ybin}/k={k}'``. Unless `force` is set, cached
    words are loaded and only the rest are computed via `nov_word_`; new
    non-empty results are stored (committed every 10 results and at the end).

    Args:
        word_or_words: normalized via `to_words`.
        num_proc, progress: forwarded to `pmap_iter`.
        ybin, ymin, ymax, k, **distmat_opts: forwarded to `nov_word`.
        force: ignore the cache and recompute everything.
        cache_only: only populate the cache; results are replaced by empty
            frames instead of being kept in memory.

    Returns:
        Concatenation of the per-word result frames (empty frames under
        `cache_only`); an empty DataFrame when there is nothing to return.
    """
    objs_todo=objs=[
        dict(
            word=w,
            qstr=f'{w}/{ymin}-{ymax}_by{ybin}/k={k}',
            progress=False,
            ybin=ybin,ymax=ymax,ymin=ymin,k=k,
            **distmat_opts
        ) for w in to_words(word_or_words)
    ]
    objs_done={}
    if not force:
        with get_db('nov',mode='r') as db:
            objs_done={
                x['qstr']: (pd.DataFrame() if cache_only else db.get(x['qstr']))
                for x in objs
                if x['qstr'] in db
            }
        objs_todo=[x for x in objs if x['qstr'] not in objs_done]
    if objs_todo:
        iterr=pmap_iter(
            nov_word_,
            objs_todo,
            num_proc=num_proc,
            progress=progress
        )
        with get_db('nov',mode='c') as db:
            for i,odf in enumerate(iterr):
                if odf is not None and len(odf):
                    # nov_word tags each frame with its cache key in a
                    # 'qstr' column; strip it before storing.
                    qstr=odf['qstr'].iloc[0]
                    # `columns=` keyword: positional axis was removed in
                    # pandas 2.0.
                    odf=odf.drop(columns='qstr')
                    db[qstr]=odf
                    objs_done[qstr]=odf if not cache_only else pd.DataFrame()
                if i and not i%10: db.commit()
            db.commit()
    frames=list(objs_done.values())
    # pd.concat raises on an empty list (e.g. every word failed).
    return pd.concat(frames) if frames else pd.DataFrame()
    
        
def nov_word(word,qstr=None,**distmat_opts):
    """Best-effort novelty table for a single word.

    Runs `test_novelty` on the word's `distmat`, keeps the rows whose
    `foote_novelty` is non-zero, and indexes them by (word, period). When
    `qstr` is truthy it is added as a `qstr` column. Any exception yields an
    empty DataFrame.
    """
    try:
        matrix = distmat(word, **distmat_opts)
        scored = test_novelty(matrix).assign(word=word)
        result = scored.query('foote_novelty!=0').set_index(['word','period'])
        if qstr:
            result = result.assign(qstr=qstr)
        return result
    except Exception:
        # Deliberately swallowed so one failing word does not abort a batch.
        return pd.DataFrame()
def nov_word_(obj):
    """Single-argument adapter: call `nov_word` with `obj` as keyword args."""
    return nov_word(**obj)

In [28]:
# stop

In [29]:
# %%timeit
# wl=random.sample(get_valid_words(),10)
# !rm ../db/db.kos2.nov.sqlite
# nov(wl, num_proc=2, cache_only=True)

In [30]:
# %%timeit
# wl=random.sample(get_valid_words(),10)
# !rm ../db/db.kos2.nov.sqlite
# nov(wl, num_proc=1, cache_only=True)

In [31]:
# %%timeit
# wl=random.sample(get_valid_words(),10)
# !rm ../db/db.kos2.nov.sqlite
# nov(wl, num_proc=4, cache_only=True)

### Run all words

In [32]:
def gen_novelty_data(words=None,num_proc=4):
    """Populate the novelty cache for `words` (all valid words by default).

    Calls `nov(..., cache_only=True)` and discards its result; returns None.
    """
    try:
        no_words = not words
    except ValueError:
        # numpy arrays / pandas Index objects refuse truth-testing.
        no_words = len(words) == 0
    if no_words: words=get_valid_words()
    nov(words, num_proc=num_proc, cache_only=True)

In [44]:
# %%timeit
# words=get_valid_words()
# w=random.choice(words)
# nov(w)

In [43]:
# %%timeit
# words=get_valid_words()
# w=random.choice(words)
# distmat(w)

In [42]:
# %%timeit
# words=get_valid_words()
# w=random.choice(words)
# ldist(w)