# Novelty

In [1]:
from ipynb.fs.full.koselleck import *

In [2]:
def make_foote(quart=FOOTE_W):
    """Build the square checkerboard kernel used for Foote novelty.

    The kernel has ``2 * quart`` rows and columns. Its top-left and
    bottom-right quadrants hold -1; its top-right and bottom-left
    quadrants hold +1.

    Args:
        quart: side length of one quadrant.

    Returns:
        The kernel as a numpy array.
    """
    upper_row = [-1] * quart + [1] * quart
    lower_row = [1] * quart + [-1] * quart
    # np.array copies the data, so repeating the same row object is safe.
    return np.array([upper_row] * quart + [lower_row] * quart)

def foote_novelty(distdf, foote_size=5):
    """Slide a Foote checkerboard kernel along the diagonal of a distance matrix.

    Args:
        distdf: square distance matrix, either a pandas DataFrame (its
            ``.values`` are used) or an object indexable like a 2-D array.
        foote_size: quadrant size handed to ``make_foote``.

    Returns:
        A list with one entry per row of the matrix. Positions whose window
        would start before index 0 or end past ``size - 1`` get 0; all others
        get the sum of the kernel multiplied element-wise with the window.
    """
    kernel = make_foote(foote_size)
    distmat = distdf.values if type(distdf) == pd.DataFrame else distdf

    n_rows, n_cols = distmat.shape
    assert n_rows == n_cols
    distsize = n_rows

    k_rows, k_cols = kernel.shape
    assert k_rows == k_cols
    halfwidth = k_rows / 2

    def score_at(center):
        # Window of kernel width centred on `center`.
        start = int(center - halfwidth)
        end = int(center + halfwidth)
        if start < 0 or end > (distsize - 1):
            return 0
        return np.sum(kernel * distmat[start:end, start:end])

    return [score_at(center) for center in range(distsize)]

def getyears():
    """Return the column labels of the module-level ``d`` as a list."""
    # NOTE(review): `d` is not defined in the visible cells of this notebook;
    # presumably it is expected to exist as a global -- confirm before use.
    return list(d.columns)


def diagonal_permute(d):
    """Return a randomized copy of a distance matrix, resampled along its diagonals.

    Each value placed on the k-th diagonal of the result is drawn from the
    k-th diagonal of ``d`` (repeats are possible), and the result is filled
    symmetrically. A single random permutation of the row indices
    (``translate``) is shared by the whole matrix and used to pick which
    value of the diagonal lands in each cell, so cells that share a row or
    column tend to draw from related positions.

    Args:
        d: 2-D distance matrix (numpy array or anything ``np.asarray``
            accepts, e.g. a DataFrame). Assumed symmetric: only the values
            at ``(x, x + k)`` are read.

    Returns:
        A float numpy array with the same shape as ``d``.
    """
    d = np.asarray(d)
    newmat = np.zeros(d.shape)
    xlen, ylen = d.shape

    # One random permutation of the row indices, reused for every diagonal.
    translate = list(range(xlen))
    random.shuffle(translate)

    for offset in range(xlen):
        # Cell coordinates of the diagonal `offset` steps above the main
        # diagonal, and of its mirror image below it.
        upper = [(x, x + offset) for x in range(xlen) if x + offset < ylen]
        lower = [(x, x - offset) for x in range(xlen) if 0 <= x - offset < ylen]
        values = [d[x, y] for x, y in upper]

        lenvals = len(values)
        vallist = list(range(lenvals))

        newvalues = []
        for x, y in upper:
            xposition = translate[x]
            yposition = translate[y]

            # The permuted x / y are used as indices into this diagonal's
            # values, which only works when they fall inside it. They are
            # never negative (translate is a permutation of range(xlen)),
            # so only the upper bound needs checking.
            x_fits = xposition < lenvals
            y_fits = yposition < lenvals
            if x_fits and y_fits:
                # Either could be used: pick one at random.
                position = random.choice([xposition, yposition])
            elif x_fits:
                position = xposition
            elif y_fits:
                position = yposition
            else:
                # Neither fits: fall back to a uniformly random index.
                position = random.choice(vallist)

            newvalues.append(values[position])

        # Lay down both copies of the diagonal in the new matrix.
        for (x1, y1), (x2, y2), value in zip(upper, lower, newvalues):
            newmat[x1, y1] = value
            newmat[x2, y2] = value

    return newmat

def zeroless(sequence):
    """Return, in order, the elements of ``sequence`` that are greater than 0.01."""
    return [element for element in sequence if element > 0.01]

def permute_test(distmatrix, foote_size=FOOTE_W, num_runs=100):
    """Score a distance matrix and rank each score against permuted matrices.

    For each of ``num_runs`` runs the matrix is randomized with
    ``diagonal_permute``, re-scored with ``foote_novelty``, filtered through
    ``zeroless``, and its maximum and minimum are recorded.

    Returns:
        ``(actual_novelties, significance_peak, significance_trough)`` where
        the two significance arrays have one value per novelty score:

        * peak: index (in the descending list of permuted maxima) of the
          first maximum strictly below the score, divided by ``num_runs``;
          1 when no maximum is below it.
        * trough: index (in the descending list of permuted minima) of the
          last minimum strictly above the score, divided by ``num_runs``;
          1 when no minimum is above it.

    Raises:
        ValueError: from ``np.max`` / ``np.min`` when a permuted run has no
            score above the ``zeroless`` threshold.
    """
    actual_novelties = foote_novelty(distmatrix, foote_size)

    permuted_peaks, permuted_troughs = [], []
    for _ in range(num_runs):
        shuffled = diagonal_permute(distmatrix)
        shuffled_scores = zeroless(foote_novelty(shuffled, foote_size))
        permuted_peaks.append(np.max(shuffled_scores))
        permuted_troughs.append(np.min(shuffled_scores))
    permuted_peaks = sorted(permuted_peaks, reverse=True)
    permuted_troughs = sorted(permuted_troughs, reverse=True)

    n_scores = len(actual_novelties)
    significance_peak = np.ones(n_scores)
    significance_trough = np.ones(n_scores)
    for idx, novelty in enumerate(actual_novelties):
        ranks_below = [
            rank for rank, peak in enumerate(permuted_peaks)
            if peak and peak < novelty
        ]
        ranks_above = [
            rank for rank, trough in enumerate(permuted_troughs)
            if trough and trough > novelty
        ]
        significance_peak[idx] = ranks_below[0] / num_runs if ranks_below else 1
        significance_trough[idx] = ranks_above[-1] / num_runs if ranks_above else 1

    return actual_novelties, significance_peak, significance_trough

def colored_segments(novelties, significance, yrwidth=1,min_year=1700):
    """Build a colour-mapped LineCollection from novelty scores above 1.

    Only positions whose novelty is greater than 1 are kept. The x value of a
    kept position is ``index * yrwidth + min_year``; its y value is the
    novelty; the matching significance value colours the segment.

    Returns:
        ``(lc, x, y)``: the LineCollection (``jet`` colormap) and the x / y
        arrays of the kept points.
    """
    kept = [
        ((position * yrwidth) + min_year, nov, sig)
        for position, (nov, sig) in enumerate(zip(novelties, significance))
        if nov > 1
    ]
    x = np.array([point[0] for point in kept])
    y = np.array([point[1] for point in kept])
    t = np.array([point[2] for point in kept])

    # Consecutive kept points are joined pairwise into line segments.
    points = np.array([x, y]).transpose().reshape(-1, 1, 2)
    segs = np.concatenate([points[:-1], points[1:]], axis=1)
    lc = LineCollection(segs, cmap=plt.get_cmap('jet'))
    lc.set_array(t)

    return lc, x, y
    
    
def test_novelty(distdf, foote_sizes=None, num_runs=100):
    """Run ``permute_test`` on a distance DataFrame for several Foote sizes.

    Args:
        distdf: distance matrix as a DataFrame; NaNs are replaced by 0 and
            its columns supply the ``period`` labels.
        foote_sizes: sizes to try; defaults to
            ``range(FOOTE_W-3, FOOTE_W+2)`` when empty or None.
        num_runs: number of permutation runs per size.

    Returns:
        A DataFrame with one row per (period, foote_size) and the columns
        ``period``, ``foote_novelty``, ``foote_size``, ``p_peak``,
        ``p_trough``. Sizes for which ``permute_test`` raises ValueError are
        skipped silently.
    """
    if not foote_sizes:
        foote_sizes = range(FOOTE_W-3, FOOTE_W+2)
    dq = distdf.fillna(0).values

    rows = []
    for fs in foote_sizes:
        try:
            novelties, sig_peak, sig_trough = permute_test(
                dq, foote_size=fs, num_runs=num_runs
            )
        except ValueError:
            continue
        rows.extend(
            {
                'period': year,
                'foote_novelty': nov,
                'foote_size': fs,
                'p_peak': sigp,
                'p_trough': sigt,
            }
            for year, nov, sigp, sigt in zip(
                distdf.columns, novelties, sig_peak, sig_trough
            )
        )
    return pd.DataFrame(rows)


In [3]:
def nov(word_or_words,
        num_proc=1, progress=True,
        ybin=YBIN_DISTMAT,ymin=YMIN_DISTMAT,ymax=YMAX_DISTMAT,k=K,
        force=False,cache_only=False,
        **distmat_opts):
    """Compute (or load from the 'nov' db) novelty scores for one or more words.

    One job is built per word, keyed by the query string
    ``'{word}/{ymin}-{ymax}_by{ybin}/k={k}'``. Unless ``force`` is set, jobs
    whose key is already in the db are read from it; the remaining jobs are
    run through ``pmap_iter(nov_word_, ...)`` and stored under their key.

    Args:
        word_or_words: passed through ``to_words`` to get the word list.
        num_proc: forwarded to ``pmap_iter``.
        progress: show progress (only when more than one job is pending).
        ybin, ymin, ymax, k: included in the cache key and in each job.
        force: skip the cache lookup and recompute everything.
        cache_only: compute and store results but keep only empty
            placeholder frames in memory.
        **distmat_opts: added to every job dict.

    Returns:
        The per-word frames concatenated into one DataFrame, or an empty
        DataFrame when there are no results at all.
    """
    objs_todo=objs=[
        dict(
            word=w,
            qstr=f'{w}/{ymin}-{ymax}_by{ybin}/k={k}',
            progress=False,
            ybin=ybin,ymax=ymax,ymin=ymin,k=k,
            **distmat_opts
        ) for w in to_words(word_or_words)
    ]
    objs_done={}
    if not force:
        with get_db('nov',mode='r') as db:
            objs_done=dict(
                (
                    x['qstr'],
                    db.get(x['qstr']) if not cache_only else pd.DataFrame(),
                )
                for x in objs
                if x['qstr'] in db
            )
            objs_todo=[x for x in objs if x['qstr'] not in objs_done]
    if len(objs_todo):
        iterr=pmap_iter(
            nov_word_,
            objs_todo,
            num_proc=num_proc,
            progress=progress if len(objs_todo)>1 else False
        )
        with get_db('nov',mode='c') as db:
            for i,odf in enumerate(iterr):
                if odf is not None and len(odf):
                    qstr=odf.iloc[0].qstr
                    # Keyword form: the positional axis argument of
                    # DataFrame.drop is not accepted by pandas >= 2.
                    odf=odf.drop(columns='qstr')
                    db[qstr]=odf
                    objs_done[qstr]=odf if not cache_only else pd.DataFrame()
                # Commit periodically so a long run does not lose everything.
                if i and not i%10: db.commit()
            db.commit()
    # pd.concat raises on an empty list; return an empty frame instead.
    if not objs_done: return pd.DataFrame()
    return pd.concat(list(objs_done.values()))
    
        
def nov_word(word,qstr=None,**distmat_opts):
    """Return the non-zero novelty scores of ``word`` indexed by (word, period).

    The distance matrix comes from ``distmat(word, **distmat_opts)`` and is
    scored with ``test_novelty``. When ``qstr`` is given it is added as a
    column. Any exception along the way is swallowed and an empty DataFrame
    is returned instead (best-effort behaviour).
    """
    try:
        scores = test_novelty(distmat(word, **distmat_opts))
        odf = scores.assign(word=word)
        odf = odf.query('foote_novelty!=0')
        odf = odf.set_index(['word','period'])
        if qstr:
            odf = odf.assign(qstr=qstr)
        return odf
    except Exception:
        return pd.DataFrame()
def nov_word_(obj):
    """Single-argument wrapper: call ``nov_word`` with the dict ``obj`` as keyword arguments."""
    return nov_word(**obj)

### Run all words

In [4]:
def gen_novelty_data(words=None,num_proc=4):
    """Compute and cache novelty data for ``words`` (default: ``get_valid_words()``).

    Runs ``nov`` with ``cache_only=True``; nothing is returned.
    """
    targets = words if words else get_valid_words()
    nov(targets, num_proc=num_proc, cache_only=True)

## Novelty scores

In [5]:
# NOV_DATA_SUMM=None
# def get_nov_data_summarised(foote_size=5,force=True,key_all='_summary_'):
#     global NOV_DATA_SUMM
#     if not force and NOV_DATA_SUMM is not None: return NOV_DATA_SUMM

#     with get_db('nov','r') as db:
#         if key_all in db: return db[key_all]
        
#         l=[]
#         for qstr,qwdf in tqdm(db.items(), total=len(db)):
#             for fs,wdf in qwdf.groupby('foote_size'):
#                 wdf=wdf.reset_index().sort_values('period')#.query(f'foote_size=={foote_size}')

# #                 display(wdf)
#                 wdf['period_int']=[int(p[:4]) for p in wdf['period']]
#                 wdf['is_signif']=wdf['p_peak']<0.05
#                 wdf_signif=wdf[wdf.is_signif==True]
#                 changepoint=wdf_signif.iloc[0].period_int if len(wdf_signif) else np.nan
#                 changepoint_avg=wdf_signif.period_int.median() if len(wdf_signif) else np.nan

#                 num_signif=len(wdf_signif)

#                 word,prdstr,atrstr=qstr.split('/')
#                 yminymax,ybin=prdstr.split('_')
#                 ybin=ybin.replace('by','')
#                 kstr=atrstr.split('k=')[-1]
#                 ymin,ymax=yminymax.split('-')
#                 dx1=dict(
#                     word=word,
#                     changepoint_first=changepoint,
#                     changepoint_avg=changepoint_avg,

#                     num_signif_periods=num_signif,
#                     num_periods=len(wdf),

#                     ymin=ymin,
#                     ymax=ymax,
#                     ybin=ybin,
#                     k=kstr
#                 )
#                 dx={
#                     **dict((k+'_signif',v) for k,v in wdf_signif.mean().items()),
#                     **dict(wdf.mean()),
#                     **dx1,
#                 }
#                 l+=[dx]
#         odf=pd.DataFrame(l).sort_values('foote_novelty',ascending=False)
#         odf=odf[~odf.foote_novelty.isna()]
#         odf=odf.set_index('word').drop(['word_signif','period_signif'],1)
#         odf=odf[[c for c in sorted(odf.columns)]]
#         NOV_DATA_SUMM=odf
#         with get_db('nov','w') as db: db[key_all]=odf
#     return odf

In [6]:
#get_all_novelty_scores??

In [7]:
# dfnovdata = get_nov_data_summarised(foote_size=6)
# dfnovdata

In [8]:
# dfnovdata.loc[set(dfnovdata.index) & set(get_keywords())]#.num_periods.value_counts()

In [9]:
# dfnovdata.loc[['culture','station','train']]

In [10]:
# qdf=dfnovdata[dfnovdata.num_periods==dfnovdata.num_periods.max()]
# qdf=qdf.query('num_signif_periods>2 & changepoint_avg>=1760')
# qdf=qdf.loc[[w for w in qdf.index if len(w)>4]]
# qdf.sort_values('foote_novelty_signif',ascending=False).head(25)#.sample(n=10)

In [11]:
# dfnovdata.loc[['culture','labour','liberty','station']]#.num_periods.value_counts()

In [12]:
# plot_distmat(distmat('merchant'))

In [13]:
# for i,g in nbr('embroidered').groupby('period'):
#     print(i)
#     display(g.head(10))

## Scaling up

In [14]:
# DFALLNOV={}

In [34]:
def get_novelty(words,ybin=YBIN_DISTMAT,
                min_foote_size=FOOTE_W,max_foote_size=FOOTE_W,**nov_attrs):
    """Return cached or freshly computed novelty scores for ``words``.

    The scores from ``nov`` are limited to
    ``min_foote_size <= foote_size <= max_foote_size``, z-scored within each
    Foote size (``foote_novelty_z``), and given three extra columns:
    ``period_str`` (the original period label), ``period`` (the integer made
    from the label's first four characters) and ``is_signif``
    (``p_peak <= 0.05``). The result is stored in the 'nov' db.

    Args:
        words: passed through ``to_words``.
        ybin: forwarded to ``nov`` and part of the cache key.
        min_foote_size, max_foote_size: inclusive Foote-size range.
        **nov_attrs: forwarded to ``nov``.

    Returns:
        The scores as a DataFrame with a fresh integer index.
    """
#     global DFALLNOV
    wordkey=str(tuple([w for w in sorted(to_words(words))] + [ybin, min_foote_size, max_foote_size]))
    # Extra options such as ymin/ymax/k change the scores, so they must be
    # part of the cache key; num_proc only affects how they are computed.
    # With no extra options the key is the same as before.
    key_attrs={name:val for name,val in nov_attrs.items() if name!='num_proc'}
    if key_attrs: wordkey+=str(sorted(key_attrs.items()))

    with get_db('nov','r') as db:
        if wordkey in db: return db[wordkey]
    odf=nov(words,ybin=ybin,force=False,progress=True,**nov_attrs).query(f'{min_foote_size}<=foote_size<={max_foote_size}')
    # Standardise the novelty separately for each Foote size.
    odf=pd.concat([
        grp.assign(foote_novelty_z=(grp.foote_novelty - grp.foote_novelty.mean()) / grp.foote_novelty.std())
        for i,grp in odf.groupby('foote_size')
    ])
    odf=odf.reset_index()
    odf['period_str']=odf['period']
    odf['period']=odf['period'].apply(lambda x: int(x[:4]))
    odf['is_signif']=odf['p_peak']<=0.05
    #DFALLNOV[wordkey]=odf
    # NOTE(review): nov() writes with mode='c' while this uses 'w'; confirm
    # that get_db's 'w' mode keeps the entries already stored.
    with get_db('nov','w') as db: db[wordkey]=odf
    return odf

def get_all_novelty_scores(words=None,**attrs):
    """Return novelty scores sorted by ``foote_novelty`` with a ``period_int`` column.

    Uses ``get_valid_words()`` when ``words`` is empty or None; ``attrs`` are
    forwarded to ``get_novelty``.
    """
    target_words = words if words else get_valid_words()
    dfallnov = get_novelty(target_words, **attrs)
    dfallnov = dfallnov.sort_values('foote_novelty')
    dfallnov['period_int'] = dfallnov['period'].apply(int)
    return dfallnov

In [33]:
# Score the words from get_valid_words() at Foote sizes 4 through 6 and display the frame.
dfallnov=get_all_novelty_scores(min_foote_size=4,max_foote_size=6)
dfallnov

Mapping nov_word_() [x1]: 100%|██████████| 178/178 [00:07<00:00, 24.60it/s]


Unnamed: 0,word,period,foote_novelty,foote_size,p_peak,p_trough,foote_novelty_z,period_str,is_signif,period_int
337255,shopkeeper,1810,-0.956722,4,1.0,0.99,-3.428777,1810-1815,False,1810
992107,caverns,1795,-0.454544,6,1.0,0.99,-1.958270,1795-1800,False,1795
108899,flowing,1840,-0.454065,4,1.0,0.99,-2.434111,1840-1845,False,1840
1022598,shopkeeper,1810,-0.437863,6,1.0,0.99,-1.943349,1810-1815,False,1810
695250,shopkeeper,1810,-0.407635,5,1.0,0.99,-2.081623,1810-1815,False,1810
...,...,...,...,...,...,...,...,...,...,...
883781,foil,1815,19.230101,6,0.0,1.00,15.650262,1815-1820,True,1815
799233,crystal,1850,19.635120,6,0.0,1.00,16.012565,1850-1855,True,1850
960902,fins,1815,20.522430,6,0.0,1.00,16.806291,1815-1820,True,1815
836294,ragged,1845,21.243869,6,0.0,1.00,17.451641,1845-1850,True,1845


In [17]:
# dfallnov.groupby('word').size().sort_values()

In [18]:
# dfallnov[dfallnov.word=='special']

## Significant words

In [19]:
# Keep only rows with p_peak <= .01 (default Foote sizes), then show the rows for 'culture'.
odf=get_all_novelty_scores().query(f'p_peak<=.01')
odf[odf.word=='culture']

Unnamed: 0,word,period,foote_novelty,foote_size,p_peak,p_trough,foote_novelty_z,period_str,is_signif,period_int
60504,culture,1825,3.290524,5,0.01,1.0,2.757745,1825-1830,True,1825
60503,culture,1820,3.542621,5,0.0,1.0,3.087636,1820-1825,True,1820


In [20]:
# get_signif_novelty_scores()

In [40]:
def get_signif_novelty_scores(p_peak=0.01,min_peaks=2,force=False,consecutive_peaks=True, ybin=YBIN_DISTMAT,
                              min_foote_size=FOOTE_W, max_foote_size=FOOTE_W, **atr):
    """Return the novelty rows that count as significant peaks.

    Rows with ``p_peak`` above the threshold are dropped, and each remaining
    row gets ``word_num_peaks``, the number of remaining rows for its
    (word, foote_size). Groups with fewer than ``min_peaks`` rows are then
    removed and, when ``consecutive_peaks`` is set and ``min_peaks > 1``,
    only groups whose periods are all exactly ``ybin`` apart are kept. The
    result is sorted by ``foote_novelty_z`` (descending) and cached in the
    'nov' db.

    Args:
        p_peak: inclusive upper bound on ``p_peak``.
        min_peaks: minimum number of significant rows per (word, foote_size).
        force: recompute even if a cached result exists.
        consecutive_peaks: require the periods of a group to be consecutive.
        ybin: bin width in years.
        min_foote_size, max_foote_size: forwarded to ``get_all_novelty_scores``.
        **atr: forwarded to ``get_all_novelty_scores`` (e.g. ``words``).

    Returns:
        The filtered DataFrame (possibly empty).
    """
    key=str((p_peak,min_peaks,consecutive_peaks,ybin,min_foote_size,max_foote_size))
    if atr:
        # Extra options (notably `words`) change the result, so they must be
        # part of the cache key. Without extras the key is unchanged.
        extra=dict(atr)
        if extra.get('words'): extra['words']=sorted(to_words(extra['words']))
        key+=str(sorted(extra.items()))
    if not force:
        with get_db('nov','r') as db:
            if key in db: return db[key]

    # ybin is forwarded so the scores use the same bin width that the
    # consecutiveness check below assumes.
    scores=get_all_novelty_scores(
        ybin=ybin, min_foote_size=min_foote_size, max_foote_size=max_foote_size, **atr
    ).query(f'p_peak<={p_peak}')
    groups=[
        grp.assign(word_num_peaks=len(grp))
        for i,grp in tqdm(scores.groupby(['word','foote_size']))
    ]
    # pd.concat raises on an empty list; keep an empty frame with the column.
    odf=pd.concat(groups) if groups else scores.assign(word_num_peaks=0)
    if min_peaks: odf=odf[odf.word_num_peaks>=min_peaks]
    if consecutive_peaks and min_peaks and min_peaks>1:
        l=[]
        for (wx,fs),g in odf.groupby(['word','foote_size']):
            g=g.sort_values('period_int')
            periods=[int(p) for p in g.period]
            # Keep the group only if every step between periods is one bin.
            if all(later-earlier==ybin for earlier,later in zip(periods,periods[1:])):
                l+=[g]
        odf=pd.concat(l) if len(l) else pd.DataFrame()
    if len(odf):
        odf=odf.sort_values(
            'foote_novelty_z',
            ascending=False
        )
    # NOTE(review): nov() writes with mode='c' while this uses 'w'; confirm
    # that get_db's 'w' mode keeps the entries already stored.
    with get_db('nov','w') as db: db[key]=odf
    return odf

def get_signif_novelty_words(**opts):
    """Return the set of words that appear in ``get_signif_novelty_scores(**opts)``.

    Returns an empty set when there are no significant rows.
    """
    dfsign=get_signif_novelty_scores(**opts)
    # An empty result may have no 'word' column at all.
    if not len(dfsign): return set()
    return set(dfsign.word)

def get_signif_novelty_scores_summary(**opts):
    """Summarise the significant novelty rows as one row per word.

    Each word gets ``changepoint`` (its earliest significant ``period_int``)
    and ``changepoint_avg`` (the median of its significant ``period_int``
    values). Every numeric column is then averaged per word and the words
    are ranked by mean ``foote_novelty`` (``nov_rank``, 1 = highest).

    Args:
        **opts: forwarded to ``get_signif_novelty_scores``.

    Returns:
        A DataFrame indexed by word and sorted by ``nov_rank``; an empty
        DataFrame when there are no significant rows.
    """
    df=get_signif_novelty_scores(**opts)
    if not len(df): return pd.DataFrame()
    periods_by_word=df.groupby('word').period_int
    df=df.assign(
        changepoint=periods_by_word.transform('min'),
        changepoint_avg=periods_by_word.transform('median'),
    )
    # numeric_only=True: string columns such as period_str cannot be
    # averaged, and pandas >= 2 raises instead of dropping them silently.
    df=df.groupby('word').mean(numeric_only=True)
    df['nov_rank']=df.foote_novelty.rank(ascending=False)
    df=df.sort_values('nov_rank')
    return df

In [39]:
# Significant peaks (default p_peak and min_peaks) across Foote sizes 4 through 6.
get_signif_novelty_scores(min_foote_size=4,max_foote_size=6)

100%|██████████| 11103/11103 [00:03<00:00, 3002.30it/s]


Unnamed: 0,word,period,foote_novelty,foote_size,p_peak,p_trough,foote_novelty_z,period_str,is_signif,period_int,word_num_peaks
491047,ragged,1845,15.009504,5,0.00,1.0,18.093063,1845-1850,True,1845,3
6702,sound,1800,9.766796,4,0.00,1.0,17.791078,1800-1805,True,1800,3
742101,cloth,1830,21.480622,6,0.00,1.0,17.663424,1830-1835,True,1830,4
627159,fins,1815,14.552565,5,0.00,1.0,17.495118,1815-1820,True,1815,3
836294,ragged,1845,21.243869,6,0.00,1.0,17.451641,1845-1850,True,1845,3
...,...,...,...,...,...,...,...,...,...,...,...
739503,doubt,1790,0.878696,6,0.01,1.0,-0.765645,1790-1795,True,1790,2
411125,serve,1800,0.592346,5,0.01,1.0,-0.773059,1800-1805,True,1800,2
411126,serve,1805,0.590404,5,0.01,1.0,-0.775601,1805-1810,True,1805,2
386021,doubt,1795,0.577880,5,0.01,1.0,-0.791989,1795-1800,True,1795,2


In [42]:
# Per-word summary of the significant peaks across Foote sizes 4 through 6.
odfsum=get_signif_novelty_scores_summary(min_foote_size=4,max_foote_size=6)
odfsum

Unnamed: 0_level_0,period,foote_novelty,foote_size,p_peak,p_trough,foote_novelty_z,is_signif,period_int,word_num_peaks,changepoint,changepoint_avg,nov_rank
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
ragged,1845.625000,13.318148,5.125000,0.001250,1.0,14.560365,True,1845.625000,2.750000,1840.0,1845.0,1.0
satin,1792.500000,12.993374,6.000000,0.005000,1.0,10.071315,True,1792.500000,2.000000,1790.0,1792.5,2.0
fins,1813.333333,12.422193,5.222222,0.002222,1.0,13.053489,True,1813.333333,3.222222,1805.0,1815.0,3.0
foil,1813.333333,12.299752,5.222222,0.000000,1.0,12.852171,True,1813.333333,3.222222,1805.0,1815.0,4.0
cloth,1831.000000,12.159483,5.100000,0.000000,1.0,12.991547,True,1831.000000,3.400000,1825.0,1830.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...
late,1867.500000,0.831305,4.500000,0.005000,1.0,-0.247286,True,1867.500000,2.000000,1865.0,1867.5,2664.0
hit,1752.500000,0.773004,4.000000,0.010000,1.0,-0.005970,True,1752.500000,2.000000,1750.0,1752.5,2665.0
extraordinary,1747.500000,0.763916,4.000000,0.010000,1.0,-0.023954,True,1747.500000,2.000000,1745.0,1747.5,2666.0
doubt,1792.500000,0.740749,5.500000,0.007500,1.0,-0.770226,True,1792.500000,2.000000,1790.0,1792.5,2667.0


In [23]:
# Number of rows in the summary (it is grouped by word, so one row per word).
len(odfsum)

1607

In [24]:
# NOTE(review): `stop` is not defined, so this cell raises NameError --
# presumably a deliberate barrier that halts "Run All" before the cells below.
stop

NameError: name 'stop' is not defined

In [None]:
# plot_nbrs is not defined in this notebook; presumably it comes from the koselleck star-import.
plot_nbrs('anglican')

In [None]:
# dfsign=get_signif_novelty_scores(force=True,min_peaks=2,consecutive_peaks=True,p_peak=.01)
# dfsign

In [None]:
# Recompute the significant peaks with default settings, bypassing the cached result.
dfsign=get_signif_novelty_scores(force=True)
dfsign#[dfsign.word=='culture']

In [None]:
# Set of words that have significant peaks under the default settings.
signw=get_signif_novelty_words()
# 'culture' in set(signw)
# signw

In [None]:
# plot_novelty_words is defined in a later cell of this notebook; run that cell first.
plot_novelty_words('ragged')

In [None]:
# dfsignw=dfsign.groupby('word').mean()
# dfsignw

In [None]:
# dfchangepoints=get_signif_novelty_scores(p_peak=.05, min_peaks=1).drop_duplicates('word',keep='first').set_index('word')
# dfchangepoints=dfchangepoints.join(dfsignw,rsuffix='_avg_sign')
# dfchangepoints=dfchangepoints.join(get_all_novelty_scores().groupby('word').mean(),rsuffix='_avg')
# prefcols=['period','word_num_peaks','foote_novelty_z_avg_sign']
# dfchangepoints = dfchangepoints[prefcols + sorted([c for c in dfchangepoints.columns if c not in set(prefcols)])]
# dfchangepoints = dfchangepoints.sort_values('period')
# dfchangepoints

In [None]:
# topwords=dfchangepoints.sort_values(
#     'foote_novelty_z_avg_sign',ascending=False
# ).query('word_num_peaks>1')
# round(topwords,2)

In [None]:
sign_words = get_signif_novelty_words(p_peak=0.01,min_peaks=2)
# random.sample needs a sequence (sets are rejected on Python >= 3.11), so the
# set is sorted first; the sample size is capped so small results do not raise.
len(sign_words), random.sample(sorted(sign_words), min(10, len(sign_words)))

In [None]:
# sign_words = get_signif_novelty_words(p_peak=0.01)
# len(sign_words), random.sample(sign_words,10)

## Plotting

### Plotting all significant words' novelties

In [None]:
def plot_novelty_by_foote_size(p_peak=0.01,min_peaks=1,rolling=2, ymin=-1, nudge_x=1, labsize=6,words=None):
    """Plot, per period and Foote size, how many words have a significant peak.

    Args:
        p_peak: rows with ``p_peak`` strictly below this count as peaks; also
            forwarded to ``get_signif_novelty_words`` when ``words`` is empty.
        min_peaks: forwarded to ``get_signif_novelty_words``.
        rolling: window of the rolling mean applied to the ``avg_nov`` and
            ``avg_nov_signif`` columns (these are computed but not plotted).
        ymin: y position of the period labels.
        nudge_x: horizontal offset of the period labels.
        labsize: text size of the period labels.
        words: words to include; defaults to the significant words.

    Returns:
        The plotnine figure.
    """
    # NOTE(review): by_foote_size is forwarded through **attrs; no function
    # visible in this notebook reads it -- confirm it is still needed.
    df=get_all_novelty_scores(by_foote_size=True, min_foote_size=4, max_foote_size=6)
    if not words: words=get_signif_novelty_words(p_peak=p_peak,min_peaks=min_peaks)
#     words={w for w in words if not 's' in w and not 'f' in w}
    print('# words used:',len(words))
    if words: df=df[df.word.isin(words)]
    rows=[]
    for (fs,period),grp in df.groupby(['foote_size','period']):
        # Select the significant rows once per group.
        signif=grp.query(f'p_peak<{p_peak}')
        rows.append({
            'foote_size':fs,
            'period':period,
            'num_peaks':len(signif),
            'avg_nov_signif':signif.foote_novelty_z.mean(),
            'avg_nov':grp.foote_novelty_z.mean(),
        })
    figdf=pd.DataFrame(rows)
    for ycol in ['avg_nov','avg_nov_signif']:
        figdf[ycol]=figdf[ycol].rolling(rolling,min_periods=1).mean()
    
    fig=start_fig(
        figdf,
        x='period',
        y='num_peaks',
#         size='num_peaks',
        color='factor(foote_size)',
#         linetype='factor(foote_size)',
    )
    fig+=p9.geom_line()
    fig+=p9.geom_point(p9.aes(shape='factor(foote_size)'))
    
    fig+=p9.scale_color_gray(start=.8, end=.2)
    fig+=p9.geom_vline(xintercept=1770,linetype='dotted',alpha=0.5) 
    fig+=p9.geom_vline(xintercept=1800,linetype='dotted',alpha=0.5) 
    fig+=p9.geom_vline(xintercept=1830,linetype='dotted',alpha=0.5) 
    fig+=p9.geom_label(label='Sattelzeit begins (1770)',x=1770+nudge_x,y=ymin,angle=90,size=labsize,color='black',va='bottom',boxcolor=(0,0,0,0))
    fig+=p9.geom_label(label='Sattelzeit ends (1830)',x=1830+nudge_x,y=ymin,angle=90,size=labsize,color='black',va='bottom',boxcolor=(0,0,0,0)) 
    return fig

In [None]:
# plot_novelty_by_foote_size(rolling=1, p_peak=.01, min_peaks=1)#, words={'culture'})

In [None]:
# plot_novelty_by_foote_size(rolling=1, words={'potato'})

In [None]:
# dfchangepoints=get_signif_novelty_scores(p_peak=.05, min_peaks=1).drop_duplicates('word',keep='first').sort_values('period')
# dfchangepoints

In [None]:
# odfstr=pd.DataFrame([
#     {'period':period, 'words':', '.join(grp.sort_values('foote_novelty_z',ascending=False).word)}
#     for period,grp in sorted(dfchangepoints.groupby('period'))
# ])
# printm(odfstr.to_markdown())

## Plotting individual words

In [None]:
def get_plot_novelty_figdf(novdf, ybin=5):
    """Prepare a novelty frame for plotting.

    Adds these columns to a shuffled copy of ``novdf``:

    * ``year_window``: categorical label ``'<foote_size * ybin * 2> years'``,
      with categories in reverse string-sorted order;
    * ``glen``: constant 1;
    * ``is_signif``: categorical True/False for ``p_peak < 0.05``;
    * ``foote_novelty_z``: ``foote_novelty`` standardised within each
      ``foote_size``.

    Args:
        novdf: DataFrame with at least ``foote_size``, ``p_peak``,
            ``foote_novelty`` and ``period`` columns.
        ybin: number of years per period bin used for the window label.

    Returns:
        The frame with rows containing NaN dropped, sorted by
        ``year_window`` and then ``period``.
    """
    figdf=novdf.sample(frac=1)
    ywl=[
        f'{x} years'
        for x in figdf['foote_size']*ybin*2
    ]
    ywll=sorted(set(ywl), reverse=True)
    figdf['year_window']=pd.Categorical(ywl, categories=ywll)
    figdf['glen']=1
    figdf['is_signif']=pd.Categorical(
        [bool(x<0.05) for x in figdf.p_peak],
        categories=[True,False]
    )

    groups=[]
    for _,grp in figdf.groupby('foote_size'):
        # Vectorised z-score: mean and std are computed once per group
        # instead of once per element.
        novelty=grp.foote_novelty
        groups.append(grp.assign(foote_novelty_z=(novelty-novelty.mean())/novelty.std()))
    # pd.concat raises on an empty list; an empty input yields an empty frame.
    if not groups: return figdf
    figdf=pd.concat(groups)
    return figdf.dropna().sort_values(['year_window','period'])


# @interact
def plot_novelty(
        words=None,
        novdf=None,
        color='factor(year_window)',
        group='factor(year_window)',
        shape='factor(year_window)',
        size='glen',
        max_p_peak=None,
        vnum='v9',
        showdata=False,
        xlab='Date of semantic model',
        ylab='Foote Novelty (standardized)',
        colorlab='Foote matrix width',
        shapelab='Foote matrix width',
        sizelab='Number of significant peaks',
        title='Average novelty score for significant words over time',
        rolling=2,
        min_periods=1,
        min_foote_size=6,
        max_foote_size=6,
        y='foote_novelty',
        ymin=-.1,
        ylim0=0,
        ylim1=20,
        use_ylim=False,
        xlim0=1750,
        xlim1=1900,
        sizemin=.25,
        sizemax=2,
        labsize=6,
        hline='',
        nudge_label_y=1,
        ymin_heatmap=1750,
        combine=False,
        use_color=False,
        h_fig1=4.00,
        h_fig2=4.00,
        nudge_x=3,
        xlab_min=1735,
        add_median=True,
        save=False,force=False,
        label_words=False,
        logy=False,
        show_period_labels=True,
        dist_invert_fill=False,
        line_size=0.5,
        label_size=7,
        by_word=False
        ):
    """Draw novelty scores over time as a plotnine line-and-point figure.

    The data comes from ``novdf`` or, when that is None, from
    ``get_novelty(words, ...)``; it is prepared by
    ``get_plot_novelty_figdf`` and optionally restricted to
    ``p_peak < max_p_peak``.

    Args:
        words: word or words to plot; required when ``novdf`` is None. Also
            used to build the output file name.
        novdf: precomputed novelty frame.
        color, group, shape, size: column expressions for the aesthetics.
        max_p_peak: when truthy, keep only rows with ``p_peak`` below it.
        vnum, rolling: only used in the output file name.
        showdata: display the plotted frame.
        xlab, ylab, colorlab, shapelab, sizelab, title: figure labels.
        min_foote_size, max_foote_size: forwarded to ``get_novelty``.
        y: column plotted on the y axis.
        ymin: y position of the period labels.
        ylim0, ylim1: y limits (applied when ``use_ylim`` or ``logy``); also
            part of the file name.
        xlim0, xlim1: x limits; also part of the file name.
        sizemin, sizemax: range of the first size scale (see the note where
            the second size scale is added).
        labsize, nudge_x: size and x offset of the period labels.
        hline: y position of an optional dotted horizontal line.
        nudge_label_y, label_size: offset and size of the word labels.
        combine: also build the distance-matrix figure and combine the two.
        use_color: keep the default colour scale instead of grey.
        h_fig1: figure height.
        add_median: draw a horizontal line at the median of ``y``.
        save, force: save to ``PATH_FIGS`` and return the path; an existing
            file is reused unless ``force`` is set.
        label_words: label each word at its highest significant point
            (requires ``by_word`` when the data is computed here).
        logy: use a log10 y scale.
        show_period_labels: draw the 1770 / 1800 / 1830 markers.
        line_size: line width.
        by_word: keep ``words`` after computing the data so that word
            labels can be drawn.
        min_periods, ymin_heatmap, h_fig2, xlab_min, dist_invert_fill:
            accepted but not read by this function.

    Returns:
        The figure; the saved file path when ``save`` is set; or None when
        there is nothing to plot.
    """

    wkey=''
    if words: wkey=words.replace(' ','') if isinstance(words,str) else '-'.join(words)
    ofn=f'''fig.{wkey+'.' if wkey else ''}footenov.{vnum}.{xlim0}-{xlim1}--{ylim0}-{ylim1}--r{rolling}.{'cmbo.' if combine else ''}png'''
    ofnfn=os.path.join(PATH_FIGS,ofn)
    if save and not force and os.path.exists(ofnfn): return ofnfn
    
    # to_words handles a plain string; set() on a string would have produced
    # a set of single characters.
    figwords=set(to_words(words)) if words else {'allwords'}
    
    
    if novdf is None:
        if words is None:
            print('neither words nor novdf')
            return
        
        novdf = get_novelty(words,min_foote_size=min_foote_size,max_foote_size=max_foote_size)
        if not by_word: words=None
        
    figdf=get_plot_novelty_figdf(novdf)
    if not len(figdf): return
    if max_p_peak: figdf=figdf[figdf.p_peak<max_p_peak]
    
    
    figdf=figdf.sort_values('period')
    if showdata: display(figdf)
    fig=start_fig(
        figdf,
        x='period',
        y=y,
        color=color if color else None,
        group=group if group else None,
        figure_size=(8,h_fig1)
    )
    
    if add_median:
        kname='Guides'
        mediandf=pd.DataFrame([{
            'yintercept':figdf[y].median(),
            kname:'Median',
        },
        ])
        fig+=p9.geom_hline(
            p9.aes(yintercept='yintercept',linetype=kname),
            data=mediandf,
            size=.25,
            show_legend=True
        )
    fig+=p9.geom_line(size=line_size)
    pntd={}
    if size: pntd['size']=size
    if shape: pntd['shape']=shape
    fig+=p9.geom_point(p9.aes(**pntd))
    fig+=p9.labs(x=xlab,y=ylab,title=title,color=colorlab,size=sizelab,shape=shapelab)
    if use_ylim: fig+=p9.ylim(ylim0,ylim1)
    fig+=p9.scale_size_continuous(range=(sizemin,sizemax))
    if not use_color: fig+=p9.scale_color_gray(direction=1)# if not use_color else p9.scale_color_distiller(type='qual')
    if hline not in {None,''}:
        fig+=p9.geom_hline(yintercept=hline,linetype='dotted')
    if words and label_words:
        # Label each word once, at its highest significant point.
        labeldf=figdf[figdf.is_signif==1]
        grps=[
            grp.sort_values(y).iloc[-1:]
            for i,grp in labeldf.groupby('word')
        ]
        if len(grps):
            labeldf=pd.concat(grps)
            labeldf[y]+=nudge_label_y
            fig+=p9.geom_label(p9.aes(label='word'),color='black',
                               size=label_size,data=labeldf,boxcolor=(0,0,0,0))
    if show_period_labels:
        fig+=p9.geom_vline(xintercept=1770,linetype='dotted',alpha=0.5) 
        fig+=p9.geom_vline(xintercept=1800,linetype='dotted',alpha=0.5) 
        fig+=p9.geom_vline(xintercept=1830,linetype='dotted',alpha=0.5) 
        fig+=p9.geom_label(label='Sattelzeit begins (1770)',x=1770+nudge_x,y=ymin,angle=90,size=labsize,color='black',va='bottom',boxcolor=(0,0,0,0))
        fig+=p9.geom_label(label='Sattelzeit ends (1830)',x=1830+nudge_x,y=ymin,angle=90,size=labsize,color='black',va='bottom',boxcolor=(0,0,0,0)) 
    # NOTE(review): a size scale was already added above with
    # (sizemin, sizemax); this second one presumably replaces it, which would
    # make sizemin/sizemax ineffective -- confirm which one is intended.
    if size=='is_signif':
        fig+=p9.scale_size_manual({True:2,False:.2})
    else:
        fig+=p9.scale_size_continuous(range=[.25,3])
    fig+=p9.theme_minimal()
    fig+=p9.theme(axis_text_x=p9.element_text(angle=90), text=p9.element_text(size=8))
    if logy: fig+=p9.scale_y_log10(limits=[ylim0,ylim1])
    fig+=p9.scale_x_continuous(
        minor_breaks=list(range(xlim0//5*5,(xlim1//5*5)+5,5)),
        limits=[xlim0,xlim1]
    )
    
    
    if combine:
        figdm=plot_historical_semantic_distance_matrix(words=figwords,ymin=xlim0,ymax=xlim1)
        ofig=combine_plots(figdm,fig,ofn=ofnfn)
    else:
        ofig=fig
    
    if save:
        ofig.save(ofnfn)
        # NOTE(review): the second copy saves `fig`, not `ofig`, so with
        # combine=True it holds only the novelty plot -- confirm this is intended.
        if PATH_FIGS2: fig.save(os.path.join(PATH_FIGS2,ofn))
        return ofnfn

    return ofig

In [None]:
# plot_novelty('value')

In [None]:
def plot_novelty_words(words,**kwargs):
    """Plot standardised novelty per word via ``plot_novelty`` with word-level presets.

    Args:
        words: word or words, passed through ``to_words``.
        **kwargs: override any of the preset ``plot_novelty`` options below.

    Returns:
        Whatever ``plot_novelty`` returns.
    """
    options = {
        'y': 'foote_novelty_z',
        'words': to_words(words),
        'color': 'word',
        'group': 'word',
        'shape': 'word',
        'colorlab': 'Word',
        'shapelab': 'Word',
        'sizelab': 'Statistically significant',
        'title': 'Novelty scores for key words',
        'ylab': 'Foote Novelty score',
        'size': 'is_signif',
        'vnum': 'v19',
        'use_ylim': False,
        'add_median': True,
        'max_p_peak': 0.0,
        'min_foote_size': 6,
        'max_foote_size': 6,
        'showdata': False,
        'nudge_x': 2,
        'logy': False,
        'ylim0': 0,
        'ylim1': 10,
        'xlim0': 1745,
        'xlim1': 1870,
        'rolling': 2,
        'ymin': -1.5,
        'label_words': True,
        'show_period_labels': True,
        'nudge_label_y': 0.25,
        'save': False,
        'by_word': True,
    }
    # Caller-supplied options win over the presets.
    options.update(kwargs)
    return plot_novelty(**options)


In [None]:
# plot_novelty_words('station,stations,culture,slave,demand,value,honour,revolution',save=False)
# plot_novelty_words('station')

In [None]:
# plot_historical_semantic_distance_matrix('virtue')

In [None]:
# Novelty plot for 'value' using Foote size 5 only.
plot_novelty_words('value',min_foote_size=5,max_foote_size=5)

In [None]:
# Novelty plot for 'monday' with the preset Foote size (6).
plot_novelty_words('monday')