# Novelty

In [1]:
from ipynb.fs.full.koselleck import *

[Koselleck] (19:18:50) Alles bereit (+0.0s)


In [2]:
def make_foote(quart=FOOTE_W):
    tophalf = [-1] * quart + [1] * quart
    bottomhalf = [1] * quart + [-1] * quart
    foote = list()
    for i in range(quart):
        foote.append(tophalf)
    for i in range(quart):
        foote.append(bottomhalf)
    foote = np.array(foote)
    return foote

def foote_novelty(distdf, foote_size=5):
    foote=make_foote(foote_size)
    distmat = distdf.values if type(distdf)==pd.DataFrame else distdf
    
    axis1, axis2 = distmat.shape
    assert axis1 == axis2
    distsize = axis1
    axis1, axis2 = foote.shape
    assert axis1 == axis2
    halfwidth = axis1 / 2
    novelties = []
    for i in range(distsize):
        start = int(i - halfwidth)
        end = int(i + halfwidth)
        if start < 0 or end > (distsize - 1):
            novelties.append(0)
        else:
            novelties.append(np.sum(foote * distmat[start: end, start: end]))
    return novelties

def getyears():
    years=list(d.columns)
    return years


def diagonal_permute(d):
    newmat = np.zeros(d.shape)
    
    # We create one randomly-permuted list of integers called "translate"
    # that is going to be used for the whole matrix.
    
    xlen,ylen=d.shape
    translate = [i for i in range(xlen)]
    random.shuffle(translate)
    
    # Because distances matrices are symmetrical, we're going to be doing
    # two diagonals at once each time. We only need one set of values
    # (because symmetrical) but we need two sets of indices in the original
    # matrix so we know where to put the values back when we're done permuting
    # them.
    
    for i in range(0, xlen):
        indices1 = []
        indices2 = []
        values = []
        for x in range(xlen):
            y1 = x + i
            y2 = x - i
            if y1 >= 0 and y1 < ylen:
                values.append(d[x, y1])
                indices1.append((x, y1))
            if y2 >= 0 and y2 < ylen:
                indices2.append((x, y2))
        
        # Okay, for each diagonal, we permute the values.
        # We'll store the permuted values in newvalues.
        # We also check to see how many values we have,
        # so we can randomly select values if needed.
        
        newvalues = []
        lenvals = len(values)
        vallist = [i for i in range(lenvals)]
        
        for indexes, value in zip(indices1, values):
            x, y = indexes
            
            xposition = translate[x]
            yposition = translate[y]
            
            # We're going to key the randomization to the x, y
            # values for each point, insofar as that's possible.
            # Doing this will ensure that specific horizontal and
            # vertical lines preserve the dependence relations in
            # the original matrix.
            
            # But the way we're doing this is to use the permuted
            # x (or y) values to select an index in our list of
            # values in the present diagonal, and that's only possible
            # if the list is long enough to permit it. So we check:
            
            if xposition < 0 and yposition < 0:
                position = random.choice(vallist)
            elif xposition >= lenvals and yposition >= lenvals:
                position = random.choice(vallist)
            elif xposition < 0:
                position = yposition
            elif yposition < 0:
                position = xposition
            elif xposition >= lenvals:
                position = yposition
            elif yposition >= lenvals:
                position = xposition
            else:
                position = random.choice([xposition, yposition])
                # If either x or y could be used as an index, we
                # select randomly.
            
            # Whatever index was chosen, we use it to select a value
            # from our diagonal. 
            
            newvalues.append(values[position])
            
        values = newvalues
        
        # Now we lay down (both versions of) the diagonal in the
        # new matrix.
        
        for idxtuple1, idxtuple2, value in zip(indices1, indices2, values):
            x, y = idxtuple1
            newmat[x, y] = value
            x, y = idxtuple2
            newmat[x, y] = value
    
    return newmat

def zeroless(sequence):
    newseq = []
    for element in sequence:
        if element > 0.01:
            newseq.append(element)
    return newseq

def permute_test(distmatrix, foote_size=FOOTE_W, num_runs=100):
    actual_novelties = foote_novelty(distmatrix, foote_size)    
    permuted_peaks = []
    permuted_troughs = []
    xlen,ylen=distmatrix.shape
    for i in range(num_runs):
        randdist = diagonal_permute(distmatrix)
        nov = foote_novelty(randdist, foote_size)
        nov = zeroless(nov)
        permuted_peaks.append(np.max(nov))
        permuted_troughs.append(np.min(nov))
    permuted_peaks.sort(reverse = True)
    permuted_troughs.sort(reverse = True)
    significance_peak = np.ones(len(actual_novelties))
    significance_trough = np.ones(len(actual_novelties))
    for idx, novelty in enumerate(actual_novelties):
        ptop=[i for i,x in enumerate(permuted_peaks) if x and x < novelty]
        ptop=ptop[0]/num_runs if ptop else 1
        pbot=[i for i,x in enumerate(permuted_troughs) if x and x > novelty]
        pbot=pbot[-1]/num_runs if pbot else 1
        significance_peak[idx]=ptop
        significance_trough[idx]=pbot
        
        
    
    return actual_novelties, significance_peak, significance_trough

def colored_segments(novelties, significance, yrwidth=1,min_year=1700):
    x = []
    y = []
    t = []
    idx = 0
    for nov, sig in zip(novelties, significance):
        if nov > 1:
            x.append((idx*yrwidth) + min_year)
            y.append(nov)
            t.append(sig)
        idx += 1
        
    x = np.array(x)
    y = np.array(y)
    t = np.array(t)
    
    points = np.array([x,y]).transpose().reshape(-1,1,2)
    segs = np.concatenate([points[:-1],points[1:]],axis=1)
    lc = LineCollection(segs, cmap=plt.get_cmap('jet'))
    lc.set_array(t)
    
    return lc, x, y
    
    
def test_novelty(distdf, foote_sizes=None, num_runs=100):
    if not foote_sizes: foote_sizes=range(FOOTE_W-3, FOOTE_W+2)
    dq=distdf.fillna(0).values
    o=[]
    for fs in foote_sizes:
        try:
            novelties, significance_peak, significance_trough = permute_test(dq, foote_size=fs, num_runs=num_runs)
        except ValueError as e:
#             print('!!',e,'!!')
#             print(distdf)
            continue
        for year,nov,sigp,sigt in zip(distdf.columns, novelties, significance_peak, significance_trough):
            odx={
                'period':year,
                'foote_novelty':nov,
                'foote_size':fs,
                'p_peak':sigp,
                'p_trough':sigt,
            }
            o.append(odx)
    return pd.DataFrame(o)


## Novelty

In [3]:
def get_words_with_lnm():
    with get_veclib('lnm') as vl:
        return [x.split(',')[0] for x in vl.keys()]

In [4]:
def nov_word(word,progress=False,cache=True,force=False,cache_only=False,
             interpolate=False,normalize=False,add_missing_periods=True,**kwargs):
    odf=None
    if cache and not force:
        with get_veclib('nov') as vl:
            odf=vl.get(word)
    
    if odf is None or not len(odf):
        odf=test_novelty(get_historical_semantic_distance_matrix(
                word,
                interpolate=interpolate,
                normalize=normalize,
                progress=progress,
                add_missing_periods=add_missing_periods
#                 force=force
            ),
            **kwargs
        )
        if odf is not None and len(odf):
            odf=odf.query('foote_novelty!=0').assign(word=word)
        if cache:
            with get_veclib('nov',autocommit=True) as vl:
                vl[word]=odf
    return pd.DataFrame() if (odf is None or cache_only or not len(odf)) else odf.set_index(['word','period'])

In [5]:
# for w in ['ancestor','station','culture','demand','slave','time']:
#     printm('### '+w)
#     printm('#### No interpolation')
#     display(round(nov_word(w,force=True,interpolate=False,normalize=False,add_missing_periods=False)[['foote_novelty','foote_size']].describe(),2))
#     printm('#### Interpolation')
#     display(round(nov_word(w,force=True,interpolate=True,normalize=False,add_missing_periods=False)[['foote_novelty','foote_size']].describe(),2))
#     printm('----')

## Scaling up

In [6]:
def _nov_(objd): return nov_word(**objd)

def nov(
        word_or_words,
        progress=True,
        cache=True,
        force=False,
        num_proc=1,
        cache_only=False,
        ):
    words=tokenize_fast(word_or_words) if type(word_or_words)==str else list(word_or_words)
    
    objs=[
        dict(
            word=word,
            progress=False if len(words)>1 else progress,
            cache=cache,
            force=force,
            cache_only=cache_only,
        ) for word in words
    ]
    o=pmap(
        _nov_,
        objs,
        num_proc=num_proc if len(words)>1 else 1,
        progress=progress if len(words)>1 else False,
        desc='Measuring novelty across words',
    )
    return pd.concat(o) if len(o) else pd.DataFrame()


In [7]:
def get_novelty(words,min_foote_size=FOOTE_W,max_foote_size=FOOTE_W,**nov_attrs):
    words=tokenize_fast(words) if type(words)==str else words
    if DFALLNOV is not None:
        odf=DFALLNOV[DFALLNOV.word.isin(set(words))]
    else:
        odf=nov(words,force=False,**nov_attrs).query(f'{min_foote_size}<=foote_size<={max_foote_size}')
        odf=pd.concat(
            grp.assign(foote_novelty_z=(grp.foote_novelty - grp.foote_novelty.mean()) / grp.foote_novelty.std())
            for i,grp in odf.groupby('foote_size')
        )
        odf=odf.reset_index()
        odf['period_str']=odf['period']
        odf['period']=odf['period'].apply(lambda x: int(x[:4]))
        odf['is_signif']=odf['p_peak']<=0.05
    return odf

In [8]:
get_novelty('station').sort_values('foote_novelty')

Unnamed: 0,word,period,foote_novelty,foote_size,p_peak,p_trough,foote_novelty_z,period_str,is_signif
0,station,1745,117.142857,5,1.0,0.96,-1.027678,1745-1750,False
1,station,1750,267.936508,5,1.0,0.81,-0.879525,1750-1755,False
5,station,1770,300.952381,5,1.0,0.77,-0.847087,1770-1775,False
8,station,1785,329.84127,5,1.0,0.75,-0.818704,1785-1790,False
7,station,1780,350.793651,5,1.0,0.69,-0.798119,1780-1785,False
2,station,1755,360.31746,5,1.0,0.65,-0.788762,1755-1760,False
9,station,1790,465.396825,5,1.0,0.42,-0.685522,1790-1795,False
6,station,1775,485.396825,5,1.0,0.42,-0.665873,1775-1780,False
10,station,1795,535.555556,5,1.0,0.28,-0.616592,1795-1800,False
3,station,1760,542.222222,5,1.0,0.28,-0.610042,1760-1765,False


## All novelty scores

In [9]:
def get_all_novelty_scores(by_foote_size=False, min_foote_size=FOOTE_W, max_foote_size=FOOTE_W, min_periods=20, cache=True, force=False):
    global DFALLNOV
    if not force and DFALLNOV is not None and len(DFALLNOV):
        allnov=odf=DFALLNOV
    elif not force and cache and os.path.exists(FN_NOV_CACHE):
        odf=read_df(FN_NOV_CACHE)
    else:
        words_done=get_words_with_lnm()
        odf=get_novelty(words_done)
        if cache: odf.to_pickle(FN_NOV_CACHE)
        DFALLNOV=odf
    
    # filter
    odf=odf.query(f'{min_foote_size}<=foote_size<={max_foote_size}')
    if min_periods:
        odf=odf[odf.foote_size==max_foote_size].groupby('word').filter(lambda grp: len(grp)>=min_periods)
        
    if not by_foote_size:
        odf=odf.groupby(['word','period']).mean().drop('foote_size',1).reset_index()
    else:
        odf['foote_size']=odf.foote_size.apply(int)

    return odf
        


In [10]:
allnov=get_all_novelty_scores()
allnov

Unnamed: 0,word,period,foote_novelty,p_peak,p_trough,foote_novelty_z,is_signif
0,abbe,1745,1446.629213,0.34,1.0,1.652426,False
1,abbe,1750,1785.955056,0.05,1.0,2.289205,True
2,abbe,1755,1364.044944,0.51,1.0,1.497449,False
3,abbe,1760,1226.404494,0.83,1.0,1.239153,False
4,abbe,1765,1203.370787,0.87,1.0,1.195928,False
...,...,...,...,...,...,...,...
145013,zealous,1850,1195.792880,0.63,1.0,1.181707,False
145014,zealous,1855,1155.339806,0.71,1.0,1.105793,False
145015,zealous,1860,959.546926,1.00,1.0,0.738369,False
145016,zealous,1865,1045.307443,0.92,1.0,0.899307,False


In [11]:
allnov.groupby('word').mean().sort_values('foote_novelty_z',ascending=False).head(25)

Unnamed: 0_level_0,period,foote_novelty,p_peak,p_trough,foote_novelty_z,is_signif
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
intelligence,1807.5,1609.291819,0.854615,0.86,1.957679,0.115385
medicine,1807.5,1425.348808,0.798077,0.851923,1.612492,0.076923
circuit,1810.0,1409.938567,0.8056,0.8572,1.583573,0.08
gent,1807.5,1367.555831,0.888462,0.936154,1.504037,0.0
special,1807.5,1361.073003,0.733462,0.796538,1.491872,0.115385
devil,1807.5,1344.969475,0.761154,0.814231,1.461652,0.153846
extra,1807.5,1277.168446,0.756154,0.636923,1.334417,0.115385
late,1807.5,1272.667237,0.798462,0.754615,1.32597,0.192308
crop,1807.5,1271.197131,0.814615,0.792692,1.323211,0.153846
distress,1807.5,1270.6983,0.871538,0.803077,1.322275,0.115385


In [12]:
# get_all_novelty_scores(
#     by_foote_size=True,min_foote_size=4,max_foote_size=6,min_periods=20
# ).groupby('word').size().sort_values()

In [13]:
# allnov.loc['special']

## Significant words

In [14]:
def get_signif_novelty_scores(p_peak=0.05,min_peaks=1):
    odf=get_all_novelty_scores().query(f'p_peak<{p_peak}')
    odf=pd.concat(
        grp.assign(
            word_num_peaks=len(grp[grp.p_peak<p_peak])
        ) for i,grp in odf.groupby('word')
    )
    if min_peaks: odf=odf[odf.word_num_peaks>=min_peaks]
    return odf.sort_values('foote_novelty_z',ascending=False)#.reset_index()

In [15]:
dfsign=get_signif_novelty_scores()
dfsign

Unnamed: 0,word,period,foote_novelty,p_peak,p_trough,foote_novelty_z,is_signif,word_num_peaks
78956,major,1870,4083.706070,0.00,1.0,6.601160,True,2
21229,choir,1855,4010.322581,0.00,1.0,6.463449,True,3
51528,flat,1865,3964.593301,0.00,1.0,6.377633,True,3
124066,station,1850,3953.333333,0.00,1.0,6.356503,True,5
48951,farmer,1800,3722.488038,0.00,1.0,5.923299,True,4
...,...,...,...,...,...,...,...,...
138474,vehemence,1835,515.384615,0.02,1.0,-0.095146,True,1
113426,rotation,1800,514.893617,0.04,1.0,-0.096067,True,1
117668,serpents,1790,488.888889,0.02,1.0,-0.144867,True,1
140737,voluminous,1835,461.538462,0.04,1.0,-0.196193,True,1


In [16]:
dfsignw=dfsign.groupby('word').mean()
dfsignw

Unnamed: 0_level_0,period,foote_novelty,p_peak,p_trough,foote_novelty_z,is_signif,word_num_peaks
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
abbe,1795.000000,2644.382022,0.000000,1.0,3.900127,True,1.0
abbot,1817.500000,1718.181818,0.030000,1.0,2.162021,True,2.0
abilities,1805.000000,1461.049285,0.030000,1.0,1.679487,True,1.0
ability,1785.000000,1620.063694,0.000000,1.0,1.977893,True,1.0
abode,1825.000000,2004.891015,0.006667,1.0,2.700059,True,3.0
...,...,...,...,...,...,...,...
yellow,1747.500000,1890.016103,0.020000,1.0,2.484485,True,2.0
young,1750.000000,1509.841270,0.030000,1.0,1.771050,True,1.0
youthful,1823.333333,1453.264009,0.036667,1.0,1.664877,True,3.0
youths,1812.500000,2308.559499,0.010000,1.0,3.269923,True,2.0


In [17]:
dfchangepoints=get_signif_novelty_scores(p_peak=.05, min_peaks=1).drop_duplicates('word',keep='first').set_index('word')
dfchangepoints=dfchangepoints.join(dfsignw,rsuffix='_avg_sign')
dfchangepoints=dfchangepoints.join(get_all_novelty_scores().groupby('word').mean(),rsuffix='_avg')
prefcols=['period','word_num_peaks','foote_novelty_z_avg_sign']
dfchangepoints = dfchangepoints[prefcols + sorted([c for c in dfchangepoints.columns if c not in set(prefcols)])]
dfchangepoints = dfchangepoints.sort_values('period')
dfchangepoints

Unnamed: 0_level_0,period,word_num_peaks,foote_novelty_z_avg_sign,foote_novelty,foote_novelty_avg,foote_novelty_avg_sign,foote_novelty_z,foote_novelty_z_avg,is_signif,is_signif_avg,is_signif_avg_sign,p_peak,p_peak_avg,p_peak_avg_sign,p_trough,p_trough_avg,p_trough_avg_sign,period_avg,period_avg_sign,word_num_peaks_avg_sign
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
bullion,1745,1,-0.200786,459.090909,-93.863636,459.090909,-0.200786,-1.238460,True,0.050000,True,0.02,0.812000,0.020,1.0,0.895500,1.0,1792.5,1745.0,1.0
figure,1745,2,1.866315,1658.692185,789.375537,1560.606061,2.050383,0.419025,True,0.076923,True,0.00,0.774231,0.000,1.0,0.755000,1.0,1807.5,1747.5,2.0
fruit,1745,2,2.888701,2141.401274,802.694757,2105.414013,2.956234,0.444020,True,0.076923,True,0.00,0.904615,0.005,1.0,0.797692,1.0,1807.5,1747.5,2.0
rage,1745,2,1.953944,1653.650794,643.321123,1607.301587,2.040923,0.144940,True,0.076923,True,0.00,0.809231,0.010,1.0,0.768846,1.0,1807.5,1747.5,2.0
yellow,1745,2,2.484485,2146.537842,797.720798,1890.016103,2.965874,0.434686,True,0.076923,True,0.00,0.875769,0.020,1.0,0.810000,1.0,1807.5,1747.5,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
soothing,1870,1,1.235409,1224.409449,274.910523,1224.409449,1.235409,-0.546419,True,0.045455,True,0.03,0.915455,0.030,1.0,0.971818,1.0,1817.5,1870.0,1.0
grounds,1870,1,2.233554,1756.299841,738.240707,1756.299841,2.233554,0.323066,True,0.038462,True,0.03,0.891538,0.030,1.0,0.813462,1.0,1807.5,1870.0,1.0
board,1870,1,2.530967,1914.785374,729.974318,1914.785374,2.530967,0.307553,True,0.076923,True,0.00,0.904615,0.000,1.0,0.668462,1.0,1807.5,1870.0,1.0
meditation,1870,1,2.407745,1849.122807,469.076537,1849.122807,2.407745,-0.182047,True,0.038462,True,0.00,0.936538,0.000,1.0,0.931538,1.0,1807.5,1870.0,1.0


In [18]:
topwords=dfchangepoints.sort_values(
    'foote_novelty_z_avg_sign',ascending=False
).query('word_num_peaks>1')
round(topwords,2)

Unnamed: 0_level_0,period,word_num_peaks,foote_novelty_z_avg_sign,foote_novelty,foote_novelty_avg,foote_novelty_avg_sign,foote_novelty_z,foote_novelty_z_avg,is_signif,is_signif_avg,is_signif_avg_sign,p_peak,p_peak_avg,p_peak_avg_sign,p_trough,p_trough_avg,p_trough_avg_sign,period_avg,period_avg_sign,word_num_peaks_avg_sign
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
major,1870,2,5.68,4083.71,912.93,3593.29,6.60,0.65,True,0.08,True,0.00,0.89,0.00,1.0,0.49,1.0,1807.50,1867.5,2.0
flat,1865,3,5.06,3964.59,1022.18,3264.54,6.38,0.86,True,0.12,True,0.00,0.84,0.00,1.0,0.52,1.0,1807.50,1865.0,3.0
choir,1855,3,5.00,4010.32,1055.29,3229.25,6.46,0.92,True,0.12,True,0.00,0.84,0.00,1.0,0.68,1.0,1807.50,1855.0,3.0
vice,1860,2,4.93,3498.88,1205.24,3193.94,5.50,1.20,True,0.08,True,0.00,0.87,0.00,1.0,0.79,1.0,1807.50,1862.5,2.0
organs,1870,2,4.84,3585.59,888.14,3146.13,5.67,0.60,True,0.08,True,0.00,0.89,0.00,1.0,0.70,1.0,1807.50,1867.5,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
purchaser,1865,2,0.79,994.24,74.34,987.77,0.80,-0.92,True,0.08,True,0.02,0.89,0.02,1.0,0.75,1.0,1811.67,1867.5,2.0
waking,1860,2,0.78,995.22,144.79,981.27,0.81,-0.79,True,0.08,True,0.01,0.90,0.01,1.0,0.77,1.0,1809.17,1857.5,2.0
torch,1825,2,0.73,990.16,-33.47,957.38,0.80,-1.13,True,0.08,True,0.01,0.87,0.02,1.0,0.95,1.0,1802.50,1827.5,2.0
filthy,1845,2,0.40,785.29,125.79,779.41,0.41,-0.83,True,0.08,True,0.04,0.92,0.04,1.0,0.83,1.0,1807.50,1842.5,2.0


In [20]:
def get_signif_novelty_words(p_peak=0.05,min_peaks=1):
    qstr=f'p_peak={p_peak},min_peaks={min_peaks}'
    with get_veclib('novwords') as vl:
        if qstr in vl: return vl[qstr]
    
    df=get_all_novelty_scores()
    dfsign=get_signif_novelty_scores(p_peak=p_peak,min_peaks=min_peaks)
    signwset=set(dfsign.word)
    o=[
        w for w in 
        df.groupby('word').mean().sort_values('foote_novelty',ascending=False).index
        if w in signwset
    ]
    print('# all words',len(set(df.word)))
    print('# signif words',len(set(dfsign.word)))
    with get_veclib('novwords',autocommit=True) as vl:
        vl[qstr]=o
    
    return o

In [21]:
sign_words = get_signif_novelty_words(p_peak=0.05)
len(sign_words), random.sample(sign_words,10)

(3343,
 ['mist',
  'sickly',
  'compact',
  'births',
  'voice',
  'walking',
  'reward',
  'monuments',
  'watering',
  'commercial'])

In [22]:
sign_words = get_signif_novelty_words(p_peak=0.01)
len(sign_words), random.sample(sign_words,10)

(2217,
 ['sun',
  'scottish',
  'damp',
  'exception',
  'kindness',
  'lark',
  'husbandry',
  'secular',
  'interpretation',
  'caution'])

## Plotting

### Plotting all significant words' novelties

In [None]:
def plot_novelty_by_foote_size(p_peak=0.01,min_peaks=1,rolling=2, ymin=-1, nudge_x=1, labsize=6,words={}):
    df=get_all_novelty_scores(by_foote_size=True, min_foote_size=4, max_foote_size=6)
    if not words: words=get_signif_novelty_words(p_peak=p_peak,min_peaks=min_peaks)
#     words={w for w in words if not 's' in w and not 'f' in w}
    print('# words used:',len(words))
    if words: df=df[df.word.isin(words)]
    figdf=pd.DataFrame([
        {
            'foote_size':fs,
            'period':period,
            'num_peaks':len(grp.query(f'p_peak<{p_peak}')),
            'avg_nov_signif':grp.query(f'p_peak<{p_peak}').foote_novelty_z.mean(),
            'avg_nov':grp.foote_novelty_z.mean(),
        } for ((fs,period),grp) in df.groupby([
            'foote_size','period'
        ])
    ])
    for ycol in ['avg_nov','avg_nov_signif']:
        figdf[ycol]=figdf[ycol].rolling(rolling,min_periods=1).mean()
    
    fig=start_fig(
        figdf,
        x='period',
        y='num_peaks',
#         size='num_peaks',
        color='factor(foote_size)',
#         linetype='factor(foote_size)',
    )
    fig+=p9.geom_line()
    fig+=p9.geom_point(p9.aes(shape='factor(foote_size)'))
    
    fig+=p9.scale_color_gray(start=.8, end=.2)
    fig+=p9.geom_vline(xintercept=1770,linetype='dotted',alpha=0.5) 
    fig+=p9.geom_vline(xintercept=1800,linetype='dotted',alpha=0.5) 
    fig+=p9.geom_vline(xintercept=1830,linetype='dotted',alpha=0.5) 
    fig+=p9.geom_label(label='Sattelzeit begins (1770)',x=1770+nudge_x,y=ymin,angle=90,size=labsize,color='black',va='bottom',boxcolor=(0,0,0,0))
    fig+=p9.geom_label(label='Sattelzeit ends (1830)',x=1830+nudge_x,y=ymin,angle=90,size=labsize,color='black',va='bottom',boxcolor=(0,0,0,0)) 
    return fig

In [None]:
plot_novelty_by_foote_size(rolling=1, p_peak=.01, min_peaks=1)#, words={'culture'})

In [None]:
# plot_novelty_by_foote_size(rolling=1, words={'potato'})

In [None]:
dfchangepoints=get_signif_novelty_scores(p_peak=.05, min_peaks=1).drop_duplicates('word',keep='first').sort_values('period')
dfchangepoints

In [None]:
odfstr=pd.DataFrame([
    {'period':period, 'words':', '.join(grp.sort_values('foote_novelty_z',ascending=False).word)}
    for period,grp in sorted(dfchangepoints.groupby('period'))
])
printm(odfstr.to_markdown())

## Plotting individual words

In [None]:
def get_plot_novelty_figdf(novdf):
    figdf=novdf.sample(frac=1)
    ywl=[
        f'{x} years'
        for x in figdf['foote_size']*5*2
    ]
    ywls=set(ywl)
    ywll=list(reversed(sorted(list(ywls))))
    figdf['year_window']=pd.Categorical(ywl, categories=ywll)
    figdf['glen']=1
    figdf['is_signif']=pd.Categorical(
        [bool(x<0.05) for x in figdf.p_peak],
        categories=[True,False]
    )
    
    figdf = pd.concat(
        grp.assign(foote_novelty_z=grp.foote_novelty.apply(lambda x: (x-grp.foote_novelty.mean())/grp.foote_novelty.std()))
        for i,grp in figdf.groupby('foote_size')
    )
    return figdf.dropna().sort_values(['year_window','period'])


# @interact
def plot_novelty(
        words=None,
        novdf=None,
        color='factor(year_window)',
        group='factor(year_window)',
        shape='factor(year_window)',
        size='glen',
        max_p_peak=None,
        vnum='v9',
        showdata=False,
        xlab='Date of semantic model',
        ylab='Foote Novelty (standardized)',
        colorlab='Foote matrix width',
        shapelab='Foote matrix width',
        sizelab='Number of significant peaks',
        title='Average novelty score for significant words over time',
        rolling=2,
        min_periods=1,
        min_foote_size=6,
        max_foote_size=6,
        y='foote_novelty',
        ymin=-.1,
        ylim0=0,
        ylim1=20,
        use_ylim=False,
        xlim0=1750,
        xlim1=1900,
        sizemin=.25,
        sizemax=2,
        labsize=6,
        hline='',
        nudge_label_y=1,
        ymin_heatmap=1750,
        combine=False,
        use_color=False,
        h_fig1=4.00,
        h_fig2=4.00,
        nudge_x=3,
        xlab_min=1735,
        add_median=True,
        save=False,
        label_words=False,
        logy=False,
        show_period_labels=True,
        dist_invert_fill=False,
        line_size=0.5,
        label_size=7,
        by_word=False
        ):

    figwords=set(words) if words else {'allwords'}
    if novdf is None:
        if words is None:
            print('neither words nor novdf')
            return
        
        novdf = get_novelty(words,min_foote_size=min_foote_size,max_foote_size=max_foote_size)
        if not by_word: words=None
#         print(f'Computed novelty df of shape {novdf.shape}')
#         display(novdf)
        
#     figdf=get_plot_novelty_figdf(novdf.query(f'{min_foote_size}<=foote_size<={max_foote_size}'))
    figdf=get_plot_novelty_figdf(novdf)
    if not len(figdf): return
    if max_p_peak: figdf=figdf[figdf.p_peak<max_p_peak]
    
    
    figdf=figdf.sort_values('period')
    if showdata: display(figdf)
    fig=start_fig(
        figdf,
        x='period',
        y=y,
        color=color if color else None,
        group=group if group else None,
        figure_size=(8,h_fig1)
    )
    
    if add_median:
        kname='Guides'
        mediandf=pd.DataFrame([{
            'yintercept':figdf[y].median(),
            kname:'Median',
        },
        ])
        fig+=p9.geom_hline(
            p9.aes(yintercept='yintercept',linetype=kname),
            data=mediandf,
            size=.25,
            show_legend=True
        )
    fig+=p9.geom_line(size=line_size)
    pntd={}
    if size: pntd['size']=size
    if shape: pntd['shape']=shape
    fig+=p9.geom_point(p9.aes(**pntd))
    fig+=p9.labs(x=xlab,y=ylab,title=title,color=colorlab,size=sizelab,shape=shapelab)
    if use_ylim: fig+=p9.ylim(ylim0,ylim1)
    fig+=p9.scale_size_continuous(range=(sizemin,sizemax))
    if not use_color: fig+=p9.scale_color_gray(direction=1)# if not use_color else p9.scale_color_distiller(type='qual')
    if hline not in {None,''}:
        fig+=p9.geom_hline(yintercept=hline,linetype='dotted')
    if words and label_words:
        labeldf=figdf[figdf.is_signif==1]
        grps=[
            grp.sort_values(y).iloc[-1:]
            for i,grp in labeldf.groupby('word')
        ]
        if len(grps):
            labeldf=pd.concat(grps)
            labeldf[y]+=nudge_label_y
            fig+=p9.geom_label(p9.aes(label='word'),color='black',
                               size=label_size,data=labeldf,boxcolor=(0,0,0,0))
    if show_period_labels:
        fig+=p9.geom_vline(xintercept=1770,linetype='dotted',alpha=0.5) 
        fig+=p9.geom_vline(xintercept=1800,linetype='dotted',alpha=0.5) 
        fig+=p9.geom_vline(xintercept=1830,linetype='dotted',alpha=0.5) 
        fig+=p9.geom_label(label='Sattelzeit begins (1770)',x=1770+nudge_x,y=ymin,angle=90,size=labsize,color='black',va='bottom',boxcolor=(0,0,0,0))
        fig+=p9.geom_label(label='Sattelzeit ends (1830)',x=1830+nudge_x,y=ymin,angle=90,size=labsize,color='black',va='bottom',boxcolor=(0,0,0,0)) 
    if size=='is_signif':
        fig+=p9.scale_size_manual({True:2,False:.2})
    else:
        fig+=p9.scale_size_continuous(range=[.25,3])
    fig+=p9.theme_minimal()
    fig+=p9.theme(axis_text_x=p9.element_text(angle=90), text=p9.element_text(size=8))
    if logy: fig+=p9.scale_y_log10(limits=[ylim0,ylim1])
    fig+=p9.scale_x_continuous(
        minor_breaks=list(range(xlim0//5*5,(xlim1//5*5)+5,5)),
        limits=[xlim0,xlim1]
    )
    wkey=''
    if words: wkey=words.replace(' ','') if type(words)==str else '-'.join(words)
    ofn=f'''fig.footenov.{vnum}.{wkey+'.' if wkey else ''}{'cmbo.' if combine else ''}png'''
    ofnfn=os.path.join(PATH_FIGS,ofn)

    if combine:
        yymin1=figdf.period.min()
        yymax1=figdf.period.max()
        figdm=plot_historical_semantic_distance_matrix(words=figwords,ymin=xlim0,ymax=xlim1)
        ofig=combine_plots(figdm,fig,ofn=ofnfn)
    else:
        ofig=fig
        if save: ofig.save(ofnfn)
    if save: upfig(ofnfn)
    return ofig

In [None]:
def plot_novelty_words(words,**kwargs):
    words=[w.strip() for w in words.split(',')] if type(words)==str else list(words)
    inpd=dict(
        y='foote_novelty_z',
        words=words,
        color='word',
        group='word',
        shape='word',
        colorlab='Word',
        shapelab='Word',
        sizelab='Statistically significant',
        title='Novelty scores for key words',
        ylab='Foote Novelty score',
        size='is_signif',
        vnum='v19',
        use_ylim=False,
        add_median=True,
        max_p_peak=0.0,
        min_foote_size=5,
        max_foote_size=5,
        showdata=False,
        nudge_x=2,
        logy=False,
        ylim0=0,
        ylim1=10,
        xlim0=1745,
        xlim1=1870,
        rolling=2,
        ymin=-1.5,
        label_words=True,
        show_period_labels=True,
        nudge_label_y=0.25,
        save=False,
        by_word=True
    )
    return plot_novelty(**{**inpd, **kwargs})


In [None]:
plot_novelty_words('station,culture,slave,demand,value,honour,revolution',save=True)
# plot_novelty_words('station')