# Neighbs: nearest-neighbor words per period

In [3]:
from koselleck.imports import *

def gen_all_neighbors(fnfn=FN_ALL_NEIGHBS, k=25, k_min=10, lim=None, num_proc=1, force=False, num_runs=10):
    """Build the neighbor table for every (corpus, period) group, or load it from disk.

    Parameters
    ----------
    fnfn : str
        Path of the on-disk cache. It is read (via `read_df`) when it exists
        and `force` is falsy, and it is where a freshly computed table is
        pickled.
    k : int
        Forwarded to `do_gen_neighbs` as its `k` keyword.
    k_min : int
        Accepted for signature compatibility; not used in this function.
    lim : int or None
        Optional cap on the number of model rows (`dfmodels.iloc[:lim]`)
        taken before grouping.
    num_proc : int
        Accepted for signature compatibility; see the NOTE in the body.
    force : bool
        When truthy, recompute even if `fnfn` already exists.
    num_runs : int
        Only rows whose `run` label compares <= "run_{num_runs:02}" are used.

    Returns
    -------
    pandas.DataFrame
        The cached table, or the `pmap_groups` result with its index reset.
    """
    if not force and os.path.exists(fnfn):
        return read_df(fnfn)

    dfmodels = get_pathdf_models(period_len=5).query(f'run<="run_{num_runs:02}" & period_start>=1720')
    odf = pmap_groups(
        do_gen_neighbs,
        dfmodels.iloc[:lim].groupby(['corpus','period']),
        # NOTE(review): the worker count is hard-coded and the `num_proc`
        # argument is not forwarded. Left unchanged so that existing default
        # calls keep their 4-way parallelism; confirm before wiring it through.
        num_proc=4,
        desc='Gathering all neighborhoods',
        use_cache=False,
        kwargs=dict(k=k)
    ).reset_index()

    # Write to the same path the cache check above reads from; the original
    # always wrote to FN_ALL_NEIGHBS, so a custom `fnfn` was never populated.
    odf.to_pickle(fnfn)
    return odf

def _do_gen_neighbs(obj):
    """Worker: list the top-k neighbors of each requested word in one model.

    `obj` is a `(model_path, words, k)` tuple so the function can be mapped
    over a list of jobs. Words missing from the model's vocabulary are
    skipped.

    Returns a DataFrame with one row per (word, neighbor) pair and columns
    `word`, `neighbor`, `rank_avg` (1-based position in the `most_similar`
    result) and `csim` (the similarity value returned by `most_similar`).
    """
    model_path, words, k = obj
    model = load_model(model_path)
    vectors = model.wv

    # Restrict the query list to words the model actually knows.
    mwords = set(words) & set(vectors.key_to_index.keys())

    rows = []
    for word in mwords:
        hits = vectors.most_similar(word, topn=k)
        for position, (neighbor, similarity) in enumerate(hits, start=1):
            rows.append({
                'word': word,
                'neighbor': neighbor,
                'rank_avg': position,
                'csim': similarity,
            })
    return pd.DataFrame(rows)

def do_gen_neighbs(dfpath,words=None,k=25,progress=False,min_count=2):
    """Aggregate top-k neighbors over all models listed in `dfpath`.

    Parameters
    ----------
    dfpath : pandas.DataFrame
        Frame with a `path` column; each path is handed to `_do_gen_neighbs`.
    words : iterable of str, optional
        Words to look up. When falsy, `get_valid_words()` is used.
    k : int
        Number of neighbors requested per word and model.
    progress : bool
        Forwarded to `pmap`.
    min_count : int
        A (word, neighbor) pair is kept only if it occurs in at least this
        many per-model results.

    Returns
    -------
    pandas.DataFrame or None
        One row per surviving (word, neighbor) pair with the per-pair means of
        `rank_avg`, `csim`, `count` and `score`, plus an integer `rank` within
        each word (1 = highest score), sorted by word and rank. Returns None
        when `pmap` yields no per-model frames.
    """
    if not words: words=get_valid_words()
    objs=[(mpath,words,k) for mpath in dfpath.path]
    per_model=pmap(_do_gen_neighbs, objs, num_proc=1, progress=progress)
    # The original looked up dfpath.iloc[0] first (value unused), which raised
    # on an empty group before this guard could run.
    if not len(per_model): return

    odf=pd.concat(per_model)
    gby=['word','neighbor']
    # Number of per-model results in which each (word, neighbor) pair occurs.
    pair_counts=odf.groupby(gby).size()
    odf=odf.set_index(gby)
    odf['count']=pair_counts  # aligned on the (word, neighbor) index
    # .copy() so the column assignment below acts on an independent frame
    # rather than on a query result (avoids SettingWithCopyWarning).
    odf=odf.query(f'count>={min_count}').copy()
    # Score is dominated by the count; the csim/1000 term is small and orders
    # pairs with equal count (assuming csim is a similarity of magnitude <= 1).
    odf['score']=odf['count'] - (1/100) + (odf['csim']/1000)
    # Collapse the per-model rows into one row per pair (means of all columns).
    odf=odf.groupby(gby).mean().reset_index()
    odf['rank']=odf.groupby('word')['score'].rank(ascending=False,method='min').apply(int)
    odf=odf.sort_values(['word','rank'])
    return odf

DF_ALLNEIGHB=None
def get_all_neighbors(
        fnfn=FN_ALL_NEIGHBS,
        k=25,
        k_min=10,
        lim=None,
        num_proc=1,
        force=False,
        num_runs=10,
        min_count=2,
        min_neighbs=10):
    """Return the filtered neighbor table, memoized in the module-level DF_ALLNEIGHB.

    Parameters
    ----------
    fnfn : str
        On-disk cache path; loaded with `read_df` when it exists and `force`
        is falsy, otherwise `gen_all_neighbors` is called.
    k, k_min, lim, num_proc, num_runs
        Forwarded unchanged to `gen_all_neighbors`.
    force : bool
        When truthy, bypass both the in-memory and the on-disk cache.
    min_count : int
        Only rows with `count >= min_count` are considered when counting a
        group's distinct neighbors.
    min_neighbs : int
        A (word, period) group is kept only if it has at least this many
        distinct neighbors among the rows meeting `min_count`. All rows of a
        kept group are retained, not only those meeting `min_count`.

    Returns
    -------
    pandas.DataFrame
        Indexed by (word, period), sorted by index, without the `corpus`
        column.

    Notes
    -----
    The memoized frame is returned as-is on later non-forced calls, even if
    they pass different filter arguments.
    """
    global DF_ALLNEIGHB
    # `force` must skip the in-memory cache too, otherwise a forced call after
    # a first load silently returns the stale frame.
    if not force and DF_ALLNEIGHB is not None: return DF_ALLNEIGHB

    if not force and os.path.exists(fnfn):
        print('Loading data')
        odf=read_df(fnfn)
    else:
        odf=gen_all_neighbors(fnfn=fnfn,k=k,k_min=k_min,lim=lim,num_proc=num_proc,force=force,num_runs=num_runs)
    # Keyword form: the positional `axis` argument of drop() was removed in pandas 2.0.
    odf=odf.drop(columns='corpus').set_index(['word','period']).sort_index()
    print('Filtering')
    s=odf.query(f'count>={min_count}').groupby(['word','period']).neighbor.nunique()
    print('Filtering, pt2')
    # Select by index membership rather than by a boolean Series: groups with
    # no row meeting `min_count` are absent from `s`, and a boolean Series
    # that does not cover the frame's index cannot be aligned.
    keep=s[s>=min_neighbs].index
    odf=odf[odf.index.isin(keep)]
    print('Postprocessing')
    DF_ALLNEIGHB=odf
    return odf

## Generate data

In [4]:
# Smoke test: run the aggregation on a single (corpus, period) group.
dfmodels = (
    get_pathdf_models(period_len=5)
    .query('run<="run_10" & period_start>=1720')
)
# Exhaust the iterator so that `i`/`grp` end up bound to the last group.
for i, grp in dfmodels.groupby(['corpus', 'period']):
    continue
odf = do_gen_neighbs(grp, progress=True)
odf.loc[odf.word == 'value']

Mapping _do_gen_neighbs() [x1]: 100%|██████████| 10/10 [00:22<00:00,  2.24s/it]


Unnamed: 0,word,neighbor,rank_avg,csim,count,score,rank
157647,value,cost,3.2,0.446995,10.0,9.990447,1
157669,value,purchase,6.0,0.437696,10.0,9.990438,2
157667,value,proportion,9.555556,0.414265,9.0,8.990414,3
157658,value,investment,9.666667,0.406139,9.0,8.990406,4
157650,value,digestibility,11.666667,0.405878,9.0,8.990406,5
157648,value,costs,12.666667,0.398812,9.0,8.990399,6
157676,value,utilisation,4.25,0.456681,8.0,7.990457,7
157657,value,interest,8.0,0.416607,8.0,7.990417,8
157666,value,profits,15.875,0.385733,8.0,7.990386,9
157646,value,content,10.571429,0.409267,7.0,6.990409,10


In [5]:
# Regenerate the full neighbor table; force=1 makes gen_all_neighbors skip its
# on-disk cache check. The recorded run below took roughly 37 minutes.
dfneighbs = gen_all_neighbors(lim=None,force=1)
dfneighbs

Gathering all neighborhoods [x4]: 100%|██████████| 50/50 [37:01<00:00, 44.44s/it]  


Unnamed: 0,corpus,period,word,neighbor,rank_avg,csim,count,score,rank
0,bpo,1720-1725,abatement,demurrer,1.333333,0.588059,9.0,8.990588,1
1,bpo,1720-1725,abatement,bench,6.625000,0.446086,8.0,7.990446,2
2,bpo,1720-1725,abatement,application,6.166667,0.445721,6.0,5.990446,3
3,bpo,1720-1725,abatement,declaration,13.000000,0.408198,6.0,5.990408,4
4,bpo,1720-1725,abatement,demurred,1.800000,0.538273,5.0,4.990538,5
...,...,...,...,...,...,...,...,...,...
8361137,bpo,1965-1970,zoo,elaine,23.500000,0.584527,2.0,1.990585,25
8361138,bpo,1965-1970,zoo,arnold,22.500000,0.571477,2.0,1.990571,26
8361139,bpo,1965-1970,zoo,assert,14.000000,0.546049,2.0,1.990546,27
8361140,bpo,1965-1970,zoo,soloist,9.500000,0.536242,2.0,1.990536,28


## Load data

In [6]:
# Load the neighbor table through the caching accessor, which also filters
# groups and indexes the result by (word, period).
dfneighbs = get_all_neighbors()
dfneighbs

Loading data
Filtering
Filtering, pt2
Postprocessing


Unnamed: 0_level_0,Unnamed: 1_level_0,neighbor,rank_avg,csim,count,score,rank
word,period,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
abandonment,1810-1815,usurpation,12.600000,0.612421,5.0,4.990612,1
abandonment,1810-1815,subjugation,5.333333,0.646222,3.0,2.990646,2
abandonment,1810-1815,aggression,9.666667,0.635127,3.0,2.990635,3
abandonment,1810-1815,arbitrary,6.666667,0.628782,3.0,2.990629,4
abandonment,1810-1815,democracy,13.500000,0.658465,2.0,1.990658,5
...,...,...,...,...,...,...,...
zoology,1915-1920,chemistry,5.000000,0.729294,2.0,1.990729,7
zoology,1915-1920,yale,16.500000,0.723865,2.0,1.990724,8
zoology,1915-1920,mathematics,9.500000,0.715767,2.0,1.990716,9
zoology,1915-1920,apps,10.500000,0.715292,2.0,1.990715,10


In [11]:
def do_combine_neighbs(dfgrp,k=25,min_count=2):
    """Collapse one group of neighbor rows into a single summary row.

    The neighbors are joined, in row order, into a comma-separated string.
    A neighbor that sits on the first row of a distinct `count` value gets
    that count appended as " (N)", so each run of equal counts is labelled
    once. `k` and `min_count` are accepted but not used.

    Returns a one-row DataFrame with columns `neighborhood` (the string) and
    `neighborhood_size` (number of rows in the group).
    """
    rows = dfgrp.reset_index()

    # Neighbors found on the first row of every distinct count value.
    tier_openers = set(rows.drop_duplicates('count').neighbor)

    labels = []
    for neighbor, count, _rank in zip(rows.neighbor, rows['count'], rows['rank']):
        if neighbor in tier_openers:
            suffix = f' ({int(count)})'
        else:
            suffix = ''
        labels.append(f'{neighbor}{suffix}')

    summary = {
        'neighborhood': ', '.join(labels),
        'neighborhood_size': len(rows),
    }
    return pd.DataFrame([summary])

# Cache path for the string summaries, derived from the neighbor-table path.
# NOTE(review): if FN_ALL_NEIGHBS does not contain '.pkl' the replace is a
# no-op and both caches would share one path; confirm the extension.
FN_ALL_NEIGHBS_STR=FN_ALL_NEIGHBS.replace('.pkl','.strsummary.pkl')

def get_all_neighbors_strsummary(dfneighbs=None,ofnfn=FN_ALL_NEIGHBS_STR,lim=None,k=25,num_proc=1,force=False,**y):
    """Build the per-(word, period) neighborhood strings, or load them from disk.

    Parameters
    ----------
    dfneighbs : pandas.DataFrame, optional
        Neighbor table to summarise; defaults to `get_all_neighbors()`.
    ofnfn : str
        Path of the on-disk cache. It is read (via `read_df`) when it exists
        and `force` is falsy, and it is where a fresh result is pickled.
    lim : int or None
        Optional cap on the number of rows (`dfneighbs.iloc[:lim]`) taken
        before grouping.
    k : int
        Forwarded to `do_combine_neighbs` as its `k` keyword.
    num_proc : int
        Forwarded to `pmap_groups`.
    force : bool
        When truthy, recompute even if `ofnfn` already exists.
    **y
        Extra keyword arguments passed straight to `pmap_groups`.

    Returns
    -------
    pandas.DataFrame
        The cached frame, or the `pmap_groups` result.
    """
    if not force and os.path.exists(ofnfn): return read_df(ofnfn)
    if dfneighbs is None: dfneighbs=get_all_neighbors()
    odf=pmap_groups(
        do_combine_neighbs,
        dfneighbs.iloc[:lim].groupby(['word','period']),
        kwargs=dict(k=k),
        num_proc=num_proc,
        **y
    )
    # Write to the same path the cache check above reads from; the original
    # always wrote to FN_ALL_NEIGHBS_STR, so a custom `ofnfn` was never populated.
    odf.to_pickle(ofnfn)
    return odf


In [12]:
# Rebuild the per-(word, period) neighborhood strings; force=True skips the
# on-disk cache check. The recorded run below took about 11.5 minutes.
dfneighbstr=get_all_neighbors_strsummary(force=True)
dfneighbstr

Mapping do_combine_neighbs [x1]: 100%|██████████| 224332/224332 [11:32<00:00, 323.74it/s]


Unnamed: 0_level_0,Unnamed: 1_level_0,neighborhood,neighborhood_size
word,period,Unnamed: 2_level_1,Unnamed: 3_level_1
abandonment,1810-1815,"usurpation (5), subjugation (3), aggression, arbitrary, democracy (2), influencing, defeat, armament, monarchy, amelioration, invader, dereliction, ultimate, liberation, impressment, executive, retaliation, subversion, persecution",19
abandonment,1820-1825,"abolition (3), aggravation, violation (2), manifestation, enactment, amelioration, subjugation, impolicy, finances, institutes, tendencies, individuality, unexampled, derangement, obliterate",15
abandonment,1825-1830,"extension (4), disregard (3), extinction, incapacity, interposition (2), universality, shackles, observance, interference, sanctity, amelioration, discipline, exigencies, impossibility, thraldom, subordination, unalterable, anarchy, fanaticism, ascendancy, stimulus",21
abandonment,1830-1835,"degradation (4), ultimate (3), disregard, jealousy, aggrandizement, subversion (2), infringement, violating, unwillingness, hatred, independence, hostility, workings, overthrow, extension, sanctity, malady, alleviate, enforcement, ascendancy, embarrassments, ambition, fanaticism",23
abandonment,1835-1840,"manifestation (3), organization, tendencies, inclinations (2), desertion, bloodshed, infidelity, despotism, alienation, prerogatives, profligacy, intervention, usurpation, ascendency, destitution, infliction, amelioration, inadequacy, rigour, insubordination",20
...,...,...,...
zoology,1895-1900,"physics (7), botany, geology, chemistry (5), philology, physiology, litt (4), geometry, astronomy, acoustics (3), archaeology, geography, mathematics, inorganic (2), biology, physiography, topographical, univ, jurisprudence, trin, lix, handbook, yale, ethics, theoretical",25
zoology,1900-1905,"physics (9), botany (8), chemistry, litt (7), philology (5), physiology, geometry, camb, anthropology, biology (4), algebra, lond, tutorial (3), prof, astronomy, garnett, macmillan's, geography, edin, geology (2), ph, dowden, nouvelle, primers, lettres, biblical, student's, folklore, frontispieces, athenaeum, univ, ebenezer, mathematics, acoustics",34
zoology,1905-1910,"botany (9), geology (8), philology (7), biology, physics, regius (6), litt (5), chemistry, archeology, surgery (3), technology, flinders, cornell, veterinary, mathematics, physiology, tutorial, student's, ph, lond, mathematical, cruikshank (2), indices, swete, dental, calculus, univ, trigonometry, matriculation, dods, philological, biological, wiener, adolf, astronomical, ainsworth",36
zoology,1910-1915,"botany (7), geology, physics (6), philology, biology, archaeology (5), chemistry (4), physiology, astronomy, archeology, anthropology, sociology, rutter (3), surgery, litt, cornell (2), anthropological, photographies, otho, waldo, technology, critique, biological, kitchin, bibliography, theology, economics, allahabad",28


In [13]:
# Neighborhood summaries for the word "culture", one row per period.
dfneighbstr.loc['culture']

Unnamed: 0_level_0,neighborhood,neighborhood_size
period,Unnamed: 1_level_1,Unnamed: 2_level_1
1720-1725,"geometry (7), seeds (6), navigation (5), diseases, plants, agriculture, medicinal (4), wrens, tendons, kinds, fortifies, ligaments, pristine, mathematicks, tings, heraldry, weakened (3), warms, rews, affections, enfeebled, vicious, various, flannels, gardeners (2), founds, patts, cadre, nerves, suited, solitary, consumption, quadrant, dispensing, sparkling, burthensome, mischievous, authors, editions",39
1725-1730,"infefts (8), gauging (6), decad (5), angling (4), melon, collier's, quadrupeds, eclipses, decades (3), bamford's, scott's, aloes, lees, elms, mix, plains, dolphins, kitchen, venus, earthy, illustrated (2), manufactures, bullock's, bools, sizes, camphora, corydon, globes, cartilages, dolor, shrubs, greens, farrier's, refractions, simul, principia, bourgs, quarrey, membranes, sulphur, fishes, excretion, drying, compression, copper, jupiter's, temperatures, viscera, wildernesses, plants, meteors, parterre, vines, concave",54
1730-1735,"surgery (8), exotick, vegetation, flowers, gardening (7), statical (6), soils, analyse (5), soil, cultivating (4), anatomy, alphabetical, situations, sowing, plants, copious, flowering, grafting, problems, barometers (3), adapted, descriptions, hygrometers, theory, fruits, gardeners, vegetables, experimental, planting, fruit, sculptures (2), joints, atte, chronological, foils, spherical, staticks, arithmetick, curious, algebra, parti, hieroglyphical",42
1735-1740,"flax (6), soil, soils (5), planting, plants, fibres (4), distilling, herbs, metals, accurate, branches, minerals, roots (3), fruit, climate, habits, variety, telescopes, fibre, manufactures, gardening, modifications, improvements, improvement, vegetables (2), canals, wines, statuaries, paintings, fruits, colleted, sorts, materials, appetites, ascertaining, juice, vigor, growth, distances, extent, dresses, production, magnitudes",43
1740-1745,"planting (8), agriculture (6), draining (4), demons, chymistry, ligaments, capillary, invigorate, plants (3), materia, geography, pharmacy, vegetable, manuring (2), fifties, landa, gardening, tendons, botany, enfeebled, geographical, vegetation, morality, rectitude, strengthens, architecture, soil, philosophy, deduced, corroborate, secular, claudian, theology, altars, vascular, fibres, reformation",37
1745-1750,"vegetables (5), ornaments (3), feeds, rudiments, expand, herbs, sensations (2), substances, structure, vegetation, fishes, ingredient, mercurial, druid, metals, geometry, flowers, texture, antique, exhibiting, humbler",21
1750-1755,"growth (7), fruits, feeds (4), fragrance, fruit, fertile, branches, gems (3), spices, verdure, minerals, boughs, forests, cultivation, marking, wines, flow'rs, cooling, oils, cultivating (2), veins, vines, combine, flowers, flax, plumage, streams, fragrancy, flocks, drains, unfold, yielding, texture, delights, plants, orchard, earths, metals, commodities, climates",40
1755-1760,"madder (10), growth (9), vegetables, vegetation, agriculture (8), herbs, flax, plants, fruit (7), grain, planting (6), foil (5), cultivation, foils, manufacture (4), vegetable, mineral, fermentation, producing (3), curing, juices, minerals, cultivating, produces, circulation, fruits, improvement, husbandry, sowing, scarcity, medicinal, preserving, obstructions (2), nourishment, fluids, plant, salts, drying, ingredients, brewing, moss, salt, food, feed, crop",45
1760-1765,"vegetables (10), cultivation (7), diseases, curing (6), husbandry, growth, minerals, climates, crops (5), noxious, producing, juices, tillage (4), medicinal, nourishment, gradual, improvement, vegetable, grain, seeds (3), breeding, climate, agriculture, foil, flavour, plants, nervous, solids, vegetation, manure, cultivating, scarcity (2), improvements, alkaline, cancers, stocks, indigo, fertility, foils, materials, potatoes, production, acrimony, distempers, hurtful, mineral, manufactures, efficacy, produces",49
1765-1770,"husbandry (10), tillage (9), growth (8), improvement (7), analysis, planting (6), cultivating, rearing, dearness, medical (5), stocks, geography, improvements, vines (4), cultivation, containing, scurvy, catalogue, olives (3), inland, lucerne, drill, navigation, improving, wines, manufacture, chemistry, cancers, anatomy, agriculture, imports, silk (2), indigo, meteorological, managing, navigations, wool, gardening, productions, preventing, mineral, management, diseases, remarks, telescopes, experiments",46


In [14]:
# Neighborhood summaries for the word "history", one row per period.
dfneighbstr.loc['history']

Unnamed: 0_level_0,neighborhood,neighborhood_size
period,Unnamed: 1_level_1,Unnamed: 2_level_1
1720-1725,"divinity (10), burnet's, antiquities, treatise, historical, sermons (9), modern, medals, mathematicks, poetry, plays, memoirs (8), poems (7), voyages (6), languages, edition (5), usefulness, editions, historians, revolutions, numb, theory (4), critical, pamphlets, novels, adventures, burnet, remarks, collection, improv'd (3), geometry, containing, vols, poem (2), ethicks, miscellaneous, familiar, treatises, abridgment, oxon, reformation, appendix, chapters, series",44
1725-1730,"annals (10), historical, collection, treats, abridgment (9), historians, century (8), antiquities, treatise, chronicle, description (7), philosophy, origin, histories, geography (6), philosophical, amphitheater, continuation, critical, japan (5), theology (4), egyptian, transactions, dissertation, series, chronology, geographical (3), explication, travels, system, editions, commentaries, academical, tracts (2), flory, extant, miscellanies, languages, books, book, curiosities, corsica, lorrain, antient, greek",45
1730-1735,"memoirs (10), chronology, historical, chinese, description (9), antiquities, geography, geographical (8), continuation, preface, critical (7), historians, chronological, origin, monuments (6), dictionary, philosophy, collection (5), elements, volume, commentaries (4), summary, literature, treatise, period, manuscript (3), hebrews, contains, treats, histories, century, editions, annals, popes, earliest (2), rollin, fifth, version, translation, septuagint, original",41
1735-1740,"memoirs (10), geography, continuation, historical, critical, treatise, preface, transactions (9), chronology (8), antiquities, dictionary, translation (7), greek (6), philosophical (5), summary, introduction, antiquity, philosophy, eleventh, adventures, compiled, description (4), modem, geographical (3), treats, celebrated, editions, origin, chronological, literary, original, vol, testament, hebrews (2), languages, authentick, tacitus, earliest, index, abridgment, masonry, miscellaneous",42
1740-1745,"memoirs (10), historical, roman, geography, adventures (8), chronology, literary, travels, continuation (7), physic, histories (6), botany, introduction, translation (5), transactions, compendious, mythology, greek, grecian, origin (4), containing, lettres (3), sixth, hebrew, painting, map, philosophical, antiquity, mathematicks, antiquities, sciences, supplement, chinese (2), grammar, anatomy, seventh, testament, fables, normans, latin, historians, septuagint, essays, collection, mathematical, description, cicero, preface",48
1745-1750,"memoirs (10), modern, political, preface (9), description, historical, translation, poetry, greek (8), philosophical, travels, antiquities, literary, philosophy (6), sciences, critical (5), theory, discourses, pamphlet, languages (4), geography, origin, chronology, observations, latin, antient, collection (3), dissertation, celebrated, vindication, poetical, hebrew, academy, authentic (2), physics, fables, roman, divinity, ancient, original, chronological, tragedy, illustrated, philo, universal, grammar, writings, antiquity, page",49
1750-1755,"memoirs (10), translation (9), description (8), chronology, philosophical (7), geography, poem, adventures, millar, antiquities (6), appendix, vindication, vols, xiv (5), historical, version, greek, edition, brief, abridgment (4), critical, chap, political, poetry, hebrew (3), theory, transactions, narrative, continuation, poems, antiquity, antient, latin, collection, preface, page, grammar (2), dramatic, catalogue, treatise, owen, vol, practical, bible, literary, explanation, fragment, xvii, review, comprehending, epic, griffiths",52
1755-1760,"transactions (10), philosophical, narrative, description, vol, memoirs (9), translation, popes (8), literary, vols, historical (6), summary, preface, vindication, edition, political (5), antiquities, treatise (4), continuation, appendix, accurate, dictionary (3), supplement, theory, voltaire, miscellaneous, geography, manuscripts, introduction, tragedy (2), dramatic, review, revolutions, translated, collection, analysis, abridgment, chronological, physic, authentic, xxiii, page, volume",43
1760-1765,"historical (10), origin, memoirs, transactions, description, vol, anecdotes (9), antiquities, modern (7), detail, dictionary (6), geography, chronological, vols, antient, histories (5), chronology, philosophical, continuation, review, narrative (4), introduction, political, annals, medical (3), literature, compendious, translation, tragedy, volume, catalogue, treatise, dissertation, epitome, chinese, vindication (2), analysis, critical, complete, specimens, appendix, miscellaneous, summary, testament, supplement",45
1765-1770,"memoirs (10), antiquities, translation, volume, anecdotes, origin, modern, review (8), philosophical, edition (7), historical, vol, description (6), poems, antient (5), novel, philosophy, geography (4), version, celebrated, chronology, treatise, preface, vols, literary (3), complete, catalogue, annals, narrative, anatomy, lettres, series, translated, fabulous (2), critical, poem, supplement, collection, chapter, controversy, original, appendix, sermons, travels, authors, publication",46
