# Freqs

In [1]:
from ipynb.fs.full.koselleck import *

In [2]:
def get_freqs(fnfn_json, ok_words=None, only_valid_words=True):
    """Load a word->count JSON file as a Series sorted by descending count.

    Parameters
    ----------
    fnfn_json : path to a JSON file holding a ``{word: count}`` object.
    ok_words : optional iterable of words to keep; ``None`` means no
        caller-side restriction.
    only_valid_words : when true, additionally restrict the result to
        the words returned by ``get_valid_words()``.

    Returns
    -------
    pandas.Series indexed by word, sorted by count (descending).  An empty
    Series is returned when the file does not exist.

    Both filters are honoured together: with ``only_valid_words=True`` an
    explicit ``ok_words`` is intersected with the valid-word list rather
    than being silently replaced by it.

    NOTE: a later cell (In [81]) defines another ``get_freqs`` with a
    different signature; once that cell has run, this loader is shadowed.
    """
    if not os.path.exists(fnfn_json): return pd.Series()
    # JSON text is UTF-8 by specification, so do not rely on the locale default.
    with open(fnfn_json, encoding='utf-8') as f: freqs=json.load(f)
    # Normalise to a set once so the membership test below is O(1).
    if ok_words is not None: ok_words=set(ok_words)
    if only_valid_words:
        valid=set(get_valid_words())
        ok_words=valid if ok_words is None else ok_words & valid
    return pd.Series({
        w:c
        for w,c in freqs.items()
        if ok_words is None or w in ok_words
    }).sort_values(ascending=False)

In [3]:

def gen_freqs(C=None,only_valid_words=True,num_proc=1,force=False):
    """Compute per-text frequencies for a corpus and store them in the 'freqs' db.

    C                 corpus object; defaults to ``get_corpus()``.
    only_valid_words  forwarded to the worker and encoded in the db key
                      ("onlyvalidwords" vs "allwords").
    num_proc          forwarded to ``pmap_iter``.
    force             when true, recompute texts whose key is already stored.

    Results are written under ``<corpus id>__<text id>__<suffix>`` and the
    db is committed after every tenth result and once more at the end.
    """
    if C is None:
        C = get_corpus()

    suffix = "onlyvalidwords" if only_valid_words else "allwords"

    def db_key(text):
        return f'{text.corpus.id}__{text.id}__{suffix}'

    with get_db('freqs','c') as db:
        # Queue only the texts that still need work (or all of them if forced).
        pending = []
        for text in C.texts():
            if force or db_key(text) not in db:
                pending.append((text.id, text.path_freqs))

        results = pmap_iter(
            gen_freqs_,
            pending,
            kwargs={'only_valid_words': only_valid_words},
            num_proc=num_proc
        )
        for n_done, (text_id, freqs) in enumerate(results):
            db[db_key(C.textd[text_id])] = freqs
            # Periodic commit so an interrupted run keeps most of its work.
            if n_done and n_done % 10 == 0:
                db.commit()
        db.commit()

def gen_freqs_(obj,*x,**y):
    """Worker for ``gen_freqs``: unpack ``(id, path)`` and load that path's freqs.

    Extra positional/keyword arguments are passed straight to ``get_freqs``.
    Returns ``(id, freqs)`` so the caller can match results back to texts.
    """
    text_id, path = obj
    freqs = get_freqs(path, *x, **y)
    return (text_id, freqs)
    

In [4]:
def get_freqs_byyear(fnfn_skip, ok_words=None, only_valid_words=True):
    """Count the tokens of a skipgram text file.

    Parameters
    ----------
    fnfn_skip : path to a text file; every line is tokenised with
        ``tokenize_fast``.
    ok_words : optional iterable of words to keep; ``None`` means no
        caller-side restriction.
    only_valid_words : when true, additionally restrict the result to
        the words returned by ``get_valid_words()``.

    Returns
    -------
    list of ``(word, count)`` tuples with ``count`` as a plain ``int``.
    A missing file yields an empty list, so the return type is the same on
    every path.

    Both filters are honoured together: with ``only_valid_words=True`` an
    explicit ``ok_words`` is intersected with the valid-word list rather
    than being silently replaced by it.
    """
    if not os.path.exists(fnfn_skip): return []
    freqs=Counter()
    with open(fnfn_skip) as f:
        for ln in f:
            for w in tokenize_fast(ln):
                freqs[w]+=1
    # Normalise to a set once so the membership test below is O(1).
    if ok_words is not None: ok_words=set(ok_words)
    if only_valid_words:
        valid=set(get_valid_words())
        ok_words=valid if ok_words is None else ok_words & valid
    return [(w,int(c)) for w,c in freqs.items() if ok_words is None or w in ok_words]

# #     return pd.Series(dict(
# #         (w,c)
# #         for w,c in freqs.items()
# #         if ok_words is None or w in ok_words
# #     )).sort_values(ascending=False)
#     return 


In [5]:
# get_freqs_byyear(get_skipgrams().iloc[0].path)

In [6]:
# get_skipgrams()

In [23]:
def _merge_year_count(prev, year, count):
    """Return ``prev``'s (year, count) pairs with ``year`` set to ``count``.

    ``prev`` is whatever is stored for the word: ``None`` (nothing yet), a
    list of ``(year, count)`` pairs, or the year-indexed Series that
    ``gen_totals()`` writes back in place of such a list.  The result is a
    list of pairs sorted by year with at most one entry per year.
    """
    if prev is None:
        by_year={}
    elif isinstance(prev, pd.Series):
        # list(Series) would yield the counts only and lose the years.
        by_year={int(y):v for y,v in prev.items()}
    else:
        by_year={int(y):v for y,v in prev}
    by_year[int(year)]=count
    return sorted(by_year.items())

def gen_freqs_byyear(only_valid_words=True,num_proc=1,force=False):
    """Count words per corpus/year skipgram file and store them in the 'freqs' db.

    For every row of ``get_skipgrams()`` the worker result is stored under
    ``<corpus>/<year>``, and each word's count for that year is merged into
    the list stored under ``<corpus>/<word>``.  The db is committed after
    every tenth row and at the end; ``gen_totals()`` is then run.

    only_valid_words  forwarded to the worker.
    num_proc          forwarded to ``pmap_iter``.
    force             accepted but currently not used by this function.

    Re-running is safe: a year already present for a word is replaced
    instead of appended again, and entries that ``gen_totals()`` has
    converted to Series are read back with their years intact.
    """
    dfskip=get_skipgrams()
    #def to_qstr(t): return f'{t.corpus.id}__{t.id}__{"onlyvalidwords" if only_valid_words else "allwords"}'
    # .assign returns a new frame, so the one handed out by get_skipgrams() is left untouched.
    dfskip=dfskip.assign(qstr=[f'{c}/{y}' for c,y in zip(dfskip.corpus,dfskip.year)])
    with get_db('freqs','c') as db:
        objs=dfskip.to_dict('records')
        iterr=pmap_iter(
            gen_freqs_by_year_,
            objs,
            kwargs=dict(only_valid_words=only_valid_words),
            num_proc=num_proc
        )
        for i,(objd,s) in enumerate(iterr):
            db[objd['qstr']]=s
            for w,c in s:
                wqstr=f'{objd["corpus"]}/{w}'
                db[wqstr]=_merge_year_count(db.get(wqstr),objd['year'],c)
            if i and not i%10: db.commit()
        db.commit()

    ## gen totals
    gen_totals()

def gen_freqs_by_year_(objd, **opts):
    """Worker for ``gen_freqs_byyear``: count words in the file at ``objd['path']``.

    Keyword options are passed on to ``get_freqs_byyear``.  The input record
    is returned alongside the counts so the caller can tell which row the
    result belongs to.
    """
    counts = get_freqs_byyear(objd['path'], **opts)
    return (objd, counts)
    
def gen_totals():
    """Store, for every key in the 'freqs' db, the sum of its value under ``key + '_'``.

    Keys that already end in '_' are skipped.  Values stored as lists are
    first converted to a Series (via ``dict``) and written back in that form.
    The db is committed on every tenth processed position and at the end.
    """
    with get_db('freqs','c') as db:
        progress = tqdm(sorted(db.keys()))
        for n, key in enumerate(progress):
            progress.set_description(key)
            if not key.endswith('_'):
                value = db[key]
                if type(value) is list:
                    value = pd.Series(dict(value))
                    db[key] = value
                db[key + '_'] = value.sum()
                if n and n % 10 == 0:
                    db.commit()
        db.commit()
            

In [46]:
# Recompute the '<key>_' totals for every entry currently in the 'freqs' db.
gen_totals()

bpo/zulu: 100%|██████████| 15838/15838 [00:59<00:00, 265.41it/s]            


In [81]:
def to_qstr_freqs_word(word,corpus):
    """Return the lookup string for a word within a corpus: ``'<corpus>/<word>'``."""
    return '{}/{}'.format(corpus, word)
def to_qstr_freqs_year(year,corpus):
    """Return the lookup string for a year within a corpus: ``'<corpus>/<year>'``."""
    return '{}/{}'.format(corpus, year)

def get_freqs(words=None,corpus=DEFAULT_CORPUS,tf=False,fpm=False):
    """Read per-word entries from the 'freqs' db into one DataFrame.

    words   words to look up; when empty/None ``get_keywords()`` is used,
            otherwise the value is passed through ``to_words``.
    corpus  corpus id used to build the db keys.
    tf      divide each row by that row's stored total.
    fpm     as ``tf``, then multiply by 1,000,000.

    Words without a db entry are left out.  Columns are ordered by
    descending column sum, a ``_total`` column holding the per-row totals is
    appended, and missing values are filled with 0.

    NOTE: this reuses the name of the JSON-file loader ``get_freqs`` defined
    in an earlier cell and replaces it once this cell has run.
    """
    if words:
        words = to_words(words)
    else:
        words = get_keywords()
    keys = [to_qstr_freqs_word(w, corpus) for w in words]

    with get_db('freqs') as db:
        columns = {}
        for w, key in tqdm(zip(words, keys), total=len(words)):
            if key in db:
                columns[w] = db[key]
        odf = pd.DataFrame(columns)

        # One stored total per row label; the '_' suffix marks total entries.
        totals = [db.get(to_qstr_freqs_year(y, corpus) + '_') for y in odf.index]

        if tf or fpm:
            odf = odf.div(totals, axis=0)
            if fpm:
                odf *= 1000000

        ranked = sorted(odf.columns, key=lambda c: -odf[c].sum())
        odf = odf[ranked]
        odf['_total'] = totals
        return odf.fillna(0)
    
def get_dffreqs(key='_dffreqs_',force=False):
    """Return the frequency table for all valid words, cached in the 'freqs' db.

    key    db key under which the table is cached.
    force  when true, skip the cache lookup and recompute.

    On a cache miss (or when forced) the table is built with
    ``get_freqs(get_valid_words(), fpm=True)``, stored under ``key`` and
    returned.
    """
    if not force:
        with get_db('freqs') as db:
            if key in db:
                return db[key]
    odf=get_freqs(get_valid_words(),fpm=True)
    # Open with 'c' and commit explicitly, exactly like every other writer
    # in this file.
    # NOTE(review): if get_db wraps sqlitedict, flag 'w' would clear the
    # table before writing and an uncommitted write could be dropped on
    # close -- confirm against get_db.
    with get_db('freqs','c') as db:
        db[key]=odf
        db.commit()
    return odf

In [83]:
# Show the frequency table, using the copy cached in the 'freqs' db when present.
get_dffreqs(force=False)

Unnamed: 0,house,ill,long,general,fame,lie,church,author,late,tile,...,phonetic,sauntered,ratepayers,telegraphed,protectionist,amo,doin,cafe,dut,_total
1681,2241.927388,3346.160281,3379.621884,568.847248,6491.550945,3212.313870,10138.865652,836.540070,4985.778819,2509.620211,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,29885
1682,1031.628211,5434.919357,3723.926226,1056.789875,5510.404348,5560.727675,4378.129482,6265.254259,2818.106333,4352.967818,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,39743
1683,1542.168675,3373.493976,3855.421687,433.734940,5349.397590,4530.120482,9542.168675,1156.626506,4626.506024,3759.036145,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,20750
1684,1008.902077,5934.718101,3382.789318,1008.902077,2848.664688,7181.008902,11988.130564,1305.637982,3501.483680,5103.857567,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,16850
1685,1039.392994,4365.450577,2650.452136,1455.150192,2442.573537,3533.936181,15538.925268,675.605446,5612.722170,5404.843571,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,19242
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1967,1266.587137,206.188604,2430.079972,1089.854048,29.455515,58.911030,73.638787,29.455515,427.104965,117.822059,...,0.0,0.0,0.0,0.0,14.727757,0.0,0.0,0.0,0.0,67899
1968,1430.871043,127.756343,2197.409101,677.108618,0.000000,63.878172,127.756343,89.429440,600.454813,89.429440,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,78274
1969,1909.733274,178.241772,2266.216818,942.135082,0.000000,89.120886,76.389331,25.463110,598.383092,25.463110,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,78545
1970,1427.580862,108.768066,2284.129380,1019.700616,0.000000,122.364074,271.920164,122.364074,652.608394,13.596008,...,0.0,0.0,0.0,0.0,13.596008,0.0,0.0,0.0,0.0,73551


In [17]:
# gen_freqs_byyear(num_proc=4)

In [21]:
# with get_db('freqs') as db:
#     print(db['bpo/virtue'])

In [19]:
# !ls -ltrh ../db/

total 288G
-rw-r--r-- 1 ryan ryan  12K Jun 25 09:15 db.kos2.testing.sqlite
-rw-r--r-- 1 ryan ryan 167M Jun 26 17:53 db.kos2.distmat.sqlite
-rw-r--r-- 1 ryan ryan 6.6G Jun 27 18:11 db.kos2.vecs.sqlite
-rw-r--r-- 1 ryan ryan 272G Jun 27 19:21 db.kos2.wdists.sqlite
-rw-r--r-- 1 ryan ryan 505M Jun 28 00:04 db.kos2.nov.sqlite
-rw-r--r-- 1 ryan ryan  12M Jun 28 08:54 db.kos2.sents.sqlite
-rw-r--r-- 1 ryan ryan 908K Jun 28 13:12 db.kos2.mfw.sqlite
-rw-r--r-- 1 ryan ryan  18M Jun 28 13:15 db.kos2.wpos.sqlite
-rw-r--r-- 1 ryan ryan 2.7G Jun 28 15:39 db.kos2.freqs_bytext.sqlite
-rw-r--r-- 1 ryan ryan  94M Jun 28 17:16 db.kos2.freqs.sqlite
-rw-r--r-- 1 ryan ryan  61M Jun 28 17:24 db.kos2.corrs.sqlite
-rw-r--r-- 1 ryan ryan  41K Jun 28 17:24 db.kos2.corrs.sqlite-journal
-rw-r--r-- 1 ryan ryan 6.0G Jun 28 17:25 db.kos2.ldist.sqlite


In [10]:
# len(db_get_keys('freqs'))

In [11]:
# !ls -ltrh ../db/

In [12]:
# with get_db('freqs') as db:
#     keys=list(db.keys())
#     df=pd.DataFrame(dict(
#         (idx.split('__')[1],db[idx])
#         for idx in tqdm(keys,desc='Loading freq data')
#         if idx.count('__')==2
#     )).T.fillna(0).applymap(int)

In [13]:
# df

In [14]:
def do_gen_freqs_byword(word):
    """Collect ``word``'s value from every Series stored in the 'freqs' db.

    Returns a Series mapping db key -> ``s[word]`` for each stored Series
    ``s`` whose index contains ``word``.

    Only Series values are inspected: other functions in this file also
    store lists, scalar totals and a DataFrame in the same db, and those
    have no usable ``.index`` for this lookup.
    """
    with get_db('freqs') as db:
        s_out = pd.Series({
            idx: s[word]
            for idx,s in tqdm(db.items())
            if isinstance(s, pd.Series) and word in s.index
        })
        return s_out
            

In [15]:
# do_gen_freqs_byword('virtue')

In [16]:
# def gen_freqs_byword():
#     with get_db('freqs','c') as db:
#         for idx,s in db.items():
            