In [1]:
import pandas as pd
import time
import numpy as np
import multiprocessing as mp
from multiprocessing import Pool
import csv
import fastparquet
import spacy
import fasttext
import glob, os
import re
from os.path import isfile

large_files=['a_','an','of','to','in','ad','wh','be','ha','is','co','wa','he','no','it','wi','fo','re','as','on','we','punctuation','th','ma','pr','ar','ip','sh','ca','so','hi','bu','al','se','de','by','wo','st','fr','di','mo','su','at','or','yo','me','li','pa','do','ex','le','pe','po','if','ne','fi','un','fa','sa','ch','la','lo','ac','ho','mu','go','si','en','ev','tr']

keep_string=r"(.+_(NOUN|ADV|VERB|ADJ|X|PRT|CONJ|PRON|DET|ADP|NUM|\.)|_END_)\s*"
fmodel = fasttext.load_model('/data/dharp/packages/lid.176.bin')
nlp = spacy.load('en_core_web_sm')


nn='(?!NOUN).*'
comp='NOUN\sNOUN'
word='.*'

n1=f'^{comp}\s{nn}\s{comp}$'
n2=f'^{comp}\s{nn}\s{word}\s{word}$'
n3=f'^{nn}\s{comp}\s{nn}\s{word}$'
n4=f'^{word}\s{nn}\s{comp}\s{nn}$'
n5=f'^{word}\s{word}\s{nn}\s{comp}$'


def delist_lang(lst):
    lang_lst=[]
    for i,lang in enumerate(lst):
        if not lang:
            lang_lst.append(None)
        else:
            lang_lst.append(lang[0])
    return lang_lst


def significance(lst):
    significance_list=[]
    for l in lst:
        if len(l)>1:
            significance_list.append(abs(l[0]-l[1])/np.mean(l[0]+l[1])>0.1)
            #print(f'{conf[0]} {conf[1]} {abs(conf[0]-conf[1])/np.mean(conf[0]+conf[1])>0.1}')
        else:
            significance_list.append(True)
    return significance_list

In [46]:
def sent_maker(sent_lst):
    ret_sents=[]
    g_pos=[]
    for sent in sent_lst:
        cur_words=[]
        pos_sent=[]
        for word_pos in sent.split(' '):
            word,pos=word_pos.rsplit('_',1)
            cur_words.append(word)
            pos_sent.append(pos)
            cur_sent=' '.join(cur_words)
            cur_pos=' '.join(pos_sent)
            cur_sent=re.sub('_', '', cur_sent)
        ret_sents.append(cur_sent)
        g_pos.append(cur_pos)
    return ret_sents,g_pos

In [22]:
def ner_lemma_reducer(sent):
    ner_sent=[]
    lemma=[]
    pos=[]
    parse=[]
    ner=[]
    parsed_sent=nlp(sent)
    for token in parsed_sent:
        parse.append(token.text)
        lemma.append(token.lemma_)
        pos.append(token.pos_)

    #print(parse)
    parse_sent=' '.join(parse)
    lemma_sent=' '.join(lemma)
    pos_sent=' '.join(pos)
    if parsed_sent.ents:
        for ent in parsed_sent.ents:
            cur_ner='_'.join([str(ent.start_char), str(ent.end_char), ent.label_])
            ner.append(cur_ner)
    else:
        ner.append('')
    ner_sent=' '.join(ner)
    
    return parse_sent,ner_sent,lemma_sent,pos_sent

In [23]:
def lang_tagger(parsed_sent):
    labels,confs=fmodel.predict(parsed_sent,k=-1,threshold=0.1)
    lang_list=delist_lang(labels)    
    significance_list=significance(confs)
    assert len(lang_list)==len(significance_list)
    return lang_list,significance_list

In [24]:
def trial(df):
    df.reset_index(inplace=True,drop=True)
    df['sent']=sent_maker(df.old_index)
    
    results=np.vectorize(ner_lemma_reducer)(df.sent.values)
    results_df=pd.DataFrame(results)
    results_df=results_df.transpose()
    results_df.columns=['parse_sent','ner_sent','lemma_sent','pos_sent']

    index_df=pd.concat([df,results_df],axis=1,ignore_index=False)
    lang_list,significance_list=lang_tagger(index_df.parse_sent.values.tolist())
    
    index_df['lang']=lang_list
    index_df['lang_conf']=significance_list
    index_df.lang=index_df.lang.str.split('_',n=4).str[-1]
    index_df=index_df.loc[(index_df.lang=='en') &(index_df.lang_conf==True)]
    index_df['nwords']=index_df.pos_sent.str.count(' ').add(1)
    index_df=index_df.loc[index_df.nwords==5]
    index_df.lemma_sent=index_df.lemma_sent.str.lower()
    index_df.pos_sent=index_df.pos_sent.str.replace('PROPN','NOUN')
    index_df['comp_class']=0

    index_df.loc[index_df.pos_sent.str.contains(n1),'comp_class']=1
    index_df.loc[~(index_df.pos_sent.str.contains(n1))& index_df.pos_sent.str.contains(n2),'comp_class']=2
    index_df.loc[index_df.pos_sent.str.contains(n3),'comp_class']=3
    index_df.loc[index_df.pos_sent.str.contains(n4),'comp_class']=4
    index_df.loc[~(index_df.pos_sent.str.contains(n1))& index_df.pos_sent.str.contains(n5),'comp_class']=5

    index_df.drop(['parse_sent','sent','lang','lang_conf','nwords'],axis=1,inplace=True)
    #print(index_df)
    return index_df

In [6]:
def large_df_processor(letter):
    
    CHUNKSIZE = 800_000_000
    num_partitions = 100
    total_df_shape=0
    df_list=[]
    path_loc="http://storage.googleapis.com/books/ngrams/books/googlebooks-eng-all-5gram-20120701-"+letter+".gz"
    dfs   = pd.read_csv(path_loc, compression='gzip', header=None, sep="\t", quoting=csv.QUOTE_NONE,usecols=[0,1,2],chunksize=CHUNKSIZE)    
    for i,df in enumerate(dfs):
        
        print(f'Split num {i+1}')        
        cur_time=time.time()
        df.columns=['fivegram_pos','year','count']
        df=df.loc[df.year>=1800]
        index_df=df.groupby(['fivegram_pos'])['count'].sum().reset_index()
        index_df.columns=['old_index','total_count']
        index_df=index_df.loc[index_df.old_index.str.match("^"+keep_string*5+"$",na=False)]

        df_split = np.array_split(index_df, num_partitions)
        pool = Pool(num_partitions)
        print('Started parallelization')
        results=pool.map_async(trial,df_split)
        pool.close()
        pool.join()
        
        curr_df_list=results.get()
        #df_list.extend(curr_df_list)
        index_df=pd.concat(curr_df_list,ignore_index=True)
        print(f'Total time taken for split num {i+1}: {round(time.time()-cur_time)} secs')        

        ntypes=index_df.shape[0]
        ntokens=index_df.total_count.sum()

        types_perc=round(ntypes/df.shape[0]*100,3)
        print(f'Number of types: {ntypes}, perc. of unique types (decade agnostic): {types_perc}%')

        print(f'Number of tokens: {ntokens}, ratio of tokens to types: {round(ntokens/ntypes,3)}')

        ncomptypes=np.sum(index_df.comp_class!=0)
        ncomptypes_perc=round(ncomptypes/ntypes*100,3)
        print(f'Number of compounds types: {ncomptypes}, perc. of compound types: {ncomptypes_perc}%')

        comp_count=index_df.loc[index_df.comp_class!=0,'total_count'].sum()
        comp_count_perc=round(comp_count/ntokens*100,3)
        print(f'Compound count: {comp_count}, perc. of compound tokens: {comp_count_perc}%')

        words_df=index_df.loc[index_df.pos_sent.str.contains('NOUN')].reset_index(drop=True)
        words_df['nner']=words_df.ner_sent.str.count(' ').add(1)
        words_df['nX']=words_df.pos_sent.str.count('X')-words_df.pos_sent.str.count('AUX')
        words_df=words_df.loc[~(words_df.nX>=3)]
        words_df=words_df.loc[words_df.nner<2]        

        words=pd.merge(df,words_df,left_on='fivegram_pos',right_on='old_index',how='right')
        words=words.groupby(['lemma_sent','year','pos_sent','comp_class','ner_sent'])['count'].sum().to_frame()
        words.reset_index(inplace=True)

        words.to_pickle(f'/data/dharp/compounds/datasets/google/{letter}{i+1}.pkl')
        #phrases_df=words_df.loc[words_df.pos_sent.str.contains('NOUN NOUN')].reset_index(drop=True)
        #phrases=pd.merge(df,phrases_df,left_on='fivegram_pos',right_on='old_index',how='right')
        #phrases=phrases.groupby(['lemma_sent','year','pos_sent','comp_class','ner_sent'])['count'].sum().to_frame()
        #phrases.reset_index(inplace=True)

        #comp_df=phrases_df.loc[phrases_df.comp_class!=0].reset_index(drop=True)
        #compounds=pd.merge(df,comp_df,left_on='fivegram_pos',right_on='old_index',how='right')
        #compounds=compounds.groupby(['lemma_sent','year','pos_sent','comp_class','ner_sent'])['count'].sum().to_frame()
        #compounds.reset_index(inplace=True)

        print(f'Total time taken for letter {letter}: {round(time.time()-cur_time)} secs')
        with open(f'/data/dharp/compounds/datasets/stats/{letter}{i+1}.txt','w') as f:
            f.write(f'{letter}\t{i+1}\t{ntypes}\t{ntokens}\t{ncomptypes}\t{comp_count}\n')

In [52]:
large_df_processor('za')

Split num 1
Started parallelization
Total time taken for split num 1: 3 secs
Number of types: 15817, perc. of unique types (decade agnostic): 2.602%
Number of tokens: 1901799, ratio of tokens to types: 120.238
Number of compounds types: 3817, perc. of compound types: 24.132%
Compound count: 464731, perc. of compound tokens: 24.436%
Total time taken for letter za: 4 secs


In [81]:
        num_partitions=200
        i=0
        path_loc="http://storage.googleapis.com/books/ngrams/books/googlebooks-eng-all-5gram-20120701-za.gz"
        df   = pd.read_csv(path_loc, compression='gzip', header=None, sep="\t", quoting=csv.QUOTE_NONE,usecols=[0,1,2])    
        cur_time=time.time()
        df.columns=['fivegram_pos','year','count']
        df=df.loc[df.year>=1800]
        index_df=df.groupby(['fivegram_pos'])['count'].sum().reset_index()
        index_df.columns=['old_index','total_count']
        index_df=index_df.loc[index_df.old_index.str.match("^"+keep_string*5+"$",na=False)]
        df=index_df.copy()

In [82]:
    df.reset_index(inplace=True,drop=True)
    ret_lst=sent_maker(df.old_index)
    df['sent']=ret_lsts[0]
    df['g_pos']=ret_lsts[1]
    
    results=np.vectorize(ner_lemma_reducer)(df.sent.values)
    results_df=pd.DataFrame(results)
    results_df=results_df.transpose()
    results_df.columns=['parse_sent','ner_sent','lemma_sent','pos_sent']

In [83]:

    index_df=pd.concat([df,results_df],axis=1,ignore_index=False)
    lang_list,significance_list=lang_tagger(index_df.parse_sent.values.tolist())

In [84]:
    index_df['lang']=lang_list
    index_df['lang_conf']=significance_list
    index_df.lang=index_df.lang.str.split('_',n=4).str[-1]
    index_df=index_df.loc[(index_df.lang=='en') &(index_df.lang_conf==True)]
    index_df['nwords']=index_df.pos_sent.str.count(' ').add(1)
    index_df=index_df.loc[index_df.nwords==5]
    index_df.lemma_sent=index_df.lemma_sent.str.lower()
    index_df.pos_sent=index_df.pos_sent.str.replace('PROPN','NOUN')
    index_df.pos_sent=index_df.pos_sent.str.replace('AUX','VERB')
    index_df.g_pos=index_df.g_pos.str.replace('.','PUNCT',regex=False)

    index_df['comp_class']=0
    index_df.loc[index_df.pos_sent.str.contains(n1),'comp_class']=1
    index_df.loc[~(index_df.pos_sent.str.contains(n1))& index_df.pos_sent.str.contains(n2),'comp_class']=2
    index_df.loc[index_df.pos_sent.str.contains(n3),'comp_class']=3
    index_df.loc[index_df.pos_sent.str.contains(n4),'comp_class']=4
    index_df.loc[~(index_df.pos_sent.str.contains(n1))& index_df.pos_sent.str.contains(n5),'comp_class']=5

    #index_df.drop(['parse_sent','sent','lang','lang_conf','nwords'],axis=1,inplace=True)

In [107]:
index_df.loc[index_df.sent.str.contains('END')][:1].g_pos.to_list()

['NOUN ADP NOUN PUNCT ']

In [12]:
df_split = np.array_split(index_df, num_partitions)
        pool = Pool(num_partitions)
        print('Started parallelization')
        results=pool.map_async(trial,df_split)
        pool.close()
        pool.join()
        
        curr_df_list=results.get()
        #df_list.extend(curr_df_list)
        index_df=pd.concat(curr_df_list,ignore_index=True)
        print(f'Total time taken for split num {i+1}: {round(time.time()-cur_time)} secs')        

        ntypes=index_df.shape[0]
        ntokens=index_df.total_count.sum()

        types_perc=round(ntypes/df.shape[0]*100,3)
        print(f'Number of types: {ntypes}, perc. of unique types (decade agnostic): {types_perc}%')

        print(f'Number of tokens: {ntokens}, ratio of tokens to types: {round(ntokens/ntypes,3)}')

        ncomptypes=np.sum(index_df.comp_class!=0)
        ncomptypes_perc=round(ncomptypes/ntypes*100,3)
        print(f'Number of compounds types: {ncomptypes}, perc. of compound types: {ncomptypes_perc}%')

        comp_count=index_df.loc[index_df.comp_class!=0,'total_count'].sum()
        comp_count_perc=round(comp_count/ntokens*100,3)
        print(f'Compound count: {comp_count}, perc. of compound tokens: {comp_count_perc}%')

        words_df=index_df.loc[index_df.pos_sent.str.contains('NOUN')].reset_index(drop=True)
        words_df['nner']=words_df.ner_sent.str.count(' ').add(1)
        words_df['nX']=words_df.pos_sent.str.count('X')-words_df.pos_sent.str.count('AUX')
        words_df=words_df.loc[~(words_df.nX>=3)]
        words_df=words_df.loc[words_df.nner<2]        

        words=pd.merge(df,words_df,left_on='fivegram_pos',right_on='old_index',how='right')
        words=words.groupby(['lemma_sent','year','pos_sent','comp_class','ner_sent'])['count'].sum().to_frame()
        words.reset_index(inplace=True)

Unnamed: 0,old_index,total_count,ner_sent,lemma_sent,pos_sent,comp_class,nner,nX
1,"Z.A.B._NOUN Zeman_NOUN ,_. The_DET Break_ADJ",46,0_12_PERSON,"z.a.b. zeman , the break",NOUN NOUN PUNCT DET NOUN,2,1,0
2,"Z.A._NOUN ,_. The_DET Myth_NOUN of_ADP",49,0_4_GPE,"z.a. , the myth of",NOUN PUNCT DET NOUN ADP,0,1,0
3,ZABC_NOUN is_VERB a_DET right_ADJ angle_NOUN,61,0_4_ORG,zabc be a right angle,NOUN AUX DET ADJ NOUN,0,1,0
4,ZABEL_ADJ -_. Concerto_NOUN for_ADP Harp_NOUN,56,0_16_PERSON,zabel - concerto for harp,NOUN PUNCT NOUN ADP NOUN,0,1,0
5,"ZABEL_NOUN ,_. Editor_NOUN :_. Literary_ADJ",103,0_5_ORG,"zabel , editor : literary",NOUN PUNCT NOUN PUNCT ADJ,0,1,0
...,...,...,...,...,...,...,...,...
8337,"zazen_NOUN ,_. or_CONJ sitting_VERB meditation...",85,,"zazen , or sit meditation",NOUN PUNCT CCONJ VERB NOUN,0,1,0
8338,zazen_NOUN I_PRON speak_VERB of_ADP is_VERB,50,,zazen -pron- speak of be,NOUN PRON VERB ADP AUX,0,1,0
8339,"zazen_VERB ,_. in_ADP a_DET brisk_ADJ",42,0_5_CARDINAL,"zazen , in a brisk",NOUN PUNCT ADP DET ADJ,0,1,0
8341,"zazzera_NOUN ,_. which_DET is_VERB as_ADV",65,0_7_ORG,"zazzera , which be as",NOUN PUNCT DET AUX SCONJ,0,1,0
