In [3]:
%matplotlib inline

In [4]:
import pandas as pd
import fasttext
import time
import numpy as np
import multiprocessing as mp
from multiprocessing import Pool
import csv
import spacy
import re

from nltk.tokenize.treebank import TreebankWordDetokenizer as Detok
detokenizer = Detok()

In [5]:
# Output directory for generated datasets.
to_save_path='/data/dharp/compounds/datasets/'

# Patterns for ngram tokens that carry a Google-style "_POS" suffix or are bare POS placeholders.
keep_string=r"(.+_(NOUN|ADV|VERB|ADJ|X|PRT|CONJ|PRON|DET|ADP|NUM|\.)|_END_|_START_)\s*"
try_keep_string=r"(.+_(NOUN|ADV|VERB|ADJ|X|PRT|CONJ|PRON|DET|ADP|NUM|\.)|_NOUN_|_ADV_|_VERB_|_ADJ_|_X_|_PRT_|_CONJ_|_PRON_|_DET_|_ADP_|_NUM_|_\._)"

# Building blocks for the POS-sequence patterns below. All regex fragments are
# raw strings so that `\s` is not treated as an (invalid) string escape.
word=r'.*'                                  # anything
nn=r'(?!(?:NOUN|PROPN)).*'                  # anything that does not start with NOUN/PROPN
nn_comp=r'(?:NOUN|PROPN)\s(?:NOUN|PROPN)'   # two adjacent nominal tags
an_comp=r'ADJ\s(?:NOUN|PROPN)'              # adjective followed by a nominal tag

ner_cats=['CARDINAL', 'DATE', 'EVENT', 'FAC', 'GPE', 'LANGUAGE', 'LAW', 'LOC', 'MONEY', 'NORP', 'ORDINAL', 'ORG', 'PERCENT', 'PERSON', 'PRODUCT', 'QUANTITY', 'TIME', 'WORK_OF_ART']

# Noun-noun patterns, distinguished by where the pair sits in the tag sequence.
n1=rf'^{nn_comp}\s{nn}\s{nn_comp}$'
n2=rf'^{nn_comp}\s{nn}\s{word}\s{word}$'
n3=rf'^{nn}\s{nn_comp}\s{nn}\s{word}$'
n4=rf'^{word}\s{nn}\s{nn_comp}\s{nn}$'
n5=rf'^{word}\s{word}\s{nn}\s{nn_comp}$'

# Adjective-noun patterns, same layout as n1..n5.
a1=rf'^{an_comp}\s{nn}\s{an_comp}$'
a2=rf'^{an_comp}\s{nn}\s{word}\s{word}$'
a3=rf'^{nn}\s{an_comp}\s{nn}\s{word}$'
a4=rf'^{word}\s{nn}\s{an_comp}\s{nn}$'
a5=rf'^{word}\s{word}\s{nn}\s{an_comp}$'

# Mixed patterns: one noun-noun pair and one adjective-noun pair.
c1=rf'^{nn_comp}\s{nn}\s{an_comp}$'
c2=rf'^{an_comp}\s{nn}\s{nn_comp}$'


In [6]:
# fastText model queried by lang_tagger (via fmodel.predict) to label each sentence.
fmodel = fasttext.load_model('/data/dharp/packages/lid.176.bin')
# spaCy pipeline called by ner_lemma_reducer for lemmas, POS tags, dependency labels and entity types.
nlp = spacy.load('en_core_web_lg')



In [7]:
def ner_lemma_reducer(sent):
    """Parse `sent` with the module-level spaCy pipeline `nlp` and summarise it.

    Parameters
    ----------
    sent : str
        Sentence (ngram text) to parse.

    Returns
    -------
    tuple
        (lemma_sent, pos_sent, num_count, comp_ner_sent) where
        - lemma_sent: space-joined token lemmas, padded with "eos" up to 5 tokens;
        - pos_sent: space-joined coarse POS tags, padded with "X" up to 5 tokens;
        - num_count: number of "compound" dependency labels that are followed by
          whitespace and then a label that does not start with "compound"
          (a "compound" label at the very end of the sentence is therefore not counted);
        - comp_ner_sent: space-joined entity types of tokens whose dependency
          label is "compound" and whose entity type is non-empty.
    """
    parsed_sent=nlp(sent)

    lemma=[token.lemma_ for token in parsed_sent]
    pos=[token.pos_ for token in parsed_sent]
    dep=[token.dep_ for token in parsed_sent]
    comp_ner_type=[
        token.ent_type_
        for token in parsed_sent
        if token.dep_=="compound" and token.ent_type_!=""
    ]

    # Pad short parses so lemma/pos always describe at least 5 positions.
    # The dependency labels are intentionally left unpadded.
    n_missing=5-len(parsed_sent)
    if n_missing>0:
        lemma.extend(["eos"]*n_missing)
        pos.extend(["X"]*n_missing)

    comp_ner_sent=' '.join(comp_ner_type)
    lemma_sent=' '.join(lemma)
    pos_sent=' '.join(pos)
    dep_sent=' '.join(dep)

    # Consecutive "compound compound ..." labels count once: only the last
    # "compound" of a run (the one not followed by another) matches.
    num_count=len(re.findall(r"compound\s(?!compound)", dep_sent))

    return lemma_sent,pos_sent,num_count,comp_ner_sent

In [8]:
def delist_lang(lst):
    """Pick the first label of every prediction, or None where there is none.

    `lst` is a sequence of label sequences; the result has one entry per
    input sequence, in the same order.
    """
    return [labels[0] if labels else None for labels in lst]


def significance(lst):
    """Flag, per prediction, whether the top label clearly beats the runner-up.

    For an entry with at least two confidences the flag is
    ``abs(c0 - c1) / np.mean(c0 + c1) > 0.1``; entries with fewer than two
    confidences are always flagged True.
    """
    def _clearly_ahead(confs):
        if len(confs) <= 1:
            return True
        best, second = confs[0], confs[1]
        # NOTE(review): if the confidences are scalars, np.mean of their sum is
        # just the sum, so the gap is normalised by (c0 + c1), not their average.
        # Confirm whether that was intended before changing the threshold.
        return abs(best - second) / np.mean(best + second) > 0.1

    return [_clearly_ahead(confs) for confs in lst]

In [9]:
def lang_tagger(parsed_sent):
    """Predict labels for `parsed_sent` with `fmodel` and summarise them.

    Returns a pair of equally long lists: the first predicted label per input
    (None when nothing was returned for it) and the matching flag computed by
    `significance` from the prediction confidences.
    """
    # k=-1 with threshold=0.1 is passed straight through to fmodel.predict.
    predicted_labels, predicted_confs = fmodel.predict(parsed_sent, k=-1, threshold=0.1)
    top_langs = delist_lang(predicted_labels)
    clear_flags = significance(predicted_confs)
    assert len(top_langs) == len(clear_flags)
    return top_langs, clear_flags

In [10]:
def index_processor(df):
    """Convert raw ngram rows into per-year counts of lemma/POS-tagged 5-token sequences.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs an ``old_index`` column (space-separated tokens) and a
        ``year_counts`` column (consumed by ``year_count_split``). The frame
        passed in is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per (lemma_pos, year, comp_class, is_comp, comp_ner_sent) with
        the summed ``count``. An empty DataFrame is returned whenever no rows
        are left at any filtering stage.
    """
    # np.vectorize cannot be called on size-0 input, so bail out early.
    if df.shape[0]==0:
        return pd.DataFrame()

    # Work on a private copy so the caller's frame is left untouched.
    df=df.copy()

    df['sent']=np.vectorize(detokenizer.detokenize)(df.old_index.str.split(" ").values)
    # NOTE(review): with regex=False the next three patterns are searched as
    # literal text (backslashes included), so they are effectively no-ops.
    # Switching to regex=True would also rewrite text such as "3.5" or "1,000";
    # confirm the intended normalisation before changing this.
    df['sent']=df.sent.str.replace(r'\s*,\s*',', ',regex=False)
    df['sent']=df.sent.str.replace(r'\s*\.\s*','. ',regex=False)
    df['sent']=df.sent.str.replace(r'\s*\?\s*','? ',regex=False)
    df['sent']=df.sent.str.replace('__',' ',regex=False)

    df['sent']=df.sent.str.replace('_START_ ','',regex=False)
    df['sent']=df.sent.str.replace(' _END_','',regex=False)

    lang_list,significance_list=lang_tagger(df.sent.values.tolist())
    df['lang']=lang_list
    df['lang_conf']=significance_list
    # Keep only the text after the last of (at most) four underscores in the label.
    df['lang']=df.lang.str.split('_',n=4).str[-1]

    # Explicit copies after each filter replace the former global
    # `chained_assignment` toggling.
    df=df.loc[(df.lang=='en') &(df.lang_conf==True)].copy()
    if df.shape[0]==0:
        return pd.DataFrame()

    lemma_sent,pos_sent,comp_count,comp_ner_sent=np.vectorize(ner_lemma_reducer)(df.sent.values)
    df['lemma_sent']=lemma_sent
    df['pos_sent']=pos_sent
    df['comp_count']=comp_count
    df['comp_ner_sent']=comp_ner_sent
    df['is_comp']=df.comp_count!=0

    # Keep rows whose POS string has exactly five space-separated tags.
    df['nwords']=df.pos_sent.str.count(' ').add(1)
    df=df.loc[df.nwords==5].copy()
    if df.shape[0]==0:
        return pd.DataFrame()

    df['lemma_sent']=df.lemma_sent.str.lower()
    df['lemma_pos']=str_joiner(df)

    # 'AUX' contains an 'X', so subtract it to count stand-alone X tags only;
    # rows consisting of five X tags are discarded.
    df['nX']=df.pos_sent.str.count('X')-df.pos_sent.str.count('AUX')
    df=df.loc[~(df.nX==5)].copy()
    if df.shape[0]==0:
        return pd.DataFrame()

    # Later assignments override earlier ones, so the order below matters.
    pos_tags=df.pos_sent
    matches_n1=pos_tags.str.contains(n1)
    matches_a1=pos_tags.str.contains(a1)

    df['comp_class']=0

    df.loc[matches_n1,'comp_class']=1
    df.loc[~matches_n1 & pos_tags.str.contains(n2),'comp_class']=2
    df.loc[pos_tags.str.contains(n3),'comp_class']=3
    df.loc[pos_tags.str.contains(n4),'comp_class']=4
    df.loc[~matches_n1 & pos_tags.str.contains(n5),'comp_class']=5

    df.loc[matches_a1,'comp_class']=6
    df.loc[~matches_a1 & pos_tags.str.contains(a2),'comp_class']=7
    df.loc[pos_tags.str.contains(a3),'comp_class']=8
    df.loc[pos_tags.str.contains(a4),'comp_class']=9
    df.loc[~matches_a1 & pos_tags.str.contains(a5),'comp_class']=10

    df.loc[pos_tags.str.contains(c1),'comp_class']=11
    df.loc[pos_tags.str.contains(c2),'comp_class']=12

    df=df.drop(['sent','pos_sent','lang','lang_conf','nwords','nX','lemma_sent'],axis=1)

    # Explode the per-year records and attach them back to the tagged rows.
    index_year_df=year_count_split(df)
    index_df=df.merge(index_year_df, on='old_index',how='right')
    index_df['count']=index_df['count'].astype("int64")
    index_df['year']=index_df['year'].astype("int64")
    index_df=index_df.groupby(['lemma_pos','year','comp_class','is_comp','comp_ner_sent'])['count'].sum().to_frame().reset_index()
    return index_df

In [11]:
def year_count_split(df):
    """Explode the tab-separated ``year_counts`` column into one row per record.

    Each record is split on "," and its first two fields become the ``year``
    and ``count`` columns (still as strings). The result has the columns
    ``old_index``, ``year`` and ``count`` and a fresh 0..n-1 index.
    """
    # One column per tab-separated record, next to the ngram it belongs to.
    wide = pd.concat(
        [df.old_index, df.year_counts.str.split("\t", expand=True)],
        axis=1,
    )
    record_columns = list(range(len(wide.columns) - 1))

    # Long format; rows with fewer records than the widest one yield NaN and are dropped.
    long_form = (
        pd.melt(wide, id_vars=["old_index"], value_vars=record_columns)
        .dropna()
        .drop("variable", axis = 1)
    )

    year_and_count = long_form.value.str.split(",", n=3, expand=True)[[0,1]]
    long_form[['year','count']] = year_and_count
    return long_form.drop(['value'],axis=1).reset_index(drop=True)

In [12]:
def str_joiner(df):
    """Pair up the lemmas and POS tags of each row as ``lemma_POS`` tokens.

    `df` must provide ``lemma_sent`` and ``pos_sent`` columns holding five
    space-separated items each. Returns a Series named ``lemma_pos`` with the
    five ``lemma_POS`` pairs joined by single spaces.

    Best-effort: if the columns are missing or cannot be split into exactly
    five parts, an empty DataFrame is returned instead of raising.
    """
    new_df=pd.DataFrame()
    try:
        new_df[['l1','l2','l3','l4','l5']]=df.lemma_sent.str.split(" ",expand=True,n=4)
        new_df[['p1','p2','p3','p4','p5']]=df.pos_sent.str.split(" ",expand=True,n=4)
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit still propagate.
        return pd.DataFrame()

    pairs=[new_df[f'l{i}']+"_"+new_df[f'p{i}'] for i in range(1,6)]
    joined=pairs[0]
    for pair in pairs[1:]:
        joined=joined+" "+pair
    new_df['lemma_pos']=joined
    return new_df['lemma_pos']

In [13]:
# Shard of the ngram dump to download. Only one assignment can take effect,
# so the other shards tried earlier are kept as commented alternatives.
#fname='5-18104-of-19423'
#fname='5-09107-of-19423'
fname='5-00604-of-19423'

lnk=f'http://storage.googleapis.com/books/ngrams/books/20200217/eng/{fname}.gz'
# The \u0001 separator with QUOTE_NONE presumably never occurs in the data, so
# each line lands unsplit in column 0 (it is split on tabs in a later cell).
index_df   = pd.read_csv(lnk, compression='gzip', header=None, sep=u"\u0001", quoting=csv.QUOTE_NONE)

In [14]:
# Column 0 is "<ngram>\t<year records>": split at the first tab only.
index_df[['old_index','year_counts']]=index_df[0].str.split('\t',n=1,expand=True)

# Discard ngrams containing POS-tagged tokens or POS placeholders, then drop
# the raw column and renumber the rows.
has_pos_tag=index_df.old_index.str.contains(try_keep_string,na=False,regex=True)
index_df=index_df.loc[~has_pos_tag].drop(0,axis=1).reset_index(drop=True)
index_df

  index_df=index_df.loc[~index_df.old_index.str.contains(try_keep_string,na=False,regex=True)]


Unnamed: 0,old_index,year_counts
0,", and without an indifferent","1800,1,1\t1820,2,2\t1823,2,2\t1825,4,4\t1826,4..."
1,", and where the deponent","1771,2,2\t1779,1,1\t1780,5,5\t1782,1,1\t1813,5..."
2,", and were such men","1832,1,1\t1851,1,1\t1855,1,1\t1862,6,6\t1871,1..."
3,", and will reveal their","1889,3,3\t1894,2,2\t1896,10,10\t1901,1,1\t1909..."
4,", and would even out","1864,3,3\t1865,1,1\t1866,1,1\t1870,4,4\t1871,1..."
...,...,...
1233257,", and who think very","1836,1,1\t1840,5,5\t1841,5,5\t1843,4,4\t1848,1..."
1233258,", and with no obstructions","1825,3,3\t1832,1,1\t1847,1,1\t1866,1,1\t1868,1..."
1233259,", annong others , are","1808,1,1\t1840,1,1\t1890,3,3\t1893,1,1\t1898,1..."
1233260,", and when they age","1911,2,2\t1920,1,1\t1971,1,1\t1972,3,3\t1975,1..."


In [15]:
if index_df.shape[0]<10_000:
    # Small input: process in this process.
    cur_time=time.time()
    new_index_df=index_processor(index_df)
    print(f'Total time taken {round(time.time()-cur_time)} secs')

else:
    # Large input: split the rows into near-equal chunks, one per worker.
    num_partitions=round(0.95*mp.cpu_count())
    cur_time=time.time()
    # Split row positions rather than the DataFrame itself: np.array_split on
    # a DataFrame goes through DataFrame.swapaxes, which recent pandas deprecates.
    row_chunks=np.array_split(np.arange(index_df.shape[0]), num_partitions)
    df_split=[index_df.iloc[rows] for rows in row_chunks]

    # The context manager tears the pool down even if a worker raises.
    with Pool(num_partitions) as pool:
        print('Started parallelization')
        curr_df_list=pool.map(index_processor,df_split)

    new_index_df=pd.concat(curr_df_list,ignore_index=True)
    print(f'Total time taken {round(time.time()-cur_time)} secs')

Started parallelization
Total time taken 1919 secs


In [17]:
# Print the column names, dtypes and memory usage of the processed frame.
new_index_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65785301 entries, 0 to 65785300
Data columns (total 6 columns):
 #   Column         Dtype 
---  ------         ----- 
 0   lemma_pos      object
 1   year           int64 
 2   comp_class     int64 
 3   is_comp        bool  
 4   comp_ner_sent  object
 5   count          int64 
dtypes: bool(1), int64(3), object(2)
memory usage: 2.5+ GB


In [28]:
# Persist the processed frame as a pickle.
# NOTE(review): the directory prefix repeats the value of `to_save_path`, and the
# "1700" file name is not derived from `fname` — confirm the naming scheme.
new_index_df.to_pickle('/data/dharp/compounds/datasets/googleV3/1700.pkl')


In [None]:
N - N
N 's N


