In [1]:
import pandas as pd
import fasttext
import time
import numpy as np
import multiprocessing as mp
from multiprocessing import Pool
import csv
import fastparquet
import spacy
import glob, os
import re
from os.path import isfile
import seaborn as sns

In [2]:
# Directory under which processed datasets are stored.
to_save_path='/data/dharp/compounds/datasets/'

# All regex fragments below are raw strings so that `\s` and `\.` reach the
# regex engine verbatim instead of relying on Python keeping unknown string
# escapes (which is deprecated and warns on newer interpreters).

# One "<text>_<TAG>" token (TAG from the listed tag set) or the literal
# `_END_` marker, followed by optional whitespace. Repeated five times it
# validates a raw 5-gram row.
keep_string=r"(.+_(NOUN|ADV|VERB|ADJ|X|PRT|CONJ|PRON|DET|ADP|NUM|\.)|_END_)\s*"
# Anything that does not start with NOUN or PROPN.
nn=r'(?!(?:NOUN|PROPN)).*'
# Two-tag compound candidate: ADJ/NOUN/PROPN followed by NOUN/PROPN.
comp=r'(?:ADJ|NOUN|PROPN)\s(?:NOUN|PROPN)'
# Unconstrained slot.
word=r'.*'
# Entity labels (not referenced by the code visible in this notebook section).
ner_cats=['CARDINAL', 'DATE', 'EVENT', 'FAC', 'GPE', 'LANGUAGE', 'LAW', 'LOC', 'MONEY', 'NORP', 'ORDINAL', 'ORG', 'PERCENT', 'PERSON', 'PRODUCT', 'QUANTITY', 'TIME', 'WORK_OF_ART']
# Anchored patterns over a space-separated POS sequence; each one places the
# compound candidate at a different position and maps to comp_class 1..5.
n1=rf'^{comp}\s{nn}\s{comp}$'
n2=rf'^{comp}\s{nn}\s{word}\s{word}$'
n3=rf'^{nn}\s{comp}\s{nn}\s{word}$'
n4=rf'^{word}\s{nn}\s{comp}\s{nn}$'
n5=rf'^{word}\s{word}\s{nn}\s{comp}$'

In [3]:
# fastText model used by lang_tagger() to predict a language label (plus confidences) per sentence.
fmodel = fasttext.load_model('/data/dharp/packages/lid.176.bin')
# spaCy pipeline used by ner_lemma_reducer() for lemmas, POS tags, dependency labels and entities.
nlp = spacy.load('en_core_web_lg')

In [4]:
def delist_lang(lst):
    """Pick the first label of every prediction.

    Parameters
    ----------
    lst : iterable of sequences
        One sequence of labels per input; a sequence may be empty.

    Returns
    -------
    list
        The first element of each sequence, or ``None`` where the sequence
        is empty.
    """
    return [labels[0] if labels else None for labels in lst]


def significance(lst, threshold=0.1):
    """Flag predictions whose top label clearly beats the runner-up.

    For every sequence of confidences the relative gap between the first two
    values, ``|c0 - c1| / mean(c0, c1)``, is compared against ``threshold``.
    The previous implementation divided by ``np.mean(c0 + c1)``, which is the
    mean of a single scalar (i.e. the plain sum) and therefore halved the
    intended ratio.

    Parameters
    ----------
    lst : iterable of sequences of float
        One sequence of confidences per input, best first.
    threshold : float, optional
        Minimum relative gap for a prediction to count as significant.

    Returns
    -------
    list of bool
        ``True`` where the gap exceeds ``threshold``; sequences with fewer
        than two confidences have no competitor and are always ``True``.
    """
    significance_list=[]
    for confs in lst:
        if len(confs)>1:
            top,runner_up=confs[0],confs[1]
            mean_conf=(top+runner_up)/2
            # bool() keeps the result a plain Python bool even for numpy scalars.
            significance_list.append(bool(abs(top-runner_up)/mean_conf>threshold))
        else:
            significance_list.append(True)
    return significance_list

In [5]:
def sent_maker(sent_lst):
    """Split tagged n-grams into plain-word sentences and their tag sequences.

    Each input string is a space-separated list of ``<word>_<tag>`` tokens.
    The ``_END_`` marker is first rewritten to ``@@@_.`` so that it splits
    like any other token (word ``@@@``, tag ``.``).

    Parameters
    ----------
    sent_lst : iterable of str
        Tagged n-grams.

    Returns
    -------
    tuple of (list of str, list of str)
        The space-joined words and the space-joined tags, one entry per
        input string.

    Raises
    ------
    ValueError
        If a token contains no underscore and so cannot be split into
        word and tag.
    """
    ret_sents=[]
    g_pos=[]
    for sent in sent_lst:
        cur_words=[]
        pos_sent=[]
        for word_pos in sent.replace('_END_','@@@_.').split(' '):
            # Split on the LAST underscore so words containing '_' stay intact.
            token_text,token_tag=word_pos.rsplit('_',1)
            cur_words.append(token_text)
            pos_sent.append(token_tag)
        # Join once per sentence rather than once per token.
        ret_sents.append(' '.join(cur_words))
        g_pos.append(' '.join(pos_sent))
    return ret_sents,g_pos

In [6]:
def ner_lemma_reducer(sent):
    """Parse one sentence with the module-level ``nlp`` pipeline and summarise it.

    Parameters
    ----------
    sent : str
        Plain-text sentence.

    Returns
    -------
    tuple
        ``(ner_token_sent, ner_length, lemma_sent, pos_sent, is_comp)`` where

        * ``ner_token_sent`` - space-joined entity type per token, with
          ``"NONNER"`` for tokens that carry no entity type;
        * ``ner_length`` - total number of characters covered by the
          document's entities;
        * ``lemma_sent`` - space-joined token lemmas;
        * ``pos_sent`` - space-joined token POS tags;
        * ``is_comp`` - ``True`` if any token has the dependency label
          ``"compound"``.
    """
    parsed_sent=nlp(sent)
    lemma=[]
    pos=[]
    ner_token=[]
    is_comp=False
    for token in parsed_sent:
        lemma.append(token.lemma_)
        pos.append(token.pos_)
        # An empty entity type means the token is outside any entity.
        ner_token.append(token.ent_type_ or "NONNER")
        if token.dep_=="compound":
            is_comp=True
    # Sums to 0 when the document has no entities.
    ner_length=sum(ent.end_char-ent.start_char for ent in parsed_sent.ents)
    return ' '.join(ner_token),ner_length,' '.join(lemma),' '.join(pos),is_comp

In [7]:
def lang_tagger(parsed_sent):
    """Predict a language label and a significance flag for each sentence.

    The sentences are passed to the module-level ``fmodel`` with ``k=-1`` and
    ``threshold=0.1``; the first label of each prediction is kept and the
    confidences are turned into flags by ``significance``.

    Parameters
    ----------
    parsed_sent : list of str
        Sentences to classify.

    Returns
    -------
    tuple of (list, list)
        First predicted label per sentence (``None`` when nothing was
        predicted) and the matching significance flags.
    """
    predicted_labels,predicted_confs=fmodel.predict(parsed_sent,k=-1,threshold=0.1)
    languages=delist_lang(predicted_labels)
    is_significant=significance(predicted_confs)
    assert len(languages)==len(is_significant)
    return languages,is_significant

In [8]:
def index_processor(df):
    """Turn one partition of raw 5-gram rows into per-year lemma/POS counts.

    Steps:

    1. rebuild plain sentences from the tagged n-grams (``sent_maker``);
    2. parse every sentence (``ner_lemma_reducer``) and tag its language
       (``lang_tagger``);
    3. keep rows that do not contain two consecutive PERSON tokens, are
       confidently English and parse into exactly five tokens;
    4. drop rows with more than one ``X`` tag, build ``lemma_pos`` and
       assign ``comp_class`` from the module-level patterns ``n1``..``n5``;
    5. expand ``year_counts`` into one row per year and sum the counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``old_index`` (tagged n-gram) and ``year_counts``
        (tab-separated ``year,count,...`` records). The frame passed in is
        not modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``lemma_pos``, ``pos_sent``, ``year``, ``comp_class`` and
        ``count``. An empty frame with exactly these columns is returned when
        no row survives the filters, so partition results concatenate without
        stray columns.
    """
    out_cols=['lemma_pos','pos_sent','year','comp_class','count']
    if df.shape[0]==0:
        return pd.DataFrame(columns=out_cols)

    # Work on a fresh, 0..n-1 indexed frame instead of mutating the caller's.
    df=df.reset_index(drop=True)
    sents,_=sent_maker(df.old_index)
    df['sent']=sents

    parsed=[ner_lemma_reducer(sent) for sent in df.sent]
    results_df=pd.DataFrame(
        parsed,
        columns=['ner_token_sent','ner_length','lemma_sent','pos_sent','is_comp'],
        index=df.index,
    )
    index_df=pd.concat([df,results_df],axis=1)

    lang_list,significance_list=lang_tagger(index_df.sent.values.tolist())
    index_df['lang']=lang_list
    index_df['lang_conf']=significance_list
    # Labels look like '__label__en'; keep only the trailing language code.
    index_df['lang']=index_df.lang.str.split('_',n=4).str[-1]
    index_df['nwords']=index_df.pos_sent.str.count(' ').add(1)

    keep=(
        ~index_df.ner_token_sent.str.contains("PERSON PERSON")
        & (index_df.lang=='en')
        & (index_df.lang_conf==True)
        & (index_df.nwords==5)
    )
    index_df=index_df.loc[keep].copy()
    if index_df.shape[0]==0:
        return pd.DataFrame(columns=out_cols)

    index_df['lemma_sent']=index_df.lemma_sent.str.lower()
    index_df['lemma_pos']=str_joiner(index_df)

    # Every 'AUX' also contains an 'X', so subtract those to count real X tags.
    n_x=index_df.pos_sent.str.count('X')-index_df.pos_sent.str.count('AUX')
    index_df=index_df.loc[~(n_x>1)].copy()
    if index_df.shape[0]==0:
        return pd.DataFrame(columns=out_cols)

    # Later assignments deliberately override earlier ones, as before.
    index_df['comp_class']=0
    is_n1=index_df.pos_sent.str.contains(n1)
    index_df.loc[is_n1,'comp_class']=1
    index_df.loc[~is_n1 & index_df.pos_sent.str.contains(n2),'comp_class']=2
    index_df.loc[index_df.pos_sent.str.contains(n3),'comp_class']=3
    index_df.loc[index_df.pos_sent.str.contains(n4),'comp_class']=4
    index_df.loc[~is_n1 & index_df.pos_sent.str.contains(n5),'comp_class']=5

    index_year_df=year_count_split(index_df)
    # Counts come out of a string split; make them numeric so that summing
    # adds them up instead of concatenating text.
    index_year_df['count']=index_year_df['count'].astype('int64')

    # Several surface n-grams can share one lemma_pos. Joining the full frame
    # on that key would pair every year row with every such n-gram and
    # multiply the counts, so join against one row per lemma_pos instead.
    keys=index_df[['lemma_pos','pos_sent','comp_class']].drop_duplicates(subset='lemma_pos')
    merged=keys.merge(index_year_df,on='lemma_pos',how='right')
    return merged.groupby(['lemma_pos','pos_sent','year','comp_class'])['count'].sum().to_frame().reset_index()

In [9]:
def year_count_split(df):
    """Expand tab-separated ``year,count,...`` records into one row per year.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``lemma_pos`` and ``year_counts``; each
        ``year_counts`` value is a tab-separated list of comma-separated
        records whose first two fields are the year and the count.

    Returns
    -------
    pandas.DataFrame
        Columns ``lemma_pos``, ``year`` and ``count`` with a fresh 0..n-1
        index. ``year`` and ``count`` are integers so that they can be summed
        and sorted numerically. Rows with a missing ``lemma_pos`` or a missing
        record are dropped. An empty input gives an empty frame with the same
        three columns.
    """
    out_cols=['lemma_pos','year','count']
    if df.shape[0]==0:
        return pd.DataFrame(columns=out_cols)
    # explode() yields one row per record without first building a wide
    # rows x max-records table.
    long_df=(
        df[['lemma_pos']]
        .assign(value=df.year_counts.str.split("\t"))
        .explode('value')
        .dropna()
        .reset_index(drop=True)
    )
    if long_df.empty:
        return pd.DataFrame(columns=out_cols)
    fields=long_df['value'].str.split(",",n=3,expand=True)
    long_df['year']=fields[0].astype('int64')
    long_df['count']=fields[1].astype('int64')
    return long_df[out_cols]

In [10]:
def str_joiner(df):
    """Build the ``lemma_TAG lemma_TAG ...`` key for five-token rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``lemma_sent`` and ``pos_sent``, each holding five
        space-separated items per row.

    Returns
    -------
    pandas.Series
        Named ``lemma_pos`` and indexed like ``df``; each value pairs the
        i-th lemma with the i-th tag as ``lemma_TAG`` and joins the five
        pairs with single spaces.

    Raises
    ------
    ValueError
        If splitting either column does not give exactly five parts. The
        previous version swallowed this with a bare ``except``, printed the
        whole frame and then failed on an unrelated attribute lookup.
    """
    if df.shape[0]==0:
        return pd.Series([],index=df.index,dtype=object,name='lemma_pos')
    lemmas=df.lemma_sent.str.split(" ",expand=True)
    tags=df.pos_sent.str.split(" ",expand=True)
    if lemmas.shape[1]!=5 or tags.shape[1]!=5:
        raise ValueError(
            f'expected 5 lemmas and 5 POS tags per row, got up to '
            f'{lemmas.shape[1]} lemmas and {tags.shape[1]} tags'
        )
    joined=lemmas[0]+"_"+tags[0]
    for i in range(1,5):
        joined=joined+" "+lemmas[i]+"_"+tags[i]
    return joined.rename('lemma_pos')

In [11]:
# One gzip-compressed shard of the 5-gram files.
lnk='http://storage.googleapis.com/books/ngrams/books/20200217/eng/5-00835-of-19423.gz'
# Read every line as a single unparsed string (column 0); splitting into the
# n-gram and its year records happens in a later cell.
# NOTE(review): newer pandas releases may reject a newline as `sep` - confirm
# against the pinned pandas version.
index_df=pd.read_csv(
    lnk,
    compression='gzip',
    header=None,
    sep="\n",
    quoting=csv.QUOTE_NONE,
)

In [12]:
# Preview the raw frame: a single column (0) holding one unsplit line per row.
index_df

Unnamed: 0,0
0,",_. which_DET emphasize_VERB the_DET individua..."
1,",_. what_PRON with_ADP you_PRON not_ADV\t1979,..."
2,",_. when_ADV bankruptcy_NOUN proceedings_NOUN ..."
3,",_. which_DET declared_VERB upon_ADP a_DET\t18..."
4,",_. where_ADV Ben_NOUN Jonson_NOUN and_CONJ\t1..."
...,...
3436325,",_. when_ADV taking_VERB and_CONJ retention_NO..."
3436326,",_. which_DET has_VERB four_NUM elements_NOUN\..."
3436327,",_. when_ADV he_PRON is_VERB divested_VERB\t17..."
3436328,",_. which_DET are_VERB clearly_ADV identifiabl..."


In [13]:
# Number of raw rows read from the shard.
index_df.shape[0]

3436330

In [14]:
# Split each raw line at its first tab into the tagged n-gram (`old_index`)
# and the remaining year records (`year_counts`), then keep only rows whose
# n-gram consists of exactly five well-formed tokens.
index_df=(
    index_df[0]
    .str.split('\t',n=1,expand=True)
    .rename(columns={0:'old_index',1:'year_counts'})
    .loc[lambda parts: parts.old_index.str.match("^"+keep_string*5+"$",na=False)]
)
index_df

Unnamed: 0,old_index,year_counts
0,",_. which_DET emphasize_VERB the_DET individua...","1927,1,1\t1941,1,1\t1947,3,3\t1953,4,4\t1955,7..."
1,",_. what_PRON with_ADP you_PRON not_ADV","1979,1,1\t1985,1,1\t1989,1,1\t1991,1,1\t1998,3..."
2,",_. when_ADV bankruptcy_NOUN proceedings_NOUN ...","1832,1,1\t1876,3,3\t1888,1,1\t1895,1,1\t1910,4..."
3,",_. which_DET declared_VERB upon_ADP a_DET","1874,2,2\t1889,6,6\t1897,9,6\t1898,1,1\t1900,2..."
4,",_. where_ADV Ben_NOUN Jonson_NOUN and_CONJ","1803,2,2\t1804,1,1\t1806,1,1\t1808,7,7\t1809,1..."
...,...,...
3436325,",_. when_ADV taking_VERB and_CONJ retention_NOUN","1996,5,5\t1997,3,3\t1998,3,3\t1999,2,2\t2000,4..."
3436326,",_. which_DET has_VERB four_NUM elements_NOUN","1920,9,9\t1921,2,2\t1922,1,1\t1940,1,1\t1955,1..."
3436327,",_. when_ADV he_PRON is_VERB divested_VERB","1721,1,1\t1801,1,1\t1809,1,1\t1810,4,4\t1811,1..."
3436328,",_. which_DET are_VERB clearly_ADV identifiabl...","1890,2,1\t1897,1,1\t1932,1,1\t1933,1,1\t1963,4..."


In [None]:
# Process the frame in parallel: one partition per worker, using roughly 95%
# of the available cores.
num_partitions=round(0.95*mp.cpu_count())
cur_time=time.time()
df_split = np.array_split(index_df, num_partitions)
# The context manager releases the worker processes even if a partition
# raises; pool.map blocks until every partition is done and re-raises the
# first worker error.
with Pool(num_partitions) as pool:
    print('Started parallelization')
    curr_df_list=pool.map(index_processor,df_split)

new_index_df=pd.concat(curr_df_list,ignore_index=True)
print(f'Total time taken {round(time.time()-cur_time)} secs')

Started parallelization


In [28]:
# Summary of the frame's columns, dtypes and memory use.
# NOTE(review): the recorded output shows a single column `0` and a different
# row count than the cells above, which suggests this cell was executed out of
# order against another load - confirm on a fresh Restart & Run All.
index_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3073386 entries, 0 to 3073385
Data columns (total 1 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   0       object
dtypes: object(1)
memory usage: 23.4+ MB


In [39]:
# Inspect a stored result file; presumably written by an earlier run of this pipeline - verify.
# NOTE(review): unpickling can execute arbitrary code, so only load files from a trusted location.
pd.read_pickle('/data/dharp/compounds/datasets/googleV3/69.pkl')

Unnamed: 0,lemma_pos,pos_sent,year,comp_class,count
0,"""_PUNCT ""_PUNCT ""_PUNCT ""_PUNCT --------------...",PUNCT PUNCT PUNCT PUNCT PUNCT,1868,0,1
1,"""_PUNCT ""_PUNCT ""_PUNCT ""_PUNCT --------------...",PUNCT PUNCT PUNCT PUNCT PUNCT,1870,0,1
2,"""_PUNCT ""_PUNCT ""_PUNCT ""_PUNCT --------------...",PUNCT PUNCT PUNCT PUNCT PUNCT,1875,0,1
3,"""_PUNCT ""_PUNCT ""_PUNCT ""_PUNCT --------------...",PUNCT PUNCT PUNCT PUNCT PUNCT,1876,0,1
4,"""_PUNCT ""_PUNCT ""_PUNCT ""_PUNCT --------------...",PUNCT PUNCT PUNCT PUNCT PUNCT,1888,0,2
...,...,...,...,...,...
60102870,"""_PUNCT be_AUX you_PRON listen_VERB to_ADP",PUNCT AUX PRON VERB ADP,2015,0,4
60102871,"""_PUNCT be_AUX you_PRON listen_VERB to_ADP",PUNCT AUX PRON VERB ADP,2016,0,6
60102872,"""_PUNCT be_AUX you_PRON listen_VERB to_ADP",PUNCT AUX PRON VERB ADP,2017,0,1
60102873,"""_PUNCT be_AUX you_PRON listen_VERB to_ADP",PUNCT AUX PRON VERB ADP,2018,0,4
