In [7]:
import pandas as pd
import glob
import os
import numpy as np
import time
import fastparquet
import argparse
from multiprocessing import Pool
import multiprocessing as mp
from os.path import isfile

In [8]:
# Command-line style configuration. Inside the notebook the options are parsed
# from an explicit token list rather than from sys.argv.
parser = argparse.ArgumentParser(
    description='Program to run google compounder for a particular file and setting'
)
parser.add_argument(
    '--data',
    type=str,
    help='location of the parquet files',
)
parser.add_argument(
    '--word',
    action='store_true',
    help='Extracting context for words only?',
)
parser.add_argument(
    '--output',
    type=str,
    help='directory to save dataset in',
)

# --word is not passed here, so args.word stays False.
args = parser.parse_args([
    '--data', '/data/dharp/compounds/datasets/entire_df/',
    '--output', '/data/dharp/compounds/datasets',
])

In [9]:
# Load the pickled context entries and keep, for each one, only the text before
# its first underscore; duplicates are collapsed before the list is built.
context_file = pd.read_pickle('/data/dharp/compounds/Compounding/data/contexts.pkl')
contexts = list({entry.split('_')[0] for entry in context_file})
len(contexts)

40215

In [10]:
def left_side_parser(df): # N N _ _ _
    """Extract context rows for 5-grams whose first two tokens are the compound.

    ``lemma_sent`` is split on single spaces into the columns
    ``modifier, head, w1, w2, w3``. Three long-format frames are then built
    with ``pd.melt`` and restricted to rows whose ``context`` value occurs in
    the module-level ``contexts`` collection:

    * compound frame -- ids modifier/head/year/count, contexts from w1, w2, w3
    * modifier frame -- ids modifier/year/count, contexts from head, w1, w2
    * head frame     -- ids head/year/count, contexts from modifier, w1, w2, w3

    Returns:
        tuple: ``(compound_df, modifier_df, head_df)``. When assigning the
        split columns raises ``ValueError`` (the split did not produce exactly
        five columns), three empty DataFrames are returned instead.
    """
    frame = df.copy()
    slots = ['modifier', 'head', 'w1', 'w2', 'w3']

    try:
        frame[slots] = frame.lemma_sent.str.split(' ', expand=True)
    except ValueError:
        return pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

    def _contexts_for(id_columns, context_columns):
        # One row per (ids, context word); keep only known context words.
        melted = pd.melt(
            frame,
            id_vars=id_columns,
            value_vars=context_columns,
            value_name='context',
        )
        return melted.loc[melted.context.isin(contexts)]

    compound_df = _contexts_for(['modifier', 'head', 'year', 'count'], ['w1', 'w2', 'w3'])
    # NOTE(review): w3 is not used as a modifier context here -- presumably to
    # keep contexts within a fixed distance of the modifier; confirm.
    modifier_df = _contexts_for(['modifier', 'year', 'count'], ['head', 'w1', 'w2'])
    head_df = _contexts_for(['head', 'year', 'count'], ['modifier', 'w1', 'w2', 'w3'])

    return compound_df, modifier_df, head_df

def mid1_parser(df): # _ N N _ _
    """Extract context rows for 5-grams whose second and third tokens are the compound.

    ``lemma_sent`` is split on single spaces into the columns
    ``w1, modifier, head, w2, w3``. Three long-format frames are then built
    with ``pd.melt`` and restricted to rows whose ``context`` value occurs in
    the module-level ``contexts`` collection:

    * compound frame -- ids modifier/head/year/count, contexts from w1, w2, w3
    * modifier frame -- ids modifier/year/count, contexts from head, w1, w2, w3
    * head frame     -- ids head/year/count, contexts from modifier, w1, w2, w3

    Returns:
        tuple: ``(compound_df, modifier_df, head_df)``. When assigning the
        split columns raises ``ValueError`` (the split did not produce exactly
        five columns), three empty DataFrames are returned instead.
    """
    frame = df.copy()
    slots = ['w1', 'modifier', 'head', 'w2', 'w3']

    try:
        frame[slots] = frame.lemma_sent.str.split(' ', expand=True)
    except ValueError:
        return pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

    def _contexts_for(id_columns, context_columns):
        # One row per (ids, context word); keep only known context words.
        melted = pd.melt(
            frame,
            id_vars=id_columns,
            value_vars=context_columns,
            value_name='context',
        )
        return melted.loc[melted.context.isin(contexts)]

    compound_df = _contexts_for(['modifier', 'head', 'year', 'count'], ['w1', 'w2', 'w3'])
    modifier_df = _contexts_for(['modifier', 'year', 'count'], ['head', 'w1', 'w2', 'w3'])
    head_df = _contexts_for(['head', 'year', 'count'], ['modifier', 'w1', 'w2', 'w3'])

    return compound_df, modifier_df, head_df

def mid2_parser(df): # _ _ N N _
    """Extract context rows for 5-grams whose third and fourth tokens are the compound.

    ``lemma_sent`` is split on single spaces into the columns
    ``w1, w2, modifier, head, w3``. Three long-format frames are then built
    with ``pd.melt`` and restricted to rows whose ``context`` value occurs in
    the module-level ``contexts`` collection:

    * compound frame -- ids modifier/head/year/count, contexts from w1, w2, w3
    * modifier frame -- ids modifier/year/count, contexts from head, w1, w2, w3
    * head frame     -- ids head/year/count, contexts from modifier, w1, w2, w3

    Returns:
        tuple: ``(compound_df, modifier_df, head_df)``. When assigning the
        split columns raises ``ValueError`` (the split did not produce exactly
        five columns), three empty DataFrames are returned instead.
    """
    frame = df.copy()
    slots = ['w1', 'w2', 'modifier', 'head', 'w3']

    try:
        frame[slots] = frame.lemma_sent.str.split(' ', expand=True)
    except ValueError:
        return pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

    def _contexts_for(id_columns, context_columns):
        # One row per (ids, context word); keep only known context words.
        melted = pd.melt(
            frame,
            id_vars=id_columns,
            value_vars=context_columns,
            value_name='context',
        )
        return melted.loc[melted.context.isin(contexts)]

    compound_df = _contexts_for(['modifier', 'head', 'year', 'count'], ['w1', 'w2', 'w3'])
    modifier_df = _contexts_for(['modifier', 'year', 'count'], ['head', 'w1', 'w2', 'w3'])
    head_df = _contexts_for(['head', 'year', 'count'], ['modifier', 'w1', 'w2', 'w3'])

    return compound_df, modifier_df, head_df

def right_side_parser(df): # _ _ _ N N
    """Extract context rows for 5-grams whose last two tokens are the compound.

    ``lemma_sent`` is split on single spaces into the columns
    ``w1, w2, w3, modifier, head``. Three long-format frames are then built
    with ``pd.melt`` and restricted to rows whose ``context`` value occurs in
    the module-level ``contexts`` collection:

    * compound frame -- ids modifier/head/year/count, contexts from w1, w2, w3
    * modifier frame -- ids modifier/year/count, contexts from head, w1, w2, w3
    * head frame     -- ids head/year/count, contexts from modifier, w2, w3

    Returns:
        tuple: ``(compound_df, modifier_df, head_df)``. When assigning the
        split columns raises ``ValueError`` (the split did not produce exactly
        five columns), three empty DataFrames are returned instead.
    """
    frame = df.copy()
    slots = ['w1', 'w2', 'w3', 'modifier', 'head']

    try:
        frame[slots] = frame.lemma_sent.str.split(' ', expand=True)
    except ValueError:
        return pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

    def _contexts_for(id_columns, context_columns):
        # One row per (ids, context word); keep only known context words.
        melted = pd.melt(
            frame,
            id_vars=id_columns,
            value_vars=context_columns,
            value_name='context',
        )
        return melted.loc[melted.context.isin(contexts)]

    compound_df = _contexts_for(['modifier', 'head', 'year', 'count'], ['w1', 'w2', 'w3'])
    modifier_df = _contexts_for(['modifier', 'year', 'count'], ['head', 'w1', 'w2', 'w3'])
    # NOTE(review): w1 is not used as a head context here -- presumably to
    # keep contexts within a fixed distance of the head; confirm.
    head_df = _contexts_for(['head', 'year', 'count'], ['modifier', 'w2', 'w3'])

    return compound_df, modifier_df, head_df

In [11]:
def syntactic_reducer(df):
    """Route a chunk to the positional parser(s) matching its ``comp_class``.

    The pattern is read from the first row only, so ``df`` is expected to hold
    rows of a single ``comp_class``.

    Args:
        df: DataFrame with a ``comp_class`` column plus whatever columns the
            positional parsers read.

    Returns:
        tuple: ``(compound_df, modifier_df, head_df)`` as produced by the
        parser(s). For pattern 1 the left-side and right-side results are
        concatenated. An empty ``df`` yields three empty DataFrames.

    Raises:
        ValueError: if the pattern is not one of 1-5. Previously this surfaced
            as an ``UnboundLocalError`` at the return statement.
    """
    if len(df) == 0:
        # Nothing to inspect; iloc[0] would raise IndexError.
        return pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

    pattern = df.iloc[0].comp_class

    if pattern == 1:  # N N _ N N  (a compound at each end of the 5-gram)
        compound_left_df, modifier_left_df, head_left_df = left_side_parser(df)
        compound_right_df, modifier_right_df, head_right_df = right_side_parser(df)

        final_compound_df = pd.concat([compound_left_df, compound_right_df], ignore_index=True)
        final_modifier_df = pd.concat([modifier_left_df, modifier_right_df], ignore_index=True)
        final_head_df = pd.concat([head_left_df, head_right_df], ignore_index=True)

    elif pattern == 2:  # N N _ _ _
        final_compound_df, final_modifier_df, final_head_df = left_side_parser(df)

    elif pattern == 3:  # _ N N _ _
        final_compound_df, final_modifier_df, final_head_df = mid1_parser(df)

    elif pattern == 4:  # _ _ N N _
        final_compound_df, final_modifier_df, final_head_df = mid2_parser(df)

    elif pattern == 5:  # _ _ _ N N
        final_compound_df, final_modifier_df, final_head_df = right_side_parser(df)

    else:
        raise ValueError(f'Unknown comp_class pattern: {pattern!r}')

    return final_compound_df, final_modifier_df, final_head_df

In [12]:
def _sum_counts(frames, keys):
    """Concatenate partial frames and sum ``count`` per unique ``keys`` combination.

    Returns an empty DataFrame when there is nothing to aggregate.
    """
    if not frames:
        return pd.DataFrame()
    merged = pd.concat(frames, ignore_index=True, sort=False)
    if merged.empty:
        return merged
    return merged.dropna().groupby(keys)['count'].sum().to_frame().reset_index()


def compound_extracter(df):
    """Extract and aggregate compound, modifier and head context counts from one chunk.

    Rows are grouped by ``comp_class`` (1-5); each non-empty group is passed to
    ``syntactic_reducer``. The per-pattern results are concatenated, rows with
    missing values are dropped, and ``count`` is summed per key combination.

    Each of the three frames is aggregated independently. Previously an empty
    compound frame caused the modifier and head frames to be returned raw
    (un-aggregated, still carrying the melt ``variable`` column).

    Args:
        df: DataFrame with a ``comp_class`` column plus the columns the
            positional parsers read.

    Returns:
        tuple: ``(compounds, modifiers, heads)`` with columns
        ``modifier, head, context, year, count`` /
        ``modifier, context, year, count`` /
        ``head, context, year, count`` respectively; a frame with no rows is
        returned as an empty DataFrame.
    """
    comp_parts, mod_parts, head_parts = [], [], []

    for pattern in (1, 2, 3, 4, 5):
        subset = df.loc[df.comp_class == pattern]
        if subset.shape[0] == 0:
            continue
        comp_df, mod_df, head_df = syntactic_reducer(subset)
        comp_parts.append(comp_df)
        mod_parts.append(mod_df)
        head_parts.append(head_df)

    compounds = _sum_counts(comp_parts, ['modifier', 'head', 'context', 'year'])
    modifiers = _sum_counts(mod_parts, ['modifier', 'context', 'year'])
    heads = _sum_counts(head_parts, ['head', 'context', 'year'])

    return compounds, modifiers, heads

In [13]:
def _combine_partials(frames, keys):
    """Concatenate per-worker frames and re-sum ``count`` per ``keys`` combination."""
    merged = pd.concat(frames, ignore_index=True, sort=False)
    if 'count' not in merged.columns:
        # Every worker returned a column-less empty frame: nothing to
        # aggregate, and groupby on the missing keys would raise KeyError.
        return merged
    return merged.groupby(keys)['count'].sum().to_frame().reset_index()


def parallelize_dataframe(df,num_cores):
    """Split ``df`` into ``num_cores`` chunks, process them in a worker pool and merge the results.

    With ``args.word`` unset, every chunk goes through ``compound_extracter``;
    the per-chunk compound, modifier and head frames are concatenated and their
    ``count`` is re-summed per key combination. With ``args.word`` set, chunks
    go through ``cdsm_word_reducer`` and the merged word counts are written to
    ``{args.output}/words.csv`` (appended without header if the file exists).

    Args:
        df: DataFrame to process.
        num_cores: number of chunks and of pool worker processes.

    Returns:
        ``(compounds, modifiers, heads)`` in compound mode; ``None`` in word
        mode.
    """
    df_split = np.array_split(df, num_cores)
    print("Done splitting the datasets")

    # The conditional expression only evaluates the selected name, so
    # cdsm_word_reducer is looked up in word mode only -- and before any
    # worker process is started.
    worker = cdsm_word_reducer if args.word else compound_extracter

    pool = Pool(num_cores)
    cur_time=time.time()
    print("Starting parallelizing")
    try:
        results = pool.map(worker, df_split)
    finally:
        # Release the worker processes whether or not a chunk failed.
        pool.close()
        pool.join()
    print("Done parallelizing")
    print("Total time taken",round(time.time()-cur_time),"secs")

    if args.word:
        words = _combine_partials(results, ['word','context','year'])
        print(words.shape)

        words_path = f'{args.output}/words.csv'
        append = isfile(words_path)
        words.to_csv(words_path, mode='a' if append else 'w', sep="\t",
                     header=not append, index=False)

        print("Done concatenations \n")
        # NOTE(review): word mode returns nothing, while parquet_processor
        # unpacks three frames from this call -- confirm the intended contract.
        return None

    compounds = _combine_partials([result[0] for result in results],
                                  ['modifier','head','context','year'])
    modifiers = _combine_partials([result[1] for result in results],
                                  ['modifier','context','year'])
    heads = _combine_partials([result[2] for result in results],
                              ['head','context','year'])

    print("Done concatenations \n")
    return compounds,modifiers,heads

In [29]:
def right_side_parser(df): # _ _ _ N N
    """Extract context rows for 5-grams whose last two tokens are the compound.

    This is a second definition of ``right_side_parser``; once the cell runs it
    replaces the earlier one. Compared with that version it additionally
    splits ``pos_sent`` and stores its first three pieces as ``p1, p2, p3`` on
    the working copy. Those columns are not used in the returned frames.

    ``lemma_sent`` is split on single spaces into the columns
    ``w1, w2, w3, modifier, head``. Three long-format frames are then built
    with ``pd.melt`` and restricted to rows whose ``context`` value occurs in
    the module-level ``contexts`` collection:

    * compound frame -- ids modifier/head/year/count, contexts from w1, w2, w3
    * modifier frame -- ids modifier/year/count, contexts from head, w1, w2, w3
    * head frame     -- ids head/year/count, contexts from modifier, w2, w3

    Returns:
        tuple: ``(compound_df, modifier_df, head_df)``. When either column
        assignment raises ``ValueError``, three empty DataFrames are returned
        instead.
    """
    frame = df.copy()
    word_slots = ['w1', 'w2', 'w3', 'modifier', 'head']
    tag_slots = ['p1', 'p2', 'p3']

    try:
        frame[word_slots] = frame.lemma_sent.str.split(' ', expand=True)
        frame[tag_slots] = frame.pos_sent.str.split(' ', expand=True)[[0, 1, 2]]
    except ValueError:
        return pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

    def _contexts_for(id_columns, context_columns):
        # One row per (ids, context word); keep only known context words.
        melted = pd.melt(
            frame,
            id_vars=id_columns,
            value_vars=context_columns,
            value_name='context',
        )
        return melted.loc[melted.context.isin(contexts)]

    compound_df = _contexts_for(['modifier', 'head', 'year', 'count'], ['w1', 'w2', 'w3'])
    modifier_df = _contexts_for(['modifier', 'year', 'count'], ['head', 'w1', 'w2', 'w3'])
    head_df = _contexts_for(['head', 'year', 'count'], ['modifier', 'w2', 'w3'])

    return compound_df, modifier_df, head_df

Unnamed: 0,lemma_sent,year,pos_sent,comp_class,ner_sent,count
49,"these preparation , mr. astor",1836,DET NOUN PUNCT NOUN NOUN,5,25_30_PERSON,9
50,"these preparation , mr. astor",1839,DET NOUN PUNCT NOUN NOUN,5,25_30_PERSON,1
51,"these preparation , mr. astor",1844,DET NOUN PUNCT NOUN NOUN,5,25_30_PERSON,1
52,"these preparation , mr. astor",1849,DET NOUN PUNCT NOUN NOUN,5,25_30_PERSON,6
53,"these preparation , mr. astor",1850,DET NOUN PUNCT NOUN NOUN,5,25_30_PERSON,2
...,...,...,...,...,...,...
2362197,thyself well when thou digressest,2004,PRON INTJ ADV NOUN NOUN,5,,3
2362198,thyself well when thou digressest,2005,PRON INTJ ADV NOUN NOUN,5,,7
2362199,thyself well when thou digressest,2006,PRON INTJ ADV NOUN NOUN,5,,2
2362200,thyself well when thou digressest,2007,PRON INTJ ADV NOUN NOUN,5,,4


In [61]:
# Inspect the distinct (lemma_sent, pos_sent) pairs among the comp_class 5 rows.
# The group sizes land in a column labelled 0, which is dropped again.
kwestie = cur_df.loc[cur_df.comp_class == 5].copy()
kwestie_index = (
    kwestie
    .groupby(['lemma_sent', 'pos_sent'])
    .size()
    .to_frame()
    .reset_index()
    .drop(columns=0)
)
kwestie_index

Unnamed: 0,lemma_sent,pos_sent
0,"these preparation , mr. astor",DET NOUN PUNCT NOUN NOUN
1,"these present , doth grant",DET NOUN PUNCT NOUN NOUN
2,these present doth bargain sell,DET VERB ADJ NOUN NOUN
3,these present in manner hereinafter,DET NOUN ADP NOUN NOUN
4,these principle by great britain,DET NOUN ADP NOUN NOUN
...,...,...
16342,thyself into whatever shape thou,PRON ADP DET NOUN NOUN
16343,thyself outgoing in thy noon,PRON ADJ ADP NOUN NOUN
16344,thyself that the great carbuncle,PRON SCONJ DET NOUN NOUN
16345,thyself unweariedly till thou findest,PRON ADV SCONJ NOUN NOUN


In [62]:
# Count how many rows of kwestie_index share each lemma_sent string; a count
# above 1 means the same lemma sequence appears with more than one pos_sent.
kwestie_index.lemma_sent.value_counts()

third of the committee member       2
through in - service program        2
this year ' s budget                2
third of the total national         2
throw open -pron- chamber window    2
                                   ..
this type of roof covering          1
think of the research process       1
this kind to sir walter             1
these small - scale enterprise      1
those part of asia minor            1
Name: lemma_sent, Length: 16315, dtype: int64

In [46]:
# Expand the lemma tokens into positional columns and keep the first three
# pieces of pos_sent alongside them (modifies kwestie in place).
kwestie[['w1', 'w2', 'w3', 'modifier', 'head']] = (
    kwestie.lemma_sent.str.split(' ', expand=True)
)
kwestie[['p1', 'p2', 'p3']] = (
    kwestie.pos_sent.str.split(' ', expand=True).loc[:, [0, 1, 2]]
)
kwestie

Unnamed: 0,lemma_sent,year,pos_sent,comp_class,ner_sent,count,w1,w2,w3,modifier,head,p1,p2,p3
49,"these preparation , mr. astor",1836,DET NOUN PUNCT NOUN NOUN,5,25_30_PERSON,9,these,preparation,",",mr.,astor,DET,NOUN,PUNCT
50,"these preparation , mr. astor",1839,DET NOUN PUNCT NOUN NOUN,5,25_30_PERSON,1,these,preparation,",",mr.,astor,DET,NOUN,PUNCT
51,"these preparation , mr. astor",1844,DET NOUN PUNCT NOUN NOUN,5,25_30_PERSON,1,these,preparation,",",mr.,astor,DET,NOUN,PUNCT
52,"these preparation , mr. astor",1849,DET NOUN PUNCT NOUN NOUN,5,25_30_PERSON,6,these,preparation,",",mr.,astor,DET,NOUN,PUNCT
53,"these preparation , mr. astor",1850,DET NOUN PUNCT NOUN NOUN,5,25_30_PERSON,2,these,preparation,",",mr.,astor,DET,NOUN,PUNCT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2362197,thyself well when thou digressest,2004,PRON INTJ ADV NOUN NOUN,5,,3,thyself,well,when,thou,digressest,PRON,INTJ,ADV
2362198,thyself well when thou digressest,2005,PRON INTJ ADV NOUN NOUN,5,,7,thyself,well,when,thou,digressest,PRON,INTJ,ADV
2362199,thyself well when thou digressest,2006,PRON INTJ ADV NOUN NOUN,5,,2,thyself,well,when,thou,digressest,PRON,INTJ,ADV
2362200,thyself well when thou digressest,2007,PRON INTJ ADV NOUN NOUN,5,,4,thyself,well,when,thou,digressest,PRON,INTJ,ADV


In [25]:
# Pool size used by parquet_processor, and the input location from the CLI options.
num_cores = 200
df_path = args.data

# Every path matching "<df_path>*parq"; df_path is concatenated as-is, so it
# must already end with a path separator to address files inside a directory.
df_files = list(glob.glob(df_path + '*parq'))

#for f in df_files:
#    parquet_processor(f)

In [26]:
def _sum_and_write(frames, keys, path):
    """Merge row-group results, re-sum ``count`` per ``keys`` and write a snappy parquet file."""
    # The per-kind output folder may not exist yet on a first run.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    merged = pd.concat(frames, ignore_index=True)
    totals = merged.groupby(keys)['count'].sum().to_frame().reset_index()
    totals.to_parquet(
        path=path,
        engine='fastparquet',
        compression='snappy')


def parquet_processor(f):
    """Process one parquet file row group by row group and save the aggregated counts.

    For every row group, ``year`` is cast to int32, rows with
    ``comp_class == 0`` are removed and the rest is handed to
    ``parallelize_dataframe`` together with the module-level ``num_cores``.
    The compound, modifier and head results of all row groups are then merged,
    re-aggregated and written to
    ``{args.output}/compounds|modifiers|heads/<name>.parq``.

    ``<name>`` is the input file name without its final extension. It is now
    derived with ``os.path`` so that dots in directory names (e.g.
    ``./data/x.parq``) or in the file name no longer truncate it.

    Args:
        f: path of the parquet file to process.
    """
    cur_fname = os.path.splitext(os.path.basename(f))[0]
    print(f'Current parquet file: {f}')
    cur_parq = fastparquet.ParquetFile(f)
    n_groups = len(cur_parq.row_groups)
    print(f'Number of partitions: {n_groups}')
    compounds_list = []
    modifiers_list = []
    heads_list = []

    for i,cur_df in enumerate(cur_parq.iter_row_groups()):
        print(f'Partition {i+1} out of {n_groups}')
        cur_df.year = cur_df.year.astype("int32")
        cur_df = cur_df.loc[cur_df.comp_class!=0].reset_index(drop=True)
        # NOTE(review): parallelize_dataframe returns None when args.word is
        # set, which would make this unpacking fail -- confirm word mode is
        # never routed through here.
        cur_compounds,cur_modifiers,cur_heads = parallelize_dataframe(cur_df,num_cores)
        compounds_list.append(cur_compounds)
        modifiers_list.append(cur_modifiers)
        heads_list.append(cur_heads)

    _sum_and_write(compounds_list, ['modifier','head','context','year'],
                   f'{args.output}/compounds/{cur_fname}.parq')
    _sum_and_write(modifiers_list, ['modifier','context','year'],
                   f'{args.output}/modifiers/{cur_fname}.parq')
    _sum_and_write(heads_list, ['head','context','year'],
                   f'{args.output}/heads/{cur_fname}.parq')

    print("Done with file \n")

In [27]:
# Leftover inspection cell: evaluates the string accessor of the ner_sent column.
# NOTE(review): cur_df is not assigned at notebook top level in the cells shown,
# so this presumably relies on state from an earlier session -- confirm or remove.
cur_df.ner_sent.str

Unnamed: 0,lemma_sent,year,pos_sent,comp_class,ner_sent,count
0,these premise art conclude that,1856,DET NOUN NOUN VERB DET,3,,3
1,these premise art conclude that,1862,DET NOUN NOUN VERB DET,3,,1
2,these premise art conclude that,1865,DET NOUN NOUN VERB DET,3,,1
3,these premise art conclude that,1868,DET NOUN NOUN VERB DET,3,,1
4,these premise art conclude that,1872,DET NOUN NOUN VERB DET,3,,3
...,...,...,...,...,...,...
2362285,"thyssen - bornemisza collection ,",2004,NOUN PUNCT NOUN NOUN PUNCT,4,0_31_ORG,34
2362286,"thyssen - bornemisza collection ,",2005,NOUN PUNCT NOUN NOUN PUNCT,4,0_31_ORG,14
2362287,"thyssen - bornemisza collection ,",2006,NOUN PUNCT NOUN NOUN PUNCT,4,0_31_ORG,43
2362288,"thyssen - bornemisza collection ,",2007,NOUN PUNCT NOUN NOUN PUNCT,4,0_31_ORG,15
