In [1]:
import random
import pandas as pd
import glob
import os
import numpy as np
import pickle as pkl
import time
import multiprocessing as mp
from multiprocessing import Pool
import re
import argparse
import tqdm

In [2]:
# Configuration: which dataset variant to combine and where the data lives.
# The arguments are supplied as a literal string instead of being read from sys.argv.
parser = argparse.ArgumentParser(description='Program to combine pickle data into one file for all settings')

# `choices` rejects an unknown setting immediately. Without it, none of the branches
# below would assign pkl_files and the cell would fail later with a NameError.
parser.add_argument('--setting', type=str, choices=['comp', 'mod', 'head', 'phr', 'const'],
                    help='comp:Compounds mod:Modifiers head:Heads phr:Phrases const:Constituent')
parser.add_argument('--spath', type=str,
                    help='directory where to save output')

args = parser.parse_args("--setting const --spath /data/dharp/compounds/datasets/".split())


# Collect the input pickles for the chosen setting and load the vocabularies it needs.
# Note that `modifiers`, `heads` and `constituents` only exist for the settings that
# load them; code that runs later must reference the one matching args.setting.
if args.setting=="comp":
    pkl_files=glob.glob(f'{args.spath}/compounds/*pkl')
    modifiers=pd.read_pickle(f'{args.spath}/contexts/modifiers.pkl')
    heads=pd.read_pickle(f'{args.spath}/contexts/heads.pkl')

elif args.setting=="mod":
    pkl_files=glob.glob(f'{args.spath}/modifiers/*pkl')
    modifiers=pd.read_pickle(f'{args.spath}/contexts/modifiers.pkl')

elif args.setting=="head":
    pkl_files=glob.glob(f'{args.spath}/heads/*pkl')
    heads=pd.read_pickle(f'{args.spath}/contexts/heads.pkl')

elif args.setting=="phr":
    # No vocabulary is loaded for phrases.
    pkl_files=glob.glob(f'{args.spath}/phrases/*pkl')

elif args.setting=="const":
    pkl_files=glob.glob(f'{args.spath}/words/*pkl')
    constituents=pd.read_pickle(f'{args.spath}/contexts/constituents.pkl')


# Shuffled in place; no seed is set in this cell, so batch composition differs between runs.
random.shuffle(pkl_files)


# Context vocabulary used for every setting.
contexts=pd.read_pickle(f'{args.spath}/contexts/contexts_top50k.pkl')


# Split the file list into 15 batches of near-equal size (batches may be empty
# when there are fewer than 15 files).
div_lsts=np.array_split(pkl_files, 15)

In [3]:
# Number of entries in the constituent vocabulary (only loaded when args.setting == "const").
len(constituents)

1128053

In [4]:
# Number of pickle files in the first batch.
len(div_lsts[0])

8

In [5]:
def mem_reducer(pkl_file):
    """Load one pickled frame, restrict it to the known vocabularies and aggregate counts.

    Steps:
      1. Read the pickle and keep only rows with ``year >= 1800``.
      2. Recode ``context`` as a categorical over the global ``contexts`` vocabulary;
         values outside the vocabulary become NaN.
      3. For the settings "comp", "mod", "head" and "const", recode the key column(s)
         the same way against the matching vocabulary, drop every row that contains
         a NaN, and sum ``count`` per (key column(s), context, year).
         For any other setting (e.g. "phr") the frame is returned after step 2
         without dropping or aggregating.

    Relies on the notebook globals ``args``, ``contexts`` and, depending on
    ``args.setting``, ``modifiers``, ``heads`` or ``constituents``.

    Parameters
    ----------
    pkl_file : str
        Path of the pickled DataFrame to process.

    Returns
    -------
    pandas.DataFrame
        The reduced frame.
    """
    print(f'\nStarted with file {pkl_file}\n')
    cur_time=time.time()

    # NOTE: read_pickle can execute arbitrary code; only use it on trusted files.
    df=pd.read_pickle(pkl_file)
    orig_shape=df.shape[0]
    df["year"] = pd.to_numeric(df["year"], downcast="unsigned")
    df=df.loc[df['year']>=1800]
    df['context']=pd.Categorical(df['context'],categories=contexts)

    # Key column -> vocabulary, in the order used for grouping. The vocabularies are
    # looked up inside the branches because each one only exists for its own setting.
    if args.setting=="comp":
        key_vocab={'modifier':modifiers,'head':heads}
    elif args.setting=="mod":
        key_vocab={'modifier':modifiers}
    elif args.setting=="head":
        key_vocab={'head':heads}
    elif args.setting=="const":
        # Fixed: this previously used `heads`, which the "const" setting never loads.
        key_vocab={'word':constituents}
    else:
        key_vocab={}

    if key_vocab:
        for col,vocab in key_vocab.items():
            df[col]=pd.Categorical(df[col],categories=vocab)
        # Rows whose key or context fell outside the vocabularies are NaN by now.
        df=df.dropna()
        group_cols=[*key_vocab,'context','year']
        df=df.groupby(group_cols,observed=True)['count'].sum().to_frame().reset_index()

    # Guard against an empty input frame, which would otherwise divide by zero.
    pct_kept=round(df.shape[0]/orig_shape*100,2) if orig_shape else 0.0
    print(f'Done with file {pkl_file} in {round(time.time()-cur_time)} secs and current size is {pct_kept}% of the original dataset')
    return df

In [11]:
def batch_processor(cur_list):
    """Run ``mem_reducer`` over a batch of pickle files in parallel and merge the results.

    Each file is reduced in a worker process; the per-file frames are concatenated
    and ``count`` is summed again per key so that keys occurring in several files
    of the batch collapse into one row.

    Relies on the notebook global ``args`` for the active setting.

    Parameters
    ----------
    cur_list : sequence of str
        Paths of the pickle files that make up this batch.

    Returns
    -------
    pandas.DataFrame
        One row per key combination with the summed ``count``. An empty batch
        yields an empty frame with the expected columns.

    Raises
    ------
    ValueError
        If no aggregation keys are defined for ``args.setting``.
    """
    group_keys_by_setting={
        'comp':['modifier','head','context','year'],
        'mod':['modifier','context','year'],
        'head':['head','context','year'],
        'const':['word','context','year'],
    }
    # Fail before any expensive work; previously an unsupported setting only
    # surfaced as an unbound `df_reduced` after the whole batch had been processed.
    if args.setting not in group_keys_by_setting:
        raise ValueError(f'No aggregation keys defined for setting {args.setting!r}')
    group_keys=group_keys_by_setting[args.setting]

    # np.array_split can hand out empty batches; pd.concat would raise on an empty list.
    if len(cur_list)==0:
        return pd.DataFrame(columns=group_keys+['count'])

    dfs=[]
    # Leave one core free, cap at 50 workers, but never ask for fewer than one
    # (Pool rejects a process count below 1, which happened on single-core hosts).
    n_proc = max(1,min(50,mp.cpu_count()-1))
    # The context manager terminates the workers if a task raises.
    with Pool(n_proc) as pool:
        for result in tqdm.tqdm(pool.imap_unordered(mem_reducer, cur_list), total=len(cur_list)):
            dfs.append(result)
        pool.close()
        pool.join()

    combined_df=pd.concat(dfs,ignore_index=True,sort=True)

    df_reduced=combined_df.groupby(group_keys,observed=True)['count'].sum().to_frame().reset_index()
    return df_reduced

In [None]:
# Reduce the batches one at a time and collect the per-batch frames for a later concat.
all_df_files=[]
# NOTE(review): div_lsts[:1] limits the run to the first batch only — confirm whether
# this is a leftover from testing or whether all 15 batches should be processed.
for i,cur_list in enumerate(div_lsts[:1]):
    print(f'Batch {i+1}')
    cur_time=time.time()
    cur_df=batch_processor(cur_list)
    all_df_files.append(cur_df)
    # Wall-clock time for this batch, including pool start-up.
    print(f'Time taken for batch {i+1} = {time.time()-cur_time} secs')

Batch 1


  0%|                                                                                                                                                                               | 0/8 [00:00<?, ?it/s]


Started with file /data/dharp/compounds/datasets//words/df_15.pkl

Started with file /data/dharp/compounds/datasets//words/df_24.pkl

Started with file /data/dharp/compounds/datasets//words/df_30.pkl

Started with file /data/dharp/compounds/datasets//words/df_29.pkl

Started with file /data/dharp/compounds/datasets//words/df_79.pkl

Started with file /data/dharp/compounds/datasets//words/df_58.pkl

Started with file /data/dharp/compounds/datasets//words/df_60.pkl







Started with file /data/dharp/compounds/datasets//words/df_22.pkl




In [7]:
# Stack the per-batch frames. Each batch was aggregated on its own, so the same key
# can still appear once per batch; a final groupby over combined_df merges those rows.
combined_df=pd.concat(all_df_files,ignore_index=True,sort=True)


In [8]:
# Row count, dtypes and memory footprint before the final aggregation.
# NOTE(review): the recorded output lists a `head` column, while the configuration cell
# selects the "const" setting (key column `word`) — the output looks like it comes from
# an earlier run with a different setting; re-run to refresh.
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2475574335 entries, 0 to 2475574334
Data columns (total 4 columns):
 #   Column   Dtype   
---  ------   -----   
 0   context  category
 1   count    int64   
 2   head     category
 3   year     uint64  
dtypes: category(2), int64(1), uint64(1)
memory usage: 55.4 GB


In [9]:
# Show the combined frame (pandas renders a truncated head/tail view).
combined_df

Unnamed: 0,context,count,head,year
0,a_NOUN,6,a_NOUN,1808
1,a_NOUN,1,a_NOUN,1810
2,a_NOUN,1,a_NOUN,1814
3,a_NOUN,2,a_NOUN,1815
4,a_NOUN,1,a_NOUN,1816
...,...,...,...,...
2475574330,manuel_PROPN,7,zymelman_PROPN,2008
2475574331,manuel_PROPN,1,zymelman_PROPN,2009
2475574332,manuel_PROPN,2,zymelman_PROPN,2010
2475574333,manuel_PROPN,3,zymelman_PROPN,2011


In [13]:
# Fraction of rows left after the final aggregation: the row count reported by
# df_reduced.info() divided by the row count reported by combined_df.info().
1538367019/2475574335

0.6214182289945255

In [10]:
    # Final aggregation across batches: sum `count` for keys that occur in more than
    # one batch. The key columns depend on the active setting.
    if args.setting=="comp":
        df_reduced=combined_df.groupby(['modifier','head','context','year'],observed=True)['count'].sum().to_frame().reset_index()

    elif args.setting=="mod":
        df_reduced=combined_df.groupby(['modifier','context','year'],observed=True)['count'].sum().to_frame().reset_index()

    elif args.setting=="head":
        df_reduced=combined_df.groupby(['head','context','year'],observed=True)['count'].sum().to_frame().reset_index()

    elif args.setting=="const":
        # Added: the "const" setting had no branch here, so df_reduced was never assigned.
        df_reduced=combined_df.groupby(['word','context','year'],observed=True)['count'].sum().to_frame().reset_index()

    else:
        # Fail here with a clear message instead of a NameError in a later cell.
        raise ValueError(f'No aggregation keys defined for setting {args.setting!r}')

In [11]:
# Row count, dtypes and memory footprint after the final aggregation.
df_reduced.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1538367019 entries, 0 to 1538367018
Data columns (total 4 columns):
 #   Column   Dtype   
---  ------   -----   
 0   head     category
 1   context  category
 2   year     uint64  
 3   count    int64   
dtypes: category(2), int64(1), uint64(1)
memory usage: 34.4 GB


In [14]:
# Persist the reduced frame.
# NOTE(review): the target path is hard-coded to heads/heads_1.pkl and does not depend on
# args.setting or args.spath. It also lies inside the directory that the "head" setting
# globs for `*pkl` inputs — confirm a later run will not read this output back in as input.
df_reduced.to_pickle('/data/dharp/compounds/datasets/heads/heads_1.pkl')