In [1]:
import random
import pandas as pd
import glob
import os
import numpy as np
import pickle as pkl
import time
import multiprocessing as mp
from multiprocessing import Pool
import re
import argparse
import tqdm

In [2]:
# Configuration cell: parse the run settings, list the pickle shards for the chosen
# setting, and load the vocabularies that mem_reducer uses as categorical categories.
parser = argparse.ArgumentParser(description='Program to combine pickle data into one file for all settings')

parser.add_argument('--setting', type=str,
                    help='comp:Compounds mod:Modifiers head:Heads phr:Phrases const:Constituent')
parser.add_argument('--spath', type=str,
                    help='directory where to save output')

# The arguments are passed as a literal string instead of being read from sys.argv.
args = parser.parse_args("--setting mod --spath /data/dharp/compounds/datasets/".split())

# Sub-directory of args.spath that holds the pickle shards for each setting.
SETTING_DIRS = {
    "comp": "compounds",
    "mod": "modifiers",
    "head": "heads",
    "phr": "phrases",
    "const": "words",
}

# Fail here with a clear message; otherwise an unknown setting would only surface
# further down as a NameError on `pkl_files`.
if args.setting not in SETTING_DIRS:
    raise ValueError(
        f"Unknown --setting {args.setting!r}; expected one of {sorted(SETTING_DIRS)}"
    )

# os.path.join avoids the doubled '/' that f-string concatenation produces when
# args.spath already ends with a separator.
pkl_files = glob.glob(os.path.join(args.spath, SETTING_DIRS[args.setting], '*pkl'))

# Only load the vocabularies the chosen setting needs.
if args.setting in ("comp", "mod"):
    modifiers = pd.read_pickle(os.path.join(args.spath, 'contexts', 'modifiers.pkl'))
if args.setting in ("comp", "head"):
    heads = pd.read_pickle(os.path.join(args.spath, 'contexts', 'heads.pkl'))

# Randomise processing order (in place, unseeded).
random.shuffle(pkl_files)

contexts = pd.read_pickle(os.path.join(args.spath, 'contexts', 'contexts_top50k.pkl'))

# Split the shuffled file list into three roughly equal parts.
div_lsts = np.array_split(pkl_files, 3)

In [3]:
# Number of pickle files in the first of the three parts produced by np.array_split.
len(div_lsts[0])

39

In [4]:
def mem_reducer(pkl_file):
    """Load one pickled frame, restrict it to the known vocabulary and aggregate counts.

    Steps:
      1. Read the DataFrame stored at ``pkl_file``.
      2. Convert ``year`` to the smallest unsigned integer dtype that fits and keep
         only rows with ``year >= 1800``.
      3. Turn ``context`` into a categorical over the global ``contexts``; values
         outside that list become NaN.
      4. For the ``comp``, ``mod`` and ``head`` settings, do the same for
         ``modifier`` and/or ``head`` (globals ``modifiers`` / ``heads``), drop every
         row containing a NaN, and sum ``count`` per (key columns, context, year).

    For any other value of the global ``args.setting`` the frame is returned after
    step 3, without dropping NaN rows or aggregating.

    Parameters
    ----------
    pkl_file : str
        Path of the pickle file to process.

    Returns
    -------
    pandas.DataFrame
        The reduced frame. For the aggregated settings it has the key columns
        followed by ``count``.
    """
    df = pd.read_pickle(pkl_file)
    df["year"] = pd.to_numeric(df["year"], downcast="unsigned")

    # Filter first, then detach the result in a second statement. The detached copy
    # makes the column assignments below act on an independent frame (no
    # SettingWithCopyWarning). Doing it in two steps lets the unfiltered frame be
    # released before the copy is made.
    df = df.loc[df["year"] >= 1800]
    df = df.copy()

    df["context"] = pd.Categorical(df["context"], categories=contexts)

    if args.setting == "comp":
        df["modifier"] = pd.Categorical(df["modifier"], categories=modifiers)
        df["head"] = pd.Categorical(df["head"], categories=heads)
        group_keys = ["modifier", "head", "context", "year"]
    elif args.setting == "mod":
        df["modifier"] = pd.Categorical(df["modifier"], categories=modifiers)
        group_keys = ["modifier", "context", "year"]
    elif args.setting == "head":
        df["head"] = pd.Categorical(df["head"], categories=heads)
        group_keys = ["head", "context", "year"]
    else:
        # NOTE(review): the other settings are returned un-aggregated, as before;
        # confirm whether they should be reduced as well.
        return df

    # Rows whose values fell outside the category lists are NaN now; drop them.
    df = df.dropna()

    # observed=True restricts the result to key combinations that actually occur
    # instead of the full product of all categories.
    return df.groupby(group_keys, observed=True)["count"].sum().to_frame().reset_index()

In [None]:
# Run mem_reducer over every pickle file in a worker pool and collect the reduced
# frames in completion order (imap_unordered does not preserve input order).
results = []

# Never start more workers than there are files; keep at least one so Pool() is valid.
n_proc = max(1, min(50, len(pkl_files)))

# The context manager tears the pool down even when a worker raises or the cell is
# interrupted. Frames appended before a failure stay available in `results`.
with Pool(n_proc) as pool:
    for result in tqdm.tqdm(pool.imap_unordered(mem_reducer, pkl_files), total=len(pkl_files)):
        results.append(result)

  0%|                                                   | 0/115 [00:00<?, ?it/s]


Started with file /data/dharp/compounds/datasets//modifiers/df_113.pkl

Started with file /data/dharp/compounds/datasets//modifiers/df_29.pkl

Started with file /data/dharp/compounds/datasets//modifiers/df_115.pkl

Started with file /data/dharp/compounds/datasets//modifiers/df_62.pkl

Started with file /data/dharp/compounds/datasets//modifiers/df_68.pkl

Started with file /data/dharp/compounds/datasets//modifiers/df_55.pkl


Started with file /data/dharp/compounds/datasets//modifiers/df_35.pkl

Started with file /data/dharp/compounds/datasets//modifiers/df_49.pkl

Started with file /data/dharp/compounds/datasets//modifiers/df_56.pkl


Started with file /data/dharp/compounds/datasets//modifiers/df_37.pkl

Started with file /data/dharp/compounds/datasets//modifiers/df_6.pkl

Started with file /data/dharp/compounds/datasets//modifiers/df_103.pkl

Started with file /data/dharp/compounds/datasets//modifiers/df_84.pkl


Started with file /data/dharp/compounds/datasets//modifiers/df_5.pkl




  1%|▎                                          | 1/115 [00:24<46:32, 24.49s/it]

Done with file /data/dharp/compounds/datasets//modifiers/df_14.pkl in 39 secs and current size is 38.89% of the original dataset

Started with file /data/dharp/compounds/datasets//modifiers/df_98.pkl



  2%|▋                                          | 2/115 [00:39<36:02, 19.14s/it]

Done with file /data/dharp/compounds/datasets//modifiers/df_66.pkl in 46 secs and current size is 47.89% of the original dataset

Started with file /data/dharp/compounds/datasets//modifiers/df_100.pkl



  3%|█                                          | 3/115 [00:47<25:30, 13.67s/it]

Done with file /data/dharp/compounds/datasets//modifiers/df_29.pkl in 50 secs and current size is 53.09% of the original dataset
Done with file /data/dharp/compounds/datasets//modifiers/df_62.pkl in 50 secs and current size is 39.97% of the original dataset

Started with file /data/dharp/compounds/datasets//modifiers/df_76.pkl



  3%|█▍                                         | 4/115 [00:51<18:32, 10.03s/it]


Done with file /data/dharp/compounds/datasets//modifiers/df_68.pkl in 51 secs and current size is 51.42% of the original dataset
Started with file /data/dharp/compounds/datasets//modifiers/df_94.pkl



  4%|█▊                                         | 5/115 [00:52<12:12,  6.66s/it]

Done with file /data/dharp/compounds/datasets//modifiers/df_71.pkl in 52 secs and current size is 48.09% of the original dataset

Started with file /data/dharp/compounds/datasets//modifiers/df_64.pkl



  5%|██▏                                        | 6/115 [00:52<08:27,  4.65s/it]


Started with file /data/dharp/compounds/datasets//modifiers/df_82.pkl



  6%|██▌                                        | 7/115 [00:53<06:13,  3.45s/it]

Done with file /data/dharp/compounds/datasets//modifiers/df_10.pkl in 56 secs and current size is 47.28% of the original dataset

Started with file /data/dharp/compounds/datasets//modifiers/df_95.pkl



  7%|██▉                                        | 8/115 [00:57<06:19,  3.54s/it]

Done with file /data/dharp/compounds/datasets//modifiers/df_74.pkl in 57 secs and current size is 46.21% of the original dataset
Done with file /data/dharp/compounds/datasets//modifiers/df_40.pkl in 58 secs and current size is 54.84% of the original dataset

Started with file /data/dharp/compounds/datasets//modifiers/df_67.pkl



  8%|███▎                                       | 9/115 [00:58<04:56,  2.80s/it]

Done with file /data/dharp/compounds/datasets//modifiers/df_53.pkl in 59 secs and current size is 46.53% of the original dataset

Started with file /data/dharp/compounds/datasets//modifiers/df_46.pkl



  9%|███▋                                      | 10/115 [00:59<03:56,  2.25s/it]


Started with file /data/dharp/compounds/datasets//modifiers/df_47.pkl



 10%|████                                      | 11/115 [01:00<03:13,  1.86s/it]

Done with file /data/dharp/compounds/datasets//modifiers/df_83.pkl in 63 secs and current size is 50.33% of the original dataset

Started with file /data/dharp/compounds/datasets//modifiers/df_96.pkl



 10%|████▍                                     | 12/115 [01:05<04:26,  2.59s/it]

Done with file /data/dharp/compounds/datasets//modifiers/df_88.pkl in 65 secs and current size is 54.34% of the original dataset

Started with file /data/dharp/compounds/datasets//modifiers/df_26.pkl



 11%|████▋                                     | 13/115 [01:07<04:09,  2.45s/it]

Done with file /data/dharp/compounds/datasets//modifiers/df_6.pkl in 71 secs and current size is 47.04% of the original dataset

Started with file /data/dharp/compounds/datasets//modifiers/df_89.pkl



 12%|█████                                     | 14/115 [01:12<05:23,  3.20s/it]

Done with file /data/dharp/compounds/datasets//modifiers/df_55.pkl in 74 secs and current size is 45.05% of the original dataset

Started with file /data/dharp/compounds/datasets//modifiers/df_78.pkl



 13%|█████▍                                    | 15/115 [01:15<05:29,  3.29s/it]

Done with file /data/dharp/compounds/datasets//modifiers/df_56.pkl in 78 secs and current size is 45.25% of the original dataset

Started with file /data/dharp/compounds/datasets//modifiers/df_8.pkl



 14%|█████▊                                    | 16/115 [01:19<05:44,  3.48s/it]

Done with file /data/dharp/compounds/datasets//modifiers/df_98.pkl in 62 secs and current size is 54.05% of the original dataset

Started with file /data/dharp/compounds/datasets//modifiers/df_24.pkl



 15%|██████▏                                   | 17/115 [01:42<15:17,  9.36s/it]

Done with file /data/dharp/compounds/datasets//modifiers/df_26.pkl in 38 secs and current size is 53.34% of the original dataset

Started with file /data/dharp/compounds/datasets//modifiers/df_13.pkl



 16%|██████▌                                   | 18/115 [01:46<12:19,  7.63s/it]

Done with file /data/dharp/compounds/datasets//modifiers/df_82.pkl in 54 secs and current size is 46.13% of the original dataset

Started with file /data/dharp/compounds/datasets//modifiers/df_23.pkl



 17%|██████▉                                   | 19/115 [01:48<09:52,  6.17s/it]

Done with file /data/dharp/compounds/datasets//modifiers/df_89.pkl in 51 secs and current size is 51.48% of the original dataset

Started with file /data/dharp/compounds/datasets//modifiers/df_73.pkl



 17%|███████▎                                  | 20/115 [02:04<14:08,  8.93s/it]

Done with file /data/dharp/compounds/datasets//modifiers/df_47.pkl in 73 secs and current size is 47.56% of the original dataset

Started with file /data/dharp/compounds/datasets//modifiers/df_77.pkl



 18%|███████▋                                  | 21/115 [02:15<15:08,  9.66s/it]

Done with file /data/dharp/compounds/datasets//modifiers/df_24.pkl in 43 secs and current size is 50.9% of the original dataset

Started with file /data/dharp/compounds/datasets//modifiers/df_80.pkl



 19%|████████                                  | 22/115 [02:26<15:26,  9.96s/it]

Done with file /data/dharp/compounds/datasets//modifiers/df_113.pkl in 153 secs and current size is 49.51% of the original dataset

Started with file /data/dharp/compounds/datasets//modifiers/df_17.pkl



 20%|████████▍                                 | 23/115 [02:35<15:01,  9.80s/it]

Done with file /data/dharp/compounds/datasets//modifiers/df_73.pkl in 43 secs and current size is 49.78% of the original dataset

Started with file /data/dharp/compounds/datasets//modifiers/df_110.pkl



 21%|████████▊                                 | 24/115 [02:48<16:08, 10.65s/it]

Done with file /data/dharp/compounds/datasets//modifiers/df_111.pkl in 170 secs and current size is 51.0% of the original dataset

Started with file /data/dharp/compounds/datasets//modifiers/df_36.pkl



 22%|█████████▏                                | 25/115 [02:53<13:35,  9.06s/it]

Done with file /data/dharp/compounds/datasets//modifiers/df_8.pkl in 416 secs and current size is 42.75% of the original dataset

Started with file /data/dharp/compounds/datasets//modifiers/df_101.pkl



 23%|████████▊                              | 26/115 [08:20<2:34:56, 104.45s/it]

Done with file /data/dharp/compounds/datasets//modifiers/df_13.pkl in 447 secs and current size is 45.83% of the original dataset

Started with file /data/dharp/compounds/datasets//modifiers/df_11.pkl



 23%|█████████▍                              | 27/115 [09:14<2:10:40, 89.10s/it]

In [6]:
    # `results` is the plain list filled by the pool loop, so it is used directly
    # (a list has no .get(); that call only exists on multiprocessing async results).
    dfs=results
    combined_df=pd.concat(dfs,ignore_index=True,sort=True)

    # The same (modifier, context, year) key can occur in several input files, so the
    # counts are summed once more across the combined frame. The key columns match
    # the "mod" setting.
    df_reduced=combined_df.groupby(['modifier','context','year'],observed=True)['count'].sum().to_frame().reset_index()
    

In [7]:
# Print column dtypes, row count and memory footprint of the aggregated frame.
df_reduced.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 837260093 entries, 0 to 837260092
Data columns (total 4 columns):
 #   Column    Dtype   
---  ------    -----   
 0   modifier  category
 1   context   category
 2   year      uint64  
 3   count     int64   
dtypes: category(2), int64(1), uint64(1)
memory usage: 18.7 GB


In [8]:
# Display the aggregated frame as the cell output.
df_reduced

Unnamed: 0,modifier,context,year,count
0,a_NOUN,=_NOUN,1805,1
1,a_NOUN,=_NOUN,1807,3
2,a_NOUN,=_NOUN,1810,2
3,a_NOUN,=_NOUN,1813,1
4,a_NOUN,=_NOUN,1815,9
...,...,...,...,...
837260088,zitza_PROPN,far_ADV,1972,1
837260089,zitza_PROPN,far_ADV,1980,2
837260090,zitza_PROPN,far_ADV,1986,2
837260091,zitza_PROPN,far_ADV,2006,1


In [24]:
%time
df_normal=trial_df.groupby(['modifier','head','context','year'])['count'].sum().to_frame().reset_index()

CPU times: user 5 µs, sys: 1e+03 ns, total: 6 µs
Wall time: 14.3 µs


In [15]:
# Display the concatenated (not yet re-aggregated) frame as the cell output.
combined_df

Unnamed: 0,context,count,modifier,year
0,=_NOUN,1,a_NOUN,1805
1,=_NOUN,7,a_NOUN,1879
2,=_NOUN,1,a_NOUN,1884
3,=_NOUN,1,a_NOUN,1887
4,=_NOUN,2,a_NOUN,1893
...,...,...,...,...
515273351,stock_NOUN,2,zyx_PROPN,2012
515273352,stock_NOUN,2,zyx_PROPN,2013
515273353,stock_NOUN,3,zyx_PROPN,2014
515273354,stock_NOUN,2,zyx_PROPN,2015


In [16]:
# Ad-hoc ratio of two hard-coded numbers.
# NOTE(review): these look like row counts — confirm their source and compute them from the frames instead.
337465248/515273356

0.6549246998131221

In [18]:
def summer(df):
    """Return the sum of the ``count`` column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame (or group) that has a ``count`` column.

    Returns
    -------
    The result of ``df['count'].sum()``.
    """
    # Use the parameter; the previous body referenced an undefined name `db`.
    return df['count'].sum()

In [29]:
# Apply `summer` to every (modifier, head, context, year) group.
# NOTE(review): `parallel_apply` is not a pandas method; it presumably comes from pandarallel,
# which is neither imported nor initialised in the visible cells — confirm.
# NOTE(review): `trial_df` is not defined in the visible cells either.
df_parallel=trial_df.groupby(['modifier','head','context','year']).parallel_apply(summer)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=124958), Label(value='0 / 124958')…

EOFError: Ran out of input