In [1]:
import random
import pandas as pd
import glob
import numpy as np
import time
import multiprocessing as mp
from multiprocessing import Pool

In [2]:
pkl_files=glob.glob('/data/dharp/compounds/datasets/phrases/*pkl')
random.shuffle(pkl_files)
len(pkl_files)

115

In [3]:
div_lsts=np.array_split(pkl_files, 5)

In [4]:
len(div_lsts[0])

23

In [5]:
keep_string=r"^.+_(PROPN|NOUN|ADJ|VERB||NUM|ADV)$"

In [6]:
def mem_reducer(pkl_file):
    print(f'\nStarted with file {pkl_file}\n')
    cur_time=time.time()
    
    df=pd.read_pickle(pkl_file)
    df=df.groupby(['context','year'])['count'].sum().to_frame().reset_index()

    context_df=pd.DataFrame(df['context'].unique())
    context_df=context_df.loc[~context_df[0].str.contains('@@@')]
    context_words=context_df.loc[context_df[0].str.match(keep_string)][0].to_list()
    df=df.loc[df.context.isin(context_words)]
    df.context=df.context.str.replace(r'.+_NUM','NUM',regex=True)
    df["year"] = pd.to_numeric(df["year"], downcast="unsigned")
    df=df.groupby(['context','year'])['count'].sum().to_frame().reset_index()
    print(f'Done with file {pkl_file} in {round(time.time()-cur_time)} secs')
    return df

In [7]:
entire_context_df_list=[]
for i,cur_list in enumerate(div_lsts):
    print(f'List num {i+1}')
    n_proc = len(cur_list)

    pool = Pool(n_proc)
    results=pool.map_async(mem_reducer,cur_list) 
    pool.close()
    pool.join()
    
    dfs=results.get()
    
    cur_df=pd.concat(dfs,ignore_index=True,sort=True)
    
    display(cur_df.info())
    
    cur_df=cur_df.groupby(['context','year'])['count'].sum().to_frame().reset_index()
    
    display(cur_df.info())
    
    entire_context_df_list.append(cur_df)

List num 1

Started with file /data/dharp/compounds/datasets/phrases/df_39.pkl

Started with file /data/dharp/compounds/datasets/phrases/df_29.pkl

Started with file /data/dharp/compounds/datasets/phrases/df_67.pkl

Started with file /data/dharp/compounds/datasets/phrases/df_31.pkl

Started with file /data/dharp/compounds/datasets/phrases/df_114.pkl

Started with file /data/dharp/compounds/datasets/phrases/df_99.pkl

Started with file /data/dharp/compounds/datasets/phrases/df_30.pkl



Started with file /data/dharp/compounds/datasets/phrases/df_22.pkl



Started with file /data/dharp/compounds/datasets/phrases/df_12.pkl

Started with file /data/dharp/compounds/datasets/phrases/df_78.pkl

Started with file /data/dharp/compounds/datasets/phrases/df_111.pkl

Started with file /data/dharp/compounds/datasets/phrases/df_75.pkl

Started with file /data/dharp/compounds/datasets/phrases/df_38.pkl

Started with file /data/dharp/compounds/datasets/phrases/df_66.pkl


Started with file /data/dharp

None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41078291 entries, 0 to 41078290
Data columns (total 3 columns):
 #   Column   Dtype 
---  ------   ----- 
 0   context  object
 1   year     uint64
 2   count    int64 
dtypes: int64(1), object(1), uint64(1)
memory usage: 940.2+ MB


None

List num 2

Started with file /data/dharp/compounds/datasets/phrases/df_113.pkl

Started with file /data/dharp/compounds/datasets/phrases/df_63.pkl

Started with file /data/dharp/compounds/datasets/phrases/df_89.pkl

Started with file /data/dharp/compounds/datasets/phrases/df_73.pkl

Started with file /data/dharp/compounds/datasets/phrases/df_81.pkl

Started with file /data/dharp/compounds/datasets/phrases/df_35.pkl

Started with file /data/dharp/compounds/datasets/phrases/df_14.pkl

Started with file /data/dharp/compounds/datasets/phrases/df_48.pkl


Started with file /data/dharp/compounds/datasets/phrases/df_1.pkl

Started with file /data/dharp/compounds/datasets/phrases/df_23.pkl

Started with file /data/dharp/compounds/datasets/phrases/df_32.pkl

Started with file /data/dharp/compounds/datasets/phrases/df_58.pkl


Started with file /data/dharp/compounds/datasets/phrases/df_64.pkl

Started with file /data/dharp/compounds/datasets/phrases/df_97.pkl


Started with file /data/dharp/com

None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39087070 entries, 0 to 39087069
Data columns (total 3 columns):
 #   Column   Dtype 
---  ------   ----- 
 0   context  object
 1   year     uint64
 2   count    int64 
dtypes: int64(1), object(1), uint64(1)
memory usage: 894.6+ MB


None

List num 3

Started with file /data/dharp/compounds/datasets/phrases/df_21.pkl

Started with file /data/dharp/compounds/datasets/phrases/df_50.pkl

Started with file /data/dharp/compounds/datasets/phrases/df_9.pkl

Started with file /data/dharp/compounds/datasets/phrases/df_69.pkl

Started with file /data/dharp/compounds/datasets/phrases/df_54.pkl

Started with file /data/dharp/compounds/datasets/phrases/df_40.pkl

Started with file /data/dharp/compounds/datasets/phrases/df_88.pkl

Started with file /data/dharp/compounds/datasets/phrases/df_94.pkl

Started with file /data/dharp/compounds/datasets/phrases/df_68.pkl

Started with file /data/dharp/compounds/datasets/phrases/df_72.pkl

Started with file /data/dharp/compounds/datasets/phrases/df_42.pkl

Started with file /data/dharp/compounds/datasets/phrases/df_93.pkl

Started with file /data/dharp/compounds/datasets/phrases/df_86.pkl

Started with file /data/dharp/compounds/datasets/phrases/df_5.pkl

Started with file /data/dharp/compound

None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36204589 entries, 0 to 36204588
Data columns (total 3 columns):
 #   Column   Dtype 
---  ------   ----- 
 0   context  object
 1   year     uint64
 2   count    int64 
dtypes: int64(1), object(1), uint64(1)
memory usage: 828.7+ MB


None

List num 4

Started with file /data/dharp/compounds/datasets/phrases/df_101.pkl

Started with file /data/dharp/compounds/datasets/phrases/df_71.pkl

Started with file /data/dharp/compounds/datasets/phrases/df_33.pkl

Started with file /data/dharp/compounds/datasets/phrases/df_91.pkl

Started with file /data/dharp/compounds/datasets/phrases/df_90.pkl

Started with file /data/dharp/compounds/datasets/phrases/df_24.pkl

Started with file /data/dharp/compounds/datasets/phrases/df_6.pkl

Started with file /data/dharp/compounds/datasets/phrases/df_57.pkl


Started with file /data/dharp/compounds/datasets/phrases/df_20.pkl

Started with file /data/dharp/compounds/datasets/phrases/df_115.pkl

Started with file /data/dharp/compounds/datasets/phrases/df_85.pkl

Started with file /data/dharp/compounds/datasets/phrases/df_41.pkl



Started with file /data/dharp/compounds/datasets/phrases/df_108.pkl


Started with file /data/dharp/compounds/datasets/phrases/df_61.pkl

Started with file /data/dharp/

None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35302301 entries, 0 to 35302300
Data columns (total 3 columns):
 #   Column   Dtype 
---  ------   ----- 
 0   context  object
 1   year     uint64
 2   count    int64 
dtypes: int64(1), object(1), uint64(1)
memory usage: 808.0+ MB


None

List num 5

Started with file /data/dharp/compounds/datasets/phrases/df_70.pkl

Started with file /data/dharp/compounds/datasets/phrases/df_34.pkl

Started with file /data/dharp/compounds/datasets/phrases/df_17.pkl

Started with file /data/dharp/compounds/datasets/phrases/df_83.pkl

Started with file /data/dharp/compounds/datasets/phrases/df_55.pkl

Started with file /data/dharp/compounds/datasets/phrases/df_49.pkl

Started with file /data/dharp/compounds/datasets/phrases/df_79.pkl

Started with file /data/dharp/compounds/datasets/phrases/df_105.pkl

Started with file /data/dharp/compounds/datasets/phrases/df_3.pkl

Started with file /data/dharp/compounds/datasets/phrases/df_107.pkl

Started with file /data/dharp/compounds/datasets/phrases/df_13.pkl

Started with file /data/dharp/compounds/datasets/phrases/df_102.pkl

Started with file /data/dharp/compounds/datasets/phrases/df_16.pkl


Started with file /data/dharp/compounds/datasets/phrases/df_46.pkl


Started with file /data/dharp/co

None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45236503 entries, 0 to 45236502
Data columns (total 3 columns):
 #   Column   Dtype 
---  ------   ----- 
 0   context  object
 1   year     uint64
 2   count    int64 
dtypes: int64(1), object(1), uint64(1)
memory usage: 1.0+ GB


None

In [8]:
context_df=pd.concat(entire_context_df_list,sort=False,ignore_index=True)
context_df.info()
context_df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 196908754 entries, 0 to 196908753
Data columns (total 3 columns):
 #   Column   Dtype 
---  ------   ----- 
 0   context  object
 1   year     uint64
 2   count    int64 
dtypes: int64(1), object(1), uint64(1)
memory usage: 4.4+ GB


Unnamed: 0,context,year,count
0,#_ADV,1774,2
1,#_ADV,1832,5
2,#_ADV,1842,2
3,#_ADV,1851,6
4,#_ADV,1857,1
...,...,...,...
196908749,﹑_PROPN,2007,4
196908750,😊_ADJ,2016,5
196908751,😊_ADJ,2017,20
196908752,😊_ADJ,2018,21


In [9]:
context_df=context_df.groupby(['context','year'])['count'].sum().to_frame().reset_index()
context_df

Unnamed: 0,context,year,count
0,"""_PROPN",1969,4
1,"""_PROPN",1970,13
2,"""_PROPN",1971,2
3,"""_PROPN",1972,9
4,"""_PROPN",1975,7
...,...,...,...
79767104,😊_ADJ,2019,20
79767105,😊_NOUN,2016,5
79767106,😊_NOUN,2017,20
79767107,😊_NOUN,2018,21


In [11]:
presence_df=context_df.query('1800 <= year <= 2020').context.value_counts().to_frame()

TypeError: Iterator operand 1 dtype could not be cast from dtype('uint64') to dtype('int64') according to the rule 'safe'

In [39]:
presence_df.loc[presence_df.context>200]

Unnamed: 0,context
rate_NOUN,220
remote_ADV,220
beggar_PROPN,220
beggar_NOUN,220
jocund_ADJ,220
...,...
champs_PROPN,201
esch_PROPN,201
consecutive_ADJ,201
internus_PROPN,201


In [12]:
years_df=context_df.groupby(['year'])['count'].sum().to_frame().reset_index()
years_df

Unnamed: 0,year,count
0,1470,11
1,1472,17953
2,1475,27373
3,1476,515
4,1477,31238
...,...,...
518,2015,1014679053
519,2016,1041408305
520,2017,973285732
521,2018,1178832910


In [13]:
context_df.to_pickle('/data/dharp/compounds/datasets/contexts/contexts.pkl')

In [14]:
content_words_df=context_df.groupby(['context'])['count'].sum().to_frame().reset_index()

In [15]:
content_words_df.sort_values(by='count')

Unnamed: 0,context,count
259936,chantelle_NOUN,40
1450571,umptek_PROPN,40
644041,huic_NOUN,40
977689,nonovulation_PROPN,40
186932,borborch_PROPN,40
...,...,...
40615,act_PROPN,582316180
603279,have_VERB,587375929
1457954,united_PROPN,646937825
959876,new_PROPN,1072819343
