In [None]:
import pandas as pd
import csv
import os
import io
from zipfile import ZipFile,ZipInfo
import editdistance
from collections import Counter
import numpy as np
from multiprocessing import Pool
import multiprocessing as mp
import unicodedata
import re
import spacy
import time
import pickle as pkl
# Load spaCy's small English pipeline once at module level; vocab_maker calls
# this shared `nlp` object to obtain token text, POS tags and lemmas.
nlp = spacy.load('en_core_web_sm')
# Raise the pipeline's maximum accepted text length to 10,000,000 characters so
# that long documents are not rejected (spaCy's documented default is lower —
# NOTE(review): confirm memory headroom for documents of this size).
nlp.max_length=10_000_000
#contextualSpellCheck.add_to_pipe(nlp)
#from spacy_hunspell import spaCyHunSpell

In [None]:
def extract_zip(input_zip):
    """Read every member of a zip archive into memory as UTF-8 text.

    Parameters
    ----------
    input_zip : str, path-like or binary file object
        Anything ``zipfile.ZipFile`` accepts as its first argument.

    Returns
    -------
    dict
        Maps each member name (as listed by ``namelist()``) to the member's
        content decoded as UTF-8, with leading/trailing whitespace stripped.

    Raises
    ------
    UnicodeDecodeError
        If a member's bytes are not valid UTF-8.
    """
    # The context manager closes the archive handle as soon as all members are
    # read; previously the handle stayed open until garbage collection.
    with ZipFile(input_zip) as archive:
        return {
            name: archive.read(name).decode('utf-8').strip()
            for name in archive.namelist()
        }

In [None]:
def vocab_maker(dict_content):
    """Clean one raw document and return its tokens, POS tags and lemmas.

    Parameters
    ----------
    dict_content : str
        Raw text of a single document.

    Returns
    -------
    tuple of (list of str, list of str, list of str)
        Three parallel lists, one entry per spaCy token: the token text, its
        coarse POS tag (``token.pos_``) and its lemma (``token.lemma_``).

    Notes
    -----
    Relies on the module-level spaCy pipeline ``nlp``.
    """
    # Raw strings: "\S" inside a plain literal is an invalid escape sequence
    # (a warning today, an error in future Python versions).
    # Removes: "@@" up to the end of its line, literal "<P>" markers, and
    # slash-delimited runs of non-whitespace characters.
    cleaned = re.sub(r"@@.*|<P>|/\S*/", "", dict_content)
    # Collapse runs of spaces left behind by the removals.
    cleaned = re.sub(r' +', ' ', cleaned)
    # Fuse the ten space-separated "@" characters into one unbroken string
    # (presumably the corpus' masking marker — TODO confirm).
    cleaned = cleaned.replace("@ @ @ @ @ @ @ @ @ @", "@@@@@@@@@@")
    # Blank-line separators are dropped outright (not replaced by a space).
    cleaned = cleaned.replace("\n\n", "")

    doc = nlp(cleaned)
    token_list = [token.text for token in doc]
    pos_list = [token.pos_ for token in doc]
    lemma_list = [token.lemma_ for token in doc]
    return token_list, pos_list, lemma_list

In [None]:
def vocab_collecter(decade):
    """Count POS-tagged tokens and lemmas for the documents of one archive.

    Parameters
    ----------
    decade : str
        File name of a zip archive located in the module-level ``_dir``.
        The piece after the first underscore, minus its final character, is
        printed as the progress label (presumably names look like
        ``text_1810s_xxx.zip`` — TODO confirm; a name without an underscore
        raises ``IndexError``).

    Returns
    -------
    tuple of (collections.Counter, collections.Counter)
        ``(token_counter, lemma_counter)``. Keys are lower-cased
        ``"<token>_<pos>"`` / ``"<lemma>_<pos>"`` strings; only tokens tagged
        NOUN, VERB, ADJ, ADV or AUX are counted.

    Notes
    -----
    Every document is processed by ``vocab_maker`` in a worker process, and
    progress/timing information is printed to stdout.
    """
    zipfiles = extract_zip(os.path.join(_dir, decade))
    docs = list(zipfiles.values())

    decade_label = decade.split('_')[1][:-1]
    print("Current decade " + str(decade_label))
    cur_time = time.time()
    # Leave one core free, but never request zero workers: Pool(0) raises
    # ValueError on a single-core machine.
    n_proc = max(1, mp.cpu_count() - 1)

    # The context manager guarantees the workers are torn down even if a
    # document fails; map() blocks until every document has been processed.
    with Pool(n_proc) as pool:
        results = pool.map(vocab_maker, docs)

    print("Done parallelizing")
    print("Total time taken", round(time.time() - cur_time), "secs")

    # Each result is (tokens, pos_tags, lemmas) for one document; flatten the
    # per-document lists into three corpus-wide parallel lists.
    full_token_list = [tok for doc_result in results for tok in doc_result[0]]
    full_pos_list = [pos for doc_result in results for pos in doc_result[1]]
    full_lemma_list = [lem for doc_result in results for lem in doc_result[2]]

    assert len(full_token_list) == len(full_pos_list) == len(full_lemma_list)

    df = pd.DataFrame({'token': full_token_list, 'pos': full_pos_list, 'lemma': full_lemma_list})
    # .copy() makes the filtered frame independent, so the column assignments
    # below do not trigger SettingWithCopyWarning.
    df = df.loc[df.pos.isin(['NOUN', 'VERB', 'ADJ', 'ADV', 'AUX'])].copy()
    df['pos'] = df['pos'].str.lower()
    df['token'] = df['token'].str.lower() + "_" + df['pos']
    df['lemma'] = df['lemma'].str.lower() + "_" + df['pos']

    token_counter = Counter(df['token'].values)
    lemma_counter = Counter(df['lemma'].values)
    return token_counter, lemma_counter

In [None]:
def write_to_file(fnames,dec,set_type,contents=None):
    """Write the selected documents to ``./<dec>/<set_type>.txt``.

    Parameters
    ----------
    fnames : iterable of str
        Keys of the documents to write, in output order.
    dec : object
        Name of the output sub-directory (converted with ``str``).
    set_type : str
        Base name of the output file, without the ``.txt`` suffix.
    contents : mapping of str to str, optional
        Maps each document key to its text. When omitted, the function falls
        back to a module-level ``zipfiles`` mapping, as the original
        implementation did; that global must then exist or a ``NameError``
        is raised.

    Raises
    ------
    KeyError
        If an entry of ``fnames`` is missing from the mapping.
    """
    if contents is None:
        # Backward-compatible fallback to the implicit global the function
        # used to require. Passing `contents` explicitly avoids hidden state.
        contents = zipfiles
    # Create the target directory so a fresh run does not fail on open().
    os.makedirs(os.path.join(".", str(dec)), exist_ok=True)
    save_file="./"+str(dec)+"/"+set_type+".txt"
    print(save_file)
    # The documents were decoded from UTF-8, so write them back as UTF-8
    # instead of relying on the platform's default encoding.
    with open(save_file,'w',encoding='utf-8') as f:
        for doc in fnames:
            f.write(contents[doc]+"\n\n")

## Read data

In [None]:
# Directory holding one zip archive per decade; the commented-out lines are
# alternative corpus locations.
_dir = "/resources/corpora/COHA/text/"
#_dir = "/resources/corpora/COHA/CCOHA/tagged/"
#_dir = "/resources/corpora/COHA/ALL/"
files = sorted(os.listdir(_dir))

# Keep only real archives. A plain substring test ('zip' in name) would also
# select entries such as "unzipped/" or "x.zip.bak", which ZipFile cannot open.
to_keep = [f for f in files if f.endswith('.zip')]
len(to_keep)

In [None]:
# Corpus-wide totals, accumulated archive by archive.
token_counter, lemma_counter = Counter(), Counter()

for decade in to_keep:
    # vocab_collecter returns one Counter for tokens and one for lemmas.
    token_list, lemma_list = vocab_collecter(decade)
    token_counter.update(token_list)
    lemma_counter.update(lemma_list)

In [None]:
# Number of distinct "<token>_<pos>" keys collected across all archives.
len(token_counter)

In [None]:
# Frequency table of "<lemma>_<pos>" keys. Passing the column names to the
# constructor keeps this working even when the counter is empty.
lemmas = pd.DataFrame(lemma_counter.most_common(), columns=['lemma', 'count'])
#lemmas.reset_index(inplace=True)
# Treat auxiliaries as verbs.
lemmas['lemma'] = lemmas['lemma'].str.replace(r'_aux$', r'_verb', regex=True)
# The relabelling can leave the same lemma in two rows (e.g. a former "_aux"
# row and an existing "_verb" row). Merge their counts and re-rank, so that
# each lemma appears exactly once; the stable sort keeps ties in their
# previous relative order.
lemmas = (
    lemmas.groupby('lemma', as_index=False, sort=False)['count'].sum()
    .sort_values('count', ascending=False, kind='mergesort')
    .reset_index(drop=True)
)
lemmas

In [None]:
# Preview the first five entries of the lemma column as a plain list.
lemmas['lemma'].head(5).to_list()

In [None]:
# Export the first 50,000 rows (lemma and count) as a tab-separated file
# without a header row or index column.
lemmas.head(50_000).to_csv(
    '../Compounding_github/data/coha_context.txt',
    sep='\t',
    header=False,
    index=False,
)

In [None]:
# Pickle the first 50,000 lemma keys as a plain list. The context manager
# closes (and therefore flushes) the file; previously the handle passed to
# pkl.dump was never closed explicitly.
with open('../Compounding_github/data/coha_context.pkl', "wb") as pkl_file:
    pkl.dump(lemmas['lemma'].head(50_000).to_list(), pkl_file)