In [1]:
from pathlib import Path
from collections import defaultdict
from multiprocess import Pool
from string import punctuation, digits

In [2]:
def run_file(file):
    """Build a word -> occurrences index for a single text file.

    Every line is stripped and split on single spaces; ASCII punctuation and
    digits are then removed from each token. Tokens that end up empty (blank
    lines, runs of spaces, tokens made only of punctuation/digits) are
    skipped so the index never contains an empty-string key.

    Args:
        file: Path of the file to read (anything ``open`` accepts). It is
            opened in text mode with the platform default encoding.

    Returns:
        A ``defaultdict(list)`` mapping each cleaned word to a list of
        ``(file, pos)`` tuples, where ``pos`` is the 0-based line number on
        which the word occurred. A word repeated on one line yields one
        tuple per occurrence.
    """
    # The translation table is loop-invariant: build it once per file
    # instead of once per token.
    strip_table = str.maketrans('', '', punctuation + digits)
    data = defaultdict(list)
    # Use a context manager so the handle is closed deterministically; this
    # matters when many files are indexed in one process.
    with open(file) as handle:
        for pos, line in enumerate(handle):
            for word in line.strip().split(" "):
                word = word.translate(strip_table)
                if word:  # drop tokens that were only punctuation/digits/blank
                    data[word].append((file, pos))
    return data
         
def generate_file(path):
    """Lazily yield every regular file found anywhere beneath ``path``.

    Args:
        path: A ``pathlib.Path``-like object exposing ``rglob``.

    Yields:
        Each entry produced by ``path.rglob('*')`` for which ``is_file()``
        is true; directories are skipped. Order is whatever ``rglob``
        produces.
    """
    yield from (entry for entry in path.rglob('*') if entry.is_file())


In [3]:
def seq_exec(path):
    """Index every file under ``path`` one after another in this process.

    Args:
        path: Root directory, passed to ``generate_file``.

    Returns:
        A ``defaultdict(list)`` that merges the per-file results of
        ``run_file``: for each word, the occurrence lists of all files are
        concatenated in the order the files were visited.
    """
    merged = defaultdict(list)
    for file_path in generate_file(path):
        per_file = run_file(file_path)
        for word, occurrences in per_file.items():
            merged[word].extend(occurrences)
    return merged


In [4]:
def pooled_exec(path, processes=4):
    """Index every file under ``path`` using a pool of worker processes.

    One ``run_file`` task is submitted per file; the per-file results are
    then merged in submission order, so the merged occurrence lists are
    ordered the same way as a file-by-file sequential merge would be.

    Args:
        path: Root directory, passed to ``generate_file``.
        processes: Number of worker processes handed to ``Pool``. Defaults
            to 4, the value that was previously hard-coded.

    Returns:
        A ``defaultdict(list)`` mapping each word to the concatenated
        occurrence lists returned by ``run_file`` for every file.
    """
    vals = defaultdict(list)
    with Pool(processes=processes) as pool:
        # Submit everything first so the workers can run concurrently.
        pending = [
            pool.apply_async(run_file, [path_object])
            for path_object in generate_file(path)
        ]

        # Collect inside the ``with`` block: the pool must still be alive
        # while we wait on the outstanding results.
        for task in pending:
            for word, values in task.get().items():
                vals[word].extend(values)
    return vals

In [5]:
# Build the same index both ways and check that the pool-based path agrees
# with the sequential one.
root_directory = Path('files/sonnets')
seq_vals = seq_exec(root_directory)
pooled_vals = pooled_exec(root_directory)

# Compare per-word occurrence counts; sorting makes the comparison
# independent of the order in which words were first inserted.
seq_info = [f"{key}-{len(values)}" for key, values in seq_vals.items()]
pool_info = [f"{key}-{len(values)}" for key, values in pooled_vals.items()]
assert sorted(seq_info) == sorted(pool_info), f"not equal\n{seq_info}\n{pool_info}"

# Both indexes must contain the same number of distinct words.
assert len(seq_vals) == len(pooled_vals), f"Not same {len(seq_vals)},{len(pooled_vals)}"


In [None]:
# Time the pool-based indexer on the sonnets directory.
%timeit pooled_exec(root_directory)

# Time the sequential indexer on the same directory for comparison.
%timeit seq_exec(root_directory)