## Dataset

Gutenberg dataset (1.7 GB). Direct download: <https://zenodo.org/records/3360392/files/D1.7GB.zip?download=1>  
Zenodo record page: <https://zenodo.org/records/3360392>

In [None]:
import os

def convert_to_utf8(filename, encodings=('utf-8', 'iso-8859-1', 'windows-1252')):
    """Re-encode a text file to UTF-8 in place.

    The file's bytes are read once and decoded with each candidate encoding
    in order; the first one that decodes without error wins. The file is
    rewritten only when the UTF-8 bytes differ from what is already on disk,
    so files that are already UTF-8 are left untouched. Line endings are
    preserved exactly as they were.

    Args:
        filename: Path of the file to convert.
        encodings: Candidate source encodings, tried in the given order.

    Returns:
        None. If no candidate decodes the file, a message is printed and the
        file is left unchanged.
    """
    # NOTE(review): 'iso-8859-1' maps every possible byte, so with the default
    # tuple it always succeeds; 'windows-1252' is never tried and the failure
    # message below is only reachable with a custom `encodings` argument.

    # Read the raw bytes once instead of re-opening the file per candidate.
    with open(filename, 'rb') as file:
        raw = file.read()

    for encoding in encodings:
        try:
            content = raw.decode(encoding)
        except UnicodeDecodeError:
            continue
        utf8_bytes = content.encode('utf-8')
        # Skip the write when nothing would change: avoids truncating and
        # rewriting files that are already valid UTF-8.
        if utf8_bytes != raw:
            with open(filename, 'wb') as file:
                file.write(utf8_bytes)
        break
    else:
        print(f"Failed to convert {filename}.")


directory = "./dataset"

# Normalise the encoding of every .txt file found directly inside the
# dataset folder; anything else in the folder is ignored.
for filename in os.listdir(directory):
    if not filename.endswith('.txt'):
        continue
    filepath = os.path.join(directory, filename)
    convert_to_utf8(filepath)


In [None]:
import os
from collections import Counter
from multiprocessing import Pool
import concurrent.futures
import pandas as pd

def count_words_in_file(filename):
    """Count whitespace-separated tokens in a UTF-8 text file.

    Tokens are taken verbatim: no case folding and no punctuation stripping.

    Args:
        filename: Path of the text file to read.

    Returns:
        A ``Counter`` mapping each token to its number of occurrences.
    """
    word_count = Counter()
    # Decode explicitly as UTF-8 rather than with the platform's locale
    # encoding, so results do not depend on the machine running the code.
    with open(filename, 'r', encoding='utf-8') as file:
        for line in file:
            # split() with no argument already ignores leading/trailing
            # whitespace, including the newline.
            word_count.update(line.split())
    return word_count

def word_count_serial(directory):
    """Tally words across every ``.txt`` file in *directory*, one file at a time.

    Only entries directly inside *directory* are considered. Each file is
    counted with ``count_words_in_file`` and the per-file tallies are summed.

    Args:
        directory: Folder containing the text files.

    Returns:
        A ``Counter`` with the combined token counts of all matching files.
    """
    combined = Counter()
    txt_paths = (
        os.path.join(directory, entry)
        for entry in os.listdir(directory)
        if entry.endswith('.txt')
    )
    for path in txt_paths:
        combined.update(count_words_in_file(path))
    return combined

def word_count_parallel(directory):
    """Tally words across every ``.txt`` file in *directory* using worker processes.

    Each matching file is handed to ``count_words_in_file`` in a
    ``multiprocessing.Pool``; the per-file tallies are merged in the parent
    process in the same order as the directory listing.

    Args:
        directory: Folder containing the text files.

    Returns:
        A ``Counter`` with the combined token counts of all matching files.
    """
    filepaths = [
        os.path.join(directory, filename)
        for filename in os.listdir(directory)
        if filename.endswith('.txt')
    ]
    # Nothing to do: avoid starting worker processes for an empty file list.
    if not filepaths:
        return Counter()

    total_count = Counter()
    # The context manager shuts the pool down even if a worker raises.
    with Pool() as pool:
        # imap yields results in input order as they become available, so
        # each per-file Counter can be merged and released instead of all of
        # them being held in a list at once.
        for result in pool.imap(count_words_in_file, filepaths):
            total_count.update(result)
    return total_count


In [None]:
directory = "./dataset"

# Single-process baseline: count the files one after another.
word_counts_serial = word_count_serial(directory)

# One row per distinct token, with its total count across the dataset.
df_word_counts_serial = pd.DataFrame(
    list(word_counts_serial.items()),
    columns=['Word', 'Count'],
)
df_word_counts_serial.head()

In [None]:
directory = "./dataset"

# Multi-process variant: the same tally, with files spread over a worker pool.
word_counts_parallel = word_count_parallel(directory)

# One row per distinct token, with its total count across the dataset.
df_word_counts_parallel = pd.DataFrame(
    list(word_counts_parallel.items()),
    columns=['Word', 'Count'],
)
df_word_counts_parallel.head()