In [1]:
from pathlib import Path

In [2]:
# experiment parameters
# (settings of the measured sample run whose outputs are analyzed in this notebook)
# matches the 1M-sample ranges seen in the count-table file names
ngram_count_batch_size = 1_000_000
# presumably the total number of samples in the measured run — TODO confirm
n_samples = 30_000_000
# worker count of the measured run; baseline for the run-time extrapolation
n_workers = 3
# presumably the largest n-gram size that was counted — TODO confirm
max_ngram_size = 5
# presumably a minimum-count cutoff applied when filtering n-grams — TODO confirm
filter_ngram_count_threshold = 2
# root directory that is searched for the generated .parquet and .json files
save_dir = Path('./data')

In [3]:
# analyze space requirements
# gather every parquet file below save_dir, descending into sub-directories
generated_files = [parquet_file for parquet_file in save_dir.rglob('*.parquet')]
generated_files

[WindowsPath('data/1/count_table_0-999999.parquet'),
 WindowsPath('data/1/count_table_1000000-1999999.parquet'),
 WindowsPath('data/1/count_table_10000000-10999999.parquet'),
 WindowsPath('data/1/count_table_11000000-11999999.parquet'),
 WindowsPath('data/1/count_table_12000000-12999999.parquet'),
 WindowsPath('data/1/count_table_13000000-13999999.parquet'),
 WindowsPath('data/1/count_table_14000000-14999999.parquet'),
 WindowsPath('data/1/count_table_15000000-15999999.parquet'),
 WindowsPath('data/1/count_table_16000000-16999999.parquet'),
 WindowsPath('data/1/count_table_17000000-17999999.parquet'),
 WindowsPath('data/1/count_table_18000000-18999999.parquet'),
 WindowsPath('data/1/count_table_19000000-19999999.parquet'),
 WindowsPath('data/1/count_table_2000000-2999999.parquet'),
 WindowsPath('data/1/count_table_20000000-20999999.parquet'),
 WindowsPath('data/1/count_table_21000000-21999999.parquet'),
 WindowsPath('data/1/count_table_22000000-22999999.parquet'),
 WindowsPath('data/1/

In [4]:
# find the number of tokens processed
import json

# the first JSON file found under save_dir; judging by its name it maps each
# n-gram size to the total number of n-grams of that size
total_ngram_per_size_file = next(iter(save_dir.rglob('*.json')))
total_ngram_per_size = json.loads(total_ngram_per_size_file.read_text())

# the total stored under key '1' is used as the number of tokens processed
n_tokens_processed = total_ngram_per_size['1']
n_tokens_processed

442371415

In [5]:
def get_file_size(file: Path) -> int:
    """Return the size of ``file`` in bytes, as reported by ``Path.stat()``."""
    file_stats = file.stat()
    return file_stats.st_size

In [6]:
# sanity check: size in bytes of the first generated file
example_file = generated_files[0]
get_file_size(example_file)

165535

In [7]:
# add up the on-disk size of every generated parquet file
total_parquet_file_size_in_bytes = sum(get_file_size(parquet_file) for parquet_file in generated_files)
total_parquet_file_size_in_bytes

721005352

In [8]:
# whole megabytes of 2 ** 20 bytes each; the floor division drops any partial MB
bytes_per_mb = 2 ** 20
total_parquet_file_size_in_mb = total_parquet_file_size_in_bytes // bytes_per_mb
size_report = f'parquet files total size: {total_parquet_file_size_in_mb} MB'
print(size_report)

parquet files total size: 687 MB


In [9]:
# report the token count of the measured run with thousands separators
print(f'tokens processed: {n_tokens_processed:,}')

tokens processed: 442,371,415


In [10]:
# assuming linear growth in the number of tokens processed, which should be approximately correct:
# (token counts of the target datasets that the sample measurements are extrapolated to)
n_tokens_wikipedia =     24_000_000_000  # 24 Billion
n_tokens_red_pijama = 1_200_000_000_000  # 1.2 Trillion

In [11]:
def compute_dataset_expected_space_bytes(n_tokens_in_dataset: int) -> float:
    """Extrapolate the parquet storage needed for a dataset of the given size.

    The measured footprint of the sample run (``total_parquet_file_size_in_bytes``,
    produced from ``n_tokens_processed`` tokens) is scaled linearly with the
    token count of the target dataset.

    Args:
        n_tokens_in_dataset: number of tokens in the target dataset.

    Returns:
        The expected size in bytes. The value is a float, not an int, because
        the scale factor is obtained with true division.
    """
    dataset_size_factor = n_tokens_in_dataset / n_tokens_processed
    return total_parquet_file_size_in_bytes * dataset_size_factor

In [12]:
# The estimate in bytes is a float, so floor-dividing it still yields a float
# (printed as e.g. "36.0"). Convert to int so the report shows whole GB
# (2 ** 30 bytes each), in the same style as the MB report.
wikipedia_expected_space_gb = int(compute_dataset_expected_space_bytes(n_tokens_wikipedia) // (2 ** 30))
print(f'expected space for wikipedia dataset: {wikipedia_expected_space_gb} GB')

expected space for wikipedia dataset: 36.0 GB


In [13]:
# The estimate in bytes is a float, so floor-dividing it still yields a float
# (printed as e.g. "1821.0"). Convert to int so the report shows whole GB
# (2 ** 30 bytes each), in the same style as the MB report.
red_pijama_expected_space_gb = int(compute_dataset_expected_space_bytes(n_tokens_red_pijama) // (2 ** 30))
print(f'expected space for RedPijama dataset: {red_pijama_expected_space_gb} GB')

expected space for RedPijama dataset: 1821.0 GB


In [14]:
# analyze time requirements
# collect data from the logs
log_file = Path('./logs/log.log')
# read the whole log into memory, one list entry per line (line endings kept)
with log_file.open('r') as log_stream:
    log_lines = list(log_stream)
log_lines

['2023-05-02 20:19:48,327 - src.count_ngrams_in_batches - INFO - Starting to count ngrams in batches\n',
 "2023-05-02 20:20:33,385 - src.count_ngrams_in_batches - INFO - started working on samples 20000000-20999999;memory (MB): {'total': 8066, 'used': 6084, 'available': 1982}\n",
 "2023-05-02 20:20:46,091 - src.count_ngrams_in_batches - INFO - started working on samples 0-999999;memory (MB): {'total': 8066, 'used': 6681, 'available': 1385}\n",
 "2023-05-02 20:20:46,233 - src.count_ngrams_in_batches - INFO - started working on samples 10000000-10999999;memory (MB): {'total': 8066, 'used': 6761, 'available': 1305}\n",
 "2023-05-02 20:24:28,621 - src.count_ngrams_in_batches - INFO - finished samples 20000000 to 20999999;memory (MB): {'total': 8066, 'used': 7031, 'available': 1035};counter size (MB): 0\n",
 "2023-05-02 20:25:03,640 - src.count_ngrams_in_batches - INFO - finished samples 10000000 to 10999999;memory (MB): {'total': 8066, 'used': 7103, 'available': 963};counter size (MB): 0\n

In [15]:
import re

In [16]:
from datetime import datetime

In [17]:
# pick one log line to develop the parsing regex against
example_line = log_lines[1]
example_line

"2023-05-02 20:20:33,385 - src.count_ngrams_in_batches - INFO - started working on samples 20000000-20999999;memory (MB): {'total': 8066, 'used': 6084, 'available': 1982}\n"

In [18]:
# TODO: find the last start time for the `count_ngrams_in_batches` directory
# A log line looks like "<date> <time>,<digits> - <module> - <LEVEL> - <message>".
# The pieces below are combined into one pattern with named groups.
datetime_regex = r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}'
module_name_regex = r'\w+\.\w+'
message_regex = r'.*'
# compiled in a single step so the name is only ever bound to the compiled pattern
log_line_regex = re.compile(
    f'(?P<datetime>{datetime_regex}),\\d+ - '
    f'(?P<module_name>{module_name_regex}) - \\w+ - '
    f'(?P<message>{message_regex})'
)

In [19]:
# check that the compiled pattern splits the example line into its named groups
log_line_regex.match(example_line).groupdict()

{'datetime': '2023-05-02 20:20:33',
 'module_name': 'src.count_ngrams_in_batches',
 'message': "started working on samples 20000000-20999999;memory (MB): {'total': 8066, 'used': 6084, 'available': 1982}"}

In [22]:
from collections.abc import Iterable 


def match_regex_condition(log_line: str) -> bool:
    """Return True when ``log_line`` matches ``log_line_regex`` from its start."""
    parsed_line = log_line_regex.match(log_line)
    return parsed_line is not None

def module_name_condition(log_line: str) -> bool:
    """Return True when ``log_line`` was logged by ``src.count_ngrams_in_batches``.

    A line that does not match ``log_line_regex`` yields False instead of
    raising, so this predicate does not depend on a regex-match filter
    having run before it.
    """
    parsed_line = log_line_regex.match(log_line)
    if parsed_line is None:
        return False
    return parsed_line.group('module_name') == 'src.count_ngrams_in_batches'

def start_line_condition(log_line: str) -> bool:
    """Return True when the message of ``log_line`` announces the start of a counting run.

    A line that does not match ``log_line_regex`` yields False instead of
    raising, so this predicate does not depend on a regex-match filter
    having run before it.
    """
    parsed_line = log_line_regex.match(log_line)
    if parsed_line is None:
        return False
    return parsed_line.group('message') == 'Starting to count ngrams in batches'
    
def end_line_condition(log_line: str) -> bool:
    """Return True when the message of ``log_line`` announces the end of a counting run.

    A line that does not match ``log_line_regex`` yields False instead of
    raising, so this predicate does not depend on a regex-match filter
    having run before it.
    """
    parsed_line = log_line_regex.match(log_line)
    if parsed_line is None:
        return False
    return parsed_line.group('message') == 'Finished counting ngrams in batches'

def compose_filter(filters: Iterable, iterable: Iterable) -> Iterable:
    """Chain several predicates over ``iterable``, applied in the given order.

    Each predicate wraps the result of the previous one in a lazy ``filter``,
    so an item is yielded only if every predicate accepts it. With no
    predicates the input iterable is returned unchanged.
    """
    remaining = iterable
    for predicate in filters:
        remaining = filter(predicate, remaining)
    return remaining


In [23]:
def get_log_line_time(log_line: str) -> datetime:
    """Parse the timestamp at the start of ``log_line`` into a ``datetime``.

    Only the ``datetime`` group of ``log_line_regex`` is used, so the result
    has one-second resolution (the digits after the comma are ignored).
    """
    parsed_line = log_line_regex.match(log_line)
    return datetime.strptime(parsed_line.group('datetime'), '%Y-%m-%d %H:%M:%S')

In [24]:
# keep only parseable lines from the counting module that announce a run start;
# the regex-match filter goes first, ahead of the filters that read the parsed groups
start_line_filters = (match_regex_condition, module_name_condition, start_line_condition)
start_lines = [line for line in compose_filter(start_line_filters, log_lines)]
# the most recent run is the one whose start line appears last in the log
last_start_line = start_lines[-1]
last_start_time = get_log_line_time(last_start_line)
last_start_time

datetime.datetime(2023, 5, 2, 20, 19, 48)

In [25]:
# keep only parseable lines from the counting module that announce a run end;
# the regex-match filter goes first, ahead of the filters that read the parsed groups
end_line_filters = (match_regex_condition, module_name_condition, end_line_condition)
end_lines = [line for line in compose_filter(end_line_filters, log_lines)]
# take the end line that appears last in the log
last_end_line = end_lines[-1]
last_end_time = get_log_line_time(last_end_line)
last_end_time

datetime.datetime(2023, 5, 2, 21, 1, 47)

In [26]:
# Wall-clock duration of the last run, in seconds.
# `timedelta.seconds` holds only the sub-day remainder and silently drops
# whole days, so total_seconds() is used to stay correct for runs of 24 hours
# or more. Both timestamps have one-second resolution, so int() loses nothing.
total_time_seconds = int((last_end_time - last_start_time).total_seconds())
total_time_seconds

2519

In [27]:
# extrapolation model: run time grows in proportion to the number of tokens and
#  shrinks in proportion to the number of workers; this should be approximately correct
def compute_expected_time_seconds(n_tokens_expected: int, n_workers_expected: int) -> float:
    """Extrapolate the run time for a dataset size and worker count.

    The measured duration (``total_time_seconds``, obtained with ``n_workers``
    workers on ``n_tokens_processed`` tokens) is rescaled by the ratio of
    worker counts and by the ratio of token counts.

    Args:
        n_tokens_expected: number of tokens in the target dataset.
        n_workers_expected: number of workers planned for the target run.

    Returns:
        The expected run time in seconds.
    """
    speedup_adjustment = n_workers / n_workers_expected
    workload_ratio = n_tokens_expected / n_tokens_processed
    return total_time_seconds * speedup_adjustment * workload_ratio

In [28]:
# run-time estimate for wikipedia, keeping the worker count of the measured run
wikipedia_n_workers = 3
wikipedia_expected_time_seconds = compute_expected_time_seconds(
    n_tokens_expected=n_tokens_wikipedia,
    n_workers_expected=wikipedia_n_workers,
)
# reported in hours (3600 seconds each)
wikipedia_time_report = (f'expected time for wikipedia with n_workers={wikipedia_n_workers} '
                         f'is {wikipedia_expected_time_seconds / 3600:.3f} hours')
print(wikipedia_time_report)

expected time for wikipedia with n_workers=3 is 37.962 hours


In [29]:
# run-time estimate for the larger dataset, assuming a bigger pool of workers
red_pijama_n_workers = 10
red_pijama_expected_time_seconds = compute_expected_time_seconds(
    n_tokens_expected=n_tokens_red_pijama,
    n_workers_expected=red_pijama_n_workers,
)
# convert seconds -> hours -> days for a readable report
red_pijama_expected_time_hours = red_pijama_expected_time_seconds / 3600
red_pijama_expected_time_days = red_pijama_expected_time_hours / 24
red_pijama_time_report = (f'expected time for RedPijama with n_workers={red_pijama_n_workers} '
                          f'is {red_pijama_expected_time_days:.3f} days')
print(red_pijama_time_report)

expected time for RedPijama with n_workers=10 is 23.726 days
