In [1]:
from pathlib import Path

In [2]:
# experiment parameters
# NOTE(review): the counting job itself is not part of this notebook; the notes below are
#  inferred from the names and the generated file names — confirm against the job's config.
ngram_count_batch_size = 1_000_000  # presumably samples per batch (file names span 1M-sample ranges)
n_samples = 20_000_000  # presumably total samples counted (last file name ends at 19999999)
n_workers = 3  # used as the baseline worker count in the time extrapolation
max_ngram_size = 5
filter_ngram_count_threshold = 2
save_dir = Path('./data')  # root directory searched for the generated *.parquet / *.json files

In [4]:
# analyze space requirements
# collect every parquet file under save_dir; rglob yields entries in file-system order,
#  so sort to make the listing and generated_files[0] deterministic across platforms
generated_files = sorted(save_dir.rglob('*.parquet'))
generated_files

[WindowsPath('data/1/count_table_0-999999.parquet'),
 WindowsPath('data/1/count_table_1000000-1999999.parquet'),
 WindowsPath('data/1/count_table_10666667-11666666.parquet'),
 WindowsPath('data/1/count_table_11666667-12666666.parquet'),
 WindowsPath('data/1/count_table_12666667-13333333.parquet'),
 WindowsPath('data/1/count_table_13333334-14333333.parquet'),
 WindowsPath('data/1/count_table_14333334-15333333.parquet'),
 WindowsPath('data/1/count_table_15333334-16333333.parquet'),
 WindowsPath('data/1/count_table_16333334-17333333.parquet'),
 WindowsPath('data/1/count_table_17333334-18333333.parquet'),
 WindowsPath('data/1/count_table_18333334-19333333.parquet'),
 WindowsPath('data/1/count_table_19333334-19999999.parquet'),
 WindowsPath('data/1/count_table_2000000-2999999.parquet'),
 WindowsPath('data/1/count_table_3000000-3999999.parquet'),
 WindowsPath('data/1/count_table_4000000-4999999.parquet'),
 WindowsPath('data/1/count_table_5000000-5999999.parquet'),
 WindowsPath('data/1/count_

In [18]:
# find the number of tokens processed
import json

# take the first *.json file found under save_dir; it is expected to map each n-gram
#  size (as a string key) to the total number of n-grams counted for that size
total_ngram_per_size_file = next(iter(save_dir.rglob('*.json')), None)
if total_ngram_per_size_file is None:
    # fail with a message that names the problem instead of a bare StopIteration
    raise FileNotFoundError(f'no *.json file with the n-gram totals found under {save_dir}')
with total_ngram_per_size_file.open('r') as f:
    total_ngram_per_size = json.load(f)
# the total number of 1-grams is used as the number of tokens processed
n_tokens_processed = total_ngram_per_size['1']
n_tokens_processed

294998949

In [8]:
def get_file_size(file: Path) -> int:
    """Return the size of ``file`` in bytes, as reported by ``Path.stat()``."""
    stat_result = file.stat()
    return stat_result.st_size

In [9]:
# sanity check: size in bytes of the first generated parquet file
example_file = generated_files[0]
get_file_size(example_file)

165535

In [10]:
# total size on disk, in bytes, of all generated parquet files
total_parquet_file_size_in_bytes = sum(get_file_size(parquet_file) for parquet_file in generated_files)
total_parquet_file_size_in_bytes

481565693

In [20]:
# floor-divide by 2**20 bytes, so the reported value is truncated to whole units
total_parquet_file_size_in_mb = total_parquet_file_size_in_bytes // (2 ** 20)
print(f'parquet files total size: {total_parquet_file_size_in_mb} MB')

parquet files total size: 459 MB


In [21]:
# the `:,` format spec adds thousands separators so the token count is easy to read
print(f'tokens processed: {n_tokens_processed:,}')

tokens processed: 294,998,949


In [22]:
# assuming the storage needed grows linearly with the number of tokens processed,
#  which should be approximately correct:
n_tokens_wikipedia =     24_000_000_000  # 24 Billion
n_tokens_red_pijama = 1_200_000_000_000  # 1.2 Trillion

In [24]:
def compute_dataset_expected_space_bytes(n_tokens_in_dataset: int) -> float:
    """Extrapolate the parquet storage needed for a dataset of the given token count.

    Scales the measured ``total_parquet_file_size_in_bytes`` by the ratio between
    ``n_tokens_in_dataset`` and the measured ``n_tokens_processed`` (both measured
    values are notebook-level variables), i.e. it assumes storage grows linearly
    with the number of tokens.

    Returns:
        The expected size in bytes. This is a float because the scaling factor is
        computed with true division.
    """
    dataset_size_factor = n_tokens_in_dataset / n_tokens_processed
    return total_parquet_file_size_in_bytes * dataset_size_factor

In [25]:
# the extrapolated size is a float; floor to whole units of 2**30 bytes and convert to int
#  so the report prints a whole number instead of a value with a trailing ".0"
wikipedia_expected_space_gb = int(compute_dataset_expected_space_bytes(n_tokens_wikipedia) // (2 ** 30))
print(f'expected space for wikipedia dataset: {wikipedia_expected_space_gb} GB')

expected space for wikipedia dataset: 36.0 GB


In [26]:
# the extrapolated size is a float; floor to whole units of 2**30 bytes and convert to int
#  so the report prints a whole number instead of a value with a trailing ".0"
red_pijama_expected_space_gb = int(compute_dataset_expected_space_bytes(n_tokens_red_pijama) // (2 ** 30))
print(f'expected space for RedPijama dataset: {red_pijama_expected_space_gb} GB')

expected space for RedPijama dataset: 1824.0 GB


In [27]:
# analyze time requirements
# load the log of the counting run: one list entry per line, trailing newlines kept
log_file = Path('./logs/count_ngram_in_batches.log')
with log_file.open('r') as f:
    log_lines = list(f)
log_lines

['2023-04-24 15:43:29,715 Found cached dataset bookcorpus (C:/Users/shaig/.cache/huggingface/datasets/bookcorpus/plain_text/1.0.0/eddee3cae1cc263a431aa98207d4d27fd8a73b0a9742f692af0e6c65afa4d75f)\n',
 '2023-04-24 15:44:02,535 Starting to count ngrams in batches\n',
 "2023-04-24 16:05:35,773 started working on samples 13333334-14333333;memory (MB): {'total': 8066, 'used': 6646, 'available': 1420}\n",
 "2023-04-24 16:05:42,736 started working on samples 0-999999;memory (MB): {'total': 8066, 'used': 6916, 'available': 1149}\n",
 "2023-04-24 16:05:42,890 started working on samples 6666667-7666666;memory (MB): {'total': 8066, 'used': 6969, 'available': 1097}\n",
 "2023-04-24 16:11:06,735 finished samples 13333334 to 14333333;memory (MB): {'total': 8066, 'used': 7521, 'available': 545};counter size (MB): 0\n",
 "2023-04-24 16:11:25,777 finished samples 6666667 to 7666666;memory (MB): {'total': 8066, 'used': 7511, 'available': 554};counter size (MB): 0\n",
 "2023-04-24 16:11:46,078 started wo

In [28]:
import re

In [35]:
# patterns for the log lines that mark the start / end of the whole counting run;
#  `(.*)` absorbs the leading timestamp, and `$` also matches just before a trailing
#  newline, so the patterns work on lines that still carry their '\n'
start_processing_re = r'(.*)Starting to count ngrams in batches$'
end_processing_re = r'(.*)Finished counting ngrams in batches$'

In [36]:
from collections.abc import Iterable

def filter_lines_that_match_re(lines: list[str], regex: str) -> Iterable[str]:
    """Lazily yield, in order, each line that ``regex`` matches from its start.

    Matching uses ``re.match``, so the pattern is anchored at the beginning of the line.
    """
    yield from (candidate for candidate in lines if re.match(regex, candidate) is not None)

In [38]:
# take the first matching line lazily, and fail with a descriptive error (rather than an
#  opaque IndexError) when the log does not contain the start marker
start_processing_line = next(iter(filter_lines_that_match_re(log_lines, start_processing_re)), None)
if start_processing_line is None:
    raise ValueError(f'no log line matches the start marker {start_processing_re!r}')
start_processing_line

'2023-04-24 15:44:02,535 Starting to count ngrams in batches\n'

In [39]:
# take the first matching line lazily, and fail with a descriptive error (rather than an
#  opaque IndexError) when the log has no end marker, e.g. because the run did not finish
end_processing_line = next(iter(filter_lines_that_match_re(log_lines, end_processing_re)), None)
if end_processing_line is None:
    raise ValueError(f'no log line matches the end marker {end_processing_re!r}')
end_processing_line

'2023-04-24 16:41:18,845 Finished counting ngrams in batches\n'

In [40]:
from datetime import datetime

def get_datetime_from_line(line: str):
    """Parse the timestamp at the start of a log line into a ``datetime``.

    Only the text before the first comma is parsed, so the milliseconds that
    follow the comma in the log format are discarded.
    """
    timestamp_text, _, _ = line.partition(',')
    return datetime.strptime(timestamp_text, '%Y-%m-%d %H:%M:%S')

In [45]:
start_datetime = get_datetime_from_line(start_processing_line)
end_datetime = get_datetime_from_line(end_processing_line)
total_datetime = end_datetime - start_datetime
# timedelta.seconds is only the sub-day remainder (0..86399) and silently drops whole days,
#  so a run longer than 24 hours would be under-reported; total_seconds() covers the full
#  duration. The timestamps are parsed at one-second resolution, so int() is exact here.
total_time_seconds = int(total_datetime.total_seconds())
total_time_seconds

3436

In [46]:
def compute_expected_time_seconds(n_tokens_expected: int, n_workers_expected: int) -> float:
    """Extrapolate the measured run time to another dataset size and worker count.

    Assumes the run time is proportional to the number of tokens and inversely
    proportional to the number of workers, which should be approximately correct.
    The measured run is described by the notebook-level ``total_time_seconds``,
    ``n_workers`` and ``n_tokens_processed``.
    """
    return (
        total_time_seconds
        * (n_workers / n_workers_expected)
        * (n_tokens_expected / n_tokens_processed)
    )

In [50]:
# run-time estimate for the wikipedia token count with 3 workers, reported in hours
wikipedia_n_workers = 3
wikipedia_expected_time_seconds = compute_expected_time_seconds(n_tokens_wikipedia, wikipedia_n_workers)
print(f'expected time for wikipedia with n_workers={wikipedia_n_workers} '
      f'is {wikipedia_expected_time_seconds / 3600:.3f} hours')

expected time for wikipedia with n_workers=3 is 77.650 hours


In [52]:
# run-time estimate for the larger token count with 3 workers; converted
#  seconds -> hours -> days because the value is too large to read comfortably in hours
red_pijama_n_workers = 3
red_pijama_expected_time_seconds = compute_expected_time_seconds(n_tokens_red_pijama, red_pijama_n_workers)
red_pijama_expected_time_hours = red_pijama_expected_time_seconds / 3600
red_pijama_expected_time_days = red_pijama_expected_time_hours / 24
print(f'expected time for RedPijama with n_workers={red_pijama_n_workers} '
      f'is {red_pijama_expected_time_days:.3f} days')

expected time for RedPijama with n_workers=3 is 161.771 days
