In [3]:
from pathlib import Path
from typing import Generator, Iterable, List, Optional, Tuple

from tokenizer import Tokenizer

# Root of the project checkout; every data directory below is derived from it.
# NOTE(review): absolute path - the notebook only runs where the repository
# lives at exactly this location.
PROJECT_ROOT = Path('/workspaces/ai-indent')
DATA_DIR = PROJECT_ROOT / 'data'
RAW_DATA_DIR = DATA_DIR / 'raw'
INTERIM_DATA_DIR = DATA_DIR / 'interim'
# Directory scanned by get_sub_blocks(), one subdirectory per file type.
CODE_BLOCKS_DIR = INTERIM_DATA_DIR / 'code_blocks'
PROCESSED_DATA_DIR = DATA_DIR / 'processed'

# File suffixes (leading dot included) that get_sub_blocks() reads. Each suffix
# is used twice there: without the dot as the subdirectory name under
# CODE_BLOCKS_DIR, and with the dot as the filter on the files inside it.
TRAINABLE_FILE_TYPES = {".ads", ".adb", ".gpr", ".ada"}

# NOTE(review): neither limit is referenced by the cells visible here;
# presumably caps applied when building training samples - confirm.
MAX_TOKENS = 256
MAX_INDENTATION = 120

def get_sub_blocks(
    code_blocks_dir: Optional[Path] = None,
    file_types: Optional[Iterable[str]] = None,
) -> Generator[Tuple[str, str, str], None, None]:
    """Yield statement-terminated sub-blocks from the code-block files.

    Each file is cut after every line whose stripped text ends with ``;``.
    For every such cut one sample is produced: the text since the previous
    cut (up to and including the ``;`` line) together with the raw line
    that follows it. Because a following line is required, a ``;`` on the
    very last line of a file produces no sample.

    Args:
        code_blocks_dir: Directory holding one subdirectory per file type,
            named after the suffix without its dot. Defaults to
            ``CODE_BLOCKS_DIR``.
        file_types: File suffixes (with leading dot) to read. Defaults to
            ``TRAINABLE_FILE_TYPES``.

    Yields:
        ``(file_name, block, next_line)`` where ``block`` is the joined
        lines of the sub-block and ``next_line`` is the line right after it,
        both with their original line endings.
    """
    root = CODE_BLOCKS_DIR if code_blocks_dir is None else Path(code_blocks_dir)
    suffixes = set(TRAINABLE_FILE_TYPES if file_types is None else file_types)

    # Iteration order of a set of strings and of iterdir() is not stable
    # between runs; sort both so the samples come out in a reproducible order.
    for suffix in sorted(suffixes):
        category_dir = root / suffix[1:]  # ".adb" -> "adb"
        if not category_dir.is_dir():
            # A file type without any extracted files is not an error.
            continue
        for path in sorted(category_dir.iterdir()):
            if not (path.is_file() and path.suffix in suffixes):
                continue
            with open(path, "r", encoding="utf-8") as f:
                lines = f.readlines()
            block_start = 0
            # Stop one line early: every sample needs a following line.
            for i in range(len(lines) - 1):
                if lines[i].strip().endswith(';'):
                    yield path.name, ''.join(lines[block_start:i + 1]), lines[i + 1]
                    block_start = i + 1

def create_dataset():
    """Count how many sub-blocks there are for each encoded length.

    Every block yielded by ``get_sub_blocks()`` is passed to
    ``Tokenizer.encode`` and the length of the result is tallied; the file
    name and the following line of each sample are ignored.

    Returns:
        dict mapping an encoded length to the number of sub-blocks that have
        exactly that length.
    """
    histogram = {}
    for _file_name, sub_block, _following_line in get_sub_blocks():
        n_tokens = len(Tokenizer.encode(sub_block))
        if n_tokens in histogram:
            histogram[n_tokens] += 1
        else:
            histogram[n_tokens] = 1
    return histogram

In [4]:
# Histogram {encoded length: number of sub-blocks}. Bound to the name the
# plotting cell reads, so the notebook survives Restart & Run All.
block_token_length_counts = create_dataset()

In [None]:
import matplotlib.pyplot as plt

# Expects `block_token_length_counts` (encoded length -> number of sub-blocks)
# to have been defined by an earlier cell.
fig, ax = plt.subplots()
# Materialise the dict views so matplotlib receives plain sequences.
ax.bar(list(block_token_length_counts.keys()), list(block_token_length_counts.values()))
ax.set(
    title="Sub-block length distribution",
    xlabel="Tokens per sub-block",
    ylabel="Number of sub-blocks",
)
plt.show()