In [None]:
pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/1.1 MB[0m [31m7.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tiktoken
Successfully installed tiktoken-0.7.0


In [None]:
import os
import math
import time
import inspect
from dataclasses import dataclass
import torch
import torch.nn as nn
from torch.nn import functional as F
import tiktoken
import numpy as np

In [None]:
import os
import math
import time
import inspect
from dataclasses import dataclass
import torch
import torch.nn as nn
from torch.nn import functional as F
# -----------------------------------------------------------------------------

# Pick the compute backend: CUDA when present, otherwise Apple MPS when this
# torch build exposes it and it is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"using device: {device}")

def exit(discription="Debug Exit",exit=True):
    """Print ``discription`` and optionally terminate the interpreter.

    Args:
        discription: message printed before anything else happens.
        exit: when this compares equal to ``True`` the process is ended with
            ``sys.exit(0)``; otherwise the function just returns ``None``.

    Note: inside this function the parameter ``exit`` shadows the function's
    own name, and the function itself shadows the builtin ``exit``.
    """
    from sys import exit as _terminate
    print(discription)
    if not exit == True:
        return
    _terminate(0)

using device: cuda


In [None]:
def load_tokens(filename):
    """Load a token array saved with ``np.save`` and return it as a ``torch.long`` tensor.

    Args:
        filename: path passed straight to ``np.load``.

    Returns:
        A tensor of dtype ``torch.long`` holding the same values as the file.
    """
    # the array is cast to int32 first and only then converted to a long tensor
    shard = np.load(filename).astype(np.int32)
    return torch.tensor(shard, dtype=torch.long)

In [None]:

class DataLoaderLite:
    """Sequential (B, T) batch loader over token shards found in a directory.

    Shards are every directory entry of ``data_root`` whose name contains the
    split name, visited in sorted order. Each process starts at an offset of
    ``B * T * process_rank`` tokens and strides by ``B * T * num_processes``.
    """

    def __init__(self, B, T, process_rank, num_processes, split, data_root="/content/"):
        """Discover the shards for ``split`` and load the first one.

        Args:
            B: number of rows per batch.
            T: number of tokens per row.
            process_rank: index of this process among ``num_processes``.
            num_processes: number of processes sharing the data.
            split: ``'train'`` or ``'val'``; used as a substring filter on file names.
            data_root: directory that is listed for shard files.

        Raises:
            AssertionError: if ``split`` is invalid or no shard matches it.
        """
        self.B = B
        self.T = T
        self.process_rank = process_rank
        self.num_processes = num_processes
        assert split in {'train', 'val'}

        # get the shard filenames
        shards = sorted(s for s in os.listdir(data_root) if split in s)
        shards = [os.path.join(data_root, s) for s in shards]
        self.shards = shards
        assert len(shards) > 0, f"no shards found for split {split}"
        # master_process is a notebook-level global set by the DDP setup cell
        if master_process:
            print(f"found {len(shards)} shards for split {split}")
        self.reset()

    def reset(self):
        """Rewind to the first shard and to this process's starting offset."""
        self.current_shard = 0
        self.tokens = load_tokens(self.shards[self.current_shard])
        self.current_position = self.B * self.T * self.process_rank

    def next_batch(self):
        """Return the next ``(x, y)`` pair, each of shape ``(B, T)``.

        ``y`` is ``x`` shifted left by one token. When fewer than ``B * T + 1``
        tokens remain at the current position, both tensors are right-padded
        with zeros up to ``B * T`` elements before being reshaped.
        """
        B, T = self.B, self.T
        buf = self.tokens[self.current_position : self.current_position+B*T+1]
        inputs, targets = buf[:-1], buf[1:]
        # pad to the configured batch size (was hard-coded to 8 * 1024)
        padding_size = B * T - inputs.numel()
        if padding_size > 0:
            pad = torch.zeros(padding_size, dtype=buf.dtype)
            inputs = torch.cat([inputs, pad])
            targets = torch.cat([targets, pad])
        x = inputs.view(B, T) # inputs
        y = targets.view(B, T) # targets

        # advance the position in the tensor
        self.current_position += B * T * self.num_processes
        # if loading the next batch would be out of bounds, advance to next shard
        if self.current_position + (B * T * self.num_processes + 1) > len(self.tokens):
            self.current_shard = (self.current_shard + 1) % len(self.shards)
            self.tokens = load_tokens(self.shards[self.current_shard])
            self.current_position = B * T * self.process_rank

        return x, y


In [None]:

class CausalSelfAttention(nn.Module):
    """Multi-head causal self-attention with a fused q/k/v projection."""

    def __init__(self, config):
        """Build the projections from ``config.n_embd`` and ``config.n_head``.

        Raises:
            AssertionError: if ``n_embd`` is not divisible by ``n_head``.
        """
        super().__init__()
        assert config.n_embd % config.n_head == 0
        # one linear layer produces queries, keys and values for every head
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
        # projection applied after the heads are concatenated again
        self.c_proj = nn.Linear(config.n_embd, config.n_embd)
        # marker attribute; GPT._init_weights checks for it with hasattr
        self.c_proj.NANOGPT_SCALE_INIT = 1
        self.n_head = config.n_head
        self.n_embd = config.n_embd

    def forward(self, x):
        """Apply causal attention to ``x`` of shape (B, T, C); the result has the same shape."""
        batch, seq_len, channels = x.size()
        head_dim = channels // self.n_head

        def to_heads(t):
            # (B, T, C) -> (B, nh, T, hs)
            return t.view(batch, seq_len, self.n_head, head_dim).transpose(1, 2)

        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
        q, k, v = to_heads(q), to_heads(k), to_heads(v)
        attended = F.scaled_dot_product_attention(q, k, v, is_causal=True)
        # (B, nh, T, hs) -> (B, T, C): put the head outputs side by side again
        merged = attended.transpose(1, 2).contiguous().view(batch, seq_len, channels)
        return self.c_proj(merged)

class MLP(nn.Module):
    """Feed-forward sub-layer: expand to 4 * n_embd, tanh-approximated GELU, project back."""

    def __init__(self, config):
        super().__init__()
        hidden = 4 * config.n_embd
        self.c_fc    = nn.Linear(config.n_embd, hidden)
        self.gelu    = nn.GELU(approximate='tanh')
        self.c_proj  = nn.Linear(hidden, config.n_embd)
        # marker attribute; GPT._init_weights checks for it with hasattr
        self.c_proj.NANOGPT_SCALE_INIT = 1

    def forward(self, x):
        """Return ``c_proj(gelu(c_fc(x)))``."""
        return self.c_proj(self.gelu(self.c_fc(x)))

class Block(nn.Module):
    """Pre-norm transformer block: residual attention followed by residual MLP."""

    def __init__(self, config):
        super().__init__()
        self.ln_1 = nn.LayerNorm(config.n_embd)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = nn.LayerNorm(config.n_embd)
        self.mlp = MLP(config)

    def forward(self, x):
        """Add the attention output, then the MLP output, each computed on a layer-normed input."""
        after_attn = x + self.attn(self.ln_1(x))
        return after_attn + self.mlp(self.ln_2(after_attn))


In [None]:
@dataclass
class GPTConfig:
    """Hyper-parameters read by the GPT model and its sub-modules."""
    # maximum sequence length (size of the position-embedding table)
    block_size: int = 1_024
    # number of tokens: 50,000 BPE merges + 256 byte tokens + 1 <|endoftext|> token
    vocab_size: int = 50_257
    # number of transformer blocks
    n_layer: int = 12
    # number of attention heads per block
    n_head: int = 12
    # embedding dimension
    n_embd: int = 768

In [None]:

class GPT(nn.Module):
    """Decoder-only transformer language model with tied input/output embeddings."""

    def __init__(self, config):
        """Build the embeddings, ``config.n_layer`` blocks, final norm and LM head."""
        super().__init__()
        self.config = config

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            wpe = nn.Embedding(config.block_size, config.n_embd),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f = nn.LayerNorm(config.n_embd),
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        # weight sharing scheme: the token embedding and the LM head use one tensor
        self.transformer.wte.weight = self.lm_head.weight

        # init params
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise Linear/Embedding weights from N(0, std) and Linear biases to zero.

        ``std`` is 0.02, scaled by ``(2 * n_layer) ** -0.5`` for Linear layers
        that carry the ``NANOGPT_SCALE_INIT`` marker attribute.
        """
        if isinstance(module, nn.Linear):
            std = 0.02
            if hasattr(module, 'NANOGPT_SCALE_INIT'):
                std *= (2 * self.config.n_layer) ** -0.5
            torch.nn.init.normal_(module.weight, mean=0.0, std=std)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        """Run the model on token ids ``idx`` of shape (B, T).

        Returns:
            ``(logits, loss)`` where ``logits`` has shape (B, T, vocab_size) and
            ``loss`` is the cross-entropy against ``targets``, or ``None`` when
            no targets are given.

        Raises:
            AssertionError: if T exceeds ``config.block_size``.
        """
        # idx is of shape (B, T)
        B, T = idx.size()
        assert T <= self.config.block_size, f"Cannot forward sequence of length {T}, block size is only {self.config.block_size}"
        # forward the token and position embeddings
        pos = torch.arange(0, T, dtype=torch.long, device=idx.device) # shape (T)
        pos_emb = self.transformer.wpe(pos) # position embeddings of shape (T, n_embd)
        tok_emb = self.transformer.wte(idx) # token embeddings of shape (B, T, n_embd)
        x = tok_emb + pos_emb
        # forward the blocks of the transformer
        for block in self.transformer.h:
            x = block(x)
        # forward the final layernorm and the classifier
        x = self.transformer.ln_f(x)
        logits = self.lm_head(x) # (B, T, vocab_size)
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss


    def configure_optimizers(self, weight_decay, learning_rate, device_type):
        """Create an AdamW optimizer with weight decay applied only to >=2-D parameters.

        Args:
            weight_decay: decay used for the parameter group of tensors with ``dim() >= 2``.
            learning_rate: initial learning rate.
            device_type: the fused AdamW kernel is requested only when this is ``"cuda"``.

        Returns:
            The configured ``torch.optim.AdamW`` instance.
        """
        # start with all of the candidate parameters (that require grad)
        param_dict = {pn: p for pn, p in self.named_parameters() if p.requires_grad}
        # create optim groups. Any parameter that is 2D will be weight decayed, otherwise no.
        # i.e. all weight tensors in matmuls + embeddings decay, all biases and layernorms don't.
        decay_params = [p for p in param_dict.values() if p.dim() >= 2]
        nodecay_params = [p for p in param_dict.values() if p.dim() < 2]
        optim_groups = [
            {'params': decay_params, 'weight_decay': weight_decay},
            {'params': nodecay_params, 'weight_decay': 0.0}
        ]
        num_decay_params = sum(p.numel() for p in decay_params)
        num_nodecay_params = sum(p.numel() for p in nodecay_params)
        # master_process is a notebook-level global set by the DDP setup cell
        if master_process:
            print(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters")
            print(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters")
        # Create AdamW optimizer and use the fused version if it is available
        fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
        use_fused = fused_available and device_type == "cuda"
        if master_process:
            print(f"using fused AdamW: {use_fused}")
        # only pass the keyword when it is going to be used, so that an AdamW
        # without a `fused` parameter is not handed an unknown argument
        extra_args = {'fused': True} if use_fused else {}
        optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=(0.9, 0.95), eps=1e-8, **extra_args)
        return optimizer


In [None]:
# -----------------------------------------------------------------------------
# helper function for HellaSwag eval
# takes tokens, mask, and logits, returns the index of the completion with the lowest loss

def get_most_likely_row(tokens, mask, logits):
    """Return the index of the row whose masked next-token loss is lowest.

    Args:
        tokens: token ids, one candidate sequence per row.
        mask: same layout as ``tokens``; positions with value 1 are scored.
        logits: model outputs for ``tokens`` with the vocabulary on the last axis.

    Returns:
        A Python int: the argmin over rows of the mean cross-entropy taken
        only over masked positions.
    """
    num_rows = tokens.size(0)
    vocab = logits.size(-1)
    # position t predicts token t+1, so drop the last logit and the first token
    predictions = logits[..., :-1, :].contiguous().view(-1, vocab)
    next_tokens = tokens[..., 1:].contiguous().view(-1)
    per_token = F.cross_entropy(predictions, next_tokens, reduction='none').view(num_rows, -1)
    # the mask is shifted the same way as the tokens
    scored = mask[..., 1:].contiguous()
    row_loss = (per_token * scored).sum(dim=1) / scored.sum(dim=1)
    return row_loss.argmin().item()


In [None]:
# -----------------------------------------------------------------------------
# simple launch:
# python train_gpt2.py
# DDP launch for e.g. 8 GPUs:
# torchrun --standalone --nproc_per_node=8 train_gpt2.py

# run the training loop
from torch.distributed import init_process_group, destroy_process_group
from torch.nn.parallel import DistributedDataParallel as DDP
import torch.distributed as dist

# Set up DDP (distributed data parallel), seeds, and the batch-size bookkeeping.
# This cell defines the globals read elsewhere in the notebook:
# ddp, ddp_rank, ddp_local_rank, ddp_world_size, master_process, device,
# device_type, enc, total_batch_size, B, T and grad_accum_steps.
# torchrun command sets the env variables RANK, LOCAL_RANK, and WORLD_SIZE
ddp = int(os.environ.get('RANK', -1)) != -1 # is this a ddp run?
if ddp:
    # use of DDP atm demands CUDA, we set the device appropriately according to rank
    assert torch.cuda.is_available(), "for now i think we need CUDA for DDP"
    init_process_group(backend='nccl')
    ddp_rank = int(os.environ['RANK'])
    ddp_local_rank = int(os.environ['LOCAL_RANK'])
    ddp_world_size = int(os.environ['WORLD_SIZE'])
    device = f'cuda:{ddp_local_rank}'
    torch.cuda.set_device(device)
    master_process = ddp_rank == 0 # this process will do logging, checkpointing etc.
else:
    # vanilla, non-DDP run: a single process that is also the master
    ddp_rank = 0
    ddp_local_rank = 0
    ddp_world_size = 1
    master_process = True
    # attempt to autodetect device
    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda"
    elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
        device = "mps"
    print(f"using device: {device}")

# added after video, pytorch can be serious about its device vs. device_type distinction
# (anything that is not a cuda device, including "mps", is reported as "cpu" here)
device_type = "cuda" if device.startswith("cuda") else "cpu"

# fixed seeds for the CPU and, when present, the CUDA generators
torch.manual_seed(1337)
if torch.cuda.is_available():
    torch.cuda.manual_seed(1337)

# NOTE(review): `enc` is rebound to a Hugging Face tokenizer in the training
# cell before its first use there, so this tiktoken encoder looks unused.
enc = tiktoken.get_encoding("gpt2")

total_batch_size = 8192 # tokens per optimizer step (2**13 = B * T for one process)
B = 8 # micro batch size
T = 1024 # sequence length
assert total_batch_size % (B * T * ddp_world_size) == 0, "make sure total_batch_size is divisible by B * T * ddp_world_size"
# number of micro-batches whose gradients are accumulated before each optimizer step
grad_accum_steps = total_batch_size // (B * T * ddp_world_size)
if master_process:
    print(f"total desired batch size: {total_batch_size}")
    print(f"=> calculated gradient accumulation steps: {grad_accum_steps}")


using device: cuda
total desired batch size: 8192
=> calculated gradient accumulation steps: 1


In [None]:
B * T * ddp_world_size

8192

In [None]:
import re

def clean_text(text):
    """Strip ``[CLS]``, ``[SEP]`` and ``<unk>`` markers and normalise whitespace.

    Every run of whitespace becomes a single space and the result is trimmed
    at both ends.
    """
    without_markers = re.sub(r'\[CLS\]|\[SEP\]|\<unk\>', '', text)
    return re.sub(r'\s+', ' ', without_markers).strip()


In [None]:

# Data loaders for both splits; they read B, T and the DDP rank/world size
# defined by the DDP setup cell.
train_loader = DataLoaderLite(B=B, T=T, process_rank=ddp_rank, num_processes=ddp_world_size, split="train")
val_loader = DataLoaderLite(B=B, T=T, process_rank=ddp_rank, num_processes=ddp_world_size, split="val")

torch.set_float32_matmul_precision('high')

# create model (vocab_size overrides the GPTConfig default of 50257)
model = GPT(GPTConfig(vocab_size=50304))
# model = GPT.from_pretrained("gpt2") # or init from OpenAI GPT-2 (no from_pretrained is defined on GPT in this notebook)
model.to(device)
use_compile = False # torch.compile interferes with HellaSwag eval and Generation. TODO fix
if use_compile:
    model = torch.compile(model)
if ddp:
    model = DDP(model, device_ids=[ddp_local_rank])
raw_model = model.module if ddp else model # always contains the "raw" unwrapped model

# learning-rate schedule constants, read by get_lr()
max_lr = 6e-4
min_lr = max_lr * 0.1
warmup_steps = 715
# NOTE(review): the figure below is stated for a 10B-token dataset with 0.5M-token
# batches; total_batch_size in this notebook is 8192 — confirm the intended step count.
max_steps = 19073 # 19,073 steps is ~1 epoch, if data is 10B tokens and batch size 0.5M tokens
def get_lr(it):
    """Learning rate for step ``it``: linear warmup, then cosine decay to ``min_lr``.

    Reads the module-level constants ``max_lr``, ``min_lr``, ``warmup_steps``
    and ``max_steps``.
    """
    if it < warmup_steps:
        # linear ramp that reaches max_lr on the last warmup step
        lr = max_lr * (it+1) / warmup_steps
    elif it > max_steps:
        # past the end of the schedule: stay at the floor
        lr = min_lr
    else:
        decay_ratio = (it - warmup_steps) / (max_steps - warmup_steps)
        assert 0 <= decay_ratio <= 1
        # cosine factor goes from 1 at the end of warmup to 0 at max_steps
        coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
        lr = min_lr + coeff * (max_lr - min_lr)
    return lr

# optimize! (the initial learning rate is max_lr; get_lr() overrides it on every step)
optimizer = raw_model.configure_optimizers(weight_decay=0.1, learning_rate=max_lr, device_type=device_type)

# create the log directory we will write checkpoints to and log to
log_dir = "log"
os.makedirs(log_dir, exist_ok=True)
log_file = os.path.join(log_dir, "log.txt")

# AutoTokenizer is otherwise only imported by the data-preparation cell further
# down, so import it here to keep this cell runnable on a fresh kernel.
from transformers import AutoTokenizer
# tokenizer used to encode the generation prompt and decode the samples
enc = AutoTokenizer.from_pretrained('bolbolzaban/gpt2-persian')

with open(log_file, "w") as f: # open for writing to clear the file
    pass

# Main training loop. Each step: (1) every 250 steps evaluate validation loss
# and, every 5000 steps, checkpoint; (2) every 150 steps sample text from the
# model; (3) run grad_accum_steps micro-batches and one optimizer update.
for step in range(max_steps):
    t0 = time.time()
    last_step = (step == max_steps - 1)

    # once in a while evaluate our validation loss
    if step % 250 == 0 or last_step:
        model.eval()
        val_loader.reset()
        with torch.no_grad():
            val_loss_accum = 0.0
            val_loss_steps = 20
            for _ in range(val_loss_steps):
                x, y = val_loader.next_batch()
                x, y = x.to(device), y.to(device)
                with torch.autocast(device_type=device_type, dtype=torch.bfloat16):
                    logits, loss = model(x, y)
                # each batch contributes 1/val_loss_steps, so the sum is the mean loss
                loss = loss / val_loss_steps
                val_loss_accum += loss.detach()
        if ddp:
            dist.all_reduce(val_loss_accum, op=dist.ReduceOp.AVG)
        if master_process:
            print(f"validation loss: {val_loss_accum.item():.4f}")
            with open(log_file, "a") as f:
                f.write(f"{step} val {val_loss_accum.item():.4f}\n")
            if step > 0 and (step % 5000 == 0 or last_step):
                # optionally write model checkpoints
                checkpoint_path = os.path.join(log_dir, f"model_{step:05d}.pt")
                checkpoint = {
                    'model': raw_model.state_dict(),
                    'config': raw_model.config,
                    'step': step,
                    'val_loss': val_loss_accum.item()
                }
                # you might also want to add optimizer.state_dict() and
                # rng seeds etc., if you wanted to more exactly resume training
                torch.save(checkpoint, checkpoint_path)

    # once in a while generate from the model (except step 0, which is noise)
    if ((step > 0 and step % 150 == 0) or last_step) and (not use_compile):
        model.eval()
        num_return_sequences = 4
        max_length = 60
        # NOTE(review): the prompt ends with the literal text ['BOM'] while the
        # training couplets use the marker [BOM] — confirm this is intended.
        tokens = enc.encode("من آن مرغ غزل خوانم['BOM']")
        tokens = torch.tensor(tokens, dtype=torch.long)
        tokens = tokens.unsqueeze(0).repeat(num_return_sequences, 1)
        xgen = tokens.to(device)
        # per-rank generator so sampling does not disturb the global RNG state
        sample_rng = torch.Generator(device=device)
        sample_rng.manual_seed(42 + ddp_rank)
        while xgen.size(1) < max_length:
            # forward the model to get the logits
            with torch.no_grad():
                with torch.autocast(device_type=device_type, dtype=torch.bfloat16):
                    logits, loss = model(xgen) # (B, T, vocab_size)
                # take the logits at the last position
                logits = logits[:, -1, :] # (B, vocab_size)
                # get the probabilities
                probs = F.softmax(logits, dim=-1)
                # do top-k sampling of 50 (huggingface pipeline default)
                # topk_probs and topk_indices are both (num_return_sequences, 50)
                topk_probs, topk_indices = torch.topk(probs, 50, dim=-1)
                # select a token from the top-k probabilities
                # note: multinomial does not demand the input to sum to 1
                ix = torch.multinomial(topk_probs, 1, generator=sample_rng) # (B, 1)
                # gather the corresponding indices
                xcol = torch.gather(topk_indices, -1, ix) # (B, 1)
                # append to the sequence
                xgen = torch.cat((xgen, xcol), dim=1)
        # print the generated text
        for i in range(num_return_sequences):
            tokens = xgen[i, :max_length].tolist()
            decoded = enc.decode(tokens)
            print(f"rank {ddp_rank} sample {i}: {clean_text(decoded)}")

    # do one step of the optimization
    model.train()
    optimizer.zero_grad()
    loss_accum = 0.0
    for micro_step in range(grad_accum_steps):
        x, y = train_loader.next_batch()
        x, y = x.to(device), y.to(device)
        # added after video, this field is also used by the forward pass.
        # gradients are only synchronised across ranks on the last micro-step
        if ddp:
            model.require_backward_grad_sync = (micro_step == grad_accum_steps - 1)
        with torch.autocast(device_type=device_type, dtype=torch.bfloat16):
            logits, loss = model(x, y)
        # we have to scale the loss to account for gradient accumulation,
        # because the gradients just add on each successive backward().
        # addition of gradients corresponds to a SUM in the objective, but
        # instead of a SUM we want MEAN. Scale the loss here so it comes out right
        loss = loss / grad_accum_steps
        loss_accum += loss.detach()
        loss.backward()
    if ddp:
        dist.all_reduce(loss_accum, op=dist.ReduceOp.AVG)
    # clip the global gradient norm to 1.0; `norm` is the value returned by the clip call
    norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    # determine and set the learning rate for this iteration
    lr = get_lr(step)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    optimizer.step()
    if device_type == "cuda":
        torch.cuda.synchronize() # wait for the GPU to finish work
    t1 = time.time()
    dt = t1 - t0 # time difference in seconds
    tokens_processed = train_loader.B * train_loader.T * grad_accum_steps * ddp_world_size
    tokens_per_sec = tokens_processed / dt
    if master_process:
        print(f"step {step:5d} | loss: {loss_accum.item():.6f} | lr {lr:.4e} | norm: {norm:.4f} | dt: {dt*1000:.2f}ms | tok/sec: {tokens_per_sec:.2f}")
        with open(log_file, "a") as f:
            f.write(f"{step} train {loss_accum.item():.6f}\n")

# tear down the process group created by the DDP setup cell
if ddp:
    destroy_process_group()

found 1 shards for split train
found 1 shards for split val
num decayed parameter tensors: 50, with 124,354,560 parameters
num non-decayed parameter tensors: 98, with 121,344 parameters
using fused AdamW: True
validation loss: 10.9340
step     0 | loss: 10.908829 | lr 8.3916e-07 | norm: 26.5932 | dt: 22959.90ms | tok/sec: 356.80
step     1 | loss: 10.818146 | lr 1.6783e-06 | norm: 25.9042 | dt: 2810.61ms | tok/sec: 2914.67
step     2 | loss: 10.654526 | lr 2.5175e-06 | norm: 24.5189 | dt: 2827.45ms | tok/sec: 2897.31
step     3 | loss: 10.464630 | lr 3.3566e-06 | norm: 21.5945 | dt: 2827.33ms | tok/sec: 2897.43
step     4 | loss: 10.211823 | lr 4.1958e-06 | norm: 17.8052 | dt: 2841.54ms | tok/sec: 2882.94
step     5 | loss: 9.995064 | lr 5.0350e-06 | norm: 14.3455 | dt: 2860.12ms | tok/sec: 2864.21
step     6 | loss: 9.809555 | lr 5.8741e-06 | norm: 11.9406 | dt: 2867.53ms | tok/sec: 2856.81
step     7 | loss: 9.634531 | lr 6.7133e-06 | norm: 9.7489 | dt: 2886.93ms | tok/sec: 2837.62
s

In [None]:
rank 0 sample 0: من آن مرغ غزل خوانم[BOM]
 [BOM] این بر[EOS]
  [BOM] از من شدم درو[EOS]
   [BOM] هم از تو که دل در[BOM] تو[BOM] و بی تو[EOS][BOM] باز[BOM] چون چون در هر چه و این این نه به که تو بر[EOS][BOM] در وی ها را[BOM][BOM] چون ز تو[BOM][BOM] ز تو از[EOS][BOM][BOM] چون سر چونش می و را[EOS][BOM] چون


In [None]:
!pip install tiktoken
!pip install datasets

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.9/39.9 MB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl 

In [None]:
import os
import multiprocessing as mp
import numpy as np
import tiktoken
from datasets import load_dataset # pip install datasets
from tqdm import tqdm # pip install tqdm
from transformers import pipeline, AutoTokenizer, GPT2LMHeadModel
from transformers import GemmaTokenizerFast


# Global tokenizer read by tokenize() below. Later scratch cells rebind the
# name `tokenizer` to other checkpoints, so re-run this cell before sharding.
tokenizer = AutoTokenizer.from_pretrained('bolbolzaban/gpt2-persian')


def tokenize(doc):
    """Encode one document with the global ``tokenizer`` into a ``uint16`` array.

    Args:
        doc: text to encode; special tokens are not added.

    Returns:
        A ``numpy`` array of dtype ``uint16`` holding the token ids.

    Raises:
        ValueError: if any token id does not fit in ``uint16``. Without this
            check the cast below would silently wrap such ids around.
    """
    tokens = tokenizer.encode(doc, add_special_tokens=False)
    tokens_np = np.array(tokens)
    if tokens_np.size and not ((0 <= tokens_np).all() and (tokens_np < 2**16).all()):
        raise ValueError("token ids do not fit in uint16")
    tokens_np_uint16 = tokens_np.astype(np.uint16)
    return tokens_np_uint16


def write_datafile(filename, tokens_np):
    """Persist ``tokens_np`` to ``filename`` in NumPy's ``.npy`` format via ``np.save``."""
    np.save(file=filename, arr=tokens_np)

# tokenize all documents and write output shards, each of shard_size tokens (last shard has remainder)
def _read_poem_lines(path):
    """Return the lines of ``path`` after its first two, closing the file afterwards."""
    # the corpus is Persian text, so decode explicitly instead of relying on the platform default
    with open(path, encoding='utf-8') as fp:
        return fp.readlines()[2:]


docs = _read_poem_lines('/content/moulavi_norm.txt')
docs_feyz = _read_poem_lines('/content/feyz_norm.txt')
docs_bidel = _read_poem_lines('/content/bidel_norm.txt')
docs_ghaani = _read_poem_lines('/content/ghaani_norm.txt')

# concatenation order matters: later cells slice `docs`/`lines` by position
docs = docs+docs_bidel+docs_ghaani+docs_feyz
print(len(docs))
# Pair consecutive non-empty lines of `docs` into one training string of the
# form [BOM]<first>[BOM]<second>[EOS].
# presumably mesr1/mesr2 are the two hemistichs of a couplet — confirm against the data files
lines = []
i=0
while i<len(docs)-1:
  mesr1 = docs[i].replace('\n', '')
  mesr2 = docs[i+1].replace('\n', '')
  # an empty first line: move forward by one line and retry
  if len(mesr1)==0:
     i+=1
     continue

  # an empty second line: advance by one, so the empty line is skipped on the next pass
  if len(mesr2)==0:
     i+=1
     continue
  line = '[BOM]'+ mesr1 + '[BOM]' + mesr2+ '[EOS]'
  lines.append(line)
  # both lines consumed
  i+=2

print(len(lines))
# from here on `docs` holds the paired strings, minus the last 3000 of them
docs = lines
docs = docs[:-3000]
# worker count for the tokenization pool and the capacity of one shard in tokens
nprocs = max(1, os.cpu_count()//2)
shard_size = int(1e8)

# Tokenize `docs` in a worker pool and pack the tokens into shards of at most
# shard_size tokens. Token id 0 is dropped from every shard before it is saved.
with mp.Pool(nprocs) as pool:
    # NOTE: with shard_index starting at 1 every shard is labelled "train";
    # a shard is only labelled "val" when its index is 0.
    shard_index = 1
    # preallocate buffer to hold current shard
    all_tokens_np = np.empty((shard_size,), dtype=np.uint16)
    token_count = 0
    progress_bar = None

    for tokens in pool.imap(tokenize, docs, chunksize=16):
        if token_count + len(tokens) < shard_size:
            # simply append tokens to current shard
            all_tokens_np[token_count:token_count+len(tokens)] = tokens
            token_count += len(tokens)
            # update progress bar
            if progress_bar is None:
                progress_bar = tqdm(total=shard_size, unit="tokens", desc=f"Shard {shard_index}")
            progress_bar.update(len(tokens))
        else:
            # write the current shard and start a new one
            split = "val" if shard_index == 0 else "train"
            filename = f"a_hh_{split}_{shard_index:06d}"
            # split the document into whatever fits in this shard; the remainder goes to next one
            remainder = shard_size - token_count
            # the bar does not exist yet if the very first document already overflows
            if progress_bar is not None:
                progress_bar.update(remainder)
            all_tokens_np[token_count:token_count+remainder] = tokens[:remainder]
            # filter into a temporary array: the preallocated buffer must keep
            # its full shard_size length so the next shard can be filled
            write_datafile(filename, all_tokens_np[all_tokens_np!=0])
            shard_index += 1
            progress_bar = None
            # populate the next shard with the leftovers of the current doc
            all_tokens_np[0:len(tokens)-remainder] = tokens[remainder:]
            token_count = len(tokens)-remainder

    # write any remaining tokens as the last shard
    if token_count != 0:
        split = "val" if shard_index == 0 else "train"
        filename = f"a_hh_{split}_{shard_index:06d}"
        # later cells inspect all_tokens_np, so leave it bound to what was written
        all_tokens_np = all_tokens_np[:token_count]
        all_tokens_np = all_tokens_np[all_tokens_np!=0]
        write_datafile(filename, all_tokens_np)


78265
26394



Shard 1:   0%|          | 0/100000000 [00:00<?, ?tokens/s][A
Shard 0:   0%|          | 53976/100000000 [00:19<10:16:57, 2699.99tokens/s]

Shard 1:   0%|          | 24451/100000000 [00:00<14:41, 113411.88tokens/s][A
Shard 1:   0%|          | 37770/100000000 [00:00<13:39, 121942.52tokens/s][A
Shard 1:   0%|          | 50729/100000000 [00:00<13:23, 124466.06tokens/s][A
Shard 1:   0%|          | 63223/100000000 [00:00<13:54, 119727.87tokens/s][A
Shard 1:   0%|          | 76284/100000000 [00:00<13:34, 122673.56tokens/s][A
Shard 1:   0%|          | 88896/100000000 [00:00<13:29, 123382.78tokens/s][A
Shard 1:   0%|          | 102599/100000000 [00:00<13:09, 126557.86tokens/s][A
Shard 1:   0%|          | 116064/100000000 [00:00<13:08, 126692.91tokens/s][A
Shard 1:   0%|          | 128762/100000000 [00:01<14:03, 118354.97tokens/s][A
Shard 1:   0%|          | 140704/100000000 [00:01<14:12, 117105.54tokens/s][A
Shard 1:   0%|          | 154032/100000000 [00:01<13:40, 121729.30tokens/s]

In [None]:
lines[1000:1500]

['[BOM]قسم هر روزش بیاید بی جگر[BOM]حاجتش نبود تقاضایی دگر[EOS]',
 '[BOM]قرعه بر هر که فتادی روز روز[BOM]سوی آن شیر او دویدی همچو یوز[EOS]',
 '[BOM]چون به خرگوش آمد این ساغر بدور[BOM]بانگ زد خرگوش کاخر چند جور[EOS]',
 '[BOM]قوم گفتندش که چندین گاه ما[BOM]جان فدا کردیم در عهد و وفا[EOS]',
 '[BOM]تو مجو بدنامی ما ای عنود[BOM]تا نرنجد شیر رو رو زود زود[EOS]',
 '[BOM]گفت ای یاران مرا مهلت دهید[BOM]تا بمکرم از بلا بیرون جهید[EOS]',
 '[BOM]تا امان یابد بمکرم جانتان[BOM]ماند این میراث فرزندانتان[EOS]',
 '[BOM]هر پیمبر امتان را در جهان[BOM]همچنین تا مخلصی می خواندشان[EOS]',
 '[BOM]کز فلک راه برون شو دیده بود[BOM]در نظر چون مردمک پیچیده بود[EOS]',
 '[BOM]مردمش چون مردمک دیدند خرد[BOM]در بزرگی مردمک کس ره نبرد[EOS]',
 '[BOM]قوم گفتندش که ای خرگوش دار[BOM]خویش را اندازه خرگوش دار[EOS]',
 '[BOM]هین چه لافست این که از تو بهتران[BOM]در نیاوردند اندر خاطر آن[EOS]',
 '[BOM]معجبی یا خود قضامان در پیست[BOM]ور نه این دم لایق چون تو کیست[EOS]',
 '[BOM]گفت ای یاران حقم الهام داد[BOM]مر ضعیفی را قوی رایی فت

In [None]:
all_tokens_np.shape

(53833,)

In [None]:
# Sanity check: decode a literal list of token ids back to text with whatever
# the global `tokenizer` currently is.
tokenizer.decode([    7 , 6416  ,  54  ,1402  , 129 , 1994  ,  52 ,  124   ,  7  ,  50  ,2625  ,  60,
  2427 ,   52 ,  124  ,   9  ,   7  , 999  , 121 ,  123 ,   72 ,  285,  4409,    82,
    43 ,  102   ,  7   , 46 ,17961,    73 ,  428,    45  , 419  ,  43 ,16910,    82,
    43,   102   ,  9   ,  7 , 1085 , 1908 , 1174  ,  82  ,1174    ,82,    50 , 4102,
     7 ,   72 , 2399 , 1174  , 562 , 5901 ])

'[BOM] بشنو این نی چون شکایت می کند[BOM] از جداییها حکایت می کند[EOS][BOM] کز نیستان تا مرا ببریده اند[BOM] در نفیرم مرد و زن نالیده اند[EOS][BOM] سینه خواهم شرحه شرحه از فراق[BOM] تا بگویم شرح درد اشتیاق'

In [None]:
 من آن مرغ غزل خوانم آخر های دری[BOM] مرا او بنی[BOM] خست در کار ما دل حافظ[BOM] می‌کنم[EOS][BOM] برو دل که بگونه بر اک کور باده در طلبه درآب دده درونمان و[BOM] تا در طلب هر کسی دری تا در آن که درو


In [None]:
all_tokens_np[0:100000]

array([   7, 1582,   43, ...,    0,  157,  890], dtype=uint16)

In [None]:
pip install datasets

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (

In [None]:
pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━[0m [32m0.7/1.1 MB[0m [31m21.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tiktoken
Successfully installed tiktoken-0.7.0


In [None]:
!rm -r data_val*

In [None]:
docs['text'].to_numpy()

array(['hello i am snana', 'hello how are you', 'oh hello'], dtype=object)

In [None]:
rank 0 sample 0: روزی در کنار درختی نشسته بودم ش<endoftext|> <|endoftext|>
rank 0 sample 1: روزی در کنار درختی نشسته بودم<یDri<|endoftext|> <|endoftext|> <|endoftext|>و<|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|>
rank 0 sample 2: روزی در کنار درختی نشسته بودم<|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|>و<|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|>ش<|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|>ی<|endoftext|>ن<|endoftext|> <|endoftext|> <|endoftext|>ی<|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|>و<|endoftext|>ه<|endoftext|>م<|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|>ک<|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|>
rank 0 sample 3: روزی در کنار درختی نشسته بودم<|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|>ی<|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|>ن<|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|>ش<|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|> <|endoftext|>ش<|endoftext|> <|endoftext|> <|endoftext|>ا<|endoftext|>ا<|endoftext|> <|endoftext|>م


In [None]:
# Load a pretrained tokenizer from the Hugging Face Hub.
# Only AutoTokenizer is referenced in this cell; the other imported classes are not used here.
from transformers import AutoConfig, AutoTokenizer, AutoModel, TFAutoModel

# v3.0
# Hub repository id whose tokenizer files are fetched (presumably a Persian BERT
# checkpoint, judging by the id — confirm on the model card).
model_name_or_path = "HooshvareLab/bert-fa-zwnj-base"
# Rebinds the notebook-wide `tokenizer` name.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/292 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/565 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/426k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/134 [00:00<?, ?B/s]

In [None]:
tokenizer.tokenize('سلام دوستان من')

['سلام', 'دوستان', 'من']

In [None]:
# Only AutoTokenizer is referenced in this cell; pipeline and GPT2LMHeadModel are not used here.
from transformers import pipeline, AutoTokenizer, GPT2LMHeadModel
# Rebinds the notebook-wide `tokenizer` to the one published under 'bolbolzaban/gpt2-persian'.
tokenizer = AutoTokenizer.from_pretrained('bolbolzaban/gpt2-persian')

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.33k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/537k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.13M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/399 [00:00<?, ?B/s]

In [None]:
# Only AutoTokenizer is referenced in this cell; pipeline and GPT2LMHeadModel are not used here.
from transformers import pipeline, AutoTokenizer, GPT2LMHeadModel
# Rebinds the notebook-wide `tokenizer` to the one published under 'flax-community/gpt2-medium-persian'.
tokenizer = AutoTokenizer.from_pretrained('flax-community/gpt2-medium-persian')
# Tokenize a short Persian phrase; the token list is the cell's displayed value.
# The tokens print in an encoded, non-readable form (see the cell output) —
# presumably this tokenizer's byte-level representation; confirm if it matters.
tokenizer.tokenize('سلام دوستان من')



config.json:   0%|          | 0.00/921 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.41M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.33M [00:00<?, ?B/s]

['Ø³ÙĦØ§Ùħ', 'ĠØ¯ÙĪØ³ØªØ§ÙĨ', 'ĠÙħÙĨ']

In [None]:
# Read the whole corpus into memory as a single string.
# The context manager closes the file handle as soon as the read finishes, and
# the encoding is stated explicitly so the Persian text decodes the same way
# regardless of the platform's default locale.
with open('text.txt', encoding='utf-8') as fp:
    data = fp.read()

In [None]:
data[280:1000]

' گاهی دیگران سرنوشت را تعیین می کنند. زمانی که به گذشته باز می گردیم به کنیم که با یک اتفاقساده، دیگرالحظاتی برخورد می ن توانسته اند زندگیمان را دگرگون کنند.  این داستانی است از یک زندگی.  مسافرین محترم ورود شما را به خاک ایران خوش آمد می گویم. ساعت 02:02  دقیقه به وقت تهران است. هوا هفده درجهباالی صفر و بارانیست. امیدوارم از پرواز لذت برده باشید. لطفت در جای خود نشسته و کمربند را ببندید. آرزوی دیدار مجدد شما را داریم.  هومن- دیگه پامو تو این بشقاب پرنده نمی ذارم. اسمشو باید می ذاشتند شرکت هواپیمایی اتو معلق! خیلی خوب ازمون  پذیرایی کردند که آرزوی دیدار مجددمون را هم دارن؟!  من- چی می گی هومن؟ چرا غر می زنی؟  هومن- می گن داریم سقوط می کنیم. خلبان یادش رفته چرخهای هواپیما رو سوار هواپیما کند.  هر بدی و خوبی از م'

In [None]:
# Disabled alternative: word-level split on single spaces, skipping the first 280 characters.
#data_split = data[280:].split(' ')
# Tokenize the entire corpus string with the currently loaded tokenizer; the
# resulting token list is the cell's displayed value.
tokenizer.tokenize(data)

In [None]:
# Build the word list in this cell: the assignment in the previous cell is
# commented out, so on a fresh kernel `data_split` would otherwise be undefined.
# Words are the single-space-delimited pieces of the corpus after its first 280 characters.
data_split = data[280:].split(' ')
# Sorted list of the distinct words (sorted() accepts the set directly).
sorted(set(data_split))


In [None]:
# importing all the required modules
import PyPDF2

# creating a pdf reader object
reader = PyPDF2.PdfReader('/content/shahnameh.pdf')

# print the number of pages in pdf file
print(len(reader.pages))

# extract the text of the page at index 32 (0-based, i.e. the 33rd page — not the first page)
page = reader.pages[32].extract_text()
# split the extracted text into lines; this list is the cell's displayed value
page.split('\n')

1973


['3کنون ای خردمند وصف خرد ',
 'خرد را و جان را که یارد ستودسه پاس تو چشم است وگوش و زباننخست آفرینش خرد را شناسخرد چشم جانست چون بنگریازویی به هر دو سرای ارجمندهشیوار دیوانه خواند وراکسی کو خرد را ندارد ز پیشچه گفت آن خردمند مرد خردخرد تیره و مرد روشن روانازو شادمانی وزویت غمیستخرد رهنمای و خرد دلگشایخرد بهتر از هر چه ایزد بدادکنون تا چه داری بیار از خرد ',
 'حکیما چو کس نیست گفتن چه سود ',
 'تویی کرده ی کردگار جهان ',
 'خرد دست گیرد به هر دو سرایستایش خرد را به از راه دادکه گوش نیوشنده زو برخوردبدین جایگه گفتن اندرخورداز آغاز باید که دانی درستچو دیدار یابی به شاخ سخنز هر دانشی چون سخن بشنویبه گفتار دانندگان راه جوی ',
 'وزویت فزونی وزویت کمیست ',
 'نباشد همی شادمان یک زمان ',
 'دلش گردد از کرده ی خویش ریشکه دانا ز گفتار از برخورد ',
 'تو بی چشم شادان جهان نسپریگسسته خرد پای دارد ببندهمان خویش بیگانه داند ورا ',
 'کزین سه رسد نیک و بد بی گماننگهبان جانست و آن سه پاس ',
 'و گر من ستایم که یارد شنود ',
 'ازین پس بگو کافرینش چه بود ',
 ' سر مایه ی گوهران از نخستبدانی که دانش نیابد به مناز

In [None]:
from io import StringIO

import PyPDF2


def getPDFContent(path):
    """Return the text of every page of the PDF at ``path`` as one string.

    Pages are extracted in order with PyPDF2 and joined with newlines; a page
    for which no text is returned contributes an empty string.

    NOTE: every page is extracted, so this can be slow for a large PDF.
    """
    reader = PyPDF2.PdfReader(path)
    return "\n".join(page.extract_text() or "" for page in reader.pages)


# Read each line of the PDF
# The text is kept as ``str``: StringIO does not accept bytes, and encoding to
# ASCII with errors="ignore" would discard every Persian character.
pdfContent = StringIO(getPDFContent("/content/shahnameh.pdf"))
for line in pdfContent:
    # Only the first line is printed, as a quick sanity check.
    print(line.strip())
    break

NameError: name 'getPDFContent' is not defined

In [None]:
!pip install pdfquery

Collecting pdfquery
  Downloading pdfquery-0.4.3.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting cssselect>=0.7.1 (from pdfquery)
  Downloading cssselect-1.2.0-py2.py3-none-any.whl (18 kB)
Collecting pdfminer.six (from pdfquery)
  Downloading pdfminer.six-20240706-py3-none-any.whl (5.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m73.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyquery>=1.2.2 (from pdfquery)
  Downloading pyquery-2.0.0-py3-none-any.whl (22 kB)
Collecting roman>=1.4.0 (from pdfquery)
  Downloading roman-4.2-py3-none-any.whl (5.5 kB)
Building wheels for collected packages: pdfquery
  Building wheel for pdfquery (setup.py) ... [?25l[?25hdone
  Created wheel for pdfquery: filename=pdfquery-0.4.3-py3-none-any.whl size=16780 sha256=4cacbdd0adff270ff9ab8c7ff97ef5bbba56d5cce091cd07ec47617bf6ec9008
  Stored in directory: /root/.cache/pip/wheels/98/a2/41/ca6652543d0fa5762560eaaf0f620a5d6341ec0b9e60996

In [None]:
from pdfquery import PDFQuery

# Create the PDFQuery object for the file and load it so it can be queried via pdf.pq(...).
pdf = PDFQuery('/content/shahnameh.pdf')
pdf.load()  # called without arguments — presumably loads every page; confirm against the pdfquery docs

# Use CSS-like selectors to locate the elements
text_elements = pdf.pq('LTTextLineHorizontal')

# Extract the text from the elements
text = [t.text for t in text_elements]

# NOTE(review): this prints the complete list of extracted strings in one go.
print(text)

In [None]:
|pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m225.3/232.6 kB[0m [31m6.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [None]:
import PyPDF2

p = '/content/shahnameh.pdf'
reader = PyPDF2.PdfReader(p)
# PyPDF2 3.x removed reader.getPage() and page.extractText(); index
# reader.pages and call extract_text() instead.
# Displayed value: text of the page at index 35 (0-based) with a trailing newline.
reader.pages[35].extract_text() + "\n"

DeprecationError: reader.getPage(pageNumber) is deprecated and was removed in PyPDF2 3.0.0. Use reader.pages[page_number] instead.

In [None]:
# StringIO ships with the Python standard library (module `io`); there is no
# PyPI package of that name to install, so it is simply imported.
from io import StringIO

[31mERROR: Could not find a version that satisfies the requirement StringIO (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for StringIO[0m[31m
[0m

In [None]:
pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [None]:
tokenizer.encode('سلام وقت به خیر', add_special_tokens=True)

[5, 1533, 446, 48, 1438, 3]

In [None]:
    additional_special_tokens=['<A>', '<B>', '<C>']


# add_special_tokens() only accepts the tokenizer's named special-token slots as
# keys; a custom marker such as '[BOM]' has to be listed under
# 'additional_special_tokens'. Using '[BOM]' itself as the key raises
# "AssertionError: Key [BOM] is not a special token".
special_tokens_dict = {'additional_special_tokens': ['[BOM]']}

# Returns the number of tokens that were newly added to the vocabulary.
# NOTE: if tokens are added and a model is used with this tokenizer, the model's
# embeddings must be resized to len(tokenizer) before training or inference.
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
print('We have added', num_added_toks, 'tokens')


AssertionError: Key [BOM] is not a special token

In [None]:
# Only AutoTokenizer is referenced in this cell; pipeline and GPT2LMHeadModel are not used here.
from transformers import pipeline, AutoTokenizer, GPT2LMHeadModel
# Rebinds the notebook-wide `tokenizer` to the one published under 'bolbolzaban/gpt2-persian'.
tokenizer = AutoTokenizer.from_pretrained('bolbolzaban/gpt2-persian')
# Displayed value is a 2-tuple:
#   1) token ids of a sentence containing the literal '[BOM]' marker, encoded
#      with add_special_tokens=False;
#   2) token strings of the part of that sentence that follows the marker.
tokenizer.encode('تمام امروز خود را بدادم به دل [BOM] کنون ای خردمند وصف خرد', add_special_tokens=False), tokenizer.tokenize('کنون ای خردمند وصف خرد')

([254, 245, 67, 53, 8915, 73, 48, 128, 7, 1641, 213, 7770, 3413, 1716],
 ['▁کنون', '▁ای', '▁خردمند', '▁وصف', '▁خرد'])

In [None]:
tokenizer.decode([254, 245, 67, 53, 8915, 73, 48, 128, 7, 1641, 213, 7770, 3413, 1716])

'تمام امروز خود را بدادم به دل[BOM] کنون ای خردمند وصف خرد'

In [None]:
# Build training lines from the Hafez text file: every pair of consecutive file
# lines becomes '[BOM]<first>[BOM]<second>[EOS]'.
# NOTE(review): the pairs overlap (lines 1-2, 2-3, ...), yielding len(docs) - 1
# entries; confirm that a sliding window rather than disjoint couplets is intended.
# The context manager closes the file after reading, and the explicit encoding
# keeps decoding of the Persian text independent of the platform default.
with open('/content/Hafezfull.txt', encoding='utf-8') as fp:
    docs = fp.readlines()

lines = []
# zip(docs, docs[1:]) walks the same (i, i + 1) pairs as an index loop would.
for mesr1, mesr2 in zip(docs, docs[1:]):
  mesr1 = mesr1.replace('\n', '')
  mesr2 = mesr2.replace('\n', '')
  line = '[BOM]' + mesr1 + '[BOM]' + mesr2 + '[EOS]'
  lines.append(line)

In [None]:
len(lines)

8383