In [14]:
# Mount Google Drive at /content/drive so the paths used below (data_root) are reachable.
# This import only exists inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [15]:
import os
import math
import time
import inspect
from dataclasses import dataclass
import torch
import torch.nn as nn
from torch.nn import functional as F
from transformers import AutoTokenizer

In [16]:
data_root = "/content/drive/MyDrive/nano_gpt" # a path for input, log and output files
poems_file_name = "first.txt"
cls_token = '[CLS]'
bob_token = '[BOB]'
bom_token = '[BOM]'
eos_token = '[EOS]'
def load_and_tokenize(root=None):
    """Tag every poem file under ``<root>/input`` and write them out as one text file.

    For each ``*.txt`` file the first two lines are dropped, blank lines are
    skipped, and the remaining lines are prefixed alternately with
    ``[BOB][BOM]`` and ``[BOM]``. Each poem is wrapped in ``[CLS]`` ... ``[EOS]``
    and the poems are concatenated without a separator.

    Args:
        root: Directory that contains the ``input`` folder and receives the
            aggregated file. Defaults to the module-level ``data_root``.

    Returns:
        The aggregated, tagged text (also written to ``<root>/<poems_file_name>``).
    """
    if root is None:
        root = data_root
    input_dir = os.path.join(root, "input")
    # The names come from the input directory, so they must be joined with it
    # (joining with `root` pointed at files that are not the listed ones).
    # Sorting makes the aggregation order independent of os.listdir ordering.
    poem_files = sorted(
        os.path.join(input_dir, name)
        for name in os.listdir(input_dir)
        if name.endswith('.txt')
    )

    poems = []
    for path in poem_files:
        with open(path, 'r', encoding='utf-8') as f:
            text = f.readlines()[2:] # TODO: the first two lines of every file are skipped
        is_beit = True  # toggles per non-blank line: True -> [BOB][BOM], False -> [BOM]
        poem_str = f'\n{cls_token}'
        for line in text:
            if line.strip() == '':
                continue
            poem_str += f"{bob_token + bom_token if is_beit else bom_token}{line}"
            is_beit = not is_beit
        poem_str += f'{eos_token}'
        # strip() removes the leading newline that precedes [CLS]
        poems.append(poem_str.strip())

    aggregated_text = ''.join(poems)

    output_file_path = root + "/" + poems_file_name # Set the path for the output file
    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        output_file.write(aggregated_text)

    return aggregated_text

In [17]:
class CausalSelfAttention(nn.Module):
    """Multi-head causal self-attention with a single fused Q/K/V projection.

    ``config`` must provide ``n_embd`` and ``n_head`` with ``n_embd`` divisible
    by ``n_head``. Input and output both have shape (B, T, n_embd).
    """

    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        # One linear layer produces queries, keys and values for all heads at once.
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
        # Projection applied after the heads have been concatenated again.
        self.c_proj = nn.Linear(config.n_embd, config.n_embd)
        # Flag checked by GPT._init_weights, which uses a smaller init std for
        # layers that write into the residual stream.
        self.c_proj.NANOGPT_SCALE_INIT = 1
        self.n_head = config.n_head
        self.n_embd = config.n_embd

    def forward(self, x):
        batch, seq_len, channels = x.size()
        head_dim = channels // self.n_head  # channels == n_head * head_dim

        def to_heads(t):
            # (B, T, C) -> (B, n_head, T, head_dim): heads become a batch-like dim
            return t.view(batch, seq_len, self.n_head, head_dim).transpose(1, 2)

        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
        q, k, v = to_heads(q), to_heads(k), to_heads(v)
        # is_causal=True masks out attention to later positions
        attended = F.scaled_dot_product_attention(q, k, v, is_causal=True)
        # put the head outputs back side by side: (B, n_head, T, head_dim) -> (B, T, C)
        merged = attended.transpose(1, 2).contiguous().view(batch, seq_len, channels)
        return self.c_proj(merged)

class MLP(nn.Module):
    """Feed-forward sub-layer: widen to 4 * n_embd, apply tanh-GELU, project back."""

    def __init__(self, config):
        super().__init__()
        hidden_width = 4 * config.n_embd
        self.c_fc = nn.Linear(config.n_embd, hidden_width)
        self.gelu = nn.GELU(approximate='tanh')
        self.c_proj = nn.Linear(hidden_width, config.n_embd)
        # Flag checked by GPT._init_weights to pick a smaller init std for this layer.
        self.c_proj.NANOGPT_SCALE_INIT = 1

    def forward(self, x):
        # expand -> non-linearity -> contract, applied to the last dimension
        return self.c_proj(self.gelu(self.c_fc(x)))

class Block(nn.Module):
    """One transformer layer: LayerNorm -> attention and LayerNorm -> MLP, each added back residually."""

    def __init__(self, config):
        super().__init__()
        self.ln_1 = nn.LayerNorm(config.n_embd)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = nn.LayerNorm(config.n_embd)
        self.mlp = MLP(config)

    def forward(self, x):
        # Attention is the step where positions exchange information with each other.
        attn_out = self.attn(self.ln_1(x))
        x = x + attn_out
        # The MLP then transforms every position on its own.
        mlp_out = self.mlp(self.ln_2(x))
        return x + mlp_out

@dataclass
class GPTConfig:
    """Hyperparameters that define the architecture built by ``GPT``."""
    block_size: int = 1024 # max sequence length; also the number of rows in the position-embedding table
    vocab_size: int =  25048 # rows of the token embedding / width of the output logits. NOTE(review): presumably meant to cover the tokenizer's vocabulary - confirm; the training cell overrides it with 50304
    n_layer: int = 6 # number of transformer blocks
    n_head: int = 6 # number of attention heads per block
    n_embd: int = 384 # embedding dimension

class GPT(nn.Module):
    """Decoder-only transformer language model (token + learned position embeddings,
    a stack of ``Block`` layers, a final LayerNorm and a weight-tied output head)."""

    def __init__(self, config):
        super().__init__()
        self.config = config

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd), # token embeddings
            wpe = nn.Embedding(config.block_size, config.n_embd), # learned position embeddings
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]), # the transformer layers
            ln_f = nn.LayerNorm(config.n_embd), # final layer norm before the output head
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        # weight sharing scheme: the input embedding and the output head use the same tensor
        self.transformer.wte.weight = self.lm_head.weight

        # init params
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise Linear/Embedding weights from N(0, 0.02^2); zero the Linear biases.

        Linear layers flagged with ``NANOGPT_SCALE_INIT`` use a std scaled by
        ``(2 * n_layer) ** -0.5``.
        """
        if isinstance(module, nn.Linear):
            std = 0.02
            if hasattr(module, 'NANOGPT_SCALE_INIT'):
                std *= (2 * self.config.n_layer) ** -0.5
            torch.nn.init.normal_(module.weight, mean=0.0, std=std)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        """Run the model on token ids.

        Args:
            idx: Long tensor of token ids, shape (B, T) with T <= config.block_size.
            targets: Optional tensor of next-token ids with the same shape as ``idx``.

        Returns:
            ``(logits, loss)`` where logits has shape (B, T, vocab_size) and loss is
            the cross-entropy over all positions, or None when ``targets`` is None.
        """
        # idx is of shape (B, T)
        B, T = idx.size()
        assert T <= self.config.block_size, f"Cannot forward sequence of length {T}, block size is only {self.config.block_size}"
        # forward the token and position embeddings
        pos = torch.arange(0, T, dtype=torch.long, device=idx.device) # shape (T)
        pos_emb = self.transformer.wpe(pos) # position embeddings of shape (T, n_embd)
        tok_emb = self.transformer.wte(idx) # token embeddings of shape (B, T, n_embd)
        x = tok_emb + pos_emb # pos_emb broadcasts over the batch dimension
        # forward the blocks of the transformer
        for block in self.transformer.h:
            x = block(x)
        # forward the final layernorm and the classifier
        x = self.transformer.ln_f(x)
        logits = self.lm_head(x) # (B, T, vocab_size)
        loss = None
        if targets is not None:
            # flatten (B, T, vocab) -> (B*T, vocab) and (B, T) -> (B*T) for cross_entropy
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss

    @classmethod
    def from_pretrained(cls, model_type):
        """Loads pretrained GPT-2 model weights from huggingface"""
        assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
        from transformers import GPT2LMHeadModel
        print("loading weights from pretrained gpt: %s" % model_type)

        # n_layer, n_head and n_embd are determined from model_type
        config_args = {
            'gpt2':         dict(n_layer=12, n_head=12, n_embd=768),  # 124M params
            'gpt2-medium':  dict(n_layer=24, n_head=16, n_embd=1024), # 350M params
            'gpt2-large':   dict(n_layer=36, n_head=20, n_embd=1280), # 774M params
            'gpt2-xl':      dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params
        }[model_type]
        config_args['vocab_size'] = 50257 # always 50257 for GPT model checkpoints
        config_args['block_size'] = 1024 # always 1024 for GPT model checkpoints
        # create a from-scratch initialized minGPT model
        config = GPTConfig(**config_args)
        model = GPT(config)
        sd = model.state_dict()
        sd_keys = sd.keys()
        sd_keys = [k for k in sd_keys if not k.endswith('.attn.bias')] # discard this mask / buffer, not a param

        # init a huggingface/transformers model
        model_hf = GPT2LMHeadModel.from_pretrained(model_type)
        sd_hf = model_hf.state_dict()

        # copy while ensuring all of the parameters are aligned and match in names and shapes
        sd_keys_hf = sd_hf.keys()
        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.masked_bias')] # ignore these, just a buffer
        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.bias')] # same, just the mask (buffer)
        transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']
        # basically the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla Linear
        # this means that we have to transpose these weights when we import them
        assert len(sd_keys_hf) == len(sd_keys), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"
        for k in sd_keys_hf:
            if any(k.endswith(w) for w in transposed):
                # special treatment for the Conv1D weights we need to transpose
                assert sd_hf[k].shape[::-1] == sd[k].shape
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k].t())
            else:
                # vanilla copy over the other parameters
                assert sd_hf[k].shape == sd[k].shape
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k])

        return model

    def configure_optimizers(self, weight_decay, learning_rate, device_type, verbose=None):
        """Build an AdamW optimizer with weight decay applied only to >=2-D parameters.

        Args:
            weight_decay: Decay coefficient for the decayed group.
            learning_rate: Initial learning rate for both groups.
            device_type: "cuda" enables the fused AdamW kernel when this torch
                build offers one.
            verbose: Whether to print the parameter-group summary. When None
                (default) the module-level ``master_process`` flag is used if it
                exists, otherwise True, so the method no longer raises NameError
                when that global has not been defined yet.

        Returns:
            The configured ``torch.optim.AdamW`` instance.
        """
        if verbose is None:
            verbose = bool(globals().get('master_process', True))
        # start with all of the candidate parameters (that require grad)
        param_dict = {pn: p for pn, p in self.named_parameters() if p.requires_grad}
        # create optim groups. Any parameters that is 2D will be weight decayed, otherwise no.
        # i.e. all weight tensors in matmuls + embeddings decay, all biases and layernorms don't.
        decay_params = [p for p in param_dict.values() if p.dim() >= 2]
        nodecay_params = [p for p in param_dict.values() if p.dim() < 2]
        optim_groups = [
            {'params': decay_params, 'weight_decay': weight_decay},
            {'params': nodecay_params, 'weight_decay': 0.0}
        ]
        num_decay_params = sum(p.numel() for p in decay_params)
        num_nodecay_params = sum(p.numel() for p in nodecay_params)
        if verbose:
            print(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters")
            print(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters")
        # Create AdamW optimizer and use the fused version if it is available
        fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
        use_fused = fused_available and device_type == "cuda"
        if verbose:
            print(f"using fused AdamW: {use_fused}")
        # Only pass `fused` when it is actually used: older torch versions do not
        # accept the keyword at all, which is what the signature check guards against.
        extra_args = dict(fused=True) if use_fused else dict()
        optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=(0.9, 0.95), eps=1e-8, **extra_args)
        return optimizer


In [19]:
# Load the tokenizer and register the marker strings that load_and_tokenize
# inserts into the poem text as special tokens.
tokenizer = AutoTokenizer.from_pretrained('bolbolzaban/gpt2-persian')
special_tokens = {
    "cls_token": "[CLS]",
    "additional_special_tokens": ["[BOM]", "[BOB]", "[SEP]", "<unk>"],
    "eos_token": "[EOS]",
}
tokenizer.add_special_tokens(special_tokens)

# Also expose the markers as plain attributes on the tokenizer object.
tokenizer.cls_token = "[CLS]"
tokenizer.bob_token = "[BOB]"
tokenizer.bom_token = "[BOM]"
tokenizer.eos_token = "[EOS]"

# `enc` is the name the generation code uses for the tokenizer.
enc = tokenizer

In [20]:
# test
cls_token = '[CLS]'
bob_token = '[BOB]'
bom_token = '[BOM]'
eos_token = '[EOS]'
def load_and_tokenize(filename):
    """Return the text of ``filename`` with the poem markers inserted (no tokenization).

    Blank lines are dropped. The remaining lines are prefixed alternately with
    ``[BOB][BOM]`` and ``[BOM]``, and the whole text is wrapped in ``[CLS]`` ...
    ``[EOS]``. Prints a fixed debug message before returning.
    """
    with open(filename, 'r', encoding='utf-8') as f:
        raw_lines = f.readlines()

    pieces = [f'\n{cls_token}']
    is_beit = True  # flips after every non-blank line
    for raw in raw_lines:
        if raw.strip() == '':
            continue
        prefix = bob_token + bom_token if is_beit else bom_token
        pieces.append(f"{prefix}{raw}")
        is_beit = not is_beit
    pieces.append(f'{eos_token}')

    # strip() drops the newline placed in front of [CLS]
    text = ''.join(pieces).strip()
    print("print something")
    return text

# Smoke test: print the tagged text produced from one file in the input folder.
print(load_and_tokenize(data_root + '/input/2nd.txt'))
# load_and_tokenize(data_root + '/input/aggregated_poems.txt')

print something
[CLS][BOB][BOM]الا يا ايها الساقی ادر کاسا و ناولها
[BOM]که عشق آسان نمود اول ولی افتاد مشکل‌ها
[BOB][BOM]به بوی نافه‌ای کاخر صبا زان طره بگشايد
[BOM]ز تاب جعد مشکينش چه خون افتاد در دل‌ها
[BOB][BOM]مرا در منزل جانان چه امن عيش چون هر دم
[BOM]جرس فرياد می‌دارد که بربنديد محمل‌ها
[BOB][BOM]به می سجاده رنگين کن گرت پير مغان گويد
[BOM]که سالک بی‌خبر نبود ز راه و رسم منزل‌ها
[BOB][BOM]شب تاريک و بيم موج و گردابی چنين هايل
[BOM]کجا دانند حال ما سبکباران ساحل‌ها
[BOB][BOM]همه کارم ز خود کامی به بدنامی کشيد آخر
[BOM]نهان کی ماند آن رازی کز او سازند محفل‌ها
[BOB][BOM]حضوری گر همی‌خواهی از او غايب مشو حافظ
[BOM]متی ما تلق من تهوی دع الدنيا و اهملها
[BOB][BOM]صلاح کار کجا و من خراب کجا
[BOM]ببين تفاوت ره کز کجاست تا به کجا
[BOB][BOM]دلم ز صومعه بگرفت و خرقه سالوس
[BOM]کجاست دير مغان و شراب ناب کجا
[BOB][BOM]چه نسبت است به رندی صلاح و تقوا را
[BOM]سماع وعظ کجا نغمه رباب کجا
[BOB][BOM]ز روی دوست دل دشمنان چه دريابد
[BOM]چراغ مرده کجا شمع آفتاب کجا
[BOB][BOM]چو کحل بينش ما خاک آستان

In [21]:

import numpy as np
cls_token = '[CLS]'
bob_token = '[BOB]'
bom_token = '[BOM]'
eos_token = '[EOS]'
def load_and_tokenize(filename): #TODO
    """Read ``filename``, insert the poem markers and return the token ids.

    The first two lines of the file are skipped and blank lines are dropped.
    The remaining lines are prefixed alternately with ``[BOB][BOM]`` and
    ``[BOM]``, and the text is wrapped in ``[CLS]`` ... ``[EOS]`` before being
    encoded with the module-level ``tokenizer``.

    NOTE(review): the two-line skip looks intended for files that start with a
    header; confirm it is still wanted for the train/val files fed in here.

    Returns:
        A 1-D ``torch.long`` tensor of token ids.
    """
    with open(filename, 'r', encoding='utf-8') as f:
        body_lines = f.readlines()[2:]

    segments = [f'\n{cls_token}']
    is_beit = True  # flips after every non-blank line
    for raw in body_lines:
        if raw.strip() == '':
            continue
        prefix = bob_token + bom_token if is_beit else bom_token
        segments.append(f"{prefix}{raw}")
        is_beit = not is_beit
    segments.append(f'{eos_token}')

    token_ids = tokenizer.encode(''.join(segments).strip())
    return torch.tensor(token_ids, dtype=torch.long)

class DataLoaderLite:
    """Sequential (B, T) batch iterator over one or more tokenized text files.

    Each process reads its own B*T slice of every ``B * T * num_processes``
    window. When the next window would run past the end of the current file,
    the loader moves on to the next file in ``files`` (wrapping around), so a
    single-file loader simply restarts from the beginning.

    Args:
        B: Micro-batch size (rows per batch).
        T: Sequence length (columns per batch).
        process_rank: Index of this process among ``num_processes``.
        num_processes: Number of processes consuming the same files.
        files: Non-empty list of text file paths, tokenized with the
            module-level ``load_and_tokenize``.

    Raises:
        ValueError: if a file yields fewer than ``B * T * num_processes + 1``
            tokens, i.e. not even one batch per process can be cut from it.
    """

    def __init__(self, B, T, process_rank, num_processes, files):
        self.B = B
        self.T = T
        self.process_rank = process_rank
        self.num_processes = num_processes
        self.files = files
        assert len(files) > 0, f"no txt files provided"

        self._loaded_file = None  # index of the file whose tokens are held in self.tokens
        self.reset()

    def _load_tokens(self, file_index):
        """Tokenize ``files[file_index]`` and return its token tensor."""
        return load_and_tokenize(self.files[file_index])

    def _open_file(self, file_index):
        """Make ``file_index`` the current file and rewind to this process's first slice."""
        if self._loaded_file != file_index:
            tokens = self._load_tokens(file_index)
            needed = self.B * self.T * self.num_processes + 1
            if len(tokens) < needed:
                # Fail with a clear message instead of an opaque .view() shape error later.
                raise ValueError(
                    f"{self.files[file_index]} has only {len(tokens)} tokens; at least "
                    f"{needed} are required for B={self.B}, T={self.T}, "
                    f"num_processes={self.num_processes}"
                )
            self.tokens = tokens
            self._loaded_file = file_index
        # When the requested file is already in memory its tokens are reused, which
        # avoids re-reading and re-tokenizing the file on every reset or wrap-around.
        self.current_file = file_index
        self.current_position = self.B * self.T * self.process_rank

    def reset(self):
        """Rewind to the start of the first file."""
        self._open_file(0)

    def next_batch(self):
        """Return ``(x, y)``: inputs and next-token targets, each of shape (B, T)."""
        B, T = self.B, self.T
        # one extra token so the targets can be the inputs shifted left by one
        buf = self.tokens[self.current_position : self.current_position+B*T+1]
        x = (buf[:-1]).view(B, T) # inputs
        y = (buf[1:]).view(B, T) # targets
        # advance the position in the tensor
        self.current_position += B * T * self.num_processes
        # if loading the next batch would be out of bounds, advance to the next file
        if self.current_position + (B * T * self.num_processes + 1) > len(self.tokens):
            self._open_file((self.current_file + 1) % len(self.files))
        return x, y


# -----------------------------------------------------------------------------
# helper function for HellaSwag eval
# takes tokens, mask, and logits, returns the index of the completion with the lowest loss

def get_most_likely_row(tokens, mask, logits):
    """Return the index of the row whose masked region has the lowest mean loss.

    Args:
        tokens: Token ids, one candidate per row.
        mask: Same shape as ``tokens``; non-zero where a position counts
            towards the score.
        logits: Model outputs for ``tokens`` with the vocabulary as last dim.

    Returns:
        The row index (Python int) with the smallest average masked loss.
    """
    # The logits at position t are scored against the token at position t + 1.
    pred_logits = logits[..., :-1, :].contiguous()
    next_tokens = tokens[..., 1:].contiguous()
    per_token_loss = F.cross_entropy(
        pred_logits.view(-1, pred_logits.size(-1)),
        next_tokens.view(-1),
        reduction='none',
    ).view(tokens.size(0), -1)
    # The mask is shifted the same way so it lines up with the shifted targets.
    scored_mask = mask[..., 1:].contiguous()
    row_loss_sum = (per_token_loss * scored_mask).sum(dim=1)
    row_loss_mean = row_loss_sum / scored_mask.sum(dim=1)
    # the lowest mean loss marks the most likely row
    return row_loss_mean.argmin().item()

In [22]:
def generate_poem(model, device, device_type, ddp_rank, phrase):
    """Sample four continuations of ``phrase`` with top-50 sampling.

    Side effects: switches ``model`` to eval mode, prints every sample, writes
    the model weights to ``<data_root>/saved_model.pth`` and appends the samples
    to ``<data_root>/result.txt``.

    NOTE: a later cell defines ``generate_poem`` again with the same signature;
    once that cell has run it replaces this definition.

    Args:
        model: Module returning ``(logits, loss)`` for a (B, T) tensor of ids.
        device: Device the prompt and the sampling generator are placed on.
        device_type: Device type string passed to ``torch.autocast``.
        ddp_rank: Process rank; offsets the sampling seed and labels the output.
        phrase: Prompt text, encoded with the module-level ``enc`` tokenizer.

    Returns:
        List of the decoded samples (prompt included), one string per sequence.
    """
    result = 'result.txt'
    model.eval()
    num_return_sequences = 4
    max_length = 32
    tokens = enc.encode(phrase)
    tokens = torch.tensor(tokens, dtype=torch.long)
    tokens = tokens.unsqueeze(0).repeat(num_return_sequences, 1)
    xgen = tokens.to(device)
    sample_rng = torch.Generator(device=device)
    sample_rng.manual_seed(42 + ddp_rank)

    while xgen.size(1) < max_length:
        # forward the model to get the logits
        with torch.no_grad():
            with torch.autocast(device_type=device_type, dtype=torch.bfloat16):
                logits, loss = model(xgen) # (B, T, vocab_size)
            # take the logits at the last position
            logits = logits[:, -1, :] # (B, vocab_size)
            # get the probabilities
            probs = F.softmax(logits, dim=-1)
            # do top-k sampling of 50 (huggingface pipeline default)
            # topk_probs and topk_indices are both (B, 50)
            topk_probs, topk_indices = torch.topk(probs, 50, dim=-1)
            # select a token from the top-k probabilities
            # note: multinomial does not demand the input to sum to 1
            ix = torch.multinomial(topk_probs, 1, generator=sample_rng) # (B, 1)
            # gather the corresponding indices
            xcol = torch.gather(topk_indices, -1, ix) # (B, 1)
            # append to the sequence
            xgen = torch.cat((xgen, xcol), dim=1)

    # decode and print every sample exactly once
    generated_poems = []
    for i in range(num_return_sequences):
        tokens = xgen[i, :max_length].tolist() #TODO remove special tokens from the decoded text
        decoded = enc.decode(tokens)
        print(f"rank {ddp_rank} sample {i}: {decoded}")
        generated_poems.append(decoded)

    # save the weights once per call rather than once per sample
    torch.save(model.state_dict(), data_root + '/saved_model.pth')

    # explicit UTF-8 so the Persian text is written the same way on every platform
    output_file = data_root + "/" + result
    with open(output_file, "a", encoding="utf-8") as out_f:
        for i, decoded in enumerate(generated_poems):
            out_f.write(f"sample {i}: {decoded}\n")
    return generated_poems  # Return the list of generated poems



In [23]:
def generate_poem(model, device, device_type, ddp_rank, phrase,
                  num_return_sequences=4, max_length=32, top_k=50):
    """Sample continuations of ``phrase`` with top-k sampling.

    Side effects: switches ``model`` to eval mode (it is not switched back),
    prints every sample, writes the model weights to
    ``<data_root>/saved_model.pth`` and appends the samples to
    ``<data_root>/result.txt``.

    Args:
        model: Module returning ``(logits, loss)`` for a (B, T) tensor of ids.
        device: Device the prompt and the sampling generator are placed on.
        device_type: Device type string passed to ``torch.autocast``.
        ddp_rank: Process rank; offsets the sampling seed and labels the output.
        phrase: Prompt text, encoded with the module-level ``enc`` tokenizer.
        num_return_sequences: Number of samples drawn from the same prompt.
        max_length: Total length in tokens (prompt included) of each sample.
        top_k: Number of highest-probability tokens sampled from at each step
            (clamped to the vocabulary size).

    Returns:
        List of the decoded samples, one string per sequence.
    """
    result = 'result.txt'
    model.eval()
    tokens = enc.encode(phrase)
    tokens = torch.tensor(tokens, dtype=torch.long)
    tokens = tokens.unsqueeze(0).repeat(num_return_sequences, 1)
    xgen = tokens.to(device)
    # per-rank seed: samples are reproducible but differ between processes
    sample_rng = torch.Generator(device=device)
    sample_rng.manual_seed(42 + ddp_rank)

    generated_poems = []  # List to store generated poems

    while xgen.size(1) < max_length:
        # forward the model to get the logits
        with torch.no_grad():
            with torch.autocast(device_type=device_type, dtype=torch.bfloat16):
                logits, loss = model(xgen) # (B, T, vocab_size)
            # take the logits at the last position
            logits = logits[:, -1, :] # (B, vocab_size)
            # get the probabilities
            probs = F.softmax(logits, dim=-1)
            # keep only the k most probable tokens of every row
            k = min(top_k, probs.size(-1))
            topk_probs, topk_indices = torch.topk(probs, k, dim=-1)
            # select a token from the top-k probabilities
            # (multinomial does not require its input to sum to 1)
            ix = torch.multinomial(topk_probs, 1, generator=sample_rng) # (B, 1)
            # gather the corresponding indices
            xcol = torch.gather(topk_indices, -1, ix) # (B, 1)
            # append to the sequence
            xgen = torch.cat((xgen, xcol), dim=1)

    # collect and print the generated text
    for i in range(num_return_sequences):
        tokens = xgen[i, :max_length].tolist()
        decoded = enc.decode(tokens)
        print(f"rank {ddp_rank} sample {i}: {decoded}")
        generated_poems.append(decoded)

    # Save the model state (optional)
    # NOTE(review): a DistributedDataParallel-wrapped model would be saved with the
    # wrapper's key names - confirm this is only called with a loadable model.
    torch.save(model.state_dict(), data_root + '/saved_model.pth')

    # Append the generated poems to the result file; explicit UTF-8 so the
    # Persian text is written the same way on every platform.
    output_file = data_root + "/" + result
    with open(output_file, "a", encoding="utf-8") as out_f:
        for i, poem in enumerate(generated_poems):
            out_f.write(f"sample {i}: {poem}\n")

    return generated_poems  # Return the list of generated poems


In [24]:
# Show the environment variables that decide whether this is a DDP run
# ('Not set' when a variable is absent).
for env_name in ("RANK", "LOCAL_RANK", "WORLD_SIZE"):
    print(f"{env_name}:", os.environ.get(env_name, 'Not set'))

RANK: Not set
LOCAL_RANK: Not set
WORLD_SIZE: Not set


In [30]:
from torch.distributed import init_process_group, destroy_process_group
from torch.nn.parallel import DistributedDataParallel as DDP
import torch.distributed as dist

# set up DDP (distributed data parallel).
# torchrun command sets the env variables RANK, LOCAL_RANK, and WORLD_SIZE
# A run counts as DDP exactly when RANK is present in the environment.
ddp = int(os.environ.get('RANK', -1)) != -1 # is this a ddp run?
print(ddp, "DDP")
if ddp:
    # use of DDP atm demands CUDA, we set the device appropriately according to rank
    assert torch.cuda.is_available(), "for now i think we need CUDA for DDP"
    init_process_group(backend='nccl')
    ddp_rank = int(os.environ['RANK'])
    ddp_local_rank = int(os.environ['LOCAL_RANK'])
    ddp_world_size = int(os.environ['WORLD_SIZE'])
    device = f'cuda:{ddp_local_rank}'
    torch.cuda.set_device(device)
    master_process = ddp_rank == 0 # this process will do logging, checkpointing etc.
else:
    # vanilla, non-DDP run: a single process that is also the master
    ddp_rank = 0
    ddp_local_rank = 0
    ddp_world_size = 1
    master_process = True
    # attempt to autodetect device: prefer CUDA, then Apple MPS, else CPU
    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda"
    elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
        device = "mps"
    print(f"using device: {device}")

# device_type is what torch.autocast and the optimizer setup receive below;
# anything that is not CUDA (including "mps") is mapped to "cpu" here.
device_type = "cuda" if device.startswith("cuda") else "cpu"

# fixed seeds for torch (CPU and, when present, CUDA)
torch.manual_seed(1337)
if torch.cuda.is_available():
    torch.cuda.manual_seed(1337)

# enc = tiktoken.get_encoding("gpt2")

# Tokens consumed per optimizer step across all processes. The value equals
# B * T for a single process, so grad_accum_steps comes out as 1 in that case.
total_batch_size = 8192 # in number of tokens (the commented-out alternative was 524288 = 2**19)
B = 8 # micro batch size
T = 1024 # sequence length
assert total_batch_size % (B * T * ddp_world_size) == 0, "make sure total_batch_size is divisible by B * T * ddp_world_size"
# number of micro-batches whose gradients are accumulated before each optimizer step
grad_accum_steps = total_batch_size // (B * T * ddp_world_size)
if master_process:
    print(f"total desired batch size: {total_batch_size}")
    print(f"=> calculated gradient accumulation steps: {grad_accum_steps}")

import random

# Train/validation split at line level: the poems file is read once, its lines
# are shuffled and the first 90% go to train.txt, the rest to val.txt.
# (An earlier, now removed, variant split the list of input files instead.)
with open(data_root + "/input/" + poems_file_name, 'r', encoding='utf-8') as f:
    lines = f.readlines()

# Make sure every line ends with a newline: a final line without one would be
# glued to whichever line follows it after shuffling.
all_lines = [line if line.endswith('\n') else line + '\n' for line in lines]

# Seeded shuffle so the split is reproducible from run to run (and identical in
# every process of a DDP run); 1337 matches the torch seed used in this cell.
random.Random(1337).shuffle(all_lines)

# Split the lines into training and validation sets
split_index = int(0.9 * len(all_lines))  # 90% for training, 10% for validation
train_lines = all_lines[:split_index]
val_lines = all_lines[split_index:]

train_file_path = os.path.join(data_root, 'train.txt')
val_file_path = os.path.join(data_root, 'val.txt')

with open(train_file_path, 'w', encoding='utf-8') as train_file:
    train_file.writelines(train_lines)

with open(val_file_path, 'w', encoding='utf-8') as val_file:
    val_file.writelines(val_lines)

print(train_file_path)
print(val_file_path)
train_loader = DataLoaderLite(B=B, T=T, process_rank=ddp_rank, num_processes=ddp_world_size, files=[train_file_path])
val_loader = DataLoaderLite(B=B, T=T, process_rank=ddp_rank, num_processes=ddp_world_size, files=[val_file_path])

print("data is ready....")


# 'high' lets float32 matrix multiplications use faster, lower-precision
# internal formats where the hardware supports them (see the torch docs).
torch.set_float32_matmul_precision('high')

# Build the model from scratch with a 50304-entry vocabulary and move it to the device.
# model = GPT.from_pretrained("gpt2") # or init from OpenAI GPT-2
model = GPT(GPTConfig(vocab_size=50304))
model.to(device)

use_compile = False # torch.compile interferes with HellaSwag eval and Generation. TODO fix
if use_compile:
    model = torch.compile(model)

# `raw_model` always refers to the unwrapped module, whether or not DDP wraps it.
if ddp:
    model = DDP(model, device_ids=[ddp_local_rank])
    raw_model = model.module
else:
    raw_model = model

max_lr = 6e-4
min_lr = max_lr * 0.1
warmup_steps = 700
# Number of optimizer steps in the training loop and end point of the cosine decay.
# NOTE(review): 19073 was chosen for 10B tokens at 0.5M tokens per step - confirm it suits this dataset.
max_steps = 19073
def get_lr(it):
    """Learning rate for step ``it``: linear warmup, cosine decay, then a constant floor.

    * ``it < warmup_steps``: rises linearly, reaching ``max_lr`` at the last warmup step.
    * ``it > max_steps``: stays at ``min_lr``.
    * otherwise: cosine interpolation from ``max_lr`` down to ``min_lr``.
    """
    if it < warmup_steps:
        lr = max_lr * (it+1) / warmup_steps
    elif it > max_steps:
        lr = min_lr
    else:
        progress = (it - warmup_steps) / (max_steps - warmup_steps)
        assert 0 <= progress <= 1
        # cosine_weight goes from 1 (start of decay) to 0 (end of decay)
        cosine_weight = 0.5 * (1.0 + math.cos(math.pi * progress))
        lr = min_lr + cosine_weight * (max_lr - min_lr)
    return lr

# optimize!
# The learning rate passed here is only the initial value: every training step
# below overwrites it with get_lr(step).
optimizer = raw_model.configure_optimizers(weight_decay=0.1, learning_rate=6e-4, device_type=device_type)

# create the log directory we will write checkpoints to and log to
log_dir = data_root + "/log"
os.makedirs(log_dir, exist_ok=True)
log_file = os.path.join(log_dir, f"log.txt")
with open(log_file, "w") as f: # open for writing to clear the file
    pass
output_file = data_root + "/output.txt" # NOTE(review): not referenced again in the visible cells
for step in range(max_steps):
    t0 = time.time()
    last_step = (step == max_steps - 1)

    # once in a while evaluate our validation loss
    if step % 250 == 0 or last_step:
        model.eval() # evaluation only: no training behaviour in this section
        val_loader.reset() # always evaluate from the start of the validation data
        with torch.no_grad():
            val_loss_accum = 0.0
            val_loss_steps = 20
            for _ in range(val_loss_steps):
                x, y = val_loader.next_batch()
                x, y = x.to(device), y.to(device)
                with torch.autocast(device_type=device_type, dtype=torch.bfloat16): # forward pass under bfloat16 autocast
                    logits, loss = model(x, y)
                # dividing by the step count makes the accumulated value a mean
                loss = loss / val_loss_steps
                val_loss_accum += loss.detach()
        if ddp:
            # average the validation loss over all processes
            dist.all_reduce(val_loss_accum, op=dist.ReduceOp.AVG)
        if master_process:
            print(f"validation loss: {val_loss_accum.item():.4f}")
            with open(log_file, "a") as f:
                f.write(f"{step} val {val_loss_accum.item():.4f}\n")
            if step > 0 and (step % 5000 == 0 or last_step):
                # optionally write model checkpoints
                checkpoint_path = os.path.join(log_dir, f"model_{step:05d}.pt")
                checkpoint = {
                    'model': raw_model.state_dict(),
                    'config': raw_model.config,
                    'step': step,
                    'val_loss': val_loss_accum.item()
                }
                # you might also want to add optimizer.state_dict() and
                # rng seeds etc., if you wanted to more exactly resume training
                torch.save(checkpoint, checkpoint_path)

    # HellaSwag evaluation is disabled in this notebook. The code that stood here
    # iterated over iterate_examples("val"), rendered each example with
    # render_example, scored it with get_most_likely_row, summed the counts over
    # all processes and logged "{step} hella {acc}" to the log file.
    # iterate_examples / render_example are not defined in the visible cells.

    # once in a while generate from the model (except step 0, which is noise)
    if ((step > 0 and step % 250 == 0) or last_step) and (not use_compile):
        phrase = "دردم از یار است و درمان نیز هم"
        # generate_poem puts the model in eval mode; model.train() below switches it back
        generate_poem(model, device, device_type, ddp_rank, phrase)
    # do one step of the optimization
    print("start training the model")
    model.train()
    optimizer.zero_grad()
    loss_accum = 0.0
    for micro_step in range(grad_accum_steps):
        print(f"step {micro_step} of training")
        x, y = train_loader.next_batch()
        x, y = x.to(device), y.to(device)
        # added after video, this field is also used by the forward pass.
        # Gradients are only synchronised across processes on the last micro step.
        if ddp:
            model.require_backward_grad_sync = (micro_step == grad_accum_steps - 1)
        with torch.autocast(device_type=device_type, dtype=torch.bfloat16):
            logits, loss = model(x, y)
        # we have to scale the loss to account for gradient accumulation,
        # because the gradients just add on each successive backward().
        # addition of gradients corresponds to a SUM in the objective, but
        # instead of a SUM we want MEAN. Scale the loss here so it comes out right
        loss = loss / grad_accum_steps
        loss_accum += loss.detach()
        loss.backward()
    if ddp:
        # average the logged training loss over all processes
        dist.all_reduce(loss_accum, op=dist.ReduceOp.AVG)
    # clip the global gradient norm to 1.0; `norm` is the value returned by the clipping call
    norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    # determine and set the learning rate for this iteration
    lr = get_lr(step)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    optimizer.step()
    if device_type == "cuda":
        torch.cuda.synchronize() # wait for the GPU to finish work
    t1 = time.time()
    dt = t1 - t0 # time difference in seconds
    tokens_processed = train_loader.B * train_loader.T * grad_accum_steps * ddp_world_size
    tokens_per_sec = tokens_processed / dt
    if master_process:
        print(f"step {step:5d} | loss: {loss_accum.item():.6f} | lr {lr:.4e} | norm: {norm:.4f} | dt: {dt*1000:.2f}ms | tok/sec: {tokens_per_sec:.2f}")
        with open(log_file, "a") as f:
            f.write(f"{step} train {loss_accum.item():.6f}\n")

# shut the process group down once training has finished
if ddp:
    destroy_process_group()


False DDP
using device: cuda
total desired batch size: 8192
=> calculated gradient accumulation steps: 1
/content/drive/MyDrive/nano_gpt/train.txt
/content/drive/MyDrive/nano_gpt/val.txt
data is ready....
num decayed parameter tensors: 26, with 30,326,784 parameters
num non-decayed parameter tensors: 50, with 30,720 parameters
using fused AdamW: True
validation loss: 10.9556
start training the model
step 0 of training
step     0 | loss: 10.951500 | lr 8.5714e-07 | norm: 9.6144 | dt: 6098.91ms | tok/sec: 1343.19
start training the model
step 0 of training
step     1 | loss: 10.939346 | lr 1.7143e-06 | norm: 9.5394 | dt: 744.07ms | tok/sec: 11009.76
start training the model
step 0 of training
step     2 | loss: 10.920486 | lr 2.5714e-06 | norm: 9.7699 | dt: 741.02ms | tok/sec: 11054.98
start training the model
step 0 of training
step     3 | loss: 10.874313 | lr 3.4286e-06 | norm: 9.6087 | dt: 744.13ms | tok/sec: 11008.84
start training the model
step 0 of training
step     4 | loss: 10.

KeyboardInterrupt: 

In [28]:
# Load the trained weights, generate poems for a seed phrase, and save them.
#
# `model` must already exist (it is built in an earlier cell). When running this
# cell in a fresh session, construct it first, for example:
#   from GPT import MyModel   # replace with your model class definition
#   model = MyModel()
#
# NOTE(review): torch.load unpickles the file, so only load checkpoints you trust.
# map_location remaps the stored tensors onto the current device, so a checkpoint
# written on a GPU runtime still loads when only a CPU is available.
model.load_state_dict(torch.load(data_root + '/saved_model.pth', map_location=device))  # replace with your model path
# The training loop leaves the model in train() mode; switch to inference mode
# before sampling.
model.eval()
phrase = "توانا بود هرکه دانا بود"
# generate_poem is expected to return an iterable of strings (one per sample),
# since each item is concatenated with '\n' below.
generated_poems = generate_poem(model, device, device_type, ddp_rank, phrase)

# One generated poem per line.
result_file_path = data_root + '/result.txt'
with open(result_file_path, 'w', encoding='utf-8') as f:
    for poem in generated_poems:
        f.write(poem + '\n')

print(f"Generated poems saved to {result_file_path}")


# Collect every special-token string to strip from the generated text: the CLS
# and EOS markers first, followed by the additional special tokens.
tokens_to_remove = [
    special_tokens[key] for key in ("cls_token", "eos_token")
] + special_tokens["additional_special_tokens"]
# TODO (carried over from the original; the intended follow-up is not stated)

# Function to remove special tokens from a line
def remove_special_tokens(line, tokens=None):
    """Return `line` with every special-token string deleted.

    Args:
        line: Text to clean.
        tokens: Optional iterable of token strings to delete. When omitted,
            the notebook-level `tokens_to_remove` list is used, which matches
            the original single-argument behaviour.

    Returns:
        The cleaned string. Text between tokens (including whitespace and the
        trailing newline) is left untouched.
    """
    if tokens is None:
        tokens = tokens_to_remove
    for token in tokens:
        if not token:
            # Skip None / "" entries (e.g. a special token that is not
            # configured) instead of failing inside str.replace.
            continue
        line = line.replace(token, "")
    return line

# Strip the special tokens from every generated line and save the cleaned text.
input_file_path = data_root + "/result.txt"  # raw generations, one poem per line
output_file_path = data_root + '/' + 'final_output.txt'  # cleaned copy of the same lines

with open(input_file_path, 'r', encoding='utf-8') as infile, open(output_file_path, 'w', encoding='utf-8') as outfile:
    for line in infile:
        # Remove special tokens from the current line
        cleaned_line = remove_special_tokens(line)
        # Lines read from a file keep their trailing newline; strip it for the
        # echo so print() does not emit a blank line after every poem.
        print(cleaned_line.rstrip('\n'))
        # The file copy keeps the original newline so the line structure is preserved.
        outfile.write(cleaned_line)

print("Special tokens removed and output saved to", output_file_path)

rank 0 sample 0: [CLS] توانا بود هرکه دانا بود[SEP] زلف‌دهد[BOB][BOM] من در در جان با دل هر درب کن و ه<unk>ار <unk>ن<unk>نه
rank 0 sample 1: [CLS] توانا بود هرکه دانا بود[SEP] دوست[BOM] چشم <unk>چ و خاک م همه حافظ[BOB][BOM] سر سر و آن ب<unk>ز<unk>ن<unk>نه
rank 0 sample 2: [CLS] توانا بود هرکه دانا بود[SEP] است[BOM] می‌زنم ای خاک چشم ب<unk>ن دل از در ا<unk>ه<unk>ن<unk>ن<unk>ق<unk>
rank 0 sample 3: [CLS] توانا بود هرکه دانا بود[SEP] به خاک خود تو ن<unk>ان تو کرد[BOM] تا گل ما می‌ان[BOM] آن نظر ب<unk>مت<unk>
Generated poems saved to /content/drive/MyDrive/nano_gpt/result.txt
 توانا بود هرکه دانا بود زلف‌دهد من در در جان با دل هر درب کن و هار ننه

 توانا بود هرکه دانا بود دوست چشم چ و خاک م همه حافظ سر سر و آن بزننه

 توانا بود هرکه دانا بود است می‌زنم ای خاک چشم بن دل از در اهننق

 توانا بود هرکه دانا بود به خاک خود تو نان تو کرد تا گل ما می‌ان آن نظر بمت

Special tokens removed and output saved to /content/drive/MyDrive/nano_gpt/final_output.txt
