In [None]:
!pip install lightning
!git clone https://github.com/sathishkumar67/llama-python.git
!git clone https://huggingface.co/pt-sk/llama-py
    
# move the contents from /kaggle/working/llama-python to /kaggle/working/
!mv /kaggle/working/llama-python/* /kaggle/working

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import lightning as L
from torch import optim
from torch.optim.lr_scheduler import CosineAnnealingLR

from architecture import ModelArgs, Transformer
from dataset import DataArgs, TokenDataset
from datasets import load_dataset
from transformers import LlamaTokenizer

import warnings
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings("ignore")

import os
# Switch off the Hugging Face tokenizers' internal parallelism; presumably to
# avoid its fork warning once DataLoader worker processes start - TODO confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Seed torch's global RNG so repeated runs start from the same random state.
torch.manual_seed(42)

In [None]:
# Hyper-parameters handed to the Transformer constructor.
args = ModelArgs(
    dim=768,
    n_layers=12,
    n_heads=12,
    vocab_size=32000,
    max_seq_len=768,
    n_kv_heads=6,
)

# Settings for the token dataset and its DataLoader.
data_args = DataArgs(
    batch_size=8,
    block_size=768,
    pad_token_id=0,
)

In [None]:
# Hugging Face dataset ids used for the successive training stages:
# bigcode/python-stack-v1-functions-filtered-sc2
# muellerzr/python-stack-v1-functions-filtered-llama-3-8B
# Vezora/Tested-22k-Python-Alpaca
# Vezora/Tested-143k-Python-Alpaca

In [None]:
# Download the instruction dataset from the Hugging Face Hub.
ds = load_dataset("Vezora/Tested-143k-Python-Alpaca")

# Switch the output format so that indexing a split yields pandas objects.
ds.set_format(type="pandas")

# Materialise the full train split as a DataFrame.
df = ds["train"][:]

# Build one training string per row. Missing fields are treated as empty
# strings: concatenating a str column with NaN yields NaN, and a single NaN in
# df["text"] would make the join below raise TypeError.
fields = df[["input", "instruction", "output"]].fillna("")
df["text"] = fields["input"] + ". " + fields["instruction"] + ". " + fields["output"]

# Variant kept for datasets that expose prompt/chosen columns instead:
# df["text"] = df["prompt"] + ". " + df["chosen"]

# Concatenate every sample into one long corpus string for tokenisation.
text = ". ".join(df["text"])

In [None]:
# Build the Llama tokenizer from the local vocabulary file; BOS/EOS tokens are
# not added automatically.
tokenizer = LlamaTokenizer(
    vocab_file="/kaggle/working/tokenizer.model",
    add_bos_token=False,
    add_eos_token=False,
    legacy=False,
)

# Encode the whole corpus in a single call and keep only the token ids.
tokens = tokenizer(text)["input_ids"]

In [None]:
# Wrap the token ids in the project's TokenDataset, then batch them.
dataset = TokenDataset(tokens, args=data_args)
dataloader = DataLoader(
    dataset,
    batch_size=data_args.batch_size,
    shuffle=True,    # reshuffle the samples every epoch
    drop_last=True,  # discard the final incomplete batch
    num_workers=4,
)

print(f"No of items in the dataloader: {len(dataloader)}")

In [None]:
# activating the model
# Builds the Transformer from `args`, then calls its project-defined
# `activate()`; presumably that returns the ready-to-train module - TODO
# confirm in architecture.py.
model = Transformer(args).activate()

In [None]:
# (optional) move previously saved Lightning checkpoints from the cloned llama_python repo into /kaggle/working/
# !mv /kaggle/working/llama_python/python-stack-v1-functions-filtered-sc2-stage-2/version_3/checkpoints/* /kaggle/working

In [None]:
class Llama(L.LightningModule):
    """Lightning wrapper that trains the wrapped network as a causal language model.

    Args:
        model: Network called as ``model(token_ids)`` that returns one logit
            vector per input position (assumed shapes:
            (batch, seq) -> (batch, seq, vocab) - TODO confirm in architecture.py).

    Attributes:
        model: The wrapped network.
        learning_rates: Learning rate of the first optimizer param group,
            appended once per finished training batch.
    """

    def __init__(self, model):
        super().__init__()
        self.model = model
        self.learning_rates = []

    def training_step(self, batch, batch_idx):
        """Return the next-token cross-entropy loss for one batch of token ids."""
        logits = self.model(batch)
        # Next-token objective: the logits produced at position t are scored
        # against the token at position t + 1. Scoring them against the token at
        # position t (i.e. the very tensor that was fed in) would train the
        # network to copy its input instead of predicting what follows.
        # NOTE(review): assumes the Transformer does not already shift
        # internally - confirm in architecture.py.
        shifted_logits = logits[..., :-1, :]
        targets = batch[..., 1:]
        # reshape instead of view: the slices above are not contiguous.
        loss = F.cross_entropy(
            shifted_logits.reshape(-1, shifted_logits.size(-1)),
            targets.reshape(-1),
            ignore_index=-1,
        )
        self.log("Loss", loss, prog_bar=True)
        return loss

    def configure_optimizers(self):
        """Create the AdamW optimizer and a cosine LR scheduler stepped every batch."""
        optimizer = optim.AdamW(self.model.parameters(), lr=6e-4)
        # NOTE(review): with interval="step", T_max=50 means the LR reaches
        # eta_min after 50 optimizer steps and then climbs back, cycling for the
        # rest of the run. If a single decay over the whole run is intended,
        # T_max should be the total number of training steps.
        scheduler = CosineAnnealingLR(optimizer, T_max=50, eta_min=6e-6)
        return [optimizer], [{"scheduler": scheduler, "interval": "step"}]

    def lr_scheduler_step(self, scheduler, metric):
        """Advance the scheduler by one step; ``metric`` is not used."""
        scheduler.step()

    def on_train_batch_end(self, outputs, batch, batch_idx):
        """Record the current learning rate after every training batch."""
        # Log learning rate
        optimizer = self.optimizers()
        lr = optimizer.param_groups[0]['lr']
        self.learning_rates.append(lr)
    
# checkpoint path
# Stage-2 checkpoint file located inside the /kaggle/working/llama-py directory.
checkpoint_path = "/kaggle/working/llama-py/llama-py_stage2.ckpt"
    
# initialize the model
# `model` is forwarded as a keyword argument because Llama.__init__ requires it.
llama = Llama.load_from_checkpoint(checkpoint_path, model=model)

In [None]:
def count_parameters(model):
    """Return the number of trainable parameters of ``model``, in millions.

    Only parameters with ``requires_grad`` set are counted.
    """
    trainable = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable += param.numel()
    return trainable / 1e6

# count_parameters() returns the number of *trainable* parameters in millions,
# so the label states both facts instead of printing a bare number.
print(f"Total trainable parameters: {count_parameters(llama.model):.2f}M")

In [None]:
# delete all the directories and files in /kaggle/working/
# !rm -rf /kaggle/working/*

In [None]:
# Single-epoch training run on a CUDA device.
trainer = L.Trainer(accelerator="cuda", max_epochs=1)
trainer.fit(model=llama, train_dataloaders=dataloader)

In [None]:
# %reload_ext tensorboard
# %tensorboard --logdir=lightning_logs/

Experiment 

* The code was shorter than the one used in mamba, cloned the files and directly imported files from the respective scripts
* The code was more organized and easier to understand, with clear comments and function names
* used arguments for data and model
* the code in the transformer is efficient but could be made even more efficient
* used pretrained llama tokenizer
* used torch.manual_seed(42)
* arguments are given inline in the notebook; a future idea is to store them in a JSON file and load the model using that configuration
* directly combined the text without using any special separator tokens
* EOS or SOS tokens can be integrated
* samples are shuffled and the last batch is ignored
* the dataloader can be improved by using shuffle=False and drop_last=False, ignoring only the last item rather than the last batch
* activated the model and the size of the model is 254M
* using lightning module as a base class for our model
* with the help of LightningModule the training loop is easy and efficient in both memory and time; mainly, a lot of memory is saved
* only the loss is logged to the progress bar
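* used the AdamW optimizer without any change in parameter values; no schedules are used and the learning rate is 1e-4 (note: the `Llama` module in this notebook sets lr=6e-4 with a CosineAnnealingLR schedule). Maybe other techniques or optimizers can be used, such as a schedule-free optimizer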
* in future training many techniques are there. those techniques can be used
* did not find how to store the model
* started training using only 1 GPU ("P100") for 1 epoch
* training:
    * started training using muellerzr/python-stack-v1-functions-filtered-llama-3-8B dataset for 2epochs with shuffle for 2 epochs
    * started training using bigcode/python-stack-v1-functions-filtered-sc2 dataset for 2epochs with shuffle for 2 epochs
    * started training using Vezora/Tested-22k-Python-Alpaca dataset for 2epochs with shuffle for 2 epochs
    * started training using Vezora/Tested-143k-Python-Alpaca dataset for 2epochs with shuffle for 1 epochs

In [None]:
# from huggingface_hub import login
# login()

In [None]:
# # push entire folder to the hub
# from huggingface_hub import HfApi
# api = HfApi()

# api.upload_folder(
#     folder_path="/kaggle/working/lightning_logs",
#     path_in_repo="python-stack-v1-functions-filtered-sc2-stage-2",
#     repo_id="pt-sk/llama_python",
#     repo_type="model",
# )

In [None]:
# # clear ram memory
# import gc
# del llama
# gc.collect()
# torch.cuda.empty_cache()