In [9]:
import os
import datasets
import logging
import torch
import tensorflow as tf
import pandas as pd
import random
import transformers
import jsonlines

from utilities import tokenize_and_split_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer

logger = logging.getLogger(__name__)

In [4]:
import torch

def check_gpu():
    if torch.cuda.is_available():
        print(f"CUDA is available. PyTorch version: {torch.__version__}")
        print(f"Number of GPUs: {torch.cuda.device_count()}")
        for i in range(torch.cuda.device_count()):
            print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
            print(f"  Memory Allocated: {torch.cuda.memory_allocated(i) / 1024 ** 2:.2f} MB")
            print(f"  Memory Cached: {torch.cuda.memory_reserved(i) / 1024 ** 2:.2f} MB")
    else:
        print("CUDA is not available. Using CPU.")

check_gpu()


CUDA is available. PyTorch version: 2.3.0+cu118
Number of GPUs: 1
GPU 0: NVIDIA GeForce RTX 3070 Ti Laptop GPU
  Memory Allocated: 0.00 MB
  Memory Cached: 0.00 MB


## Load Dataset and Model

In [10]:
dataset_path = 'kotzeje/lamini_docs.jsonl'
use_hf = True

model_name = 'EleutherAI/pythia-70m'
base_model = AutoModelForCausalLM.from_pretrained(model_name)

## Set up the training config and tokenizer

In [11]:
training_config = {
    'model' : {
        'pretrained_name' : model_name,
        'max_length' : 2048
    },
    'datasets' : {
        'use_hf' : use_hf,
        'path' : dataset_path
    },
    'verbose' : True
}

In [44]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
train_dataset, test_dataset = tokenize_and_split_dataset(training_config, tokenizer)

print(train_dataset)
print(test_dataset)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Dataset({
    features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 900
})
Dataset({
    features: ['question', 'answer', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 100
})


## Define function to carry out inference

In [35]:
def inference(text, model, tokenizer, max_input_tokens = 1000, max_output_tokens = 100):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    # Move the model to the appropriate device
    model.to(device)
    input_ids = tokenizer.encode(
        text,
        return_tensors='pt',
        truncation=True,
        max_length = max_input_tokens
    ).to(device)

    generated_tokens_with_prompt = model.generate(
        input_ids=input_ids,
        max_length = max_output_tokens
    ).to(device)

    generated_text_with_prompt = tokenizer.batch_decode(generated_tokens_with_prompt, skip_special_tokens=True)

    generated_text_answer = generated_text_with_prompt[0][len(text):]

    return generated_text_answer

## Try base model

In [14]:
test_text = test_dataset[1]['question']
print("Question input (test):", test_text)
print(f"Correct answer from Lamini docs: {test_dataset[0]['answer']}")
print("Model's answer: ")
print(inference(test_text, base_model, tokenizer))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Question input (test): Are there any alternatives to Lamini that offer similar functionality?
Correct answer from Lamini docs: Lamini provides support for any tasks that can be completed by an LLM. If you think a recommendation system can be built using a LLM, Lamini can help you train the model on your data. If user context or behavior can be contextualized into text data, we think this is possible.
Model's answer: 


A:

I think you should use the following code:
public class Lamini
{
    public string Name { get; set; }
    public string Description { get; set; }
    public string Description { get; set; }
    public string Description { get; set; }
    public string Description { get; set; }
    public string Description { get; set


## Setup training

In [26]:
max_steps = 100
trained_model_name = f'lamini_docs_{max_steps}_steps'
output_dir = 'output/saved_models/' + trained_model_name

In [27]:
training_args = TrainingArguments(

    learning_rate=1.0e-5,
    num_train_epochs=1,
    max_steps=max_steps,
    per_device_train_batch_size=1,
    output_dir=output_dir,
    
    overwrite_output_dir=False,
    disable_tqdm=False,
    eval_steps=120,
    save_steps=120,
    warmup_steps=1,
    per_device_eval_batch_size=1,
    evaluation_strategy='steps',
    logging_strategy='steps',
    logging_steps=1,
    optim='adafactor',
    gradient_accumulation_steps=4,
    gradient_checkpointing=False,

    load_best_model_at_end=True,
    save_total_limit=1,
    metric_for_best_model='eval_loss',
    greater_is_better=False
    

)

In [18]:
model_flops = (
  base_model.floating_point_ops(
    {
       "input_ids": torch.zeros(
           (1, training_config["model"]["max_length"])
      )
    }
  )
  * training_args.gradient_accumulation_steps
)

print(base_model)
print("Memory footprint", base_model.get_memory_footprint() / 1e9, "GB")
print("Flops", model_flops / 1e9, "GFLOPs")

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 512)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-5): 6 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=512, out_features=1536, bias=True)
          (dense): Linear(in_features=512, out_features=512, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=512, out_features=2048, bias=True)
          (dense_4h_to_h): Linear(in_features=2048, out_features=512, bias=True)
          (a

In [28]:
trainer = Trainer(
    model=base_model,
    # model_flops=model_flops,
    # total_steps=max_steps,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

max_steps is given, it will override any value given in num_train_epochs


## Training

In [29]:
training_output = trainer.train()

                                     
  0%|          | 0/3 [01:25<?, ?it/s]          

{'loss': 5.6333, 'grad_norm': 447.4446105957031, 'learning_rate': 1e-05, 'epoch': 0.0}


                                     
  0%|          | 0/3 [01:26<?, ?it/s]          

{'loss': 5.5331, 'grad_norm': 418.32513427734375, 'learning_rate': 9.8989898989899e-06, 'epoch': 0.01}


                                     
  0%|          | 0/3 [01:28<?, ?it/s]          

{'loss': 4.3409, 'grad_norm': 415.83807373046875, 'learning_rate': 9.797979797979798e-06, 'epoch': 0.01}


                                     
  0%|          | 0/3 [01:29<?, ?it/s]          

{'loss': 3.6913, 'grad_norm': 448.76959228515625, 'learning_rate': 9.696969696969698e-06, 'epoch': 0.02}


                                     
  0%|          | 0/3 [01:31<?, ?it/s]          

{'loss': 1.986, 'grad_norm': 355.4437561035156, 'learning_rate': 9.595959595959597e-06, 'epoch': 0.02}


                                     
  0%|          | 0/3 [01:32<?, ?it/s]          

{'loss': 1.1524, 'grad_norm': 213.14068603515625, 'learning_rate': 9.494949494949497e-06, 'epoch': 0.03}


                                     
  0%|          | 0/3 [01:34<?, ?it/s]          

{'loss': 0.6669, 'grad_norm': 85.82015228271484, 'learning_rate': 9.393939393939396e-06, 'epoch': 0.03}


                                     
  0%|          | 0/3 [01:35<?, ?it/s]          

{'loss': 0.7306, 'grad_norm': 21.011816024780273, 'learning_rate': 9.292929292929294e-06, 'epoch': 0.04}


                                     
  0%|          | 0/3 [01:38<?, ?it/s]          

{'loss': 0.5994, 'grad_norm': 19.184587478637695, 'learning_rate': 9.191919191919193e-06, 'epoch': 0.04}


                                     
  0%|          | 0/3 [01:43<?, ?it/s]           

{'loss': 0.5296, 'grad_norm': 8.00712776184082, 'learning_rate': 9.090909090909091e-06, 'epoch': 0.04}


                                     
  0%|          | 0/3 [01:49<?, ?it/s]           

{'loss': 0.3683, 'grad_norm': 14.475586891174316, 'learning_rate': 8.98989898989899e-06, 'epoch': 0.05}


                                     
  0%|          | 0/3 [01:54<?, ?it/s]           

{'loss': 0.442, 'grad_norm': 8.367050170898438, 'learning_rate': 8.888888888888888e-06, 'epoch': 0.05}


                                     
  0%|          | 0/3 [02:00<?, ?it/s]           

{'loss': 0.5381, 'grad_norm': 11.016870498657227, 'learning_rate': 8.787878787878788e-06, 'epoch': 0.06}


                                     
  0%|          | 0/3 [02:05<?, ?it/s]           

{'loss': 0.4871, 'grad_norm': 8.111491203308105, 'learning_rate': 8.686868686868687e-06, 'epoch': 0.06}


                                     
  0%|          | 0/3 [02:11<?, ?it/s]           

{'loss': 0.4431, 'grad_norm': 6.354768753051758, 'learning_rate': 8.585858585858587e-06, 'epoch': 0.07}


                                     
  0%|          | 0/3 [02:17<?, ?it/s]           

{'loss': 0.5578, 'grad_norm': 8.064961433410645, 'learning_rate': 8.484848484848486e-06, 'epoch': 0.07}


                                     
  0%|          | 0/3 [02:22<?, ?it/s]           

{'loss': 0.4783, 'grad_norm': 7.637006759643555, 'learning_rate': 8.383838383838384e-06, 'epoch': 0.08}


                                     
  0%|          | 0/3 [02:28<?, ?it/s]           

{'loss': 0.3977, 'grad_norm': 9.966814041137695, 'learning_rate': 8.282828282828283e-06, 'epoch': 0.08}


                                     
  0%|          | 0/3 [02:33<?, ?it/s]           

{'loss': 0.4028, 'grad_norm': 8.262561798095703, 'learning_rate': 8.181818181818183e-06, 'epoch': 0.08}


                                     
  0%|          | 0/3 [02:39<?, ?it/s]           

{'loss': 0.4888, 'grad_norm': 7.925356864929199, 'learning_rate': 8.08080808080808e-06, 'epoch': 0.09}


                                     
  0%|          | 0/3 [02:44<?, ?it/s]           

{'loss': 0.4299, 'grad_norm': 9.791762351989746, 'learning_rate': 7.97979797979798e-06, 'epoch': 0.09}


                                     
  0%|          | 0/3 [02:50<?, ?it/s]           

{'loss': 0.3948, 'grad_norm': 7.158545017242432, 'learning_rate': 7.87878787878788e-06, 'epoch': 0.1}


                                     
  0%|          | 0/3 [02:56<?, ?it/s]           

{'loss': 0.3745, 'grad_norm': 7.044399738311768, 'learning_rate': 7.77777777777778e-06, 'epoch': 0.1}


                                     
  0%|          | 0/3 [03:01<?, ?it/s]           

{'loss': 0.3172, 'grad_norm': 6.417994976043701, 'learning_rate': 7.676767676767677e-06, 'epoch': 0.11}


                                     
  0%|          | 0/3 [03:07<?, ?it/s]           

{'loss': 0.636, 'grad_norm': 7.829894542694092, 'learning_rate': 7.5757575757575764e-06, 'epoch': 0.11}


                                     
  0%|          | 0/3 [03:12<?, ?it/s]           

{'loss': 0.5972, 'grad_norm': 9.993778228759766, 'learning_rate': 7.474747474747476e-06, 'epoch': 0.12}


                                     
  0%|          | 0/3 [03:15<?, ?it/s]           

{'loss': 0.9449, 'grad_norm': 10.932785987854004, 'learning_rate': 7.373737373737374e-06, 'epoch': 0.12}


                                     
  0%|          | 0/3 [03:16<?, ?it/s]           

{'loss': 0.523, 'grad_norm': 9.7006254196167, 'learning_rate': 7.272727272727273e-06, 'epoch': 0.12}


                                     
  0%|          | 0/3 [03:18<?, ?it/s]           

{'loss': 0.3322, 'grad_norm': 6.952821254730225, 'learning_rate': 7.171717171717172e-06, 'epoch': 0.13}


                                     
  0%|          | 0/3 [03:19<?, ?it/s]           

{'loss': 0.5243, 'grad_norm': 8.438129425048828, 'learning_rate': 7.070707070707071e-06, 'epoch': 0.13}


                                     
  0%|          | 0/3 [03:21<?, ?it/s]           

{'loss': 0.2805, 'grad_norm': 6.473214149475098, 'learning_rate': 6.969696969696971e-06, 'epoch': 0.14}


                                     
  0%|          | 0/3 [03:22<?, ?it/s]           

{'loss': 0.2991, 'grad_norm': 7.009416103363037, 'learning_rate': 6.868686868686869e-06, 'epoch': 0.14}


                                     
  0%|          | 0/3 [03:23<?, ?it/s]           

{'loss': 0.4671, 'grad_norm': 8.192696571350098, 'learning_rate': 6.767676767676769e-06, 'epoch': 0.15}


                                     
  0%|          | 0/3 [03:25<?, ?it/s]           

{'loss': 0.2885, 'grad_norm': 6.252533435821533, 'learning_rate': 6.666666666666667e-06, 'epoch': 0.15}


                                     
  0%|          | 0/3 [03:26<?, ?it/s]           

{'loss': 0.3491, 'grad_norm': 5.176869869232178, 'learning_rate': 6.565656565656566e-06, 'epoch': 0.16}


                                     
  0%|          | 0/3 [03:28<?, ?it/s]           

{'loss': 0.4924, 'grad_norm': 7.624785423278809, 'learning_rate': 6.464646464646466e-06, 'epoch': 0.16}


                                     
  0%|          | 0/3 [03:29<?, ?it/s]           

{'loss': 0.4295, 'grad_norm': 8.443477630615234, 'learning_rate': 6.363636363636364e-06, 'epoch': 0.16}


                                     
  0%|          | 0/3 [03:31<?, ?it/s]           

{'loss': 0.5126, 'grad_norm': 9.873834609985352, 'learning_rate': 6.262626262626264e-06, 'epoch': 0.17}


                                     
  0%|          | 0/3 [03:32<?, ?it/s]           

{'loss': 0.3477, 'grad_norm': 6.248981952667236, 'learning_rate': 6.1616161616161615e-06, 'epoch': 0.17}


                                     
  0%|          | 0/3 [03:34<?, ?it/s]           

{'loss': 0.4536, 'grad_norm': 10.582649230957031, 'learning_rate': 6.060606060606061e-06, 'epoch': 0.18}


                                     
  0%|          | 0/3 [03:35<?, ?it/s]           

{'loss': 0.3507, 'grad_norm': 9.404108047485352, 'learning_rate': 5.95959595959596e-06, 'epoch': 0.18}


                                     
  0%|          | 0/3 [03:37<?, ?it/s]           

{'loss': 0.5263, 'grad_norm': 7.98105001449585, 'learning_rate': 5.858585858585859e-06, 'epoch': 0.19}


                                     
  0%|          | 0/3 [03:38<?, ?it/s]           

{'loss': 0.3817, 'grad_norm': 8.12790298461914, 'learning_rate': 5.7575757575757586e-06, 'epoch': 0.19}


                                     
  0%|          | 0/3 [03:40<?, ?it/s]           

{'loss': 0.2516, 'grad_norm': 5.732567310333252, 'learning_rate': 5.656565656565657e-06, 'epoch': 0.2}


                                     
  0%|          | 0/3 [03:41<?, ?it/s]           

{'loss': 0.5188, 'grad_norm': 7.943412780761719, 'learning_rate': 5.555555555555557e-06, 'epoch': 0.2}


                                     
  0%|          | 0/3 [03:43<?, ?it/s]           

{'loss': 0.5377, 'grad_norm': 8.472830772399902, 'learning_rate': 5.4545454545454545e-06, 'epoch': 0.2}


                                     
  0%|          | 0/3 [03:44<?, ?it/s]           

{'loss': 0.2753, 'grad_norm': 6.632678985595703, 'learning_rate': 5.353535353535354e-06, 'epoch': 0.21}


                                     
  0%|          | 0/3 [03:46<?, ?it/s]           

{'loss': 0.6753, 'grad_norm': 10.17475414276123, 'learning_rate': 5.252525252525253e-06, 'epoch': 0.21}


                                     
  0%|          | 0/3 [03:47<?, ?it/s]           

{'loss': 0.3972, 'grad_norm': 7.250304222106934, 'learning_rate': 5.151515151515152e-06, 'epoch': 0.22}


                                     
  0%|          | 0/3 [03:49<?, ?it/s]           

{'loss': 0.4683, 'grad_norm': 14.120010375976562, 'learning_rate': 5.0505050505050515e-06, 'epoch': 0.22}


                                     
  0%|          | 0/3 [03:50<?, ?it/s]           

{'loss': 0.4627, 'grad_norm': 7.7363600730896, 'learning_rate': 4.94949494949495e-06, 'epoch': 0.23}


                                     
  0%|          | 0/3 [03:52<?, ?it/s]           

{'loss': 0.3327, 'grad_norm': 8.04150104522705, 'learning_rate': 4.848484848484849e-06, 'epoch': 0.23}


                                     
  0%|          | 0/3 [03:53<?, ?it/s]           

{'loss': 0.4049, 'grad_norm': 7.201950550079346, 'learning_rate': 4.747474747474748e-06, 'epoch': 0.24}


                                     
  0%|          | 0/3 [03:55<?, ?it/s]           

{'loss': 0.3491, 'grad_norm': 8.47475814819336, 'learning_rate': 4.646464646464647e-06, 'epoch': 0.24}


                                     
  0%|          | 0/3 [03:56<?, ?it/s]           

{'loss': 0.3624, 'grad_norm': 7.533409118652344, 'learning_rate': 4.5454545454545455e-06, 'epoch': 0.24}


                                     
  0%|          | 0/3 [03:58<?, ?it/s]           

{'loss': 0.266, 'grad_norm': 5.1401824951171875, 'learning_rate': 4.444444444444444e-06, 'epoch': 0.25}


                                     
  0%|          | 0/3 [03:59<?, ?it/s]           

{'loss': 0.6183, 'grad_norm': 11.894201278686523, 'learning_rate': 4.343434343434344e-06, 'epoch': 0.25}


                                     
  0%|          | 0/3 [04:00<?, ?it/s]           

{'loss': 0.4452, 'grad_norm': 8.08464241027832, 'learning_rate': 4.242424242424243e-06, 'epoch': 0.26}


                                     
  0%|          | 0/3 [04:02<?, ?it/s]           

{'loss': 0.3358, 'grad_norm': 8.34626579284668, 'learning_rate': 4.141414141414142e-06, 'epoch': 0.26}


                                     
  0%|          | 0/3 [04:03<?, ?it/s]           

{'loss': 0.5432, 'grad_norm': 8.71150016784668, 'learning_rate': 4.04040404040404e-06, 'epoch': 0.27}


                                     
  0%|          | 0/3 [04:05<?, ?it/s]           

{'loss': 0.7212, 'grad_norm': 13.536818504333496, 'learning_rate': 3.93939393939394e-06, 'epoch': 0.27}


                                     
  0%|          | 0/3 [04:06<?, ?it/s]           

{'loss': 0.1664, 'grad_norm': 4.113100528717041, 'learning_rate': 3.8383838383838385e-06, 'epoch': 0.28}


                                     
  0%|          | 0/3 [04:08<?, ?it/s]           

{'loss': 0.5967, 'grad_norm': 16.584705352783203, 'learning_rate': 3.737373737373738e-06, 'epoch': 0.28}


                                     
  0%|          | 0/3 [04:09<?, ?it/s]           

{'loss': 0.457, 'grad_norm': 9.183928489685059, 'learning_rate': 3.6363636363636366e-06, 'epoch': 0.28}


                                     
  0%|          | 0/3 [04:11<?, ?it/s]           

{'loss': 0.2906, 'grad_norm': 9.902832984924316, 'learning_rate': 3.5353535353535356e-06, 'epoch': 0.29}


                                     
  0%|          | 0/3 [04:12<?, ?it/s]           

{'loss': 0.5401, 'grad_norm': 11.04805850982666, 'learning_rate': 3.4343434343434347e-06, 'epoch': 0.29}


                                     
  0%|          | 0/3 [04:14<?, ?it/s]           

{'loss': 0.3951, 'grad_norm': 7.74333381652832, 'learning_rate': 3.3333333333333333e-06, 'epoch': 0.3}


                                     
  0%|          | 0/3 [04:15<?, ?it/s]           

{'loss': 0.3776, 'grad_norm': 6.797224521636963, 'learning_rate': 3.232323232323233e-06, 'epoch': 0.3}


                                     
  0%|          | 0/3 [04:17<?, ?it/s]           

{'loss': 0.6553, 'grad_norm': 8.690988540649414, 'learning_rate': 3.131313131313132e-06, 'epoch': 0.31}


                                     
  0%|          | 0/3 [04:18<?, ?it/s]           

{'loss': 0.4606, 'grad_norm': 8.128663063049316, 'learning_rate': 3.0303030303030305e-06, 'epoch': 0.31}


                                     
  0%|          | 0/3 [04:20<?, ?it/s]           

{'loss': 0.3342, 'grad_norm': 7.093902587890625, 'learning_rate': 2.9292929292929295e-06, 'epoch': 0.32}


                                     
  0%|          | 0/3 [04:21<?, ?it/s]           

{'loss': 0.6717, 'grad_norm': 7.638508319854736, 'learning_rate': 2.8282828282828286e-06, 'epoch': 0.32}


                                     
  0%|          | 0/3 [04:23<?, ?it/s]           

{'loss': 0.4698, 'grad_norm': 12.776154518127441, 'learning_rate': 2.7272727272727272e-06, 'epoch': 0.32}


                                     
  0%|          | 0/3 [04:24<?, ?it/s]           

{'loss': 0.3142, 'grad_norm': 5.75796365737915, 'learning_rate': 2.6262626262626267e-06, 'epoch': 0.33}


                                     
  0%|          | 0/3 [04:26<?, ?it/s]           

{'loss': 0.4902, 'grad_norm': 7.2135515213012695, 'learning_rate': 2.5252525252525258e-06, 'epoch': 0.33}


                                     
  0%|          | 0/3 [04:27<?, ?it/s]           

{'loss': 0.4611, 'grad_norm': 7.196115493774414, 'learning_rate': 2.4242424242424244e-06, 'epoch': 0.34}


                                     
  0%|          | 0/3 [04:29<?, ?it/s]           

{'loss': 0.3603, 'grad_norm': 7.706755638122559, 'learning_rate': 2.3232323232323234e-06, 'epoch': 0.34}


                                     
  0%|          | 0/3 [04:30<?, ?it/s]           

{'loss': 0.5022, 'grad_norm': 10.110264778137207, 'learning_rate': 2.222222222222222e-06, 'epoch': 0.35}


                                     
  0%|          | 0/3 [04:32<?, ?it/s]           

{'loss': 0.4483, 'grad_norm': 7.30206823348999, 'learning_rate': 2.1212121212121216e-06, 'epoch': 0.35}


                                     
  0%|          | 0/3 [04:33<?, ?it/s]           

{'loss': 0.5058, 'grad_norm': 7.996256351470947, 'learning_rate': 2.02020202020202e-06, 'epoch': 0.36}


                                     
  0%|          | 0/3 [04:35<?, ?it/s]           

{'loss': 0.3572, 'grad_norm': 7.70075798034668, 'learning_rate': 1.9191919191919192e-06, 'epoch': 0.36}


                                     
  0%|          | 0/3 [04:36<?, ?it/s]           

{'loss': 0.476, 'grad_norm': 9.893773078918457, 'learning_rate': 1.8181818181818183e-06, 'epoch': 0.36}


                                     
  0%|          | 0/3 [04:38<?, ?it/s]           

{'loss': 0.2875, 'grad_norm': 7.874321460723877, 'learning_rate': 1.7171717171717173e-06, 'epoch': 0.37}


                                     
  0%|          | 0/3 [04:39<?, ?it/s]           

{'loss': 0.2045, 'grad_norm': 4.882674217224121, 'learning_rate': 1.6161616161616164e-06, 'epoch': 0.37}


                                     
  0%|          | 0/3 [04:40<?, ?it/s]           

{'loss': 0.5914, 'grad_norm': 8.202627182006836, 'learning_rate': 1.5151515151515152e-06, 'epoch': 0.38}


                                     
  0%|          | 0/3 [04:42<?, ?it/s]           

{'loss': 0.436, 'grad_norm': 7.414079189300537, 'learning_rate': 1.4141414141414143e-06, 'epoch': 0.38}


                                     
  0%|          | 0/3 [04:43<?, ?it/s]           

{'loss': 0.5512, 'grad_norm': 11.404534339904785, 'learning_rate': 1.3131313131313134e-06, 'epoch': 0.39}


                                     
  0%|          | 0/3 [04:45<?, ?it/s]           

{'loss': 0.2225, 'grad_norm': 6.436309337615967, 'learning_rate': 1.2121212121212122e-06, 'epoch': 0.39}


                                     
  0%|          | 0/3 [04:46<?, ?it/s]           

{'loss': 0.3634, 'grad_norm': 9.77120590209961, 'learning_rate': 1.111111111111111e-06, 'epoch': 0.4}


                                     
  0%|          | 0/3 [04:48<?, ?it/s]           

{'loss': 0.3962, 'grad_norm': 9.084990501403809, 'learning_rate': 1.01010101010101e-06, 'epoch': 0.4}


                                     
  0%|          | 0/3 [04:49<?, ?it/s]           

{'loss': 0.4179, 'grad_norm': 6.371891021728516, 'learning_rate': 9.090909090909091e-07, 'epoch': 0.4}


                                     
  0%|          | 0/3 [04:51<?, ?it/s]           

{'loss': 0.5679, 'grad_norm': 8.35484790802002, 'learning_rate': 8.080808080808082e-07, 'epoch': 0.41}


                                     
  0%|          | 0/3 [04:52<?, ?it/s]           

{'loss': 0.3844, 'grad_norm': 8.105725288391113, 'learning_rate': 7.070707070707071e-07, 'epoch': 0.41}


                                     
  0%|          | 0/3 [04:54<?, ?it/s]           

{'loss': 0.5416, 'grad_norm': 7.328046798706055, 'learning_rate': 6.060606060606061e-07, 'epoch': 0.42}


                                     
  0%|          | 0/3 [04:55<?, ?it/s]           

{'loss': 0.2897, 'grad_norm': 5.805704116821289, 'learning_rate': 5.05050505050505e-07, 'epoch': 0.42}


                                     
  0%|          | 0/3 [04:57<?, ?it/s]           

{'loss': 0.5415, 'grad_norm': 6.859163761138916, 'learning_rate': 4.040404040404041e-07, 'epoch': 0.43}


                                     
  0%|          | 0/3 [04:58<?, ?it/s]           

{'loss': 0.3408, 'grad_norm': 6.4777750968933105, 'learning_rate': 3.0303030303030305e-07, 'epoch': 0.43}


                                     
  0%|          | 0/3 [05:00<?, ?it/s]           

{'loss': 0.6967, 'grad_norm': 9.130102157592773, 'learning_rate': 2.0202020202020205e-07, 'epoch': 0.44}


                                     
  0%|          | 0/3 [05:01<?, ?it/s]           

{'loss': 0.2863, 'grad_norm': 5.3537726402282715, 'learning_rate': 1.0101010101010103e-07, 'epoch': 0.44}


                                     
                                                 
100%|██████████| 100/100 [03:39<00:00,  2.20s/it]

{'loss': 0.5241, 'grad_norm': 7.851963043212891, 'learning_rate': 0.0, 'epoch': 0.44}
{'train_runtime': 219.8134, 'train_samples_per_second': 1.82, 'train_steps_per_second': 0.455, 'train_loss': 0.6466012197732925, 'epoch': 0.44}





## Testing after trianing for 100 steps

In [37]:
test_text = test_dataset[1]['question']
print("Question input (test):", test_text)
print(f"Correct answer from Lamini docs: {test_dataset[0]['answer']}")
print("Model's answer: ")
print(inference(test_text, base_model, tokenizer))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Question input (test): Are there any alternatives to Lamini that offer similar functionality?
Correct answer from Lamini docs: Lamini provides support for any tasks that can be completed by an LLM. If you think a recommendation system can be built using a LLM, Lamini can help you train the model on your data. If user context or behavior can be contextualized into text data, we think this is possible.
Model's answer: 
Yes, there are several options available for Lamini to offer similar functionality.


In [43]:
test_text = test_dataset[4]['question']
print("Question input (test):", test_text)
print(f"Correct answer from Lamini docs: {test_dataset[4]['answer']}")
print("Model's answer: ")
print(inference(test_text, base_model, tokenizer))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Question input (test): Are there any examples of using Lamini for language translation tasks?
Correct answer from Lamini docs: Language translation is a great use case for a language model. Once you’ve exhausted the benefits of prompt tuning, you may use Lamini to fine-tune a fully multilingual language model.
Model's answer: 
Yes, Lamini is a language model for language translation tasks. Lamini is a language model for language translation tasks. It is designed to be used in a specific language model. It is designed to be used in a specific language model. It is designed to be used in a specific language model. It is designed to be used in a specific language model. It is designed to be used in a specific language model.
