In [1]:
!pip install peft datasets



In [2]:
import datasets
import transformers
from peft import (
    LoraConfig,
    PeftModel,
    get_peft_model,
)
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    HfArgumentParser,
    TrainingArguments,
    Trainer,
    default_data_collator
)

In [3]:
tokenizer = AutoTokenizer.from_pretrained("sarvamai/sarvam-2b-v0.5")
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)
model = AutoModelForCausalLM.from_pretrained("sarvamai/sarvam-2b-v0.5")
from datasets import load_dataset

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

FileNotFoundError: Unable to find '/content/path/to/your/data.csv'

In [9]:
import pandas as pd
df = pd.read_csv("/content/combined_output.csv")

In [5]:
tokenizer.add_tokens("[PAD]", special_tokens=True)
tokenizer.pad_token = "[PAD]"
model.resize_token_embeddings(len(tokenizer))
# Setup LoRA configuration
config = LoraConfig(
    r=64, lora_alpha=128, lora_dropout=0.0, target_modules=["lm_head", "k_proj", "q_proj", "v_proj", "o_proj", "gate_proj", "down_proj", "up_proj"]
)
model = get_peft_model(model, config)
model.print_trainable_parameters()


trainable params: 100,114,496 || all params: 2,608,953,408 || trainable%: 3.8373


In [13]:
from datasets import Dataset

def preprocess_function(example):
    # Safely handle None values by replacing them with an empty string
    inputs = example['Product Title_Name'] if example['Product Title_Name'] is not None else ""
    response = example['Generated Description_combined'] if example['Generated Description_combined'] is not None else ""

    # Combine inputs and response with the specified format
    combined = "[INST] " + inputs.strip() + " [/INST] " + response.strip()

    # Tokenize the combined text
    tokenized_inputs = tokenizer(combined, truncation=True, padding=True)
    tokenized_inputs["labels"] = tokenized_inputs["input_ids"].copy()

    return tokenized_inputs


# Apply preprocessing
dataset = Dataset.from_pandas(df)
dataset = dataset.map(preprocess_function, remove_columns=dataset.column_names)



Map:   0%|          | 0/698 [00:00<?, ? examples/s]

In [14]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="sarvam-2b-ft",
    num_train_epochs=1,
    save_total_limit=1,
    per_device_train_batch_size=1,
    warmup_steps=10,
    weight_decay=0.0001,
    bf16=True,
    logging_steps=10,
    learning_rate=1e-5,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
)

# Train the model
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context:  # type: ignore[attr-defined]


Step,Training Loss
10,1.9852
20,1.9805
30,1.8297
40,1.7021
50,1.7046
60,1.6072
70,1.4835
80,1.5648
90,1.4253
100,1.4573


  with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context:  # type: ignore[attr-defined]


TrainOutput(global_step=698, training_loss=1.2953878173172304, metrics={'train_runtime': 1715.3347, 'train_samples_per_second': 0.407, 'train_steps_per_second': 0.407, 'total_flos': 3065248556822016.0, 'train_loss': 1.2953878173172304, 'epoch': 1.0})

In [15]:
input_text = """
Product Title: Relaxed Fit Sweatshirt, Product Name: {'Sleeve Length':
'Long Sleeves', 'Neck': 'Round Neck', 'Pattern': 'Solid', 'Length': 'Regular',
'Type': 'Pullover', 'Print or Pattern Type': 'Solid', 'Occasion': 'Casual', 'Hemline': 'Ribbed'}
"""

test_input = "[INST] " + input_text + " [/INST]"
tokenized_input = tokenizer(test_input, return_tensors="pt", truncation=True)
tokenized_input = tokenized_input.to("cuda")
model.eval()

# Generate the output tokens using the model
output_tokens = model.generate(
    **tokenized_input,
    max_new_tokens=256,       # Maximum number of new tokens to generate
    do_sample=True,           # Whether to sample from the distribution
    temperature=0.01,         # Sampling temperature (lower = more deterministic)
    top_p=0.95,               # Top-p (nucleus) sampling
    top_k=50,                 # Top-k sampling
    eos_token_id=tokenizer.eos_token_id,   # End of sequence token
    pad_token_id=tokenizer.pad_token_id,   # Padding token
)

# Decode the generated tokens back to text
output = tokenizer.decode(output_tokens[0], skip_special_tokens=True)

# Print the output
print("Generated Response:", output)


Generated Response: [INST] 
Product Title: Relaxed Fit Sweatshirt, Product Name: {'Sleeve Length': 
'Long Sleeves', 'Neck': 'Round Neck', 'Pattern': 'Solid', 'Length': 'Regular', 
'Type': 'Pullover', 'Print or Pattern Type': 'Solid', 'Occasion': 'Casual', 'Hemline': 'Ribbed'}
 [/INST] यह आरामदायक फिट वाली शर्ट एक स्टाइलिश और अनौपचारिक विकल्प है। इसमें लंबी आस्तीन होती है जो नीचे की ओर लटकती हैं, गोल नेकलाइन होती है और ठोस रंग का पैटर्न होता है। इसे नियमित लंबाई में पहना जाता है और इसे पोंचो के रूप में डिज़ाइन किया गया है। इस परिधान को रोजमर्रा के उपयोग के लिए बहुत अच्छा माना जाता है क्योंकि इसकी कमरपट्टा ऊपर से नीचे तक ढीली-ढिलाई हुई होती है। आप इसे किसी भी अवसर पर पहन सकते हैं और इसके सामने की तरफ एक छोटी सी पट्टी होगी जिसे रिबिंग कहा जाएगा। </s>
