# REQUIRED LIBRARIES 

In [1]:
!pip install pandas peft==0.14.0 transformers==4.47.1 trl==0.13.0 accelerate==1.2.1 scipy tensorboardX

Collecting pandas
  Downloading pandas-2.2.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
Collecting peft==0.14.0
  Downloading peft-0.14.0-py3-none-any.whl.metadata (13 kB)
Collecting transformers==4.47.1
  Downloading transformers-4.47.1-py3-none-any.whl.metadata (44 kB)
Collecting trl==0.13.0
  Downloading trl-0.13.0-py3-none-any.whl.metadata (11 kB)
Collecting accelerate==1.2.1
  Downloading accelerate-1.2.1-py3-none-any.whl.metadata (19 kB)
Collecting tensorboardX
  Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting safetensors (from peft==0.14.0)
  Downloading safetensors-0.5.3-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting huggingface-hub>=0.25.0 (from peft==0.14.0)
  Downloading huggingface_hub-0.30.1-py3-none-any.whl.metadata (13 kB)
Collecting regex!=2019.12.17 (from transformers==4.47.1)
  Downloading regex-2024.11.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

In [2]:
# Build and install the ROCm fork of bitsandbytes from source:
#   1. remove any previous checkout and clone the repository with its submodules,
#   2. switch to the rocm_enabled_multi_backend branch,
#   3. install the dev requirements, configure the HIP backend with CMake and build,
#   4. pip-install the package and print its version as a smoke test.
# NOTE: BNB_ROCM_ARCH below is "gfx90a", which is the MI200-series target
# (MI210/MI250/MI250X). An MI300X is gfx942, so change the value to match the
# architecture reported by `rocminfo` before building for different hardware.
!rm -rf bitsandbytes && \
git clone --recurse https://github.com/ROCm/bitsandbytes.git && \
cd bitsandbytes && \
git checkout rocm_enabled_multi_backend && \
pip install -r requirements-dev.txt && \
cmake -DCOMPUTE_BACKEND=hip -DBNB_ROCM_ARCH="gfx90a" -S . && \
make && \
pip install . && \
cd .. && \
python -c "import bitsandbytes as bnb; print('bitsandbytes version:', bnb.__version__)"

Cloning into 'bitsandbytes'...
remote: Enumerating objects: 8235, done.[K
remote: Counting objects: 100% (2675/2675), done.[K
remote: Compressing objects: 100% (315/315), done.[K
remote: Total 8235 (delta 2513), reused 2360 (delta 2360), pack-reused 5560 (from 2)[K
Receiving objects: 100% (8235/8235), 2.49 MiB | 8.40 MiB/s, done.
Resolving deltas: 100% (5604/5604), done.
Already on 'rocm_enabled_multi_backend'
Your branch is up to date with 'origin/rocm_enabled_multi_backend'.
Collecting pytest~=8.3.1 (from -r requirements-dev.txt (line 3))
  Downloading pytest-8.3.5-py3-none-any.whl.metadata (7.6 kB)
Collecting einops~=0.8.0 (from -r requirements-dev.txt (line 4))
  Downloading einops-0.8.1-py3-none-any.whl.metadata (13 kB)
Collecting lion-pytorch~=0.2.2 (from -r requirements-dev.txt (line 6))
  Downloading lion_pytorch-0.2.3-py3-none-any.whl.metadata (616 bytes)
Collecting scipy~=1.14.0 (from -r requirements-dev.txt (line 7))
  Downloading scipy-1.14.1-cp310-cp310-manylinux_2_17_

## Checking that bitsandbytes is installed properly (note: 0.43 is the compatible version for the MI250X)

In [3]:
# Smoke test: report the installed bitsandbytes version, or explain why the
# import failed.
try:
    import bitsandbytes as bnb
except ImportError as import_error:
    print("Error importing bitsandbytes:", import_error)
    print("bitsandbytes is not installed correctly")
else:
    print("bitsandbytes version:", bnb.__version__)
    print("Installation successful!")

g++ (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0
Copyright (C) 2021 Free Software Foundation, Inc.
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.



  from .autonotebook import tqdm as notebook_tqdm


bitsandbytes version: 0.43.3.dev
Installation successful!


# Huggingface api auth

In [None]:

# Step 3: Hugging Face Login
# Authenticate with Hugging Face to access Llama-3.1 model
import os
from getpass import getpass

from huggingface_hub import login, HfApi

# Never paste the token into the notebook source: a saved or shared notebook
# would leak it. Read it from the HF_TOKEN environment variable, or fall back
# to an interactive prompt that does not echo the input.
hf_token = (os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")).strip()
login(token=hf_token, add_to_git_credential=False)

# Validate the token by asking the Hub who we are logged in as.
# Best-effort: a failure is reported here, and the model download later on
# will fail if the credentials are really unusable.
try:
    api = HfApi()
    user_info = api.whoami()
    print(f"Token validated successfully! Logged in as: {user_info['name']}")
except Exception as e:
    print(f"Token validation failed. Error: {e}")

Token validated successfully! Logged in as: rohithreddyv1


# Step 4: Set and Verify GPU Availability

In [5]:

import os
import torch

# GPU indices this notebook may use - adjust based on your hardware
gpus = [0]  # Use [0, 1, 2, 3] for MI300x or [0] for W7900
# setdefault keeps any CUDA_VISIBLE_DEVICES value that is already present in
# the environment; the list above is only applied when the variable is unset.
os.environ.setdefault(
    "CUDA_VISIBLE_DEVICES",
    ",".join(str(gpu_index) for gpu_index in gpus),
)

# Report how many devices PyTorch can see
print(f"PyTorch detected number of available devices: {torch.cuda.device_count()}")

PyTorch detected number of available devices: 1


# Import Libraries

In [6]:
import os
import torch
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline
)
from peft import LoraConfig, get_peft_model, PeftModel
from trl import SFTTrainer

This cell imports all the required libraries:

Standard libraries: os
Machine learning: torch (PyTorch)
Data handling: pandas
Hugging Face libraries:

datasets for working with training datasets
transformers for accessing models, tokenizers, and training utilities
peft for Parameter-Efficient Fine-Tuning
trl for Transformer Reinforcement Learning

# GPU Setup

In [7]:
# Re-state the GPU selection and report what PyTorch sees.
# This repeats the "Step 4" GPU cell earlier in the notebook. Because
# setdefault never overwrites an existing value, editing `gpus` here has no
# effect once CUDA_VISIBLE_DEVICES has already been set.
gpus = [0]  # For single GPU; use [0, 1, 2, 3] for MI300X with multiple GPUs
os.environ.setdefault(
    "CUDA_VISIBLE_DEVICES",
    ",".join(str(gpu_index) for gpu_index in gpus),
)
print(f"PyTorch detected number of available devices: {torch.cuda.device_count()}")


PyTorch detected number of available devices: 1


This cell:

Sets up which GPU(s) to use for training
Verifies GPU availability through PyTorch
Allows configuration for multi-GPU setups (like MI250X)

# Model Configuration

In [8]:
# Hub id of the checkpoint to fine-tune, and the name given to the result
base_model_name = "meta-llama/Llama-3.1-8B"
new_model_name = "rohith-llama-3.1-8B-finetuned"

# Quantization settings handed to from_pretrained when the model is loaded:
# 4-bit NF4 storage with double quantization, float16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
)

This cell:

Specifies the base model (LLaMA 3.1 8B) to fine-tune
Names the fine-tuned model
Sets up 4-bit quantization to reduce GPU memory usage

Uses NF4 (normalized float 4) quantization
Dequantizes the 4-bit weights to float16 for computation (faster and lighter than the float32 default)
Enables double quantization for additional memory savings

# Load Tokenizer and Model

In [9]:
# --- Tokenizer ---
print("Loading tokenizer...")
llama_tokenizer = AutoTokenizer.from_pretrained(
    base_model_name,
    use_fast=True,
    trust_remote_code=True,
)
# Pad on the right and reuse the end-of-sequence token as the padding token
llama_tokenizer.padding_side = "right"
llama_tokenizer.pad_token = llama_tokenizer.eos_token
print("Tokenizer loaded successfully!")

# --- Model (quantized according to bnb_config) ---
print("Loading model... (this may take a few minutes)")
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)

# Training-time config tweaks: single tensor-parallel slice, KV cache off
base_model.config.pretraining_tp = 1
base_model.config.use_cache = False
print(f"Model loaded successfully on {base_model.device}!")

Loading tokenizer...
Tokenizer loaded successfully!
Loading model... (this may take a few minutes)


Downloading shards: 100%|██████████| 4/4 [03:41<00:00, 55.41s/it]
Loading checkpoint shards: 100%|██████████| 4/4 [00:30<00:00,  7.62s/it]


Model loaded successfully on cuda:0!


This cell:

Loads the LLaMA tokenizer from Hugging Face

Sets the padding token to be the same as the end-of-sequence token
Configures right padding (important for causal language models)
Uses fast tokenizers for better performance


Loads the LLaMA 3.1 model with 4-bit quantization

Uses device_map="auto" for optimal GPU placement
Disables KV caching to save memory during training
Sets tensor parallelism to 1 (using single-tensor operations)
Confirms the device(s) the model is loaded on

# Dataset Preparation

In [10]:
# Prepare the dataset
print("Preparing dataset...")

import json

# Keys every training record must provide.
REQUIRED_FIELDS = ("instruction", "input", "output")


def _extract_records(text):
    """Return the training records embedded in ``text``.

    The text is scanned for ``{`` characters and a real JSON decoder is asked
    to parse an object starting at each one. Unlike a ``{.*?}`` regex, this
    keeps records intact when their strings contain ``}`` or nested braces.

    Args:
        text: Raw file contents holding JSON objects, possibly surrounded by
            other text (array brackets, commas, commentary).

    Returns:
        A list of dicts, each containing every key in ``REQUIRED_FIELDS``.
        Fragments that do not parse, and objects missing a required key, are
        reported with ``print`` and skipped.
    """
    decoder = json.JSONDecoder()
    records = []
    cursor = 0
    while True:
        start = text.find("{", cursor)
        if start == -1:
            break
        try:
            candidate, end = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            print(f"Error parsing JSON: {text[start:start + 80]}")
            cursor = start + 1
            continue
        missing = [key for key in REQUIRED_FIELDS if key not in candidate]
        if missing:
            # Not a training record (e.g. a wrapper object). Step just past the
            # opening brace so records nested inside it are still discovered.
            print(f"Skipping JSON object at offset {start}: missing {missing}")
            cursor = start + 1
            continue
        records.append(candidate)
        cursor = end  # resume after the whole object, ignoring braces inside it
    return records


# Load the dataset from the text file (explicit encoding so the result does
# not depend on the machine's locale).
with open('rohith.txt', 'r', encoding='utf-8') as f:
    data = f.read()

# Format each record for instruction fine-tuning
dataset_records = [
    {"text": f"<s>[INST] {record['instruction']} {record['input']} [/INST] {record['output']}</s>"}
    for record in _extract_records(data)
]

# Fail here with a clear message instead of letting the trainer fail later on
# an empty dataset.
if not dataset_records:
    raise ValueError("No training examples could be parsed from rohith.txt")

# Create a Hugging Face dataset
training_data = Dataset.from_pandas(pd.DataFrame(dataset_records))
print(f"Dataset prepared with {len(training_data)} examples")


Preparing dataset...
Dataset prepared with 137 examples


This cell:

Reads the custom training data from 'rohith.txt'
Uses regex to extract JSON objects from the text file
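Parses each JSON entry and formats it into a Llama-2-style `[INST]` instruction format (note: `<s>` and `</s>` are Llama-2 markers; the Llama 3.1 tokenizer treats them as ordinary text rather than special tokens):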

<s>[INST] instruction input [/INST] output</s>


Creates a Hugging Face Dataset object from the formatted examples
Reports the number of training examples extracted

# LoRA and Training Configuration

In [11]:
# Configure LoRA parameters
print("Configuring LoRA parameters...")
peft_parameters = LoraConfig(
    lora_alpha=8,           # Scaling parameter
    lora_dropout=0.1,       # Dropout probability for LoRA layers
    r=32,                   # Rank of the low-rank matrices
    bias="none",            # Whether to train bias parameters
    task_type="CAUSAL_LM"   # The type of task
)

# PEFT's documented recipe for training a k-bit quantized model is to call
# prepare_model_for_kbit_training before attaching the adapters.
# Gradient checkpointing is left off so the compute/memory trade-off matches
# the TrainingArguments below, which do not enable it.
from peft import prepare_model_for_kbit_training

base_model = prepare_model_for_kbit_training(
    base_model, use_gradient_checkpointing=False
)

# Apply LoRA configuration to the model
print("Applying LoRA to the model...")
model = get_peft_model(base_model, peft_parameters)
model.print_trainable_parameters()

# Configure training arguments
print("Setting up training arguments...")
training_args = TrainingArguments(
    output_dir="./results_rohith_lora",   # Directory for checkpoints
    num_train_epochs=50,                  # Number of training epochs
    per_device_train_batch_size=1,        # Batch size per device during training
    gradient_accumulation_steps=4,        # Batches accumulated before each optimizer update
    optim="paged_adamw_32bit",            # Paged 32-bit AdamW optimizer
    learning_rate=4e-5,                   # Initial learning rate
    weight_decay=0.001,                   # Weight decay to apply
    logging_steps=1,                      # Log every X updates steps
    save_strategy="epoch",                # Save a checkpoint at the end of every epoch
    max_grad_norm=0.3,                    # Gradient clipping threshold
    fp16=True,                            # Enable mixed precision training with FP16
    logging_dir="./logs",                 # Directory for storing logs
    warmup_ratio=0.03,                    # Ratio of total training steps used for warmup
    lr_scheduler_type="cosine",           # Learning rate scheduler type
    report_to="tensorboard"               # Report metrics to tensorboard
)

Configuring LoRA parameters...
Applying LoRA to the model...
trainable params: 13,631,488 || all params: 8,043,892,736 || trainable%: 0.1695
Setting up training arguments...


This cell:

Configures LoRA (Low-Rank Adaptation) parameters:

lora_alpha=8: Scaling parameter for LoRA updates
lora_dropout=0.1: Dropout rate to prevent overfitting
r=32: Rank for low-rank matrices (higher = more capacity)
bias="none": No bias parameter training
task_type="CAUSAL_LM": For text generation tasks


Applies LoRA to the base model
Prints statistics about trainable parameters (showing the efficiency of LoRA)
Sets up training configuration via TrainingArguments:

Output directories for model checkpoints and logs
Training for 50 epochs
Small batch size (1) with gradient accumulation (4) for effective batch size of 4
Uses paged 32-bit AdamW optimizer
Learning rate of 4e-5 with cosine scheduler and 3% warmup
Weight decay of 0.001 for regularization
Gradient clipping at 0.3 to prevent exploding gradients
Enabled mixed precision (FP16) for efficiency
TensorBoard integration for training monitoring

# Training and Saving the Model

In [None]:
# Initialize SFT Trainer
print("Initializing SFT Trainer...")
trainer = SFTTrainer(
    model=model,
    train_dataset=training_data,
    peft_config=peft_parameters,
    args=training_args,
    # Hand over the tokenizer configured when the model was loaded (pad token,
    # right padding) so the trainer does not build its own from the Hub.
    # `processing_class` is the tokenizer argument name in the pinned trl==0.13.0.
    processing_class=llama_tokenizer,
)

# Summarise the key hyper-parameters before the (long) training run starts.
print("Training configuration:")
print(f"Number of epochs: {training_args.num_train_epochs}")
print(f"Learning rate: {training_args.learning_rate}")
print(f"Batch size: {training_args.per_device_train_batch_size}")
print(f"Gradient accumulation steps: {training_args.gradient_accumulation_steps}")
# Per-device figure: batch size times accumulation steps.
print(f"Effective batch size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}")
print(f"FP16 enabled: {training_args.fp16}")
print("Trainer ready to start training!")

# Start training
print("Starting training...")
trainer.train()

# Save the fine-tuned model and the tokenizer side by side so the directory
# can be loaded on its own later.
output_dir = "./rohith_llama3_finetuned"
print(f"Saving model to {output_dir}...")
trainer.model.save_pretrained(output_dir)
llama_tokenizer.save_pretrained(output_dir)
print("Model saved successfully!")

Initializing SFT Trainer...


Map: 100%|██████████| 137/137 [00:00<00:00, 8862.53 examples/s]


Training configuration:
Number of epochs: 50
Learning rate: 4e-05
Batch size: 1
Gradient accumulation steps: 4
Effective batch size: 4
FP16 enabled: True
Trainer ready to start training!
Starting training...


Step,Training Loss
1,3.5691
2,3.8364
3,3.4682
4,3.023
5,3.5743
6,3.3938
7,3.2915
8,3.1615
9,3.3593
10,3.5018


This cell:

Initializes the Supervised Fine-Tuning (SFT) Trainer
Connects the model, dataset, and training arguments
Prints a summary of key training parameters
Executes the training process for the specified number of epochs (50)
Creates an output directory for the final model
Saves the fine-tuned model with its LoRA adapters
Saves the tokenizer in the same directory

# Testing the Model

In [None]:
# Import necessary libraries
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from peft import PeftModel

# Define model names
base_model_name = "meta-llama/Llama-3.1-8B"
output_dir = "./rohith_llama3_finetuned"  # Directory where the fine-tuned adapters were saved

# Load the original (unquantized) base model so the adapters can be merged in.
# Half precision is requested explicitly: without torch_dtype the weights are
# loaded in float32, which doubles the memory needed for inference.
print("Loading base model for merging...")
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    device_map="auto",
    torch_dtype=torch.float16,
    trust_remote_code=True
)

# Load the tokenizer
print("Loading tokenizer...")
llama_tokenizer = AutoTokenizer.from_pretrained(
    base_model_name,
    trust_remote_code=True
)
llama_tokenizer.pad_token = llama_tokenizer.eos_token
llama_tokenizer.padding_side = "right"

# Load and merge the LoRA weights with the base model
print("Loading and merging LoRA weights...")
peft_model = PeftModel.from_pretrained(base_model, output_dir)
merged_model = peft_model.merge_and_unload()
merged_model.config.use_cache = True  # Enable caching for inference

# Create a text generation pipeline
print("Creating text generation pipeline...")
text_pipeline = pipeline(
    "text-generation",
    model=merged_model,
    tokenizer=llama_tokenizer,
    max_length=1024,
    # temperature/top_p only take effect when sampling is on; request it
    # explicitly instead of relying on the checkpoint's generation defaults.
    do_sample=True,
    temperature=0.1,
    top_p=0.3,
    repetition_penalty=1.2,
    device_map="auto"
)

# Test with sample queries
test_queries = [
    "Detail Rohith's GPU-accelerated document processing pipeline",
    "Explain Rohith's technical documentation work at Radian",
    "What programming languages does Rohith know",
    "What is Rohith's educational background?",
    "What certifications does Rohith have"
]

# Generate and print responses, using the same prompt template as training
for query in test_queries:
    print(f"\n\n===== Query: {query} =====")
    test_prompt = f"<s>[INST] {query} [/INST]"
    output = text_pipeline(test_prompt)
    print("Model response:")
    print(output[0]['generated_text'])


In [None]:
This cell:

Loads a fresh copy of the base model for inference
Loads the LoRA adapters and merges them into the base model
Enables KV caching for faster inference
Creates a text generation pipeline with settings:

Maximum length of 1024 tokens
Temperature of 0.1
Top-p of 0.3 (nucleus sampling)
Repetition penalty of 1.2 to reduce repeating text


Tests the model with some queries
Prints the model's response
Confirms the successful completion of the fine-tuning process