In [1]:
import pandas as pd
import json

def generate_recommendation_dataset(ratings_file, movies_file, output_file):
    # 1. Load Data
    # MovieLens-1M uses '::' as a separator
    movies = pd.read_csv(movies_file, sep='::', engine='python', encoding='latin-1',
                         names=['MovieID', 'Title', 'Genres'])
    ratings = pd.read_csv(ratings_file, sep='::', engine='python',
                          names=['UserID', 'MovieID', 'Rating', 'Timestamp'])

    # 2. Merge and Sort
    df = pd.merge(ratings, movies, on='MovieID')
    df = df.sort_values(['UserID', 'Timestamp'])

    # 3. Create Narrative Pairs
    instruction_data = []
    
    # Group by user to see their "watch history"
    for user_id, group in df.groupby('UserID'):
        # Let's take the last 5 movies as history and the 6th as the target
        # This is a simplified "sliding window" approach
        movie_list = group['Title'].tolist()
        genre_list = group['Genres'].tolist()

        if len(movie_list) < 6:
            continue

        # Create history (Input)
        history = movie_list[:-1][-5:]  # Last 5 movies before the target
        target_movie = movie_list[-1]    # The movie they actually watched next
        target_genres = genre_list[-1]

        prompt = {
            "instruction": "Acting as a personalized movie recommender, analyze the user's watch history and suggest the most logical next movie. Provide a brief reasoning based on genres.",
            "input": f"User history: {', '.join(history)}. Based on these preferences, what should they watch next?",
            "output": f"The user should watch {target_movie}. Reasoning: This movie aligns with their interest in {target_genres} seen in their previous history."
        }
        
        instruction_data.append(prompt)

    # 4. Save to JSONL
    with open(output_file, 'w') as f:
        for entry in instruction_data:
            f.write(json.dumps(entry) + '\n')

    print(f"Success! Generated {len(instruction_data)} training pairs.")

# To run:
# generate_recommendation_dataset('ratings.dat', 'movies.dat', 'movielens_train.jsonl')

In [2]:
# To run:
generate_recommendation_dataset('ratings.dat', 'movies.dat', 'movielens_train.jsonl')

Success! Generated 6040 training pairs.


In [5]:
def generate_contrastive_dataset(ratings_file, movies_file, output_file):
    # 1. Load Data
    movies = pd.read_csv(movies_file, sep='::', engine='python', encoding='latin-1',
                         names=['MovieID', 'Title', 'Genres'])
    ratings = pd.read_csv(ratings_file, sep='::', engine='python',
                          names=['UserID', 'MovieID', 'Rating', 'Timestamp'])

    # 2. Merge and Sort
    df = pd.merge(ratings, movies, on='MovieID')
    df = df.sort_values(['UserID', 'Timestamp'])

    instruction_data = []
    
    # 3. Process each user
    print("Generating contrastive pairs...")
    for user_id, group in df.groupby('UserID'):
        movie_list = group['Title'].tolist()
        genre_list = group['Genres'].tolist()
        movie_id_list = group['MovieID'].tolist()

        if len(movie_list) < 6:
            continue

        # Positive Target (The movie they actually watched)
        history = movie_list[:-1][-5:]
        target_movie = movie_list[-1]
        target_genres = genre_list[-1]

        # Negative Target (A movie they DID NOT watch)
        watched_ids = set(movie_id_list)
        negative_movie_row = movies[~movies['MovieID'].isin(watched_ids)].sample(1).iloc[0]
        neg_movie = negative_movie_row['Title']
        neg_genres = negative_movie_row['Genres']

        # Format as a "Contrastive Reasoning" prompt
        prompt = {
            "instruction": "Analyze the user's history to identify their preference. Compare two potential movies and explain which one is the better recommendation.",
            "input": f"History: {', '.join(history)}. Option A: {target_movie} ({target_genres}). Option B: {neg_movie} ({neg_genres}).",
            "output": f"The better recommendation is Option A. Reasoning: The user has shown a strong preference for themes found in {target_genres}. Option B ({neg_genres}) does not align with their recent viewing patterns."
        }
        instruction_data.append(prompt)

    # 4. Save to JSONL
    with open(output_file, 'w') as f:
        for entry in instruction_data:
            f.write(json.dumps(entry) + '\n')

    print(f"Success! Generated {len(instruction_data)} contrastive training pairs in {output_file}")

In [6]:
generate_contrastive_dataset('ratings.dat', 'movies.dat', 'contrastive_rec_train.jsonl')

Generating contrastive pairs...
Success! Generated 6040 contrastive training pairs in contrastive_rec_train.jsonl


In [7]:
# Install Unsloth and other necessary libraries
!pip install --no-deps "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git" --quiet
!pip install --no-deps "xformers<0.0.29" "trl<0.9.0" peft accelerate bitsandbytes --quiet

  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31mÃ—[0m [32mGetting requirements to build wheel[0m did not run successfully.
  [31mâ”‚[0m exit code: [1;36m1[0m
  [31mâ•°â”€>[0m [31m[23 lines of output][0m
  [31m   [0m Traceback (most recent call last):
  [31m   [0m   File [35m"/home/subash/Quantization/.quanti/lib/python3.13/site-packages/pip/_vendor/pyproject_hooks/_in_process/_in_process.py"[0m, line [35m389[0m, in [35m<module>[0m
  [31m   [0m     [31mmain[0m[1;31m()[0m
  [31m   [0m     [31m~~~~[0m[1;31m^^[0m
  [31m   [0m   File [35m"/home/subash/Quantization/.quanti/lib/python3.13/site-packages/pip/_vendor/pyproject_hooks/_in_process/_in_process.py"[0m, line [35m373[0m, in [35mmain[0m
  [31m   [0m     json_out["return_val"] = [31mhook[0m[1;31m(**hook_input["kwargs"])[0m
  [31m   [0m                              [31m~~~~[0m[1;31m^^^^^^^^^^^^^^^^^^^^^^^^[0m
  [31m   [0m   File [35m"/home/subash/Quantization/.quan

In [8]:
from unsloth import FastLanguageModel
import torch
from datasets import load_dataset
from trl import SFTTrainer
from transformers import TrainingArguments

# 1. Configuration
max_seq_length = 2048 # Supports RoPE Scaling internally
dtype = None # None for auto detection. Float16 for Tesla T4, Bfloat16 for Ampere+
load_in_4bit = True # Use 4-bit quantization to save memory

# 2. Load Model and Tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-3B-Instruct",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

# 3. Add LoRA Adapters (This is the "QLoRA" part)
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Rank: higher = more parameters to train, 16 is a good sweet spot
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Optimized to 0 for Unsloth
    bias = "none",    # Optimized to "none" for Unsloth
    use_gradient_checkpointing = "unsloth", # Saves VRAM
    random_state = 3407,
)

# 4. Format Dataset for SFT (Supervised Fine-Tuning)
def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    inputs       = examples["input"]
    outputs      = examples["output"]
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        # Format matching the Llama-3 Instruct style
        text = f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{instruction}<|eot_id|>" \
               f"<|start_header_id|>user<|end_header_id|>\n\n{input}<|eot_id|>" \
               f"<|start_header_id|>assistant<|end_header_id|>\n\n{output}<|eot_id|>"
        texts.append(text)
    return { "text" : texts, }

dataset = load_dataset("json", data_files="contrastive_rec_train.jsonl", split="train")
dataset = dataset.map(formatting_prompts_func, batched = True)

# 5. Set Training Arguments
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can speed up training for short sequences
    args = TrainingArguments(
        per_device_train_batch_size = 4,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 100, # Start with 100 steps to test; set to -1 and use num_train_epochs for full run
        learning_rate = 2e-4,
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)

# 6. Train!
trainer_stats = trainer.train()

ðŸ¦¥ Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


ðŸ¦¥ Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2026.1.4: Fast Llama patching. Transformers: 4.57.3.
   \\   /|    NVIDIA GeForce RTX 4070. Num GPUs = 2. Max memory: 11.595 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu130. CUDA: 8.9. CUDA Toolkit: 13.0. Triton: 3.5.1
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2026.1.4 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.
Generating train split: 6040 examples [00:00, 332732.62 examples/s]
Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 6040/6040 [00:00<00:00, 367047.18 examples/s]
Unsloth: Tokenizing ["text"] (num_proc=32): 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 6040/6040 [00:03<00:00, 1543.34 examples/s]
The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 6,040 | Num Epochs = 1 | Total steps = 100
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 4 x 1) = 32
 "-____-"     Trainable parameters = 24,313,856 of 3,237,063,680 (0.75% trained)
wandb: [wandb.login()] Loaded credentials for https://api.wandb.ai from /home/subash/.netrc.
wandb: Currently logged in as: subash-sharma (subash-sharma-islington-college) 

wandb: Detected [huggingface_hub.inference] in use.
wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/


AttributeError: 'int' object has no attribute 'mean'

In [10]:
from datasets import load_dataset
from unsloth import FastLanguageModel, is_bfloat16_supported
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
import torch

# 1. Load your JSONL dataset
dataset = load_dataset('json', data_files='contrastive_rec_train.jsonl', split='train')

# 2. Create a formatting function to combine instruction, input, and output
def formatting_func(examples):
    texts = []
    for instruction, input_text, output in zip(examples['instruction'], examples['input'], examples['output']):
        # Format according to your model's prompt template
        text = f"""### Instruction:
{instruction}

### Input:
{input_text}

### Response:
{output}"""
        texts.append(text)
    return {"text": texts}

# 3. Apply formatting to dataset
dataset = dataset.map(formatting_func, batched=True, remove_columns=dataset.column_names)

# 4. Setup trainer with correct parameters
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",  # This is the key!
    max_seq_length = max_seq_length,
    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer),
    dataset_num_proc = 2,
    packing = False,  # Set to False to avoid issues
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 60,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)

# 5. Train
trainer_stats = trainer.train()

Unsloth: Tokenizing ["text"] (num_proc=32): 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 6040/6040 [00:20<00:00, 290.64 examples/s] 
The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 6,040 | Num Epochs = 1 | Total steps = 60
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 4 x 1) = 16
 "-____-"     Trainable parameters = 24,313,856 of 3,237,063,680 (0.75% trained)


AttributeError: 'int' object has no attribute 'mean'

In [11]:
from datasets import load_dataset
from unsloth import FastLanguageModel, is_bfloat16_supported
from trl import SFTTrainer
from transformers import TrainingArguments
import torch

# 1. Load your model and tokenizer first (if not already loaded)
max_seq_length = 2048
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-bnb-4bit",  # or your model
    max_seq_length = max_seq_length,
    dtype = None,
    load_in_4bit = True,
)

# 2. Add LoRA adapters
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
)

# 3. Load your JSONL dataset
dataset = load_dataset('json', data_files='contrastive_rec_train.jsonl', split='train')

# 4. IMPORTANT: Format with EOS token
def formatting_func(examples):
    texts = []
    EOS_TOKEN = tokenizer.eos_token
    for instruction, input_text, output in zip(examples['instruction'], examples['input'], examples['output']):
        text = f"""### Instruction:
{instruction}

### Input:
{input_text}

### Response:
{output}{EOS_TOKEN}"""
        texts.append(text)
    return {"text": texts}

# 5. Apply formatting
dataset = dataset.map(formatting_func, batched=True, remove_columns=dataset.column_names)

# 6. Print a sample to verify
print("Sample formatted text:")
print(dataset[0]['text'])
print("\n" + "="*50 + "\n")

# 7. Setup trainer - REMOVE data_collator and dataset_num_proc
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    packing = False,  # CRITICAL: Must be False
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        num_train_epochs = 1,  # Use epochs instead of max_steps
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none",  # Disable wandb if not needed
    ),
)

# 8. Train
trainer_stats = trainer.train()

==((====))==  Unsloth 2026.1.4: Fast Llama patching. Transformers: 4.57.3.
   \\   /|    NVIDIA GeForce RTX 4070. Num GPUs = 2. Max memory: 11.595 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu130. CUDA: 8.9. CUDA Toolkit: 13.0. Triton: 3.5.1
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2026.1.4 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.
Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 6040/6040 [00:00<00:00, 77788.44 examples/s]


Sample formatted text:
### Instruction:
Analyze the user's history to identify their preference. Compare two potential movies and explain which one is the better recommendation.

### Input:
History: Bug's Life, A (1998), Antz (1998), Hunchback of Notre Dame, The (1996), Hercules (1997), Mulan (1998). Option A: Pocahontas (1995) (Animation|Children's|Musical|Romance). Option B: Turbo: A Power Rangers Movie (1997) (Action|Adventure|Children's).

### Response:
The better recommendation is Option A. Reasoning: The user has shown a strong preference for themes found in Animation|Children's|Musical|Romance. Option B (Action|Adventure|Children's) does not align with their recent viewing patterns.<|end_of_text|>




Unsloth: Tokenizing ["text"] (num_proc=32): 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 6040/6040 [01:07<00:00, 89.48 examples/s] 
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 2
   \\   /|    Num examples = 6,040 | Num Epochs = 1 | Total steps = 755
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 41,943,040 of 8,072,204,288 (0.52% trained)


OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacity of 11.60 GiB of which 11.75 MiB is free. Process 3820278 has 4.82 GiB memory in use. Including non-PyTorch memory, this process has 6.73 GiB memory in use. Of the allocated memory 6.51 GiB is allocated by PyTorch, and 10.87 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [15]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import torch
print(f"Current device: {torch.cuda.current_device()}")
print(f"Device name: {torch.cuda.get_device_name(0)}")
print(f"Number of GPUs: {torch.cuda.device_count()}")

Current device: 0
Device name: NVIDIA GeForce RTX 4070
Number of GPUs: 2


In [16]:
import os
print(f"CUDA_VISIBLE_DEVICES: {os.environ.get('CUDA_VISIBLE_DEVICES', 'Not set')}")

os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import torch
print(f"CUDA_VISIBLE_DEVICES after setting: {os.environ.get('CUDA_VISIBLE_DEVICES')}")
print(f"Number of GPUs visible to PyTorch: {torch.cuda.device_count()}")
print(f"Current device: {torch.cuda.current_device()}")
print(f"Device name: {torch.cuda.get_device_name(0)}")

CUDA_VISIBLE_DEVICES: 0
CUDA_VISIBLE_DEVICES after setting: 0
Number of GPUs visible to PyTorch: 2
Current device: 0
Device name: NVIDIA GeForce RTX 4070


In [18]:
import os
print(f"CUDA_VISIBLE_DEVICES: {os.environ.get('CUDA_VISIBLE_DEVICES')}")

import torch
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA version: {torch.version.cuda}")
print(f"Number of GPUs: {torch.cuda.device_count()}")

# Check each GPU
for i in range(torch.cuda.device_count()):
    print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
    print(f"  Memory: {torch.cuda.get_device_properties(i).total_memory / 1e9:.2f} GB")

CUDA_VISIBLE_DEVICES: 0
PyTorch version: 2.9.1+cu130
CUDA available: True
CUDA version: 13.0
Number of GPUs: 2
GPU 0: NVIDIA GeForce RTX 4070
  Memory: 12.45 GB
GPU 1: NVIDIA GeForce RTX 3090
  Memory: 25.27 GB


In [19]:
import os
import sys

# Check all environment variables related to CUDA
for key, value in os.environ.items():
    if 'CUDA' in key or 'GPU' in key:
        print(f"{key}: {value}")

PYTORCH_CUDA_ALLOC_CONF: expandable_segments:True,roundup_power2_divisions:[32:256,64:128,256:64,>:32]
CUDA_VISIBLE_DEVICES: 0


In [20]:
import torch
print(f"PyTorch compiled with CUDA: {torch.version.cuda}")
print(f"CUDA runtime version: {torch.cuda.get_device_capability(0)}")

# Check if you're using a special build
print(f"PyTorch build: {torch.__version__}")

PyTorch compiled with CUDA: 13.0
CUDA runtime version: (8, 9)
PyTorch build: 2.9.1+cu130


In [21]:
import os
print(f"CUDA_VISIBLE_DEVICES: {os.environ.get('CUDA_VISIBLE_DEVICES', 'Not set')}")

import torch
print(f"Number of GPUs: {torch.cuda.device_count()}")
for i in range(torch.cuda.device_count()):
    print(f"GPU {i}: {torch.cuda.get_device_name(i)}")

CUDA_VISIBLE_DEVICES: 0
Number of GPUs: 2
GPU 0: NVIDIA GeForce RTX 4070
GPU 1: NVIDIA GeForce RTX 3090


In [22]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"  # Make only 3090 visible

import torch
print(f"Number of GPUs: {torch.cuda.device_count()}")
print(f"Device name: {torch.cuda.get_device_name(0)}")  # Should show 3090 as device 0

Number of GPUs: 2
Device name: NVIDIA GeForce RTX 4070


In [24]:
import torch

# Set up device
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")

# Move model to device
model = model.to(device)

# Create or load your data FIRST, then move it
data = torch.randn(100, 10)  # Example: create some data
data = data.to(device)  # Now this will work

print(f"Using device: {torch.cuda.get_device_name(1)}")

Using device: NVIDIA GeForce RTX 3090


In [None]:
# FIRST CELL - Run this first
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"  # Use RTX 3090 (was GPU index 1, now becomes 0)

# SECOND CELL - Your actual training code
from unsloth import FastLanguageModel, is_bfloat16_supported
from datasets import load_dataset
from trl import SFTTrainer
from transformers import TrainingArguments
import torch

# Verify GPU
print(f"Using GPU: {torch.cuda.get_device_name(0)}")
print(f"Available memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")

max_seq_length = 2048

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = None,
    load_in_4bit = True,
    # device_map will automatically use the visible GPU
)

model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
)

# Load and format dataset
dataset = load_dataset('json', data_files='your_file.jsonl', split='train')

def formatting_func(examples):
    texts = []
    EOS_TOKEN = tokenizer.eos_token
    for instruction, input_text, output in zip(examples['instruction'], examples['input'], examples['output']):
        text = f"""### Instruction:
{instruction}

### Input:
{input_text}

### Response:
{output}{EOS_TOKEN}"""
        texts.append(text)
    return {"text": texts}

dataset = dataset.map(formatting_func, batched=True, remove_columns=dataset.column_names)

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    packing = False,
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        num_train_epochs = 1,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none",
    ),
)

trainer_stats = trainer.train()

Using GPU: NVIDIA GeForce RTX 4070
Available memory: 11.60 GB
==((====))==  Unsloth 2026.1.4: Fast Llama patching. Transformers: 4.57.3.
   \\   /|    NVIDIA GeForce RTX 4070. Num GPUs = 2. Max memory: 11.595 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu130. CUDA: 8.9. CUDA Toolkit: 13.0. Triton: 3.5.1
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


KeyboardInterrupt: 

Error in callback <bound method _WandbInit._post_run_cell_hook of <wandb.sdk.wandb_init._WandbInit object at 0x7d09380d8ad0>> (for post_run_cell), with arguments args (<ExecutionResult object at 7d07c847f7d0, execution_count=25 error_before_exec=None error_in_exec= info=<ExecutionInfo object at 7d07c829fc50, raw_cell="# FIRST CELL - Run this first
import os
os.environ.." transformed_cell="# FIRST CELL - Run this first
import os
os.environ.." store_history=True silent=False shell_futures=True cell_id=vscode-notebook-cell://ssh-remote%2B100.85.27.30/home/subash/Quantization/2.ipynb#X22sdnNjb2RlLXJlbW90ZQ%3D%3D> result=None>,),kwargs {}:


ConnectionResetError: Connection lost

: 