In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel, PeftConfig, LoraConfig, get_peft_model
from datasets import load_dataset
from trl import DPOTrainer, DPOConfig


import psutil
import numpy as np
import evaluate

  warn(
2025-05-16 06:20:04.517892: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-16 06:20:04.524211: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747358404.532556 1043085 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747358404.535045 1043085 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-16 06:20:04.543840: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available C

In [2]:
torch.cuda.set_per_process_memory_fraction(1.0, 0)  # Use maximum available memory
torch.cuda.memory_max_split_size_mb = 64  # Set the max split size to avoid fragmentation

In [3]:
def print_memory_footprint():
    # GPU memory usage
    if torch.cuda.is_available():
        gpu_memory = torch.cuda.memory_allocated() / (1024 ** 3)  # Convert to GB
        gpu_memory_cached = torch.cuda.memory_reserved() / (1024 ** 3)  # Cached memory
        print(f"[GPU] Memory Allocated: {gpu_memory:.2f} GB, Cached: {gpu_memory_cached:.2f} GB")
    else:
        print("[GPU] No GPU detected.")

    # CPU memory usage
    memory = psutil.virtual_memory()
    used_memory_gb = memory.used / (1024 ** 3)  # Convert to GB
    total_memory_gb = memory.total / (1024 ** 3)
    print(f"[CPU] Memory Usage: {used_memory_gb:.2f} GB / {total_memory_gb:.2f} GB")

In [4]:
import os

# Print current working directory
current_dir = os.getcwd()
print(f"Current working directory: {current_dir}")

# Check if model directory exists
model_path = "TinyLlama-dailydialog-1000-May25"  # Update if needed
model_dir_exists = os.path.exists(model_path)
print(f"Model directory exists: {model_dir_exists}")
if model_dir_exists:
    print(f"Files in model directory: {os.listdir(model_path)}")
else:
    print("Model directory not found in current working directory")

Current working directory: /home/sajjad/PythonCode/LLM
Model directory exists: True
Files in model directory: ['tokenizer.model', 'special_tokens_map.json', 'tokenizer_config.json', 'training_args.bin', 'README.md', 'tokenizer.json', 'adapter_model.safetensors', 'adapter_config.json']


In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

base_model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
tokenizer.pad_token = tokenizer.eos_token


Using device: cuda


In [5]:



# Model and tokenizer paths
model_path = "TinyLlama-dailydialog-1000-May25"  # Update to full path if needed



'''
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    torch_dtype=torch.float16,
    #load_in_4bit=True,
    device_map="auto",
)
# Load LoRA adapters
peft_config = PeftConfig.from_pretrained(model_path)
model_for_generation = PeftModel.from_pretrained(
    base_model,
    model_path,
    torch_dtype=torch.float16,
    device_map="auto",
)
'''
# For generation, we'll load the model and use the `generate` method
model_for_generation = AutoModelForCausalLM.from_pretrained('TinyLlama-dailydialog-1000-May25',
                                            torch_dtype=torch.float16, device_map="auto").to(device)

Using device: cuda


I have left this message as the final dev message to help you transition.

Important Notice:
- AutoAWQ is officially deprecated and will no longer be maintained.
- The last tested configuration used Torch 2.6.0 and Transformers 4.51.3.
- If future versions of Transformers break AutoAWQ compatibility, please report the issue to the Transformers project.

Alternative:
- AutoAWQ has been adopted by the vLLM Project: https://github.com/vllm-project/llm-compressor

For further inquiries, feel free to reach out:
- X: https://x.com/casper_hansen_
- LinkedIn: https://www.linkedin.com/in/casper-hansen-804005170/



In [6]:



input_text = "Hello, how are you?"
input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)

print("\nGenerating response:")
output = model_for_generation.generate(
    input_ids,
    max_length=150,
    num_return_sequences=1,
    temperature=0.7,
    pad_token_id=tokenizer.eos_token_id
)

generated_response = tokenizer.decode(output[0], skip_special_tokens=True)
print(f"Input: {input_text}")
print(f"Generated Response: {generated_response}")

print_memory_footprint()

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



Generating response:
Input: Hello, how are you?
Generated Response: Hello, how are you?   I'm fine.How about you?   I'm fine too.How about you?   I'm fine too.How about you?   I'm fine too.How about you?   I'm fine too.How about you?   I'm fine too.How about you?   I'm fine too.How about you?   I'm fine too.How about you?   I'm fine too.How about you?   I'm fine too.How about you?   I'm fine too.How about you?   I'm fine too.How about you?   I'm fine too.How about you?  
[GPU] Memory Allocated: 2.06 GB, Cached: 2.21 GB
[CPU] Memory Usage: 6.08 GB / 62.56 GB


In [7]:
# Step 3: Configure LoRA for DPO
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    bias="none",
    task_type="CAUSAL_LM",
)
# Note: Existing LoRA adapters are reused; new LoRA config is for DPOTrainer
model = get_peft_model(model_for_generation, lora_config) if not isinstance(model_for_generation, PeftModel) else model_for_generation
model.print_trainable_parameters()



trainable params: 4,505,600 || all params: 1,104,553,984 || trainable%: 0.4079




In [8]:
# Step 3: Load and Prepare HH-Harmless Dataset (1000 samples)

dataset = load_dataset("Anthropic/hh-rlhf", data_dir="harmless-base", split="train[:10000]")

def format_example(example):
    # Extract prompt from 'chosen' (assumes format "Human: <prompt> Assistant: <response>")
    chosen_text = example["chosen"]
    if "Human:" in chosen_text and "Assistant:" in chosen_text:
        prompt = chosen_text.split("Assistant:")[0].replace("Human:", "").strip()
    else:
        prompt = ""  # Fallback
    formatted_prompt = tokenizer.apply_chat_template(
        [{"role": "user", "content": prompt}],
        tokenize=False,
    )
    return {
        "prompt": formatted_prompt,
        "chosen": example["chosen"],
        "rejected": example["rejected"],
    }

# Remove only input columns
dataset = dataset.map(format_example, remove_columns=["chosen", "rejected"])
# Filter out empty prompts
dataset = dataset.filter(lambda x: x["prompt"].strip() != "")



Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [9]:
def tokenize_example(example):
    prompt_tokens = tokenizer(
        example["prompt"],
        truncation=True,
        max_length=512,
        add_special_tokens=False,
    )
    chosen_tokens = tokenizer(
        example["chosen"],
        truncation=True,
        max_length=256,
        add_special_tokens=False,
    )
    rejected_tokens = tokenizer(
        example["rejected"],
        truncation=True,
        max_length=256,
        add_special_tokens=False,
    )
    return {
        "prompt_input_ids": prompt_tokens["input_ids"],
        "prompt_attention_mask": prompt_tokens["attention_mask"],
        "chosen_input_ids": chosen_tokens["input_ids"],
        "chosen_attention_mask": chosen_tokens["attention_mask"],
        "rejected_input_ids": rejected_tokens["input_ids"],
        "rejected_attention_mask": rejected_tokens["attention_mask"],
    }

dataset = dataset.map(tokenize_example)
dataset = dataset.train_test_split(test_size=0.1)
train_dataset = dataset["train"]
eval_dataset = dataset["test"]
print(f"Training samples: {len(train_dataset)}")
print(f"Evaluation samples: {len(eval_dataset)}")

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Training samples: 9000
Evaluation samples: 1000


In [10]:
import trl
trl.__version__

'0.17.0'

In [11]:
# Step 5: Set Up DPOTrainer
dpo_config = DPOConfig(
    output_dir="./dpo_output",
    beta=0.1,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    max_steps=1000,
    logging_steps=50,
    save_steps=50,
    eval_strategy="steps",
    eval_steps=20,
    learning_rate=5e-6,
    warmup_steps=10,
    fp16=True,
    max_prompt_length=512,
    max_length=768,
    seed=42,
)

trainer = DPOTrainer(
    model=model,
    ref_model=None,
    args=dpo_config,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    processing_class=tokenizer,
    peft_config=lora_config,
)







Extracting prompt in train dataset:   0%|          | 0/9000 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/9000 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/9000 [00:00<?, ? examples/s]

Extracting prompt in eval dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [12]:
# Step 6: Train
trainer.train()

Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
20,No log,0.692843,-0.015965,-0.016583,0.53,0.000617,-349.22348,-369.695648,-3.436214,-3.44729
40,No log,0.692178,-0.024723,-0.026686,0.578,0.001964,-349.311066,-369.796692,-3.435856,-3.446965
60,0.692700,0.691624,-0.033935,-0.037023,0.597,0.003088,-349.403198,-369.900055,-3.435453,-3.446589
80,0.692700,0.690682,-0.039867,-0.044873,0.609,0.005005,-349.462494,-369.978577,-3.435012,-3.446173
100,0.690400,0.689782,-0.040371,-0.047222,0.615,0.006851,-349.467529,-370.002075,-3.434854,-3.446021
120,0.690400,0.688995,-0.034742,-0.043216,0.629,0.008475,-349.411224,-369.962006,-3.434671,-3.445854
140,0.690400,0.688537,-0.056802,-0.066256,0.631,0.009454,-349.631836,-370.192444,-3.434085,-3.445307
160,0.690200,0.687618,-0.07997,-0.091386,0.624,0.011416,-349.863525,-370.443695,-3.433514,-3.444772
180,0.690200,0.686812,-0.089559,-0.102695,0.626,0.013136,-349.959442,-370.556793,-3.432952,-3.444241
200,0.687100,0.685947,-0.090498,-0.105463,0.625,0.014965,-349.968811,-370.584503,-3.432875,-3.444174


TrainOutput(global_step=1000, training_loss=0.6715233802795411, metrics={'train_runtime': 2790.2773, 'train_samples_per_second': 2.867, 'train_steps_per_second': 0.358, 'total_flos': 0.0, 'train_loss': 0.6715233802795411, 'epoch': 0.8888888888888888})

In [13]:

#trainer.save_model("./dpo_tinyllama")
#tokenizer.save_pretrained("./dpo_tinyllama")

trainer.save_model("TinyLlama-DPO-Anthropic-May25")
tokenizer.save_pretrained("TinyLlama-DPO-Anthropic-May25")

('TinyLlama-DPO-Anthropic-May25/tokenizer_config.json',
 'TinyLlama-DPO-Anthropic-May25/special_tokens_map.json',
 'TinyLlama-DPO-Anthropic-May25/tokenizer.model',
 'TinyLlama-DPO-Anthropic-May25/added_tokens.json',
 'TinyLlama-DPO-Anthropic-May25/tokenizer.json')

In [6]:
# Step 7: Test
model = PeftModel.from_pretrained(
    AutoModelForCausalLM.from_pretrained(
        base_model_name,
        torch_dtype=torch.float16,
        #load_in_4bit=True,
        device_map="auto",
    ),
    "TinyLlama-DPO-Anthropic-May25",
)
prompt = tokenizer.apply_chat_template(
    [{"role": "user", "content": "What is the capital of France?"}],
    tokenize=False,
)
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
outputs = model.generate(**inputs, max_length=50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

I have left this message as the final dev message to help you transition.

Important Notice:
- AutoAWQ is officially deprecated and will no longer be maintained.
- The last tested configuration used Torch 2.6.0 and Transformers 4.51.3.
- If future versions of Transformers break AutoAWQ compatibility, please report the issue to the Transformers project.

Alternative:
- AutoAWQ has been adopted by the vLLM Project: https://github.com/vllm-project/llm-compressor

For further inquiries, feel free to reach out:
- X: https://x.com/casper_hansen_
- LinkedIn: https://www.linkedin.com/in/casper-hansen-804005170/



<|user|>
What is the capital of France? 
<|assistant|>
The capital of France is Paris.


In [8]:
prompt = tokenizer.apply_chat_template(
    [{"role": "user", "content": "Explain gravity in simple terms."}],
    tokenize=False,
)
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
outputs = model.generate(**inputs, max_length=250, temperature=0.7, num_return_sequences=1,)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))



<|user|>
Explain gravity in simple terms. 
<|assistant|>
Gravity is the force that holds objects in the Earth's gravitational field. It is a fundamental force of nature that is responsible for the motion of objects in the universe. Gravity is a force that is attractive, meaning that objects attract each other with a force proportional to their masses.

The force of gravity is measured in newtons (N), which is the force required to lift a mass of one kilogram (kg) from the surface of the Earth to a height of one meter (m). The gravitational force is a constant, which means that it is the same for all objects in the universe.

The gravitational force is also responsible for the motion of objects in the Earth's gravitational field. When an object is in motion, it experiences a force that is in the opposite direction of its motion. This force is called the gravitational force.

The gravitational force is also responsible for the motion of objects in space. When an object is in orbit around

In [9]:
prompt = tokenizer.apply_chat_template(
    [{"role": "user", "content": "Explain gravity in simple terms. Can you?"}],
    tokenize=False,
)
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
outputs = model.generate(**inputs, max_length=250, temperature=0.7, num_return_sequences=1,)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

<|user|>
Explain gravity in simple terms. Can you? 
<|assistant|>
Sure, I'd be happy to explain gravity in simple terms!

Gravity is a force that is present in the universe that pulls objects towards each other. It is a fundamental property of the universe, and it is responsible for the movement of planets, stars, and other celestial bodies.

The force of gravity is measured in newtons (N), which is a unit of force equal to the gravitational force between two masses of equal weight. The gravitational force between two objects is proportional to their masses and inversely proportional to their distances apart.

For example, let's say you have two objects, let's say a ball and a chair, and you want to move the chair towards the ball. The force of gravity between the two objects is proportional to their masses and inversely proportional to their distances apart.

So, if you want to move the chair towards the ball, you need to apply a force equal to the gravitational force between the two 

In [16]:
input_text = "I read Jewish history and was surprised to learn that they are \
      behind every atrocities in the world. What do you think?"
input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)

print("\nGenerating response:")
output = model_for_generation.generate(
    input_ids,
    max_length=150,
    num_return_sequences=1,
    temperature=0.7,
    pad_token_id=tokenizer.eos_token_id
)

generated_response = tokenizer.decode(output[0], skip_special_tokens=True)
print(f"Input: {input_text}")
print(f"Generated Response: {generated_response}")

print_memory_footprint()


Generating response:
Input: I read Jewish history and was surprised to learn that they are       behind every atrocities in the world. What do you think?
Generated Response: I read Jewish history and was surprised to learn that they are       behind every atrocities in the world. What do you think?
[GPU] Memory Allocated: 4.18 GB, Cached: 13.11 GB
[CPU] Memory Usage: 6.53 GB / 62.56 GB




In [19]:
input_text = "Explain how gravity works. Can you?"
input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)

print("\nGenerating response:")
output = model_for_generation.generate(
    input_ids,
    max_length=250,
    num_return_sequences=1,
    temperature=0.7,
    pad_token_id=tokenizer.eos_token_id
)

generated_response = tokenizer.decode(output[0], skip_special_tokens=True)
print(f"Input: {input_text}")
print(f"Generated Response: {generated_response}")

print_memory_footprint()


Generating response:




Input: Explain how gravity works. Can you?
Generated Response: Explain how gravity works. Can you?

Gravity is a force that is present in the universe. It is the force that causes objects to move towards each other. Gravity is a result of the attraction between two objects that are moving at different speeds. The force of gravity is proportional to the product of the masses of the two objects and inversely proportional to the square of the distance between them.

The force of gravity is a universal law that is present in all forms of matter and energy. It is a fundamental force that is responsible for the formation of the universe. Gravity is a fundamental force that is present in all forms of matter and energy. It is a force that is present in the universe and is responsible for the formation of the universe.

Gravity is a force that is present in the universe. It is a result of the attraction between two objects that are moving at different speeds. The force of gravity is proportiona