In [1]:
!pip install transformers datasets torch trl peft bitsandbytes

Collecting datasets
  Downloading datasets-3.3.1-py3-none-any.whl.metadata (19 kB)
Collecting trl
  Downloading trl-0.15.1-py3-none-any.whl.metadata (11 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.2-py3-none-manylinux_2_24_x86_64.whl.metadata (5.8 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)

In [3]:
# Load required libraries
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B")
model = AutoModelForCausalLM.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B")

def generate_text(prompt, max_length=500, temperature=0.1):
    """
    Generate text using the DeepSeek model

    Args:
        prompt (str): Input text to generate from
        max_length (int): Maximum length of generated text
        temperature (float): Controls randomness in generation (0.0-1.0)

    Returns:
        str: Generated text
    """
    # Encode the input text
    inputs = tokenizer(prompt, return_tensors="pt")

    # Generate text
    with torch.no_grad():
        outputs = model.generate(
            inputs.input_ids,
            max_length=max_length,
            temperature=temperature,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            num_return_sequences=1
        )

    # Decode and return the generated text
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text

# Example usage
if __name__ == "__main__":
    # Example prompts to test the model
    prompts = [

        "In Yoga philosophy, what is the significance of the concept of ahimsa (non-violence)?",
        "Tell me about buddhism in India?"

    ]

    print("Generating text from different prompts:\n")
    for prompt in prompts:
        print(f"Prompt: {prompt}")
        generated = generate_text(prompt)
        print(f"Generated text: {generated}\n")

tokenizer_config.json:   0%|          | 0.00/3.07k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/679 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.55G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Generating text from different prompts:

Prompt: In Yoga philosophy, what is the significance of the concept of ahimsa (non-violence)?
Generated text: In Yoga philosophy, what is the significance of the concept of ahimsa (non-violence)? How does it relate to the concept of self-awareness? Can you explain the relationship between these two concepts?

In the context of the yoga philosophy, what is the significance of the concept of ahimsa (non-violence)? How does it relate to the concept of self-awareness? Can you explain the relationship between these two concepts?

In the context of the yoga philosophy, what is the significance of the concept of ahimsa (non-violence)? How does it relate to the concept of self-awareness? Can you explain the relationship between these two concepts?

In the context of the yoga philosophy, what is the significance of the concept of ahimsa (non-violence)? How does it relate to the concept of self-awareness? Can you explain the relationship between these two

In [None]:
%cd

/root


In [None]:
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
)
from trl import SFTTrainer
import torch
from peft import LoraConfig, get_peft_model

# Step 1: Load the dataset
dataset = load_dataset("Abhaykoul/Ancient-Indian-Wisdom")

# Step 2: Format the dataset into instruction-response pairs
def format_dataset(examples):
    """Format the dataset into instruction-response pairs."""
    texts = []
    for instruction, response in zip(examples["instruction"], examples["output"]):
        # Combine instruction and response into a single text
        formatted_text = f"### Instruction:\n{instruction}\n\n### Response:\n{response}"
        texts.append(formatted_text)
    return {"text": texts}

# Apply formatting
dataset = dataset.map(format_dataset, batched=True, remove_columns=dataset["train"].column_names)

# Step 3: Load model and tokenizer
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")

# Step 4: Configure LoRA
peft_config = LoraConfig(
    r=16,  # Rank of the low-rank matrices
    lora_alpha=32,  # Scaling factor
    lora_dropout=0.1,  # Dropout for LoRA layers
    bias="none",  # No bias for LoRA
    task_type="CAUSAL_LM",  # Task type
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"]  # Target modules for LoRA
)
model = get_peft_model(model, peft_config)

# Step 5: Define training arguments
training_args = TrainingArguments(
    output_dir="./results",  # Directory to save results
    num_train_epochs=200,  # Number of training epochs
    per_device_train_batch_size=4,  # Batch size per device
    per_device_eval_batch_size=4,  # Evaluation batch size
    gradient_accumulation_steps=4,  # Gradient accumulation steps
    gradient_checkpointing=False,  # Disable gradient checkpointing for debugging
    optim="adamw_torch",  # Optimizer
    learning_rate=1e-4,  # Learning rate
    warmup_ratio=0.1,  # Warmup ratio
    fp16=True,  # Use mixed precision (FP16)
    logging_steps=10,  # Log every 10 steps
    save_strategy="steps",  # Save model at specific steps
    save_steps=100,  # Save every 100 steps
    evaluation_strategy="steps",  # Evaluate at specific steps
    eval_steps=100,  # Evaluate every 100 steps
    eval_accumulation_steps=1,  # Accumulate evaluation steps
    load_best_model_at_end=True,  # Load the best model at the end
    metric_for_best_model="eval_loss",  # Metric for best model
    greater_is_better=False,  # Lower eval_loss is better
    remove_unused_columns=True,  # Remove unused columns
    report_to="none",  # Disable external logging
)

# Step 6: Initialize the trainer
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["train"].select(range(120)),  # Small evaluation set
    tokenizer=tokenizer,
)

# Step 7: Train the model
trainer.train()



  trainer = SFTTrainer(


Tokenizing train dataset:   0%|          | 0/616 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/616 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/120 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/120 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss
100,2.2765,2.179498
200,1.8505,1.910967
300,1.6677,1.727494
400,1.5292,1.616376
500,1.3888,1.486175
600,1.2824,1.381052
700,1.2108,1.273131
800,1.0503,1.169226
900,0.9539,1.064521
1000,0.8973,0.967845


Step,Training Loss,Validation Loss
100,2.2765,2.179498
200,1.8505,1.910967
300,1.6677,1.727494
400,1.5292,1.616376
500,1.3888,1.486175
600,1.2824,1.381052
700,1.2108,1.273131
800,1.0503,1.169226
900,0.9539,1.064521
1000,0.8973,0.967845


In [3]:
model.save_pretrained("fine-tuned-deepseek-r1-1.5b")
tokenizer.save_pretrained("fine-tuned-deepseek-r1-1.5b")

('fine-tuned-deepseek-r1-1.5b/tokenizer_config.json',
 'fine-tuned-deepseek-r1-1.5b/special_tokens_map.json',
 'fine-tuned-deepseek-r1-1.5b/tokenizer.json')

In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

model_path = "fine-tuned-deepseek-r1-1.5b"


# Load model with optimizations
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float16,
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_path)

def generate_text(prompt, max_new_tokens=1000):
    inputs = tokenizer(prompt, return_tensors="pt").to('cuda')

    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.5,
            top_k=50,
            top_p=0.9,
            use_cache=True
        )

    return tokenizer.decode(output[0], skip_special_tokens=True)

# Test
prompt = "In Yoga philosophy, what is the significance of the concept of ahimsa (non-violence)?"
output = generate_text(prompt)
print(output)

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


In Yoga philosophy, what is the significance of the concept of ahimsa (non-violence)? How does it relate to the concept of non-violence in the Western tradition? What is the difference between the two?

Additionally, in the context of yoga, how do the teachings of the Buddha (Bodhisattva) relate to the concept of non-violence? How does this relate to the concept of non-violence in the Western tradition?

Finally, how does the concept of non-violence in yoga relate to the concept of non-violence in the Western tradition? What is the difference between the two?

Please provide a detailed explanation and answer in the form of a response.

In the context of Yoga, what is the significance of the concept of ahimsa (non-violence)? How does it relate to the concept of non-violence in the Western tradition? What is the difference between the two?

In the context of Yoga, how do the teachings of the Buddha (Bodhisattva) relate to the concept of non-violence? How does this relate to the concept o