In [None]:
# Mount Google Drive at /content/drive. Later cells write the fine-tuned model
# under /content/drive/MyDrive, so this cell has to run first.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install the third-party libraries this notebook relies on (-q keeps pip quiet).
# NOTE(review): no versions are pinned, so the installed releases depend on when the cell is run.
!pip install -q datasets transformers peft bitsandbytes accelerate wandb

In [None]:
import json
import os
from getpass import getpass

import torch
import wandb
from datasets import load_dataset
from peft import LoraConfig, TaskType, get_peft_model
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, BitsAndBytesConfig

In [None]:
# Do not hardcode the Hugging Face token in the notebook (it would be saved and
# shared with the file). Reuse HF_TOKEN when the environment already provides a
# non-empty value; otherwise ask for it without echoing the input.
if not os.environ.get("HF_TOKEN"):
    entered_token = getpass("Hugging Face token (leave blank for public models): ").strip()
    if entered_token:
        os.environ["HF_TOKEN"] = entered_token
    else:
        # A blank answer means "no token": drop the variable entirely so that
        # os.getenv("HF_TOKEN") yields None instead of an empty credential.
        os.environ.pop("HF_TOKEN", None)

In [None]:
# Start the Weights & Biases run and record the experiment's key settings.
# The values mirror the ones used further down: the dataset file written to
# /content/medical_dataset.jsonl and the learning rate / epoch count passed to
# TrainingArguments. Keep them in sync when changing either place.
wandb.init(
    project="DeepSeek-Finetuning",
    config={
        "learning_rate": 2e-5,
        "architecture": "DeepSeek-R1-Distill-Qwen-1.5B",
        # Was "niche_dataset.jsonl", which is not the file this notebook trains on.
        "dataset": "medical_dataset.jsonl",
        "epochs": 3,
    }
)

In [None]:
# Hugging Face Hub id of the base checkpoint; the tokenizer and the model are
# both loaded from this id before LoRA fine-tuning.
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

In [None]:
# 4-bit quantization settings for the frozen base weights.
# - NF4 is the 4-bit data type the Transformers/bitsandbytes documentation
#   recommends when a 4-bit base model is used for training.
# - The compute dtype is set to float16 to match `fp16=True` in the training
#   arguments; the library default is float32, which makes the 4-bit layers
#   compute in a different (slower) precision than the rest of the run.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

In [None]:
# ✅ STEP 8: Load Tokenizer & Model
# An unset or empty HF_TOKEN is normalized to None so that no blank credential
# is handed to the Hub client; a real token is passed through unchanged.
hf_token = os.getenv("HF_TOKEN") or None

tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
# Tokenization pads every example to a fixed length, which requires a pad
# token. Fall back to EOS when the checkpoint does not define one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Base model loaded with the 4-bit quantization settings defined earlier.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    device_map="auto",
    token=hf_token,
)

In [None]:
# ✅ STEP 9: LoRA adapter settings for parameter-efficient fine-tuning
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,  # adapters are attached for causal language modelling
    r=8,                           # rank of the low-rank update matrices
    lora_alpha=16,                 # scaling factor applied to the update
    lora_dropout=0.05,             # dropout applied inside the adapter layers
)

In [None]:
# Wrap Model with PEFT for LoRA Fine-Tuning
# NOTE(review): `model` is rebound to the wrapped model, so running this cell a
# second time passes the already-wrapped model back into get_peft_model —
# reload the base model first if the cell has to be re-run.
model = get_peft_model(model, lora_config)
# Report how many parameters are trainable after wrapping.
model.print_trainable_parameters()

In [None]:
# Location of the JSONL file that the samples below are written to and later
# loaded from.
dataset_path = "/content/medical_dataset.jsonl"
# Inline training data: 20 medical question/answer records, each a dict with a
# "prompt" string and a "completion" string.
# NOTE(review): the "Question N:" / "Answer N:" numbering is part of the text
# the model is trained on, not just a label for the reader.
dataset_samples = [
    {
        "prompt": "Question 1: What are the early symptoms of diabetes?",
        "completion": "Answer 1: Early symptoms include frequent urination, increased thirst, fatigue, blurred vision, and unexplained weight loss."
    },
    {
        "prompt": "Question 2: How does high blood pressure affect the body?",
        "completion": "Answer 2: High blood pressure can lead to heart disease, kidney damage, stroke, and vision problems if left untreated."
    },
    {
        "prompt": "Question 3: What is the recommended diet for heart disease patients?",
        "completion": "Answer 3: A heart-healthy diet includes fruits, vegetables, whole grains, lean proteins, and reduced salt and saturated fat intake."
    },
    {
        "prompt": "Question 4: How can you naturally boost your immune system?",
        "completion": "Answer 4: Maintain a balanced diet, exercise regularly, get enough sleep, stay hydrated, and manage stress effectively."
    },
    {
        "prompt": "Question 5: What are the common symptoms of iron deficiency anemia?",
        "completion": "Answer 5: Symptoms include fatigue, pale skin, dizziness, shortness of breath, and brittle nails."
    },
    {
        "prompt": "Question 6: What is the role of insulin in the human body?",
        "completion": "Answer 6: Insulin helps regulate blood sugar levels by facilitating glucose uptake into cells for energy production."
    },
    {
        "prompt": "Question 7: How can you manage stress effectively?",
        "completion": "Answer 7: Practice mindfulness, deep breathing, regular exercise, and ensure proper rest to reduce stress levels."
    },
    {
        "prompt": "Question 8: What are the symptoms of a stroke?",
        "completion": "Answer 8: Symptoms include sudden numbness or weakness in the face, arms, or legs, confusion, trouble speaking, and severe headache."
    },
    {
        "prompt": "Question 9: What is the difference between bacterial and viral infections?",
        "completion": "Answer 9: Bacterial infections are caused by bacteria and treated with antibiotics, while viral infections are caused by viruses and usually resolve on their own or require antiviral medications."
    },
    {
        "prompt": "Question 10: How does dehydration affect the body?",
        "completion": "Answer 10: Dehydration can cause dizziness, headaches, dry skin, confusion, and reduced kidney function."
    },
    {
        "prompt": "Question 11: What are the risk factors for developing osteoporosis?",
        "completion": "Answer 11: Risk factors include aging, calcium deficiency, sedentary lifestyle, smoking, and a family history of osteoporosis."
    },
    {
        "prompt": "Question 12: What are the benefits of regular exercise for mental health?",
        "completion": "Answer 12: Regular exercise reduces stress, anxiety, and depression while improving mood and cognitive function."
    },
    {
        "prompt": "Question 13: How does smoking impact lung health?",
        "completion": "Answer 13: Smoking damages lung tissue, reduces oxygen intake, increases the risk of lung cancer, and contributes to chronic obstructive pulmonary disease (COPD)."
    },
    {
        "prompt": "Question 14: What are the main causes of liver disease?",
        "completion": "Answer 14: Causes include excessive alcohol consumption, viral hepatitis, obesity, and certain medications or toxins."
    },
    {
        "prompt": "Question 15: How can you lower your cholesterol levels naturally?",
        "completion": "Answer 15: Eat a high-fiber diet, exercise regularly, reduce saturated fat intake, and maintain a healthy weight."
    },
    {
        "prompt": "Question 16: What are the symptoms of a heart attack?",
        "completion": "Answer 16: Symptoms include chest pain, shortness of breath, nausea, cold sweats, and pain in the arms or jaw."
    },
    {
        "prompt": "Question 17: What is the role of hydration in maintaining kidney health?",
        "completion": "Answer 17: Proper hydration helps flush out toxins, prevent kidney stones, and support overall kidney function."
    },
    {
        "prompt": "Question 18: What are the long-term complications of untreated diabetes?",
        "completion": "Answer 18: Complications include nerve damage, kidney failure, vision loss, heart disease, and poor wound healing."
    },
    {
        "prompt": "Question 19: What foods should be avoided for individuals with acid reflux?",
        "completion": "Answer 19: Spicy foods, caffeine, carbonated drinks, citrus fruits, and fried foods can worsen acid reflux."
    },
    {
        "prompt": "Question 20: How does obesity affect overall health?",
        "completion": "Answer 20: Obesity increases the risk of diabetes, heart disease, high blood pressure, joint problems, and sleep apnea."
    }
]


In [None]:
# ✅ Write the samples to disk in JSON Lines format
# Each record becomes one JSON object on its own line; non-ASCII characters are
# written as-is (ensure_ascii=False) into a UTF-8 encoded file.
jsonl_rows = (json.dumps(record, ensure_ascii=False) + "\n" for record in dataset_samples)
with open(dataset_path, "w", encoding="utf-8") as f:
    f.writelines(jsonl_rows)

print(f"✅ Dataset saved at: {dataset_path}")

In [None]:

# ✅ STEP 11: Load Your Dataset in Google Colab
# Reads the JSONL file at `dataset_path` with the "json" loader. The file is
# registered as the "train" split and that split is requested directly, so
# `dataset` is the object that the next cell calls train_test_split() on.
dataset = load_dataset("json", data_files={"train": dataset_path}, split="train")

In [None]:
# ✅ STEP 12: Split Dataset into Train and Eval (80-20 Split)
# The split shuffles the records before cutting. A fixed seed keeps the
# train/eval membership identical across runs, so evaluation numbers from
# different runs are comparable.
train_test_split = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split["train"]
eval_dataset = train_test_split["test"]

In [None]:
# ✅ STEP 13: Tokenization Function
def tokenize_function(examples):
    """Tokenize a batch of prompt/completion pairs for causal-LM fine-tuning.

    Args:
        examples: Batch dict with parallel lists under "prompt" and
            "completion" (the format `Dataset.map(..., batched=True)` passes).

    Returns:
        The tokenizer output (padded/truncated to 512 tokens) with an added
        "labels" entry: a copy of "input_ids" in which every padding position
        is replaced by -100 so it is ignored by the loss.
    """
    # Append EOS explicitly so the model still learns where an answer ends
    # once padding is masked out of the labels. `or ""` guards tokenizers
    # that define no EOS token.
    eos_text = tokenizer.eos_token or ""
    combined_texts = [
        f"{prompt}\n{completion}{eos_text}"
        for prompt, completion in zip(examples["prompt"], examples["completion"])
    ]
    tokenized = tokenizer(combined_texts, truncation=True, max_length=512, padding="max_length")
    # Mask by attention mask rather than by token id: the pad token may be the
    # same id as EOS, and the real EOS must stay in the labels.
    tokenized["labels"] = [
        [token_id if is_real else -100 for token_id, is_real in zip(ids, mask)]
        for ids, mask in zip(tokenized["input_ids"], tokenized["attention_mask"])
    ]
    return tokenized

In [None]:
# ✅ Tokenize both splits
# The same batched tokenization is applied to the train split first and the
# eval split second.
tokenized_train_dataset, tokenized_eval_dataset = [
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, eval_dataset)
]

In [None]:
# ✅ STEP 14: Define Optimized Training Arguments
# With 16 training examples (80% of 20), batch size 1 and 4 accumulation steps,
# one epoch is about 4 optimizer steps on a single device and the whole run
# about 12. The previous step-based intervals (log every 20, evaluate every 50)
# were therefore never reached, so the run produced no loss logs and no
# evaluation. Logging now happens every step and evaluation once per epoch.
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/deepseek_finetuned",  # Save to Google Drive
    num_train_epochs=3,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,  # effective batch size of 4 per device
    fp16=True,
    logging_steps=1,
    save_steps=500,
    # `eval_strategy` is the current name of the former `evaluation_strategy`
    # argument, which is deprecated in recent transformers releases.
    eval_strategy="epoch",
    learning_rate=2e-5,
    logging_dir="./logs",
    report_to="wandb",
    run_name="DeepSeek_FineTuning_Experiment",
)

In [None]:
# ✅ STEP 15: Build the Trainer
# Combines the LoRA-wrapped model, the training arguments and the two
# tokenized splits (train for optimization, eval for evaluation).
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=tokenized_eval_dataset,
    train_dataset=tokenized_train_dataset,
)

In [None]:
# ✅ STEP 16: Start Fine-Tuning
# Runs the training loop configured by `training_args`; metrics are reported to
# Weights & Biases (report_to="wandb").
trainer.train()

In [None]:
# ✅ STEP 17: Save the Fine-Tuned Model in Google Drive
model_path = "/content/drive/MyDrive/deepseek_finetuned"
trainer.save_model(model_path)
# Store the tokenizer next to the model files so the saved directory is
# self-contained and can be reloaded without guessing which tokenizer was used.
tokenizer.save_pretrained(model_path)
print(f"✅ Fine-tuned model saved to: {model_path}")

In [None]:
from transformers import pipeline, AutoConfig

# ✅ Load Fine-Tuned Model from Google Drive
model_path = "/content/drive/MyDrive/deepseek_finetuned"
model = AutoModelForCausalLM.from_pretrained(model_path)

# ✅ Load the tokenizer of the base model that was fine-tuned.
# The tokenizer must match the model's vocabulary; this previously loaded
# "deepseek-ai/deepseek-coder-1.3b-base", a different model, which produces
# token ids that do not correspond to this model's embeddings.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# ✅ Create a text generation pipeline
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# ✅ Ask a Question (Change the prompt as needed)
# NOTE(review): this sample question is unrelated to the medical Q&A data the
# model was fine-tuned on; use a medical question to check the fine-tuning.
question = "What is the first step in assessing a vintage camera's condition?"
# Sampling is enabled, so the generated text differs from run to run.
response = generator(question, max_length=100, do_sample=True, temperature=0.7)

# ✅ Print Response
print("💬 Model Response:", response[0]["generated_text"])