<a href="https://colab.research.google.com/github/sauravgarg547/finetune/blob/main/fix_devxai.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🧠 DevX AI Model Fine-Tuning Notebook
This notebook will fine-tune a small language model like `TinyLlama` using DevOps and Cloud datasets.

In [1]:
from google.colab import files
uploaded = files.upload()  # ← devx_datasets_full.zip upload karo


Saving devx_datasets_full.zip to devx_datasets_full.zip


In [6]:
!unzip devx_datasets_full.zip


Archive:  devx_datasets_full.zip
 extracting: docker.jsonl            
 extracting: kubernetes.jsonl        
 extracting: aws.jsonl               
 extracting: gcp.jsonl               
 extracting: terraform.jsonl         
 extracting: devops_errors.jsonl     
 extracting: cloud_architecture.jsonl  


In [8]:
import zipfile
import os

with zipfile.ZipFile("devx_datasets_full.zip", 'r') as zip_ref:
    zip_ref.extractall("devx_datasets_full")


In [12]:
import json

filepaths = [
    "devx_datasets_full/aws.jsonl",
    "devx_datasets_full/docker.jsonl",
    "devx_datasets_full/kubernetes.jsonl",
    "devx_datasets_full/cloud_architecture.jsonl",
    "devx_datasets_full/devops_errors.jsonl",
    "devx_datasets_full/gcp.jsonl",
    "devx_datasets_full/terraform.jsonl"
]

all_data = []
for file in filepaths:
    with open(file, "r") as f:
        for line in f:
            all_data.append(json.loads(line))


In [13]:
from datasets import Dataset
dataset = Dataset.from_list(all_data)


In [14]:
def formatting_prompts(example):
    return {
        "text": f"<|system|>You are a DevOps & Cloud expert.\n<|user|>{example['instruction']}\n<|assistant|>{example['output']}"
    }

dataset = dataset.map(formatting_prompts)


Map:   0%|          | 0/7000 [00:00<?, ? examples/s]

In [17]:
!pip install git+https://github.com/unslothai/unsloth.git


Collecting git+https://github.com/unslothai/unsloth.git
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-req-build-6gpxpkod
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-req-build-6gpxpkod
  Resolved https://github.com/unslothai/unsloth.git to commit 4cd5ea176745aa5d00c868bd81b2c559a6374b90
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: unsloth
  Building wheel for unsloth (pyproject.toml) ... [?25l[?25hdone
  Created wheel for unsloth: filename=unsloth-2025.5.6-py3-none-any.whl size=265427 sha256=ea252b1423fb88d98f6da9b9a02be018ff8a3cdd6ad14f8edf91762231aaf01e
  Stored in directory: /tmp/pip-ephem-wheel-cache-16nv0r49/wheels/d1/17/05/850ab10c33284a4763b0595cd8ea9d01fce6e221cac24b3c01
Successfully built unsloth
Installing collected packages: unsloth
S

In [None]:
!pip install bitsandbytes


In [None]:
!pip install unsloth datasets peft bitsandbytes accelerate transformers


In [None]:
!pip install unsloth_zoo


In [None]:
!huggingface-cli login

In [None]:
!pip install datasets


In [14]:
from datasets import load_dataset

dataset = load_dataset("json", data_files="devops_errors.jsonl")
dataset = load_dataset("json", data_files="aws.jsonl")
dataset = load_dataset("json", data_files="gcp.jsonl")
dataset = load_dataset("json", data_files="terraform.jsonl")
dataset = load_dataset("json", data_files="cloud_architecture.jsonl")
dataset = load_dataset("json", data_files="docker.jsonl")
dataset = load_dataset("json", data_files="kubernetes.jsonl")



# Convert Alpaca format -> text format (prompt + output)
def format_alpaca(example):
    if example["input"]:
        return {
            "text": f"### Instruction:\n{example['instruction']}\n\n### Input:\n{example['input']}\n\n### Response:\n{example['output']}"
        }
    else:
        return {
            "text": f"### Instruction:\n{example['instruction']}\n\n### Response:\n{example['output']}"
        }

# Apply the transformation
dataset = dataset.map(format_alpaca)



Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
from unsloth import FastLanguageModel

# Load 4-bit model
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/tinyllama-bnb-4bit",  # or your desired model
    max_seq_length = 2048,
    dtype = None,
    load_in_4bit = True,
)

# ➕ Attach LoRA adapter to make the model trainable
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,                          # LoRA rank
    target_modules = ["q_proj", "v_proj"],  # Layers to apply LoRA on
    lora_alpha = 32,
    lora_dropout = 0.05,
    bias = "none",
    use_gradient_checkpointing = True,  # Saves memory
    random_state = 42,
    use_rslora = False,  # or True if you want
    loftq_config = None,
)


In [18]:
train_data = dataset["train"]  # Pehle train split lo

trainer = SFTTrainer(
    model = model,
    train_dataset = train_data,  # ✅ Ab yeh sahi hai
    dataset_text_field = "text",  # Ya "output", jo bhi aapka field ho
    tokenizer = tokenizer,
    args = TrainingArguments(
        per_device_train_batch_size=2,
        num_train_epochs=3,
        logging_steps=10,
        output_dir="./outputs"
    ),
)


In [None]:
trainer.train()


In [25]:
trainer.model.save_pretrained("devx-tiny-model")
tokenizer.save_pretrained("devx-tiny-model")


('devx-tiny-model/tokenizer_config.json',
 'devx-tiny-model/special_tokens_map.json',
 'devx-tiny-model/tokenizer.model',
 'devx-tiny-model/added_tokens.json',
 'devx-tiny-model/tokenizer.json')