In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/title-answer/command_qa (1).jsonl
/kaggle/input/command-qa/command_qa.jsonl


In [3]:
# !pip install -q transformers datasets accelerate peft bitsandbytes

In [4]:
# !pip install bitsandbytes

In [5]:
from datasets import load_dataset

data_path = "/kaggle/input/title-answer/command_qa (1).jsonl"
dataset = load_dataset("json", data_files=data_path, split="train")

print(dataset[0])


{'instruction': 'How do I undo the most recent local commits in Git?', 'output': 'Undo a commit & redo\n\n```\n$ git commit -m "Something terribly misguided" # (0: Your Accident)\n$ git reset HEAD~                              # (1)\n# === If you just want to undo the commit, stop here! ===\n[ edit files as necessary ]                    # (2)\n$ git add .                                    # (3)\n$ git commit -c ORIG_HEAD                      # (4)\n\n```\n\n\ngit reset is the command responsible for the undo. It will undo your last commit while leaving your working tree (the state of your files on disk) untouched. You\'ll need to add them again before you can commit them again.\nMake corrections to working tree files.\ngit add anything that you want to include in your new commit.\nCommit the changes, reusing the old commit message. reset copied the old head to .git/ORIG_HEAD; commit with -c ORIG_HEAD will open an editor, which initially contains the log message from the old commit an

In [6]:
import re
import re

def clean_instruction(example):
    text = example['instruction']
    text = re.sub(r'\n{3,}', '\n\n', text)
    text = "\n".join(line.rstrip() for line in text.splitlines())
    text = re.sub(r'[ \t]{2,}', ' ', text)  # collapse tabs or multiple spaces
    example['instruction'] = text.strip()
    return example


dataset_process = dataset.map(clean_instruction)

print(dataset_process[0])

{'instruction': 'How do I undo the most recent local commits in Git?', 'output': 'Undo a commit & redo\n\n```\n$ git commit -m "Something terribly misguided" # (0: Your Accident)\n$ git reset HEAD~                              # (1)\n# === If you just want to undo the commit, stop here! ===\n[ edit files as necessary ]                    # (2)\n$ git add .                                    # (3)\n$ git commit -c ORIG_HEAD                      # (4)\n\n```\n\n\ngit reset is the command responsible for the undo. It will undo your last commit while leaving your working tree (the state of your files on disk) untouched. You\'ll need to add them again before you can commit them again.\nMake corrections to working tree files.\ngit add anything that you want to include in your new commit.\nCommit the changes, reusing the old commit message. reset copied the old head to .git/ORIG_HEAD; commit with -c ORIG_HEAD will open an editor, which initially contains the log message from the old commit an

In [7]:
# for datast in range(len(dataset)):
#     print(dataset[datast]['instruction'])

total_length = 0
max_length = 0

for item in dataset_process:
    instruction_length = len(item['instruction'])
    total_length += instruction_length
    if instruction_length > max_length:
        max_length = instruction_length

average_length = total_length / len(dataset_process)

print("Max instruction length:", max_length)
print("Average instruction length:", average_length)



Max instruction length: 170
Average instruction length: 52.87263820853744


In [8]:
total_length = 0
max_length = 0

for item in dataset_process:
    output_length = len(item['output'])
    total_length += output_length
    if output_length > max_length:
        max_length = output_length

average_length = total_length / len(dataset_process)

print("Max output length:", max_length)
print("Average output length:", average_length)

Max output length: 15203
Average output length: 896.0468859342197


In [9]:
# # def tokenize(example):
# #     result = tokenizer(
# #         example["instruction"],
# #         truncation=True,
# #         padding="max_length",
# #         max_length=1024,
# #     )
    
# #     result["labels"] = result["input_ids"].copy()
# #     return result

# # tokenized_dataset = dataset.map(tokenize, remove_columns=dataset.column_names)

# from transformers import AutoTokenizer
# tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
# def tokenize(example):
#     prompt = example["instruction"].strip()
#     response = example["output"].strip()

#     full_input = f"{prompt}\n\n### Response:\n{response}"

#     result = tokenizer(
#         full_input,
#         truncation=True,
#         padding="max_length",
#         max_length=1024,
#     )
#     result["labels"] = result["input_ids"].copy()
#     return result


In [10]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
tokenizer.pad_token = tokenizer.eos_token

def tokenize(example):
    # Format the full text
    text = f"### Instruction:\n{example['instruction']}\n\n### Response:\n{example['output']}"
    
    # Tokenize with proper formatting
    tokenized = tokenizer(
        text,
        truncation=True,
        max_length=2048,
        padding="max_length",  # Ensures fixed-length sequences
        return_tensors="pt"    # Returns PyTorch tensors
    )
    
    # Create labels (mask instruction tokens)
    instruction_text = f"### Instruction:\n{example['instruction']}\n\n### Response:\n"
    instruction_tokens = tokenizer(
        instruction_text,
        truncation=True,
        max_length=150,
        add_special_tokens=False
    )
    
    # Create labels (-100 for instruction, token IDs for response)
    labels = [-100] * len(instruction_tokens["input_ids"]) + tokenized["input_ids"][0][len(instruction_tokens["input_ids"]):].tolist()
    
    return {
        "input_ids": tokenized["input_ids"][0].tolist(),  # Convert to list
        "attention_mask": tokenized["attention_mask"][0].tolist(),
        "labels": labels
    }


# Process dataset with error handling
try:
    tokenized_dataset = dataset_process.map(
        tokenize,
        batched=False,  # Process one example at a time for stability
        remove_columns=["instruction", "output"]
    )
except Exception as e:
    print(f"Error during tokenization: {e}")
    # Add debug to find problematic examples
    for i, example in enumerate(dataset):
        try:
            tokenize(example)
        except Exception as ex:
            print(f"Error in example {i}: {ex}")
            print(example)
            break

In [11]:
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.1)

In [12]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1286
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 143
    })
})

In [None]:
tokenized_dataset['train'][10]

In [14]:
# pip install -U bitsandbytes

In [18]:
import warnings
warnings.filterwarnings('ignore')


In [19]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
import torch

base_model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# tokenizer = AutoTokenizer.from_pretrained(base_model_id)
# tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    load_in_4bit=True,
    torch_dtype=torch.float16,
    device_map={"": 0}
)
model = prepare_model_for_kbit_training(model)


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


In [20]:
lora_config = LoraConfig(
    r=32,
    lora_alpha=64,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


trainable params: 4,505,600 || all params: 1,104,553,984 || trainable%: 0.4079


In [21]:
sample = tokenized_dataset["train"][0]
print(f"Input IDs shape: {len(sample['input_ids'])}")
print(f"Attention mask shape: {len(sample['attention_mask'])}")
print(f"Labels shape: {len(sample['labels'])}")

Input IDs shape: 2048
Attention mask shape: 2048
Labels shape: 2048


In [None]:
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForLanguageModeling

import torch
torch.cuda.set_device(1)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

training_args = TrainingArguments(
    output_dir="./qlora_tinyllama_cli",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,
    num_train_epochs=1,
    logging_steps=10,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-4,
    fp16=True,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    # label_names=["input_ids", "attention_mask", "labels"]
)


trainer.train()


In [28]:
!nvidia-smi

Wed Jun 18 14:42:53 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   70C    P0             30W /   70W |    4977MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  Tesla T4                       Off |   00

In [None]:
import evaluate
trainer.evaluate()

In [None]:
# Move model/data to GPU 1
# model = model.to(1)
# input_tensor = input_tensor.to(device)

In [None]:
# import torch
# from IPython.display import clear_output

# # Clear GPU cache
# torch.cuda.empty_cache()

# # Clear notebook output (optional)
# clear_output()

# # Verify memory is freed
# print(f"Memory allocated: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
# print(f"Memory reserved: {torch.cuda.memory_reserved() / 1024**2:.2f} MB")

In [None]:
trainer.save_model("./qlora_tinyllama_cli_final")
tokenizer.save_pretrained("./qlora_tinyllama_cli_final")

In [None]:
from transformers import pipeline

pipe = pipeline("text-generation", model="./qlora_tinyllama_cli_final", tokenizer=tokenizer, device=0)
prompt = (
    "### Instruction:\n"
    "How can I delete a remote Git branch?\n\n"
    "### Response:\n"
)

output = pipe(prompt, max_new_tokens=256, do_sample=True, temperature=0.7)[0]["generated_text"]
print(output)

In [None]:
output = pipe(prompt, max_new_tokens=200, do_sample=False)[0]["generated_text"]
print(output[len(prompt):].strip())

In [None]:
response = output[len(prompt):].strip()
print(response)


In [None]:
!zip -r /kaggle/working/qlora_tinyllama_cli_final.zip /kaggle/working/qlora_tinyllama_cli_final

In [1]:
eval_prompts = [
    "Create a new Git branch and switch to it.",
    "Compress the folder reports into reports.tar.gz.",
    "List all Python files in the current directory recursively.",
    "Set up a virtual environment and install requests.",
    "Fetch only the first ten lines of a file named output.log.",
    "Clone a repository using git command.",  # Edge case 1
    "How can I delete a remote Git branch?",        # Edge case 2
]


In [2]:
# pip install -U bitsandbytes

In [34]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from peft import PeftModel
import torch

# Load base model
base_model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
adapter_path = "/kaggle/input/model/transformers/default/1/kaggle/working/qlora_tinyllama_cli_final"

tokenizer = AutoTokenizer.from_pretrained(base_model_id)
tokenizer.pad_token = tokenizer.eos_token

base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    torch_dtype=torch.float16,
    device_map="auto"
)

# Load and merge PEFT adapter
peft_model = PeftModel.from_pretrained(base_model, adapter_path)
merged_model = peft_model.merge_and_unload()

# Build pipelines
pipe_base = pipeline("text-generation", model=base_model, tokenizer=tokenizer)
pipe_finetuned = pipeline("text-generation", model=merged_model, tokenizer=tokenizer)


Device set to use cuda:0
Device set to use cuda:0


In [35]:
def get_outputs(pipe, prompts):
    outputs = []
    for prompt in prompts:
        full_prompt = f"### Instruction:\n{prompt}\n\n### Response:\n"
        result = pipe(full_prompt, max_new_tokens=200, do_sample=False)[0]['generated_text']
        response = result.split("### Response:")[-1].strip()
        outputs.append(response)
    return outputs

base_outputs = get_outputs(pipe_base, eval_prompts)
finetuned_outputs = get_outputs(pipe_finetuned, eval_prompts)


In [36]:
base_outputs

['git checkout -b branch_name\n\nThis will create a new branch called branch_name.\n\ngit checkout branch_name will switch to the branch you just created.\n\ngit checkout -b branch_name will create a new branch called branch_name.\n\ngit checkout branch_name will switch to the branch you just created.\n\ngit checkout branch_name will switch to the branch you just created.\n\ngit checkout branch_name will switch to the branch you just created.\n\ngit checkout branch_name will switch to the branch you just created.\n\ngit checkout branch_name will switch to the branch you just created.\n\ngit checkout branch_name will switch to the branch you just created.\n\ngit checkout branch_name will switch to the branch you just created.\n\ngit checkout branch_name will switch to the branch you just created.\n\ngit checkout branch_name will switch to the branch you just created.',
 'You can use tar command to compress the folder reports into reports.tar.gz.\n\n```\ntar -czvf reports.tar.gz reports\

In [37]:
finetuned_outputs

['git checkout -b branch_name\n\nThis will create a new branch called branch_name.\n\ngit checkout branch_name will switch to the branch you just created.\n\ngit checkout -b branch_name will create a new branch called branch_name.\n\ngit checkout branch_name will switch to the branch you just created.\n\ngit checkout branch_name will switch to the branch you just created.\n\ngit checkout branch_name will switch to the branch you just created.\n\ngit checkout branch_name will switch to the branch you just created.\n\ngit checkout branch_name will switch to the branch you just created.\n\ngit checkout branch_name will switch to the branch you just created.\n\ngit checkout branch_name will switch to the branch you just created.\n\ngit checkout branch_name will switch to the branch you just created.\n\ngit checkout branch_name will switch to the branch you just created.',
 'You can use tar command to compress the folder reports into reports.tar.gz.\n\n```\ntar -czvf reports.tar.gz reports\

In [18]:
len(base_outputs)
for i in base_outputs:
    print("_______________________________________")
    print(i)
    print("_______________________________________")

_______________________________________
git checkout -b branch_name

This will create a new branch called branch_name.

git checkout branch_name will switch to the branch you just created.

git checkout -b branch_name will create a new branch called branch_name.

git checkout branch_name will switch to the branch you just created.

git checkout branch_name will switch to the branch you just created.

git checkout branch_name will switch to the branch you just created.

git checkout branch_name will switch to the branch you just created.

git checkout branch_name will switch to the branch you just created.

git checkout branch_name will switch to the branch you just created.

git checkout branch_name will switch to the branch you just created.

git checkout branch_name will switch to the branch you just created.

git checkout branch_name will switch to the branch you just created.
_______________________________________
_______________________________________
You can use tar command to 

In [28]:
len(finetuned_outputs)
for i in finetuned_outputs:
    print("_______________________________________")
    print(i)
    print("_______________________________________")

_______________________________________
git checkout -b branch_name

This will create a new branch called branch_name.

git checkout branch_name will switch to the branch you just created.

git checkout -b branch_name will create a new branch called branch_name.

git checkout branch_name will switch to the branch you just created.

git checkout branch_name will switch to the branch you just created.

git checkout branch_name will switch to the branch you just created.

git checkout branch_name will switch to the branch you just created.

git checkout branch_name will switch to the branch you just created.

git checkout branch_name will switch to the branch you just created.

git checkout branch_name will switch to the branch you just created.

git checkout branch_name will switch to the branch you just created.

git checkout branch_name will switch to the branch you just created.
_______________________________________
_______________________________________
You can use tar command to 