In [2]:
# !pip install -q transformers datasets accelerate peft bitsandbytes

In [3]:
# !pip install bitsandbytes

In [65]:
from datasets import load_dataset

data_path = "/kaggle/input/command-qa/command_qa.jsonl"
dataset = load_dataset("json", data_files=data_path, split="train")

print(dataset[0])


{'instruction': "How do I undo the most recent local commits in Git?\n\nI accidentally committed the wrong files to Git but haven't pushed the commit to the server yet.\nHow do I undo those commits from the local repository?", 'output': 'Undo a commit & redo\n\n```\n$ git commit -m "Something terribly misguided" # (0: Your Accident)\n$ git reset HEAD~                              # (1)\n# === If you just want to undo the commit, stop here! ===\n[ edit files as necessary ]                    # (2)\n$ git add .                                    # (3)\n$ git commit -c ORIG_HEAD                      # (4)\n\n```\n\n\ngit reset is the command responsible for the undo. It will undo your last commit while leaving your working tree (the state of your files on disk) untouched. You\'ll need to add them again before you can commit them again.\nMake corrections to working tree files.\ngit add anything that you want to include in your new commit.\nCommit the changes, reusing the old commit message

In [66]:
import re
import re

def clean_instruction(example):
    text = example['instruction']
    text = re.sub(r'\n{3,}', '\n\n', text)
    text = "\n".join(line.rstrip() for line in text.splitlines())
    text = re.sub(r'[ \t]{2,}', ' ', text)  # collapse tabs or multiple spaces
    example['instruction'] = text.strip()
    return example


dataset_process = dataset.map(clean_instruction)

print(dataset_process[0])

{'instruction': "How do I undo the most recent local commits in Git?\n\nI accidentally committed the wrong files to Git but haven't pushed the commit to the server yet.\nHow do I undo those commits from the local repository?", 'output': 'Undo a commit & redo\n\n```\n$ git commit -m "Something terribly misguided" # (0: Your Accident)\n$ git reset HEAD~                              # (1)\n# === If you just want to undo the commit, stop here! ===\n[ edit files as necessary ]                    # (2)\n$ git add .                                    # (3)\n$ git commit -c ORIG_HEAD                      # (4)\n\n```\n\n\ngit reset is the command responsible for the undo. It will undo your last commit while leaving your working tree (the state of your files on disk) untouched. You\'ll need to add them again before you can commit them again.\nMake corrections to working tree files.\ngit add anything that you want to include in your new commit.\nCommit the changes, reusing the old commit message

In [67]:
# for datast in range(len(dataset)):
#     print(dataset[datast]['instruction'])

total_length = 0
max_length = 0

for item in dataset_process:
    instruction_length = len(item['instruction'])
    total_length += instruction_length
    if instruction_length > max_length:
        max_length = instruction_length

average_length = total_length / len(dataset_process)

print("Max instruction length:", max_length)
print("Average instruction length:", average_length)



Max instruction length: 10245
Average instruction length: 604.3939818054583


In [68]:
total_length = 0
max_length = 0

for item in dataset_process:
    output_length = len(item['output'])
    total_length += output_length
    if output_length > max_length:
        max_length = output_length

average_length = total_length / len(dataset_process)

print("Max instruction length:", max_length)
print("Average instruction length:", average_length)

Max instruction length: 15203
Average instruction length: 896.0468859342197


In [69]:
# def tokenize(example):
#     result = tokenizer(
#         example["instruction"],
#         truncation=True,
#         padding="max_length",
#         max_length=1024,
#     )
    
#     result["labels"] = result["input_ids"].copy()
#     return result

# tokenized_dataset = dataset.map(tokenize, remove_columns=dataset.column_names)

from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
def tokenize(example):
    prompt = example["instruction"].strip()
    response = example["output"].strip()

    full_input = f"{prompt}\n\n### Response:\n{response}"

    result = tokenizer(
        full_input,
        truncation=True,
        padding="max_length",
        max_length=1024,
    )
    result["labels"] = result["input_ids"].copy()
    return result


In [70]:
tokenized_dataset = dataset.map(tokenize, remove_columns=dataset.column_names)

Map:   0%|          | 0/1429 [00:00<?, ? examples/s]

In [71]:
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.1)

In [72]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1286
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 143
    })
})

In [None]:
tokenized_dataset['train'][0]

In [74]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
import torch

base_model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

tokenizer = AutoTokenizer.from_pretrained(base_model_id)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    # load_in_4bit=True,
    torch_dtype=torch.float16,
    device_map={"": 1}
)
model = prepare_model_for_kbit_training(model)


In [75]:
lora_config = LoraConfig(
    r=32,
    lora_alpha=64,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


trainable params: 4,505,600 || all params: 1,104,553,984 || trainable%: 0.4079


In [77]:
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForLanguageModeling

import torch
torch.cuda.set_device(0)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

training_args = TrainingArguments(
    output_dir="./qlora_tinyllama_cli",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,
    num_train_epochs=1,
    logging_steps=10,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-4,
    fp16=True,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    # label_names=["input_ids", "attention_mask", "labels"]
)


trainer.train()


  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss
0,1.7426,1.749053


TrainOutput(global_step=80, training_loss=1.7542819261550904, metrics={'train_runtime': 564.707, 'train_samples_per_second': 2.277, 'train_steps_per_second': 0.142, 'total_flos': 8171169911930880.0, 'train_loss': 1.7542819261550904, 'epoch': 0.995334370139969})

In [78]:
trainer.save_model("./qlora_tinyllama_cli_final")
tokenizer.save_pretrained("./qlora_tinyllama_cli_final")


('./qlora_tinyllama_cli_final/tokenizer_config.json',
 './qlora_tinyllama_cli_final/special_tokens_map.json',
 './qlora_tinyllama_cli_final/tokenizer.model',
 './qlora_tinyllama_cli_final/added_tokens.json',
 './qlora_tinyllama_cli_final/tokenizer.json')

In [80]:
from transformers import pipeline

pipe = pipeline("text-generation", model="./qlora_tinyllama_cli_final", tokenizer=tokenizer, device=0)
prompt = (
    "### Instruction:\n"
    "How can I delete a remote Git branch?\n\n"
    "### Response:\n"
)

output = pipe(prompt, max_new_tokens=256, do_sample=True, temperature=0.7)[0]["generated_text"]
print(output)

Device set to use cuda:0


### Instruction:
How can I delete a remote Git branch?

### Response:
You can use the git branch -D command to delete a branch from a remote repository. It is part of the git command line, and you can run it from the command line:

```
git branch -D branch-name

```

In this case, the branch is named branch-name.
The -D option is used to delete the branch.
The default behavior is to delete the branch and all the branch's refs (tracked and untracked).

For more information on deleting branches, see the git branch man page.

Here is a sample response from a git pull command, which deletes a branch and leaves it in the remote repository:

```
$ git push origin :branch-name

```

Note that you may need to specify the remote (git clone) and the branch you want to delete (branch-name).

For more information on deleting branches, see the git branch man page.

Here is a sample response from a git push command, which deletes a branch but leaves it in the remote repository:

```
$ git push origi

In [82]:
output = pipe(prompt, max_new_tokens=200, do_sample=False)[0]["generated_text"]
print(output[len(prompt):].strip())

You can use the git branch -d command to delete a remote branch.

```
git branch -d branch_name

```

This will delete the branch and all its commits.

```
git branch -d branch_name

```

This will delete the branch and all its commits, including any local branches that are forked from it.

```
git branch -d branch_name --delete

```

This will delete the branch and all its commits, including any local branches that are forked from it.

```
git branch -d branch_name --delete

```

This will delete the branch and all its commits, including any local branches that are forked from it.

```
git branch -d branch_name --delete

```

This will delete the branch and all its commits, including any local branches that are forked from it.

```
git


In [81]:
response = output[len(prompt):].strip()
print(response)


You can use the git branch -D command to delete a branch from a remote repository. It is part of the git command line, and you can run it from the command line:

```
git branch -D branch-name

```

In this case, the branch is named branch-name.
The -D option is used to delete the branch.
The default behavior is to delete the branch and all the branch's refs (tracked and untracked).

For more information on deleting branches, see the git branch man page.

Here is a sample response from a git pull command, which deletes a branch and leaves it in the remote repository:

```
$ git push origin :branch-name

```

Note that you may need to specify the remote (git clone) and the branch you want to delete (branch-name).

For more information on deleting branches, see the git branch man page.

Here is a sample response from a git push command, which deletes a branch but leaves it in the remote repository:

```
$ git push origin :branch-name --delete

```

Note that you may need to specify the re

In [83]:
!zip -r /kaggle/working/qlora_tinyllama_cli_final.zip /kaggle/working/qlora_tinyllama_cli_final

  adding: kaggle/working/qlora_tinyllama_cli_final/ (stored 0%)
  adding: kaggle/working/qlora_tinyllama_cli_final/adapter_model.safetensors

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


 (deflated 8%)
  adding: kaggle/working/qlora_tinyllama_cli_final/tokenizer.json (deflated 85%)
  adding: kaggle/working/qlora_tinyllama_cli_final/training_args.bin (deflated 52%)
  adding: kaggle/working/qlora_tinyllama_cli_final/README.md (deflated 66%)
  adding: kaggle/working/qlora_tinyllama_cli_final/tokenizer.model (deflated 55%)
  adding: kaggle/working/qlora_tinyllama_cli_final/tokenizer_config.json (deflated 68%)
  adding: kaggle/working/qlora_tinyllama_cli_final/adapter_config.json (deflated 53%)
  adding: kaggle/working/qlora_tinyllama_cli_final/special_tokens_map.json (deflated 73%)


In [84]:
trainer.evaluate()

{'eval_loss': 1.7490533590316772,
 'eval_runtime': 28.5155,
 'eval_samples_per_second': 5.015,
 'eval_steps_per_second': 2.525,
 'epoch': 0.995334370139969}