# Create your own private Copilot

**In this guide I show you how to fine-tune Code Llama to become a private Copilot. For coding tasks, you can generally get much better performance out of Code Llama than Llama 2, especially when you specialise the model on a particular task:**

- A LoRA approach: quantize the base model to int8, freeze its weights, and train only an adapter.
- Much of the code is refactored from [alpaca-lora](https://github.com/tloen/alpaca-lora).

Avoid running this on V100 GPUs, as it throws errors there.

In [1]:
# Install python dependencies
!pip install tqdm nbformat
# transformers and peft are installed from the tip of their main branches (unpinned),
# so the exact versions depend on when this cell is run.
!pip install git+https://github.com/huggingface/transformers.git@main bitsandbytes
!pip install git+https://github.com/huggingface/peft.git@main
!pip install datasets
import locale # colab workaround
# Make getpreferredencoding() always report "UTF-8" for the rest of the session.
locale.getpreferredencoding = lambda x=False:"UTF-8" # colab workaround
!pip install wandb
!pip install scipy

Collecting git+https://github.com/huggingface/transformers.git@main
  Cloning https://github.com/huggingface/transformers.git (to revision main) to /tmp/pip-req-build-ggcxw3zx
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers.git /tmp/pip-req-build-ggcxw3zx
  Resolved https://github.com/huggingface/transformers.git to commit 08a6e7a702d06826659eb7f0f6b9f37d33f31829
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting bitsandbytes
  Downloading bitsandbytes-0.41.3.post2-py3-none-any.whl (92.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: transformers
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Created wheel for transformers: filename=transformers-4.37.0.dev0-py3-non

In [2]:
# Download and install git-xet
# Fetch the latest Linux x86_64 release tarball of xet-tools.
!curl -fsSLO https://github.com/xetdata/xet-tools/releases/latest/download/xet-linux-x86_64.tar.gz
# Unpack it, then delete the tarball.
!tar -xvf xet-linux-x86_64.tar.gz && rm xet-linux-x86_64.tar.gz
# Move the extracted git-xet binary into /usr/local/bin.
!mv git-xet /usr/local/bin
!git xet install

git-xet


In [3]:
# Set up authorization
import os
from getpass import getpass

from IPython.display import clear_output

# Collect the GitHub identity and the model repository name. The values are
# written to os.environ (the same effect as `%env`, but without printing the
# value) so the `!git ...` shell commands in later cells can read them.
user = input("GitHub user name?")
os.environ["GH_USER"] = user.strip()
email = input("GitHub user email?")
os.environ["GH_USER_EMAIL"] = email.strip()
# getpass() does not echo what is typed, so the token never shows up in the
# cell output.
token = getpass("GitHub token?")
os.environ["GH_TOKEN"] = token.strip()
repo = input("GitHub model repo?")
os.environ["MODEL_REPO"] = repo.strip()
os.environ["XET_LOG_PATH"] = "log.txt"
# Wipe the prompts (user name, e-mail, repo) from the saved cell output.
clear_output()

In [4]:
!git config --global user.name $GH_USER
!git config --global user.email $GH_USER_EMAIL

In [5]:
# Clones the model repo
# Lazy-clone the `colab` branch of $GH_USER/$MODEL_REPO, authenticating with the
# user name and token embedded in the URL.
!git xet clone --lazy https://$GH_USER:$GH_TOKEN@github.com/$GH_USER/$MODEL_REPO.git -- --branch colab
# NOTE(review): the directory name LLM_fine_tuning is hard-coded here (and in
# later cells), so this presumably only works when MODEL_REPO is
# "LLM_fine_tuning" — confirm.
!cd LLM_fine_tuning && git xet materialize CodeLlama-7b-hf

Preparing to clone Xet repository.
Cloning into 'LLM_fine_tuning'...
remote: Enumerating objects: 70, done.[K
remote: Counting objects: 100% (70/70), done.[K
remote: Compressing objects: 100% (58/58), done.[K
remote: Total 70 (delta 19), reused 41 (delta 8), pack-reused 0[K
Receiving objects: 100% (70/70), 60.72 KiB | 5.06 MiB/s, done.
Resolving deltas: 100% (19/19), done.
git-xet 0.12.7 filter started
Updating files: 100% (20/20), done.
Materializing 12 file(s)...
Done


In [6]:
# Get training dataset
from LLM_fine_tuning.scripts.prepare_dataset import create_dataset_from_git_repo
# Presumably the GitHub owner and repository whose files become the training data.
username='xetdata'
repository='xet-core'
# The return value is handed to pd.read_parquet in the next cell, so it is
# presumably the path of a parquet file written by the helper — confirm.
parquet_file = create_dataset_from_git_repo(username,repository)

Total file paths: 320.
Reading file contents...


100%|██████████| 320/320 [00:00<00:00, 1711.23it/s]


In [7]:
import pandas as pd
# Load the parquet file produced by the previous cell into a DataFrame.
df = pd.read_parquet(parquet_file)
# Bare last expression: the notebook renders the DataFrame as the cell output.
df

Unnamed: 0,repo_id,file_path,content
0,xet-core,xet-core/LICENSE,"BSD 3-Clause License\n\nCopyright (c) 2023, Xe..."
1,xet-core,xet-core/README.md,"<p align=""center"">\n <img src=""https://githu..."
2,xet-core,xet-core/rust/Cargo.toml,"[workspace]\n\nmembers = [\n ""libmagic"",\n ..."
3,xet-core,xet-core/rust/cas_client/README.md,# CAS client \nUtilities to wrap around the gr...
4,xet-core,xet-core/rust/cas_client/Dockerfile,FROM rust:1.58 as builder\n\nRUN USER=root rus...
...,...,...,...
315,xet-core,xet-core/rust/gitxet/scripts/tests/create_test...,#!/usr/bin/env bash\nset -e\nset -x\n\nif [[ $...
316,xet-core,xet-core/rust/gitxet/scripts/tests/test_01_pus...,#!/usr/bin/env bash\nset -e\nset -x\n\nscript_...
317,xet-core,xet-core/rust/prometheus_dict_encoder/Cargo.toml,"[package]\nname = ""prometheus_dict_encoder""\nv..."
318,xet-core,xet-core/rust/prometheus_dict_encoder/src/lib.rs,use prometheus::proto;\nuse prometheus::proto:...


### Loading libraries


In [8]:
from datetime import datetime
import os
import sys

import torch
from peft import (
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    prepare_model_for_int8_training,
    set_peft_model_state_dict,
)
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq


(If you have import errors, try restarting your Jupyter kernel)


### Load dataset


In [9]:
from datasets import Dataset, load_dataset

# Wrap the pandas DataFrame in a Hugging Face Dataset.
dataset = Dataset.from_pandas(df, split="train")

# Split exactly once. train_test_split() shuffles on every call, so calling it
# twice (once for "train", once for "test") draws the two subsets from
# different shuffles: evaluation rows can then also appear in the training
# set. A fixed seed additionally makes the split reproducible across runs.
split_dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

The above builds the dataset from the pandas DataFrame created earlier and holds out 10% of it as an evaluation set to check how well the model is doing during training. If you want to load your own dataset instead, do this:

```
train_dataset = load_dataset('json', data_files='train_set.jsonl', split='train')
eval_dataset = load_dataset('json', data_files='validation_set.jsonl', split='train')
```

And if you want to view any samples in the dataset, just do something like:


In [10]:
# Show one record (index 3) of the training split.
print(train_dataset[3])

{'repo_id': 'xet-core', 'file_path': 'xet-core/rust/gitxetcore/src/constants.rs', 'content': '// TODO: .git is not reliably the git subfolder; need to use the proper version.\npub const CAS_STAGING_SUBDIR: &str = "xet/staging";\npub const GIT_NOTES_MERKLEDB_V1_REF_SUFFIX: &str = "xet/merkledb";\npub const GIT_NOTES_MERKLEDB_V1_REF_NAME: &str = "refs/notes/xet/merkledb";\npub const GIT_NOTES_SUMMARIES_REF_SUFFIX: &str = "xet/summaries";\npub const GIT_NOTES_SUMMARIES_REF_NAME: &str = "refs/notes/xet/summaries";\npub const MERKLEDBV1_PATH_SUBDIR: &str = "xet/merkledb.db";\npub const SUMMARIES_PATH_SUBDIR: &str = "xet/summaries.db";\n\npub const GIT_NOTES_MERKLEDB_V2_REF_SUFFIX: &str = "xet/merkledbv2";\npub const GIT_NOTES_MERKLEDB_V2_REF_NAME: &str = "refs/notes/xet/merkledbv2";\npub const MERKLEDB_V2_CACHE_PATH_SUBDIR: &str = "xet/merkledbv2-cache";\npub const MERKLEDB_V2_SESSION_PATH_SUBDIR: &str = "xet/merkledbv2-session";\n\npub const GIT_NOTES_REPO_SALT_REF_SUFFIX: &str = "xet/repo

In [11]:
# Show one record (index 3) of the evaluation split.
print(eval_dataset[3])

{'repo_id': 'xet-core', 'file_path': 'xet-core/rust/cas_client/src/caching_client.rs', 'content': 'use crate::client_adapter::ClientRemoteAdapter;\nuse crate::interface::{CasClientError, Client};\nuse anyhow::anyhow;\nuse async_trait::async_trait;\nuse cache::{CacheError, Remote, XorbCache};\nuse cas::key::Key;\nuse merklehash::MerkleHash;\nuse std::collections::HashMap;\nuse std::fmt::Debug;\nuse std::ops::Range;\nuse std::path::Path;\nuse std::sync::{Arc, Mutex};\nuse tracing::{debug, info, warn};\n\n#[derive(Debug)]\npub struct CachingClient<T: Client + Debug + Sync + Send + \'static> {\n    client: Arc<T>,\n    cache: Arc<dyn XorbCache>,\n    xorb_lengths: Arc<Mutex<HashMap<MerkleHash, u64>>>,\n}\n\nimpl<T: Client + Debug + Sync + Send + \'static> CachingClient<T> {\n    /// Create a new caching client.\n    /// client: This is the client object used to satisfy requests\n    pub fn new(\n        client: T,\n        cache_path: &Path,\n        capacity_bytes: u64,\n        blocksize

Each entry is made up of the 'repo_id', the 'file_path' and the file's 'content'.

### Load model
I load Code Llama in int8 from the local checkout cloned above. This is standard for LoRA:

In [12]:
from transformers import BitsAndBytesConfig

# Local checkout of the base model (materialized by the clone cell).
base_model = "./LLM_fine_tuning/CodeLlama-7b-hf"
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    # Request 8-bit loading through a BitsAndBytesConfig; passing
    # load_in_8bit=True directly to from_pretrained is deprecated.
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    torch_dtype=torch.float16,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(base_model)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

torch_dtype=torch.float16 means computations are performed using a float16 representation, even though the values themselves are 8 bit ints.

If you get the error "ValueError: Tokenizer class CodeLlamaTokenizer does not exist or is not currently imported.", make sure your transformers version is 4.33.0.dev0 or newer and accelerate is >=0.20.3.


### 3. Check base model
A very good common practice is to check whether a model can already do the task at hand. Fine-tuning is something you want to try to avoid at all cost:


In [13]:
# Prompt used to probe the model: an instruction line followed by a Rust
# function whose body is replaced by the <FILL_ME> placeholder.
eval_prompt = """You are a powerful code generation model. Your job is to complete the below Rust function.
/// Walk the repo working directory starting from search_root.
/// Return a list of file paths under the search_root, the
/// file paths are relative to the working dir root.
/// Note that symlinks are ignored because they are difficult to
/// deal with: git deals with the symlink file itself without
/// following the link.
pub fn walk_working_dir(
    work_root: impl AsRef<Path>,
    search_root: impl AsRef<Path>,
    recursive: bool,
) -> anyhow::Result<Vec<PathBuf>> {
  <FILL_ME>

  Ok(result)
}
"""

# Encode the prompt as PyTorch tensors and move them to the GPU.
model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")

# Inference only: evaluation mode, no gradient tracking.
model.eval()
with torch.no_grad():
    generated_ids = model.generate(**model_input, max_new_tokens=100)
    completion = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    print(completion)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


You are a powerful code generation model. Your job is to complete the below Rust function.
/// Walk the repo working directory starting from search_root.
/// Return a list of file paths under the search_root, the
/// file paths are relative to the working dir root.
/// Note that symlinks are ignored because they are difficult to
/// deal with: git deals with the symlink file itself without
/// following the link.
pub fn walk_working_dir(
    work_root: impl AsRef<Path>,
    search_root: impl AsRef<Path>,
    recursive: bool,
) -> anyhow::Result<Vec<PathBuf>> {
  
  
  Ok(result)
}
let work_root = work_root.as_ref();
  let search_root = search_root.as_ref();
  let mut result = Vec::new();
  let mut stack = Vec::new();
  stack.push(search_root);
  while let Some(path) = stack.pop() {
    let mut entries = fs::read_dir(path)?;
    while let Some(entry) = entries.next()


I get the output:
```
llet work_root = work_root.as_ref();
  let search_root = search_root.as_ref();
  let mut result = Vec::new();
  let mut stack = Vec::new();
  stack.push(search_root.to_path_buf());
  while let Some(path) = stack.pop() {
    if path.starts_with(work_root) {
      for entry in path.read
```

### 4. Tokenization
Set up some tokenization settings, such as left padding, because it makes [training use less memory](https://ai.stackexchange.com/questions/41485/while-fine-tuning-a-decoder-only-llm-like-llama-on-chat-dataset-what-kind-of-pa):

In [14]:
# Ask the tokenizer to append an end-of-sequence token to encoded text.
tokenizer.add_eos_token = True
# Use token id 0 as the padding token.
tokenizer.pad_token_id = 0
# Pad on the left-hand side of each sequence.
tokenizer.padding_side = "left"

Set up the tokenize function to make labels and input_ids the same. This is basically what [self-supervised fine-tuning](https://neptune.ai/blog/self-supervised-learning) is:

In [15]:
def tokenize(prompt):
    """Tokenize ``prompt`` and attach labels for causal-LM fine-tuning.

    The text is passed to the notebook-level ``tokenizer`` with truncation at
    a maximum length of 512, no padding and no tensor conversion. A copy of
    ``input_ids`` is then stored under ``"labels"``.

    Args:
        prompt: Text to encode.

    Returns:
        The tokenizer's output with an additional ``"labels"`` entry.
    """
    encoding_options = {
        "truncation": True,
        "max_length": 512,
        "padding": False,
        "return_tensors": None,
    }
    encoded = tokenizer(prompt, **encoding_options)

    # Self-supervised objective: the targets are the inputs themselves.
    encoded["labels"] = list(encoded["input_ids"])
    return encoded

In [16]:
def generate_and_tokenize_prompt(data_point):
    """Render one dataset record as a training prompt and tokenize it.

    The prompt has three sections ("### Repository:", "### File Path:" and
    "### Source Code:"), each followed by the matching field of the record.
    Every heading and the first line of every value is indented by four
    spaces, and sections are separated by an empty line.

    Args:
        data_point: Mapping with the keys ``"repo_id"``, ``"file_path"`` and
            ``"content"``.

    Returns:
        Whatever ``tokenize`` returns for the rendered prompt.
    """
    indent = "    "
    sections = (
        ("### Repository:", data_point["repo_id"]),
        ("### File Path:", data_point["file_path"]),
        ("### Source Code:", data_point["content"]),
    )
    # Each section is "<indent>heading\n<indent>value"; sections are joined by
    # an empty line.
    body = "\n\n".join(
        f"{indent}{heading}\n{indent}{value}" for heading, value in sections
    )
    # The prompt starts with a newline and ends with a newline plus one indent.
    full_prompt = f"\n{body}\n{indent}"
    return tokenize(full_prompt)

Reformat to prompt and tokenize each sample:

In [17]:
# Apply generate_and_tokenize_prompt to every record of both splits.
tokenized_train_dataset = train_dataset.map(generate_and_tokenize_prompt)
tokenized_val_dataset = eval_dataset.map(generate_and_tokenize_prompt)

Map:   0%|          | 0/288 [00:00<?, ? examples/s]

Map:   0%|          | 0/32 [00:00<?, ? examples/s]

### 5. Setup Lora

In [18]:
from peft import prepare_model_for_kbit_training

model.train() # put model back into training mode
# prepare_model_for_int8_training is a deprecated alias of
# prepare_model_for_kbit_training; call the current function directly.
model = prepare_model_for_kbit_training(model)

# LoRA adapter configuration; adapters are attached to the modules named in
# target_modules.
config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
    ],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
# Wrap the prepared model with the adapter described by `config`.
model = get_peft_model(model, config)



Optional stuff to setup Weights and Biases to view training graphs:

In [19]:
# Name of the Weights & Biases project; leave empty to skip setting it.
wandb_project = ""
if wandb_project:
    # Export the project name through the WANDB_PROJECT environment variable.
    os.environ["WANDB_PROJECT"] = wandb_project


In [20]:
# Only relevant when more than one CUDA device is visible.
if torch.cuda.device_count() > 1:
    # keeps Trainer from trying its own DataParallelism when more than 1 gpu is available
    model.is_parallelizable = True
    model.model_parallel = True

### 6. Training arguments
If you run out of GPU memory, reduce per_device_train_batch_size. The gradient_accumulation_steps variable should ensure this doesn't affect batch dynamics during the training run. All the other variables are standard stuff that I wouldn't recommend messing with:

In [21]:
# Target number of samples per optimizer step; reached by accumulating
# gradients over several smaller per-device batches.
batch_size = 128
per_device_train_batch_size = 8
# 128 // 8 = 16 accumulation steps with the values above.
gradient_accumulation_steps = batch_size // per_device_train_batch_size
# Directory that receives checkpoints and, later, the saved adapter.
output_dir = "code-llama"

training_args = TrainingArguments(
        per_device_train_batch_size=per_device_train_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        warmup_steps=100,
        max_steps=400,
        learning_rate=3e-4,
        fp16=True,
        logging_steps=10,
        optim="adamw_torch",
        # Evaluate and save on a step schedule, every 20 steps each.
        evaluation_strategy="steps", # if val_set_size > 0 else "no",
        save_strategy="steps",
        eval_steps=20,
        save_steps=20,
        output_dir=output_dir,
        # save_total_limit=3,
        load_best_model_at_end=False,
        # ddp_find_unused_parameters=False if ddp else None,
        group_by_length=True, # group sequences of roughly the same length together to speed up training
        # No experiment tracker is used.
        report_to="none", # if use_wandb else "none",
        # Run name carries the start time down to the minute.
        run_name=f"codellama-{datetime.now().strftime('%Y-%m-%d-%H-%M')}", # if use_wandb else None,
    )

# The tokenized splits were produced without padding, so batching is left to
# the data collator, which pads each batch (to a multiple of 8) and returns
# PyTorch tensors.
trainer = Trainer(
    model=model,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    args=training_args,
    data_collator=DataCollatorForSeq2Seq(
        tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
    ),
)

Then we do some PyTorch-related optimisation (which just makes training faster but doesn't affect accuracy):

In [22]:
# Turn off use_cache on the model config before training.
model.config.use_cache = False

# Compile only on PyTorch 2 or newer and not on Windows.
if torch.__version__ >= "2" and sys.platform != "win32":
    print("compiling the model")
    # NOTE(review): the Trainer was constructed earlier with the un-compiled
    # model object; rebinding `model` here may not change what trainer.train()
    # actually runs — confirm.
    model = torch.compile(model)

compiling the model


In [None]:
# Run the training loop configured above.
trainer.train()
# Write the model to output_dir ("code-llama").
model.save_pretrained(output_dir)

### Load the final checkpoint
Now for the moment of truth! Has our work paid off...?

In [None]:
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer

# Reload the base model from the local checkout so the adapter can be applied
# on top of it.
base_model = "./LLM_fine_tuning/CodeLlama-7b-hf"
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    # Request 8-bit loading through the BitsAndBytesConfig imported above;
    # passing load_in_8bit=True directly to from_pretrained is deprecated.
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    torch_dtype=torch.float16,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(base_model)

To load a fine-tuned LoRA/QLoRA adapter, use `PeftModel.from_pretrained`. `output_dir` should be a directory containing an adapter_config.json and the adapter weights (adapter_model.bin or adapter_model.safetensors):

In [None]:
from peft import PeftModel
# Load the adapter stored in output_dir on top of the freshly loaded base model.
model = PeftModel.from_pretrained(model, output_dir)

Try the same prompt as before:

In [None]:
# Same probe prompt as used for the base model: an instruction line followed by
# a Rust function whose body is replaced by the <FILL_ME> placeholder.
eval_prompt = """You are a powerful code generation model. Your job is to complete the below Rust function.
/// Walk the repo working directory starting from search_root.
/// Return a list of file paths under the search_root, the
/// file paths are relative to the working dir root.
/// Note that symlinks are ignored because they are difficult to
/// deal with: git deals with the symlink file itself without
/// following the link.
pub fn walk_working_dir(
    work_root: impl AsRef<Path>,
    search_root: impl AsRef<Path>,
    recursive: bool,
) -> anyhow::Result<Vec<PathBuf>> {
  <FILL_ME>

  Ok(result)
}
"""

# Encode the prompt as PyTorch tensors and move them to the GPU.
model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")

# Inference only: evaluation mode, no gradient tracking.
model.eval()
with torch.no_grad():
    generated_ids = model.generate(**model_input, max_new_tokens=100)
    completion = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    print(completion)


And the model outputs:
```

```



In [None]:
# Finally merge the adapter and save the model
# Fold the adapter into the underlying model and drop the PEFT wrapper.
model = model.merge_and_unload()
# NOTE(review): base_model is "./LLM_fine_tuning/CodeLlama-7b-hf", so this
# writes the merged model into the base-model directory inside the cloned
# repo — confirm that overwriting the base model is intended.
model.save_pretrained(base_model)
# Stage, commit and push everything in the cloned repo.
!cd LLM_fine_tuning && git add . && git commit -m "Update fine tuned model" && git push