In [1]:
!pip install transformers datasets evaluate accelerate -q

In [2]:
import os
os.environ["HF_ALLOW_CODE_EVAL"] = "1"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [3]:
from datasets import load_dataset
from evaluate import load
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from tqdm import tqdm

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
human_eval = load_dataset("openai_humaneval")['test']

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [6]:
code_eval_metric = load("code_eval")

In [7]:
# model_id = "TinyLlama/TinyLlama_v1.1"
model_id = "codeparrot/codeparrot-small"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.float16
)

model.eval()

tokenizer_config.json:   0%|          | 0.00/259 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/903 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/457M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/457M [00:00<?, ?B/s]

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(32768, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=32768, bias=False)
)

In [8]:

# Fix special tokens
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '<pad>'})
if tokenizer.eos_token is None:
    tokenizer.add_special_tokens({'eos_token': '</s>'})

# Resize embeddings if new special tokens added
if len(tokenizer) > model.config.vocab_size:
    model.resize_token_embeddings(len(tokenizer))

In [9]:
num_samples_per_problem = 20
test_cases = []
candidates = []

In [10]:
for problem in tqdm(human_eval.select(range(10)), desc="Problems", unit="problem"):  # test on 10 problems
    prompt = problem['prompt']
    test_code = problem['test']
    test_cases.append(test_code)
    problem_candidates = []

    for _ in range(num_samples_per_problem):
        inputs = tokenizer(prompt, return_tensors="pt").to(device)

        with torch.no_grad():
            outputs = model.generate(
                input_ids=inputs['input_ids'],
                attention_mask=inputs['attention_mask'],
                max_length=512,
                do_sample=True,
                temperature=0.7,
                top_p=0.95,
                num_return_sequences=1,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )
        generated_code = tokenizer.decode(outputs[0], skip_special_tokens=True)
        generated_code = generated_code[len(prompt):]
        problem_candidates.append(generated_code)

    candidates.append(problem_candidates)


Problems:   0%|          | 0/10 [00:00<?, ?problem/s][A
Problems:  10%|█         | 1/10 [00:46<06:55, 46.19s/problem][A
Problems:  20%|██        | 2/10 [01:28<05:52, 44.00s/problem][A
Problems:  30%|███       | 3/10 [02:10<05:00, 42.86s/problem][A
Problems:  40%|████      | 4/10 [02:42<03:52, 38.70s/problem][A
Problems:  50%|█████     | 5/10 [03:38<03:44, 44.94s/problem][A
Problems:  60%|██████    | 6/10 [04:05<02:35, 38.81s/problem][A
Problems:  70%|███████   | 7/10 [05:10<02:21, 47.27s/problem][A
Problems:  80%|████████  | 8/10 [06:03<01:38, 49.32s/problem][A
Problems:  90%|█████████ | 9/10 [06:59<00:51, 51.44s/problem][A
Problems: 100%|██████████| 10/10 [07:56<00:00, 47.66s/problem]


In [11]:
len(candidates)

10

In [12]:
print("Evaluating generated code...")
k_values = [1, 5, 10, 20]
pass_at_k, results = code_eval_metric.compute(
    references=test_cases,
    predictions=candidates,
    k=k_values,
    num_workers=4,
    timeout=10.0
)

Evaluating generated code...


In [13]:
for k in k_values:
    print(f"Pass@{k}: {pass_at_k[f'pass@{k}'] * 100:.2f}%")

Pass@1: 46.50%
Pass@5: 95.15%
Pass@10: 99.80%
Pass@20: 100.00%


In [14]:
model_id = "TinyLlama/TinyLlama_v1.1"
# model_id = "codeparrot/codeparrot-small"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.float16
)

model.eval()

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2048)
    (layers): ModuleList(
      (0-21): 22 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((2048,), eps=1e-05)
    (rotary_emb): 

In [15]:
# Fix special tokens
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '<pad>'})
if tokenizer.eos_token is None:
    tokenizer.add_special_tokens({'eos_token': '</s>'})

# Resize embeddings if new special tokens added
if len(tokenizer) > model.config.vocab_size:
    model.resize_token_embeddings(len(tokenizer))

num_samples_per_problem = 20
test_cases = []
candidates = []

for problem in tqdm(human_eval.select(range(10)), desc="Problems", unit="problem"):  # test on 10 problems
    prompt = problem['prompt']
    test_code = problem['test']
    test_cases.append(test_code)
    problem_candidates = []

    for _ in range(num_samples_per_problem):
        inputs = tokenizer(prompt, return_tensors="pt").to(device)

        with torch.no_grad():
            outputs = model.generate(
                input_ids=inputs['input_ids'],
                attention_mask=inputs['attention_mask'],
                max_length=512,
                do_sample=True,
                temperature=0.7,
                top_p=0.95,
                num_return_sequences=1,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )
        generated_code = tokenizer.decode(outputs[0], skip_special_tokens=True)
        generated_code = generated_code[len(prompt):]
        problem_candidates.append(generated_code)

    candidates.append(problem_candidates)



Problems: 100%|██████████| 10/10 [32:58<00:00, 197.84s/problem]


In [16]:
print("Evaluating generated code...")
k_values = [1, 5, 10, 20]
pass_at_k2, results2 = code_eval_metric.compute(
    references=test_cases,
    predictions=candidates,
    k=k_values,
    num_workers=4,
    timeout=10.0
)

print(pass_at_k2)

for k in k_values:
    print(f"Pass@{k}: {pass_at_k2[f'pass@{k}'] * 100:.2f}%")

Evaluating generated code...
{'pass@1': np.float64(0.0), 'pass@5': np.float64(0.0), 'pass@10': np.float64(0.0), 'pass@20': np.float64(0.0)}
Pass@1: 0.00%
Pass@5: 0.00%
Pass@10: 0.00%
Pass@20: 0.00%


In [17]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

In [18]:
model = prepare_model_for_kbit_training(model)

peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"]
)
model = get_peft_model(model, peft_config)


In [19]:
model.print_trainable_parameters()

trainable params: 1,126,400 || all params: 1,101,174,784 || trainable%: 0.1023


In [20]:
ds = load_dataset("openai_humaneval")

In [21]:
print(ds)

DatasetDict({
    test: Dataset({
        features: ['task_id', 'prompt', 'canonical_solution', 'test', 'entry_point'],
        num_rows: 164
    })
})


Okay, so code parrot works kinda at pass@5, 100%. Whereas that's not case with tinyllama. Lets see if i can finetune it to make this better.

Loading codealpaca dataset


In [22]:
dataset = load_dataset("OpenCoder-LLM/opc-sft-stage1",'realuser_instruct', split="train[:1000]")  # take a small subset


def format(example):
    return {
        "text": f"### Instruction:\n{example['instruction']}\n\n### Response:\n{example['output']}"
    }

dataset = dataset.map(format)


In [23]:
dataset

Dataset({
    features: ['instruction', 'output', 'tag', 'text'],
    num_rows: 1000
})

In [24]:
tokenizer.pad_token = tokenizer.eos_token
# Tokenize
def tokenize(example):
    tokens = tokenizer(example["text"], truncation=True, padding="max_length", max_length=512)
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens

tokenized = dataset.map(tokenize, batched=True)
tokenized.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [25]:
tokenized

Dataset({
    features: ['instruction', 'output', 'tag', 'text', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1000
})

In [26]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments

In [27]:

training_args = TrainingArguments(
    output_dir="./tinyllama-coding-lora",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    num_train_epochs=2,
    logging_steps=10,
    learning_rate=2e-4,
    fp16=False,
    save_total_limit=1,
    report_to="none"
)

In [28]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized,
)

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [29]:
trainer.train()

Step,Training Loss
10,2.0987
20,1.9707
30,1.7133
40,1.6064
50,1.4747
60,1.4909
70,1.4175
80,1.4079
90,1.3781
100,1.4107


TrainOutput(global_step=126, training_loss=1.548877027299669, metrics={'train_runtime': 291.35, 'train_samples_per_second': 6.865, 'train_steps_per_second': 0.432, 'total_flos': 6362964688896000.0, 'train_loss': 1.548877027299669, 'epoch': 2.0})

In [30]:
# Save LoRA adapter only
model.save_pretrained("./tinyllama-lora-trained")
tokenizer.save_pretrained("./tinyllama-lora-trained")

('./tinyllama-lora-trained/tokenizer_config.json',
 './tinyllama-lora-trained/special_tokens_map.json',
 './tinyllama-lora-trained/tokenizer.model',
 './tinyllama-lora-trained/added_tokens.json',
 './tinyllama-lora-trained/tokenizer.json')

In [31]:
# Fix special tokens
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '<pad>'})
if tokenizer.eos_token is None:
    tokenizer.add_special_tokens({'eos_token': '</s>'})

# Resize embeddings if new special tokens added
if len(tokenizer) > model.config.vocab_size:
    model.resize_token_embeddings(len(tokenizer))

num_samples_per_problem = 20
test_cases = []
candidates = []

for problem in tqdm(human_eval.select(range(10)), desc="Problems", unit="problem"):  # test on 10 problems
    prompt = problem['prompt']
    test_code = problem['test']
    test_cases.append(test_code)
    problem_candidates = []

    for _ in range(num_samples_per_problem):
        inputs = tokenizer(prompt, return_tensors="pt").to(device)

        with torch.no_grad():
            outputs = model.generate(
                input_ids=inputs['input_ids'],
                attention_mask=inputs['attention_mask'],
                max_length=512,
                do_sample=True,
                temperature=0.7,
                top_p=0.95,
                num_return_sequences=1,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )
        generated_code = tokenizer.decode(outputs[0], skip_special_tokens=True)
        generated_code = generated_code[len(prompt):]
        problem_candidates.append(generated_code)

    candidates.append(problem_candidates)


Problems: 100%|██████████| 10/10 [43:56<00:00, 263.63s/problem]


In [32]:
print("Evaluating generated code...")
k_values = [1, 5, 10, 20]
pass_at_k2, results2 = code_eval_metric.compute(
    references=test_cases,
    predictions=candidates,
    k=k_values,
    num_workers=4,
    timeout=10.0
)

print(pass_at_k2)

for k in k_values:
    print(f"Pass@{k}: {pass_at_k2[f'pass@{k}'] * 100:.2f}%")

Evaluating generated code...
{'pass@1': np.float64(0.010000000000000009), 'pass@5': np.float64(0.05), 'pass@10': np.float64(0.1), 'pass@20': np.float64(0.2)}
Pass@1: 1.00%
Pass@5: 5.00%
Pass@10: 10.00%
Pass@20: 20.00%
