In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch
from peft import PeftModel, PeftConfig
from util import task_to_prompt

In [2]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False,
)

In [3]:
def create_model(use_finetuned):
    peft_model_id = "finetuned-models/Starling-LM-7B-alpha-finetuned"

    config = PeftConfig.from_pretrained(peft_model_id)
    model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path, 
                                                        quantization_config=bnb_config, 
                                                        return_dict=True, 
                                                        load_in_4bit=True, 
                                                        device_map="auto")

    tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

    # Load the Lora model
    if use_finetuned:
        model = PeftModel.from_pretrained(model, peft_model_id)

    return model, tokenizer

In [4]:
def generate_response(prompt, model, tokenizer, max_new_tokens=1024, temperature=0.5):
    input_ids = tokenizer(prompt, return_tensors="pt").to(model.device)
    token_length = len(input_ids["input_ids"][0])
    if token_length > 8192:
        print("Input too long: ", token_length)
        return prompt + "\n\nInput too long. Please try again."
    print("token length:", token_length)
    outputs = model.generate(**input_ids, do_sample=True, temperature=temperature, pad_token_id=tokenizer.eos_token_id, max_new_tokens=max_new_tokens)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Fine-Tuned

## Testing Evaluation Dataset

In [6]:
from tqdm.notebook import tqdm
import os
model, tokenizer = create_model(use_finetuned=True)

for task in tqdm(os.listdir("data/evaluation")):
    if os.path.exists(f"fine_tuned_results/evaluation/{task.replace('.json', '.txt')}"):
        continue
    prompt, output = task_to_prompt("data/evaluation/" + task)
    print(f"{task}", end=" ")
    full_prompt = f"GPT4 Correct User: {prompt}<|end_of_turn|>GPT4 Correct Assistant:"
    output = generate_response(full_prompt, model=model, tokenizer=tokenizer, max_new_tokens=2048, temperature=0.5)
    with open(f"fine_tuned_results/evaluation/{task.replace('.json', '.txt')}", "w") as f:
        f.write(output)

  0%|          | 0/400 [00:00<?, ?it/s]

c97c0139.json token length: 5775
ea9794b1.json token length: 2667
5d2a5c43.json token length: 1434
fea12743.json token length: 3768
ae58858e.json token length: 2187
0f63c0b9.json token length: 6162


This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (8192). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.


e99362f0.json token length: 2556
195ba7dc.json token length: 1422
f3cdc58f.json token length: 2172
a8610ef7.json token length: 1059
84db8fc4.json token length: 2787
4b6b68e5.json token length: 7986
ff72ca3e.json token length: 4173
8ee62060.json token length: 2988
0b17323b.json token length: 3432
c074846d.json token length: 1449
e345f17b.json token length: 759
50aad11f.json token length: 2586
e4075551.json token length: 4662
66e6c45b.json token length: 297
358ba94e.json token length: 4791
d017b73f.json token length: 852
4c177718.json token length: 5082
b7999b51.json token length: 2769
3979b1a8.json token length: 882
bf32578f.json token length: 1296
d19f7514.json token length: 1095
b0722778.json token length: 900
136b0064.json token length: 3105
5b526a93.json token length: 5997
ef26cbf6.json token length: 1212
e633a9e5.json token length: 405
62ab2642.json token length: 1962
73c3b0d8.json token length: 1629
08573cc6.json token length: 2895
c48954c1.json token length: 909
31d5ba1a.json tok

## Testing Train Dataset

In [1]:
from tqdm.notebook import tqdm
import os

model, tokenizer = create_model(use_finetuned=True)
for task in tqdm(os.listdir("data/training")):
    if os.path.exists(f"fine_tuned_results/training/{task.replace('.json', '.txt')}"):
        continue
    prompt, output = task_to_prompt("data/training/" + task)
    print(f"{task}", end=" ")
    full_prompt = f"GPT4 Correct User: {prompt}<|end_of_turn|>GPT4 Correct Assistant:"
    output = generate_response(full_prompt, model=model, tokenizer=tokenizer, max_new_tokens=2048, temperature=0.5)
    with open(f"fine_tuned_results/training/{task.replace('.json', '.txt')}", "w") as f:
        f.write(output)

  0%|          | 0/400 [00:00<?, ?it/s]

## Testing Concept ARC Dataset

In [17]:
from tqdm.notebook import tqdm
import os

model, tokenizer = create_model(use_finetuned=True)

for subdir, dirs, files in tqdm(os.walk("data/ConceptARC/")):
    for file in files:
        if file.endswith(".json"):
            if os.path.exists(f"fine_tuned_results/ConceptARC/{file.replace('.json', '.txt')}"):
                continue
            concepts = subdir.replace('data', 'fine_tuned_results')
            if not os.path.exists(concepts):
                os.mkdir(concepts)
            prompt, output = task_to_prompt(os.path.join(subdir, file))
            print(f"{file}", end=" ")
            full_prompt = f"GPT4 Correct User: {prompt}<|end_of_turn|>GPT4 Correct Assistant:"
            output = generate_response(full_prompt, model=model, tokenizer=tokenizer, max_new_tokens=2048, temperature=0.5)
            with open(f"{concepts}/{file.replace('.json', '.txt')}", "w") as f:
                f.write(output)

0it [00:00, ?it/s]

ExtendToBoundary2.json token length: 5220
ExtendToBoundary6.json token length: 2199
ExtendToBoundary3.json token length: 2550
ExtendToBoundary7.json token length: 1749
ExtendToBoundary9.json token length: 4809
ExtendToBoundary1.json token length: 1353
ExtendToBoundary5.json token length: 1476
ExtendToBoundary10.json token length: 2097
ExtendToBoundary4.json token length: 1059
ExtendToBoundary8.json token length: 2583
ExtendToBoundaryMinimal.json token length: 1059
MoveToBoundary8.json token length: 2901
MoveToBoundary10.json token length: 1827
MoveToBoundary4.json token length: 1557
MoveToBoundary5.json token length: 1557
MoveToBoundary1.json token length: 1305
MoveToBoundary9.json token length: 2025
MoveToBoundary7.json token length: 2565
MoveToBoundary3.json token length: 1557
MoveToBoundary6.json token length: 1059
MoveToBoundary2.json token length: 2172
MoveToBoundaryMinimal.json token length: 1557
ExtractObjects7.json token length: 1182
ExtractObjects3.json token length: 2160
Extr

# Base Model

## Testing Evaluation Dataset

In [5]:
from tqdm.notebook import tqdm
import os

model, tokenizer = create_model(use_finetuned=False)

for task in tqdm(os.listdir("data/evaluation")):
    # if os.path.exists(f"basemodel_results/evaluation/{task.replace('.json', '.txt')}"):
    #     continue
    prompt, output = task_to_prompt("data/evaluation/" + task)
    print(f"{task}", end=" ")
    full_prompt = f"GPT4 Correct User: {prompt}<|end_of_turn|>GPT4 Correct Assistant:"
    output = generate_response(full_prompt, model=model, tokenizer=tokenizer, max_new_tokens=2048, temperature=0.5)
    with open(f"basemodel_results/evaluation_with_prompt/{task.replace('.json', '.txt')}", "w") as f:
        f.write(output)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

## Testing Train Dataset

In [6]:
from tqdm.notebook import tqdm
import os

model, tokenizer = create_model(use_finetuned=False)

for task in tqdm(os.listdir("data/training")):
    # if os.path.exists(f"basemodel_results/training/{task.replace('.json', '.txt')}"):
    #     continue
    prompt, output = task_to_prompt("data/training/" + task)
    print(f"{task}", end=" ")
    full_prompt = f"GPT4 Correct User: {prompt}<|end_of_turn|>GPT4 Correct Assistant:"
    output = generate_response(full_prompt, model=model, tokenizer=tokenizer, max_new_tokens=2048, temperature=0.5)
    with open(f"basemodel_results/training_with_prompt/{task.replace('.json', '.txt')}", "w") as f:
        f.write(output)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


  0%|          | 0/400 [00:00<?, ?it/s]

5bd6f4ac.json token length: 1748
9aec4887.json token length: 3536
cdecee7f.json token length: 1691
ba26e723.json token length: 1655
aabf363d.json token length: 1130
e98196ab.json token length: 2357
9172f3a0.json token length: 962
23b5c85d.json token length: 4868
beb8660c.json token length: 1316
cbded52d.json token length: 1754
d90796e8.json token length: 1199
1f876c06.json token length: 2510
e8593010.json token length: 2510
56dc2b01.json token length: 2030
673ef223.json token length: 4478
3e980e27.json token length: 4988
272f95fa.json token length: 4220
a85d4709.json token length: 668
f2829549.json token length: 1124
f76d97a5.json token length: 839
be94b721.json token length: 1595
1190e5a7.json token length: 5219
b60334d2.json token length: 1610
1bfc4729.json token length: 1895
c444b776.json token length: 5240
62c24649.json token length: 842
6ecd11f4.json token length: 7202
fafffa47.json token length: 899
aba27056.json token length: 1706
045e512c.json Input too long:  9671
28e73c20.jso

This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (8192). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.


5614dbcf.json token length: 1178
eb5a1d5d.json token length: 7664
67a3c6ac.json token length: 1043
8be77c9e.json token length: 680
6a1e5592.json token length: 2645
ed36ccf7.json token length: 668
72ca375d.json token length: 1670
dc0a314f.json token length: 3707
68b16354.json token length: 1151
6fa7a44f.json token length: 776
b94a9452.json token length: 2438
dc433765.json token length: 2801
91413438.json token length: 2882
3631a71a.json Input too long:  24725
8f2ea7aa.json token length: 2111
6b9890af.json token length: 6812
60b61512.json token length: 1610
90c28cc7.json token length: 5759
1f0c79e5.json token length: 2612
11852cab.json token length: 2510
09629e4f.json token length: 3692
98cf29f8.json token length: 5564
a740d043.json token length: 959
963e52fc.json token length: 1490
0a938d79.json token length: 6122
67e8384a.json token length: 992
47c1f68c.json token length: 2270
ce22a75a.json token length: 1610
c909285e.json Input too long:  8687
d511f180.json token length: 716
868de0fa.

## Testing Concept ARC Dataset

In [7]:
from tqdm.notebook import tqdm
import os

model, tokenizer = create_model(use_finetuned=False)

for subdir, dirs, files in tqdm(os.walk("data/ConceptARC/")):
    for file in files:
        if file.endswith(".json"):
            if os.path.exists(f"basemodel_results/ConceptARC/{file.replace('.json', '.txt')}"):
                continue
            concepts = subdir.replace('data', 'basemodel_results')
            if not os.path.exists(concepts):
                os.mkdir(concepts)
            prompt, output = task_to_prompt(os.path.join(subdir, file))
            print(f"{file}", end=" ")
            full_prompt = f"GPT4 Correct User: {prompt}<|end_of_turn|>GPT4 Correct Assistant:"
            output = generate_response(full_prompt, model=model, tokenizer=tokenizer, max_new_tokens=2048, temperature=0.5)
            with open(f"{concepts}/{file.replace('.json', '.txt')}", "w") as f:
                f.write(output)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


0it [00:00, ?it/s]

ExtendToBoundary2.json token length: 5220
ExtendToBoundary6.json token length: 2199
ExtendToBoundary3.json token length: 2550
ExtendToBoundary7.json token length: 1749
ExtendToBoundary9.json token length: 4809
ExtendToBoundary1.json token length: 1353
ExtendToBoundary5.json token length: 1476
ExtendToBoundary10.json token length: 2097
ExtendToBoundary4.json token length: 1059
ExtendToBoundary8.json token length: 2583
ExtendToBoundaryMinimal.json token length: 1059
MoveToBoundary8.json token length: 2901
MoveToBoundary10.json token length: 1827
MoveToBoundary4.json token length: 1557
MoveToBoundary5.json token length: 1557
MoveToBoundary1.json token length: 1305
MoveToBoundary9.json token length: 2025
MoveToBoundary7.json token length: 2565
MoveToBoundary3.json token length: 1557
MoveToBoundary6.json token length: 1059
MoveToBoundary2.json token length: 2172
MoveToBoundaryMinimal.json token length: 1557
ExtractObjects7.json token length: 1182
ExtractObjects3.json token length: 2160
Extr