# Imports & Setup

Download all the necessary dependencies. These should be exactly the ones present in the `environment.yaml` file.


In [2]:
# Install the runtime dependencies into the notebook environment (-q keeps pip's output short).
# NOTE(review): --no-build-isolation is presumably needed for flash-attn, whose wheel is
# built from source (see "Building wheel for flash-attn" in the output) — confirm.
!pip -q install numpy tqdm pandas transformers accelerate bitsandbytes nltk packaging ninja flash-attn --no-build-isolation

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m307.2/307.2 kB[0m [31m41.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m92.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.2/43.2 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m74.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for flash-attn (setup.py) ... [?25l[?25hdone


In [3]:
# Fetch the project sources; the clone lands in ./ATCS-project under the current working directory.
!git clone https://github.com/prundeanualin/ATCS-project.git

Cloning into 'ATCS-project'...
remote: Enumerating objects: 463, done.[K
remote: Counting objects: 100% (225/225), done.[K
remote: Compressing objects: 100% (143/143), done.[K
remote: Total 463 (delta 124), reused 149 (delta 71), pack-reused 238[K
Receiving objects: 100% (463/463), 273.97 KiB | 546.00 KiB/s, done.
Resolving deltas: 100% (260/260), done.


In [4]:
# IF YOU WANT TO TEST THINGS FROM YOUR OWN BRANCH, UNCOMMENT BELOW
# (-C points git at the clone, because this cell runs before the `%cd` into the repository)
# ! git -C /content/ATCS-project checkout split_dataset_iteration

In [5]:
# -C points git at the clone, because this cell runs before the `%cd` into the repository
# ! git -C /content/ATCS-project pull

In [6]:
! git status

fatal: not a git repository (or any of the parent directories): .git


In [7]:
# Switch into the cloned repository before running the imports below.
# NOTE(review): presumably required so the project-local modules (model, utils, evaluate, ...) resolve — confirm.
%cd /content/ATCS-project

/content/ATCS-project


In [8]:
import argparse

import torch
from tqdm import tqdm
from transformers import BitsAndBytesConfig
import pickle
import os
import time
import random

from datasets import ScanDataloader
from get_datasets import SCAN_EXAMPLES_FILEPATH
from prompt_processing.templates import ANALOGY_TEMPLATE_SIMPLE_INFERENCE, ANALOGY_TEMPLATE_SIMPLE_FULL
from prompt_processing.prompting import prepare_prompt
from model import LLMObj
from utils import *
from evaluate import *

# SECURITY: access tokens must never be committed to the notebook. The token that
# used to be hard-coded here has to be treated as leaked and revoked. Provide
# HF_TOKEN from outside the notebook instead (environment variable / secret store).
if not os.environ.get('HF_TOKEN'):
    print("WARNING: HF_TOKEN is not set in the environment; gated models (e.g. Llama) "
          "may fail to download unless the token is supplied through another mechanism.")

# NOTE(review): this flag only helps if the `hf_transfer` package is installed and the
# variable is read after being set here — confirm both.
os.environ['HF_HUB_ENABLE_HF_TRANSFER'] = '1'

# Create tensors on the GPU by default.
torch.set_default_device('cuda')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


# Inference

`LLMObj` is a HF wrapper that contains the LLM model, tokenizer, and text generation wrapper.

Below the class code, several LLMs that are available on HF are initialized.

For some models, such as Llama, you need to authenticate with your HF account, so add your [HF access token](https://huggingface.co/docs/hub/security-tokens) to the Colab secrets as `HF_TOKEN`.

## Model arguments

If you want to play around and change any arguments for when the model runs, here is the easiest place to do it.

In [62]:
# argparse cannot read a command line inside Colab, so the run configuration is
# exposed as class-level defaults on a Namespace subclass instead.
class Args(argparse.Namespace):
    # Hugging Face model id. Choices:
    #   'microsoft/Phi-3-mini-128k-instruct'
    #   'berkeley-nest/Starling-LM-7B-alpha'
    #   'meta-llama/Meta-Llama-3-8B-Instruct'
    model = "berkeley-nest/Starling-LM-7B-alpha"
    # Forwarded to LLMObj as `tokenizer_name`.
    tokenizer = None
    # "4bit" makes the model-loading cell build a 4-bit BitsAndBytesConfig.
    quantization = "4bit"
    # Forwarded to the model kwargs unchanged.
    low_cpu_mem_usage = True
    # Passed to seed_experiments().
    seed = 1234
    # Forwarded to LLMObj as `dummy_pipeline`.
    run_on_cpu = False
    save_filename_details = ''
    # Forwarded to the dataloader as `examples_shot_nr`.
    n_shot = 0
    # Substituted into SCAN_EXAMPLES_FILEPATH to pick the examples file.
    example_type = 'long'
    # Prompt-construction switches (consumed by prepare_prompt when it is used).
    include_task_description = False
    cot = True
    baseline = False
    # When non-empty, the inference loop skips samples of any other analogy type.
    analogy_type = ''


args = Args()

# Apply the configured seed. `seed_experiments` is not defined in this notebook;
# it presumably arrives through one of the star imports (utils / evaluate) — confirm.
seed_experiments(args.seed)

## Load the dataset

In [27]:
# ----- Load dataset -----
# Build the SCAN dataloader with shuffling disabled. The examples file is picked
# by `args.example_type` and the number of shots by `args.n_shot`.
dataloader = ScanDataloader(
    examples_file=SCAN_EXAMPLES_FILEPATH.format(args.example_type),
    examples_shot_nr=args.n_shot,
    analogy_sentence_infer=ANALOGY_TEMPLATE_SIMPLE_INFERENCE,
    analogy_sentence_full=ANALOGY_TEMPLATE_SIMPLE_FULL,
    shuffle=False,
)

SCAN datasets already downloaded.


## << CUSTOM MODEL >>

Here you can change specific parts of the code (model, evaluation etc) and test them in the inference cell at the bottom. If you don't want to change anything from the current branch cloned in the notebook, then leave this commented out.

In [11]:
# from transformers import (AutoTokenizer,
#                           pipeline
#                           )
# import textwrap

# from utils import DummyPipeline


# class LLMObj:
#     def __init__(self, model,
#                  model_kwargs,
#                  tokenizer_name,
#                  system_prompt="",
#                  # This is used on devices without a GPU, to make sure that the rest of the code runs ok
#                  dummy_pipeline=False
#                  ):

#         # If tokenizer name is empty, then load it based on the model's name
#         if not tokenizer_name:
#             tokenizer_name = model
#         tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

#         if dummy_pipeline:
#             pipe = DummyPipeline(tokenizer)
#         else:
#             pipe = pipeline(
#               "text-generation",
#               model=model,
#               tokenizer=tokenizer,
#               model_kwargs=model_kwargs,
#               trust_remote_code=True,
#               device_map="auto"
#             )

#         terminators = [
#             pipe.tokenizer.eos_token_id,
#             pipe.tokenizer.convert_tokens_to_ids("<|eot_id|>")
#         ]

#         if system_prompt != "":
#           system_prompt = system_prompt
#         else:
#           system_prompt = "You are a friendly and helpful assistant"

#         self.model = model
#         self.pipe = pipe
#         # self.generation_kwargs = generation_kwargs
#         self.terminators = terminators
#         self.chat_template = [
#             {
#                 "role": "system",
#                 "content": system_prompt,
#             },

#             {
#                 "role": "user",
#                 "content": ""
#             },
#         ]

#     #     self.chat_template = [
#     #       {
#     #       "role": "user",
#     #       "content": """
#     #       You are an expert in linguistics and you have a vast general knowledge. You can complete analogies of the form "If A is like B, then C is like..."
#     #       by finding the relationship between A and C and apply that relationship to B in order to find the answer D.
#     #       You will provide only your correct answer in maximum 3 words, without other explanations or statements.
#     #       """},
#     #       {
#     #       "role": "assistant",
#     #       "content": "Great! I will complete the analogies the best I can."
#     #       },
#     #        {
#     #           "role": "user",
#     #           "content": ""
#     #       }
#     # ]


#     def update_system_prompt(self, system_prompt):
#         self.chat_template[0]['content'] = system_prompt

#     def wrap_text(self, text, width=90):
#         """Fits text to specified character width."""
#         lines = text.split('\n')
#         wrapped_lines = [textwrap.fill(line, width=width) for line in lines]
#         wrapped_text = '\n'.join(wrapped_lines)
#         return wrapped_text

#     def generate(self, input_text, max_length=512):
#         if 'Starling' in self.model:
#           prompt = f"GPT4 Correct User: {input_text}<|end_of_turn|>GPT4 Correct Assistant:"

#         else:
#           self.chat_template[1]['content'] = input_text
#           prompt = self.pipe.tokenizer.apply_chat_template(
#               self.chat_template,
#               tokenize=False,
#               add_generation_prompt=True
#               )

#         outputs = self.pipe(
#             prompt,
#             max_new_tokens=max_length,
#             pad_token_id=self.pipe.tokenizer.pad_token_id,
#             eos_token_id=self.terminators,
#             do_sample=False,
#             # These are set only for the non-deterministic scenario with do_sample=True
#             # temperature=0.0,
#             # top_p=0.9,
#             # **generation_kwargs
#         )

#         generated_text = outputs[0]["generated_text"][len(prompt):]
#         wrapped_text = self.wrap_text(generated_text)
#         # display(Markdown(wrapped_text))
#         return generated_text


In [30]:
# from prompt_processing.templates import *

# def prepare_prompt(inference, examples, n_shot: int, baseline: bool, cot: bool, include_task_description: bool):
#     prompt = ''

#     # Possibly extend the prompt with the task description and some examples
#     if include_task_description:
#         prompt += ANALOGY_DESCRIPTION

#     # Zero-shot
#     if n_shot == 0:
#         # Add instruction to force short, direct answer
#         if baseline:
#             # prompt += STRUCTURED_BASELINE_INDICATION.format(inference)
#             prompt += BASELINE_INDICATION + inference
#         # Possibly add CoT instruction only if it is zero-shot
#         elif cot:
#             prompt += inference + " " + COT_INSTRUCTION # //TODO inspect results for the best place (before / after) to put the cot instruction. If first, also change filename in run.py-L132
#             # prompt += inference
#         else:
#             prompt += inference
#     # In case of one/few-shot, prepend the examples to the prompt
#     else:
#         for ex in examples:
#             if cot:
#                 example_answer = ex['analogy_detailed_cot']
#             else:
#               example_answer = ex['analogy_complete']
#             prompt += FEW_SHOT_TEMPLATE.format(ex['analogy_incomplete'], example_answer)
#             # prompt += ex['analogy_incomplete'] + "\n" + example_answer
#         # Add the inference analogy in the same Question/Answer template
#         prompt += FEW_SHOT_TEMPLATE.format(inference)
#     return prompt

## Load the model

In [63]:
# ----- Prepare model arguments -----
# Only the '4bit' setting produces a bitsandbytes config; any other value leaves
# `quantization_config` as None.
quantization = BitsAndBytesConfig(load_in_4bit=True) if args.quantization == '4bit' else None

# Keyword arguments handed to LLMObj as `model_kwargs`.
model_kwargs = dict(
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=args.low_cpu_mem_usage,
    quantization_config=quantization,
)

# Constructor arguments of the LLMObj wrapper, printed for traceability.
LLMObj_args = dict(
    model=args.model,
    model_kwargs=model_kwargs,
    tokenizer_name=args.tokenizer,
    dummy_pipeline=args.run_on_cpu,
)
print("LLMObj Arguments are:")
print(LLMObj_args)

# ----- Load the model -----
LLM = LLMObj(**LLMObj_args)

LLMObj Arguments are:
{'model': 'berkeley-nest/Starling-LM-7B-alpha', 'model_kwargs': {'torch_dtype': torch.bfloat16, 'low_cpu_mem_usage': True, 'quantization_config': BitsAndBytesConfig {
  "_load_in_4bit": true,
  "_load_in_8bit": false,
  "bnb_4bit_compute_dtype": "float32",
  "bnb_4bit_quant_storage": "uint8",
  "bnb_4bit_quant_type": "fp4",
  "bnb_4bit_use_double_quant": false,
  "llm_int8_enable_fp32_cpu_offload": false,
  "llm_int8_has_fp16_weight": false,
  "llm_int8_skip_modules": null,
  "llm_int8_threshold": 6.0,
  "load_in_4bit": true,
  "load_in_8bit": false,
  "quant_method": "bitsandbytes"
}
}, 'tokenizer_name': None, 'dummy_pipeline': False}


tokenizer_config.json:   0%|          | 0.00/1.61k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/560 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/115 [00:00<?, ?B/s]

## Run the inference pipeline

In [64]:
###########################
######### TESTING #########
###########################

# Upper bound on the number of iterations of the inference loop in this cell.
STOP_AT = 3
# Task description text. Not referenced by the active code in this cell.
ANALOGY_DESCRIPTION = """
You are an expert in analogy resolution. You understand and apply relational patterns, often involving linguistic, conceptual, or functional similarities.
You will now complete analogies that look like "If A is like B, then C is like ...", where you need to find the missing answer D. For this, you need to identify the relationship
between A and C and apply this relationship on the concept in B in order to find the answer for D. Ensure that the relationship is consistent and logical.
"""

# Instruction asking for a single bare answer. Only used by a commented-out prompt variant below.
BASELINE_INDICATION = """
Respond only with one answer! Give no explanation and no other words, apart from that one answer!
"""

# Question/answer template for the baseline; `{}` takes the analogy sentence.
# Only used by a commented-out prompt variant below.
STRUCTURED_BASELINE = """
Question: {}
Answer: The final answer is
"""

# Question/answer template with a chain-of-thought instruction; `{}` takes the
# analogy sentence. This is the template the active prompt line uses.
COT_STRUCTURED = """
Question: {}
Answer: Let's first think this step by step and then give the final answer at the end phrased like 'The answer is: ...'.
"""


# ----- Run inference-----
# Smoke test: prompt the model with up to STOP_AT randomly drawn samples (drawn
# with replacement) and time each generation.
durations = []
results = []

print("-- Running the model --")

n_samples = len(dataloader)
for i in range(min(STOP_AT, n_samples)):
    start = time.time()
    # The sample is picked at random, so there is no need to iterate over the
    # dataloader itself; only the iteration index is used.
    idx = random.randint(0, n_samples - 1)
    sample = dataloader[idx]

    # Optionally restrict the run to one analogy type. A skipped draw still
    # counts towards STOP_AT.
    if args.analogy_type and sample['analogy_type'] != args.analogy_type:
        continue

    # prompt = prepare_prompt(sample['inference'],
    #                         sample['examples'],
    #                         n_shot=args.n_shot,
    #                         baseline=args.baseline,
    #                         cot=args.cot,
    #                         include_task_description=args.include_task_description)

    # prompt = BASELINE_INDICATION + f"""
    # {sample['inference']}
    # """

    prompt = COT_STRUCTURED.format(sample['inference'])
    # prompt = STRUCTURED_BASELINE.format(sample['inference'])

    # prompt = sample['inference'] + ANSWER_FORMATTING_INSTRUCTION

    print("Prompt is: ")
    print(prompt)
    print("---------------\n")
    output = LLM.generate(prompt, 500)
    print(output)
    print("======================\n\n")

    # Store a copy without the few-shot examples instead of deleting the key in
    # place: the object may be owned by the dataloader, and deleting from it
    # would break a repeated draw of the same index or a re-run of this cell.
    # Assumes samples are dict-like — TODO confirm.
    sample_without_examples = {key: value for key, value in sample.items() if key != 'examples'}
    results.append([sample_without_examples, output])

    duration = time.time() - start
    durations.append(duration)
    print(f"Iteration index {i}/{n_samples - 1}: {duration:.2f} sec")

if STOP_AT < n_samples:
    print(f"Stopping at the first {STOP_AT} points from the dataset")

# Summarise timings with builtins: no dependency on an implicitly imported `np`,
# and no crash when every drawn sample was filtered out.
if durations:
    total_duration = sum(durations)
    print("Inference duration(sec): total - %.2f, avg - %.2f, max - %.2f, min - %.2f"
          % (total_duration, total_duration / len(durations), max(durations), min(durations)))
else:
    print("No samples were processed, so there are no inference durations to report.")

# ----- Evaluate -----
print("-- Evaluating the model --")
evaluation_results = evaluate(results, RegexEvaluationStrategy())
print("Evaluation results:")
print(evaluation_results)

# evaluation_metrics = {
#     "acc": acc_score
# }

-- Running the model --
Prompt is: 

Question: If debate is like battle, then argumentation is like...
Answer: Let's first think this step by step and then give the final answer at the end phrased like 'The answer is: ...'. 

---------------

 Debate is like a battle, which implies that it involves a struggle or contest between two opposing sides. In the context of debate, this struggle is over a particular point of view or argument. 

Argumentation, on the other hand, is the process of forming and presenting a coherent case or series of reasons in support of a particular proposition or point of view. It is not necessarily a contest or struggle, but rather a systematic and logical presentation of reasons and evidence. 

Therefore, if debate is like a battle, then argumentation is like a systematic and logical presentation of reasons and evidence. 

The answer is: a systematic and logical presentation of reasons and evidence.


Iteration index 0/448: 14.31 sec
Prompt is: 

Question: If 

In [None]:
I