# Imports & Setup

Download all the necessary dependencies. These should be exactly the ones present in the `environment.yaml` file.


In [1]:
!pip -q install numpy tqdm pandas transformers accelerate bitsandbytes

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
#!pip -q install accelerate
!pip -q install bitsandbytes


In [None]:
# !git clone https://github.com/prundeanualin/ATCS-project.git

Cloning into 'ATCS-project'...
remote: Enumerating objects: 87, done.[K
remote: Counting objects: 100% (87/87), done.[K
remote: Compressing objects: 100% (69/69), done.[K
remote: Total 87 (delta 22), reused 46 (delta 13), pack-reused 0[K
Receiving objects: 100% (87/87), 43.01 KiB | 1.72 MiB/s, done.
Resolving deltas: 100% (22/22), done.
fatal: not a git repository (or any of the parent directories): .git


In [None]:
# IF YOU WANT TO TEST THINGS FROM YOUR OWN BRANCH, UNCOMMENT BELOW
# ! git checkout <your_own_branch>

Branch 'first_run' set up to track remote branch 'first_run' from 'origin'.
Switched to a new branch 'first_run'


In [None]:
# ! git status

On branch first_run
Your branch is up to date with 'origin/first_run'.

nothing to commit, working tree clean


In [2]:
!unzip ATCS-project-main.zip

Archive:  ATCS-project-main.zip
   creating: ATCS-project-main/
  inflating: __MACOSX/._ATCS-project-main  
  inflating: ATCS-project-main/run.py  
  inflating: __MACOSX/ATCS-project-main/._run.py  
  inflating: ATCS-project-main/.DS_Store  
  inflating: __MACOSX/ATCS-project-main/._.DS_Store  
  inflating: ATCS-project-main/LICENSE  
  inflating: __MACOSX/ATCS-project-main/._LICENSE  
  inflating: ATCS-project-main/datasets.py  
  inflating: __MACOSX/ATCS-project-main/._datasets.py  
  inflating: ATCS-project-main/model.py  
  inflating: __MACOSX/ATCS-project-main/._model.py  
  inflating: ATCS-project-main/README.md  
  inflating: __MACOSX/ATCS-project-main/._README.md  
  inflating: ATCS-project-main/.gitignore  
  inflating: __MACOSX/ATCS-project-main/._.gitignore  
  inflating: ATCS-project-main/utils.py  
  inflating: __MACOSX/ATCS-project-main/._utils.py  
  inflating: ATCS-project-main/environment.yaml  
  inflating: __MACOSX/ATCS-project-main/._environment.yaml  
   creating: 

In [3]:
pwd

'/content'

In [4]:
%cd /content/ATCS-project-main

/content/ATCS-project-main


In [5]:
import argparse

from get_datasets import SCAN_EXAMPLES_FILEPATH, EXAMPLE_CATEGORIES
from prompt_templates.analogy import ANALOGY_TEMPLATE_SIMPLE_INFERENCE, ANALOGY_TEMPLATE_SIMPLE_FULL
from model import LLMObj
import torch
from tqdm import tqdm
from transformers import BitsAndBytesConfig
import pickle
from datasets import ScanDataset
import os

from utils import seed_experiments

os.environ['HF_TOKEN'] = "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"
os.environ['HF_HUB_ENABLE_HF_TRANSFER'] = '1'

torch.set_default_device('cuda')

# Inference

`LLMObj` is a HF wrapper that contains the LLM model, tokenizer, and text generation wrapper.

Below the class code, several LLMs that are available on HF are initialized.

For some models like LLama, you need to authenticate your HF account, so add your [HF access token](https://huggingface.co/docs/hub/security-tokens) to the secrets on secrets as `HF_TOKEN`.

## Model arguments

In [6]:
# Since ArgParser does not work in colab, we just construct a custom class with all our neccessary arguments
class Args(argparse.Namespace):
  model = "microsoft/Phi-3-mini-128k-instruct"
  tokenizer = "microsoft/Phi-3-mini-128k-instruct"
  quantization = "4bit"
  low_cpu_mem_usage = True
  seed=1234

args = Args()

seed_experiments(args.seed)

## Load the dataset

### SCAN

In [None]:
# Load the dataset
dataset = ScanDataset(
    shuffle=False,
    analogy_sentence_infer=ANALOGY_TEMPLATE_SIMPLE_INFERENCE,
    analogy_sentence_full=ANALOGY_TEMPLATE_SIMPLE_FULL,
    examples_file=SCAN_EXAMPLES_FILEPATH.format(EXAMPLE_CATEGORIES[0]),
    examples_start_idx=0,
    examples_shot_nr=1
)

SCAN datasets already downloaded.


In [None]:
for i, sample in tqdm(enumerate(dataset)):
    print(sample)
    break

0it [00:00, ?it/s]

{'inference': 'If atom is like solar system, then electron is like...', 'label': 'planet', 'alternatives': [], 'analogy_type': 'science'}





### BATS dataset

In [10]:
from itertools import combinations
import csv
import random
import pickle

BATS_FOLDER = 'data/BATS'
BATS_FILENAME = 'L01 [hypernyms - animals] sample'
COT_TEMPLATE = "Thinking step by step. "


ANALOGY_TEMPLATE_SIMPLE_FULL = "If {} is like {}, then {} is like {}."
ANALOGY_TEMPLATE_SIMPLE_INFERENCE = "If {} is like {}, then {} is like..."

class BATSDataloader_0shot:
    def __init__(self, dataFolder, fileName, numberOfAnalogy = False, cot=False, shuffle=False, promptType="by-relation", promptFormat=" If {} is like {}, {} is like ..."):
        self.dataFolder = dataFolder
        self.fileName = fileName
        self.promptFormat = promptFormat
        start = fileName.find('[') + 1
        end = fileName.find(']')
        self.analogyType = fileName[start:end]
        self.numberOfAnalogy = numberOfAnalogy
        self.promptType = promptType # promptType: by-relation, by-target-word
        self.COT = cot
        self.promptFormat = promptFormat
        self.shuffle = shuffle
        self.load_pairs()
        self.build_prompt()

    def load_pairs(self):
        '''
        Output: pairs = [[target, source, alternatives, analogyType], [target, source, alternatives, analogyType], ...]
        '''
        with open(f'{self.dataFolder}/{self.fileName}.txt', 'r') as f:
            lines = f.readlines()
        self.pairs = []
        for line in lines:
            target, values = line.strip().split('\t')
            values = values.split('/')
            # remove value that have underscore
            values = [value for value in values if '_' not in value]
            # only select the first element in value list as label/attribute, the rest are alternatives
            source = values[0]
            alternatives = [value for value in values[1:] if value]
            self.pairs.append([target, source, alternatives, self.analogyType])

        # # save as csv
        # with open(f'{self.dataFolder}/{self.fileName}_pairs.csv', 'w') as f:
        #     writer = csv.writer(f)
        #     writer.writerow(['target', 'source', 'alternatives', 'analogy_type'])
        #     writer.writerows(self.pairs)

        # save as pickle
        with open(f'{self.dataFolder}/{self.fileName}_pairs.pkl', 'wb') as f:
            pickle.dump(self.pairs, f)


    def build_prompt(self):
        '''
        Output:
        '''
        self.prompt = []
        if self.promptType == "by-relation":
            # combination of 2 pairs and generate prompt
            for pair in combinations(self.pairs, 2):
                # check if same analogy type
                if pair[0][3] != pair[1][3]:
                    continue
                inference = self.promptFormat.format(pair[0][0], pair[0][1], pair[1][0])
                if self.COT:
                    inference = self.COT + inference
                self.prompt.append({'inference': inference, 'label': pair[1][1], 'alternatives': pair[1][2], 'analogy_type': pair[0][3]})
        elif self.promptType == "by-target-word":
            for pair in combinations(self.pairs, 2):
                # check if same analogy type
                if pair[0][3] != pair[1][3]:
                    continue
                inference = self.promptFormat.format(pair[0][0], pair[1][0], pair[0][1])
                if self.COT:
                    inference = self.COT + inference
                self.prompt.append({'inference': inference, 'label': pair[1][1], 'alternatives': pair[1][2], 'analogy_type': pair[0][3]})
        else:
            print("Invalid prompt type. Either by-relation or by-target-word")

        prompt_condition = self.promptFormat.split(",")[0]
        fname = f"0shot_prompt_{self.fileName}_{self.COT if self.COT else 'no_COT'}_{self.promptType}_{prompt_condition}"
        # # save as csv
        # with open(f'{self.dataFolder}/{fname}.csv', 'w') as f:
        #     writer = csv.DictWriter(f, fieldnames=self.prompt[0].keys())
        #     writer.writeheader()
        #     writer.writerows(self.prompt)

        # save as pickle
        with open(f'{self.dataFolder}/{fname}.pkl', 'wb') as f:
            pickle.dump(self.prompt, f)

    def __call__(self):
        if self.shuffle:
            random.seed(42)
            random.shuffle(self.prompt)
        # return only the number of analogy specified (if does not exceed the total number of analogy)
        if self.numberOfAnalogy and self.numberOfAnalogy < len(self.pairs):
            return self.prompt[:self.numberOfAnalogy]
        return self.prompt


BATS_dataset = BATSDataloader_0shot(BATS_FOLDER, BATS_FILENAME, numberOfAnalogy = 2, cot = COT_TEMPLATE, promptType='by-target-word', promptFormat = ANALOGY_TEMPLATE_SIMPLE_INFERENCE)
BATS_dataset()

[{'inference': 'Thinking step by step. If allosaurus is like anaconda, then dinosaur is like...',
  'label': 'snake',
  'alternatives': ['reptile', 'boa', 'serpent', 'ophidian'],
  'analogy_type': 'hypernyms - animals'},
 {'inference': 'Thinking step by step. If allosaurus is like ant, then dinosaur is like...',
  'label': 'insect',
  'alternatives': ['invertebrate'],
  'analogy_type': 'hypernyms - animals'}]

In [8]:
for i, sample in tqdm(enumerate(BATS_dataset())):
  print(sample)
  break

0it [00:00, ?it/s]

{'inference': 'Thinking step by step. If allosaurus is like anaconda, then dinosaur is like...', 'label': 'snake', 'alternatives': ['reptile', 'boa', 'serpent', 'ophidian'], 'analogy_type': 'hypernyms - animals'}





## Load the model

In [8]:
quantization = None
if args.quantization == '4bit':
    quantization = BitsAndBytesConfig(load_in_4bit=True)

model_kwargs = {
    "torch_dtype": torch.bfloat16,
    "low_cpu_mem_usage": args.low_cpu_mem_usage,
    "quantization_config": quantization
}
LLMObj_args = {
    'model': args.model,
    'model_kwargs': model_kwargs,
    'tokenizer_name': args.tokenizer
}
print("LLMObj Arguments are:")
print(LLMObj_args)

# Load the model
LLM = LLMObj(**LLMObj_args)

LLMObj Arguments are:
{'model': 'microsoft/Phi-3-mini-128k-instruct', 'model_kwargs': {'torch_dtype': torch.bfloat16, 'low_cpu_mem_usage': True, 'quantization_config': BitsAndBytesConfig {
  "_load_in_4bit": true,
  "_load_in_8bit": false,
  "bnb_4bit_compute_dtype": "float32",
  "bnb_4bit_quant_storage": "uint8",
  "bnb_4bit_quant_type": "fp4",
  "bnb_4bit_use_double_quant": false,
  "llm_int8_enable_fp32_cpu_offload": false,
  "llm_int8_has_fp16_weight": false,
  "llm_int8_skip_modules": null,
  "llm_int8_threshold": 6.0,
  "load_in_4bit": true,
  "load_in_8bit": false,
  "quant_method": "bitsandbytes"
}
}, 'tokenizer_name': 'microsoft/Phi-3-mini-128k-instruct'}


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/3.17k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/568 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/3.35k [00:00<?, ?B/s]

configuration_phi3.py:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-128k-instruct:
- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_phi3.py:   0%|          | 0.00/73.8k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-128k-instruct:
- modeling_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/16.3k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/172 [00:00<?, ?B/s]

## Run the inference pipeline

### BATS

In [15]:
# Stop at just 3 generations, just to see it in action
stop_at_datapoint_idx = 2

# Run inference
generated_prompts = []
for i, sample in tqdm(enumerate(BATS_dataset())):
  print(sample['inference'])
  output = LLM.generate(sample['inference'])
  print(output)
  generated_prompts.append([sample, output])
  if i >= stop_at_datapoint_idx:
    break




Thinking step by step. If allosaurus is like anaconda, then dinosaur is like...


1it [00:59, 59.02s/it]

 To solve this analogy, let's break it down step by step:

1. Identify the relationship between the first pair of words.
   - Allosaurus and Anaconda: The relationship here is that of a predator to its prey. An
Allosaurus is a type of large theropod dinosaur, which means it was a predator. Anaconda,
on the other hand, is a type of large snake, which means it is a predator as well.

2. Apply the same relationship to the second pair of words.
   - Dinosaur and?: Since we've established that a dinosaur is a predator (in the context
of this analogy), we need to find something that has a similar relationship to a dinosaur.

3. Consider the options and find the best fit.
   - We could think of a variety of options, such as prey, habitat, or a different
predator. However, since we're looking for a single word that best fits the analogy, let's
consider the most direct relationship.

4. Determine the most fitting word.
   - Prey: This is a direct relationship to predators. Dinosaurs, like all p

2it [01:58, 59.28s/it]

 To solve this riddle, let's break it down step by step and think logically about the
relationships being suggested:

1. The first part of the statement says, "If Allosaurus is like an ant."
   - This part of the statement is a simile, comparing the Allosaurus to an ant in some
way.
   - We know that an ant is small and considered a predator in its own right.
   - However, this comparison might not be the most accurate since Allosaurus was a large
theropod dinosaur. This part of the riddle might be a playful exaggeration or a misleading
clue.

2. The second part of the statement says, "then Dinosaur is like..."
   - Here, we need to determine what the Allosaurus (as a representative of dinosaurs) is
being compared to.
   - Since the first part of the statement is a bit confusing, let's focus on the dinosaur
as a whole.

3. To find a logical comparison, let's consider the characteristics of dinosaurs in
general:
   - Dinosaurs were a diverse group of animals.
   - They varied greatly in

2it [02:46, 83.31s/it]

 To continue the analogy in a logical and creative way, you would think of a relationship
or characteristic that a snake shares with another animal that is not a direct match but
has a similar or contrasting quality to a python (since you mentioned Anaconda as a type
of python and an ant as a small insect). Here's a step-by-step thought process:

1. Identify the key characteristics of a snake:
   - Scales
   - Long, slender body
   - Ability to slither
   - Venomous or constricting nature (in some species)
   - Cold-blooded (ectothermic)
   - Solitary or territorial behavior

2. Consider the characteristics of the animal you are comparing it to (in this case, an
ant):
   - Small size
   - Hard exoskeleton
   - Social structure (in the case of ants, they live in colonies)
   - Warm-blooded (endothermic)
   - Active and industrious behavior

3. Find a contrast or a unique relationship between the two:
   - Since ants are warm-blooded and social, while snakes are cold-blooded and mostly
s




### SCAN

In [None]:
# Stop at just 3 generations, just to see it in action
stop_at_datapoint_idx = 3

# Run inference
generated_prompts = []
for i, sample in tqdm(enumerate(dataset)):
  print(sample['inference'])
  output = LLM.generate(sample['inference'])
  generated_prompts.append([sample, output])
  if i >= stop_at_datapoint_idx:
    break

with open(f'{args.model.split("/")[1]}_generated_prompts.pl', 'wb') as f:
    pickle.dump(generated_prompts, f)




If atom is like solar system, then electron is like...


1it [01:01, 61.87s/it]

If atom is like solar system, then charge is like...


2it [01:58, 58.98s/it]

If atom is like solar system, then attracts is like...


3it [02:10, 37.51s/it]

If atom is like solar system, then revolves is like...


3it [02:38, 52.72s/it]


In [None]:
import pandas as pd
obj = pd.read_pickle(f'{args.model.split("/")[1]}_generated_prompts.pl')
for el in obj:
  print(el)

[{'inference': 'If atom is like solar system, then electron is like...', 'label': 'planet', 'alternatives': [], 'analogy_type': 'science'}, " If an atom is like the solar system, then an electron is like the planets orbiting the\nsun.\n\nExplanise: This analogy helps to understand the behavior of electrons in an atom. Just as\nthe planets revolve around the sun in the solar system, electrons move in specific orbits\nor energy levels around the nucleus of an atom. However, it's important to note that this\nanalogy has its limitations, as electrons do not follow classical orbits but rather exist\nin probabilistic distributions described by quantum mechanics. If an atom is like a solar\nsystem, then the nucleus would be like the sun, and the electrons would be like the\nplanets orbiting it. If an atom is like a solar system, then the nucleus would be like the\nsun, and the electrons would be like the planets orbiting it. However, it's important to\nnote that this analogy has its limitatio