# Load the Context

In [1]:
import os
import re
import random
import json
# Mount Google Drive so files stored there are reachable from the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

# Change this to the Google Drive folder where this notebook is located.
# The working directory is switched to it, so the relative paths used later in the
# notebook (e.g. 'datasets/...' and 'PromptEngineering/...') are resolved against it.
drive_path = '/content/drive/MyDrive/Projects/CryptoniteAnalysis/'
os.chdir(drive_path)

!pip install openai
import openai
import google.generativeai as genai
import copy

!pip install datasets
from datasets import load_dataset, load_from_disk

def load_dataset_from_disk():
    """Load the official Cryptonite train/validation/test splits from JSONL files.

    The three files are looked up under 'datasets/cryptonite-official-split/',
    relative to the current working directory.

    Returns:
        The object produced by ``load_dataset('json', data_files=...)`` with the
        split names 'train', 'validation' and 'test'.
    """
    root = 'datasets/cryptonite-official-split/'
    split_files = {
        'train': root + 'cryptonite-train.jsonl',
        'validation': root + 'cryptonite-val.jsonl',
        'test': root + 'cryptonite-test.jsonl',
    }
    return load_dataset('json', data_files=split_files)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# LLM ChatBots

In [2]:
# @title GPT Chatbot
# Prefer the OPENAI_API_KEY environment variable so the secret does not have to be
# pasted into (and saved with) the notebook; the placeholder is only a fallback.
API_KEY = os.environ.get("OPENAI_API_KEY", "YOUR OPENAI API KEY")

# define the openai interface
def try_query_GPT(**request_body):
    """Send a single chat-completion request to OpenAI and return the raw response.

    Args:
        **request_body: Keyword arguments forwarded unchanged to
            ``client.chat.completions.create``.

    Returns:
        The response object returned by the OpenAI client.
    """
    # A new client, authenticated with the module-level API_KEY, is built on every call.
    return openai.OpenAI(api_key=API_KEY).chat.completions.create(**request_body)

def accept_gpt_response(response):
    """Tell whether a chat-completion response can be used.

    Currently the only criterion is that the first choice ended with the
    finish reason "stop"; any other finish reason rejects the response.

    Args:
        response: Object exposing ``choices[0].finish_reason``.

    Returns:
        True when the response is acceptable, False otherwise.
    """
    finished_normally = response.choices[0].finish_reason == "stop"
    # Further acceptance checks can be added here later.
    return True if finished_normally else False

def query_GPT(**request_body):
    """Query OpenAI, retrying until a response is accepted, and return its text.

    The request is sent once and then re-sent up to 11 more times while
    ``accept_gpt_response`` rejects the result. Every response, including the
    one from the final retry, is checked before giving up.

    Args:
        **request_body: Keyword arguments forwarded to ``try_query_GPT``.

    Returns:
        The message content of the first choice of the accepted response.

    Raises:
        Exception: "Query failed" when no attempt produced an acceptable response.
            Exceptions raised by ``try_query_GPT`` itself are not caught here.
    """
    max_retries = 11  # same number of re-sends the original loop performed
    response = try_query_GPT(**request_body)
    retries = 0
    while not accept_gpt_response(response):
        # Give up only after the latest response has been checked and rejected,
        # so a successful final retry is no longer thrown away.
        if retries >= max_retries:
            raise Exception("Query failed")
        response = try_query_GPT(**request_body)
        retries += 1
    return response.choices[0].message.content

# Request body used when a GPTChatBot is created without arguments.
default_request_body = {
    "model": "gpt-4o-mini",
    "messages": [{"role": "system", "content": "You are a helpful assistant."}],
    "temperature": 0.7,
}


class GPTChatBot:
    """Thin wrapper around ``query_GPT`` that keeps a fixed conversation prefix.

    ``chat_history`` holds the messages sent before every prompt. ``chat`` never
    records the exchange itself; callers extend or replace ``chat_history``
    explicitly when they want later prompts to see earlier turns.
    """

    def __init__(self, initial_request_body=default_request_body):
        """Store a private copy of the request body.

        Args:
            initial_request_body: Dict with at least the keys "messages" and "model".

        Raises:
            ValueError: If "messages" or "model" is missing.
        """
        if "messages" not in initial_request_body:
            raise ValueError("messages not in request_body")
        if "model" not in initial_request_body:
            raise ValueError("model not in request_body")
        # Deep copy so neither the caller's dict nor the shared default is mutated.
        self.initial_request_body = copy.deepcopy(initial_request_body)

        self.chat_history = self.initial_request_body["messages"]

    def chat(self, prompt):
        """Send ``prompt`` after the current chat history and return the reply text.

        The conversation is not added to the history.
        """
        # Copy every setting except the messages; those are taken from
        # self.chat_history so that a history installed through set_chat_history
        # (or appended to in place) is what is actually sent.
        temp_request_body = {
            key: copy.deepcopy(value)
            for key, value in self.initial_request_body.items()
            if key != "messages"
        }
        messages = copy.deepcopy(list(self.chat_history))
        messages.append({"role": "user", "content": prompt})
        temp_request_body["messages"] = messages
        response = query_GPT(**temp_request_body)
        return response

    def set_chat_history(self, chat_history):
        """Replace the messages that are sent before every prompt."""
        self.chat_history = chat_history
        # Keep the stored request body in step with the new history.
        self.initial_request_body["messages"] = chat_history


In [3]:
# @title Gemini Chatbot
# Prefer the GEMINI_API_KEY environment variable so the secret does not have to be
# pasted into (and saved with) the notebook; the placeholder is only a fallback.
GEMINI_KEY = os.environ.get("GEMINI_API_KEY", "YOUR GEMINI API KEY")

# define the Gemini interface
def try_query_Gemini(**request_body):
    """Send one prompt to Gemini through a newly started chat session.

    Args:
        **request_body: Must contain the keys
            "model" (object exposing ``start_chat``),
            "history" (passed to ``start_chat``),
            "prompt" (the message to send) and
            "generation_config" (passed to ``send_message``).

    Returns:
        Whatever ``send_message`` returns.
    """
    session = request_body["model"].start_chat(history=request_body['history'])
    user_prompt = request_body["prompt"]
    return session.send_message(user_prompt, generation_config=request_body["generation_config"])

def accept_Gemini_response(response):
    """Tell whether a Gemini response can be used.

    Currently the only criterion is that the response's ``_done`` attribute is truthy.

    Args:
        response: Object exposing a ``_done`` attribute.

    Returns:
        True when the response is acceptable, False otherwise.
    """
    is_complete = True if response._done else False
    # Further acceptance checks can be added here later.
    return is_complete

def query_Gemini(**request_body):
    """Query Gemini, retrying until a response is accepted.

    The request is sent once and then re-sent up to 11 more times while
    ``accept_Gemini_response`` rejects the result. Every response, including the
    one from the final retry, is checked before giving up.

    Args:
        **request_body: Keyword arguments forwarded to ``try_query_Gemini``.

    Returns:
        The accepted response object.

    Raises:
        Exception: "Query failed" when no attempt produced an acceptable response.
            Exceptions raised by ``try_query_Gemini`` itself are not caught here.
    """
    max_retries = 11  # same number of re-sends the original loop performed
    response = try_query_Gemini(**request_body)
    retries = 0
    while not accept_Gemini_response(response):
        # Give up only after the latest response has been checked and rejected,
        # so a successful final retry is no longer thrown away.
        if retries >= max_retries:
            raise Exception("Query failed")
        response = try_query_Gemini(**request_body)
        retries += 1
    return response


class GeminiChatBot:
    """Thin wrapper around ``query_Gemini`` that keeps a fixed conversation prefix.

    ``chat_history`` is handed to Gemini as the history of every request.
    ``chat`` does not record the exchange; callers change ``chat_history``
    themselves when later prompts should see earlier turns.
    """

    def __init__(self, system_prompt="You are a helpful assistant.", gemini_model="gemini-1.5-flash", temperature=0.7):
        """Configure the Gemini client and build the model and generation config.

        Args:
            system_prompt: Passed to the model as its system instruction.
            gemini_model: Name of the Gemini model to use.
            temperature: Sampling temperature stored in the generation config.
        """
        genai.configure(api_key=GEMINI_KEY)
        self.model = genai.GenerativeModel(
            model_name=gemini_model,
            system_instruction=system_prompt,
        )
        self.generation_config = genai.types.GenerationConfig(temperature=temperature)
        # Starts empty; filled through set_chat_history or by appending in place.
        self.chat_history = []

    def chat(self, prompt):
        """Send ``prompt`` after the current chat history and return the reply text.

        The exchange is not stored, so repeated calls all start from the same
        ``chat_history``.
        """
        reply = query_Gemini(
            model=self.model,
            generation_config=self.generation_config,
            history=self.chat_history,
            prompt=prompt,
        )
        return reply.text

    def set_chat_history(self, chat_history):
        """Replace the history that is sent before every prompt."""
        self.chat_history = chat_history



# In context learning

## Load Context to Chatbots

In [4]:
# Read the solver's system prompt. UTF-8 is requested explicitly so the text does
# not depend on the platform's default encoding.
with open('PromptEngineering/InContextLearningExamples/SystemInstructions.md', 'r', encoding='utf-8') as file:
    # Read the content of the file
    solver_system_prompt = file.read()

# Load the in-context examples: one JSON object per line.
data = []
with open('PromptEngineering/InContextLearningExamples/ProcessedExamples.jsonl', 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        # Skip blank lines (e.g. a trailing empty line), which json.loads would reject.
        if not line:
            continue
        # Convert each line from JSON string to a dictionary and append to the list
        data.append(json.loads(line))

import random

def load_gpt_chat_bot(gpt_model = "gpt-4o-2024-08-06", num_examples = 30):
    """Build a few-shot GPT solver chatbot and a GPT-4o information extractor.

    The solver's message list starts with ``solver_system_prompt`` followed by
    randomly chosen examples from ``data``, each added as a user message
    (``example["input"]``) and an assistant message (``example["explanation"]``).

    Args:
        gpt_model: Model name used for the solver chatbot.
        num_examples: Number of in-context examples to sample. Capped at the
            number of available examples, because ``random.sample`` raises
            ValueError when asked for more items than exist.

    Returns:
        Tuple ``(chat_bot, information_extractor)`` of two ``GPTChatBot`` objects.
    """
    messages = [{"role": "system", "content": solver_system_prompt}]
    sample_size = min(max(num_examples, 0), len(data))
    random_numbers = random.sample(range(len(data)), sample_size)
    for i in random_numbers:
        example = data[i]
        messages.append({"role": "user", "content": example["input"]})
        messages.append({"role": "assistant", "content": example["explanation"]})
    request_body = {
        "model": gpt_model,
        "messages": messages,
        "temperature": 0.7,
    }
    chat_bot = GPTChatBot(request_body)

    extractor_system_prompt = "You are served as a information extractor. You will be given the output of an LLM, and a question, and from the given output, you will extract the information that answers the question. Your output will be linked to a computer program, so you will be accurate and concise."

    # Load the 4o extractor instead
    request_body = {
        "model": "gpt-4o-2024-08-06",   # only the 4o model is good enough
        "messages": [{"role": "system", "content": extractor_system_prompt}],
        "temperature": 0.2,
    }
    information_extractor = GPTChatBot(request_body)
    return chat_bot, information_extractor


def load_gemini_chat_bot(gemini_model = "gemini-1.5-pro", num_examples = 30):
    """Build a few-shot Gemini solver chatbot and a GPT-4o information extractor.

    The solver's history is filled with randomly chosen examples from ``data``,
    each added as a "user" entry (``example["input"]``) and a "model" entry
    (``example["explanation"]``). ``solver_system_prompt`` is passed as the
    solver's system prompt.

    Args:
        gemini_model: Model name used for the solver chatbot.
        num_examples: Number of in-context examples to sample. Capped at the
            number of available examples, because ``random.sample`` raises
            ValueError when asked for more items than exist.

    Returns:
        Tuple ``(chat_bot, information_extractor)``: a ``GeminiChatBot`` and a
        ``GPTChatBot``.
    """
    history = []
    sample_size = min(max(num_examples, 0), len(data))
    random_numbers = random.sample(range(len(data)), sample_size)
    for i in random_numbers:
        example = data[i]
        history.append({"role": "user", "parts": example["input"]})
        history.append({"role": "model", "parts": example["explanation"]})
    chat_bot = GeminiChatBot(system_prompt=solver_system_prompt, gemini_model=gemini_model, temperature=0.7)
    chat_bot.set_chat_history(history)


    extractor_system_prompt = "You are served as a information extractor. You will be given the output of an LLM, and a question, and from the given output, you will extract the information that answers the question. Your output will be linked to a computer program, so you will be accurate and concise."


    # information_extractor = GeminiChatBot(system_prompt=extractor_system_prompt, gemini_model=gemini_model, temperature=0.2)

    # Load the 4o extractor instead
    request_body = {
        "model": "gpt-4o-2024-08-06",   # only the 4o model is good enough
        "messages": [{"role": "system", "content": extractor_system_prompt}],
        "temperature": 0.2,
    }
    information_extractor = GPTChatBot(request_body)

    return chat_bot, information_extractor

## Load the dataset

In [5]:
# Load the train / validation / test splits from the JSONL files on disk.
datasets = load_dataset_from_disk()

In [6]:
# Carefullll Do not change this code!!!!

from joblib import Memory

# Workaround for joblib caching in a Jupyter notebook ("joblib persistence across sessions/machines")
def cache(mem, module, **mem_kwargs):
    """Build a decorator that caches a function through ``mem`` under a fixed module name.

    Args:
        mem: Object exposing ``cache(func, **kwargs)`` (a ``joblib.Memory`` in this notebook).
        module: Name written to the decorated function's ``__module__`` before caching.
        **mem_kwargs: Extra keyword arguments forwarded to ``mem.cache``.

    Returns:
        A decorator that overwrites the function's ``__module__`` and ``__qualname__``
        and returns ``mem.cache(f, **mem_kwargs)``.
    """
    # `module` stands in for the notebook/python file name: a Jupyter notebook's name keeps changing, so we need this workaround
    def cache_(f):
        f.__module__ = module
        f.__qualname__ = f.__name__
        return mem.cache(f, **mem_kwargs)
    # return the decorator; per the original author it always yields the same cache directory name
    return cache_

# Create a memory object with a cache directory
memory = Memory(location="PromptEngineering/FunctionCache", verbose=0)

# NOTE(review): the header of this cell asks that the code not be changed. Presumably
# joblib ties stored results to the function's source text, so editing anything from the
# decorator line downwards could invalidate results already cached -- confirm before
# touching it. For that reason the function is documented here, above the decorator,
# and its text is left exactly as written.
#
# solve_puzzle(sample, model, attempt=1) -> (response, response_extract)
#   sample:  mapping read through sample['clue'] and sample['orientation'].
#   model:   model name; a name containing "gemini" selects load_gemini_chat_bot,
#            any other name selects load_gpt_chat_bot.
#   attempt: not referenced in the body; presumably it only distinguishes cached
#            calls from one another -- TODO confirm.
#   Returns the solver chatbot's reply to the clue prompt and the information
#   extractor's reply when asked for the answer contained in that reply.
@cache(memory, "AutoStrategySelection")
def solve_puzzle(sample, model, attempt=1):
    if "gemini" in model:
        chat_bot, information_extractor = load_gemini_chat_bot(model)
    else:
        chat_bot, information_extractor = load_gpt_chat_bot(model)
    prompt = f"**Clue**: {sample['clue']}\n**Orientation**: {sample['orientation']}"
    response = chat_bot.chat(prompt)

    # extract information
    prompt_extract = f"Given the output:\n{response}, What is the answer? I don't need other information."
    response_extract = information_extractor.chat(prompt_extract)
    return response, response_extract

In [7]:
gemini_pro = "gemini-1.5-pro"
gemini_flash = "gemini-1.5-flash"
gpt_4o = "gpt-4o-2024-08-06"
gpt_4o_mini = "gpt-4o-mini"
from tqdm import tqdm

# Run every model over the first 200 test puzzles. The results are not used in
# this cell; solve_puzzle is wrapped by the joblib cache, so presumably this pass
# exists to fill the cache for the scoring cell that follows.
for model in [gemini_pro, gemini_flash, gpt_4o, gpt_4o_mini]:
    print(f"Model: {model}")
    for i in tqdm(range(200), ncols=100):
        sample = datasets['test'][i]
        try:
            response, response_extract = solve_puzzle(sample, model=model, attempt=1)
        except Exception as exc:
            # Catch Exception rather than using a bare except, so interrupting the
            # kernel (KeyboardInterrupt) still stops the run; the reason for the
            # failure is reported instead of being silently dropped.
            print(f"Failed to solve puzzle {i} with model {model}: {type(exc).__name__}: {exc}")
            continue


Model: gemini-1.5-pro


 60%|████████████████████████████████████▌                        | 120/200 [00:14<00:25,  3.19it/s]

Failed to solve puzzle 107 with model gemini-1.5-pro


 69%|██████████████████████████████████████████                   | 138/200 [00:34<00:38,  1.60it/s]

Failed to solve puzzle 128 with model gemini-1.5-pro


100%|█████████████████████████████████████████████████████████████| 200/200 [00:35<00:00,  5.66it/s]


Model: gemini-1.5-flash


 70%|███████████████████████████████████████████                  | 141/200 [00:06<00:08,  7.10it/s]

Failed to solve puzzle 128 with model gemini-1.5-flash


100%|█████████████████████████████████████████████████████████████| 200/200 [00:07<00:00, 27.16it/s]


Model: gpt-4o-2024-08-06


100%|█████████████████████████████████████████████████████████████| 200/200 [00:02<00:00, 71.24it/s]


Model: gpt-4o-mini


100%|█████████████████████████████████████████████████████████████| 200/200 [00:02<00:00, 68.87it/s]


In [8]:
# @title test without reflection
test_size = 200

# Number of correctly answered puzzles per model ('test_size' records the sample count).
model_score = {'test_size': test_size, gemini_pro: 0, gemini_flash: 0, gpt_4o: 0, gpt_4o_mini: 0}

for model in [gemini_pro, gemini_flash, gpt_4o, gpt_4o_mini]:
    for i in range(test_size):
        sample = datasets['test'][i]
        try:
            response, response_extract = solve_puzzle(sample, model=model, attempt=1)
            # Case- and surrounding-whitespace-insensitive exact match against the gold answer.
            if response_extract.strip().lower() == sample['answer'].strip().lower():
                model_score[model] += 1
        except Exception as exc:
            # Catch Exception rather than using a bare except, so interrupting the
            # kernel (KeyboardInterrupt) still stops the run. A failed puzzle simply
            # scores no point; the reason is reported.
            print(f"Failed to solve puzzle {i} with model {model}: {type(exc).__name__}: {exc}")
            continue

model_score

Failed to solve puzzle 107 with model gemini-1.5-pro
Failed to solve puzzle 128 with model gemini-1.5-pro
Failed to solve puzzle 128 with model gemini-1.5-flash


{'test_size': 200,
 'gemini-1.5-pro': 29,
 'gemini-1.5-flash': 12,
 'gpt-4o-2024-08-06': 43,
 'gpt-4o-mini': 16}

In [9]:
# NOTE(review): this function is wrapped by the same joblib-backed cache decorator as
# solve_puzzle. Presumably joblib ties stored results to the function's source text, so
# editing anything from the decorator line downwards could invalidate results already
# cached -- confirm before touching it. For that reason the function is documented here,
# above the decorator, and its text is left exactly as written.
#
# solve_puzzle_reflection(sample, model, attempt=1) -> (response, response_extract)
#   sample:  mapping read through sample['clue'] and sample['orientation'].
#   model:   model name; a name containing "gemini" selects load_gemini_chat_bot and the
#            'parts' / 'model' message keys, any other name selects load_gpt_chat_bot and
#            the 'content' / 'assistant' keys.
#   attempt: not referenced in the body; presumably it only distinguishes cached
#            calls from one another -- TODO confirm.
#   Flow: ask the solver the clue prompt, append that prompt and the reply to the
#   chatbot's chat_history, then ask the solver to reconsider its answer. Only the
#   second reply is returned, together with the information extractor's reply when
#   asked for the answer contained in it.
@cache(memory, "AutoStrategySelection")
def solve_puzzle_reflection(sample, model, attempt=1):
    if "gemini" in model:
        chat_bot, information_extractor = load_gemini_chat_bot(model)
        content_name = 'parts'
        model_role_name = 'model'
    else:
        chat_bot, information_extractor = load_gpt_chat_bot(model)
        content_name = 'content'
        model_role_name = 'assistant'
    prompt = f"**Clue**: {sample['clue']}\n**Orientation**: {sample['orientation']}"
    response = chat_bot.chat(prompt)

    # Now reflect on previous answer, and try again
    chat_bot.chat_history.append({"role": "user", content_name: prompt})
    chat_bot.chat_history.append({"role": model_role_name, content_name: response})
    prompt = f"Now look at your previous response, do you think your answer is correct? Does the answer fits the enumeration? Does the answer fits the definition? If not, then try again to solve this problem, maybe try another interpretation on what wordplay should you preform, and select new reasoning steps to follow.  "
    response = chat_bot.chat(prompt)

    # extract information
    prompt_extract = f"Given the output:\n{response}, What is the answer? I don't need other information."
    response_extract = information_extractor.chat(prompt_extract)
    return response, response_extract

In [12]:
gemini_pro = "gemini-1.5-pro"
gemini_flash = "gemini-1.5-flash"
gpt_4o = "gpt-4o-2024-08-06"
gpt_4o_mini = "gpt-4o-mini"
from tqdm import tqdm

# Run every model, with the reflection step, over the first 200 test puzzles. The
# results are not used in this cell; solve_puzzle_reflection is wrapped by the joblib
# cache, so presumably this pass exists to fill the cache for the scoring cell.
for model in [ gemini_pro, gemini_flash, gpt_4o, gpt_4o_mini]:
    print(f"Model: {model}")
    for i in tqdm(range(200), ncols=100):
        sample = datasets['test'][i]
        try:
            response, response_extract = solve_puzzle_reflection(sample, model=model, attempt=1)
        except Exception as exc:
            # Catch Exception rather than using a bare except, so interrupting the
            # kernel (KeyboardInterrupt) still stops the run; the reason for the
            # failure is reported instead of being silently dropped.
            print(f"Failed to solve puzzle {i} with model {model}: {type(exc).__name__}: {exc}")
            continue

Model: gemini-1.5-pro


 58%|███████████████████████████████████▍                         | 116/200 [00:32<00:28,  2.91it/s]

Failed to solve puzzle 107 with model gemini-1.5-pro


 68%|█████████████████████████████████████████▊                   | 137/200 [00:49<00:38,  1.64it/s]

Failed to solve puzzle 128 with model gemini-1.5-pro


100%|█████████████████████████████████████████████████████████████| 200/200 [00:50<00:00,  3.99it/s]


Model: gemini-1.5-flash


 55%|█████████████████████████████████▌                           | 110/200 [00:19<00:32,  2.75it/s]

Failed to solve puzzle 107 with model gemini-1.5-flash


 69%|██████████████████████████████████████████                   | 138/200 [00:32<00:21,  2.93it/s]

Failed to solve puzzle 128 with model gemini-1.5-flash


100%|█████████████████████████████████████████████████████████████| 200/200 [00:33<00:00,  6.05it/s]


Model: gpt-4o-2024-08-06


100%|█████████████████████████████████████████████████████████████| 200/200 [00:03<00:00, 65.59it/s]


Model: gpt-4o-mini


100%|█████████████████████████████████████████████████████████████| 200/200 [00:02<00:00, 75.05it/s]


In [16]:
# @title Test with reflection
test_size = 200

# Number of puzzles per model for which at least one of the two attempts is correct
# ('test_size' records the sample count).
model_score = {'test_size': test_size, gemini_pro: 0, gemini_flash: 0, gpt_4o: 0, gpt_4o_mini: 0}

for model in [gemini_pro, gemini_flash, gpt_4o, gpt_4o_mini]:
    for i in range(test_size):
        sample = datasets['test'][i]
        try:
            response1, response_extract1 = solve_puzzle(sample, model=model, attempt=1)
            response2, response_extract2 = solve_puzzle_reflection(sample, model=model, attempt=1)
            expected = sample['answer'].strip().lower()
            # A puzzle counts as solved when EITHER the first answer or the answer
            # given after reflection matches the gold answer.
            if response_extract1.strip().lower() == expected or response_extract2.strip().lower() == expected:
                model_score[model] += 1
        except Exception as exc:
            # Catch Exception rather than using a bare except, so interrupting the
            # kernel (KeyboardInterrupt) still stops the run. A failed puzzle simply
            # scores no point; the reason is reported.
            print(f"Failed to solve puzzle {i} with model {model}: {type(exc).__name__}: {exc}")
            continue

model_score

Failed to solve puzzle 107 with model gemini-1.5-pro
Failed to solve puzzle 128 with model gemini-1.5-pro
Failed to solve puzzle 107 with model gemini-1.5-flash
Failed to solve puzzle 128 with model gemini-1.5-flash


{'test_size': 200,
 'gemini-1.5-pro': 34,
 'gemini-1.5-flash': 17,
 'gpt-4o-2024-08-06': 59,
 'gpt-4o-mini': 20}

In [9]:
# from google.colab import runtime
# runtime.unassign()


# Legacy

Backup of the cached code.

In [10]:
# # Carefullll Do not change this code!!!!

# from joblib import Memory

# # Work around for joblib caching in jupyter notebook "joblib persistence across sessions/machines"
# def cache(mem, module, **mem_kwargs):
#     # model is the notebook/python file name: Jupyter notebook's name is always changing so we need this work around
#     def cache_(f):
#         f.__module__ = module
#         f.__qualname__ = f.__name__
#         return mem.cache(f, **mem_kwargs)
#     # return the cache function that will always create same name for cahce directory
#     return cache_

# # Create a memory object with a cache directory
# memory = Memory(location="PromptEngineering/FunctionCache", verbose=0)

# @cache(memory, "AutoStrategySelection")
# def solve_puzzle(sample, model, attempt=1):
#     if "gemini" in model:
#         chat_bot, information_extractor = load_gemini_chat_bot(model)
#     else:
#         chat_bot, information_extractor = load_gpt_chat_bot(model)
#     prompt = f"**Clue**: {sample['clue']}\n**Orientation**: {sample['orientation']}"
#     response = chat_bot.chat(prompt)

#     # extract information
#     prompt_extract = f"Given the output:\n{response}, What is the answer? I don't need other information."
#     response_extract = information_extractor.chat(prompt_extract)
#     return response, response_extract

In [11]:
# import os
# import json
# import shutil
# import ast

# # Set the cache directory where joblib stores the cached files
# cache_dir = "PromptEngineering/FunctionCache/joblib/AutoStrategySelection/solve_puzzle/"


# # Set of answers that you want to delete from the cache
# answers_to_delete = {"broad minded", "flood water", "amiss", 'up the pole', 'dog ends', 'thrashngg', 'baldpate', 'unseemly', 'gnu', 'refused'}
# gemini_flash = "gemini-1.5-flash"

# # Traverse the cache directory
# for root, dirs, files in os.walk(cache_dir):
#     if 'metadata.json' in files:
#         metadata_path = os.path.join(root, 'metadata.json')

#         # Open and read the metadata.json file
#         with open(metadata_path, 'r') as f:
#             metadata = json.load(f)

#         # Check if the answer field matches the ones we want to delete
#         try:
#             answer = ast.literal_eval(metadata['input_args']['sample'])['answer']
#             model = ast.literal_eval(metadata['input_args']['model'])
#             if (answer.strip() in answers_to_delete) and model == gemini_flash:
#                 print(f"Found match: {answer}, in directory: {root}")

#                 # Record and delete the directory
#                 shutil.rmtree(root)
#                 print(f"Deleted cache directory: {root}")
#         except KeyError:
#             # If the expected structure isn't found, skip this directory
#             print(f"No matching 'answer' field found in {metadata_path}")

# print("Completed cache cleanup.")
