In [None]:
import os
import re
import random
import json
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Change this to the Google Drive folder where this notebook is located.
drive_path = '/content/drive/MyDrive/Projects/CryptoniteAnalysis/'
os.chdir(drive_path)

!pip install openai
import openai
import google.generativeai as genai
import copy

!pip install datasets
from datasets import load_dataset, load_from_disk

def load_dataset_from_disk():
    """Load the Cryptonite official train/validation/test splits from JSONL files.

    The files are looked up under ``datasets/cryptonite-official-split/``,
    relative to the current working directory.

    Returns:
        Whatever ``load_dataset('json', ...)`` returns, with the splits keyed
        ``'train'``, ``'validation'`` and ``'test'``.
    """
    split_dir = 'datasets/cryptonite-official-split/'
    # Split name used by callers -> suffix used in the file name on disk.
    file_suffix = {'train': 'train', 'validation': 'val', 'test': 'test'}
    split_files = {
        split: f"{split_dir}cryptonite-{suffix}.jsonl"
        for split, suffix in file_suffix.items()
    }
    return load_dataset('json', data_files=split_files)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# @title GPT Chatbot
# Prefer the OPENAI_API_KEY environment variable so the secret never has to be
# saved inside the notebook; the placeholder is only used when it is not set.
API_KEY = os.environ.get("OPENAI_API_KEY", "YOUR OPENAI API KEY")

# define the openai interface
def try_query_GPT(**request_body):
    """Send a single chat-completion request and return the raw response.

    Args:
        **request_body: Keyword arguments forwarded unchanged to
            ``client.chat.completions.create``.

    Returns:
        The object returned by ``chat.completions.create``.
    """
    # A fresh client is created per call, authenticated with the module-level API_KEY.
    completions = openai.OpenAI(api_key=API_KEY).chat.completions
    return completions.create(**request_body)

def accept_gpt_response(response):
    """Tell whether a chat-completion response can be used as-is.

    Args:
        response: Object exposing ``choices[0].finish_reason``.

    Returns:
        True when the first choice finished with reason ``"stop"``,
        False otherwise.
    """
    first_choice = response.choices[0]
    # A normal ("stop") finish is currently the only acceptance criterion;
    # further checks can be and-ed onto this expression later.
    return first_choice.finish_reason == "stop"

def query_GPT(**request_body):
    """Query the chat model, retrying until a complete response is received.

    The request is sent once and then retried up to ten more times. Every
    response, including the last one, is checked with ``accept_gpt_response``
    before giving up, so no request is issued whose result is thrown away.

    Args:
        **request_body: Keyword arguments forwarded to ``try_query_GPT``.

    Returns:
        The text content of the first choice of the first accepted response.

    Raises:
        RuntimeError: If none of the attempts produced an accepted response.
    """
    max_retries = 10
    # 1 initial attempt + max_retries retries; each response is validated.
    for _ in range(1 + max_retries):
        response = try_query_GPT(**request_body)
        if accept_gpt_response(response):
            return response.choices[0].message.content
    raise RuntimeError("Query failed")

# Request body used by GPTChatBot when the caller does not supply one.
default_request_body = dict(
    model="gpt-4o-mini",
    messages=[dict(role="system", content="You are a helpful assistant.")],
    temperature=0.7,
)


class GPTChatBot:
    """Single-turn wrapper around ``query_GPT`` with a fixed starting history.

    Every call to ``chat`` sends the stored history plus one user message; the
    exchange itself is never appended to the history.
    """

    def __init__(self, initial_request_body=None):
        """Store a private deep copy of the request template.

        Args:
            initial_request_body: Dict of keyword arguments for the
                chat-completion call. Must contain ``"messages"`` and
                ``"model"``. When omitted, the module-level
                ``default_request_body`` is used (looked up at call time).

        Raises:
            ValueError: If ``"messages"`` or ``"model"`` is missing.
        """
        if initial_request_body is None:
            initial_request_body = default_request_body
        if "messages" not in initial_request_body:
            raise ValueError("messages not in request_body")
        if "model" not in initial_request_body:
            raise ValueError("model not in request_body")
        # Deep copy so later edits to the caller's dict cannot leak in.
        self.initial_request_body = copy.deepcopy(initial_request_body)

        self.chat_history = self.initial_request_body["messages"]

    def chat(self, prompt):
        """Send ``prompt`` after the current history and return the reply text.

        The request is built from a copy, so neither ``chat_history`` nor the
        stored template is modified by the call.

        Args:
            prompt: Content of the user message to append.

        Returns:
            The value returned by ``query_GPT`` for the assembled request.
        """
        # Copy every setting except "messages", which is rebuilt from
        # chat_history so that set_chat_history() actually takes effect.
        request = {
            key: copy.deepcopy(value)
            for key, value in self.initial_request_body.items()
            if key != "messages"
        }
        history = copy.deepcopy(self.chat_history) if self.chat_history else []
        request["messages"] = [*history, {"role": "user", "content": prompt}]
        return query_GPT(**request)

    def set_chat_history(self, chat_history):
        """Replace the history that ``chat`` sends ahead of each prompt."""
        self.chat_history = chat_history

# @title Gemini Chatbot
# Prefer the GEMINI_API_KEY environment variable so the secret never has to be
# saved inside the notebook; the placeholder is only used when it is not set.
GEMINI_KEY = os.environ.get("GEMINI_API_KEY", "YOUR GEMINI API KEY")

# define the Gemini interface
def try_query_Gemini(**request_body):
    """Open a chat session on the given model and send one prompt.

    Args:
        **request_body: Must provide the keys ``"model"`` (object with a
            ``start_chat`` method), ``"history"``, ``"prompt"`` and
            ``"generation_config"``.

    Returns:
        The object returned by the session's ``send_message`` call.
    """
    # The session is seeded with the supplied history before the prompt is sent.
    session = request_body["model"].start_chat(history=request_body['history'])
    return session.send_message(
        request_body["prompt"],
        generation_config=request_body["generation_config"],
    )

def accept_Gemini_response(response):
    """Tell whether a Gemini response can be used as-is.

    Args:
        response: Object exposing a ``_done`` attribute.

    Returns:
        True when ``response._done`` is truthy, False otherwise.
    """
    # Completion is currently the only acceptance criterion; further checks
    # can be added to this expression later.
    return True if response._done else False

def query_Gemini(**request_body):
    """Query Gemini, retrying until a complete response is received.

    The request is sent once and then retried up to ten more times. Every
    response, including the last one, is checked with
    ``accept_Gemini_response`` before giving up, so no request is issued whose
    result is thrown away.

    Args:
        **request_body: Keyword arguments forwarded to ``try_query_Gemini``.

    Returns:
        The first accepted response object (not just its text).

    Raises:
        RuntimeError: If none of the attempts produced an accepted response.
    """
    max_retries = 10
    # 1 initial attempt + max_retries retries; each response is validated.
    for _ in range(1 + max_retries):
        response = try_query_Gemini(**request_body)
        if accept_Gemini_response(response):
            return response
    raise RuntimeError("Query failed")


class GeminiChatBot:
    """Single-turn wrapper around ``query_Gemini`` with a replaceable history."""

    def __init__(self, system_prompt="You are a helpful assistant.", gemini_model="gemini-1.5-flash", temperature=0.7):
        """Configure the Gemini client and build the model and generation config.

        Args:
            system_prompt: Passed to the model as ``system_instruction``.
            gemini_model: Passed to the model as ``model_name``.
            temperature: Sampling temperature stored in the generation config.
        """
        # The history starts empty; callers may replace it via set_chat_history().
        self.chat_history = []
        genai.configure(api_key=GEMINI_KEY)
        self.model = genai.GenerativeModel(model_name=gemini_model, system_instruction=system_prompt)
        self.generation_config = genai.types.GenerationConfig(temperature=temperature)

    def chat(self, prompt):
        """Send one prompt and return the text of the reply.

        The stored ``chat_history`` is passed along as the session's starting
        history, but this method never adds the exchange to it.

        Args:
            prompt: The message to send.

        Returns:
            The ``text`` attribute of the response from ``query_Gemini``.
        """
        reply = query_Gemini(
            model=self.model,
            generation_config=self.generation_config,
            history=self.chat_history,
            prompt=prompt,
        )
        return reply.text

    def set_chat_history(self, chat_history):
        """Replace the history passed to the model on subsequent ``chat`` calls."""
        self.chat_history = chat_history

def load_gpt_chat_bot(gpt_model="gpt-4o-2024-08-06"):
    """Build a GPT solver bot and the GPT-4o bot used to extract its answers.

    Args:
        gpt_model: Model name for the solver bot.

    Returns:
        Tuple ``(chat_bot, information_extractor)`` of ``GPTChatBot`` objects.
    """
    # Solver: generic assistant persona on the requested model.
    solver = GPTChatBot({
        "model": gpt_model,
        "messages": [{"role": "system", "content": "You are a helpful assistant."}],
        "temperature": 0.7,
    })

    # Extractor: always GPT-4o, at a lower temperature than the solver.
    extractor_instructions = "You are served as a information extractor. You will be given the output of an LLM, and a question, and from the given output, you will extract the information that answers the question. Your output will be linked to a computer program, so you will be accurate and concise."
    extractor = GPTChatBot({
        "model": "gpt-4o-2024-08-06",   # only the 4o model is good enough
        "messages": [{"role": "system", "content": extractor_instructions}],
        "temperature": 0.2,
    })
    return solver, extractor

def load_gemini_chat_bot(gemini_model="gemini-1.5-pro"):
    """Build a Gemini solver bot and the GPT-4o bot used to extract its answers.

    Args:
        gemini_model: Model name for the Gemini solver bot.

    Returns:
        Tuple ``(chat_bot, information_extractor)``: a ``GeminiChatBot`` and a
        ``GPTChatBot``.
    """
    solver = GeminiChatBot(
        system_prompt="You are a helpful assistant.",
        gemini_model=gemini_model,
        temperature=0.7,
    )

    # Extraction is done by GPT-4o rather than by a Gemini model.
    extractor_instructions = "You are served as a information extractor. You will be given the output of an LLM, and a question, and from the given output, you will extract the information that answers the question. Your output will be linked to a computer program, so you will be accurate and concise."
    extractor = GPTChatBot({
        "model": "gpt-4o-2024-08-06",   # only the 4o model is good enough
        "messages": [{"role": "system", "content": extractor_instructions}],
        "temperature": 0.2,
    })

    return solver, extractor

In [None]:
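# Careful: do not change the code in this cell! (Editing the cached function presumably invalidates the results already stored in the joblib cache.)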

from joblib import Memory

# Workaround for joblib caching in a Jupyter notebook (see "joblib persistence across sessions/machines")
def cache(mem, module, **mem_kwargs):
    """Build a decorator that memoizes a function under a fixed module name.

    Per the original author's note, a notebook's module name keeps changing,
    so the function's identity is pinned before it is handed to the cache.

    Args:
        mem: Object providing ``cache(func, **kwargs)``.
        module: Name assigned to the function's ``__module__``; stands in for
            the notebook / Python file name.
        **mem_kwargs: Extra keyword arguments forwarded to ``mem.cache``.

    Returns:
        A decorator that returns ``mem.cache(func, **mem_kwargs)``.
    """
    def _pin_identity_and_cache(func):
        # Overwrite the identity attributes so the same names are used for
        # the cache directory on every run.
        func.__qualname__ = func.__name__
        func.__module__ = module
        return mem.cache(func, **mem_kwargs)

    return _pin_identity_and_cache

# Create a memory object with a cache directory.
# The location is a relative path, so it is resolved against the current
# working directory.
memory = Memory(location="PromptEngineering/FunctionCache", verbose=0)

# naive_QA_solve_puzzle(sample, model, attempt=1)
#   Asks a chat model to solve one clue, then asks the information extractor to
#   pull the final answer out of the model's free-text reply.
#   sample  -- mapping providing the 'clue' and 'orientation' entries.
#   model   -- model name; names containing "gemini" use load_gemini_chat_bot,
#              all others use load_gpt_chat_bot.
#   attempt -- not read by the body; presumably only distinguishes cache
#              entries so a new value forces a fresh query (TODO confirm).
#   Returns (response, response_extract): the raw reply and the extracted answer.
# Calls are memoized through `memory` under the module name "AutoStrategySelection".
# NOTE(review): joblib is documented to discard a function's cached results when
# its source changes, which is presumably why this cell is marked do-not-change.
# Keep documentation above the decorator rather than inside the function.
@cache(memory, "AutoStrategySelection")
def naive_QA_solve_puzzle(sample, model, attempt=1):
    if "gemini" in model:
        chat_bot, information_extractor = load_gemini_chat_bot(model)
    else:
        chat_bot, information_extractor = load_gpt_chat_bot(model)
    prompt = f"Can you solve this problem for me? \n**Clue**: {sample['clue']}\n**Orientation**: {sample['orientation']}"
    response = chat_bot.chat(prompt)

    # extract information
    prompt_extract = f"Given the output:\n{response}, What is the answer? I don't need other information."
    response_extract = information_extractor.chat(prompt_extract)
    return response, response_extract

In [None]:
from tqdm import tqdm

datasets = load_dataset_from_disk()
gemini_pro = "gemini-1.5-pro"
gemini_flash = "gemini-1.5-flash"
gpt_4o = "gpt-4o-2024-08-06"
gpt_4o_mini = "gpt-4o-mini"
# gpt_o1 = "o1-preview-2024-09-12"
# gpt_o1_mini = "o1-mini-2024-09-12"

# Run every model over the first 200 test clues. The results are not used
# here; naive_QA_solve_puzzle is memoized, so this pass fills the cache.
for model in [gemini_pro, gemini_flash, gpt_4o, gpt_4o_mini]:
    print(f"Model: {model}")
    for i in tqdm(range(200), ncols=100):
        sample = datasets['test'][i]
        try:
            response, response_extract = naive_QA_solve_puzzle(sample, model=model, attempt=1)
        except Exception as exc:
            # Best effort: report the reason and move on to the next clue.
            # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupt working.
            print(f"Failed to solve puzzle {i} with model {model}: {type(exc).__name__}: {exc}")

Model: gemini-1.5-pro


 16%|██████████▏                                                   | 33/200 [00:04<00:21,  7.95it/s]

Failed to solve puzzle 19 with model gemini-1.5-pro


 61%|█████████████████████████████████████▏                       | 122/200 [00:08<00:06, 11.68it/s]

Failed to solve puzzle 107 with model gemini-1.5-pro


 65%|███████████████████████████████████████▋                     | 130/200 [00:12<00:11,  5.91it/s]

Failed to solve puzzle 128 with model gemini-1.5-pro


 74%|████████████████████████████████████████████▊                | 147/200 [00:15<00:08,  6.14it/s]

Failed to solve puzzle 132 with model gemini-1.5-pro


 88%|█████████████████████████████████████████████████████▉       | 177/200 [00:19<00:03,  7.26it/s]

Failed to solve puzzle 162 with model gemini-1.5-pro


100%|█████████████████████████████████████████████████████████████| 200/200 [00:19<00:00, 10.16it/s]


Model: gemini-1.5-flash


 62%|█████████████████████████████████████▊                       | 124/200 [00:02<00:02, 30.81it/s]

Failed to solve puzzle 107 with model gemini-1.5-flash


 72%|███████████████████████████████████████████▌                 | 143/200 [00:03<00:02, 20.33it/s]

Failed to solve puzzle 128 with model gemini-1.5-flash


100%|█████████████████████████████████████████████████████████████| 200/200 [00:04<00:00, 44.81it/s]


Model: gpt-4o-2024-08-06


100%|█████████████████████████████████████████████████████████████| 200/200 [00:02<00:00, 94.33it/s]


Model: gpt-4o-mini


100%|█████████████████████████████████████████████████████████████| 200/200 [00:02<00:00, 85.47it/s]


In [None]:
test_size = 200

# Number of exact (case- and whitespace-insensitive) answer matches per model,
# plus the sample count under the 'test_size' key.
model_score = {'test_size': test_size, gemini_pro: 0, gemini_flash: 0, gpt_4o: 0, gpt_4o_mini: 0}

for model in [gemini_pro, gemini_flash, gpt_4o, gpt_4o_mini]:
    for i in range(test_size):
        sample = datasets['test'][i]
        try:
            response, response_extract = naive_QA_solve_puzzle(sample, model=model, attempt=1)
            if response_extract.strip().lower() == sample['answer'].strip().lower():
                model_score[model] += 1
        except Exception as exc:
            # A failed query scores as incorrect. Report why it failed;
            # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupt working.
            print(f"Failed to solve puzzle {i} with model {model}: {type(exc).__name__}: {exc}")
            continue

model_score

Failed to solve puzzle 19 with model gemini-1.5-pro
Failed to solve puzzle 107 with model gemini-1.5-pro
Failed to solve puzzle 128 with model gemini-1.5-pro
Failed to solve puzzle 132 with model gemini-1.5-pro
Failed to solve puzzle 162 with model gemini-1.5-pro
Failed to solve puzzle 107 with model gemini-1.5-flash
Failed to solve puzzle 128 with model gemini-1.5-flash


{'test_size': 200,
 'gemini-1.5-pro': 12,
 'gemini-1.5-flash': 14,
 'gpt-4o-2024-08-06': 46,
 'gpt-4o-mini': 19}

# Results obtained by querying o1 through the website

In [None]:
# Print the prompt and gold answer for the first ten test clues, and collect
# the gold answers in order.
answers = []
for i in range(10):
    sample = datasets['test'][i]
    prompt = (
        "Can you solve this cryptic crossword puzzle for me? \n"
        f"**Clue**: {sample['clue']}\n"
        f"**Orientation**: {sample['orientation']}\n"
    )
    print(prompt)
    print(f"Answer: {sample['answer']}\n")
    answers.append(sample['answer'])


Can you solve this cryptic crossword puzzle for me? 
**Clue**: solution woman had found is a turning point (9)
**Orientation**: across

Answer: watershed

Can you solve this cryptic crossword puzzle for me? 
**Clue**: airborne soldier given particular protection from fire (7)
**Orientation**: across

Answer: parapet

Can you solve this cryptic crossword puzzle for me? 
**Clue**: closely follow artist, leaving son to pontificate (9)
**Orientation**: down

Answer: dogmatise

Can you solve this cryptic crossword puzzle for me? 
**Clue**: catholic a vulgar american woman resented (5,6)
**Orientation**: down

Answer: broad minded

Can you solve this cryptic crossword puzzle for me? 
**Clue**: see newly planted wood falter in this? (5,5)
**Orientation**: across

Answer: flood water

Can you solve this cryptic crossword puzzle for me? 
**Clue**: the majority will get the next clue! (8)
**Orientation**: across

Answer: eighteen

Can you solve this cryptic crossword puzzle for me? 
**Clue**: hi

In [None]:
# Display the gold answers collected in the previous cell.
answers

['watershed',
 'parapet',
 'dogmatise',
 'broad minded',
 'flood water',
 'eighteen',
 'uplifted',
 'oboist',
 'amiss',
 'unhappily']

In [None]:
# Model answers in the same order as `answers` (presumably transcribed by hand
# from the website runs). Scores count case-insensitive exact matches.
o1_answers = ['WATERSHED', 'PARAPET', 'SERMONISE', 'Grace Mugabe', 'BLIND PANIC', 'MAJORITY', 'UPLIFTED', 'OBOIST', 'EXIST', 'UNHAPPILY']   # 5/10 (indices 0, 1, 6, 7, 9 match; previously noted as 4/10)
o1_mini_answers = ['WATERSHED', 'PARAPET', 'BLOVIATE', 'Broad Church ', 'TREES LEAN', 'greater', 'TOPSOIL', 'RACKET', 'AMISS', 'HAPPYEND']      # 3/10 (indices 0, 1, 8 match)