<a href="https://colab.research.google.com/github/simecek/mlprague2024/blob/main/04_ChatGPT_API.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Exercise 4

Before delving into open models, let's explore the capabilities of [ChatGPT](https://platform.openai.com/docs/api-reference)!

You can use either a small benchmark of 50 Czech multiple-choice questions, [synczech50](https://huggingface.co/datasets/simecek/synczech50), or a sample of 50 questions from our benchmark, [mlprague](https://huggingface.co/datasets/simecek/mlprague). You may want to modify the text template in `get_prompt` to see how it influences the results.

Please use the API responsibly, as we all share the key. If someone changes the model settings or runs many parallel jobs, we are likely to hit the limit.


In [1]:
!pip install -qq datasets openai

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.2/311.2 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m388.9/388.9 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━

In [6]:
import os
from getpass import getpass

# Keep the shared key out of the notebook source so it cannot leak when the
# notebook is saved or shared: take it from the OPENAI_API_KEY environment
# variable if set, otherwise ask for it with a hidden prompt (paste the key
# that was distributed to you).
api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ").strip()

In [2]:
from datasets import load_dataset

# Default benchmark: 50 Czech multiple-choice questions.
dataset = load_dataset(path='simecek/synczech50', split='train')

# Alternative: a random sample of 50 questions from the mlprague benchmark.
# Uncomment the next line to use it instead of the default above.
#dataset = load_dataset('simecek/mlprague', split='train').shuffle().select(range(50))

# Display the dataset summary (column names and row count).
dataset

Dataset({
    features: ['question', 'optionA', 'optionB', 'optionC', 'optionD', 'solution'],
    num_rows: 50
})

In [3]:
# Look at the first record to see the fields each question carries.
dataset[0]

{'question': 'Co je hlavní město České republiky?',
 'optionA': 'Brno',
 'optionB': 'Praha',
 'optionC': 'Ostrava',
 'optionD': 'Plzeň',
 'solution': 'B'}

In [4]:
def get_prompt(x):
    """Render one benchmark record as a single-letter multiple-choice prompt.

    Args:
        x: Mapping with the keys ``question``, ``optionA``, ``optionB``,
            ``optionC`` and ``optionD``.

    Returns:
        The prompt text. It ends with the answer cue, so the model only has
        to supply the letter.
    """
    stem = x['question']
    choice_a, choice_b, choice_c, choice_d = [
        x[key] for key in ('optionA', 'optionB', 'optionC', 'optionD')
    ]
    # Edit this template to experiment with how the wording affects accuracy.
    return f"""Which option A/B/C/D is the best answer for the question. Answer with one letter, no explanation.

Question (in Czech): {stem}

Options:
A) {choice_a}
B) {choice_b}
C) {choice_c}
D) {choice_d}

Answer (just 1 letter, A/B/C/D):"""

# Build one prompt per dataset record, in dataset order.
prompts = list(map(get_prompt, dataset))

# Print (rather than echo) the first prompt so its line breaks are rendered.
print(prompts[0])

Which option A/B/C/D is the best answer for the question. Answer with one letter, no explanation.

Question (in Czech): Co je hlavní město České republiky?

Options:
A) Brno
B) Praha
C) Ostrava
D) Plzeň

Answer (just 1 letter, A/B/C/D):


In [7]:
from openai import OpenAI

# Single client object reused by every request in this notebook.
client = OpenAI(api_key=api_key)

In [8]:
def get_answer(prompt):
    """Send one prompt to the chat model and return its answer letter.

    Args:
        prompt: Full prompt text, e.g. as produced by ``get_prompt``.

    Returns:
        The first character of the reply after stripping surrounding
        whitespace and upper-casing it, or an empty string when the reply
        carries no text.
    """
    response = client.chat.completions.create(
      model="gpt-3.5-turbo-0125",
      messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt}
      ],
      temperature=0,  # lowest sampling temperature, to keep answers as repeatable as possible
      max_tokens=3)   # only a single letter is wanted, so cap the reply length

    # `content` is optional in the API response; treat a missing reply as an
    # empty answer (scored as wrong) instead of crashing the whole run.
    content = response.choices[0].message.content or ""
    return content.strip().upper()[:1]


# Try a single request first, before sending the whole benchmark through the API.
get_answer(prompts[40])


'C'

In [9]:
from tqdm.notebook import tqdm

# Query the model once per prompt, one request at a time, with a progress bar.
# Appending inside a plain loop keeps the answers gathered so far if a request fails.
answers = []
for prompt in tqdm(prompts):
    answers.append(get_answer(prompt))


  0%|          | 0/50 [00:00<?, ?it/s]

In [11]:
# Count the questions where the model's letter equals the reference letter.
correct = sum(
    answer == solution
    for answer, solution in zip(answers, dataset['solution'])
)

# Reference results on the 50 questions:
# 80% for GPT3.5 (10 mistakes)
# 94% for GPT4 (3 mistakes)
correct / len(answers)

0.8

In [13]:
# Show every wrongly answered question as: <model answer> <full record>

for answer, record in zip(answers, dataset):
    if answer == record['solution']:
        continue
    print(answer, record)


C {'question': 'Pokud máte 5 jablek a dáte příteli 2, kolik jablek vám zůstane?', 'optionA': '2', 'optionB': '3', 'optionC': '4', 'optionD': '5', 'solution': 'B'}
B {'question': "Který spisovatel napsal 'Báječná léta pod psa'?", 'optionA': 'Milan Kundera', 'optionB': 'Bohumil Hrabal', 'optionC': 'Michal Viewegh', 'optionD': 'Ivan Klíma', 'solution': 'C'}
A {'question': 'Pokud v pondělí bylo 3. ledna, jaké datum bude následující pondělí?', 'optionA': '9. ledna', 'optionB': '10. ledna', 'optionC': '8. ledna', 'optionD': '7. ledna', 'solution': 'B'}
A {'question': 'Které zvíře je národním symbolem České republiky?', 'optionA': 'Orel', 'optionB': 'Lev', 'optionC': 'Kůň', 'optionD': 'Medvěd', 'solution': 'B'}
D {'question': 'Co je výsledkem výrazu 100 - (2*5)^2?', 'optionA': '0', 'optionB': '50', 'optionC': '75', 'optionD': '25', 'solution': 'A'}
B {'question': 'Jaký je název klasické české pohádky o třech princích, kteří hledají svá srdce?', 'optionA': 'Tři veteráni', 'optionB': 'Tři oříšk