In [2]:
from soda.openai.text import instruct_chat_model, instruct_chat_model_batched
import json

# Chunking parameters. All sizes are in CHARACTERS: the slicing below works on
# the raw string, not on words or tokens.
CHUNK_SIZE = 4096
CHUNK_OVERLAP = 1024
CHUNK_STRIDE = CHUNK_SIZE - CHUNK_OVERLAP
MAX_CHUNKS = 50  # cap on how many chunks are sent to the model

with open("book.txt") as f:
    book = f.read()

# Split the book into CHUNK_SIZE-character windows. Consecutive windows start
# CHUNK_STRIDE characters apart, so each one shares CHUNK_OVERLAP characters
# with its successor and text near a boundary shows up in both neighbours.
chunks = [
    book[start:start + CHUNK_SIZE]
    for start in range(0, len(book), CHUNK_STRIDE)
]

print("Loaded {} chunks".format(len(chunks)))
# Keep only the first MAX_CHUNKS chunks for classification
chunks = chunks[:MAX_CHUNKS]

# Classification prompt: the model must answer with a bare YES or NO.
SYSTEM_MESSAGE = """
Output

YES

if the text explicitly contains either a recipe, or part of a recipe. Otherwise, output

NO

If it only discusses recipes, output NO.
""".strip()

# Single-chunk variant, kept for quick manual checks of the prompt:
# resp = instruct_chat_model(
#     SYSTEM_MESSAGE,
#     chunks[0],
#     model="gpt-4"
# )

# answer = resp["choices"][0]["message"]["content"]
# print(answer)

# One response per chunk, in the same order as `chunks`
# (assumed from how later cells index `resps` — TODO confirm against soda).
resps = instruct_chat_model_batched(
    SYSTEM_MESSAGE,
    chunks,
    model="gpt-4"
)


Loaded 99 chunks


100%|██████████| 50/50 [00:00<00:00, 3894.72it/s]


In [8]:
MAX_BLOCKS = 10  # cap on how many recipe-bearing chunks get the line-range pass

# Turn each YES/NO answer into 1/0. The answer is stripped and upper-cased
# first so that "YES\n" or "Yes" is not silently counted as a NO.
evaluations = [
    1 if resp["choices"][0]["message"]["content"].strip().upper() == "YES" else 0
    for resp in resps
]

# The pairing below is positional; a mismatch means `resps` is stale relative
# to `chunks` (e.g. cells were run out of order).
assert len(evaluations) == len(chunks), "resps and chunks are out of sync"

print("{}/{} chunks contain recipes".format(sum(evaluations), len(evaluations)))

# Show the raw answer for every chunk
for i, resp in enumerate(resps):
    print("Chunk {}:".format(i))
    print(resp["choices"][0]["message"]["content"])
    print()

# Gather all chunks that were classified as containing a recipe
blocks = [chunk for chunk, evaluation in zip(chunks, evaluations) if evaluation == 1]

# Prefix every line with its 0-based index ("<n>: <text>") so the model can
# refer to line numbers in its answer.
blocks = [
    "".join("{}: {}\n".format(i, line) for i, line in enumerate(block.split("\n")))
    for block in blocks
]
blocks = blocks[:MAX_BLOCKS]

# Prompt asking for "start end" line pairs; 0 / -1 mark a recipe that is cut
# off at the start / end of the block.
SYSTEM_MESSAGE = """
Output the start and end lines of the recipes in the text. If a recipe is cut-off (at the start or the end), output 0 or -1 respectively for the start or end line.
Give your response as a list of pairs of numbers, e.g.
0 23
34 45
46 -1
""".strip()

block_resps = instruct_chat_model_batched(
    SYSTEM_MESSAGE,
    blocks,
    model="gpt-4"
)

for i, resp in enumerate(block_resps):
    print("Chunk {}:".format(i))
    print(resp["choices"][0]["message"]["content"])
    print()


34/50 chunks contain recipes
Chunk 0:
NO

Chunk 1:
NO

Chunk 2:
NO

Chunk 3:
NO

Chunk 4:
NO

Chunk 5:
NO

Chunk 6:
NO

Chunk 7:
NO

Chunk 8:
NO

Chunk 9:
NO

Chunk 10:
YES

Chunk 11:
YES

Chunk 12:
YES

Chunk 13:
NO

Chunk 14:
NO

Chunk 15:
NO

Chunk 16:
NO

Chunk 17:
YES

Chunk 18:
YES

Chunk 19:
NO

Chunk 20:
NO

Chunk 21:
YES

Chunk 22:
YES

Chunk 23:
YES

Chunk 24:
YES

Chunk 25:
YES

Chunk 26:
YES

Chunk 27:
YES

Chunk 28:
YES

Chunk 29:
YES

Chunk 30:
YES

Chunk 31:
YES

Chunk 32:
YES

Chunk 33:
YES

Chunk 34:
YES

Chunk 35:
YES

Chunk 36:
YES

Chunk 37:
YES

Chunk 38:
YES

Chunk 39:
YES

Chunk 40:
YES

Chunk 41:
YES

Chunk 42:
YES

Chunk 43:
YES

Chunk 44:
YES

Chunk 45:
YES

Chunk 46:
YES

Chunk 47:
YES

Chunk 48:
YES

Chunk 49:
YES



100%|██████████| 10/10 [00:00<00:00, 71575.15it/s]


Chunk 0:
0 81

Chunk 1:
0 0
29 37

Chunk 2:
0 -1

Chunk 3:
0 0
73 -1

Chunk 4:
9 84

Chunk 5:
0 45
79 -1

Chunk 6:
21 26
35 40
46 51
59 65
71 78
88 94
109 -1

Chunk 7:
5 23
26 42
45 53
56 64
67 75
78 89
92 103
106 117
120 -1

Chunk 8:
0 2
10 17
24 30
38 42
50 55
61 68
81 87
95 100
107 -1

Chunk 9:
0 8
9 21
22 46
47 65
66 80
81 96
97 -1



In [38]:
# Extract the start and end lines of the recipes from the responses.
# Each response is expected to hold one "start end" pair per line; anything
# else (blank lines, whitespace, stray prose from the model) is reported and
# skipped instead of aborting the whole cell.
recipe_lines = []
for block, resp in zip(blocks, block_resps):
    content = resp["choices"][0]["message"]["content"]
    for raw_line in content.split('\n'):
        fields = raw_line.split()
        if not fields:  # Ignore empty / whitespace-only lines
            continue
        try:
            # Raises ValueError for non-integers and for a wrong field count
            start, end = map(int, fields)
        except ValueError:
            print("Skipping unparseable range line: {!r}".format(raw_line))
            continue
        recipe_lines.append((block, start, end))

print("Found {} recipes".format(len(recipe_lines)))

# Extract the recipes from the blocks using the start and end lines.
# NOTE: the blocks still carry the "<n>: " prefixes added when they were
# numbered, so the extracted recipe text contains them too.
recipes = []
for (block, start, end) in recipe_lines:
    block_lines = block.split('\n')
    if end == -1:  # Recipe is cut off at the end of the block
        # Index of the last line, so the inclusive slice below and the
        # reported/saved range both refer to a real line.
        end = len(block_lines) - 1
    recipe = '\n'.join(block_lines[start:end + 1])
    recipes.append((recipe, start, end))

# Print the recipes individually
for i, (recipe, start, end) in enumerate(recipes, 1):
    print("*** Recipe {} (lines {}-{}) ***".format(i, start, end))
    print(recipe)
    print()

# Save recipes to JSON (each tuple is written as a [text, start, end] array)
with open("recipes.json", "w") as f:
    json.dump(recipes, f)

Found 41 recipes
*** Recipe 1 (lines 0-81) ***
0: not only frequently used,
1: but was of various sorts, as _cypre_, No. 41. 99. 120. named probably
2: from the isle of Cyprus, whence it might either come directly to us,
3: or where it had received some improvement by way of refining. There
4: is mention of _blanch-powder or white sugar_, 132. They, however,
5: were not the same, for see No. 193. Sugar was clarified sometimes
6: with wine [105].
7: 
8: Spices. _Species_. They are mentioned in general No. 133, and _whole
9: spices_, 167, 168. but they are more commonly specified, and are
10: indeed greatly used, though being imported from abroad, and from so
11: far as Italy or the Levant (and even there must be dear), some may
12: wonder at this: but it shouid be considered, that our Roll was
13: chiefly compiled for the use of noble and princely tables; and the
14: same may be said of the Editor's MS. The spices came from the same
15: part of the world, and by the same route, as sugar

In [42]:
import yaml

# Read back the recipe excerpts stored in recipes.json
with open("recipes.json") as recipes_file:
    recipes = json.load(recipes_file)

# Prompt asking GPT-4 to turn one excerpt into a title / ingredients /
# instructions YAML document, or the single word FAIL when it cannot.
SYSTEM_MESSAGE = """
Output the title of the recipe, the ingredients, and the instructions.

Format your response as valid YAML, e.g.

title: "Recipe Title"
ingredients:
    "Ingredient 1": "Ingredient 1 amount"
    "Ingredient 2": "Ingredient 2 amount"
instructions:
- "Instruction 1"
- "Instruction 2"
...

Do not infer any information that is not present in the text. Just include "unknown" if you cannot find the information.

If you are unable to convert this into a recipe, output "FAIL".
""".strip()

# Only the first element of every saved entry (the recipe text) goes to the model
recipe_texts = [entry[0] for entry in recipes]
recipe_resps = instruct_chat_model_batched(
    SYSTEM_MESSAGE,
    recipe_texts,
    model="gpt-4"
)

# Show every raw response, numbered from 1
for number, resp in enumerate(recipe_resps, start=1):
    print("Recipe {}:".format(number))
    print(resp["choices"][0]["message"]["content"])
    print()


  0%|          | 0/41 [00:00<?, ?it/s]

100%|██████████| 41/41 [00:01<00:00, 22.68it/s]


Recipe 1:
FAIL

Recipe 2:
FAIL

Recipe 3:
title: "Unknown"
ingredients: 
    "cheese": "unknown amount"
    "flesh of capons or hens": "unknown amount"
    "milk of almonds": "unknown amount"
    "broth of fresh beef or fresh flesh": "unknown amount"
    "flour of rye": "unknown amount"
    "gastbon or amydon": "unknown amount"
    "yolk of eggs": "unknown amount"
    "saffron": "for coloring"
    "cloves of gilofre": "unknown amount"
    "powder of galyugale": "unknown amount"

instructions:
- "Take the cheese and flesh of capons, or of hens and chop small and grind small in a mortar."
- "Take milk of almonds with the broth of fresh beef. or fresh flesh, and put the flesh in the milk or in the broth and set them to the fire."
- "Mix them with flour of rye, or gastbon, or amydon as much as you want."
- "Add yolks of eggs and saffron to make it yellow."
- "When it is dressed in dishes as you desire, stick above cloves of gilofre."
- "Sprinkle powder of galyugale above, and serve it fort

In [43]:
# Parse the YAML responses into Python objects.
# Responses that cannot be used are skipped (not stored), so `recipe_data`
# may be shorter than `recipe_resps` and is not index-aligned with it.
recipe_data = []
for number, resp in enumerate(recipe_resps, 1):
    content = resp["choices"][0]["message"]["content"]

    # The model was told to answer "FAIL" when it cannot build a recipe;
    # compare on the stripped text so "FAIL\n" is recognised as well.
    if content.strip() == "FAIL":
        continue

    try:
        data = yaml.safe_load(content)
    except yaml.YAMLError as err:
        # One malformed answer should not discard the rest of the batch
        print("Recipe {}: skipping invalid YAML ({})".format(number, err))
        continue

    # A valid recipe is a mapping (title / ingredients / instructions);
    # plain prose parses as a bare string and is not a recipe.
    if not isinstance(data, dict):
        print("Recipe {}: skipping non-mapping YAML".format(number))
        continue

    recipe_data.append(data)

# Save the parsed recipes to JSON
with open("recipes_data.json", "w") as f:
    json.dump(recipe_data, f)


In [44]:
MAX_RECIPES = 10  # cap on how many recipes are sent for modernisation

# Load the parsed recipes from recipes_data.json
with open("recipes_data.json") as f:
    recipe_data = json.load(f)

# Prompt asking GPT-4 to rewrite each recipe in modern English while leaving
# the YAML structure untouched.
SYSTEM_MESSAGE = """
You will be given a YAML object representing a recipe. It will be written in an arbitrary language. Convert it to modern English, while keeping as much original meaning and inflection as possible. Keep the YAML structure exactly the same.
""".strip()

# Serialise each recipe back to YAML for the model:
# - sort_keys=False keeps the stored key order instead of re-sorting the keys
#   alphabetically, so the structure the model is asked to preserve is the
#   original one;
# - allow_unicode=True writes non-ASCII characters as-is rather than as
#   escape sequences, so the model sees the actual text.
recipe_yaml = [
    yaml.dump(recipe, sort_keys=False, allow_unicode=True)
    for recipe in recipe_data[:MAX_RECIPES]
]

recipe_resps = instruct_chat_model_batched(
    SYSTEM_MESSAGE,
    recipe_yaml,
    model="gpt-4"
)

# Print the responses
for number, resp in enumerate(recipe_resps, 1):
    print("Recipe {}:".format(number))
    print(resp["choices"][0]["message"]["content"])
    print()


100%|██████████| 10/10 [00:00<00:00, 67432.54it/s]


Recipe 1:
ingredients:
  fresh beef or meat broth: unknown amount
  cheese: unknown amount
  cloves of clove: unknown amount
  chicken or hen meat: unknown amount
  rye flour: unknown amount
  gastbon or cornstarch: unknown amount
  almond milk: unknown amount
  galingale powder: unknown amount
  saffron: for coloring
  egg yolks: unknown amount
instructions:
- Take the cheese and meat from chickens or hens, chop them finely and grind them in
  a mortar.
- Take almond milk with the fresh beef or meat broth, and put the meat
  in the milk or in the broth and bring them to the boil.
- Mix them with rye flour, or gastbon, or cornstarch according to your preference.
- Add egg yolks and saffron to give it a yellow color.
- When it is served in dishes as you like, sprinkle some cloves on top.
- Sprinkle galingale powder on top, and serve it.
title: Unknown

Recipe 2:
ingredients:
  Bacon: unknown
  Beans: unknown
  High-quality broth: unknown
instructions:
- Take beans and dry them in an ove