## Generating example sentences for dict entries using LLMs

Inputs: `selected_top10k_{SRC_LANG}_eng.tsv` where SRC_LANG is one of 'fra', 'ind', 'tdt'

Outputs: `select_examples_{MODEL}_{SRC_LANG}_eng.tsv` where MODEL is one of 'claude', 'llama', 'gpt4'

In [17]:
from dotenv import load_dotenv

load_dotenv()

import anthropic
import replicate
from openai import OpenAI

# Short model key; selects the backend used below and names the output columns.
MODEL = "llama"

# Maps each supported short key to the provider-specific model identifier.
_MODEL_NAMES = {
    "claude": "claude-3-5-sonnet-20240620",
    "llama": "meta/meta-llama-3.1-405b-instruct",
    "gpt4": "gpt-4o",
}

if MODEL not in _MODEL_NAMES:
    raise ValueError("Model not found")
MODEL_NAME = _MODEL_NAMES[MODEL]

# Both API clients are created up front, whichever model is selected.
CLIENT = anthropic.Anthropic()
OPENAI_CLIENT = OpenAI()

# Prefix of the per-row keys under which generated sentences are stored.
EXAMPLE_KEY = f'example.{MODEL}'

In [None]:
import json
import random
import csv

# Language pair to process. Swap in one of the commented alternatives to
# generate examples for a different source language.
SRC_LANG = 'tdt'
SRC_NAME = 'Tetun'
# SRC_LANG = 'fra'
# SRC_NAME = 'French'
# SRC_LANG = "ind"
# SRC_NAME = "Indonesian"
TGT_LANG = 'eng'
TGT_NAME = 'English'

# Read explicitly as UTF-8 so non-ASCII dictionary entries decode the same way
# on every platform; newline="" lets the csv module handle line endings itself.
with open(
    f"selected_top10k_{SRC_LANG}_{TGT_LANG}.tsv",
    "r",
    encoding="utf-8",
    newline="",
) as f:
    reader = csv.DictReader(f, delimiter="\t")
    # only the first 50 entries
    rows = list(reader)[:50]

# remove duplicates while preserving order (a row is a duplicate when every
# column/value pair matches an earlier row)
seen = set()
unique_rows = []
for row in rows:
    row_key = tuple(row.items())
    if row_key not in seen:
        seen.add(row_key)
        unique_rows.append(row)
rows = unique_rows
print(f"Loaded {len(rows)} rows")

# fix the row where fra = 'enlise'  to eng = 'bog down'
if SRC_LANG == 'fra':
    for r in rows:
        if r['fra'] == 'enlise':
            r['eng'] = 'bog down'

# Notebook display: show the last loaded row as the cell output.
rows[-1]


In [19]:
# TODO: use SRC first, TGT second
# Prompt template, formatted in two stages:
#   1. This f-string fills in SRC_NAME / TGT_NAME immediately, when the cell runs.
#   2. The doubled braces around src_word / tgt_word come out of the f-string as
#      single-brace placeholders, which make_prompt() fills per entry via
#      str.format().
# The <example_sentence_pair> tags and the "{SRC_NAME}:" / "{TGT_NAME}:" labels
# requested here are what get_examples_for_entry() searches for in the response.
PROMPT = f'''You are assisting in the creation of a bilingual {SRC_NAME}-{TGT_NAME} dictionary. Your task is to generate example sentences for dictionary entries to help users understand the usage of words in context.

You will be provided with a {SRC_NAME} word and its {TGT_NAME} equivalent.
<{SRC_NAME} entry>
{{src_word}}
</{SRC_NAME} entry>

<{TGT_NAME} entry>
{{tgt_word}}
</{TGT_NAME} entry>

Please create a pair of example sentences for each entry. The sentences should be:
1. Typical: Show typical usage of the word
2. Informative: Add value by providing context or additional information
3. Intelligible: Be clear, concise, and appropriate for a general audience
4. Using the entries provided above (the {SRC_NAME} and {TGT_NAME} words)

Format your response as follows:

<example_sentence_pair>
{SRC_NAME}: [Your {SRC_NAME} sentence here]
{TGT_NAME}: [Your {TGT_NAME} sentence here]
</example_sentence_pair>

Please provide your example sentences based on the given {SRC_NAME} and {TGT_NAME} entries.'''

In [20]:

def make_prompt(entry):
    """Return the PROMPT template filled in for one dictionary entry.

    ``entry`` is indexed with SRC_LANG and TGT_LANG to obtain the source and
    target words substituted into the template's two placeholders.
    """
    source_word = entry[SRC_LANG]
    target_word = entry[TGT_LANG]
    return PROMPT.format(src_word=source_word, tgt_word=target_word)

def _parse_example_pair(message):
    """Extract the (source, target) sentences from a raw model response.

    Looks for the first ``<example_sentence_pair>`` block, then for the
    ``"{SRC_NAME}:"`` label followed by the ``"{TGT_NAME}:"`` label inside it.
    A missing closing tag is tolerated: everything after the opening tag is
    then treated as the block.

    Raises:
        ValueError: if the opening tag or either label is missing. The raw
            response is included in the message so the failure can be
            diagnosed.
    """
    open_tag = '<example_sentence_pair>'
    close_tag = '</example_sentence_pair>'

    _, found_open, after_open = message.partition(open_tag)
    if not found_open:
        raise ValueError(f"No {open_tag} block in model response: {message!r}")
    example_text = after_open.partition(close_tag)[0].strip()

    # The source sentence runs from its label up to the target label; the
    # target sentence is whatever follows the target label.
    _, found_src, after_src = example_text.partition(f'{SRC_NAME}:')
    src_part, found_tgt, tgt_part = after_src.partition(f'{TGT_NAME}:')
    if not (found_src and found_tgt):
        raise ValueError(
            f"Expected '{SRC_NAME}:' followed by '{TGT_NAME}:' "
            f"in model response: {message!r}"
        )
    return src_part.strip(), tgt_part.strip()


def get_examples_for_entry(entry):
    """Ask the configured model for an example sentence pair for ``entry``.

    Builds the prompt with ``make_prompt``, sends it to the backend selected
    by the module-level ``MODEL`` (with temperature 0), and parses the reply.

    Args:
        entry: a dictionary row as accepted by ``make_prompt``.

    Returns:
        A ``(src_sentence, tgt_sentence)`` tuple of stripped strings.

    Raises:
        ValueError: if ``MODEL`` is not one of "claude", "llama" or "gpt4",
            or if the response does not contain the expected tag and labels.
    """
    prompt = make_prompt(entry)

    if MODEL == "claude":
        message = CLIENT.messages.create(
            model=MODEL_NAME,
            max_tokens=4096,
            messages=[
                {
                    "role": "user",
                    "content": prompt
                },
            ],
            temperature=0
        ).content[0].text
    elif MODEL == "llama":
        # Named so it does not shadow the builtin ``input``.
        replicate_input = {
            "prompt": prompt,
            "max_tokens": 4096,
            "temperature": 0.0,
        }

        # The pieces returned by replicate.run are joined into one string.
        message = replicate.run(MODEL_NAME, input=replicate_input)
        message = "".join(message).strip()
    elif MODEL == "gpt4":
        chat_completion = OPENAI_CLIENT.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": prompt
                }
            ],
            model=MODEL_NAME,
            temperature=0.0,
        )
        message = chat_completion.choices[0].message.content
    else:
        raise ValueError("Model not found")

    # return both sentences in a tuple
    return _parse_example_pair(message)

In [None]:
print(f"Starting with model {MODEL} on {SRC_LANG} to {TGT_LANG} examples")

# Keys under which the generated source/target sentences are stored on a row.
src_example_key = f'{EXAMPLE_KEY}_src'
tgt_example_key = f'{EXAMPLE_KEY}_tgt'

for entry in rows:
    # Rows that already carry a source example are left alone, so re-running
    # this cell only queries the model for the remaining rows.
    if src_example_key not in entry:
        src_example, tgt_example = get_examples_for_entry(entry)
        entry[src_example_key] = src_example
        entry[tgt_example_key] = tgt_example
        print(entry)

In [22]:
# Write every row, including the generated example columns, to a TSV file.
# newline="" stops the csv writer's own line terminators from being translated
# again by the text layer; UTF-8 is set explicitly so non-ASCII sentences are
# written the same way on every platform.
with open(
    f"select_examples_{MODEL}_{SRC_LANG}_{TGT_LANG}.tsv",
    "w",
    encoding="utf-8",
    newline="",
) as f:
    # Column order is taken from the keys of the first row.
    writer = csv.DictWriter(f, fieldnames=rows[0].keys(), delimiter="\t")
    writer.writeheader()
    writer.writerows(rows)

In [9]:
!open .