# Manually generating in-context examples for aligning LLMs with human ratings

- Get ~100 ratings from an annotator (see `get_relevant_files`)
- Select 10 ratings as in-context examples. Ask an LLM to produce the reasoning behind each rating (gpt-4o is named here, but note that the code below calls `lm`, which is built from `RATER_MODEL`).
- Inject those 10 ratings and their reasoning into the prompt
- Now get the LLM to rate the remaining ~90 rows (rows sharing word entries with the 10 examples are excluded too)
- Calculate the correlation with the human ratings using `pearsonr`

In [1]:
from dotenv import load_dotenv

load_dotenv()

import csv
import random


In [3]:
# Language pair: short code (also used as the row key holding the dictionary
# entry) and the display name interpolated into the prompt.
SRC_LANG, SRC_NAME = 'ind', 'Indonesian'
TGT_LANG, TGT_NAME = 'eng', 'English'

# Selectors forwarded to `get_relevant_files` when locating the rating files.
ANNOTATOR, RATINGS_FOR_MODEL = 'A2', ''

# Model identifier passed to the LM client and embedded in output file names.
RATER_MODEL = 'gemini-1.5-pro'

In [4]:
import dspy

# LM client built from RATER_MODEL. Further down it is called as
# `lm(prompt)[0]` to obtain both the reasoning texts and the ratings.
lm = dspy.Google(model=RATER_MODEL)

### Get data

In [None]:
from utils import get_relevant_files, extract_rows, remap_columns
# Locate the rating files for this language/annotator and keep only the rows
# that have a non-empty source-language entry.
files = get_relevant_files(SRC_LANG, RATINGS_FOR_MODEL, ANNOTATOR)
rows = [entry for entry in extract_rows(files) if entry[SRC_LANG]]

# The return value of `remap_columns` is ignored, so it presumably renames the
# columns of each row in place -- confirm against utils.
for r in rows:
    remap_columns(r)

# Quick sanity check: row count plus one randomly chosen row.
print(f"Total of {len(rows)} examples")
print(random.sample(rows, 1))


In [None]:
# Fixed seed so the shuffle and the choice of in-context rows are reproducible.
random.seed(42)
random.shuffle(rows)
# `sample` holds the rows injected into the prompt as in-context examples;
# the rest will be used for evaluation.
sample = random.sample(rows, 10)

# Double-quoted f-string: reusing the single quote inside the replacement
# field is a SyntaxError on Python versions before 3.12.
print(f"ratings in sample: {[r['Overall rating'] for r in sample]}")

### Get LLM reasoning for rating of the rows in `sample`

Output: prompt with in-context examples `AUGMENTED_SYSTEM_PROMPT`

In [7]:
# Instruction header shared by every prompt: states the task and the four
# rating criteria. It ends with a blank line so that the per-row templates can
# be appended directly.
SYSTEM = f'''You are assisting in the creation of a bilingual {TGT_NAME}-{SRC_NAME} dictionary.
Your task is to rate a candidate sentence pair that illustrates dictionary entries to help linguists select an appropriate example pair.

Example sentences should be:
1. Typical: Show typical usage of the word
2. Informative: Add value by providing context or additional information
3. Intelligible: Be clear, concise, and appropriate for a general audience
4. Translation correct: Are sentences a good translation of each other, with fluent grammar and correct usage of words in both languages

You are rating the example sentences, not the dictionary entries.\n\n'''

# Prompt body used to elicit a rationale: shows one annotated row (entries,
# example sentences, the annotator's comment and the four criterion ratings)
# and asks for a one-paragraph explanation of those ratings.
# Placeholders are filled by `get_templated_row`.
TEMPLATE_GET_REASONING = """<example>
Src Entry: {src_entry}
Tgt Entry: {tgt_entry}
Src Example: {src_example}
Tgt Example: {tgt_example}

Comment: {comment}
Typical: {typical}
Informative: {informative}
Intelligible: {intelligible}
Translation correct: {translation_correct}
</example>

Reasoning: what is the reasoning for the above ratings? Give your response in one paragraph.
"""

def get_templated_row(row):
    """Render TEMPLATE_GET_REASONING for one annotated row.

    `row` must provide the source/target entries (keyed by SRC_LANG and
    TGT_LANG), both example sentences, the annotator comment and the four
    criterion ratings; a missing key raises KeyError.
    """
    fields = {
        'src_entry': row[SRC_LANG],
        'tgt_entry': row[TGT_LANG],
        'src_example': row['src_example'],
        'tgt_example': row['tgt_example'],
        'comment': row['Comment'],
        'typical': row['Typical'],
        'informative': row['Informative'],
        'intelligible': row['Intelligible'],
        'translation_correct': row['Translation correct'],
    }
    return TEMPLATE_GET_REASONING.format(**fields)


In [None]:
from openai import OpenAI
from tqdm import tqdm

# NOTE(review): this client is not referenced anywhere in the visible code --
# the reasoning below is produced through `lm`. Confirm whether it is still
# needed.
OPENAI_CLIENT = OpenAI()

def get_reasoning(row):
    """Ask the LM to explain the annotator's ratings for `row`.

    The prompt is SYSTEM followed by the row rendered with
    `get_templated_row`; the first completion returned by `lm` is used.
    """
    completions = lm(SYSTEM + get_templated_row(row))
    return completions[0]

# Attach an LM-written rationale to each in-context row (one LM call per row);
# it is stored under 'reasoning' and later rendered into the prompt.
for row in tqdm(sample, desc="Generating reasoning"):
    row['reasoning'] = get_reasoning(row)

In [None]:
# One fully worked in-context example: the row's data, the LM-generated
# reasoning and the annotator's overall rating, each in its own tag.
TEMPLATE_EXAMPLE = """<example>
<data>
Src Entry: {src_entry}
Tgt Entry: {tgt_entry}
Src Example: {src_example}
Tgt Example: {tgt_example}
</data>
<reasoning>{reasoning}</reasoning>
<rating>{rating}</rating>
</example>"""

# The query appended after the examples: only the <data> section is given, so
# the model is left to continue with <reasoning> and <rating> as in the
# examples (this is what `extract_rating` parses).
TEMPLATE_ASK_FOR_RATING = """
<data>
Src Entry: {src_entry}
Tgt Entry: {tgt_entry}
Src Example: {src_example}
Tgt Example: {tgt_example}
</data>"""

def get_templated_example(row):
    """Render TEMPLATE_EXAMPLE for one in-context row.

    Besides the entries and example sentences, `row` must already carry
    'reasoning' and 'Overall rating'; a missing key raises KeyError.
    """
    fields = {
        'src_entry': row[SRC_LANG],
        'tgt_entry': row[TGT_LANG],
        'src_example': row['src_example'],
        'tgt_example': row['tgt_example'],
        'reasoning': row['reasoning'],
        'rating': row['Overall rating'],
    }
    return TEMPLATE_EXAMPLE.format(**fields)



# Final few-shot prompt: the instruction header followed by every in-context
# example, each terminated by a blank line.
AUGMENTED_SYSTEM_PROMPT = SYSTEM + ''.join(
    get_templated_example(row) + '\n\n' for row in sample
)

print(AUGMENTED_SYSTEM_PROMPT)


### Rate the remaining rows using ICL

In [None]:

def templated_ask_for_rating(row):
    """Render TEMPLATE_ASK_FOR_RATING (the unanswered <data> section) for `row`."""
    fields = {
        'src_entry': row[SRC_LANG],
        'tgt_entry': row[TGT_LANG],
        'src_example': row['src_example'],
        'tgt_example': row['tgt_example'],
    }
    return TEMPLATE_ASK_FOR_RATING.format(**fields)

# Preview the rendered query for one arbitrary row.
print(templated_ask_for_rating(rows[5]))

In [None]:
def row_in_sample(row):
    """Return True when `row` shares both word entries with an in-context row.

    Matching is done on the (source entry, target entry) pair rather than on
    row identity, so this filters out more than the 10 sampled rows. That is
    intended: rows with the same word entries as those in the prompt should
    not be re-used for evaluation.
    """
    return any(
        picked[SRC_LANG] == row[SRC_LANG] and picked[TGT_LANG] == row[TGT_LANG]
        for picked in sample
    )

def templated_ask_for_rating(row):
    """Render TEMPLATE_ASK_FOR_RATING (the unanswered <data> section) for `row`.

    NOTE(review): this repeats the definition from the preceding cell; one of
    the two can likely be dropped.
    """
    fields = {
        'src_entry': row[SRC_LANG],
        'tgt_entry': row[TGT_LANG],
        'src_example': row['src_example'],
        'tgt_example': row['tgt_example'],
    }
    return TEMPLATE_ASK_FOR_RATING.format(**fields)

def get_rating(row):
    """Return the LM's raw response (first completion) when asked to rate `row`.

    The prompt is the few-shot AUGMENTED_SYSTEM_PROMPT followed by the row's
    <data> section.
    """
    completions = lm(AUGMENTED_SYSTEM_PROMPT + templated_ask_for_rating(row))
    return completions[0]

def extract_rating(text):
    """Parse the integer rating out of a raw model response.

    The response is expected to look like
    ``<reasoning>...</reasoning>\\n<rating>N</rating>``. The rating is the
    first character following the ``<rating>`` tag (surrounding whitespace is
    ignored). When a ``</reasoning>`` tag is present, only the text after it
    is searched; when the model omitted the reasoning block, the whole
    response is searched instead of failing.

    Args:
        text: Raw completion returned by the model.

    Returns:
        The rating as an int.

    Raises:
        ValueError: If no ``<rating>`` tag followed by a digit can be found.
    """
    # Keep the part after '</reasoning>' when there is one, otherwise fall
    # back to the full response.
    _, reasoning_close, tail = text.partition('</reasoning>')
    if not reasoning_close:
        tail = text

    # Keep the part after '<rating>'; this is empty when the tag is missing.
    _, _, after_tag = tail.partition('<rating>')
    try:
        return int(after_tag.strip()[:1])
    except ValueError:
        raise ValueError(
            f"Could not find a numeric <rating> in model output: {text!r}"
        ) from None

# Evaluation set: every row whose entry pair is not used by an in-context example.
devset = [r for r in rows if not row_in_sample(r)]

# Only rows that still lack a parsed rating are processed, so this cell can be
# re-run after an interruption. The filter is on 'rating_pred' rather than
# 'pred': a row whose response was stored but failed to parse would otherwise
# be skipped forever and break the correlation step with a KeyError.
for row in tqdm([r for r in devset if 'rating_pred' not in r], desc="Rating examples"):
    # Re-use a response saved by an earlier run instead of calling the LM again.
    if 'pred' not in row:
        row['pred'] = get_rating(row)
    row['rating_pred'] = extract_rating(row['pred'])

In [None]:
# save LLM ratings to a file

# newline='' is what the csv module requires for files it writes: without it,
# platforms that translate '\n' would corrupt the row terminators and the
# line breaks embedded in the multi-line 'pred' field.
with open(f'rating_pred_{ANNOTATOR}_{RATER_MODEL}_{SRC_LANG}_{TGT_LANG}.tsv', 'w', newline='') as f:
    # The column set is taken from the first row; DictWriter raises ValueError
    # if a later row carries a key that is not in it.
    writer = csv.DictWriter(f, fieldnames=devset[0].keys(), delimiter='\t')
    writer.writeheader()
    writer.writerows(devset)

print(f"Saved ratings to rating_pred_{ANNOTATOR}_{RATER_MODEL}_{SRC_LANG}_{TGT_LANG}.tsv")

In [31]:
# Turn string reference ratings into ints by keeping only their first
# character. Only the first row's type is inspected to decide whether the
# whole devset needs converting.
if isinstance(devset[0]['Overall rating'], str):
    for row in devset:
        row['Overall rating'] = int(row['Overall rating'][0])

In [None]:
from scipy.stats import pearsonr


def get_preds(rows):
    """Return the model-predicted rating ('rating_pred') of every row, in order."""
    return [record['rating_pred'] for record in rows]

def get_refs(rows):
    """Return the human reference rating ('Overall rating') of every row, in order.

    Values are returned as stored; no conversion to int happens here.
    """
    return [record['Overall rating'] for record in rows]

# Pearson correlation (and its p-value) between the human reference ratings
# and the LM-predicted ratings over the evaluation set.
corel, pvalue = pearsonr(get_refs(devset), get_preds(devset))
print(f"Pearson correlation: {corel:.3f}, p-value: {pvalue:.3f}")

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Reload the saved predictions so the plot can be produced without re-rating.
# newline='' lets the csv module handle line endings itself, which matters
# because the quoted 'pred' field contains embedded line breaks.
with open(f'rating_pred_{ANNOTATOR}_{RATER_MODEL}_{SRC_LANG}_{TGT_LANG}.tsv', newline='') as f:
    reader = csv.DictReader(f, delimiter='\t')
    # Every value comes back as a string here, including the two rating columns.
    devset = list(reader)

def convert_rating_int(rows):
    """Convert both rating columns of every row to int, in place.

    Only the first character of each string value is kept, so any text that
    follows the leading digit is discarded. Returns None.
    """
    for record in rows:
        for column in ('Overall rating', 'rating_pred'):
            record[column] = int(record[column][0])

# Rows read back through csv.DictReader hold strings, so convert the rating
# columns to ints (the check looks at the first row only).
if isinstance(devset[0]['Overall rating'], str):
    convert_rating_int(devset)

import pandas as pd
# Count how often each (reference, prediction) pair occurs, so that marker
# size can encode the frequency.
df = pd.DataFrame(devset)
df_grouped = df.groupby(['Overall rating', 'rating_pred']).size().reset_index(name='counts')

# One bubble per distinct pair; its size is proportional to the pair's count.
plt.scatter(df_grouped['Overall rating'], df_grouped['rating_pred'], s=df_grouped['counts']*100, color='blue', alpha=0.7)

plt.xlim(1, 5.5)
plt.ylim(1, 5.5)

plt.xlabel('Reference')
plt.ylabel('Prediction')
# The title names the configured model rather than a hard-coded one, so it
# stays correct when RATER_MODEL is changed (as the file names already do).
plt.title(f'Scatter plot of {RATER_MODEL}-predicted vs {ANNOTATOR}-reference ratings ({SRC_LANG})')

# Save before show(): some backends clear the figure once it is displayed.
plt.savefig(f'rating_pred_{ANNOTATOR}_{RATER_MODEL}_{SRC_LANG}_{TGT_LANG}.png')
plt.show()
