In [2]:
from dotenv import load_dotenv

load_dotenv()

import csv
import random
import glob

from tqdm import tqdm
from openai import OpenAI

OPENAI_CLIENT = OpenAI()

## Experiment setup

In [None]:

SRC_LANG = 'ind'
SRC_NAME = 'French'
TGT_LANG = 'eng'
TGT_NAME = 'English'


# ANNOTATOR = 'A1'
# RATINGS_FOR_MODEL = 'llama'
ANNOTATOR = ''
RATINGS_FOR_MODEL = ''

metrics = 'Typical	Informative	Intelligible	Translation correct'
metrics = metrics.split('\t')

def remap_keys(row, model):
    # remap keys for examples
    row['src_example'] = row[f'example.{model}_src']
    row['tgt_example'] = row[f'example.{model}_tgt']
    # remap SRC_LANG to 'src_word'
    row['src_word'] = row[SRC_LANG]
    row['tgt_word'] = row[TGT_LANG]
    # convert rating to int
    row['Overall rating'] = int(row['Overall rating'][0])
    row = {k: v for k, v in row.items() if k in ['src_example', 'tgt_example', 'src_word', 'tgt_word', 'Overall rating']}
    return row

def average_out_ratings(rows):
    # fow rows that share the same src_example and tgt_example, average out the ratings
    examples_to_ratings = {}
    for row in rows:
        key = (row['src_example'], row['tgt_example'], row['src_word'], row['tgt_word'])
        if key not in examples_to_ratings:
            examples_to_ratings[key] = []
        examples_to_ratings[key].append(row['Overall rating'])
    # average out ratings
    for key, ratings in examples_to_ratings.items():
        examples_to_ratings[key] = sum(ratings) / len(ratings)
    # convert to list
    examples_to_ratings = [{'src_example': k[0], 'tgt_example': k[1], 'src_word': k[2], 'tgt_word': k[3], 'Overall rating': v} for k, v in examples_to_ratings.items()]
    return examples_to_ratings

def get_rated_examples_model(lang, model):
    files = f'select_examples_{model}_{lang}_eng_rated_*.tsv'
    files = glob.glob(files)
    print(f"Found {len(files)} files")
    rows = []
    for file in files:
        with open(file) as f:
            reader = csv.DictReader(f, delimiter='\t')
            for row in reader:
                if not row[SRC_LANG]:
                    continue
                row = remap_keys(row, model)
                rows.append(row)
    examples_to_ratings = average_out_ratings(rows)
    return examples_to_ratings

averaged_rows = get_rated_examples_model(SRC_LANG, 'llama')
print(f"Total of {len(averaged_rows)} examples with averaged ratings")
random.choice(averaged_rows)


## Per model

In [None]:
from utils import get_relevant_files, extract_rows

def get_model_rows(model):
    files = get_relevant_files(SRC_LANG, ratings_for_model=model, annotator='')
    rows = extract_rows(files)
    for r in rows:
        r['Overall rating'] = int(r['Overall rating'][0])
    return rows

for model in ['llama', 'gpt4']:
    rows = get_model_rows(model)
    avg = sum(r['Overall rating'] for r in rows) / len(rows)
    print(f"Model {model} has {len(rows)} examples with average rating {avg:.2f}")

## Agreement


### Between humans

between humans

In [None]:
import pandas as pd
import glob
from sklearn.metrics import cohen_kappa_score
from scipy.stats import pearsonr, ttest_rel


LANG = f"{SRC_LANG}_{TGT_LANG}"
ANNOTATORS = ['A1', 'A2']

from utils import get_relevant_files, extract_rows

def get_annotator_rows(annotator):
    files = get_relevant_files(SRC_LANG, ratings_for_model=RATINGS_FOR_MODEL, annotator=annotator)
    rows = extract_rows(files)
    for r in rows:
        r['Overall rating'] = int(r['Overall rating'][0])
    return rows

rows_1 = get_annotator_rows(ANNOTATORS[0])
rating1 = [r['Overall rating'] for r in rows_1]
rows_2 = get_annotator_rows(ANNOTATORS[1])
rating2 = [r['Overall rating'] for r in rows_2]

def print_mean_and_ttest():
    print(f"Average overall rating for {ANNOTATORS[0]}: {sum(rating1) / len(rating1):.2f}")
    print(f"Standard deviation for {ANNOTATORS[0]}: {pd.Series(rating1).std():.2f}")
    print(f"Average overall rating for {ANNOTATORS[1]}: {sum(rating2) / len(rating2):.2f}")
    print(f"Standard deviation for {ANNOTATORS[1]}: {pd.Series(rating2).std():.2f}")
    t_statistic, p_value = ttest_rel(rating1, rating2)
    print(f"t-statistic: {t_statistic:.3f}, p-value: {p_value:.3f}")

print_mean_and_ttest()

print(f"Working with a total of {len(rows_1)} and {len(rows_2)} rows")

def rating_to_bucket(rating: int):
    if rating in [1, 2]:
        return 'Bad'
    if rating == 3:
        return 'Neutral'
    if rating in [4, 5]:
        return 'Good'

def calculate_overall_rating_agreement_buckets():
    rating1 = [rating_to_bucket(r) for r in rating1]
    rating2 = [rating_to_bucket(r) for r in rating2]
    return cohen_kappa_score(rating1, rating2)


def calculate_overall_rating_agreement():
    cohenkappa = cohen_kappa_score(rating1, rating2, weights='quadratic')
    print(f"Cohen's Kappa: {cohenkappa:.3f}")

    # pearsonr
    p = pearsonr(rating1, rating2)
    print(f"Pearson's r: {p[0]:.3f}, p-value: {p[1]:.3f}")

calculate_overall_rating_agreement()
# TMP: use buckets instead
# print(calculate_overall_rating_agreement_buckets(rows_1, rows_2))


#### Calculate average over different metrics



In [None]:
from scipy.stats import pearsonr


metrics = 'Typical	Informative	Intelligible	Translation correct'
metrics = metrics.split('\t')

def map_rating_to_int(rating):
    if rating == 'Yes':
        return 1
    if rating == 'No':
        return 0
    return 0.5

def get_values_for_metric_as_list(metric, rows_1, rows_2=None, map_to_int=True):
    values = [r[metric] for r in rows_1]
    if rows_2:
        values.extend([r[metric] for r in rows_2])
    # map 'Yes' to 1 and 'No' to 0, 'Somewhat' to 0.5
    if map_to_int:
        values = [map_rating_to_int(v) for v in values]
    values = [v for v in values if v is not None]
    return values

print(f"Calculating metrics for model {RATINGS_FOR_MODEL} and lang {LANG}")
print()
for metric in metrics:
    values = get_values_for_metric_as_list(metric, rows_1, rows_2)
    # calculate average
    average = sum(values) / len(values)
    print(f"Metric {metric} average: {average:.3f} over {len(values)} ratings")
    # calculate correlation with pearsonr


# overall rating: just canculate the avg
metric = 'Overall rating'
values = get_values_for_metric_as_list(metric, rows_1, rows_2, map_to_int=False)
average = sum(values) / len(values)
print(f"Metric {metric} average: {average:.3f} over {len(values)} ratings")

### Calculate significance

In [None]:
import pandas as pd
import numpy as np
import glob
from sklearn.metrics import cohen_kappa_score
from scipy.stats import pearsonr
from scipy import stats

LANG = f"{SRC_LANG}_{TGT_LANG}"
MODELS = ['llama', 'gpt4']

ANNOTATORS = ['A1', 'A2']

from utils import get_relevant_files, extract_rows

def get_model_rows(lang, model):
    files = get_relevant_files(lang, ratings_for_model=model, annotator='')
    rows = extract_rows(files)
    for r in rows:
        r['Overall rating'] = int(r['Overall rating'][0])
    return rows

def calculate_rating_statistics(model_rows):
    for model in ['gpt4', 'llama']:
        ratings = [r['Overall rating'] for r in model_rows[model]]
        
        # Calculate basic statistics
        mean = np.mean(ratings)
        std = np.std(ratings)
        
        print(f"\n{model.upper()} Statistics:")
        print(f"Mean: {mean:.3f}")
        print(f"Standard Deviation: {std:.3f}")

def calculate_ttest(model_rows):
    gpt4_ratings = [r['Overall rating'] for r in model_rows['gpt4']]
    llama_ratings = [r['Overall rating'] for r in model_rows['llama']]

    # Paired t-test
    t_statistic, p_value = stats.ttest_rel(gpt4_ratings, llama_ratings)
    print(f"\nPaired t-test: t-statistic = {t_statistic:.4f}, p-value = {p_value:.4f}")
    
    # Effect size (Cohen's d for paired samples)
    d = (np.mean(gpt4_ratings) - np.mean(llama_ratings)) / np.std(np.array(gpt4_ratings) - np.array(llama_ratings))
    print(f"Cohen's d effect size: {d:.4f}")

for lang in ['fra', 'ind', 'tdt']:
    print(f"\n{'='*50}")
    print(f"Analysis for {lang}")
    print(f"{'='*50}")
    model_rows = {model: get_model_rows(lang, model) for model in MODELS}
    calculate_rating_statistics(model_rows)
    calculate_ttest(model_rows)

#### Bar chart

In [None]:
from utils import get_relevant_files, extract_rows

files = get_relevant_files(SRC_LANG, RATINGS_FOR_MODEL, ANNOTATOR)
rows = extract_rows(files)

values = {}
for metric in metrics:
    # values can be Yes, Somewhat, No
    values[metric] = get_values_for_metric_as_list(metric, rows, rows_2=None, map_to_int=False)

# stacked bar plot for each metric
import matplotlib.pyplot as plt
import numpy as np

# Prepare data for stacking
yes_counts = [v.count('Yes') for v in values.values()]
somewhat_counts = [v.count('Somewhat') for v in values.values()]
no_counts = [v.count('No') for v in values.values()]

def print_percent_per_metric():
    print(SRC_LANG)
    for metric, yes, somewhat, no in zip(metrics, yes_counts, somewhat_counts, no_counts):
        total = yes + somewhat + no
        print(f"{metric}: {yes_counts} yes, {somewhat_counts} somewhat, {no_counts} no")
        # print(f"{metric}: {yes/total*100:.2f}% yes, {somewhat/total*100:.2f}% somewhat, {no/total*100:.2f}% no")
print_percent_per_metric()

# Set the positions of the bars
bar_width = 0.5
indices = np.arange(len(metrics))

# Plot each stack
plt.bar(indices, yes_counts, bar_width, label='Yes', color='lightgreen', edgecolor='black')
plt.bar(indices, somewhat_counts, bar_width, bottom=yes_counts, label='Somewhat', color='lightyellow', edgecolor='black')
plt.bar(indices, no_counts, bar_width, bottom=np.array(yes_counts) + np.array(somewhat_counts), label='No', color='lightcoral', edgecolor='black')

# Labels and title
plt.xticks(indices, [metric if metric != "Translation correct" else "Trans. correct" for metric in metrics], fontsize=18)
plt.yticks([0, 50, 100, 150, 200], rotation=90, fontsize=18)  # Show y ticks oriented vertically with specific values
# plt.xlabel('Metrics')
plt.ylabel('Count')
# plt.title(f'{SRC_NAME} metric distribution')
plt.legend(loc='lower right', fontsize=12)

# Show the plot
plt.tight_layout()
plt.savefig(f'{SRC_LANG} metric distribution.png')
plt.show()

#### Calculate correlation over metrics

In [None]:
from scipy.stats import pearsonr
import numpy as np
import krippendorff


metrics = 'Typical	Informative	Intelligible	Translation correct'
metrics = metrics.split('\t')

print(f"Language: {LANG}")

for metric in metrics:
    rows_1_metric = [map_rating_to_int(r[metric]) for r in rows_1]
    rows_2_metric = [map_rating_to_int(r[metric]) for r in rows_2]
    p = pearsonr(rows_1_metric, rows_2_metric)
    k = krippendorff.alpha(reliability_data=[rows_1_metric, rows_2_metric], level_of_measurement='interval')
    print(f"{metric} Pearson's r: {p[0]:.3f}, p-value: {p[1]:.3f}; Krippendorff's alpha: {k:.3f}")

metric = 'Overall rating'
rows_1_metric = [r[metric] for r in rows_1]
rows_2_metric = [r[metric] for r in rows_2]
p = pearsonr(rows_1_metric, rows_2_metric)
krippendorff.alpha(reliability_data=[rows_1_metric, rows_2_metric], level_of_measurement='interval')
print(f"{metric} Pearson's r: {p[0]:.3f}, p-value: {p[1]:.3f}; Krippendorff's alpha: {k:.3f}")

In [None]:
"""
Language: fra_eng
Typical Pearson's r: 0.386, p-value: 0.000; Krippendorff's alpha: 0.378
Informative Pearson's r: -0.052, p-value: 0.610; Krippendorff's alpha: -0.047
Intelligible Pearson's r: 0.263, p-value: 0.008; Krippendorff's alpha: 0.264
Translation correct Pearson's r: 0.146, p-value: 0.146; Krippendorff's alpha: 0.136
Overall rating Pearson's r: 0.322, p-value: 0.001; Krippendorff's alpha: 0.136

Language: ind_eng
Typical Pearson's r: 0.518, p-value: 0.000; Krippendorff's alpha: 0.517
Informative Pearson's r: 0.187, p-value: 0.062; Krippendorff's alpha: -0.198
Intelligible Pearson's r: -0.040, p-value: 0.690; Krippendorff's alpha: -0.036
Translation correct Pearson's r: -0.084, p-value: 0.407; Krippendorff's alpha: -0.083
Overall rating Pearson's r: 0.071, p-value: 0.482; Krippendorff's alpha: -0.083

Language: tdt_eng
Typical Pearson's r: 0.585, p-value: 0.000; Krippendorff's alpha: 0.548
Informative Pearson's r: 0.479, p-value: 0.000; Krippendorff's alpha: 0.449
Intelligible Pearson's r: 0.598, p-value: 0.000; Krippendorff's alpha: 0.519
Translation correct Pearson's r: 0.529, p-value: 0.000; Krippendorff's alpha: 0.529
Overall rating Pearson's r: 0.573, p-value: 0.000; Krippendorff's alpha: 0.529
"""