# Pythia Evaluation

In [None]:
!pip install transformers jsonlines datasets evaluate

In [None]:
from preplexity_local import *

def test_prep_score(sentences: list[tuple[str, str]], size: str, steps_to_test: list[str]):
    """Measure how often a Pythia checkpoint scores the right sentence below the wrong one.

    For every checkpoint in ``steps_to_test``, each ``(right, wrong)`` pair is passed to
    ``Perplexity._compute`` for the model ``EleutherAI/pythia-{size}-deduped`` at that
    revision. A pair counts as a success when the right sentence gets a strictly lower
    score than the wrong one.

    Args:
        sentences: ``(right_sentence, wrong_sentence)`` pairs to compare.
        size: Model size tag interpolated into the model id (e.g. ``"70m"``).
        steps_to_test: Checkpoint revisions to evaluate (e.g. ``"step1"``).

    Returns:
        A dict mapping each step to its success rate in percent. When ``sentences``
        is empty the rate is undefined, so every step maps to ``NaN``; when
        ``steps_to_test`` is empty the dict is empty.
    """
    if not sentences:
        # Nothing to score: report NaN rather than dividing by zero below.
        print("(*) No sentence pairs to score; success rate is undefined")
        return {step: float("nan") for step in steps_to_test}

    model_id = f'EleutherAI/pythia-{size}-deduped'
    success_rates = {}
    for step in steps_to_test:
        print(f"(*) Calculating PLL score for step {step}")

        success_count = 0
        _perplexity = Perplexity()
        for right_sentence, wrong_sentence in sentences:
            right_score, wrong_score = _perplexity._compute(
                model_id=model_id,
                revision=step,
                add_start_token=False,
                predictions=[right_sentence, wrong_sentence],
            )['perplexities']
            if right_score < wrong_score:
                success_count += 1

        success_rates[step] = success_count / len(sentences) * 100
        # Report inside the loop so every step is shown, not just the last one.
        print(f"\t(*) Success rate: {success_rates[step]}%")

    return success_rates


# Despite the ``test_`` prefix this is an evaluation helper, not a pytest test case;
# this keeps pytest from trying to collect it.
test_prep_score.__test__ = False



In [None]:
from pathlib import Path
import jsonlines

def parse_gordon_questions_file(file_path: Path):
    """Read a Gordon-style JSONL question file and bucket its sentence pairs by condition.

    Each record contributes one ``(right, wrong)`` pair, built by appending the
    record's ``true_question`` / ``false_question`` to its ``sentence``. The pair is
    filed under both the record's ``condition_sentence`` and ``condition_distractor``.

    Args:
        file_path: Path of the JSONL file to read.

    Returns:
        A dict mapping each condition name to a list of ``(right, wrong)`` pairs.

    Raises:
        KeyError: If a record lacks an expected field or names a condition that is
            not one of the predefined buckets.
    """
    bucket_names = (
        'subject_extracted',
        'definite_description',
        'object_extracted',
        'indexical_pronoun',
        'name',
    )
    questions = {bucket: [] for bucket in bucket_names}

    with jsonlines.open(file_path) as reader:
        for record in reader:
            context = record['sentence']
            pair = (
                f"{context} {record['true_question']}",
                f"{context} {record['false_question']}",
            )

            sentence_bucket = record["condition_sentence"]
            distractor_bucket = record["condition_distractor"]
            # Still read so a missing field fails loudly, but not bucketed:
            # the question condition is not analysed for now.
            _question_bucket = record["condition_question"]

            for bucket in (sentence_bucket, distractor_bucket):
                questions[bucket].append(pair)

    return questions

def parse_naama_questions_file(file_path: Path):
    """Read a Naama-style JSONL question file and bucket its sentence pairs.

    Each record contributes one ``(right, wrong)`` pair of the form
    ``"<sentence> Therefore, <question>"``. The pair is filed under both the
    record's ``dependency`` value and its ``animacy`` value.

    Args:
        file_path: Path of the JSONL file to read.

    Returns:
        A dict with the keys ``'S-V'``, ``'F-G'``, ``'animate'`` and ``'inanimate'``,
        each mapping to a list of ``(right, wrong)`` pairs.

    Raises:
        KeyError: If a record lacks an expected field or carries a ``dependency`` /
            ``animacy`` value outside the four predefined buckets.
    """
    questions = {bucket: [] for bucket in ('S-V', 'F-G', 'animate', 'inanimate')}

    with jsonlines.open(file_path) as reader:
        for record in reader:
            dependency_bucket = record['dependency']
            animacy_bucket = record['animacy']

            premise = record['sentence']
            pair = (
                f"{premise} Therefore, {record['true_question']}",
                f"{premise} Therefore, {record['false_question']}",
            )

            for bucket in (dependency_bucket, animacy_bucket):
                questions[bucket].append(pair)

    return questions

In [None]:
import seaborn as sns
import pandas as pd


def plot_success_rates(rate_per_size: dict[str, dict[int, float]]):
    """Draw a seaborn line plot of success rate against checkpoint step.

    The nested mapping is flattened into one row per (size, step) combination and
    plotted with the step on the X axis, the success rate on the Y axis and one
    coloured line per model size.

    Args:
        rate_per_size: Mapping of model size to a mapping of step to success rate.
    """
    column_names = ["size", "step", "rate"]

    # One record per (size, step) combination, in the mapping's iteration order.
    records = [
        {"size": model_size, "step": checkpoint, "rate": success_rate}
        for model_size, per_step in rate_per_size.items()
        for checkpoint, success_rate in per_step.items()
    ]
    frame = pd.DataFrame(records, columns=column_names)

    sns.lineplot(data=frame, x="step", y="rate", hue="size")

In [None]:
# Model size tags; each one is interpolated into the `EleutherAI/pythia-{size}-deduped` model id.
model_size_list = [
    "70m",
    "410m",
    "1B",
]

# Checkpoint revisions evaluated for every model size.
checkpoint_step_list = [
    "step1",
    "step35000",
    "step75000",
    "step110000",
    "step143000",
]

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so results can be written to it
drive.mount('/content/drive')

# Create the results folder inside "My Drive" (`-p`: no error if it already exists);
# the Gordon results CSV is written into this folder
!mkdir -p "/content/drive/My Drive/My Folder"

# Gordon

In [None]:
import csv
import itertools

gordon_questions = parse_gordon_questions_file(Path("gordon_questions.jsonl"))

# Create (or overwrite) the Gordon results CSV and keep it open for the whole sweep.
# Re-opening the file in 'w' mode for every row would truncate it each time and leave
# only the last row written, without the header.
with open('/content/drive/My Drive/My Folder/gordon_results_pythia_1B.csv', 'w', newline='') as csvfile:
    csv_writer = csv.writer(csvfile)

    # Write the header row to the CSV file
    csv_writer.writerow(['group', 'size', 'step', 'success_rate'])

    x = {}
    for group_tuple in [("subject_extracted", "definite_description"), ("object_extracted", "definite_description"), ("subject_extracted", "indexical_pronoun"), ("object_extracted", "indexical_pronoun")]:
        question_group = f"{group_tuple[0]}_{group_tuple[1]}"
        print(f"(*) Testing question group {question_group}")
        print('\t(*) Questions: ', len(gordon_questions[group_tuple[0]]), len(gordon_questions[group_tuple[1]]))

        # Keep only the questions that belong to both conditions of this group
        questions = list(set(gordon_questions[group_tuple[0]]).intersection(gordon_questions[group_tuple[1]]))
        print('\t(*) Questions after intersection: ', len(questions))

        x[question_group] = questions

        # Rebuilt for every group, so after the loop it holds the last group's rates only
        rates_per_size = {}
        for size in model_size_list:
            print(f"(*) Testing size {size}")
            success_rates = test_prep_score(questions, size, checkpoint_step_list)
            rates_per_size[size] = success_rates
            # One CSV row per evaluated checkpoint
            for step in checkpoint_step_list:
                csv_writer.writerow([question_group, size, step, success_rates[step]])
            # Push finished rows to disk so an interrupted run keeps its partial results
            csvfile.flush()

        # plot_success_rates(rates_per_size)

In [None]:
# Plots whatever `rates_per_size` currently holds. The Gordon cell rebuilds that dict for
# every question group, so after it runs only the last group's rates are plotted here.
plot_success_rates(rates_per_size)

# Naama

In [None]:
import csv
import itertools

naama_questions = parse_naama_questions_file(Path("naama_questions.jsonl"))

# Results go to 'naama_output.csv' (created, or overwritten if it already exists);
# the file stays open while every group / size / step combination is evaluated.
with open('naama_output.csv', 'w', newline='') as csvfile:
    csv_writer = csv.writer(csvfile)

    # Header first, then one row per (group, size, step)
    csv_writer.writerow(['group', 'size', 'step', 'success_rate'])

    x = {}
    for group_tuple in [("S-V", "animate"), ("S-V", "inanimate"), ("F-G", "animate"), ("F-G", "inanimate")]:
        question_group = f"{group_tuple[0]}_{group_tuple[1]}"
        print(f"(*) Testing question group {question_group}")

        dependency_questions = naama_questions[group_tuple[0]]
        animacy_questions = naama_questions[group_tuple[1]]
        print('\t(*) Questions: ', len(dependency_questions), len(animacy_questions))

        # Keep only the pairs present in both the dependency bucket and the animacy bucket
        questions = list(set(dependency_questions).intersection(animacy_questions))
        print('\t(*) Questions after intersection: ', len(questions))

        x[question_group] = questions

        # Rebuilt per group: maps model size -> {step: success rate}
        rates_per_size = {}
        for size in model_size_list:
            print(f"(*) Testing size {size}")
            success_rates = test_prep_score(questions, size, checkpoint_step_list)
            rates_per_size[size] = success_rates
            print(success_rates)

            # Persist this size's rate for every evaluated checkpoint
            for step in checkpoint_step_list:
                csv_writer.writerow([question_group, size, step, success_rates[step]])

        # plot_success_rates(rates_per_size)