# Generate training data for fine-tuning GPT-3.5


## Imports


In [1]:
import csv
from pathlib import Path
from typing import List, Dict, Any
import random
import json

## Constants


In [2]:
# Destination of the generated JSON; passed to save_train_samples at the end
# of the script.
out_path = Path("out.json")
# Tab-separated input file; read as UTF-8 by load_tsv.
in_path = Path("training_utf-8.tsv")
# Number of records drawn at random from each target essay set.
num_samples_per_set = 50
# Prompt text attached to every sample. The list is indexed by
# (essay_set - 1), so entry 0 belongs to essay set 1 and entry 1 to set 2.
# NOTE(review): "vies" in the second prompt looks like a typo for "views";
# left untouched because the string is emitted verbatim into the training
# data -- confirm against the source prompt before changing it.
essay_prompts = [
    "Write a letter to your local newspaper in which you state your opinion on the effects computers have on people. Persuade the readers to agree with you.",
    "Write a persuasive essay to a newspaper reflecting your vies on censorship in libraries. Do you believe that certain materials, such as books, music, movies, magazines, etc., should be removed from the shelves if they are found offensive? Support your position with convincing arguments from your own experience, observations, and/or reading.",
]
# Scoring rubric attached to every sample, indexed by (essay_set - 1) in the
# same way as essay_prompts.
# NOTE(review): the second entry is still the placeholder "TODO", so every
# sample built for essay set 2 carries "TODO" as its rubric -- fill in the
# real rubric before using the output for training.
essay_rubrics = [
    "6 points: A well-developed response that takes a clear and thoughtful position and provides persuasive support. Typically has moderately well elaborated reasons with mostly specific details, exhibits generally strong organization, may be moderately fluent with transitional language throughout, and may show a consistent awareness of audience.\n5 points: A developed response that takes a clear position and provides reasonably persuasive support. Typically has moderately well elaborated reasons with mostly specific details, exhibits generally strong organization, may be moderately fluent with transitional language throughout, and may show a consistent awareness of audience.\n4 points: A somewhat-developed response that takes a position and provides adequate support. Typically has adequately elaborated reasons with a mix of general and specific details, shows satisfactory organization, may be somewhat fluent with some transitional language, and shows adequate awareness of audience.\n3 points: A minimally-developed response that may take a position, but with inadequate support and details. Typically has reasons with minimal elaboration and more general than specific details, shows some organization, may be awkward in parts with few transitions, and shows some awareness of audience.\n2 points: An under-developed response that may or may not take a position. Typically contains only general reasons with un-elaborated and/or list-like details, shows little or no evidence of organization, may be awkward and confused or simplistic, and may show little awareness of audience.\n1 points: An undeveloped response that may take a position but offers no more than very minimal support. Typically contains few or vague details, is awkward and fragmented, may be difficult to read and understand, is awkward and fragmented, and may show no awareness of audience.",
    "TODO",
]
# Essay-set numbers (1-based, as stored in the "essay_set" column) whose rows
# are turned into training samples.
target_essay_sets = [1, 2]

## Load Raw TSV


In [3]:
def load_tsv(file_path: Path) -> List[Dict[str, Any]]:
    """Read a tab-separated file and return its rows as dictionaries.

    The first line of the file supplies the column names; every following
    line becomes one dict keyed by those names.

    Args:
        file_path: Location of the UTF-8 encoded TSV file.

    Returns:
        One dict per data row, in file order.
    """
    # newline="" leaves line-ending handling to the csv module.
    with file_path.open(mode="r", newline="", encoding="utf-8") as handle:
        rows = list(csv.DictReader(handle, delimiter="\t"))
    return rows


# Parse the input TSV once and keep all of its rows in memory as dicts.
training_data_list = load_tsv(in_path)

In [4]:
# Bucket the rows of the target essay sets into per-set training records.
# training_samples[i] collects the records for essay set i + 1, so a bucket's
# position lines up with its entry in essay_prompts / essay_rubrics.
training_samples = ([], [])
for training_data in training_data_list:
    essay_set = int(training_data["essay_set"])
    if essay_set not in target_essay_sets:
        continue

    score = training_data["rater1_domain1"]
    # csv.DictReader yields "" for an empty cell and None only when the row
    # is shorter than the header, so a bare `is not None` test lets unscored
    # rows through. Reject both forms of "missing".
    if not score:
        continue

    index = essay_set - 1
    training_samples[index].append(
        {
            "prompt": essay_prompts[index],
            "rubric": essay_rubrics[index],
            "essay": training_data["essay"],
            "score": score,
        }
    )

In [5]:
# Draw up to num_samples_per_set records, without replacement, from each
# target essay set and collect them in one flat list (sets stay grouped in
# target_essay_sets order).
# NOTE(review): the random module is not seeded in this cell, so the selection
# presumably differs between runs -- seed it if reproducibility matters.
sampled_training_samples = []
for target_essay_set in target_essay_sets:
    candidates = training_samples[target_essay_set - 1]
    # random.sample raises ValueError when asked for more items than the
    # population holds, so cap the request at the size of the pool.
    sample_count = min(num_samples_per_set, len(candidates))
    sampled_training_samples.extend(random.sample(candidates, sample_count))

In [6]:
def save_train_samples(sampled_training_samples: List[Dict[str, Any]], out_path: Path):
    """Serialize the sampled training records to a JSON file.

    Args:
        sampled_training_samples: Records to store; written as one JSON array.
        out_path: File to create or overwrite, encoded as UTF-8.
    """
    with open(out_path, mode="w", encoding="utf-8") as sink:
        json.dump(obj=sampled_training_samples, fp=sink)


# Write the sampled records to out_path as JSON.
save_train_samples(sampled_training_samples, out_path)