# Generate training data for fine-tuning GPT-3.5


## Imports


In [13]:
import csv
from pathlib import Path
from typing import List, Dict, Any
import random
import json

## Constants


In [14]:
# Where the raw essays are read from and where the sampled examples go.
in_path = Path("training_utf-8.tsv")
out_path = Path("fine-tune-annotated.json")

# How many essays are drawn at random from each essay set.
num_samples_per_set = 50

# Writing prompts, one per essay set; the entry for essay set N sits at
# index N - 1.
# NOTE(review): "vies" in the second prompt is reproduced as-is; it may be
# verbatim from the source prompt text -- confirm before "correcting" it.
essay_prompts = [
    (
        "Write a letter to your local newspaper in which you state your opinion on the effects computers have on people. "
        "Persuade the readers to agree with you."
    ),
    (
        "Write a persuasive essay to a newspaper reflecting your vies on censorship in libraries. "
        "Do you believe that certain materials, such as books, music, movies, magazines, etc., should be removed from the shelves if they are found offensive? "
        "Support your position with convincing arguments from your own experience, observations, and/or reading."
    ),
]
# Scoring rubrics, parallel to the essay prompts: the rubric for essay set N
# sits at index N - 1.
# NOTE(review): in the first rubric the 6-point and 5-point descriptors share
# the same "Typically has moderately well elaborated ..." sentence, and the
# last line reads "1 points" and repeats "is awkward and fragmented". The text
# is kept verbatim here -- confirm against the source rubric before editing.
essay_rubrics = [
    "6 points: A well-developed response that takes a clear and thoughtful position and provides persuasive support. Typically has moderately well elaborated reasons with mostly specific details, exhibits generally strong organization, may be moderately fluent with transitional language throughout, and may show a consistent awareness of audience.\n5 points: A developed response that takes a clear position and provides reasonably persuasive support. Typically has moderately well elaborated reasons with mostly specific details, exhibits generally strong organization, may be moderately fluent with transitional language throughout, and may show a consistent awareness of audience.\n4 points: A somewhat-developed response that takes a position and provides adequate support. Typically has adequately elaborated reasons with a mix of general and specific details, shows satisfactory organization, may be somewhat fluent with some transitional language, and shows adequate awareness of audience.\n3 points: A minimally-developed response that may take a position, but with inadequate support and details. Typically has reasons with minimal elaboration and more general than specific details, shows some organization, may be awkward in parts with few transitions, and shows some awareness of audience.\n2 points: An under-developed response that may or may not take a position. Typically contains only general reasons with un-elaborated and/or list-like details, shows little or no evidence of organization, may be awkward and confused or simplistic, and may show little awareness of audience.\n1 points: An undeveloped response that may take a position but offers no more than very minimal support. Typically contains few or vague details, is awkward and fragmented, may be difficult to read and understand, is awkward and fragmented, and may show no awareness of audience.",
    # A quoted string cannot contain a raw line break, so this rubric is
    # written as two adjacent literals that Python joins into one string.
    (
        "Essays are scored based on the following properties:\n\nIdeas and Content:\nDoes the writing sample fully accomplish the task (e.g., support an opinion, summarize, tell a story, or write an article)? Does it\n-present a unifying theme or main idea without going off on tangents?\n-stay completely focused on topic and task?\nDoes the writing sample include thorough, relevant, and complete ideas? Does it\n-include in-depth information and exceptional supporting details that are fully developed?\n-fully explore many facets of the topic?\n\nOrganization:\nAre the ideas in the writing sample organized logically? Does the writing\n-present a meaningful, cohesive whole with a beginning, a middle, and an end (i.e., include an inviting introduction and a strong conclusion)?\n-progress in an order that enhances meaning?\n-include smooth transitions between ideas, sentences, and paragraphs to enhance meaning of text (i.e., have a clear connection of ideas and use topic sentences)?\n\nStyle:\nDoes the writing sample exhibit exceptional word usage? Does it\n-include vocabulary to make explanations detailed and precise, descriptions rich, and actions clear and vivid (e.g., varied word choices, action words, appropriate modifiers, sensory details)?\n-demonstrate control of a challenging vocabulary?\nDoes the writing sample demonstrate exceptional writing technique?\n-Is the writing exceptionally fluent?\n-Does it include varied sentence patterns, including complex sentences?\n-Does it demonstrate use of writer's techniques (e.g., literary conventions such as imagery and dialogue and/or literary genres such as humor and suspense)?\n\nVoice:\nDoes the writing sample demonstrate effective adjustment of language and tone to task and reader? "
        "Does it\n-exhibit appropriate register (e.g., formal, personal, or dialect) to suit task?\n-demonstrate a strong sense of audience?\n-exhibit an original perspective (e.g., authoritative, lively, and/or exciting)?\n\nThe assignable scores are:\n\n6 points: A Score Point 6 paper is rare. It fully accomplishes the task in a thorough and insightful manner and has a distinctive quality that sets it apart as an outstanding performance.\n5 points: A Score Point 5 paper represents a solid performance. It fully accomplishes the task, but lacks the overall level of sophistication and consistency of a Score Point 6 paper.\n4 points: A Score Point 4 paper represents a good performance. It accomplishes the task, but generally needs to exhibit more development, better organization, or a more sophisticated writing style to receive a higher score.\n3 points: A Score Point 3 paper represents a performance that minimally accomplishes the task. Some elements of development, organization, and writing style are weak.\n2 points: A Score Point 2 paper represents a performance that only partially accomplishes the task. Some responses may exhibit difficulty maintaining a focus. Others may be too brief to provide sufficient development of the topic or evidence of adequate organizational or writing style.\n1 point: A Score Point 1 paper represents a performance that fails to accomplish the task. It exhibits considerable difficulty in areas of development, organization, and writing style. The writing is generally either very brief or rambling and repetitive, sometimes resulting in a response that may be difficult to read or comprehend."
    ),
]

# Essay-set ids (values of the "essay_set" column) to draw samples from; the
# code below subtracts 1 from an id to index the prompt and rubric lists.
target_essay_sets = [1, 2]

## Load Raw TSV


In [15]:
def load_tsv(file_path: Path) -> List[Dict[str, Any]]:
    """Read a tab-separated file and return its rows as dictionaries.

    The first line of the file supplies the keys; each following line
    becomes one dictionary mapping those keys to the cell text.

    Args:
        file_path: Location of the UTF-8 encoded, tab-delimited file.

    Returns:
        Every data row, in file order.
    """
    # newline="" leaves line-ending handling to the csv module.
    with file_path.open(mode="r", newline="", encoding="utf-8") as handle:
        return list(csv.DictReader(handle, delimiter="\t"))


# Parse the raw TSV at in_path into a list of per-row dictionaries.
training_data_list = load_tsv(in_path)

In [16]:
# Bucket the usable rows by essay set: training_samples[N - 1] collects the
# examples for essay set N, each paired with that set's prompt and rubric.
# One bucket is created per known prompt so the index below is always valid
# for any essay set that has a prompt.
training_samples = tuple([] for _ in essay_prompts)
for training_data in training_data_list:
    essay_set = int(training_data["essay_set"])
    score = training_data["rater1_domain1"]
    # csv.DictReader yields "" (not None) for an empty cell, so an
    # `is not None` check would let unscored essays through; require a
    # non-blank score instead. None still occurs for rows that are too short.
    if essay_set not in target_essay_sets or not (score and score.strip()):
        continue
    index = essay_set - 1
    training_samples[index].append(
        {
            "prompt": essay_prompts[index],
            "rubric": essay_rubrics[index],
            "essay": training_data["essay"],
            "score": score,
        }
    )

In [17]:
# Draw num_samples_per_set random examples from each target essay set and
# collect them in one flat list, set by set.
sampled_training_samples = []
for target_essay_set in target_essay_sets:
    pool = training_samples[target_essay_set - 1]
    # random.sample raises ValueError when the pool holds fewer entries
    # than requested.
    sampled_training_samples.extend(random.sample(pool, num_samples_per_set))

In [18]:
def save_train_samples(sampled_training_samples: List[Dict[str, Any]], out_path: Path):
    """Serialize the sampled examples to a JSON file.

    Args:
        sampled_training_samples: Example dictionaries to store.
        out_path: Destination file; it is created or overwritten and
            written as UTF-8 text.
    """
    with out_path.open(mode="w", encoding="utf-8") as sink:
        json.dump(sampled_training_samples, sink)


# Write the sampled examples to out_path as JSON.
save_train_samples(sampled_training_samples, out_path)