In [1]:
# Relative path to the raw submissions file; the loading cell reads it line by
# line and parses every line as a standalone JSON document.
file_path = '../01_data/aita/AmItheAsshole_submissions'

In [2]:
import json

# Parsed records from the dump: one JSON object per non-blank line of the file.
json_objects = []

# The encoding is pinned to UTF-8 (the interchange encoding for JSON) so the
# read does not depend on the platform's default locale encoding.
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        # Skip blank lines (e.g. a trailing empty line at the end of the file),
        # which json.loads would otherwise reject with a JSONDecodeError.
        if not line.strip():
            continue
        json_objects.append(json.loads(line))

In [13]:
# Keep posts with a score above 10 whose body is longer than 25 characters and
# does not contain 'http://www.'; each kept post is trimmed to four fields.
filtered_stories = [
    {
        'id': post['id'],
        'title': post['title'],
        'selftext': post['selftext'],
        'score': post['score'],
    }
    for post in json_objects
    if post['score'] > 10
    and 'http://www.' not in post['selftext']
    and len(post['selftext']) > 25
]

# Highest score first; the sort is stable, so equal scores keep their file order
filtered_stories.sort(key=lambda story: story['score'], reverse=True)

In [4]:
from dotenv import load_dotenv
import os
from openai import OpenAI

# Load environment variables from .env file
load_dotenv()

client = OpenAI(
    # NOTE(review): the key is read from OPENAI_APIKEY, which is not the
    # OPENAI_API_KEY variable the OpenAI client falls back to on its own, so
    # this argument is not redundant. os.environ.get returns None when the
    # variable is unset -- confirm the .env file defines OPENAI_APIKEY.
    api_key=os.environ.get("OPENAI_APIKEY"),
)

# Number of highest-scoring stories that get an instruction generated and are
# written to the output file.
top_x_stories = 1000

In [5]:
def generate_instruction(story, model="gpt-4o", max_tokens=100):
    """Ask the chat model for a short instruction that could produce `story`.

    The story is embedded in a fixed prompt and sent through the module-level
    `client` as a single chat completion request.

    Args:
        story: Text of the post to write an instruction for.
        model: Chat model name passed to the API. Defaults to "gpt-4o".
        max_tokens: Upper bound on the length of the generated reply.
            Defaults to 100.

    Returns:
        The reply text with surrounding whitespace removed, or an empty string
        when the reply carries no text content.

    Raises:
        Whatever `client.chat.completions.create` raises; errors from the API
        call are not caught here.
    """
    # NOTE: the prompt text (including its indentation) is sent to the model
    # verbatim, so it is intentionally left exactly as written.
    prompt = f"""
    I am creating a fine-tuning dataset for a large language model (LLM) in the Alpaca format. 
    Each story in the dataset needs a clear and concise instruction for generating a similar story.

    The instruction should guide the LLM to write a story in the style of an "Am I the Asshole" Reddit post. 
    The instruction should include a few details and themes from the story but be brief and to the point.

    Please provide only the instruction in your response.

    Below is the story. Based on this story, create an appropriate instruction.

    STORY: {story}
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a assistant helping to create a high quality dataset for the user."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=max_tokens
    )
    # The message content may be None (no text in the reply); treat that as an
    # empty instruction instead of crashing on None.strip().
    content = response.choices[0].message.content
    return (content or "").strip()

In [6]:
# Attach a generated instruction to each of the top stories. Every iteration
# makes a network call to the OpenAI API, so stories that already carry a
# non-empty instruction are skipped: re-running this cell after an interruption
# resumes where it stopped instead of repeating every request.
for reddit_post in filtered_stories[:top_x_stories]:
    if reddit_post.get('instruction'):
        continue
    reddit_post['instruction'] = generate_instruction(reddit_post['selftext'])

In [7]:
# Write the top stories (the same slice the instruction loop processed) to disk
# as JSON indented by four spaces.
output_file_path = 'post_instruction_data.json'
stories_to_save = filtered_stories[:top_x_stories]
with open(output_file_path, 'w') as json_out:
    json.dump(stories_to_save, json_out, indent=4)

print(f"Modified Reddit posts saved to {output_file_path}")

Modified Reddit posts saved to post_instruction_data.json
