## Method 2: Prelabelled Data

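This method builds prompt/response pairs from a speaker-labelled CNN transcript (https://transcripts.cnn.com/show/lkl/date/2010-03-26/segment/01): a line spoken by another speaker becomes the prompt, and the Snoop Dogg line that follows it becomes the response.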

In [6]:
import re

# Relative path to the raw, speaker-labelled transcript.
transcript_file_path = '../data/transcript.txt'

# Read the whole transcript into a single string, decoded as UTF-8.
with open(transcript_file_path, encoding='utf-8', mode='r') as file:
    transcript_text = file.read()

# Prompt detector: a line counts as a prompt when it opens with four or more
# consecutive capital letters (e.g. "KING:" would match) and is NOT one of
# Snoop Dogg's own lines.
prompt_regex = re.compile(r'^(?!SNOOP DOGG:)[A-Z]{4,}')

# Keep only lines that contain something other than whitespace.
lines = [line for line in transcript_text.split('\n') if line.strip()]

conversations = []        # (prompt, response) tuples, in transcript order
current_prompt = ""       # most recent prompt still waiting for a Snoop Dogg reply
current_response = ""     # unlabelled text seen since that prompt (used only by the final flush)
collect_response = False  # True while a prompt is waiting for a reply

for line in lines:
    if line.startswith("SNOOP DOGG:"):
        if collect_response:
            # Everything after the speaker label is the response. Splitting on the
            # bare colon cannot fail here because the line is known to start with
            # the label; split(": ", 1)[1] raised IndexError whenever the label
            # was not followed by a space (e.g. a bare "SNOOP DOGG:" line).
            current_response = line.split(":", 1)[1].lstrip()
            if current_response:
                # never emit a pair with an empty completion
                conversations.append((current_prompt, current_response))
            # reset for the next cycle
            current_prompt = ""
            current_response = ""
            collect_response = False
    elif prompt_regex.match(line):
        # A new prompt replaces any earlier prompt that never got a reply.
        # Strip the speaker label when there is one.
        prompt_text = line.split(": ", 1)[1] if ": " in line else line
        current_prompt = prompt_text
        # Drop unlabelled text gathered for the previous, unanswered prompt so it
        # cannot leak into this prompt's trailing flush.
        current_response = ""
        collect_response = True
    elif collect_response:
        # Unlabelled line while a prompt is pending: append it once, separated by
        # a single space. (The previous form `x += x + " " + line` duplicated the
        # text accumulated so far on every such line.)
        current_response += (" " if current_response else "") + line

# Flush a final prompt that was followed only by unlabelled text.
# NOTE(review): this pairs the prompt with text that was not attributed to
# Snoop Dogg — confirm that this is intended for the fine-tuning set.
if current_prompt and current_response:
    conversations.append((current_prompt, current_response))

# Sanity check: show the first five extracted pairs, then the total pair count.
sample_pairs = conversations[:5]
for prompt, response in sample_pairs:
    print(f"Prompt: {prompt}\nResponse: {response}\n")

total_pairs = len(conversations)
print(total_pairs)


Prompt: And he's talking Tiger.
Response: Nobody's perfect and everybody makes mistakes.

Prompt: His drug of choice.
Response: I know it should be legalized. Legalize it and I (INAUDIBLE).

Prompt: What about Susan Boyle?
Response: She's a great artist. I'm looking forward to going to the studio with her.

Prompt: Relax.
Response: Wait.

Prompt: I'm a G now, right?
Response: You a double O-G.

231


In [8]:
import json

# Destination for the fine-tuning data: one JSON object per line (JSONL).
jsonl_file_path = '../data/transcript_for_fine_tuning.jsonl'

with open(jsonl_file_path, mode='w', encoding='utf-8') as jsonl_file:
    for prompt, response in conversations:
        # The newline appended to the prompt separates it from the model's response.
        fine_tuning_entry = {"prompt": prompt + "\n", "completion": response}
        serialized_entry = json.dumps(fine_tuning_entry)
        jsonl_file.write(serialized_entry + "\n")

In [None]:
# (scratch cell cleared: it held only a bare `s`, which raises NameError on Restart & Run All)