In [1]:
import os
import openai
import pandas as pd
import json

In [2]:
# Load the Reddit answers dataset (semicolon-delimited CSV).
df_answers = pd.read_csv(
    "/workspaces/Duke-coursera/02-fine-tuning-eval-llms/reddit_answers_big.csv",
    sep=";",
)

# Show the first rows as a quick sanity check on the parse.
df_answers.head()

Unnamed: 0.1,Unnamed: 0,q_id,text,votes
0,0,hvbvpz,Two pet ducks. You may be tempted to go for on...,2359.0
1,1,hvbvpz,Nice try Jeff Bezos,764.0
2,2,hvbvpz,A curved shower rod. Seriously. $10 for a tens...,1525.0
3,3,hvbvpz,Another monitor. Your productivity will increa...,1227.0
4,4,hvbvpz,A nasal irrigation kit - either the electronic...,659.0


In [3]:
# Pick the single highest-voted answer for every question.
# Rows with a missing vote count are excluded before ranking: idxmax() cannot
# produce a valid row label for a question whose answers all lack votes, and
# the .loc lookup below would then fail on that label.
# dropna() keeps the original index labels, so the result still addresses
# rows of df_answers.
df_top_votes = (
    df_answers
    .dropna(subset=["votes"])
    .groupby("q_id")["votes"]
    .idxmax()
)  # Series: q_id -> index label of that question's top-voted answer

# Fetch the winning rows and rename the columns for readability.
# rename() returns a new frame, so no in-place mutation is needed.
df_top_answers = df_answers.loc[df_top_votes].rename(
    columns={'text': 'answer', 'q_id': 'id', 'votes': 'answer_votes'}
)

In [4]:
# Load the Reddit questions (semicolon-delimited CSV) and rename the generic
# 'text' / 'votes' columns to question-specific names for readability.
df_questions = pd.read_csv(
    '/workspaces/Duke-coursera/02-fine-tuning-eval-llms/reddit_questions.csv',
    sep=';',
).rename(columns={'text': 'question', 'votes': 'question_votes'})

In [5]:
# Preview the first rows of the questions frame after the column rename.
df_questions.head()

Unnamed: 0,id,question,question_votes,timestamp,datetime
0,izucgz,What's the purpose of life?,8,1601076000.0,Fri Sep 25 23:13:31 2020 UTC
1,9c784/,"I've tried to quit smoking, this is my seventh...",11,1250712000.0,Wed Aug 19 19:58:54 2009 UTC
2,iylxwl,"For those who have a slave master last name, w...",0,1600904000.0,Wed Sep 23 23:35:15 2020 UTC
3,gmmlj4,How do you think humans will become extinct?,21998,1589887000.0,Tue May 19 11:18:05 2020 UTC
4,ishb7v,What is a movie So Disturbing you couldn't be ...,13,1600074000.0,Mon Sep 14 08:53:53 2020 UTC


In [6]:
# Join every question with its top-voted answer on the shared 'id' column.
# how='inner' is the merge default, spelled out here: only ids present in
# both frames are kept.
merged_df = pd.merge(df_questions, df_top_answers, how='inner', on='id')

merged_df.head()

Unnamed: 0.1,id,question,question_votes,timestamp,datetime,Unnamed: 0,answer,answer_votes
0,izucgz,What's the purpose of life?,8,1601076000.0,Fri Sep 25 23:13:31 2020 UTC,1254710,Breed and die.,5.0
1,9c784/,"I've tried to quit smoking, this is my seventh...",11,1250712000.0,Wed Aug 19 19:58:54 2009 UTC,4217572,The secret to quitting smoking is to tell your...,4.0
2,iylxwl,"For those who have a slave master last name, w...",0,1600904000.0,Wed Sep 23 23:35:15 2020 UTC,5464942,No. My last name sounds badass.,4.0
3,gmmlj4,How do you think humans will become extinct?,21998,1589887000.0,Tue May 19 11:18:05 2020 UTC,5465281,"Knowing us, it'll be the hard way.",21658.0
4,ishb7v,What is a movie So Disturbing you couldn't be ...,13,1600074000.0,Mon Sep 14 08:53:53 2020 UTC,2374102,A Serbian Film (2010).,8.0


In [7]:
# Columns carried forward, in the order they should appear.
columns_to_keep = [
    'id',
    'question',
    'answer',
    'question_votes',
    'answer_votes',
]

# Keep only those columns. NOTE(review): reindex does not raise on a name
# that is absent from merged_df; it adds an all-NaN column instead.
df_selected = merged_df.reindex(columns_to_keep, axis="columns")

In [8]:
# Summary statistics for the vote columns of the selected frame.
df_selected.describe()

Unnamed: 0,question_votes,answer_votes
count,181311.0,181311.0
mean,1186.36979,1011.290567
std,6573.156935,4889.170099
min,0.0,-13.0
25%,4.0,4.0
50%,7.0,6.0
75%,28.0,30.0
max,221856.0,99398.0


In [9]:
# Downsize the data for fine-tuning: order by answer votes (highest first)
# and keep the first 1000 rows.
df_for_finetuning = (
    df_selected
    .sort_values(by='answer_votes', ascending=False)
    .iloc[:1000]
)
df_for_finetuning.head()

Unnamed: 0,id,question,answer,question_votes,answer_votes
75274,fkzaca,What is something that has aged well?,The word cool,66093,99398.0
167081,a0a4cd,What's the most amazing thing about the universe?,"It must be true that either It didn't exist, ...",81862,86042.0
140939,d0jjc2,The 2010's decade will be over in 4 months. Wh...,The social media explosion,113254,85936.0
128868,aqf3bi,"You are offered $1,000,000 USD if you can hide...","Easy, ask the CIA to hold them...those two don...",81908,85693.0
18363,bvdaci,What's classy if you're rich but trashy if you...,The most expensive thing you own is a really o...,66102,85568.0


In [11]:
# Fine-tuning format: pull out the two text columns that form each
# question/answer training pair.
question = df_for_finetuning['question']
answers = df_for_finetuning['answer']

In [14]:
# Prepare the chat fine-tuning examples: one entry per (question, answer)
# pair, made of a fixed system prompt, the question as the user turn and the
# answer as the assistant turn.
jsonl_data = [
    {
        "messages": [
            {"role": "system", "content": "You are a factual chatbot and reddit expert who likes to answer with bullets."},
            {"role": "user", "content": user_text},
            {"role": "assistant", "content": assistant_text},
        ]
    }
    for user_text, assistant_text in zip(question, answers)
]

In [15]:
# Preview only the first few examples; echoing the whole list floods the
# notebook output without adding information.
jsonl_data[:3]

[{'messages': [{'role': 'system',
    'content': 'You are a factual chatbot and reddit expert who likes to answer with bullets.'},
   {'role': 'user', 'content': 'What is something that has aged well?'},
   {'role': 'assistant', 'content': 'The word cool'}]},
 {'messages': [{'role': 'system',
    'content': 'You are a factual chatbot and reddit expert who likes to answer with bullets.'},
   {'role': 'user',
    'content': "What's the most amazing thing about the universe?"},
   {'role': 'assistant',
    'content': "It must be true that either  It didn't exist, then it did  or  It has always existed"}]},
 {'messages': [{'role': 'system',
    'content': 'You are a factual chatbot and reddit expert who likes to answer with bullets.'},
   {'role': 'user',
    'content': "The 2010's decade will be over in 4 months. What do you think people will remember this decade for?"},
   {'role': 'assistant', 'content': 'The social media explosion'}]},
 {'messages': [{'role': 'system',
    'content': '

In [17]:
# Serialise the examples as JSON Lines: one JSON object per line.
with open("training_data.jsonl", "w") as out_file:
    out_file.writelines(json.dumps(example) + "\n" for example in jsonl_data)

In [19]:
from collections import defaultdict

data_path = 'training_data.jsonl'

# Format error checks: counts of each kind of problem found, keyed by name.
format_errors = defaultdict(int)

# Load the dataset back from disk so the checks run against the file that is
# actually uploaded. Blank lines are skipped; they carry no example and would
# make json.loads fail.
with open(data_path, 'r', encoding='utf-8') as f:
    dataset = [json.loads(line) for line in f if line.strip()]

# Initial dataset stats
print("Num of examples:", len(dataset))

# Validate the examples parsed from the file. Iterating the in-memory
# jsonl_data list here would leave the written file itself unchecked.
for ex in dataset:
    # Every example must be a JSON object.
    if not isinstance(ex, dict):
        format_errors["data_type"] += 1
        continue

    # Every example needs a non-empty "messages" list.
    messages = ex.get("messages", None)
    if not messages:
        format_errors["missing_messages_list"] += 1
        continue

    for message in messages:
        # Both "role" and "content" are required on each message.
        if "role" not in message or "content" not in message:
            format_errors["message_missing_key"] += 1

        # Flag any key outside the accepted set.
        if any(k not in ("role", "content", "name", "function_call", "weight") for k in message):
            format_errors["message_unrecognized_key"] += 1

        # Flag any role outside the accepted set.
        if message.get("role", None) not in ("system", "user", "assistant", "function"):
            format_errors["unrecognized_role"] += 1

        content = message.get("content", None)
        function_call = message.get("function_call", None)

        # Content must be a string; an empty one is tolerated only when the
        # message carries a function_call instead.
        if (not content and not function_call) or not isinstance(content, str):
            format_errors["missing_content"] += 1

    # An example without an assistant turn has no target to learn from.
    if not any(message.get("role", None) == "assistant" for message in messages):
        format_errors["example_missing_assistant_message"] += 1

if format_errors:
    print("Found errors:")
    for k, v in format_errors.items():
        print(f"{k}: {v}")
else:
    print("No errors found")

Num of examples: 1000
No errors found


In [20]:
import os
import openai
from openai import OpenAI

# Read the key from the environment so it never appears in the notebook.
api_key = os.environ.get("OPENAI_API_KEY")

# No key is passed explicitly, so the client resolves its own credentials.
# NOTE(review): presumably it reads OPENAI_API_KEY itself - confirm against
# the SDK documentation.
client = OpenAI()

In [21]:
# Fine-tuning, step 1: upload the training file.
# The file is opened in a `with` block so the handle is closed as soon as the
# upload call returns, instead of staying open for the life of the kernel.
with open('training_data.jsonl', 'rb') as training_file:
    uploaded_file = client.files.create(
        file=training_file,
        purpose='fine-tune',
    )

# Display the upload result; its id is needed to start the fine-tuning job.
uploaded_file

FileObject(id='file-CyMWr1WJMfndH5cg4CoUEq', bytes=490462, created_at=1739570843, filename='training_data.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)

In [30]:
# Start a fine-tuning job for the base model on the uploaded training file
# (the file id is the one returned by the upload call).
client.fine_tuning.jobs.create(
    model='gpt-3.5-turbo',
    training_file='file-CyMWr1WJMfndH5cg4CoUEq',
)

FineTuningJob(id='ftjob-gPORIBOzyXqmf5jPT2yQa2HH', created_at=1739571743, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(batch_size='auto', learning_rate_multiplier='auto', n_epochs='auto'), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-bDcr9bQssllittdNFScDw4We', result_files=[], seed=771836804, status='validating_files', trained_tokens=None, training_file='file-CyMWr1WJMfndH5cg4CoUEq', validation_file=None, estimated_finish=None, integrations=[], method=Method(dpo=None, supervised=MethodSupervised(hyperparameters=MethodSupervisedHyperparameters(batch_size='auto', learning_rate_multiplier='auto', n_epochs='auto')), type='supervised'), user_provided_suffix=None)

In [31]:
# List the fine-tuning jobs visible to this client, to find job ids and statuses.
client.fine_tuning.jobs.list()

SyncCursorPage[FineTuningJob](data=[FineTuningJob(id='ftjob-gPORIBOzyXqmf5jPT2yQa2HH', created_at=1739571743, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(batch_size=2, learning_rate_multiplier=2.0, n_epochs=3), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-bDcr9bQssllittdNFScDw4We', result_files=[], seed=771836804, status='validating_files', trained_tokens=None, training_file='file-CyMWr1WJMfndH5cg4CoUEq', validation_file=None, estimated_finish=None, integrations=[], method=Method(dpo=None, supervised=MethodSupervised(hyperparameters=MethodSupervisedHyperparameters(batch_size=2, learning_rate_multiplier=2.0, n_epochs=3)), type='supervised'), user_provided_suffix=None), FineTuningJob(id='ftjob-fu63pEYpvVbE9yY1DygawVtM', created_at=1739571050, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(batch_size=2, lea

In [None]:
# Retrieve the job that is doing the fine-tuning, to inspect its full details.
client.fine_tuning.jobs.retrieve("ftjob-fu63pEYpvVbE9yY1DygawVtM")

FineTuningJob(id='ftjob-fu63pEYpvVbE9yY1DygawVtM', created_at=1739571050, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(batch_size=2, learning_rate_multiplier=2.0, n_epochs=3), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-bDcr9bQssllittdNFScDw4We', result_files=[], seed=792309427, status='running', trained_tokens=None, training_file='file-CyMWr1WJMfndH5cg4CoUEq', validation_file=None, estimated_finish=1739573366, integrations=[], method=Method(dpo=None, supervised=MethodSupervised(hyperparameters=MethodSupervisedHyperparameters(batch_size=2, learning_rate_multiplier=2.0, n_epochs=3)), type='supervised'), user_provided_suffix=None)

In [34]:
# Check how the fine-tuning job is progressing.

job_id = "ftjob-fu63pEYpvVbE9yY1DygawVtM"
response = client.fine_tuning.jobs.retrieve(job_id)

# Printed in order: the job status (e.g. 'running', 'succeeded' or 'failed'),
# then the fine-tuned model ID, which should be set once the job has completed.
for job_field in (response.status, response.fine_tuned_model):
    print(job_field)

running
None


In [35]:
# If fine-tuning takes too long, look at the job's estimated finish time.
# NOTE(review): the value looks like a Unix timestamp in seconds - confirm.
job_details = client.fine_tuning.jobs.retrieve("ftjob-fu63pEYpvVbE9yY1DygawVtM")
estimated_finish = job_details.estimated_finish
print(estimated_finish)

1739573401


In [39]:
# Use the exact system prompt the training examples were built with, so the
# fine-tuned model is queried under the same instructions it was trained on.
system_prompt = "You are a factual chatbot and reddit expert who likes to answer with bullets."
user_question = "What is the craziest thing you have ever done?"

In [40]:
# Conversation sent to the fine-tuned model: the system prompt followed by
# the user's question.
chat_messages = [
    {'role': "system", "content": system_prompt},
    {'role': "user", "content": user_question},
]

response = client.chat.completions.create(
    model="ft:gpt-3.5-turbo-0125:dashai::B0yoHUTq",
    messages=chat_messages,
)

# Show the text of the first returned choice.
print(response.choices[0].message.content)

Had a chat with my dad, we've lived totally different lives and we've had a tough time getting along because of that. But since it was his dna that started mine i decided to give it a try. Turns out, my dad is an okay dude. For the one and only father-son conversation i'll ever have the stick is off my back and 7 kilos drifted off my shoulders that night. Never would've thought something like that would happen but i'm so glad it did.
