### Syntax for creating valid data for fine-tuning as mentioned in https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset

        {"messages": [{"role": "system", "content": "Marv is a factual chatbot that is also sarcastic."}, {"role": "user", "content": "What's the capital of France?"}, {"role": "assistant", "content": "Paris, as if everyone doesn't know that already."}]}


In [51]:
!pip3 install -q openai==0.28

You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m


In [None]:
import json
import tiktoken # for token counting
import numpy as np
from collections import defaultdict
import os
import pandas as pd
import json
import openai

from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage, SystemMessage
from langchain.prompts import (
    ChatPromptTemplate,
    FewShotChatMessagePromptTemplate,
)
from langchain.embeddings import OpenAIEmbeddings
from langchain.prompts import SemanticSimilarityExampleSelector
from langchain.vectorstores import Chroma

from utils import read_notebook, extract_cells, format_for_gpt

## Creating the data

In [None]:
## Assumes: Filename of jupyter notebook is the id column in grading csv sheet
def create_training_data(folder_path,file_names,df,content):
    code_dict = {}
    for file in file_names:
        key = file[:-6]
        full_path = os.path.join(folder_path, file)

        notebook = read_notebook(full_path)
        extracted_cells = extract_cells(notebook)
        formatted_data = format_for_gpt(extracted_cells)


        code_dict[key] = [{"role": "system", "content":content }]
        data = df[df['Username'] == key].reset_index(drop=True)
        grade = data['grade'].iloc[0] if not data['grade'].empty else None
        feedback = data['Feedback to Learner'].iloc[0] if not data['Feedback to Learner'].empty else None
        code_dict[key].append({"role":"user","content": formatted_data})
        code_dict[key].append({"role": "assistant", "content": '{{"Grade": {}, "Feedback": {}}}'.format(str(grade), feedback)})


    return code_dict


In [None]:

folder_path = "path to your already graded assignment samples"
file_names,files = [],[]
for file in os.listdir(folder_path):
    full_path = os.path.join(folder_path, file)
    if os.path.isfile(full_path) and file.endswith('.ipynb') and not file.startswith('.'):
        file_names.append(file)
        
grading_sheet = "path to your grading sheet "

human_grades_and_feedback_df = pd.read_csv(grading_sheet)

In [None]:
Instruction = "parse rubic or instruction for the assognment in strings"

system_role = "You are a helpful assistant that evaluate programming assignments and check all the instructions are \
followed and grade it from 0-100 and provide Feedback what went wrong.Provide Reasoning on why you gave such grade." +  \
Instruction

data = create_training_data(folder_path,file_names,human_grades_and_feedback_df,system_role)

In [None]:
training_data = list(data.values())

In [None]:
original_json = json.dumps(training_data, separators=(',', ':'))
jsonData = json.loads(original_json)
transformed_json = [{"messages": lst} for lst in jsonData]

In [None]:
# writting training data as .jsonl format
with open('training_data.jsonl', 'w') as file:
    for entry in transformed_json:
        json_line = json.dumps(entry)  # Converting dictionary to JSON string
        file.write(json_line + '\n')   # Writing JSON string to file with newline

## Validating the data 

In [2]:
data_path = "training_data.jsonl"

# Load the dataset
with open(data_path, 'r', encoding='utf-8') as f:
    dataset = [json.loads(line) for line in f]

# Initial dataset stats
print("Num examples:", len(dataset))
print("First example:")
for message in dataset[0]["messages"]:
    print(message)

Num examples: 45
First example:
{'role': 'system', 'content': 'You are a helpful assistant that evaluate programming assignments and check all the instructions are followed and grade it from 0-100 and provide Feedback what went wrong.Provide Reasoning on why you gave such grade.Write three Python scripts using mrjob that tell me:Average number of words in each review (define “words” however you like but be explicit about it) Count of reviews by year-month (eg “2021-09”) Average rating of any review marked ”cool” (eg where cool != 0)}'}
{'role': 'user', 'content': '{"cells":[{"type":"code","content":"import pandas as pd\\ninfo = pd.read_csv(\'yelp.csv\')\\ninfo"},{"type":"code","content":"info[\\"Number of Words\\"] = info[\\"text\\"].apply(lambda n: len(n.split()))\\n(info.head())"},{"type":"code","content":"info1=info[\'Number of Words\'].mean()\\nprint(info1)"},{"type":"markdown","content":"Average number of words in each review (define \\u201cwords\\u201d however you like but be exp

In [3]:
# Format error checks
format_errors = defaultdict(int)

for ex in dataset:
    if not isinstance(ex, dict):
        format_errors["data_type"] += 1
        continue
        
    messages = ex.get("messages", None)
    
    if not messages:
        format_errors["missing_messages_list"] += 1
        continue
        
    for message in messages:
        if "role" not in message or "content" not in message:
            format_errors["message_missing_key"] += 1
        
        if any(k not in ("role", "content", "name", "function_call") for k in message):
            format_errors["message_unrecognized_key"] += 1
        
        if message.get("role", None) not in ("system", "user", "assistant", "function"):
            format_errors["unrecognized_role"] += 1
        
        content = message.get("content", None)
        
        function_call = message.get("function_call", None)
        
        if (not content and not function_call) or not isinstance(content, str):
            format_errors["missing_content"] += 1
    
    if not any(message.get("role", None) == "assistant" for message in messages):
        format_errors["example_missing_assistant_message"] += 1

if format_errors:
    print("Found errors:")
    for k, v in format_errors.items():
        print(f"{k}: {v}")
else:
    print("No errors found")

No errors found


In [4]:
encoding = tiktoken.get_encoding("cl100k_base")

# not exact!
# simplified from https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
def num_tokens_from_messages(messages, tokens_per_message=3, tokens_per_name=1):
    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3
    return num_tokens

def num_assistant_tokens_from_messages(messages):
    num_tokens = 0
    for message in messages:
        if message["role"] == "assistant":
            num_tokens += len(encoding.encode(message["content"]))
    return num_tokens

def print_distribution(values, name):
    print(f"\n#### Distribution of {name}:")
    print(f"min / max: {min(values)}, {max(values)}")
    print(f"mean / median: {np.mean(values)}, {np.median(values)}")
    print(f"p5 / p95: {np.quantile(values, 0.1)}, {np.quantile(values, 0.9)}")

In [7]:
# Warnings and tokens counts
n_missing_system = 0
n_missing_user = 0
n_messages = []
convo_lens = []
assistant_message_lens = []

for ex in dataset:
    messages = ex["messages"]
    if not any(message["role"] == "system" for message in messages):
        n_missing_system += 1
    if not any(message["role"] == "user" for message in messages):
        n_missing_user += 1
    n_messages.append(len(messages))
    convo_lens.append(num_tokens_from_messages(messages))
    assistant_message_lens.append(num_assistant_tokens_from_messages(messages))
    

print("Num examples missing system message:", n_missing_system)
print("Num examples missing user message:", n_missing_user)
print_distribution(n_messages, "num_messages_per_example")
print_distribution(convo_lens, "num_total_tokens_per_example")
print_distribution(assistant_message_lens, "num_assistant_tokens_per_example")
n_too_long = sum(l > 4096 for l in convo_lens)
print(f"\n{n_too_long} examples may be over the 4096 token limit, they will be truncated during fine-tuning")

Num examples missing system message: 0
Num examples missing user message: 0

#### Distribution of num_messages_per_example:
min / max: 3, 3
mean / median: 3.0, 3.0
p5 / p95: 3.0, 3.0

#### Distribution of num_total_tokens_per_example:
min / max: 370, 2011
mean / median: 803.5777777777778, 804.0
p5 / p95: 402.6, 1258.4

#### Distribution of num_assistant_tokens_per_example:
min / max: 10, 50
mean / median: 20.0, 14.0
p5 / p95: 12.0, 37.0

0 examples may be over the 4096 token limit, they will be truncated during fine-tuning


In [41]:
# Pricing and default n_epochs estimate
MAX_TOKENS_PER_EXAMPLE = 4096

TARGET_EPOCHS = 3
MIN_TARGET_EXAMPLES = 100
MAX_TARGET_EXAMPLES = 25000
MIN_DEFAULT_EPOCHS = 1
MAX_DEFAULT_EPOCHS = 25

n_epochs = TARGET_EPOCHS
n_train_examples = len(dataset)
if n_train_examples * TARGET_EPOCHS < MIN_TARGET_EXAMPLES:
    n_epochs = min(MAX_DEFAULT_EPOCHS, MIN_TARGET_EXAMPLES // n_train_examples)
elif n_train_examples * TARGET_EPOCHS > MAX_TARGET_EXAMPLES:
    n_epochs = max(MIN_DEFAULT_EPOCHS, MAX_TARGET_EXAMPLES // n_train_examples)

n_billing_tokens_in_dataset = sum(min(MAX_TOKENS_PER_EXAMPLE, length) for length in convo_lens)
print(f"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training")
print(f"By default, you'll train for {n_epochs} epochs on this dataset")
print(f"By default, you'll be charged for ~{n_epochs * n_billing_tokens_in_dataset} tokens")

Dataset has ~36161 tokens that will be charged for during training
By default, you'll train for 3 epochs on this dataset
By default, you'll be charged for ~108483 tokens


## Creating fine-tuning job on gpt-3.5-turbo model

In [13]:
api_key = ""

openai.api_key = api_key

In [None]:

with open("training_data.jsonl", "rb") as file:
    response = openai.File.create(
        file=file,
        purpose='fine-tune'
    )

file_id = response['id']
print(f"File uploaded successfully with ID: {file_id}")


In [None]:
job_id = openai.FineTuningJob.create(
  training_file=file_id, 
  model="gpt-3.5-turbo"
)

print(job_id['id'])

In [15]:
list = openai.FineTuningJob.list()

In [None]:
print("Fine tuning completed with id:",list['data'][0]['fine_tuned_model'])