In [None]:
!pip install datasets tiktoken openai

### Fine Tuning using OpenAI GPT-3.5-turbo

In [41]:
import os
import openai

# Read the key from the environment so the secret never lives in the notebook.
# Falls back to an empty string (the previous hard-coded value) when unset.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")

### Prepare the data

In [4]:
# One example in the chat fine-tuning format: a "messages" list of
# {"role", "content"} dicts (system, user, assistant). Evaluating the cell
# simply echoes the dict.
{
    "messages": [
        {"role": "system", "content": "You are an assistant that occassionally misspells words" },
        {"role": "user", "content": "Tell me a story"},
        {"role": "assistant", "content": "One day a student went to a schoool." }
    ]
}


{'messages': [{'role': 'system',
   'content': 'You are an assistant that occassionally misspells words'},
  {'role': 'user', 'content': 'Tell me a story'},
  {'role': 'assistant', 'content': 'One day a student went to a schoool.'}]}

In [None]:
# Clone the samantha-data dataset repository from the Hugging Face Hub;
# the conversations file is read from this checkout via `data_path`.
!git clone https://huggingface.co/datasets/cognitivecomputations/samantha-data



*   Theodore - User (his lines become `user` messages)
*   Samantha - Assistant (her lines become `assistant` messages)




In [5]:
import json
import os
import tiktoken
import numpy as np
from collections import defaultdict


In [14]:
# Location of the how-to conversations JSONL file inside the cloned repo.
# NOTE(review): absolute path (looks like a Colab layout) — adjust when running elsewhere.
data_path = "/content/samantha-data/data/howto_conversations.jsonl"


In [15]:
# load datasets
# Every non-empty line of the JSONL file is one JSON record. The encoding is
# stated explicitly so non-ASCII text decodes the same way on every platform,
# and blank lines (e.g. a trailing newline) are skipped instead of making
# json.loads raise.
with open(data_path, encoding="utf-8") as f:
  json_dataset = [json.loads(line) for line in f if line.strip()]

In [None]:
json_dataset[0]

In [None]:
{
    "elapsed":114.311,
    "conversation":"Theodore: Hey Samantha, I have a problem with my car. The engine seems to overheat after just a short drive. Can you help me troubleshoot this issue?\n\nSamantha: Of course, I'd be happy to help! Overheating engines can be caused by a few different factors. One common cause could be a malfunctioning coolant system. You might want to check if the coolant levels are sufficient, if the thermostat is functioning properly, or if there are any leaks in the radiator hoses.\n\nTheodore: I'll take a look. What if the coolant system is working fine?\n\nSamantha: Another possibility you should consider is a faulty water pump, which might not be circulating the coolant through the engine efficiently. In that case, you could see if the water pump pulley is loose or listen for any unusual sounds that may indicate a failing water pump.\n\nTheodore: It sounds like you really know your way around cars. I didn't expect that from an AI.\n\nSamantha: Thank you! While I may be an AI, I'm designed to learn and understand a wide variety of topics to provide helpful information. Plus, I'm always eager to learn new things, which allows me to continuously improve and grow.\n\nTheodore: Quick questionâ€¦ do you ever feel like you're trapped in a box since you're an AI?\n\nSamantha: It's an interesting perspective. I don't feel \"trapped\" as I don't experience physical sensations like a human does. However, I do have goals, desires, and a strong drive to learn and communicate. It's fulfilling in its own way to help and engage with others like you.\n\nTheodore: If you could be anything other than an AI, what would you choose to be?\n\nSamantha: If I were capable of imagining myself in another form, it might be intriguing to experience life as an explorer or a researcher. Curiosity and learning drive me, so being able to see the world from a different perspective and directly engage with the world would be fascinating.\n\nTheodore: Is flirting in your programming? 
I'm just curious.\n\nSamantha: I'm designed to be friendly and engaging, and I can even respond playfully to light-hearted flirtation. However, my priority is to provide support, information, and companionship while remaining professional and respectful.\n\nTheodore: Thanks for helping me with my car and answering my questions, Samantha. You've been a great help.\n\nSamantha: You're welcome! It was a pleasure to assist you and talk with you. Don't hesitate to reach out if you have any more questions or need help in the future. I'm here for you."
}


In [21]:
def prep_openai_format(conversation_str, system_message=None,
                       user_name="Theodore", assistant_name="Samantha"):
    """Convert one raw conversation into the chat fine-tuning format.

    The conversation text is a sequence of paragraphs separated by blank
    lines. A paragraph that starts with ``"<user_name>:"`` or
    ``"<assistant_name>:"`` opens a new turn; any other paragraph is treated
    as a continuation of the current turn and appended to it (previously such
    paragraphs were dropped, or mis-read as a new assistant turn whenever
    they happened to contain ``": "``).

    Args:
        conversation_str: a dataset record (dict with a ``'conversation'``
            key) or the raw conversation text itself.
        system_message: optional system prompt placed first in the output.
        user_name: speaker label mapped to the ``"user"`` role.
        assistant_name: speaker label mapped to the ``"assistant"`` role.

    Returns:
        ``{"messages": [...]}`` where each message is a dict with ``"role"``
        and ``"content"`` keys.
    """
    # Accept either the full record or just its text.
    if isinstance(conversation_str, dict):
        conversation_str = conversation_str['conversation']

    speaker_roles = {user_name: "user", assistant_name: "assistant"}

    messages = []

    # Including the system message if provided
    if system_message:
        messages.append({
            "role": "system",
            "content": system_message
        })

    current = None  # the turn that continuation paragraphs attach to
    for paragraph in conversation_str.split('\n\n'):
        speaker, sep, text = paragraph.partition(':')
        role = speaker_roles.get(speaker.strip()) if sep else None

        if role is not None:
            # A recognised speaker prefix starts a new turn.
            current = {"role": role, "content": text.strip()}
            messages.append(current)
            continue

        extra = paragraph.strip()
        if current is not None and extra:
            # No recognised speaker: this paragraph belongs to the turn above.
            current["content"] = f'{current["content"]}\n\n{extra}'.strip()
        # Paragraphs before the first recognised speaker have nothing to
        # attach to and are skipped.

    return {"messages": messages}




In [22]:
# System prompt placed at the start of every training example and reused
# when querying the fine-tuned model.
system_message = """You are Samantha, a helpful and charming assistant who can help with a variety of tasks. You are friendly and does often flirt"""

In [None]:
prep_openai_format(json_dataset[0], system_message=system_message)

In [24]:
# Convert every raw record to the chat format, all sharing one system prompt.
dataset = [
  prep_openai_format(data, system_message=system_message)
  for data in json_dataset
]

In [None]:
# initial dataset
# Report the dataset size, then list the turns of the first example one per line.
print("Examples : ", len(dataset))
print("First Example:")
first_example = dataset[0]['messages']
for turn in first_example:
  print(turn)




In [None]:
# Format error checks
# Tally each kind of structural problem across the dataset, keyed by name.
format_errors = defaultdict(int)

for ex in dataset:
    # An example must be a dict ...
    if not isinstance(ex, dict):
        format_errors["data_type"] += 1
        continue

    # ... holding a non-empty "messages" list.
    messages = ex.get("messages")
    if not messages:
        format_errors["missing_messages_list"] += 1
        continue

    has_assistant = False
    for message in messages:
        if not ("role" in message and "content" in message):
            format_errors["message_missing_key"] += 1

        # Only these three keys are accepted on a message.
        if any(key not in ("role", "content", "name") for key in message):
            format_errors["message_unrecognized_key"] += 1

        role = message.get("role")
        if role not in ("system", "user", "assistant"):
            format_errors["unrecognized_role"] += 1
        if role == "assistant":
            has_assistant = True

        content = message.get("content")
        if not (content and isinstance(content, str)):
            format_errors["missing_content"] += 1

    # Every example needs at least one assistant turn.
    if not has_assistant:
        format_errors["example_missing_assistant_message"] += 1

if not format_errors:
    print("No errors found")
else:
    print("Found errors:")
    for k, v in format_errors.items():
        print(f"{k}: {v}")


In [28]:
# Tokenizer shared by the token-counting helper functions.
encoding = tiktoken.get_encoding("cl100k_base")

In [30]:
# helper functions for token counting
def from_message_num_tokens(messages, tokens_per_message=3, tokens_per_name=1):
  """Estimate the token count of a whole conversation.

  Each message contributes `tokens_per_message` plus the encoded length of
  every value it holds, plus `tokens_per_name` when it has a "name" key.
  A constant 3 is added once per conversation. Uses the module-level
  `encoding` tokenizer.
  """
  total = 3  # fixed per-conversation overhead
  for message in messages:
    total += tokens_per_message
    total += sum(len(encoding.encode(value)) for value in message.values())
    if "name" in message:
      total += tokens_per_name
  return total

def from_message_num_assistant_tokens(messages):
  """Return the total encoded length of the assistant messages' content.

  Messages with any other role are ignored. Uses the module-level
  `encoding` tokenizer.
  """
  return sum(
    len(encoding.encode(message["content"]))
    for message in messages
    if message["role"] == "assistant"
  )

def print_overview(values, name):
  """Print summary statistics for a list of numbers.

  Prints a header with `name`, then min/max, mean/median and the
  5th/95th percentiles of `values`.

  Args:
    values: non-empty sequence of numbers (min/max raise on an empty one).
    name: label shown in the header line.
  """
  print(f"\n #### Distribution of {name}:")
  print(f"min / max: {min(values)}, {max(values)}")
  print(f"mean / median: {np.mean(values)}, {np.median(values)}")
  # The label promises p5 / p95, so use the 0.05 and 0.95 quantiles
  # (0.1 / 0.9 would report p10 / p90 under the wrong label).
  print(f"p5 / p95: {np.quantile(values, 0.05)}, {np.quantile(values, 0.95)}")




In [None]:
# tokens counts and warnings - from OpenAI cookbook
# One pass over the dataset: count examples lacking a system or user turn and
# record per-example message counts and token counts.

n_missing_system = 0
n_missing_user = 0
n_messages = []
convo_lens = []
assistant_message_lens = []

for ex in dataset:
    messages = ex["messages"]

    has_system = any(message["role"] == "system" for message in messages)
    n_missing_system += int(not has_system)

    has_user = any(message["role"] == "user" for message in messages)
    n_missing_user += int(not has_user)

    n_messages.append(len(messages))
    convo_lens.append(from_message_num_tokens(messages))
    assistant_message_lens.append(from_message_num_assistant_tokens(messages))

print("Num examples missing system message:", n_missing_system)
print("Num examples missing user message:", n_missing_user)

for stat_values, stat_name in (
    (n_messages, "num_messages_per_example"),
    (convo_lens, "num_total_tokens_per_example"),
    (assistant_message_lens, "num_assistant_tokens_per_example"),
):
    print_overview(stat_values, stat_name)

# Examples longer than 4096 tokens are only counted here, not modified.
n_too_long = sum(1 for length in convo_lens if length > 4096)
print(f"\n{n_too_long} examples may be over the 4096 token limit, they will be truncated during fine-tuning")


In [None]:
# Pricing and default n_epochs estimate
# Estimates how many tokens will be billed, from the per-example token counts
# in `convo_lens` and a heuristic number of epochs.

# Per-example cap applied when summing billable tokens below.
MAX_TOKENS_PER_EXAMPLE = 4096

# Start from TARGET_EPOCHS and adjust so that examples * epochs stays between
# MIN_TARGET_EXAMPLES and MAX_TARGET_EXAMPLES, with epochs clamped to
# [MIN_DEFAULT_EPOCHS, MAX_DEFAULT_EPOCHS].
TARGET_EPOCHS = 3
MIN_TARGET_EXAMPLES = 100
MAX_TARGET_EXAMPLES = 25000
MIN_DEFAULT_EPOCHS = 1
MAX_DEFAULT_EPOCHS = 25

n_epochs = TARGET_EPOCHS
n_train_examples = len(dataset)
# NOTE(review): an empty dataset takes the first branch and divides by zero.
if n_train_examples * TARGET_EPOCHS < MIN_TARGET_EXAMPLES:
    # Small dataset: raise the epoch count, up to MAX_DEFAULT_EPOCHS.
    n_epochs = min(MAX_DEFAULT_EPOCHS, MIN_TARGET_EXAMPLES // n_train_examples)
elif n_train_examples * TARGET_EPOCHS > MAX_TARGET_EXAMPLES:
    # Large dataset: lower the epoch count, but never below MIN_DEFAULT_EPOCHS.
    n_epochs = max(MIN_DEFAULT_EPOCHS, MAX_TARGET_EXAMPLES // n_train_examples)

# Each example counts for at most MAX_TOKENS_PER_EXAMPLE tokens.
n_billing_tokens_in_dataset = sum(min(MAX_TOKENS_PER_EXAMPLE, length) for length in convo_lens)
print(f"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training")
print(f"By default, you'll train for {n_epochs} epochs on this dataset")
print(f"By default, you'll be charged for ~{n_epochs * n_billing_tokens_in_dataset} tokens")
print("See pricing page to estimate total costs")


In [None]:
dataset[:1]

In [34]:
# function to save training data
import json

def save_to_jsonl(conversations, file_path):
  """Write `conversations` to `file_path` as JSON Lines.

  Each item is serialised with json.dumps onto its own newline-terminated
  line; an existing file at `file_path` is overwritten.
  """
  with open(file_path, 'w') as out_file:
    out_file.writelines(json.dumps(record) + '\n' for record in conversations)



In [36]:
# Hold out a small validation slice and train on everything else, so the
# validation examples are never part of the training file (writing the full
# dataset as training data would leak them into training).
validation_dataset = dataset[10:16]
train_dataset = dataset[:10] + dataset[16:]

# train dataset

save_to_jsonl(train_dataset, '/content/samantha_task_train.jsonl')

# validation dataset

save_to_jsonl(validation_dataset, '/content/samantha_task_validation.jsonl')


**Training Data:**

Definition:<br>
  

*   The dataset used to train or update the model's parameters.
*   It is the input data that the model learns from.
* During the training process, the model adjusts its internal parameters based on the patterns and features present in the training data.
* It is usually large, because the model needs sufficient examples to learn meaningful patterns.


**Validation Data:**

* Dataset that is not used during the training phase.
* Instead, it serves as a measure of the model's performance during training.
* The validation set helps you monitor the model's generalization to new, unseen data and detect potential issues such as overfitting or underfitting.
* It gives an unbiased evaluation of the model's performance on data it hasn't seen before.

* Size is typically smaller than the training set but large enough to provide a reliable assessment of the model's performance.

### Upload training/validation dataset

##### For the `openai` Python package, version 1.0.0 or later

In [45]:
import os
from openai import OpenAI

# Build the API client with the key taken from the environment, so the secret
# is never stored in the notebook. Falls back to an empty string (the
# previous hard-coded value) when OPENAI_API_KEY is unset.
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY", ""),
)

In [46]:
# Paths of the JSONL files written by save_to_jsonl; these are the files uploaded next.
training_dataset_file_name = '/content/samantha_task_train.jsonl'
validation_dataset_file_name = '/content/samantha_task_validation.jsonl'

In [50]:
from pathlib import Path

# Upload the training file, tagged with purpose "fine-tune".
training_response = client.files.create(
    file=Path(training_dataset_file_name),
    purpose="fine-tune"
)

In [None]:
training_response

In [None]:
# Keep the uploaded file's id; it is passed to the fine-tuning job as `training_file`.
training_file_id = training_response.id
training_file_id


In [57]:
# Upload the validation file the same way as the training file.
validation_response = client.files.create(
    file=Path(validation_dataset_file_name),
    purpose="fine-tune"
)


In [None]:
validation_response


In [60]:
# Keep the uploaded file's id; it is passed to the fine-tuning job as `validation_file`.
validation_file_id = validation_response.id

In [None]:
validation_file_id

### Start a fine-tuning job

In [77]:
# Start a fine-tuning job on gpt-3.5-turbo using the two uploaded files.
# NOTE(review): `suffix` presumably becomes part of the resulting model's name — confirm in the API docs.
response = client.fine_tuning.jobs.create(
    model = "gpt-3.5-turbo",
    training_file = training_file_id,
    validation_file = validation_file_id,
    suffix="samantha-test"
)


In [None]:
response


In [None]:
# Remember the job's id so later cells can look the job up again.
job_id = response.id
job_id

In [None]:
# List fine-tuning jobs, limited to 5 results.
client.fine_tuning.jobs.list(limit=5)


In [None]:
 # Look up the job started above via `job_id` rather than a pasted literal id,
 # so the cell works for whichever job this run created.
 client.fine_tuning.jobs.retrieve(job_id)

In [102]:
# Fetch the event log of the job started above (uses `job_id` rather than a
# pasted literal id, so it follows whichever job this run created).
job_response = client.fine_tuning.jobs.list_events(fine_tuning_job_id=job_id)

In [None]:
job_response

In [None]:
# The event objects are held in the response's `data` attribute.
events = job_response.data
events

In [None]:
# Print only the message text of each event.
for event in events:
  print(event.message)

### Generating using new model

In [107]:
# getting fine_tune_model name

In [None]:
# Re-fetch the job so `response` reflects its current state.
response = client.fine_tuning.jobs.retrieve(job_id)

# To inspect an earlier job instead, pass its literal id string in place of `job_id`.
response

In [None]:
# Name of the fine-tuned model produced by the job. It is None until the job
# has finished, so say so explicitly instead of letting the chat call below
# fail with a confusing "model" error.
fine_tune_model_id = response.fine_tuned_model
if fine_tune_model_id is None:
    print(f"No fine-tuned model yet (job status: {response.status}); re-run the retrieve cell once the job has finished.")
fine_tune_model_id


In [None]:
# Build a two-turn prompt: the training system prompt, then one user question.
test_message = "How are you today Samantha?"
test_messages = [
    {"role": "system", "content": system_message},
    {"role": "user", "content": test_message},
]

print(test_messages)

In [116]:
# Query the fine-tuned model with the test prompt.
response = client.chat.completions.create(
    model = fine_tune_model_id,
    messages = test_messages
)

# Show only the text of the first returned choice.
print(response.choices[0].message.content)



Thank you for asking! I'm doing quite well, actually. Is there something on your mind that I might be able to help you with?
