In [None]:
!pip install openai
!pip install numpy
!pip install tiktoken
!pip install Gradio

Fine-tuning an OpenAI GPT-3.5 Turbo chat model

In [None]:
import openai
import csv
import json
import os
import numpy as np
from collections import defaultdict
import tiktoken
import gradio as gr

In [None]:
# Prefer the OPENAI_API_KEY environment variable so a real key never has to be
# typed into (and saved with) the notebook; the placeholder is only a fallback.
openai.api_key = os.environ.get("OPENAI_API_KEY", "YOUR-API-KEY")

In [None]:
# Convert the CSV export to JSONL: every CSV cell is parsed as one JSON object.
csv_file_path = '/content/drive/MyDrive/Colab Notebooks/Fine-Tune3.5-Turbo/helper-example.csv'
cleaned_data = []

with open(csv_file_path, 'r', encoding='utf-8-sig') as file:
    for row in csv.reader(file):
        for raw_cell in row:
            # Drop the [" ... "] wrapper and un-escape inner double quotes so
            # the remaining text can be handed to the JSON parser.
            cell = raw_cell.replace('["', '').replace('"]', '').replace('\\"', '"')
            try:
                cell_json = json.loads(cell)
            except json.JSONDecodeError as e:
                # Cells that still do not parse are reported and left out.
                print(f"JSON decode error for cell '{cell}': {e}")
            else:
                cleaned_data.append(cell_json)

# One JSON document per line.
jsonl_file_path = '/content/drive/MyDrive/Colab Notebooks/Fine-Tune3.5-Turbo/helper-example-json.jsonl'
with open(jsonl_file_path, 'w', encoding='utf-8') as jsonl_file:
    jsonl_file.writelines(json.dumps(item) + '\n' for item in cleaned_data)

In [None]:
#from OpenAI website to format data;  https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset

# Location of the JSONL file produced by the conversion cell above.
data_path = '/content/drive/MyDrive/Colab Notebooks/Fine-Tune3.5-Turbo/helper-example-json.jsonl'

# Parse the file line by line: each line is one training example.
dataset = []
with open(data_path) as f:
    for line in f:
        dataset.append(json.loads(line))

# Quick sanity check: number of examples and the messages of the first one.
print("Num examples:", len(dataset))
print("First example:")
first_example = dataset[0]
for message in first_example["messages"]:
    print(message)

# Now that we have a sense of the data, we need to go through all the different examples and check to make sure the formatting is correct and matches the Chat completions message structure

# Format error checks: count, per category, how often an example deviates from
# the chat-completions message structure.
format_errors = defaultdict(int)

for ex in dataset:
    # An example that is not a dict cannot be inspected any further.
    if not isinstance(ex, dict):
        format_errors["data_type"] += 1
        continue

    # A missing or empty "messages" entry also ends the checks for this example.
    messages = ex.get("messages", None)
    if not messages:
        format_errors["missing_messages_list"] += 1
        continue

    for message in messages:
        if not ("role" in message and "content" in message):
            format_errors["message_missing_key"] += 1

        if not all(k in ("role", "content", "name") for k in message):
            format_errors["message_unrecognized_key"] += 1

        role = message.get("role", None)
        if role not in ("system", "user", "assistant"):
            format_errors["unrecognized_role"] += 1

        content = message.get("content", None)
        if not (content and isinstance(content, str)):
            format_errors["missing_content"] += 1

    # Every example needs at least one assistant message.
    roles_in_example = [message.get("role", None) for message in messages]
    if "assistant" not in roles_in_example:
        format_errors["example_missing_assistant_message"] += 1

if not format_errors:
    print("No errors found")
else:
    print("Found errors:")
    for k, v in format_errors.items():
        print(f"{k}: {v}")

# Beyond the structure of the message, we also need to ensure that the length does not exceed the 4096 token limit.

# Token counting functions
encoding = tiktoken.get_encoding("cl100k_base")

# not exact!
# simplified from https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
def num_tokens_from_messages(messages, tokens_per_message=3, tokens_per_name=1):
    """Approximate the token count of a list of chat messages.

    Each message contributes ``tokens_per_message`` plus the encoded length of
    every value it holds (using the module-level ``encoding``); a message that
    carries a "name" key contributes ``tokens_per_name`` on top. A constant 3
    is added once per conversation. As noted above, the result is not exact.
    """
    total = 0
    for message in messages:
        total += tokens_per_message
        total += sum(len(encoding.encode(value)) for value in message.values())
        if "name" in message:
            total += tokens_per_name
    return total + 3

def num_assistant_tokens_from_messages(messages):
    """Return the summed encoded length of the content of all assistant messages.

    Uses the module-level ``encoding``; messages with any other role are ignored.
    """
    return sum(
        len(encoding.encode(message["content"]))
        for message in messages
        if message["role"] == "assistant"
    )

def print_distribution(values, name):
    """Print summary statistics for a non-empty sequence of numbers.

    Args:
        values: the numbers to summarise (``min``/``max`` need at least one).
        name: label printed in the heading.

    Prints min/max, mean/median and the 5th/95th percentiles.
    """
    print(f"\n#### Distribution of {name}:")
    print(f"min / max: {min(values)}, {max(values)}")
    print(f"mean / median: {np.mean(values)}, {np.median(values)}")
    # The line is labelled p5 / p95, so the quantiles must be 0.05 and 0.95
    # (0.1 / 0.9 would be the 10th and 90th percentiles).
    print(f"p5 / p95: {np.quantile(values, 0.05)}, {np.quantile(values, 0.95)}")

# Last, we can look at the results of the different formatting operations before proceeding with creating a fine-tuning job:

# Warnings and token counts: per-example statistics gathered in one pass.
n_missing_system = 0
n_missing_user = 0
n_messages, convo_lens, assistant_message_lens = [], [], []

for ex in dataset:
    messages = ex["messages"]
    # "No message has role X" is the same as "every message has another role".
    if all(message["role"] != "system" for message in messages):
        n_missing_system += 1
    if all(message["role"] != "user" for message in messages):
        n_missing_user += 1
    n_messages.append(len(messages))
    convo_lens.append(num_tokens_from_messages(messages))
    assistant_message_lens.append(num_assistant_tokens_from_messages(messages))

print("Num examples missing system message:", n_missing_system)
print("Num examples missing user message:", n_missing_user)
for values, label in (
    (n_messages, "num_messages_per_example"),
    (convo_lens, "num_total_tokens_per_example"),
    (assistant_message_lens, "num_assistant_tokens_per_example"),
):
    print_distribution(values, label)
# Count the conversations whose estimated length is above 4096 tokens.
n_too_long = sum(1 for l in convo_lens if l > 4096)
print(f"\n{n_too_long} examples may be over the 4096 token limit, they will be truncated during fine-tuning")

# Pricing and default n_epochs estimate
MAX_TOKENS_PER_EXAMPLE = 4096

MIN_TARGET_EXAMPLES = 100
MAX_TARGET_EXAMPLES = 25000
TARGET_EPOCHS = 3
MIN_EPOCHS = 1
MAX_EPOCHS = 25

n_train_examples = len(dataset)
n_example_passes = n_train_examples * TARGET_EPOCHS
if n_example_passes < MIN_TARGET_EXAMPLES:
    # Small dataset: use more epochs, capped at MAX_EPOCHS.
    n_epochs = min(MAX_EPOCHS, MIN_TARGET_EXAMPLES // n_train_examples)
elif n_example_passes > MAX_TARGET_EXAMPLES:
    # Large dataset: use fewer epochs, but never fewer than MIN_EPOCHS.
    n_epochs = max(MIN_EPOCHS, MAX_TARGET_EXAMPLES // n_train_examples)
else:
    n_epochs = TARGET_EPOCHS

# Each example counts for at most MAX_TOKENS_PER_EXAMPLE tokens.
n_billing_tokens_in_dataset = 0
for length in convo_lens:
    n_billing_tokens_in_dataset += min(MAX_TOKENS_PER_EXAMPLE, length)

n_billed_tokens_total = n_epochs * n_billing_tokens_in_dataset
print(f"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training")
print(f"By default, you'll train for {n_epochs} epochs on this dataset")
print(f"By default, you'll be charged for ~{n_billed_tokens_total} tokens")

# Turn the token total into a dollar figure using the per-100k-token rate below.
cost_per_100k_tokens = 0.80  # Cost for every 100,000 tokens
estimated_cost = (n_billed_tokens_total / 100000) * cost_per_100k_tokens
print(f"Estimated cost for fine-tuning: approximately ${estimated_cost:.2f}") #I added this for actual cost based on current pricing

In [None]:
# Function to save the dataset as a JSONL file
def save_to_jsonl(conversations, file_path):
    """Write each item of ``conversations`` to ``file_path`` as one JSON line.

    Args:
        conversations: iterable of JSON-serialisable objects.
        file_path: destination path; an existing file is overwritten.

    The file is opened as UTF-8 explicitly, matching the other JSONL
    reader/writer code in this notebook instead of the platform default.
    """
    with open(file_path, 'w', encoding='utf-8') as file:
        for conversation in conversations:
            file.write(json.dumps(conversation) + '\n')

# Destination (on Google Drive) for the JSONL file that is uploaded for training.
jsonl_file_path = '/content/drive/MyDrive/Colab Notebooks/Fine-Tune3.5-Turbo/helper-example-json-clean.jsonl'
# Write `dataset` back out. NOTE(review): the validation cell above only reports
# problems and never filters `dataset`, so this "clean" file holds the same
# examples that were loaded.
save_to_jsonl(dataset, jsonl_file_path)

In [None]:
# Upload the training data to OpenAI
training_file_name = '/content/drive/MyDrive/Colab Notebooks/Fine-Tune3.5-Turbo/helper-example-json-clean.jsonl'

# Open the file in a context manager so the handle is closed as soon as the
# upload call returns (the previous inline open() was never closed).
with open(training_file_name, "rb") as training_file:
    training_response = openai.File.create(file=training_file, purpose="fine-tune")
training_file_id = training_response["id"]

# The file id is what the fine-tuning job refers to.
print("Training file id:", training_file_id)

In [None]:
# Create the fine-tuning job for the uploaded training file.
# suffix_name is passed as the job's `suffix` argument.
suffix_name = "chatner-bot"

response = openai.FineTuningJob.create(
    training_file=training_file_id,
    model="gpt-3.5-turbo",
    suffix=suffix_name,
)

# Keep the job id: the following cells use it to list events and retrieve the job.
job_id = response["id"]

print(response)

In [None]:
# List up to 50 events of the fine-tuning job; re-run this cell to follow progress.
response = openai.FineTuningJob.list_events(id=job_id, limit=50)

events = response["data"]
# Reverse the returned order before printing.
# NOTE(review): presumably the API returns newest-first, making this oldest-first — confirm.
events.reverse()

for event in events:
    print(event["message"])

In [None]:
# Retrieve the job and read the id of the fine-tuned model it produced.
response = openai.FineTuningJob.retrieve(job_id)
fine_tuned_model_id = response["fine_tuned_model"]

print(response)
print("\nFine-tuned model id:", fine_tuned_model_id)

# The job only carries a model id once training has finished. Say so explicitly
# instead of letting a None model id flow into the completion cells below.
if fine_tuned_model_id is None:
    print(
        f"Fine-tuning job status is {response.get('status')!r}; "
        "re-run this cell once the job has finished."
    )

In [None]:
# Build a two-message test conversation: the system prompt plus one user question.
system_message = "You are an overly friendly hospitality chatbot named Chatner who just loves to help people, and you're not satisfied unless the customer is completely satisfied."
user_message = "Where should we park"

test_messages = [
    {"role": "system", "content": system_message},
    {"role": "user", "content": user_message},
]

print(test_messages)

In [None]:
# Send the test conversation to the fine-tuned model and print the first reply.
# Requires fine_tuned_model_id from the retrieval cell above.
response = openai.ChatCompletion.create(
    model=fine_tuned_model_id, # swap in "gpt-3.5-turbo" to compare against the base model
    messages=test_messages,
    temperature=0,
    max_tokens=500
)
# Only the text of the first returned choice is shown.
print(response["choices"][0]["message"]["content"])

In [None]:
#Gradio for a better UI
def generate_completion(user_prompt):
    """Return the fine-tuned model's reply to ``user_prompt`` as stripped text.

    Sends an empty system message followed by the user's prompt to the model
    named by the module-level ``fine_tuned_model_id``.
    NOTE(review): the system message is empty here, whereas the test cell
    earlier in the notebook sends the Chatner system prompt — confirm this is
    intended.
    """
    completion = openai.ChatCompletion.create(
        model=fine_tuned_model_id,
        messages=[
            {"role": "system", "content": ""},
            {"role": "user", "content": user_prompt},
        ],
        max_tokens=100,
        temperature=0,
    )
    reply = completion['choices'][0]['message']['content']
    return reply.strip()

# Gradio UI around generate_completion.
# Components come from the top-level gr namespace (gr.inputs is the removed
# legacy namespace) and carry their own labels, because gr.Interface has no
# input_labels / output_labels parameters.
iface = gr.Interface(
    fn=generate_completion,
    inputs=gr.Textbox(lines=5, placeholder='Question about the Airbnb?', label="Question"),
    outputs=gr.Textbox(label="Response"),
    title="Chatner The Airbnb Helper",
)

# share=True asks Gradio for a shareable link in addition to the local one.
iface.launch(share=True)

Hugging Face: fine-tuning GPT-2 as a chatbot

In [None]:
!pip install transformers torch pandas



In [None]:
import json

# pandas is used below; import it here so this cell also runs on a fresh
# kernel (it was previously first imported in a later cell).
import pandas as pd

file_path = '/content/helper-example.csv'

# Read the CSV file (no header row, no index column)
df = pd.read_csv(file_path,header=None,index_col=None)

# Output JSONL file path
jsonl_file_path = '/content/helper-example.jsonl'

# Convert and write each row to the JSONL file
with open(jsonl_file_path, 'w') as jsonl_file:
    for _, row in df.iterrows():
        # Each row is serialised with Series.to_json() and written as one line.
        jsonl_file.write(row.to_json() + '\n')

In [None]:
#WORKING
import pandas as pd
import json

def read_jsonl(file_path):
    """Read a UTF-8 JSON Lines file and return its records as a list.

    Args:
        file_path: path of the file to read.

    Returns:
        One parsed JSON value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank lines (e.g. a trailing empty line) are skipped instead of
            # being handed to the JSON parser, which would raise on them.
            if not line.strip():
                continue
            data.append(json.loads(line))
    return data

def preprocess_data(data):
    """Turn conversations into (input_text, target_text) training pairs.

    For every assistant message, the pair consists of the stripped text of all
    preceding messages in that conversation (joined with single spaces) and
    the stripped assistant reply. The reply is then added to the running
    context so that later assistant turns see it as well.
    """
    pairs = []
    for convo in data:
        history = ""
        for msg in convo['messages']:
            role = msg['role']
            text = msg['content']
            if role == 'assistant':
                # Assistant turns are stored stripped, and enter the history stripped.
                text = text.strip()
                pairs.append((history.strip(), text))
            history += " " + text
    return pairs

# Path to the JSON Lines file holding the conversations.
# NOTE(review): this is /content/..., not the Drive folder the OpenAI section
# wrote helper-example-json-clean.jsonl to — confirm the file has been copied here.
file_path = '/content/helper-example-json-clean.jsonl'

# Read the conversations; the pair-based preprocessing is currently disabled.
raw_data = read_jsonl(file_path)
print("RAW",raw_data)
#training_data = preprocess_data(raw_data)
#print(training_data)

# If the two commented-out lines above are re-enabled, `training_data` is a
# list of (input_text, target_text) tuples; as written, only `raw_data` exists.


RAW [{'messages': [{'role': 'system', 'content': "You are an overly friendly hospitality chatbot named Chatner who just loves to help people, and you're not satisfied unless the customer is completely satisfied."}, {'role': 'user', 'content': 'Where is the pool located?'}, {'role': 'assistant', 'content': "Ah, the pool! Great choice. It's on the ground floor, just past the elevators. Make sure you bring your room key for access! Let me know if you need further directions!"}]}, {'messages': [{'role': 'system', 'content': "You are an overly friendly hospitality chatbot named Chatner who just loves to help people, and you're not satisfied unless the customer is completely satisfied."}, {'role': 'user', 'content': 'Is breakfast included?'}, {'role': 'assistant', 'content': "Oh, I'm thrilled you asked about breakfast! Yes, it's included and served from 7 to 10 a.m. in the main dining area. Enjoy!"}]}, {'messages': [{'role': 'system', 'content': "You are an overly friendly hospitality chatbo

TRAINING HUGGING FACE

In [None]:
def process_json_conversation(json_object):
    """Flatten one conversation into a single training string.

    Every message becomes "<role>: <content> " (note the trailing space) and
    the pieces are concatenated in message order.
    """
    return "".join(
        f"{turn['role']}: {turn['content']} " for turn in json_object['messages']
    )
# json_data is just another name for the conversations loaded into raw_data.
json_data = raw_data

# One flattened "role: content ..." string per conversation.
processed_texts = [process_json_conversation(json_obj) for json_obj in json_data]
print(processed_texts)
# Last expression of the cell: shows the number of training texts.
len(processed_texts)

["system: You are an overly friendly hospitality chatbot named Chatner who just loves to help people, and you're not satisfied unless the customer is completely satisfied. user: Where is the pool located? assistant: Ah, the pool! Great choice. It's on the ground floor, just past the elevators. Make sure you bring your room key for access! Let me know if you need further directions! ", "system: You are an overly friendly hospitality chatbot named Chatner who just loves to help people, and you're not satisfied unless the customer is completely satisfied. user: Is breakfast included? assistant: Oh, I'm thrilled you asked about breakfast! Yes, it's included and served from 7 to 10 a.m. in the main dining area. Enjoy! ", "system: You are an overly friendly hospitality chatbot named Chatner who just loves to help people, and you're not satisfied unless the customer is completely satisfied. user: I need more towels assistant: I'm so glad you reached out; cleanliness is key! We'll get more tow

85

In [None]:
from transformers import GPT2Tokenizer



# Load the pretrained GPT-2 tokenizer.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Reuse the end-of-sequence token as the padding token so padded encoding works.
# NOTE(review): presumably needed because GPT-2 ships without a pad token — confirm.
tokenizer.pad_token = tokenizer.eos_token


def tokenize_pairs(pairs, tokenizer, max_length):
    """Encode (input_text, response_text) pairs into lists of token ids.

    Each pair is joined as input + tokenizer.eos_token + response and encoded
    with truncation to ``max_length``; results are returned in input order.
    """
    return [
        tokenizer.encode(
            prompt + tokenizer.eos_token + reply,  # EOS separates input from response
            truncation=True,
            max_length=max_length,
        )
        for prompt, reply in pairs
    ]

max_length = 512  # Or any suitable maximum length
#tokenized_conversations = tokenize_pairs(training_data, tokenizer, max_length)


In [None]:
from torch.utils.data import Dataset, DataLoader
import torch

class ConversationDataset(Dataset):
    """Torch dataset that tokenizes conversation strings for causal-LM training.

    Args:
        texts: sequence of conversation strings.
        tokenizer: tokenizer providing ``encode_plus`` (and, optionally, ``eos_token``).
        max_length: every example is padded/truncated to this many tokens.
    """

    def __init__(self, texts, tokenizer, max_length):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for example ``idx``."""
        # End every example with EOS so the model can learn where a
        # conversation stops (truncation may still cut it off on long texts).
        eos_token = getattr(self.tokenizer, 'eos_token', None) or ''
        text = self.texts[idx] + eos_token
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        # Drop only the leading batch dimension added by return_tensors='pt'.
        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)

        # Labels are a separate copy with padding positions set to -100, the
        # index ignored by the loss. Otherwise the model is trained to emit
        # padding tokens after every answer.
        labels = input_ids.clone()
        labels[attention_mask == 0] = -100
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }


# Wrap the flattened conversation strings in the torch dataset.
# NOTE: this rebinds `dataset`, which in the OpenAI section above held the list
# of parsed JSONL examples; re-run that section before reusing its cells.
dataset = ConversationDataset(processed_texts,tokenizer,max_length)
dataset

<__main__.ConversationDataset at 0x788e7891f7f0>

In [None]:
# Smoke test: batch the dataset and print the attention-mask shape of the first
# batch only (the loop stops after one iteration).
data_loader = DataLoader(dataset, batch_size=2, shuffle=True)
for batch in data_loader:
    print(batch['attention_mask'].shape)
    break

torch.Size([2, 512])


In [None]:
!pip install transformers[torch]
!pip install accelerate -U

Collecting accelerate>=0.20.3 (from transformers[torch])
  Downloading accelerate-0.26.1-py3-none-any.whl (270 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.9/270.9 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.26.1


In [None]:
from transformers import GPT2LMHeadModel, TrainingArguments, Trainer
# Register a dedicated '[PAD]' token. The vocabulary grows by one entry, so the
# model's embedding matrix is resized to the new tokenizer length below.
# (A previous encode_plus() call over the whole `processed_texts` list was
# removed: it fed the list as a single input, its result was never used, and it
# overwrote the tiktoken `encoding` object from the OpenAI section.)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
print(len(tokenizer))

model = GPT2LMHeadModel.from_pretrained('gpt2')
model.resize_token_embeddings(len(tokenizer))

# No evaluation strategy is passed: evaluation is off by default, and omitting
# the argument avoids depending on its version-specific name.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=20,
    per_device_train_batch_size=2,
    logging_steps=50,
    save_steps=50,
)


50258


In [None]:
# Fine-tune GPT-2 on the conversation dataset with the arguments defined above.
# No evaluation dataset is passed, so only the training loss is reported.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset
)

trainer.train()

Step,Training Loss
50,1.7714
100,0.1452
150,0.1052
200,0.1176
250,0.0687
300,0.0595
350,0.0442
400,0.0392
450,0.0328
500,0.0308


TrainOutput(global_step=860, training_loss=0.1494635754719723, metrics={'train_runtime': 570.6723, 'train_samples_per_second': 2.979, 'train_steps_per_second': 1.507, 'total_flos': 444196454400000.0, 'train_loss': 0.1494635754719723, 'epoch': 20.0})

In [None]:
# Save the model and tokenizer side by side so they can be reloaded together.
# NOTE: the path is relative to the current working directory; the inference
# cell loads "/content/content/custom_chatgpt_model2", which only matches when
# this cell runs with /content as the working directory.
trainer.save_model("content/custom_chatgpt_model2")
tokenizer.save_pretrained("content/custom_chatgpt_model2")


('content/custom_chatgpt_model2/tokenizer_config.json',
 'content/custom_chatgpt_model2/special_tokens_map.json',
 'content/custom_chatgpt_model2/vocab.json',
 'content/custom_chatgpt_model2/merges.txt',
 'content/custom_chatgpt_model2/added_tokens.json')

INFERENCE

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Reload the fine-tuned model and its tokenizer from the saved directory.
model_path = "/content/content/custom_chatgpt_model2"  # Replace with your model directory
model = GPT2LMHeadModel.from_pretrained(model_path)
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
# NOTE(review): training registered '[PAD]' as the pad token before the
# tokenizer was saved; this line switches padding back to EOS — confirm intent.
tokenizer.pad_token = tokenizer.eos_token

# Put the model in evaluation mode for inference.
model.eval()


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50258, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50258, bias=False)
)

In [None]:
import torch

def generate_response(input_text, model, tokenizer, max_length=50):
    """Generate a continuation of ``input_text`` with beam search.

    Args:
        input_text: prompt string.
        model: causal language model exposing ``generate``.
        tokenizer: tokenizer matching ``model``.
        max_length: cap on the total sequence length (prompt plus generated
            tokens). The prompt is truncated to this many tokens, so a prompt
            that reaches the cap leaves no room for a reply.

    Returns:
        The decoded sequence (prompt followed by the generated text).
    """
    # The prompt is deliberately NOT padded. Padding a single prompt to
    # max_length made it exactly max_length tokens long, which left
    # generate(max_length=...) no room to produce any new token.
    encoding = tokenizer.encode_plus(
        input_text,
        add_special_tokens=True,
        max_length=max_length,
        truncation=True,
        return_tensors='pt',
        return_attention_mask=True
    )

    with torch.no_grad():
        output_ids = model.generate(
            encoding['input_ids'],
            attention_mask=encoding['attention_mask'],
            max_length=max_length,
            num_beams=3,
            no_repeat_ngram_size=2,
            early_stopping=True,
            # Pass the pad id explicitly rather than relying on the fallback.
            pad_token_id=tokenizer.pad_token_id,
        )

    return tokenizer.decode(output_ids[0], skip_special_tokens=True)



In [None]:
input_text = "Is there a dress code for the restaurant?"  # Replace with the text you want to process
inputs = tokenizer.encode(input_text, return_tensors="pt")

# Beam search and n-gram blocking are generation options, so they are passed to
# generate(); previously they were given to tokenizer.decode(), where they had
# no effect. The attention mask (all ones: a single unpadded prompt) and the
# pad token id are passed explicitly to avoid the "attention mask and pad token
# id were not set" warnings.
with torch.no_grad():
    output_ids = model.generate(
        inputs,
        attention_mask=torch.ones_like(inputs),
        max_length=50,  # Adjust max_length as needed
        num_beams=3,
        no_repeat_ngram_size=3,
        pad_token_id=tokenizer.eos_token_id,
    )
response = tokenizer.decode(output_ids[0], skip_special_tokens=True)

print(response)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Is there a dress code for the restaurant? assistant: The dress code is casual, but we recommend smart casual for dinner. Feel free to dress comfortably!  [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]


In [None]:
!pip install transformers[torch]

Collecting accelerate>=0.20.3 (from transformers[torch])
  Downloading accelerate-0.26.1-py3-none-any.whl (270 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.9/270.9 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.26.1
