In [1]:
import os

# Read the PARTITION_DATA flag from the environment. Only the text "true"
# (in any letter case) enables partitioning; a missing variable or any other
# value leaves it disabled.
partition_flag_text = os.environ.get('PARTITION_DATA', 'false')
partition_data = partition_flag_text.lower() == 'true'

print(f"Partition data: {partition_data}")

Partition data: False


In [1]:
import socketio
import pandas as pd
import json
import random
import openai # Install the OpenAI Python package with `pip install openai`
import time
import os

# Socket.IO client shared by the notification helpers in this notebook
sio = socketio.Client()


@sio.event
def connect():
    """Print a confirmation line; registered on the client through ``@sio.event``."""
    print('Connected to server')

def send_message_to_server(message):
    """Emit ``message`` on the shared client as a 'message_from_notebook' event."""
    event_name = 'message_from_notebook'
    sio.emit(event_name, message)
def send_csv_data(csv_data):
    """Emit ``csv_data`` on the shared client as a 'csv_data' event."""
    event_name = 'csv_data'
    sio.emit(event_name, csv_data)
# Open the Socket.IO connection to the locally running server
server_url = 'http://localhost:5000'
sio.connect(server_url)

# Send a message to the server
#send_message_to_server('Starting finetuning!')

Connected to server


In [18]:
# Tell the server that the fine-tuning run is starting
start_notice = 'Starting finetuning!'
send_message_to_server(start_notice)

In [4]:
# Load the combined specification table
data = pd.read_csv('output/specifications/mega_combined.csv')

if partition_data:
    print("Partitioning data into training and evaluation sets...")
    send_message_to_server('Partitioning data into training and evaluation sets...')
    # 80/20 split; the fixed random_state keeps the split reproducible
    train_data = data.sample(frac=0.8, random_state=42)
    eval_data = data.drop(train_data.index)

    # Save evaluation data for later use
    eval_data.to_csv('evaluation_data.csv', index=False)
else:
    print("Using all data for fine-tuning...")
    send_message_to_server('Using all data for fine-tuning...')
    # Use all data for fine-tuning
    train_data = data

# Build the fine-tuning examples from the training split only. Using `data`
# here would put the held-out evaluation rows back into the training set
# whenever partitioning is enabled. With partitioning disabled, train_data
# is the full table, so nothing changes.
df = train_data

# System message placed at the start of every training conversation
system_message = "Marv is a factual chatbot that provides complete specifications based on user requirements."

def create_finetuning_data(df, system_message, seed=None):
    """Build chat-format fine-tuning conversations from a specification table.

    For every row, one or two columns holding a real value are picked at
    random and turned into a user request; the assistant reply lists every
    column of the row as ``col: value`` pairs.

    Args:
        df: DataFrame with one specification per row.
        system_message: Text used as the system message of each conversation.
        seed: Optional seed for the column selection. When None (the default)
            the module-level ``random`` generator is used.

    Returns:
        A list of ``{"messages": [...]}`` dicts, one per row that has at least
        one usable column. Rows with no usable column are skipped.
    """

    def _is_missing(value):
        # Blank CSV cells are read by pandas as NaN, not '' or None, so a plain
        # membership test against ['-', '', None] lets them through.
        if isinstance(value, str):
            return value.strip() in ('-', '')
        return bool(pd.isna(value))

    rng = random.Random(seed) if seed is not None else random
    columns = list(df.columns)

    conversations = []
    for _, row in df.iterrows():
        # Keep only the columns that carry an actual value in this row
        valid_columns = [col for col in columns if not _is_missing(row[col])]

        if not valid_columns:
            continue  # Skip rows where no valid columns are found

        # Randomly select one or two valid columns from the row
        selected_columns = rng.sample(valid_columns, k=min(2, len(valid_columns)))

        # Construct the user content with the selected columns and their values
        user_content = "Please provide the complete specification for the following requirements: "
        user_content += ", ".join(f"{col}: {row[col]}" for col in selected_columns) + "."

        # The assistant reply deliberately lists every column, placeholders included
        assistant_content = ", ".join(f"{col}: {row[col]}" for col in columns)

        conversations.append({
            "messages": [
                {"role": "system", "content": system_message},
                {"role": "user", "content": user_content},
                {"role": "assistant", "content": assistant_content}
            ]
        })

    return conversations

# Generate the fine-tuning data
finetuning_data = create_finetuning_data(df, system_message)

# Save the fine-tuning data to a JSONL file (one JSON object per line)
output_path = 'finetuning_data.jsonl'
with open(output_path, 'w', encoding='utf-8') as f:
    for conversation in finetuning_data:
        json.dump(conversation, f)
        f.write('\n')

print(f"Fine-tuning data saved to {output_path}")
# This must be an f-string as well; without the prefix the server receives
# the literal text "{output_path}" instead of the file name.
send_message_to_server(f'Fine-tuning data saved to {output_path}')

Using all data for fine-tuning...
Fine-tuning data saved to finetuning_data.jsonl


import pandas as pd
import json
import random

# System message placed at the start of every generated conversation
system_message = "Marv is a factual chatbot that provides complete specifications based on user requirements."

# Read the park specification table. Note that this rebinds `df`.
file_path = 'output/specifications/park.csv'
df = pd.read_csv(file_path)

def create_finetuning_data(df, system_message, seed=None):
    """Build chat-format fine-tuning conversations from a specification table.

    For every row, one or two columns holding a real value are picked at
    random and turned into a user request; the assistant reply lists every
    column of the row as ``col: value`` pairs.

    Args:
        df: DataFrame with one specification per row.
        system_message: Text used as the system message of each conversation.
        seed: Optional seed for the column selection. When None (the default)
            the module-level ``random`` generator is used.

    Returns:
        A list of ``{"messages": [...]}`` dicts, one per row that has at least
        one usable column. Rows with no usable column are skipped.
    """

    def _is_missing(value):
        # Blank CSV cells are read by pandas as NaN, not '' or None, so a plain
        # membership test against ['-', '', None] lets them through.
        if isinstance(value, str):
            return value.strip() in ('-', '')
        return bool(pd.isna(value))

    rng = random.Random(seed) if seed is not None else random
    columns = list(df.columns)

    conversations = []
    for _, row in df.iterrows():
        # Keep only the columns that carry an actual value in this row
        valid_columns = [col for col in columns if not _is_missing(row[col])]

        if not valid_columns:
            continue  # Skip rows where no valid columns are found

        # Randomly select one or two valid columns from the row
        selected_columns = rng.sample(valid_columns, k=min(2, len(valid_columns)))

        # Construct the user content with the selected columns and their values
        user_content = "Please provide the complete specification for the following requirements: "
        user_content += ", ".join(f"{col}: {row[col]}" for col in selected_columns) + "."

        # The assistant reply deliberately lists every column, placeholders included
        assistant_content = ", ".join(f"{col}: {row[col]}" for col in columns)

        conversations.append({
            "messages": [
                {"role": "system", "content": system_message},
                {"role": "user", "content": user_content},
                {"role": "assistant", "content": assistant_content}
            ]
        })

    return conversations

# Build the conversations from the currently loaded table
finetuning_data = create_finetuning_data(df, system_message)

# Dry run: show every conversation on stdout rather than writing a file
print("Fine-tuning data that would be sent to the model:")
for i, conversation in enumerate(finetuning_data, 1):
    print(
        f"\nConversation {i}:",
        json.dumps(conversation, indent=2),
        "-" * 50,
        sep="\n",
    )

print(f"Total number of conversations generated: {len(finetuning_data)}")


In [5]:
from openai import OpenAI
client = OpenAI()

# Upload the JSONL training file. The context manager closes the file handle
# once the upload call returns, including when the call raises.
with open("finetuning_data.jsonl", "rb") as training_file:
    file_object = client.files.create(
        file=training_file,
        purpose="fine-tune"
    )

# The returned file ID is what the fine-tuning job is created from
training_file_id = file_object.id
print(f"Training file ID: {training_file_id}")
send_message_to_server(f"Created Training file ID: {training_file_id}")

Training file ID: file-3koOOtWFxj41vG1q4JTxW5K9


In [6]:
from openai import OpenAI
client = OpenAI()

# Start a fine-tuning job on the base model using the uploaded training file
fine_tuning_job = client.fine_tuning.jobs.create(
    model="gpt-3.5-turbo",
    training_file=training_file_id,
)

# Report the job ID locally and to the server
print(fine_tuning_job.id)
send_message_to_server(f"Fine-tuning job ID: {fine_tuning_job.id}")

ftjob-JnHSr27FAmrpmoZ2g4jnmB34


In [7]:
from openai import OpenAI
import time

client = OpenAI()

# Statuses after which the job no longer changes. "failed" has to be listed:
# without it a failed job would keep this loop polling forever.
terminal_statuses = ("succeeded", "failed", "cancelled")

# Retrieve the initial job status
fine_tuning_job = client.fine_tuning.jobs.retrieve(fine_tuning_job.id)

# "running" is reported once per running phase; other statuses on every poll
running_status_sent = False

while fine_tuning_job.status not in terminal_statuses:
    if fine_tuning_job.status == "running" and not running_status_sent:
        print("Fine-tuning job status: running")
        send_message_to_server("Fine-tuning job status: running")
        running_status_sent = True
    elif fine_tuning_job.status != "running":
        print(f"Fine-tuning job status: {fine_tuning_job.status}")
        send_message_to_server(f"Fine-tuning job status: {fine_tuning_job.status}")
        running_status_sent = False  # Reset in case it goes back to running

    time.sleep(10)  # Wait for 10 seconds before checking the status again
    fine_tuning_job = client.fine_tuning.jobs.retrieve(fine_tuning_job.id)  # Retrieve the latest job status

if fine_tuning_job.status == "cancelled":
    print("Fine-tuning job was cancelled!")
    send_message_to_server("Fine-tuning job was cancelled!")
elif fine_tuning_job.status == "succeeded":
    print("Fine-tuning job completed successfully!")
    send_message_to_server("Fine-tuning job completed successfully!")
else:
    # Only "failed" can reach this branch. Include the reported reason when
    # the job object carries one.
    failure = getattr(fine_tuning_job, "error", None)
    reason = getattr(failure, "message", None) or "no error details reported"
    print(f"Fine-tuning job failed: {reason}")
    send_message_to_server(f"Fine-tuning job failed: {reason}")


Fine-tuning job status: validating_files


Fine-tuning job status: validating_files


Fine-tuning job status: validating_files


Fine-tuning job status: running


Fine-tuning job completed successfully!


In [8]:
import json
import requests
import time
import os

def load_or_create_json(filename, default_content):
    """Return the JSON stored in ``filename``, creating the file if needed.

    A missing file is first written with ``default_content`` (and a notice is
    printed). The file is then read back: non-blank content is parsed as JSON
    and returned, while an empty or whitespace-only file yields
    ``default_content`` itself.
    """
    file_missing = not os.path.exists(filename)
    if file_missing:
        with open(filename, 'w') as handle:
            json.dump(default_content, handle)
        print(f"Created {filename} with default content.")

    with open(filename, 'r') as handle:
        raw_text = handle.read().strip()

    if not raw_text:
        return default_content
    return json.loads(raw_text)

def save_model_id_and_restart_server(model_id, headers):
    """Persist the model ID and CSV headers, then restart the local server.

    Args:
        model_id: Fine-tuned model identifier. ``None`` and the string
            ``"null"`` leave the ID already stored in model_config.json
            untouched.
        headers: List of CSV column names written to csv_headers.json.

    Side effects:
        Rewrites model_config.json and csv_headers.json, POSTs to the server's
        /restart endpoint, and blocks until /healthcheck answers with HTTP 200.
    """
    # A refused connection or a timed-out request both mean the server is not
    # answering yet.
    unreachable = (requests.exceptions.ConnectionError, requests.exceptions.Timeout)

    # Load or create model_config.json
    model_config = load_or_create_json('model_config.json', {'model_id': "null"})

    # Only overwrite the stored ID when a real one was supplied
    if model_id is not None and model_id != "null":
        model_config['model_id'] = model_id

    # Save the updated model config
    with open('model_config.json', 'w') as f:
        json.dump(model_config, f)

    print(f"Fine-tuned model ID saved: {model_config['model_id']}")

    # Load or create csv_headers.json
    headers_config = load_or_create_json('csv_headers.json', {'headers': []})

    # Update headers
    headers_config['headers'] = headers

    # Save the updated headers
    with open('csv_headers.json', 'w') as f:
        json.dump(headers_config, f)

    print("CSV headers saved")

    # Trigger server restart. The server may drop the connection while it
    # shuts down, so an unreachable server is treated as "restart in progress".
    try:
        requests.post('http://localhost:5000/restart', timeout=10)
        print("Restart signal sent to server")
    except unreachable:
        print("Server is restarting...")

    # Wait for server to come back online
    while True:
        try:
            response = requests.get('http://localhost:5000/healthcheck', timeout=10)
            if response.status_code == 200:
                print("Server is back online")
                break
        except unreachable:
            pass
        # Sleep after every unsuccessful probe, including non-200 answers, so
        # the loop never spins against a server that is still starting up.
        print("Waiting for server to restart...")
        time.sleep(5)

# Column names of the DataFrame `df` that holds the CSV data
headers = df.columns.tolist()

# Persist the model ID and restart the server only for a successful job
if fine_tuning_job.status != "succeeded":
    print("Fine-tuning job did not succeed. Model ID not saved.")
    send_message_to_server("Fine-tuning job did not succeed. Model ID not saved.")
else:
    fine_tuned_model_id = fine_tuning_job.fine_tuned_model
    save_model_id_and_restart_server(fine_tuned_model_id, headers)

Fine-tuned model ID saved: ft:gpt-3.5-turbo-0125:personal::9mpfBKCV
CSV headers saved


Server is restarting...


Waiting for server to restart...


Server is back online


import json

def save_model_id(model_id):
    """Overwrite model_config.json with a single ``model_id`` entry."""
    payload = {'model_id': model_id}
    with open('model_config.json', 'w') as config_file:
        json.dump(payload, config_file)
        
# Take the resulting model name from the job object and persist it.
# save_model_id replaces the whole content of model_config.json.
fine_tuned_model_id = fine_tuning_job.fine_tuned_model
save_model_id(fine_tuned_model_id)

print(f"Fine-tuned model ID saved: {fine_tuned_model_id}")

import os

# Hard-coded fine-tuned model ID, exported through the process environment
# so that later cells can read it back from FINE_TUNED_MODEL_ID
fine_tuned_model_id = "ft:gpt-3.5-turbo-0125:personal::9ZJtzJtN"
os.environ['FINE_TUNED_MODEL_ID'] = fine_tuned_model_id

import os

# Prefer the model ID from the environment; fall back to the hard-coded
# checkpoint when the variable is unset or empty.
fine_tuned_model_id = os.environ.get('FINE_TUNED_MODEL_ID')
model_to_use = fine_tuned_model_id or "ft:gpt-3.5-turbo-0125:personal::9ZJtzJtN:ckpt-step-72"
print(f"Using model: {model_to_use}")

from openai import OpenAI
client = OpenAI()

# Send one sample request to a fine-tuned model.
# NOTE(review): the model ID is hard-coded here and differs from the
# `model_to_use` value computed in an earlier cell. Confirm which is intended.
completion = client.chat.completions.create(
    model="ft:gpt-3.5-turbo-0125:personal::9afDwulJ",
    messages=[
        {"role": "system", "content": "Marv is a factual chatbot that provides complete specifications based on user requirements."},
        # Backslashes are doubled: "\(" and "\)" are invalid escape sequences
        # in a normal string literal. The text sent to the model is unchanged.
        {"role": "user", "content": "Please provide the complete specification for the following requirements: Method: SD\\({}^{35}\\), Temp.: 120."},
    ],
)

# Show only the text of the first choice
message_content = completion.choices[0].message.content
print(message_content)