In [2]:
from pathlib import Path

import pandas as pd
from openai import OpenAI
from sklearn.metrics import roc_auc_score, precision_score, accuracy_score, recall_score, f1_score

# Credentials are deliberately not written into the notebook: called without
# arguments, OpenAI() reads the API key from the OPENAI_API_KEY environment
# variable and the organization from OPENAI_ORG_ID. Set both before starting
# the kernel.
client = OpenAI()

#### Upload files to OpenAI

Upload the training and test datasets prepared in the previous notebook.

In [4]:
# Upload the train/test JSON file of every task for each of the five splits.
# The ID that OpenAI assigns to each upload is kept, keyed by
# '<task>_<subset>_<split>' (e.g. 'activity_train_1'), so it does not have to
# be looked up by hand afterwards.
uploaded_file_ids = {}

for i in range(1, 6):
    for task in ('activity', 'hemolysis'):
        for subset in ('train', 'test'):
            name = f'{task}_{subset}_{i}'
            # The context manager closes the file handle once the upload is done
            with open(f'data/{name}.json', 'rb') as fh:
                uploaded = client.files.create(file=fh, purpose='fine-tune')
            uploaded_file_ids[name] = uploaded.id

# Display the collected IDs as the cell output
uploaded_file_ids

#### Create fine-tuning jobs

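Go to the OpenAI Files page and write down the file IDs of the uploaded files manually. (They can also be obtained programmatically: each `client.files.create` call returns a file object whose `id` attribute is the file ID, and `client.files.list()` lists all uploaded files.)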

In [3]:
# OpenAI file IDs of the uploaded datasets, keyed by '<task>_<subset>_<split>'
# where task is 'activity' or 'hemolysis', subset is 'train' or 'test', and
# split runs from 1 to 5. The fine-tuning cell looks up its training and
# validation files in this dict.
file_ids = {
    'activity_train_1': 'file-4QveSliSbmwbW5y0j0XaZDvP',
    'activity_test_1': 'file-QlCO8R5IpVTzVFFgWfXvuLWY',
    'hemolysis_train_1': 'file-6bF7sJa0ZKTOHNFHYAD6m2lF',
    'hemolysis_test_1': 'file-ngbcPNae21fDhbwcXtNXwinp',
    'activity_train_2': 'file-9sI9mZ6X1I2HfiUBtNE00rBr',
    'activity_test_2': 'file-RJsJfRhBqted2uWd8klnnHoh',
    'hemolysis_train_2': 'file-ibMzndIaKmfJfQ5PPiZSL9AU',
    'hemolysis_test_2': 'file-g8ccncNCsgY07wNA5u3Tl7aa',
    'activity_train_3': 'file-YOnRI80jYtBB4HQ75Po6JOM5',
    'activity_test_3': 'file-s8mxQGTOrvBFnlmuuo1sLeKb',
    'hemolysis_train_3': 'file-ffVrkoV96Mq9unWXLAlq957o',
    'hemolysis_test_3': 'file-Ftgnkd8DCx2LW7G0oki46lRv',
    'activity_train_4': 'file-KH4fAnK7PTz5go3URg7Qfni8',
    'activity_test_4': 'file-vPINZg9HjpRGzGkk6BmBT4zv',
    'hemolysis_train_4': 'file-5rIRI2OTwGWvKYgN2hENWJhc',
    'hemolysis_test_4': 'file-6xaNs42tKDHlO3wZx4BVMyCa',
    'activity_train_5': 'file-6xRbamt5YYZQBlrpl0azH5nB',
    'activity_test_5': 'file-RRGFWTEcHqHm54bdsGWcmBGC',
    'hemolysis_train_5': 'file-M49wDxpa1tmFvFiGFY0w6hq2',
    'hemolysis_test_5': 'file-eMRwVz9j9TbC4USMQdDbKcJ1',
}

Start the fine-tuning jobs using the file IDs. The jobs are launched manually, one split at a time, because it's not possible to fine-tune more than 3 models at the same time.

In [None]:
# Split to fine-tune in this run. Change the value and re-run the cell for each
# of the five splits; one run launches one job per task for that split.
SPLIT = 5

# One job per task: train on the split's training file, validate on its test
# file, and tag the resulting model with '<task>_split_<SPLIT>'.
fine_tuning_jobs = [
    client.fine_tuning.jobs.create(
        training_file=file_ids[f'{task}_train_{SPLIT}'],
        validation_file=file_ids[f'{task}_test_{SPLIT}'],
        model='gpt-3.5-turbo',
        suffix=f'{task}_split_{SPLIT}',
    )
    for task in ('activity', 'hemolysis')
]

# Display the created job objects as the cell output
fine_tuning_jobs

#### Evaluate models 

Collect the names of the fine-tuned models (manually).

In [7]:
# Names of the fine-tuned models, keyed by '<task>_split_<n>' where task is
# 'activity' or 'hemolysis' and n is the split number (1 to 5), i.e. one model
# per task and cross-validation split.
model_ids = {
    'activity_split_1': 'ft:gpt-3.5-turbo-0613:reymond-group:activity-split-1:8sqaOXkP',
    'hemolysis_split_1': 'ft:gpt-3.5-turbo-0613:reymond-group:hemolysis-split-1:8sqhverN',
    'activity_split_2': 'ft:gpt-3.5-turbo-0613:reymond-group:activity-split-2:8ssBwa32',
    'hemolysis_split_2': 'ft:gpt-3.5-turbo-0613:reymond-group:hemolysis-split-2:8ssA4hWd',
    'activity_split_3': 'ft:gpt-3.5-turbo-0613:reymond-group:activity-split-3:8stvOJgD',
    'hemolysis_split_3': 'ft:gpt-3.5-turbo-0613:reymond-group:hemolysis-split-3:8stdCu4f',
    'activity_split_4': 'ft:gpt-3.5-turbo-0613:reymond-group:activity-split-4:8svNi6hW',
    'hemolysis_split_4': 'ft:gpt-3.5-turbo-0613:reymond-group:hemolysis-split-4:8sunorAI',
    'activity_split_5': 'ft:gpt-3.5-turbo-0613:reymond-group:activity-split-5:8svEPfjB',
    'hemolysis_split_5': 'ft:gpt-3.5-turbo-0613:reymond-group:hemolysis-split-5:8tu63nlf',
}

Load the original datasets.

In [9]:
# Read the two cleaned datasets (activity first, then hemolysis) into data frames
activity, hemolysis = (
    pd.read_csv(csv_path)
    for csv_path in ('data/activity_clean.csv', 'data/hemolysis_clean.csv')
)

Evaluate the fine-tuned models with 5-fold cross-validation.

In [25]:
def calculate_metrics(y_true, y_pred):
    """Score a set of predictions with five classification metrics.

    Args:
        y_true: True labels.
        y_pred: Predicted labels, in the same order as `y_true`.

    Returns:
        A one-row DataFrame with the columns 'roc_auc', 'accuracy',
        'precision', 'recall' and 'f1' (in that order).
    """
    # Column name -> scoring function; dict order fixes the column order
    scorers = {
        'roc_auc': roc_auc_score,
        'accuracy': accuracy_score,
        'precision': precision_score,
        'recall': recall_score,
        'f1': f1_score,
    }

    # Each value is wrapped in a list so the frame has exactly one row
    scores = {column: [scorer(y_true, y_pred)] for column, scorer in scorers.items()}
    return pd.DataFrame(scores)

def run_cross_validation(df, purpose, task=None):
    """Evaluate the fine-tuned models on the test part of each of the five folds.

    For every fold `i` (1 to 5) the rows whose `split_{i}` column equals
    'test' are sent, one sequence at a time, to the model fine-tuned for that
    fold, and the predictions are scored with `calculate_metrics`.

    Args:
        df: Data frame with the columns 'sequence', 'label' and
            'split_1' ... 'split_5'.
        purpose: Description of the predicted property; it is inserted into
            the system prompt (e.g. 'antimicrobial activity').
        task: Task prefix of the keys in `model_ids` ('activity' or
            'hemolysis'). Defaults to the last word of `purpose`.

    Returns:
        Data frame with one row of metrics per fold ('fold_1' ... 'fold_5')
        followed by a 'mean' and a 'std' row computed over the five folds.

    Raises:
        KeyError: If `model_ids` has no '<task>_split_<i>' entry for a fold.
        ValueError: If a model reply cannot be parsed as an integer label.
    """
    if task is None:
        # 'antimicrobial activity' -> 'activity', 'hemolysis' -> 'hemolysis'
        task = purpose.split()[-1]

    system_prompt = f"You are a model that predicts {purpose} from an amino acid sequence."

    fold_rows = []
    for i in range(1, 6):
        # Each fold has to be scored by the model that was fine-tuned on this
        # fold's training data; otherwise the result is not a cross-validation.
        model = model_ids[f'{task}_split_{i}']

        # Select the fold's test rows once and take sequences and labels from them
        test_rows = df[df[f'split_{i}'] == 'test']
        X_test = test_rows['sequence'].tolist()
        y_test = test_rows['label'].tolist()

        # Run predictions, one request per sequence
        y_pred = []
        for sequence in X_test:
            response = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": str(sequence) + " ->"},
                ],
                temperature=0.0,
            )
            # The model is expected to answer with the integer label only
            y_pred.append(int(response.choices[0].message.content))

        fold_rows.append(calculate_metrics(y_test, y_pred))

    # Build the per-fold table in one go instead of growing it inside the loop
    df_folds = pd.concat(fold_rows, ignore_index=True)
    df_folds.index = [f'fold_{i}' for i in range(1, 6)]

    # Mean and standard deviation are both computed from the five fold rows
    # only, so the 'mean' row does not leak into the standard deviation.
    df_out = pd.concat([df_folds, df_folds.agg(['mean', 'std'])])

    return df_out

In [26]:
# Cross-validate the activity models first; the result is assigned before the
# hemolysis run starts, so it survives a failure in the second run.
activity_eval = run_cross_validation(
    df=activity,
    purpose='antimicrobial activity',
)

# Then cross-validate the hemolysis models
hemolysis_eval = run_cross_validation(
    df=hemolysis,
    purpose='hemolysis',
)

#### Export data frames

In [30]:
# Make sure the output directory exists: to_csv does not create missing
# directories, and a failure here would come after all predictions were run.
RESULTS_DIR = Path('results')
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

# One CSV per task, with the fold/mean/std labels written as the index column
activity_eval.to_csv(RESULTS_DIR / 'gpt3-5_activity_5cv.csv')
hemolysis_eval.to_csv(RESULTS_DIR / 'gpt3-5_hemolysis_5cv.csv')