In [1]:
import pandas as pd
import json

from sklearn.model_selection import train_test_split

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


#### Import data

Read files

In [2]:
# Load only the columns that are used later in the notebook.
activity = pd.read_csv(
    'data/fine_tune_activity.csv',
    usecols=['Sequence', 'activity', 'Set'],
)

# For hemolysis, additionally keep only rows whose label equals 0 or 1;
# rows with any other value in 'isNotHemolytic' are dropped.
hemolysis = pd.read_csv(
    'data/fine_tune_hemolysis.csv',
    usecols=['Sequence', 'isNotHemolytic', 'Set'],
).loc[lambda raw: (raw['isNotHemolytic'] == 0) | (raw['isNotHemolytic'] == 1)]

Rename columns

In [3]:
# Give both frames the same schema: 'sequence', 'label', 'split_1'.
# Only the name of the label column differs between the two sources.
shared_renames = {'Sequence': 'sequence', 'Set': 'split_1'}
activity = activity.rename(columns={**shared_renames, 'activity': 'label'})
hemolysis = hemolysis.rename(columns={**shared_renames, 'isNotHemolytic': 'label'})

Shuffle data

In [4]:
# Shuffle every row of each frame with a fixed seed (42) and renumber the
# index from 0 so later index-based split assignment sees a clean index.
activity, hemolysis = (
    frame.sample(frac=1, random_state=42).reset_index(drop=True)
    for frame in (activity, hemolysis)
)

Add four columns (`split_2`–`split_5`), each holding an independent random 75/25 training/test split, for cross-validation alongside the original `split_1`.

In [5]:
def generate_splits(df, split_ids=range(2, 6), test_size=0.25):
    """Add random training/test split columns to ``df`` in place.

    For every id ``i`` in ``split_ids`` a column named ``split_{i}`` is
    created. Rows drawn into the test partition are marked 'test'; all other
    rows are marked 'training'. ``i`` is also passed as ``random_state``, so
    each column is a different but reproducible split.

    Args:
        df: DataFrame to annotate. It is modified in place.
        split_ids: Iterable of integers used both as column suffix and as the
            random seed of the corresponding split. Defaults to 2..5.
        test_size: Forwarded to ``train_test_split`` (0.25 by default).

    Returns:
        None.
    """
    # Split row *positions* rather than index labels so that a non-unique or
    # non-default index cannot cause extra rows to be marked as 'test'.
    # For a default RangeIndex positions and labels coincide, so the
    # resulting assignment is unchanged.
    positions = pd.RangeIndex(len(df))
    for i in split_ids:
        _, test_positions = train_test_split(positions, test_size=test_size, random_state=i)
        assignment = ['training'] * len(df)
        for position in test_positions:
            assignment[position] = 'test'
        df[f'split_{i}'] = assignment

# generate_splits adds the split_2..split_5 columns directly to each frame.
for frame in (activity, hemolysis):
    generate_splits(frame)

Export datasets

In [6]:
# Persist the cleaned frames (including all split columns) without the index.
for frame, csv_path in (
    (activity, 'data/activity_clean.csv'),
    (hemolysis, 'data/hemolysis_clean.csv'),
):
    frame.to_csv(csv_path, index=False)

#### Generate datasets for GPT-3.5 Turbo fine-tuning

Define a function that writes each row of a dataframe to a file as one JSON object per line, in the chat-message format (system, user, and assistant messages)

In [7]:
def write_json(df, purpose, filename):
    """Write each row of ``df`` to ``filename`` as one JSON object per line.

    Every line has the shape ``{"messages": [system, user, assistant]}``:
    the system message names ``purpose``, the user message is the row's
    'sequence' value followed by ' ->', and the assistant message is the
    row's 'label' value converted to a string.

    Args:
        df: DataFrame with 'sequence' and 'label' columns.
        purpose: Text inserted into the system prompt.
        filename: Output path; opened in 'w' mode, so existing content is
            replaced.
    """
    with open(filename, 'w') as out:
        for _, record in df.iterrows():
            prompt = record['sequence'] + ' ->'
            conversation = [
                {
                    "role": "system",
                    "content": f"You are a model that predicts {purpose} from an amino acid sequence.",
                },
                {"role": "user", "content": str(prompt)},
                # The label is the target completion for this prompt.
                {"role": "assistant", "content": str(record['label'])},
            ]
            out.write(json.dumps({"messages": conversation}) + '\n')

Loop through each of the five splits and write the data to files

In [8]:
# One training file and one test file per dataset and per split column.
# NOTE(review): split_1 comes from the CSV 'Set' column; this assumes it uses
# the same 'training'/'test' values as the generated splits — confirm.
datasets = (('activity', activity), ('hemolysis', hemolysis))
partitions = (('training', 'train'), ('test', 'test'))

for i in range(1, 6):
    for dataset_name, frame in datasets:
        for split_value, file_tag in partitions:
            write_json(
                frame[frame[f'split_{i}'] == split_value],
                dataset_name,
                f'data/{dataset_name}_{file_tag}_{i}.json',
            )