In [2]:
# NOTE: the star import stays first so that the explicit imports below take
# precedence on any name clash.
from icego import *

import os
from pathlib import Path
from time import time

import boto3
import numpy as np
import openai
import pandas as pd
from dotenv import load_dotenv

# Need an OpenAI API key in a .env file to call GPT-3
env_path = r'.env'
dotenv_path = Path(env_path)
dotenv_loaded = load_dotenv(dotenv_path=dotenv_path)

# `openai` was imported above, before the .env file was read, so do not rely on
# it having picked the key up from the environment at import time: hand it over
# explicitly. `.get` is used so a missing key does not raise here; the key is
# simply left as it was.
api_key = os.environ.get("OPENAI_API_KEY")
if api_key:
    openai.api_key = api_key

# Keep load_dotenv's return value as the cell's displayed output.
dotenv_loaded

True

In [None]:
import pickle

class GPT3Classifier():
    """Text classifier that delegates every query to ``openai.Classification.create``.

    The instance only stores the request parameters; the labelled examples live
    in a file referenced by ``file``.

    NOTE(review): the Classifications endpoint is understood to have been
    retired by OpenAI -- confirm it is still available for the installed
    ``openai`` version.
    """

    def __init__(self, file, max_examples=5, temperature=0, search_model="ada", model="ada"):
        """Store the parameters forwarded to ``openai.Classification.create``.

        Args:
            file: Value passed as ``file`` (callers in this notebook pass
                ``'file-...'`` identifiers).
            max_examples: Value passed as ``max_examples``.
            temperature: Value passed as ``temperature``.
            search_model: Value passed as ``search_model``.
            model: Value passed as ``model``.
        """
        self.file = file
        self.max_examples = max_examples
        self.temperature = temperature
        self.search_model = search_model
        self.model = model
        # Populated by evaluate(); initialised here so accuracy() on a fresh
        # instance does not fail with an AttributeError.
        self.predictions = []
        self.full_output = []

    def predict(self, text):
        """Classify a single query.

        Args:
            text: The query string to classify.

        Returns:
            Whatever ``openai.Classification.create`` returns; ``evaluate``
            reads its ``'label'`` entry.
        """
        output = openai.Classification.create(
            file=self.file,
            query=text,
            search_model=self.search_model,
            model=self.model,
            max_examples=self.max_examples,
            temperature=self.temperature
        )
        return output

    def evaluate(self, X):
        """Classify every item of ``X`` and remember the results.

        Resets ``self.predictions`` and ``self.full_output`` and then fills
        them with one entry per item. A query whose ``predict`` call raises is
        recorded as label ``'Unknown'`` with ``None`` as its raw output.

        Args:
            X: Iterable of query strings.

        Returns:
            The list of predicted labels (same object as ``self.predictions``).
        """
        self.predictions = []
        self.full_output = []
        for val in X:
            print("Evaluating: {}".format(val))

            # Sometimes the search may not find any similar documents.
            # If this is the case, we record the query as 'Unknown' and move on.
            # Only Exception is caught so Ctrl-C (KeyboardInterrupt) can still
            # stop a long run, and the reason is printed instead of being lost.
            try:
                result = self.predict(val)
            except Exception as err:
                print("Could not classify: {} ({!r})".format(val, err))
                self.full_output.append(None)
                self.predictions.append('Unknown')
                continue
            self.full_output.append(result)
            self.predictions.append(result['label'])
        return self.predictions

    def accuracy(self, true):
        """Return the fraction of predictions equal to the true labels.

        The comparison is positional, so ``true`` may be a list or a pandas
        Series with any index.

        Args:
            true: Iterable of true labels, in the same order as the items
                passed to ``evaluate``.

        Returns:
            Accuracy in ``[0, 1]`` as a float, or ``nan`` when there are no
            predictions.

        Raises:
            ValueError: If ``true`` and the stored predictions differ in length.
        """
        expected = list(true)
        predicted = list(self.predictions)
        if len(expected) != len(predicted):
            raise ValueError(
                "Got {} true labels but {} predictions".format(len(expected), len(predicted))
            )
        if not predicted:
            return float("nan")
        hits = sum(1 for want, got in zip(expected, predicted) if want == got)
        return hits / len(predicted)

    def save(self, path):
        """Pickle this classifier (including stored predictions) to ``path``."""
        # The context manager closes the file even if pickling fails.
        with open(path, "wb") as pickle_out:
            pickle.dump(self, pickle_out)

    @staticmethod
    def load(env, model):
        """
        Load model from a pickle file

        Args:
            env: Path to a .env file that defines OPENAI_API_KEY.
            model: Path to a pickle file written by ``save``.

        Returns:
            The unpickled object.

        Raises:
            Exception: If the API key cannot be read from the environment
                after loading ``env``.
        """

        # Load API Key
        try:
            dotenv_path = Path(env)
            load_dotenv(dotenv_path=dotenv_path)
            openai.api_key = os.environ["OPENAI_API_KEY"]
        except Exception as err:
            raise Exception("Please specify your OpenAI API key in a .env file as OPENAI_API_KEY=your_key, and provide its path in the \"env\" parameter") from err
        # SECURITY: unpickling can execute arbitrary code -- only load files
        # that you created yourself or otherwise trust.
        with open(model, "rb") as pickle_in:
            return pickle.load(pickle_in)
        

In [None]:
# Load the three data splits. Forward slashes are used so the relative paths
# resolve on Windows as well as on macOS/Linux.
training = pd.read_csv("research_data/training(1).tsv", sep="\t", index_col=0)
validation = pd.read_csv("research_data/validation(2).tsv", sep="\t", index_col=0)
testing = pd.read_csv("research_data/testing(3).tsv", sep="\t", index_col=0)

# Train-test split: inputs come from the "question" column, targets from "label"
X_train = training["question"]
y_train = training["label"]

X_val = validation["question"]
y_val = validation["label"]

X_test = testing["question"]
y_test = testing["label"]

In [None]:
# Grid-search axes: max_examples runs over 5, 10, ..., 25 and is combined with
# each of the temperatures below.
examples = np.arange(1, 6) * 5
temperatures = [0, 0.1, 0.5]

def grid_search(file, X_test, y_test, examples, temperatures):
    """Evaluate a GPT3Classifier for every (max_examples, temperature) pair.

    Args:
        file: Passed as ``file`` to each GPT3Classifier.
        X_test: Queries handed to ``evaluate``.
        y_test: True labels handed to ``accuracy``.
        examples: Values tried for ``max_examples`` (one result row each).
        temperatures: Values tried for ``temperature`` (one result column each).

    Returns:
        A pair ``(accuracies, times)`` of DataFrames indexed by ``examples``
        with ``temperatures`` as columns; ``times`` holds the wall-clock
        seconds spent in ``evaluate``.
    """
    accuracy_rows = []
    timing_rows = []

    for n_examples in examples:
        row_acc = []
        row_time = []
        for temp in temperatures:
            print("({}, {})".format(temp, n_examples))
            clf = GPT3Classifier(file, max_examples=int(n_examples), temperature=temp)

            # Only the evaluate() call is timed.
            began = time()
            clf.evaluate(X_test)
            elapsed = time() - began

            row_acc.append(clf.accuracy(y_test))
            row_time.append(elapsed)

        accuracy_rows.append(row_acc)
        timing_rows.append(row_time)

    accuracy_table = pd.DataFrame(accuracy_rows, columns=temperatures, index=examples)
    timing_table = pd.DataFrame(timing_rows, columns=temperatures, index=examples)
    return accuracy_table, timing_table

In [None]:
# Default train set
acc26, time26 = grid_search('file-S0NVQFrb3YIQIXyREilOSBEr', X_val, y_val, examples, temperatures)
# Forward slashes keep the output paths valid on every platform.
acc26.to_csv('saved_models/classifier_acc26_mar23.csv')
time26.to_csv('saved_models/classifier_time26_mar23.csv')

In [None]:
# With 10 examples added
acc10, time10 = grid_search('file-WoFiSCiVC3d8MolIKIcNTqZn', X_val, y_val, examples, temperatures)
# Forward slashes keep the output paths valid on every platform.
acc10.to_csv('saved_models/classifier_acc10_mar23.csv')
time10.to_csv('saved_models/classifier_time10_mar23.csv')

In [None]:
# Extend the max_examples grid with 50. Written so the cell can be re-run:
# it accepts either the original array or an already converted list, and it
# never adds 50 twice.
examples = [int(value) for value in examples]
if 50 not in examples:
    examples.append(50)

In [None]:
# Presumably the training file with 100 examples added, going by the variable
# names -- confirm against the uploaded file.
acc100, time100 = grid_search('file-ZwBnMRiuBcIaCis3EM8hXIci', X_val, y_val, examples, temperatures)
# Forward slashes keep the output paths valid on every platform.
acc100.to_csv('saved_models/classifier_acc100_mar23.csv')
time100.to_csv('saved_models/classifier_time100_mar23.csv')

In [None]:
# Extend the max_examples grid with 100; guarded so re-running the cell does
# not add a duplicate entry (which would repeat the whole grid row).
if 100 not in examples:
    examples.append(100)

In [None]:
# Presumably the training file with 1000 examples added, going by the variable
# names -- confirm against the uploaded file.
acc1000, time1000 = grid_search('file-J1c6lmpgebqHCIlPr42FGuD0', X_val, y_val, examples, temperatures)
# Forward slashes keep the output paths valid on every platform.
acc1000.to_csv('saved_models/classifier_acc1000_mar23.csv')
time1000.to_csv('saved_models/classifier_time1000_mar23.csv')

In [None]:
# Presumably the largest training file (3302 examples, going by the variable
# names) -- confirm against the uploaded file.
acc3302, time3302 = grid_search('file-ApLgz7aeTcjNXQH7M2KMqtFJ', X_val, y_val, examples, temperatures)
# Forward slashes keep the output paths valid on every platform.
acc3302.to_csv('saved_models/classifier_acc3302_mar23.csv')
time3302.to_csv('saved_models/classifier_time3302_mar23.csv')

In [None]:
# Evaluate on the test split with max_examples=20, temperature=0 (presumably
# the best setting found in acc3302 -- confirm), then persist the classifier
# together with its predictions.
best3302 = GPT3Classifier('file-ApLgz7aeTcjNXQH7M2KMqtFJ', max_examples=20, temperature=0)
best3302.evaluate(X_test)
# Forward slashes keep the save path valid on every platform.
best3302.save('saved_models/best3302_mar23.p')

In [None]:
# Side-by-side view of each test question, its true label and the prediction.
pd.DataFrame(dict(question=X_test, true=y_test, predicted=best3302.predictions))

In [None]:
# Fraction of test questions whose predicted label equals the true label.
best3302.accuracy(y_test)

In [None]:
# Evaluate on the test split with max_examples=20, temperature=0 (presumably
# the best setting found in acc1000 -- confirm), then persist the classifier
# together with its predictions.
best1000 = GPT3Classifier('file-J1c6lmpgebqHCIlPr42FGuD0', max_examples=20, temperature=0)
best1000.evaluate(X_test)
# Forward slashes keep the save path valid on every platform.
best1000.save('saved_models/best1000_mar23.p')

In [None]:
# Side-by-side view of each test question, its true label and the prediction.
pd.DataFrame(dict(question=X_test, true=y_test, predicted=best1000.predictions))

In [None]:
# Fraction of test questions whose predicted label equals the true label.
best1000.accuracy(y_test)

In [None]:
# Evaluate on the test split with max_examples=15, temperature=0 (presumably
# the best setting found in acc100 -- confirm), then persist the classifier
# together with its predictions.
best100 = GPT3Classifier('file-ZwBnMRiuBcIaCis3EM8hXIci', max_examples=15, temperature=0)
best100.evaluate(X_test)
# Forward slashes keep the save path valid on every platform.
best100.save('saved_models/best100_mar23.p')

In [None]:
# Side-by-side view of each test question, its true label and the prediction.
pd.DataFrame(dict(question=X_test, true=y_test, predicted=best100.predictions))

In [None]:
# Fraction of test questions whose predicted label equals the true label.
best100.accuracy(y_test)

In [None]:
# Evaluate on the test split with max_examples=5, temperature=0.1 (presumably
# the best setting found in acc10 -- confirm), then persist the classifier
# together with its predictions.
best10 = GPT3Classifier('file-WoFiSCiVC3d8MolIKIcNTqZn', max_examples=5, temperature=0.1)
best10.evaluate(X_test)
# Forward slashes keep the save path valid on every platform.
best10.save('saved_models/best10_mar23.p')

In [None]:
# Side-by-side view of each test question, its true label and the prediction.
pd.DataFrame(dict(question=X_test, true=y_test, predicted=best10.predictions))

In [None]:
# Fraction of test questions whose predicted label equals the true label.
best10.accuracy(y_test)

In [None]:
# Evaluate on the test split with max_examples=15, temperature=0.5 (presumably
# the best setting found in acc26 -- confirm), then persist the classifier
# together with its predictions.
best26 = GPT3Classifier('file-S0NVQFrb3YIQIXyREilOSBEr', max_examples=15, temperature=0.5)
best26.evaluate(X_test)
# Forward slashes keep the save path valid on every platform.
best26.save('saved_models/best26_mar23.p')

In [None]:
# Side-by-side view of each test question, its true label and the prediction.
pd.DataFrame(dict(question=X_test, true=y_test, predicted=best26.predictions))

In [None]:
# Fraction of test questions whose predicted label equals the true label.
best26.accuracy(y_test)

In [None]:
# Baseline on the default train file using the constructor's default-valued
# settings (max_examples=5, temperature=0), evaluated on the test split and
# then persisted together with its predictions.
naive26 = GPT3Classifier('file-S0NVQFrb3YIQIXyREilOSBEr', max_examples=5, temperature=0)
naive26.evaluate(X_test)
# Forward slashes keep the save path valid on every platform.
naive26.save('saved_models/naive26_mar23.p')

In [None]:
# Fraction of test questions whose predicted label equals the true label.
naive26.accuracy(y_test)

In [None]:
# Display a previously saved timing table from disk.
# NOTE(review): no cell visible in this notebook writes
# classifier_time0_0_mar28.csv (the cells above save *_mar23 files) -- confirm
# where this file comes from before relying on it.
pd.read_csv(r"saved_models/classifier_time0_0_mar28.csv")