In [2]:
from icego import *
import os
import boto3
import pandas as pd
import openai
from dotenv import load_dotenv
from time import time

# An OpenAI API key is needed in a .env file (as OPENAI_API_KEY) to call GPT-3.
env_path = r'.env'
# NOTE(review): `Path` is not imported explicitly in this cell; presumably it
# comes in through `from icego import *` -- confirm.
dotenv_path = Path(env_path)
load_dotenv(dotenv_path=dotenv_path)
# NOTE(review): the assignment below is disabled, so this cell never sets
# openai.api_key itself; in this notebook only GPT3Classifier.load() assigns it.
#openai.api_key = os.environ["OPENAI_API_KEY"]

True

In [None]:
# Saved model outputs from three runs, stacked into one frame with a fresh
# 0..n-1 index.
df1, df2, df3 = (
    pd.read_csv(tsv_path, sep="\t", index_col=0)
    for tsv_path in (
        r"saved_models\mar23_2.tsv",
        r"saved_models\mar24.tsv",
        r"saved_models\mar27.tsv",
    )
)
df = pd.concat([df1, df2, df3]).reset_index(drop=True)

# Labelled question sets.
training = pd.read_csv(r"research_data\training(1).tsv", sep="\t", index_col=0)
validation = pd.read_csv(r"research_data\validation(2).tsv", sep="\t", index_col=0)
testing = pd.read_csv(r"research_data\testing(3).tsv", sep="\t", index_col=0)

# Train / validation / test split into question text and label.
X_train, y_train = training["question"], training["label"]
X_val, y_val = validation["question"], validation["label"]
X_test, y_test = testing["question"], testing["label"]

# Prep training for sending to GPT-3: the question column is carried under
# the name "text" instead of "question".
training = training.assign(text=training["question"]).drop(columns="question")

In [None]:
def relabel(label):
    """Map a prefixed class label to its short form.

    "Topic: Data" becomes "Data" and "Topic: Other" becomes "Other".

    Raises:
        Exception: if `label` is neither of the two known labels.
    """
    known_labels = (("Topic: Data", "Data"), ("Topic: Other", "Other"))
    for prefixed, short in known_labels:
        if label == prefixed:
            return short
    raise Exception("Class not Data or Other")

In [None]:
# Split every saved context into its question lines and label lines.
# Each context is split on newlines and its first and last lines are dropped;
# the remaining lines alternate question, label, question, label, ...
question_chunks = [pd.Series(dtype="object")]
label_chunks = [pd.Series(dtype="object")]

# Iterate over the column itself so that every row is processed. The earlier
# `range(max(df.index))` stopped one short of the last row and raised on an
# empty frame.
for context in df.context:
    entry = context.split("\n")[1:-1]
    question_chunks.append(pd.Series(entry[::2], dtype="object"))
    # relabel raises if a label line is not one of the two known classes.
    label_chunks.append(pd.Series(entry[1::2], dtype="object").map(relabel))

# Concatenate once at the end instead of growing the Series inside the loop.
Xs = pd.concat(question_chunks).reset_index(drop=True)
ys = pd.concat(label_chunks).reset_index(drop=True)

In [None]:
# Pair the parsed questions with their labels, drop exact duplicate rows and
# renumber, then discard rows whose text is empty.
generated_qs = pd.DataFrame(data={"text": Xs, "label": ys})
generated_qs = generated_qs.drop_duplicates().reset_index(drop=True)
generated_qs = generated_qs[generated_qs.text != ""]

# Append the questions stored in initial_qs.csv (index labels are kept as-is).
orig = pd.read_csv("initial_qs.csv", index_col=0)
df_new = pd.concat([generated_qs, orig])

In [None]:
class GPT3Classifier():
    """Wrapper around `openai.Classification.create` for an uploaded example file.

    Attributes:
        file: id of the uploaded examples file passed to the endpoint.
        max_examples, temperature, search_model, model: forwarded unchanged
            to `openai.Classification.create` on every prediction.
        predictions: labels produced by the most recent `evaluate` call.
        full_output: raw endpoint responses from the most recent `evaluate`
            call (None where classification failed).
    """

    def __init__(self, file, max_examples=5, temperature=0, search_model="ada", model="ada"):
        self.file = file
        self.max_examples = max_examples
        self.temperature = temperature
        self.search_model = search_model
        self.model = model
        # Populated by evaluate(); initialised so accuracy() can report a
        # clear error when called too early.
        self.predictions = []
        self.full_output = []

    def predict(self, text):
        """Classify a single query string and return the raw endpoint response."""
        output = openai.Classification.create(
            file=self.file,
            query=text,
            search_model=self.search_model,
            model=self.model,
            max_examples=self.max_examples,
            temperature=self.temperature
        )
        return output

    def evaluate(self, X):
        """Classify every item of `X` and return the list of predicted labels.

        Items whose request fails are recorded as 'Unknown' (with None in
        `full_output`) instead of aborting the run.
        """
        self.predictions = []
        self.full_output = []
        for val in X:
            print("Evaluating: " + val)

            # Sometimes the search may not find any similar documents.
            # If this is the case, we skip it.
            try:
                result = self.predict(val)
            except Exception:
                # Only ordinary errors are skipped; KeyboardInterrupt and
                # SystemExit propagate so a long run can still be stopped.
                print("Could not classify: " + val)
                self.full_output.append(None)
                self.predictions.append('Unknown')
                continue
            self.full_output.append(result)
            self.predictions.append(result['label'])
        return self.predictions

    def accuracy(self, true):
        """Return the fraction of stored predictions equal to `true`.

        The comparison is positional, so `true` may be a list or a pandas
        Series with any index.

        Raises:
            ValueError: if there are no predictions yet, or if `true` and the
                predictions differ in length.
        """
        predictions = getattr(self, "predictions", [])
        if not predictions:
            raise ValueError("No predictions available; call evaluate() first")
        truth = list(true)
        if len(truth) != len(predictions):
            raise ValueError(
                "Expected {} true labels, got {}".format(len(predictions), len(truth))
            )
        n_correct = sum(1 for t, p in zip(truth, predictions) if t == p)
        return n_correct / len(predictions)

    def save(self, path):
        """Pickle this classifier (including stored predictions) to `path`."""
        with open(path, "wb") as pickle_out:
            pickle.dump(self, pickle_out)

    @staticmethod
    def load(env, model):
        """
        Load model from a pickle file

        Args:
            env: path of the .env file that defines OPENAI_API_KEY.
            model: path of the pickle file to load.

        Raises:
            Exception: if the API key cannot be read from the environment.
        """

        # Load API Key
        try:
            dotenv_path = Path(env)
            load_dotenv(dotenv_path=dotenv_path)
            openai.api_key = os.environ["OPENAI_API_KEY"]
        except Exception as err:
            raise Exception("Please specify your OpenAI API key in a .env file as OPENAI_API_KEY=your_key, and provide its path in the \"env\" parameter") from err
        # SECURITY: pickle.load can execute arbitrary code; only load files
        # that this notebook produced itself.
        with open(model, "rb") as pickle_in:
            return pickle.load(pickle_in)

In [None]:
# Hyperparameter optimization parameters using grid search
def grid_search(file, X_test, y_test, examples, temperatures):
    """Evaluate a GPT3Classifier for every (max_examples, temperature) pair.

    Args:
        file: id of the uploaded examples file.
        X_test: queries to classify.
        y_test: true labels for `X_test`.
        examples: max_examples values to try (rows of the result).
        temperatures: temperature values to try (columns of the result).

    Returns:
        Two DataFrames indexed by `examples` with `temperatures` as columns:
        accuracy, and wall-clock seconds spent in `evaluate`.
    """
    accuracy_rows = []
    runtime_rows = []

    for n_examples in examples:
        accuracy_row = []
        runtime_row = []
        for temp in temperatures:
            print("({}, {})".format(temp, n_examples))
            candidate = GPT3Classifier(file, max_examples=int(n_examples), temperature=temp)

            # Time only the evaluation, not the accuracy computation.
            started = time()
            candidate.evaluate(X_test)
            elapsed = time() - started

            accuracy_row.append(candidate.accuracy(y_test))
            runtime_row.append(elapsed)

        accuracy_rows.append(accuracy_row)
        runtime_rows.append(runtime_row)

    accuracy_table = pd.DataFrame(accuracy_rows, columns=temperatures, index=examples)
    runtime_table = pd.DataFrame(runtime_rows, columns=temperatures, index=examples)
    return accuracy_table, runtime_table

In [None]:
def evaluate_classification_endpoint(n, df_sample, training, examples, temps, X_val, y_val, X_test, y_test, iters=5):
    """Repeatedly tune and test the classification endpoint with `n` extra examples.

    On each iteration, `n` rows are sampled from `df_sample`, concatenated
    with `training`, written to a JSONL file and uploaded. A grid search over
    `examples` x `temps` is scored on the validation set, and the best setting
    is then evaluated on the test set. Per-iteration accuracy/runtime tables
    and the best classifier are written under saved_models, followed by the
    list of test accuracies.

    Args:
        n: number of rows to sample from `df_sample` per iteration.
        df_sample: frame the extra example rows are sampled from.
        training: base training frame included in every upload.
        examples: max_examples values for the grid search.
        temps: temperature values for the grid search.
        X_val, y_val: validation queries and labels used for tuning.
        X_test, y_test: test queries and labels used for the final score.
        iters: number of independent repetitions.
    """
    test_errs = []
    for i in range(iters):
        print("Iteration {}".format(i))
        sampled = df_sample.sample(n=n)
        upload_df = pd.concat([sampled, training]).reset_index(drop=True)

        jsonl_path = r"saved_models\gpt3_qs_{}_{}.jsonl".format(n,i)
        upload_df.to_json(jsonl_path, orient="records", lines=True)
        # Close the upload handle as soon as the request returns.
        with open(jsonl_path) as jsonl_file:
            result = openai.File.create(file=jsonl_file, purpose="classifications")

        acc, runtime = grid_search(result['id'], X_val, y_val, examples, temps)
        acc.to_csv(r'saved_models\classifier_acc{}_{}_mar28.csv'.format(n,i))
        runtime.to_csv(r'saved_models\classifier_time{}_{}_mar28.csv'.format(n,i))

        # Columns are temperatures and rows are max_examples: pick the column
        # holding the overall maximum, then the best row within that column.
        best_temp = acc.max().idxmax()
        best_n_exs = acc[best_temp].idxmax()

        best_classifier = GPT3Classifier(result['id'], max_examples=int(best_n_exs), temperature=float(best_temp))
        best_classifier.evaluate(X_test)
        best_classifier.save(r'saved_models\best{}_{}_mar28.p'.format(n, i))

        test_errs.append(best_classifier.accuracy(y_test))

    # Save test error output. The file name carries the index of the last
    # iteration (iters - 1), matching the name the loop variable produced.
    last_iter = iters - 1
    with open(r'saved_models\best{}_{}_testerrs_mar28.p'.format(n, last_iter), "wb") as pickle_out:
        pickle.dump(test_errs, pickle_out)

In [None]:
# Re-run the saved best classifiers for n = 10 on the test set and overwrite
# the stored list of test accuracies.
n = 10
test_errs = []
for i in range(5):
    best_classifier = pd.read_pickle(r'saved_models\best{}_{}_mar28.p'.format(n,i))
    # Coerce the stored hyperparameters to plain Python numbers before reuse.
    best_classifier.temperature = float(best_classifier.temperature)
    best_classifier.max_examples = int(best_classifier.max_examples)
    best_classifier.evaluate(X_test)
    # Saved under a different name (trailing underscore) than the original.
    best_classifier.save(r'saved_models\best{}_{}_mar28_.p'.format(n, i))

    test_errs.append(best_classifier.accuracy(y_test))

# `i` still holds the last loop index (4) here, which is what the file name uses.
# The context manager closes the file even if pickling fails.
with open(r'saved_models\best{}_{}_testerrs_mar28.p'.format(n, i), "wb") as pickle_out:
    pickle.dump(test_errs, pickle_out)



In [None]:
# Experiment with 10 sampled questions added.
# Grid: max_examples in 5, 10, ..., 25 crossed with three temperatures.
examples = 5 * np.arange(1, 6)
temperatures = [0, 0.1, 0.5]
evaluate_classification_endpoint(
    10, df_new, training, examples, temperatures, X_val, y_val, X_test, y_test
)

In [None]:
# Load the pickled list of test-set accuracies saved for n = 10.
test = pd.read_pickle(r'saved_models\best10_4_testerrs_mar28.p')

In [None]:
# Experiment with 100 sampled questions added.
# Grid: max_examples in 5, 10, ..., 25 crossed with three temperatures.
examples = 5 * np.arange(1, 6)
temperatures = [0, 0.1, 0.5]
evaluate_classification_endpoint(
    100, df_new, training, examples, temperatures, X_val, y_val, X_test, y_test
)

In [None]:
# Mean test-set accuracy over the saved runs for n = 100.
test100 = pd.read_pickle(r'saved_models\best100_4_testerrs_mar28.p')
np.mean(test100)

In [None]:
# Mean test-set accuracy over the saved runs for n = 10.
test10 = pd.read_pickle(r'saved_models\best10_4_testerrs_mar28.p')
np.mean(test10)

In [None]:
# Experiment with 1000 sampled questions added.
# Grid: max_examples in 5, 10, ..., 25 plus 100, crossed with three temperatures.
examples = [5 * step for step in range(1, 6)] + [100]
temperatures = [0, 0.1, 0.5]
evaluate_classification_endpoint(
    1000, df_new, training, examples, temperatures, X_val, y_val, X_test, y_test
)

In [None]:
# Load the pickled list of test-set accuracies saved for n = 1000.
test1000 = pd.read_pickle(r'saved_models\best1000_4_testerrs_mar28.p')

In [None]:
# Experiment with 10000 sampled questions added. `examples` and `temperatures`
# are not defined in this cell; they keep whatever an earlier cell assigned.
evaluate_classification_endpoint(10000, df_new, training, examples, temperatures, X_val, y_val, X_test, y_test)

In [None]:
# Load the pickled list of test-set accuracies saved for n = 10000.
test10000 = pd.read_pickle(r'saved_models\best10000_4_testerrs_mar28.p')

In [None]:
# Mean test-set accuracy over the saved runs for n = 10000.
np.mean(test10000)

In [None]:
# Mean test-set accuracy over the saved runs for n = 1000.
np.mean(test1000)

In [None]:
# Baseline experiment with no sampled questions added (n = 0).
# Grid: max_examples in 5, 10, ..., 25 plus 100, crossed with three temperatures.
examples = [5 * step for step in range(1, 6)] + [100]
temperatures = [0, 0.1, 0.5]
evaluate_classification_endpoint(
    0, df_new, training, examples, temperatures, X_val, y_val, X_test, y_test
)

In [None]:
# Analysis
# One entry per number of added questions: the saved list of test accuracies,
# and for each of the 5 runs the best validation accuracy in the grid.
testerrs = []
valerrs = []

for n_added in [0, 10, 100, 1000, 10000]:
    testerrs.append(pd.read_pickle(r'saved_models\best{}_4_testerrs_mar28.p'.format(n_added)))

    # Get the maximum accuracy produced by any given set of hyperparameters
    valerrs.append([
        pd.read_csv(r'saved_models\classifier_acc{}_{}_mar28.csv'.format(n_added, run), index_col=0).to_numpy().max()
        for run in range(5)
    ])

In [None]:
# Convert to an array (one row per number of added questions) and display it.
testerrs = np.array(testerrs)
testerrs

In [None]:
# Mean test accuracy across runs, per number of added questions.
testerrs.mean(axis=1)

In [None]:
# Standard error across the 5 runs, per number of added questions.
# NOTE(review): ndarray.std defaults to ddof=0 (population std); confirm
# whether the sample std (ddof=1) was intended for a standard error.
testerrs.std(axis=1) / np.sqrt(5)

In [None]:
# Convert to an array: one row per number of added questions, one column per run.
valerrs = np.array(valerrs)

In [None]:
# Mean best-validation accuracy across runs, per number of added questions.
valerrs.mean(axis=1)

In [None]:
# Standard error across the 5 runs, per number of added questions.
# NOTE(review): ndarray.std defaults to ddof=0 (population std); confirm
# whether the sample std (ddof=1) was intended for a standard error.
valerrs.std(axis=1) / np.sqrt(5)

In [None]:
from scipy.stats import permutation_test, ttest_ind

def statistic(x, y):
    """Test statistic for the permutation test: mean of `x` minus mean of `y`."""
    mean_x = np.mean(x)
    mean_y = np.mean(y)
    return mean_x - mean_y

# Permutation test of the baseline row (index 0, no added questions) against
# each of the other four rows, using mean(baseline) - mean(other) as the
# statistic with alternative "less".
# Note: the values in `testerrs` are accuracies, despite the "Error" wording.
print("Test Error P-values")
for i in range(1,5):
    res = permutation_test([testerrs[0], testerrs[i]], statistic, alternative = "less")
    print(res.pvalue)

In [None]:
from scipy.stats import permutation_test, ttest_ind

def statistic(x, y):
    """Test statistic for the permutation test: mean of `x` minus mean of `y`."""
    mean_x = np.mean(x)
    mean_y = np.mean(y)
    return mean_x - mean_y

# Permutation test of the baseline row (index 0, no added questions) against
# each of the other four rows, using mean(baseline) - mean(other) as the
# statistic with alternative "less".
# Note: the values in `valerrs` are accuracies, despite the "Error" wording.
print("Validation Error P-values")
for i in range(1,5):
    res = permutation_test([valerrs[0], valerrs[i]], statistic, alternative='less')
    print(res.pvalue)

In [None]:
# Old data format
exs = [0, 1, 2, 3, 4]
valse = valerrs.std(axis=1) / np.sqrt(5)
testse = testerrs.std(axis=1) / np.sqrt(5)
overall = pd.DataFrame({"exs":exs, "val":valerrs.mean(axis=1), "test":testerrs.mean(axis=1), "valse":valse, "testse":testse})

In [None]:
# New data format
acc_test = testerrs.flatten()
acc_val = valerrs.flatten()
labs = np.array([[0]*5] + [[i]*5 for i in range(1,5)]).flatten()
overall = pd.concat([pd.DataFrame({"label":labs, "value":acc_val, "set":"Validation Set"}), pd.DataFrame({"label":labs, "value":acc_test, "set":"Test Set (Unseen)"})], axis=0).reset_index(drop=True)


In [None]:
# Display the combined accuracy table.
overall

In [None]:
# New plotting
import seaborn as sns
sns.set(font_scale = 2, style="white", rc={'figure.figsize':(11,8)})
p = sns.lineplot(data=overall, y="value", x="label", hue="set", ci=68, marker='o', linewidth=2, markersize=10);
p.set_ylabel("Accuracy");
p.set_xlabel("Number of New Example Questions Added");
p.legend(['Validation Set Mean', 'Validation Set SE','Test Set (Unseen) Mean','Test Set SE']);
plt.xticks([0,1,2,3,4])
p.set_xticklabels(["0","10","100","1,000","10,000"]);
plt.tick_params(bottom=True, left=True)
#plt.savefig('class_endpoint_simplified2.png', dpi=2000);

In [None]:
# New plotting
import seaborn as sns
sns.set(font_scale = 2, style="white", rc={'figure.figsize':(11,8)})
p = sns.lineplot(data=overall[overall.set=="Validation Set"], y="value", x="label", ci=68, marker='o', linewidth=2, markersize=10);
p.set_ylabel("Accuracy");
p.set_xlabel("Number of New Example Questions Added");
p.legend(['Mean Accuracy','Standard Error']);
plt.xticks([0,1,2,3,4])
p.set_xticklabels(["0","10","100","1,000","10,000"]);
plt.tick_params(bottom=True, left=True)
plt.savefig('class_endpoint_simplified4.png', dpi=2000);

In [None]:
# New plotting
import seaborn as sns
sns.set(font_scale = 2, style="white", rc={'figure.figsize':(11,8)})
p = sns.lineplot(data=overall[overall.set=="Test Set (Unseen)"], y="value", x="label", ci=68, marker='o', linewidth=2, markersize=10);
p.set_ylabel("Accuracy on Unseen Examples");
p.set_xlabel("Number of New Example Questions Added");
p.legend(['Mean Accuracy','Standard Error']);
plt.xticks([0,1,2,3,4])
p.set_xticklabels(["0","10","100","1,000","10,000"]);
plt.tick_params(bottom=True, left=True)
plt.savefig('class_endpoint_simplified3.png', dpi=2000);

In [None]:
# Old plotting
import seaborn as sns
sns.set(font_scale = 1.25, style="white")

p = sns.lineplot(data=overall, y="val",x="exs", ci="valse", err_style = "band", markers=True);
p = sns.lineplot(data=overall, y="test",x="exs", ci="testse", err_style = "band", markers=True);
p.set_ylabel("Accuracy");
p.set_xlabel("Number of New Example Questions Added");
plt.xticks([0,1,2,3,4])
p.set_xticklabels(["0","10","100","1,000","10,000"])
p.fill_between(overall.exs, overall.val - overall.valse, overall.val + overall.valse, color='blue', alpha=0.2);
p.fill_between(overall.exs, overall.test - overall.testse, overall.test + overall.testse, color='orange', alpha=0.2);
p.legend(['Validation Set', 'Test Set (Unseen)'])

plt.savefig('class_endpoint_simplified.png', dpi=1000);

In [None]:
# Load the saved best classifier for n = 100 (last run); this call also sets
# openai.api_key from the .env file. The path previously pointed at the
# "_testerrs_" pickle, which holds a list of accuracies, not a classifier.
model = GPT3Classifier.load(env_path, r'saved_models\best100_4_mar28.p')

In [None]:
# Classifier bound to a hard-coded uploaded file id.
# NOTE(review): which upload this id refers to is not recorded here -- confirm.
classifier = GPT3Classifier("file-QU5FZXAKFre6i1BblZdxdWzW", max_examples=25, temperature=0)

In [None]:
# Predicted label for every test question ('Unknown' where the request failed).
results = classifier.evaluate(X_test)

In [None]:
# Side-by-side table of question, true label and predicted label.
# NOTE(review): X_test/y_test are Series and `results` is a plain list, so the
# rows line up by position only -- confirm the lengths match.
conf = pd.DataFrame({"Question": X_test, "true": y_test,"pred": results})

In [None]:
# Keep only the misclassified questions.
result = conf[conf.true != conf.pred]

In [None]:
# Show the text of the misclassified questions.
result.Question.values

In [None]:
# Look up a hand-picked set of test questions by index label.
# NOTE(review): the origin of these labels is not shown here -- presumably the
# misclassified rows from an earlier run; confirm.
X_test[[2,3,7,9,16,17,19]]

In [None]:
# Time test
time = []

for i in [0, 10,100,1000,10000]:
    tmp = []
    for j in range(5):
        # Sum the total time to run each set of hyperparameters
        tmp.append(pd.read_csv(r'saved_models\classifier_time{}_{}_mar28.csv'.format(i,j), index_col=0).to_numpy().sum())
    time.append(tmp)
time = np.array(time) / 60

In [None]:


# Get average time for each number of new questions added
time.mean(axis=1).std() / np.sqrt(5)

In [None]:
# Get average time for each number of new questions added
time.std(axis=1) / np.sqrt(5)

In [None]:
import platform
# Report the machine this notebook ran on, one field per line.
print('Sysinfo')
uname = platform.uname()
print("\n".join([
    f"System: {uname.system}",
    f"Node Name: {uname.node}",
    f"Release: {uname.release}",
    f"Version: {uname.version}",
    f"Machine: {uname.machine}",
    f"Processor: {uname.processor}",
]))

In [None]:
# Standard error of three hand-entered measurements.
# NOTE(review): the source and units of 105/119/112 are not recorded here --
# confirm. np.std defaults to ddof=0 (population std).
np.std([105, 119, 112]) / np.sqrt(3)