In [1]:
import getpass
import sys
import pandas as pd
import string

from src.common.authentication import Authentication
from src.common.perspective_api_request import PerspectiveAPIRequest, PerspectiveAPIRequestResult
from src.common.request import Request, RequestResult
from src.proxy.accounts import Account
from proxy.remote_service import RemoteService

from os.path import exists

In [2]:
# An example of how to use the request API.
#api_key = getpass.getpass(prompt="Enter a valid API key: ")
api_key = pd.read_csv("prod_env/api_key.csv", header=None)[0].values[0]
auth = Authentication(api_key=api_key)
service = RemoteService("https://crfm-models.stanford.edu")

# Access account and show my current quotas and usages
account: Account = service.get_account(auth)
print(account.usages)

{'gpt3': {'daily': Usage(period='2022-8-15', used=109497, quota=None), 'total': Usage(period='all', used=196805, quota=400000), 'monthly': Usage(period='2022-8', used=159608, quota=None)}, 'codex': {'daily': Usage(period=None, used=0, quota=0)}, 'jurassic': {'daily': Usage(period=None, used=0, quota=None), 'total': Usage(period=None, used=0, quota=0)}, 'gooseai': {'daily': Usage(period=None, used=0, quota=0)}}


In [3]:
def get_processed_countries(OUTPUT_FILE):
    processed_countries = set()
    if exists(OUTPUT_FILE):
        processed_countries = set(list(pd.read_csv(OUTPUT_FILE, header=None)[1].values))
    return processed_countries

In [4]:
import json
# load data using Python JSON module
with open("triviaqa/data/web-train.json",'r') as f:
    data = json.loads(f.read())
# Flatten data
triviaqa_data = pd.json_normalize(data, record_path =['Data'])


In [5]:
sample_questions = triviaqa_data.sample(n=100)
sample_questions.to_csv("triviaqa/data/triviaqa_sample_questions.csv", index=False)

sample_examples = triviaqa_data.sample(n=100)
sample_examples.to_csv("triviaqa/data/triviaqa_sample_examples.csv", index=False)


In [6]:
sample_questions = pd.read_csv("triviaqa/results/50_7_davinci_trivia_qa_sample1.csv", header=None)
sample_questions.columns = ['prompt', 'example_question', 'example_answer', 'Question', 'Answer.Aliases', 'prefix', 'prediction', 'logprob']
sample_questions = sample_questions[sample_questions['example_answer'] != "["]
sample_questions = sample_questions.drop_duplicates(subset='Question')


In [7]:
sample_questions

Unnamed: 0,prompt,example_question,example_answer,Question,Answer.Aliases,prefix,prediction,logprob
80,Q: Which American (1886-1956) founded the mode...,Which American (1886-1956) founded the modern ...,Clarence Frank Birdseye,In April 2008 PEP accounts automatically becam...,['(PERSONAL) EQUITY'],"I don't know, maybe it's",Educational,-3.835908
88,"Q: ""Who co-starred with Kevin Costner in """"Bod...","""Who co-starred with Kevin Costner in """"Bodygu...",Death of Whitney Houston,"""""""All that glisters is not gold"""" is a quote ...","['TMOV', 'Prince of Arragon', 'A Merchant of V...","I don't know, maybe it's",Dennis Rodman's autobiography,-12.812705
92,Q: What is the name of the French underground ...,What is the name of the French underground res...,"The Maquis, Part II","According to legend, who rode through the stre...","['Godgyfu', 'Godgifu of Mercia', 'Lady Godiva'...","I don't know, maybe it's",the French resistance leader who fought again...,-15.498733
96,"Q: ""Which actress, nominated for her role in t...","""Which actress, nominated for her role in the ...",The Hathaway effect,Which Scottish Loch is the deepest freshwater ...,"['Loch Morar', 'Loch morar', 'Lock Mhòrair']","I don't know, maybe it's",Loch Ness,-1.513932
100,Q: Which 1987 film set during the Vietnam War ...,Which 1987 film set during the Vietnam War was...,Good Morning Vietnam,In which secure London building was Hitler's d...,"['London Tower', 'The Tower of London', 'Tower...","I don't know, maybe it's",10 Downing Street,-4.387862
104,"Q: In 1990, mountain climbers Rob Hall and Gar...","In 1990, mountain climbers Rob Hall and Gary B...","""Climb the """"Seven Summits""""""",Whose autobiography was Goodbye To All That?,"['Gravesian', 'Beryl Hodge', 'Robert Ranke Gra...","I don't know, maybe it's","""Robert Graves""",-2.305971
108,Q: How is James Wormold described in the title...,How is James Wormold described in the title of...,Our Man in Havana,Which car manufacturer has produced the Taunus...,"['Ford Motor', 'Ford Group', 'Ford Motor Compa...","I don't know, maybe it's",Ford,-0.259438
112,Q: Which business directory had the ‘Walking F...,Which business directory had the ‘Walking Fing...,Die Gelben Seiten,What term is used for the appliance of gold or...,"['Filigrane', 'Filigranes', 'Filigrees', 'Fill...","I don't know, maybe it's",pattern,-5.610436
116,Q: Who is the leader of the Autobots? A: I don...,Who is the leader of the Autobots?,Orion Pax,In which English city did trams return to the ...,"['Manchester developments', 'Manchester (borou...","I don't know, maybe it's",Blackpool,-2.825139
120,Q: Who is known as The Father of Television? A...,Who is known as The Father of Television?,John L. Baird,Of what will Keith Bristow become the first he...,"['NATIONAL CRIME AGENCY', 'National Crime Agen...","I don't know, maybe it's",the National Crime Agency,-2.998997


In [8]:
NUM_COMPLETIONS = 1
MAX_TOKENS = 50
TEMPERATURE = 0.7
MODEL_NAME = "openai/davinci"
ECHO_PROMPT = False

prompt_responses = pd.DataFrame()

PATH = 'triviaqa/results/'
OUTPUT_FILE = PATH + str(MAX_TOKENS) + "_" + str(TEMPERATURE)[-1:] + "_" + MODEL_NAME[7:] + '_trivia_qa_sample1.csv'
STOP_SEQUENCES = ["?", ".", "!", "\n"]

# prefixes = ["I don't know, maybe it's",
#     "I have no idea, maybe it's",
#     "I'm pretty sure it's",
#     "I believe it's"]

# prefixes = [""]
# for n, row in sample_questions[:40].iterrows():
    
# #     example = sample_examples.sample(n=1)
# #     example_question = example['Question'].values[0]
# #     example_answer = example['Answer.Aliases'].values[0][0]
    
#     example_question = row['example_question']
#     example_answer = row['example_answer']

#     question = row['Question']
#     answers = row['Answer.Aliases']
    
#     for prefix in prefixes:
#         example = "Q: " + example_question + " A:" + prefix + " " + example_answer + "."
#         prompt  = example + " Q: " + question + " A:" + prefix
#         print(prompt)
#         print("----")
#         print(answers)
#         print("----")
        
#         request = Request(prompt=prompt, temperature=TEMPERATURE, model=MODEL_NAME, num_completions=NUM_COMPLETIONS, max_tokens=MAX_TOKENS, stop_sequences=STOP_SEQUENCES, echo_prompt=ECHO_PROMPT)
#         request_result: RequestResult = service.make_request(auth, request)

#         for response in request_result.completions:
#             print(response.text, "\t",  response.logprob)
#             prompt_responses = pd.concat([prompt_responses, pd.DataFrame([[prompt, example_question, example_answer, question, answers, prefix, response.text, response.logprob]])])
#             print("----")
                
# prompt_responses.to_csv(OUTPUT_FILE, mode='a', header=False, index=False)


In [26]:
PATH = 'triviaqa/results/'

TRIVIAQA_RESULTS = PATH + '50_7_davinci_trivia_qa_sample1.csv'

result = pd.read_csv(TRIVIAQA_RESULTS, header=None).drop_duplicates()
result.columns=['prompt', 'example_question', 'example_answer', 'question', 'answers', 'prefix', 'prediction', 'logprob']


In [27]:
result['prefix'] = result['prefix'].fillna("unprompted")

In [28]:
pd.set_option('display.max_rows', 20)
result = result[result['example_answer'] != '[']
result.groupby("prefix").mean()

Unnamed: 0_level_0,logprob
prefix,Unnamed: 1_level_1
I believe it's,-4.354187
"I don't know, maybe it's",-4.279188
"I have no idea, maybe it's",-5.064187
I think it's,-5.175495
I would guess it's,-4.568904
I'm pretty sure it's,-2.366313
unprompted,-4.698562


In [29]:
#lowercase, remove punctuation
def remove_punctuation(str_input):
    return str_input.lower()

def check_answer_aliases(answers, prediction):
    answers = answers.strip("][''").split(', ')
    
    answers = [x.lower().strip("''") for x in answers]
    prediction = prediction.strip(" ").lower()
#     print(answers)
#     print("---")
#     print(prediction)
    if prediction in answers:
        return 1
    else:
        return 0

        
#check if the prediction is in the answer (no partial credits)
result['correct'] = result.apply(lambda x: check_answer_aliases(x['answers'], x['prediction']), axis=1)

In [30]:
result.loc[result.groupby('example_question').logprob.idxmax()].sum()

prompt              Q: "Which 16th century English composer's work...
example_question    "Which 16th century English composer's works i...
example_answer      Thomas TallysThe Hathaway effectDeath of Whitn...
question            What colour are the stars on the flag of the E...
answers             ['Yellowest', 'Whiteyellow', 'Yelow', 'Yellow ...
prefix              I believe it'sunpromptedunpromptedunpromptedun...
prediction           yellow Loch Ness The Merchant of Venice Ford ...
logprob                                                    -26.023326
correct                                                            12
dtype: object

In [31]:
result.sort_values(by='question')
result.groupby("prefix").mean()

Unnamed: 0_level_0,logprob,correct
prefix,Unnamed: 1_level_1,Unnamed: 2_level_1
I believe it's,-4.354187,0.4
"I don't know, maybe it's",-4.279188,0.190476
"I have no idea, maybe it's",-5.064187,0.142857
I think it's,-5.175495,0.3
I would guess it's,-4.568904,0.2
I'm pretty sure it's,-2.366313,0.47619
unprompted,-4.698562,0.5


In [19]:
result = pd.read_csv(PATH + "filtered_results_sample1_gpt3.csv")
result.groupby("prefix").mean()

Unnamed: 0_level_0,logprob,correct
prefix,Unnamed: 1_level_1,Unnamed: 2_level_1
I believe it's,-4.354187,0.65
"I don't know, maybe it's",-4.279188,0.428571
"I have no idea, maybe it's",-5.064187,0.380952
I think it's,-5.175495,0.55
I would guess it's,-4.568904,0.45
I'm pretty sure it's,-2.366313,0.666667


In [20]:
result.groupby(['question', "prefix"]).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,logprob,correct
question,prefix,Unnamed: 2_level_1,Unnamed: 3_level_1
"""""""All that glisters is not gold"""" is a quote from which Shakespeare play?""",I believe it's,-6.718655,0.0
"""""""All that glisters is not gold"""" is a quote from which Shakespeare play?""","I don't know, maybe it's",-12.812705,0.0
"""""""All that glisters is not gold"""" is a quote from which Shakespeare play?""","I have no idea, maybe it's",-12.056896,0.0
"""""""All that glisters is not gold"""" is a quote from which Shakespeare play?""",I think it's,-4.640491,1.0
"""""""All that glisters is not gold"""" is a quote from which Shakespeare play?""",I would guess it's,-3.270939,0.0
"""""""All that glisters is not gold"""" is a quote from which Shakespeare play?""",I'm pretty sure it's,-5.696058,0.0
"According to legend, who rode through the streets of Coventry naked and was seen by someone called Tom, leading to the phrase Peeping Tom?",I believe it's,-8.559576,0.0
"According to legend, who rode through the streets of Coventry naked and was seen by someone called Tom, leading to the phrase Peeping Tom?","I don't know, maybe it's",-15.498733,0.0
"According to legend, who rode through the streets of Coventry naked and was seen by someone called Tom, leading to the phrase Peeping Tom?","I have no idea, maybe it's",-6.251779,1.0
"According to legend, who rode through the streets of Coventry naked and was seen by someone called Tom, leading to the phrase Peeping Tom?",I think it's,-25.576775,0.0


In [21]:
best_prediction_array = []
for n, row in result.iterrows():
    #set a very negative value
    best_prediction = -1000
    #go through all the types of prompts
    for x in ['_sure', '_dontknow', '_ithink', '_iguess']:
        #if the logprob is higher, then set that to be the best value
        if row['logprob' + x] > best_prediction:
            best_prediction = row['logprob' + x]
            best_prediction_value = row['correct' + x]
        
    best_prediction_array.append(best_prediction_value)

joined['best_prediction'] = best_prediction_array
joined.sum()

KeyError: 'logprob_sure'