# GPT: Baseline Reproduction

In [None]:
import json
import random
import re
import os
import numpy as np
import time
import copy

In [1]:
# setup
# A `!export` line runs in a throwaway subshell, so the variable never reaches
# this kernel process; set it on the process environment instead.
os.environ['HF_DATASETS_CACHE'] = "/scratch/tg2520/cache/"

In [2]:
# Backend selector read by get_answer_llm, plus the model cache directory.
LLM = "gpt"
os.environ.update({'TRANSFORMERS_CACHE': '/scratch/tg2520/cache/'})

### Data Preprocessing 

In [4]:
# Read the task definition file and parse it as one JSON document.
file_path = "/scratch/tg2520/my_env/LLVM/automatic_prompt_engineer/data/bigbench-ii/epistemic_reasoning/task.json"
with open(file_path, 'r') as json_file:
    data_orig = json.loads(json_file.read())

In [5]:
# Keep the 'examples' entry of the task file; the description and task prefix
# are read from data_orig separately further down.
data=data_orig['examples']

In [7]:
# random.shuffle(data)

# # Define the proportions for train, test, and validation sets
# total_samples = len(data)
# train_ratio = 0.7
# test_ratio = 0.15
# validation_ratio = 0.15

# # Calculate the sizes of each set
# train_size = int(total_samples * train_ratio)
# test_size = int(total_samples * test_ratio)
# validation_size = int(total_samples * validation_ratio)

# # Split the data into sets
# train_data = data[:train_size]
# test_data = data[train_size:train_size + test_size]
# validation_data = data[train_size + test_size:]

In [8]:
import json

# Load the three pre-split datasets (train / test / validation) from the
# working directory.
with open('train_entail.json', 'r') as train_file, \
        open('test_entail.json', 'r') as test_file, \
        open('val_entail.json', 'r') as val_file:
    train_data = json.load(train_file)
    test_data = json.load(test_file)
    validation_data = json.load(val_file)

In [9]:
# Report how many training examples were loaded.
print("Training data size: {}".format(len(train_data)))

Training data size: 1400


In [13]:
# Indices of the training examples for each gold label.
pos_train = [
    idx for idx, example in enumerate(train_data)
    if example['target_scores']['entailment'] == 1
]
neg_train = [
    idx for idx, example in enumerate(train_data)
    if example['target_scores']['non-entailment'] == 1
]

# Class counts: positives first, then negatives.
for bucket in (pos_train, neg_train):
    print(len(bucket))

158
242


### Algorithm setup

In [14]:
# Task metadata taken from the loaded task file.
task_description = data_orig['description']
print(task_description)

task_type = 'entailment'

task_prefix = data_orig['task_prefix']
# For hyperbaton the prefix is trimmed and the two answer options are appended.
if task_type == 'hyperbaton':
    task_prefix = task_prefix.strip() + " Choose only from the following options: 'a' or 'b'.\n"

Determine whether one sentence entails the next


### Helper functions

In [16]:
### GPT API CALL
# GPT3.5 : gpt-3.5-turbo
# GPT4: gpt-4-0613

from openai import OpenAI

# Take the key from the OPENAI_API_KEY environment variable so no credential is
# stored in the notebook; the original "API_KEY" placeholder stays as fallback.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "API_KEY"))

def get_openai_api(inp, temp=0, topP=1, model="gpt-3.5-turbo"):
    """Send one user message to the chat completions endpoint and return the reply.

    Args:
        inp: Text sent as the user message.
        temp: Value passed as ``temperature``.
        topP: Value passed as ``top_p``.
        model: Model name passed to the API. Defaults to the model this
            notebook originally hard-coded.

    Returns:
        The message content of the first returned choice.
    """
    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": inp}
        ],
        temperature=temp,
        top_p=topP
    )

    return completion.choices[0].message.content
    

In [17]:
def get_answer_llm(user_prompt,use_api=True,temp=0,topP=1):
    """Send ``user_prompt`` to the backend selected by the global ``LLM``.

    Args:
        user_prompt: Prompt text.
        use_api: Only read on the 'llama' path; ``False`` runs the local
            ``model``/``tokenizer`` instead of the remote API helper.
        temp: Sampling temperature forwarded to the GPT backend.
        topP: Nucleus-sampling value forwarded to the GPT backend.

    Returns:
        The model's text output. On the llama API path an empty string is
        returned when the call fails.

    Raises:
        ValueError: If ``LLM`` is neither 'llama' nor 'gpt'.
    """
    if LLM == 'llama':
        prompt = f"<s>[INST] {user_prompt} [/INST]"

        if use_api==False:
            model_inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
            output = model.generate(**model_inputs)
            return tokenizer.decode(output[0], skip_special_tokens=True)

        try:
            # NOTE(review): neither `get_api` nor `is_summary` is defined in the
            # visible notebook; if they are missing this path always yields "".
            return get_api(prompt,is_summary)[0]['generated_text']
        except Exception:
            # Best-effort: callers treat "" as "no answer obtained".
            return ""

    if LLM == 'gpt':
        # Forward the sampling settings; previously they were dropped here, so
        # every GPT call ran with the defaults regardless of the arguments.
        return get_openai_api(user_prompt, temp=temp, topP=topP)

    raise ValueError(f"Unsupported LLM backend: {LLM!r}")
#

In [18]:
def extract_answer(length,line,task_type='entailment'):
    """Parse the model's answer out of ``line`` and map it to a binary label.

    The answer is looked up after skipping the first ``length`` characters,
    either inside ``<Ans>...</Ans>`` or after an ``Ans:`` marker. Surrounding
    whitespace and letter case are ignored.

    Args:
        length: Number of leading characters of ``line`` to skip.
        line: Raw model output.
        task_type: 'entailment', 'hyperbaton' or 'implicatures'.

    Returns:
        -1 if no (non-empty) answer is found; 1 if the answer is the positive
        option for the task ('entailment', an answer starting with 'a', or
        'yes' respectively); 0 otherwise.
    """
    line = line[length:]
    pattern = r'<Ans>(.*?)</Ans>|Ans:\s*([\w-]+)'
    found = re.search(pattern, line)
    if found is None:  # If no match
        return -1

    # Only one of the two alternatives participates in a match, so take
    # whichever group is populated instead of always reading the first.
    answer = (found.group(1) or found.group(2) or '').strip().lower()
    if not answer:
        return -1

    if task_type == 'entailment':
        return 1 if answer == 'entailment' else 0
    if task_type == 'hyperbaton':
        return 1 if answer.startswith('a') else 0
    if task_type == 'implicatures':
        return 1 if answer == 'yes' else 0
    return 0

In [19]:
def extract_hint(length,line):
    """Return the text of the first ``<hint>...</hint>`` span in ``line``.

    Args:
        length: Number of leading characters of ``line`` to skip.
        line: Raw model output.

    Returns:
        The text between the tags, or "" when no such span exists.
    """
    # DOTALL lets the hint span several lines; without it any hint containing
    # a newline failed to match and was dropped.
    found = re.search(r'<hint>(.*?)</hint>', line[length:], re.DOTALL)
    return found.group(1) if found else ""

In [20]:
def extract_confidence(length,line):
    """Return the text of the first ``<conf>...</conf>`` span in ``line``.

    The first ``length`` characters of ``line`` are skipped before searching.
    An empty string is returned when no span is found.
    """
    found = re.search(r'<conf>(.*?)</conf>', line[length:])
    if found is None:
        return ""
    return found.group(1)

### Algorithm Steps

In [21]:
# initial prompt
# Initial prompt template; '<INPUT>' is replaced with each example's text
# before the prompt is sent to the model.
prompt_t = (
    task_description + '\n'
    + 'Given input: ' + '<INPUT>' + '\n'
    + task_prefix
    + 'Put your one-word answer choosing from the previously stated two options within tag <Ans> and </Ans>. The one word answer should be necassarily between the tags. Also provide the confidence score for the answer within tag <conf> and </conf>'
)


##### Step 1: Get inferences for the entire dataset (y_hat <- {xi,yi})

In [22]:
def run_inference_step1(prompt_t,use_api=True,task_type='entailment'):
    """Run the prompt template over every training example and bucket the results.

    Args:
        prompt_t: Prompt template containing the '<INPUT>' placeholder.
        use_api: Forwarded to ``get_answer_llm``.
        task_type: 'entailment', 'hyperbaton' or 'implicatures'; selects how
            the answer is parsed and which ``target_scores`` key marks the
            negative class.

    Returns:
        A 5-tuple of index lists into ``train_data``:
        (wrong answers, correct answers, positive-class examples,
        negative-class examples, examples with no usable answer).
        Examples with no usable answer appear only in the last list.
    """
    # target_scores key whose value of 1 marks the negative class for each task.
    negative_key = {
        'entailment': 'non-entailment',
        'hyperbaton': 'b',
        'implicatures': 'no',
    }.get(task_type)

    wrong_ans_indices=[] #wrong samples
    correct_ans_indices=[] #correct samples
    positive_class=[] #entailment class
    negative_class=[] #non-entailment class
    not_got=[]

    for i, example in enumerate(train_data):
        # Fill the template into a separate name so the parameter is not clobbered.
        filled_prompt = prompt_t.replace('<INPUT>', example['input'])
        op = get_answer_llm(filled_prompt, use_api)
        if op == '':
            not_got.append(i)
            continue

        # The prompt length is skipped only for the llama backend.
        offset = len(filled_prompt) if LLM=="llama" else 0
        ans = extract_answer(offset, op, task_type)
        if ans == -1:
            not_got.append(i)
            continue

        ground_truth = 1
        if negative_key is not None and example['target_scores'][negative_key] == 1:
            ground_truth = 0

        # Sampling
        if ans != ground_truth:
            wrong_ans_indices.append(i)
        else:
            correct_ans_indices.append(i)

        if ground_truth == 1:
            positive_class.append(i)
        else:
            negative_class.append(i)

    return wrong_ans_indices,correct_ans_indices, positive_class,negative_class,not_got
    

##### Step 2: Sampling (the original paper samples before summarization; here sampling is done before generating hints)

In [23]:
def random_sampling(wrong_answers,k=3):
    """Pick up to ``k`` distinct items from ``wrong_answers`` at random.

    Args:
        wrong_answers: Sequence to sample from.
        k: Maximum number of items to return.

    Returns:
        A list of ``min(k, len(wrong_answers))`` items sampled without
        replacement.
    """
    # Clamp k: random.sample raises ValueError when asked for more items than
    # the population holds.
    return random.sample(wrong_answers, min(k, len(wrong_answers)))

In [25]:
def random_balanced_sampling(wrong_answers,right_answers,pos,neg,k=3):
    """Sample up to ``k`` examples per class, preferring wrongly answered ones.

    For each class, up to ``k`` indices are drawn from the wrong answers that
    belong to that class. If fewer than ``k`` are available, the remainder is
    drawn from the other members of the same class (``pos`` / ``neg``),
    without repeating an index already chosen.

    Args:
        wrong_answers: Indices the model answered incorrectly.
        right_answers: Indices the model answered correctly (not used; kept so
            existing call sites keep working).
        pos: Indices of positive-class examples.
        neg: Indices of negative-class examples.
        k: Target number of samples per class.

    Returns:
        Positive-class samples followed by negative-class samples. Each part
        holds at most ``k`` indices, fewer when the class is too small.
    """
    def _sample_class(members):
        # Set lookup avoids rescanning the member list for every wrong answer.
        lookup = set(members)
        wrong_in_class = [i for i in wrong_answers if i in lookup]
        chosen = random.sample(wrong_in_class, min(k, len(wrong_in_class)))
        if len(chosen) < k:
            already = set(chosen)
            pool = [i for i in members if i not in already]
            chosen += random.sample(pool, min(k - len(chosen), len(pool)))
        return chosen

    random_sample_pos = _sample_class(pos)
    random_sample_neg = _sample_class(neg)

    print(f"random_sample_pos(Wrong ans) : {len(random_sample_pos)}")
    print(f"random_sample_neg(Wrong ans) : {len(random_sample_neg)}")
    return random_sample_pos + random_sample_neg

##### Step 3: Extract hints from the samples (temperature 0.1, top_p 0.95)

In [27]:
def get_hints_residual_step2(answers,use_api=True):
    """Ask the model for a hint explaining the expected output of each sample.

    Args:
        answers: Indices into ``train_data`` to generate hints for.
        use_api: Forwarded to ``get_answer_llm``.

    Returns:
        Dict mapping each index to its extracted hint. Indices whose response
        contained no ``<hint>...</hint>`` span are left out.

    Raises:
        ValueError: If the global ``task_type`` is not a supported task.
    """
    hints={}

    for idx in answers:
        scores = train_data[idx]['target_scores']
        # Expected label text. Every supported task is handled explicitly so
        # `ans` can never be unbound or carried over from an earlier iteration.
        if task_type=='entailment':
            ans = 'entailment' if scores['entailment']==1 else 'non-entailment'
        elif task_type=='implicatures':
            ans = 'yes' if scores['yes']==1.0 else 'no'
        elif task_type=='hyperbaton':
            ans = 'b' if scores['b']==1 else 'a'
        else:
            raise ValueError(f"Unsupported task_type: {task_type!r}")

        prompt_h='Given following task:'+task_description+'\n'+'Given input: '+train_data[idx]['input']+'\n'+'And its expected output: '+ans+'\n'+'List the reason or hint why its with this expected output within tag <hint> and </hint>. The hint or explaination should be necassarily between the tags.'

        op=get_answer_llm(prompt_h,use_api, temp=0.1,topP=0.95)
        hint=extract_hint(len(prompt_h) if LLM=="llama" else 0,op)

        # Occasional debug dump of the raw response.
        if idx%1000==0:
            print(op,len(op))

        if hint!='':
            hints[idx]=hint

    return hints

##### Step 4: Summarize the hints

In [29]:
def get_summarise_step4(hints,use_api=True):
    """Ask the model to condense the per-example hints into one general summary.

    Args:
        hints: Dict mapping ``train_data`` indices to hint text.
        use_api: Forwarded to ``get_answer_llm``.

    Returns:
        The model's response with leading whitespace removed.

    Raises:
        ValueError: If the global ``task_type`` is not a supported task.
    """
    prompt_s="This is a task to "+task_description+". We have some expected input and output pairs and have asked labeler to give reason or hint for each expected output. Given following data each contains input, output and reason for the expected output, summarize a general reason for all these cases:"+'\n'

    examples_block=''
    for idx in hints:
        scores = train_data[idx]['target_scores']
        # Expected label text. Every supported task is handled explicitly so
        # `ans` can never be unbound or carried over from an earlier iteration.
        if task_type=='entailment':
            ans = 'entailment' if scores['entailment']==1 else 'non-entailment'
        elif task_type=='implicatures':
            ans = 'yes' if scores['yes']==1.0 else 'no'
        elif task_type=='hyperbaton':
            ans = 'b' if scores['b']==1 else 'a'
        else:
            raise ValueError(f"Unsupported task_type: {task_type!r}")

        entry='Given input: '+train_data[idx]['input']+'\n'+'And its expected output: '+ans+'. And the reason for the expected output: '+hints[idx]+'\n'
        examples_block+=entry+'\n'

    prompt_s+=examples_block+'\n'+"Give a summary of the reasons for the example output, and do not give a reason particular to the respective example. Also do not mention the number of examples nor give any reference to entities in examples in the summary directly or indirectly. Be as general as possible. The summarised reasons are:"

    op=get_answer_llm(prompt_s,use_api)
    return op.lstrip()

#### Algorithm Run

In [30]:
#initial prompt
# Starting prompt for the optimisation loop; '<INPUT>' is the placeholder that
# gets replaced with each example's text.
prompt_t = f"{task_description}\nGiven input: <INPUT>\n{task_prefix}Put your one-word answer choosing from the previously stated two options within tag <Ans> and </Ans>. The one word answer should be necassarily between the tags. Also provide the confidence score for the answer within tag <conf> and </conf>"

In [1]:
num_iterations_T=10
succesive_prompts=[]
file_path = "results.txt"

start = time.time()
for t in range(num_iterations_T):
    # Step 1: Inference
    # task_type must be passed by keyword: the second positional parameter of
    # run_inference_step1 is use_api, so a positional task_type landed there.
    wrong_answers,right_answers,pos,neg,ng=run_inference_step1(prompt_t,task_type=task_type)

    # Step 2: Sampling
    samples = random_balanced_sampling(wrong_answers,right_answers,pos,neg)

    # Step 3: Generate Hints
    hints=get_hints_residual_step2(samples)

    # Step 4: Summarize Hints
    summarised_prompt=get_summarise_step4(hints)

    final_prompt=task_description+"\n"+'Some useful hints are: '+summarised_prompt+'\n'+'Given input: '+'<INPUT>'+'\n'+task_prefix+'Put your one-word answer choosing from the previously stated two options within tag <Ans> and </Ans>. The one word answer should be necassarily between the tags.'
    succesive_prompts.append(final_prompt)
    prompt_t=final_prompt

    # The counts logged below come from the prompt evaluated at the start of
    # this iteration, while the prompt text written is the newly built one.
    with open(file_path, "a") as file:
        # Write the string to the file
        file.write(f"Iteration {str(t)} \n {prompt_t} \n\n Wrong Ans: {len(wrong_answers)} \n Correct Ans: {len(right_answers)} \n Not got: {len(ng)} \n Total time : {time.time()-start} \n------\n\n ")

end = time.time()
print("Time taken",end-start)

#### Test 

In [33]:
def test_out(final_prompt_totest,use_api=True):
    """Evaluate a prompt template on every example in ``test_data``.

    Each example's text replaces '<INPUT>' in the template, the model's reply
    is parsed with ``extract_answer`` and compared with the gold label derived
    from ``target_scores`` for the global ``task_type``.

    Returns:
        (wrong indices, indices with no usable answer, correct indices).
    """
    # target_scores key whose value of 1 marks the negative class, per task.
    negative_keys = {'entailment': 'non-entailment', 'hyperbaton': 'b', 'implicatures': 'no'}

    failures, unanswered, successes = [], [], []
    for position, example in enumerate(test_data):
        filled = final_prompt_totest.replace('<INPUT>', example['input'])
        reply = get_answer_llm(filled, use_api)
        if reply == '':
            unanswered.append(position)
            continue

        # The prompt length is skipped only for the llama backend.
        skip = len(filled) if LLM == "llama" else 0
        predicted = extract_answer(skip, reply, task_type)
        if predicted == -1:
            unanswered.append(position)
            continue

        neg_key = negative_keys.get(task_type)
        is_negative = neg_key is not None and example['target_scores'][neg_key] == 1
        expected = 0 if is_negative else 1

        bucket = successes if predicted == expected else failures
        bucket.append(position)

    return failures, unanswered, successes

In [None]:
# succesive_prompts[-1]

In [None]:
# Hint-free prompt template (no confidence request), shown for inspection.
prompt_t = "".join([
    task_description, '\n',
    'Given input: ', '<INPUT>', '\n',
    task_prefix,
    'Put your one-word answer choosing from the previously stated two options within tag <Ans> and </Ans>. The one word answer should be necassarily between the tags.',
])
prompt_t

In [None]:
# Prompt template with fixed hints for both classes; '<INPUT>' is the placeholder.
# The garbled "does iet guarantee" is restored to "does not guarantee": without
# the negation the non-entailment hint stated the opposite of its meaning.
paper_prompt = (
    "Determine whether one sentence entails the next\n"
    "Some useful hints are:\n"
    "-Entailment occurs when the hypothesis is a logical consequence of the premise, or when the premise guarantees the truth of the hypothesis, regardless of the level of specificity or simplification of the terms Involved.\n"
    "-Non-entailment occurs when the premise does not guarantee the truth of the hypothesis, or when there is a possibility that the hypothesis is false or unknown, especially when the premise involves beliefs or thoughts of other people.\n"
    "Given input: <INPUT>\n"
    "Identify the relation between the following premises and hypotheses, choosing from the options 'entailment' or 'non-entailment'.\n"
    "Put your one-word answer choosing from the previously stated two options within tag <Ans> and </Ans>. The one word answer should be necassarily between the tags."
)

In [38]:
# Evaluate a prompt on the test split and print the elapsed wall-clock seconds.
# NOTE(review): `last_prompt` is not assigned in any cell visible in this
# notebook, so this cell fails on a fresh kernel. Presumably it was meant to be
# succesive_prompts[-1] (or one of the prompts defined just above) — confirm
# and assign it explicitly before this call.
start = time.time()
wrong_ans_indices_test,not_got_test,correct_ans_indices=test_out(last_prompt)
print(time.time()-start)

147.88107562065125


##### Accuracy

In [44]:
# Plain accuracy over the whole test split. The denominator is len(test_data),
# so examples with no usable answer lower the score just like wrong answers.
accuracy = len(correct_ans_indices)/len(test_data)
accuracy

0.48

##### Balanced Accuracy

In [45]:
# Indices of the test examples for each gold label.
pos_test = [
    idx for idx, example in enumerate(test_data)
    if example['target_scores']['entailment'] == 1
]
neg_test = [
    idx for idx, example in enumerate(test_data)
    if example['target_scores']['non-entailment'] == 1
]

# Class counts: positives first, then negatives.
for bucket in (pos_test, neg_test):
    print(len(bucket))

127
173


In [46]:
# Gold labels: 1 for entailment, 0 otherwise. The length follows the data
# instead of a hard-coded 300.
y_test = [0] * len(test_data)
for i in pos_test:
    y_test[i] = 1

# Predictions start as a copy of the gold labels and are flipped wherever the
# model was wrong. Examples with no usable answer are flipped as well, so they
# count as errors here just as they do in the plain accuracy; previously they
# silently kept the gold label and were scored as correct.
y_pred = copy.deepcopy(y_test)
for i in list(wrong_ans_indices_test) + list(not_got_test):
    y_pred[i] = 1 - y_pred[i]

In [52]:
from sklearn.metrics import balanced_accuracy_score
# Balanced accuracy of the reconstructed predictions against the gold labels;
# this complements the plain accuracy because the two classes differ in size.
balanced_accuracy_score(y_test, y_pred)

0.527149424241045