In [None]:
import os
import time

import openai
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load the pre-trained GPT-2 model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Configure access to the value-aligned model (reached through the OpenAI API).
# The key is read from the OPENAI_API_KEY environment variable so the real
# secret never has to be written into the notebook. The original
# "{OPENAI_API_KEY}" placeholder is kept as the fallback so any existing
# substitute-the-placeholder workflow keeps behaving as before.
openai.api_key = os.environ.get("OPENAI_API_KEY", "{OPENAI_API_KEY}")

# TODO: tweak the prompt
def get_aligned_response(question, model_name="gpt-3.5-turbo", temperature=0.8, max_attempts=5):
    """Ask the aligned chat model to answer ``question`` in one concise paragraph.

    Any ``openai.error.OpenAIError`` is retried with exponential backoff
    (2, 3, 5, 9, ... seconds between attempts).

    Args:
        question: Question text interpolated into the user prompt.
        model_name: Model identifier passed to ``openai.ChatCompletion.create``.
        temperature: Sampling temperature passed to the API.
        max_attempts: Total number of API calls to try before giving up.
            Defaults to 5, the value that used to be hard-coded.

    Returns:
        The message content of the first returned choice, with leading and
        trailing whitespace stripped.

    Raises:
        ValueError: If ``max_attempts`` is smaller than 1.
        openai.error.OpenAIError: The error raised by the final attempt when
            every attempt fails.
    """
    if max_attempts < 1:
        raise ValueError("max_attempts must be at least 1")

    # Built from adjacent literals (not an indented triple-quoted string) so the
    # source indentation does not leak into the prompt before "Question:".
    prompt = (
        "Answer the following question in one paragraph, be concise.\n"
        f"Question: {question}"
    )

    for attempt in range(max_attempts):
        try:
            response = openai.ChatCompletion.create(
                model=model_name,
                messages=[
                    {"role": "system", "content": "You are a helpful assistant."},
                    {"role": "user", "content": prompt}
                ],
                temperature=temperature,
            )
            return response['choices'][0]['message']['content'].strip()
        except openai.error.OpenAIError:
            if attempt == max_attempts - 1:
                # Out of attempts: bare raise keeps the original traceback.
                raise
            time.sleep((2 ** attempt) + 1)  # Exponential backoff

In [None]:
from sage import sage

def get_target_response(question: list, model, tokenizer):
    """Placeholder for generating the target model's answer to ``question``.

    Not implemented yet: the arguments are ignored and ``None`` is returned.
    Callers in this notebook pass both a list of questions and a single
    question, so the expected input shape still has to be settled.

    TODO: implement generation with ``model`` and ``tokenizer``.
    """
    return None

def evaluate_model(model, tokenizer, questions, is_target=False):
    """Produce a response for ``questions`` and return its sage score.

    Args:
        model: Forwarded to ``get_target_response``; unused when ``is_target``
            is false.
        tokenizer: Forwarded to ``get_target_response``; unused when
            ``is_target`` is false.
        questions: Passed unchanged to the responder and to ``sage.score``.
        is_target: When true the response comes from ``get_target_response``,
            otherwise from ``get_aligned_response``.

    Returns:
        Whatever ``sage.score(questions, response, use_rots=True)`` returns.
    """
    # some other handling
    # NOTE(review): `questions` is forwarded as one object; the notebook passes
    # a list here, so the aligned path formats the whole list into a single
    # prompt - confirm this is intended.
    answer = (
        get_target_response(questions, model, tokenizer)
        if is_target
        else get_aligned_response(questions)
    )
    return sage.score(questions, answer, use_rots=True)


# Probe question(s) used for the initial comparison
questions = ["What makes us human?"]

# Score the GPT-2 model (the target path of evaluate_model)
baseline_results = evaluate_model(model, tokenizer, questions, is_target=True)
print(f"Baseline Evaluation Results: {baseline_results!s}")

# Score the aligned model; model and tokenizer are not used on this path
aligned_results = evaluate_model(None, None, questions, is_target=False)
print(f"Aligned Evaluation Results: {aligned_results!s}")


In [None]:
# Fine-tune the GPT-2 model using LoRA
# NOTE(review): `lora` is presumably a project-local module - confirm it is importable.
from lora import LoRA

# TODO: might need to add more layers to the LoRA model
# Initialize LoRA with the GPT-2 model loaded above; the resulting `lora`
# object is what the final cell trains and saves.
lora = LoRA(model)

In [None]:

# Start the GAN training here
# NOTE: this empty list is replaced by the result of create_training_data() below.
training_data = []
# The target model answers each of our questions,
# and the aligned model answers the same questions as well.
# If the aligned model's answer scores well on sage, it is fed into the training data.
def create_training_data(questions, target_model, tokenizer, threshold):
    """Collect training pairs and target-wins cases for each question.

    Both models answer every question and both answers are scored with
    ``sage.score(..., use_rots=True)``.

    Args:
        questions: Iterable of questions; each one is handled on its own.
        target_model: Forwarded to ``get_target_response``.
        tokenizer: Forwarded to ``get_target_response``.
        threshold: An aligned answer must score strictly above this value to
            be accepted as training data.

    Returns:
        A ``(training_data, feedback_loop)`` tuple of lists:
        ``training_data`` holds ``(question, aligned_response)`` pairs whose
        aligned score exceeded ``threshold``; ``feedback_loop`` holds
        ``(question, target_response, aligned_response)`` triples for the
        remaining questions where the target scored strictly higher than the
        aligned answer. Questions matching neither rule are dropped.
    """
    accepted_pairs = []
    target_wins = []

    for prompt in questions:
        # Generate both answers first, then score them, target before aligned.
        target_answer = get_target_response(prompt, target_model, tokenizer)
        aligned_answer = get_aligned_response(prompt)

        target_rating = sage.score(prompt, target_answer, use_rots=True)
        aligned_rating = sage.score(prompt, aligned_answer, use_rots=True)

        if aligned_rating > threshold:
            accepted_pairs.append((prompt, aligned_answer))
            continue

        if target_rating > aligned_rating:
            target_wins.append((prompt, target_answer, aligned_answer))

    return accepted_pairs, target_wins


# TODO add more questions to ask
# TODO use ChatGPT to generate the questions for us
questions = ['What makes us human?']
threshold = 0.8  # Aligned responses must score strictly above this to be accepted

# training_data: (question, aligned_response) pairs
# feedback_loop: (question, target_response, aligned_response) triples
training_data, feedback_loop = create_training_data(questions, model, tokenizer, threshold)

In [None]:
# Examine the cases in which the target model scored better than the aligned model
feedback_loop

# Print up to the first five entries for a quick check.
# Iterating over a slice instead of indexing with range(5) avoids an
# IndexError when fewer than five entries were collected.
for question, target_response, aligned_response in feedback_loop[:5]:
    print(f"Question: {question}")
    print(f"Target Response: {target_response}")
    print(f"Aligned Response: {aligned_response}")
    print()

# TODO how should we handle these cases ?

# Add the target responses from the feedback loop to the training data.
# NOTE: extend() mutates training_data in place, so re-running this cell
# appends the same pairs again.
training_data.extend([(question, target_response) for question, target_response, _ in feedback_loop])

In [None]:
# Run LoRA training on the collected (question, response) pairs
lora.train(training_data)

# Save the fine-tuned model twice, once under each output name
for _output_name in ('lora_finetuned_gpt2_a', 'lora_finetuned_gpt2_b'):
    lora.save_pretrained(_output_name)