In [None]:
### Few-shot prompting

In [None]:
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd

def load_dataset(train_file, test_file):
    """Read the train and test CSV files and attach a ``winner`` label column.

    Each frame gets a ``winner`` column holding ``'A'`` on rows where
    ``winnerId`` equals ``candidateAId`` and ``'B'`` on every other row.

    Args:
        train_file: Path (or anything ``pd.read_csv`` accepts) of the train CSV.
        test_file: Path (or anything ``pd.read_csv`` accepts) of the test CSV.

    Returns:
        A ``(train_data, test_data)`` tuple of labelled DataFrames.
    """
    # Both files are read (train first) before any labelling happens.
    train_data, test_data = (pd.read_csv(path) for path in (train_file, test_file))

    for frame in (train_data, test_data):
        a_is_winner = frame['winnerId'] == frame['candidateAId']
        frame['winner'] = np.where(a_is_winner, 'A', 'B')

    return train_data, test_data

def create_prompt(role, candidateA, candidateB, examples):
    """Build the few-shot prompt text for one candidate pair.

    The prompt starts with the role, lists every row of ``examples`` as a
    solved comparison, and ends with the unsolved pair followed by an open
    ``Better candidate: `` slot.

    Args:
        role: Role description placed on the first line.
        candidateA: Transcript of the first candidate to be judged.
        candidateB: Transcript of the second candidate to be judged.
        examples: DataFrame whose rows provide ``candidateATranscript``,
            ``candidateBTranscript`` and ``winner``.

    Returns:
        The assembled prompt as a single string.
    """
    pieces = [f"Role: {role}\n\n", "Examples:\n"]

    # One solved demonstration per example row.
    for _, row in examples.iterrows():
        pieces.append(f"Candidate A: {row['candidateATranscript']}\n")
        pieces.append(f"Candidate B: {row['candidateBTranscript']}\n")
        pieces.append(f"Better candidate: {row['winner']}\n\n")

    # The pair to be judged, with the answer left blank.
    pieces.append(f"Candidate A: {candidateA}\n")
    pieces.append(f"Candidate B: {candidateB}\n")
    pieces.append("Better candidate: ")

    return "".join(pieces)

def get_embedding(model, tokenizer, text):
    """Embed ``text`` with ``model`` and return the first-token vector.

    The text is tokenized (truncated to at most 512 tokens), run through the
    model without gradient tracking, and the hidden state at position 0 --
    the [CLS] token for BERT-style tokenizers -- is returned.

    Args:
        model: Callable taking the tokenizer output as keyword arguments and
            returning an object with a ``last_hidden_state`` tensor.
        tokenizer: Callable producing a mapping of PyTorch tensors when
            invoked with ``return_tensors="pt"``.
        text: The string to embed.

    Returns:
        A NumPy array with size-1 dimensions squeezed out; for a single
        string this is a 1-D vector of the model's hidden size.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Tokenizer output is created on the CPU. If the model lives on another
    # device (e.g. after ``model.to("cuda")``) the forward pass would fail,
    # so move the inputs to wherever the model's weights are.
    parameters = getattr(model, "parameters", None)
    first_param = next(parameters(), None) if callable(parameters) else None
    if first_param is not None:
        inputs = {name: tensor.to(first_param.device) for name, tensor in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    # Use the [CLS] token embedding as the sentence representation.
    # ``.cpu()`` is a no-op on CPU tensors and is required before ``.numpy()``
    # when the model ran on an accelerator.
    return outputs.last_hidden_state[:, 0, :].squeeze().cpu().numpy()

def few_shot_prediction(model, tokenizer, role, candidateA, candidateB, examples):
    """Pick the candidate whose embedding is closest to the prompt embedding.

    The full few-shot prompt and each candidate line are embedded with
    ``get_embedding``; the candidate with the strictly higher cosine
    similarity to the prompt wins, and ties go to ``"B"``.

    NOTE(review): ``get_embedding`` truncates its input to 512 tokens, so with
    long transcripts the pair being judged may fall outside the embedded
    part of the prompt -- worth confirming on real data.

    Args:
        model: Encoder passed through to ``get_embedding``.
        tokenizer: Tokenizer passed through to ``get_embedding``.
        role: Role description for the prompt header.
        candidateA: Transcript of candidate A.
        candidateB: Transcript of candidate B.
        examples: DataFrame of solved examples used to build the prompt.

    Returns:
        ``"A"`` or ``"B"``.
    """
    def embed(text):
        return get_embedding(model, tokenizer, text)

    def similarity_to(reference, other):
        return cosine_similarity([reference], [other])[0][0]

    # Embeddings are computed in the order: prompt, candidate A, candidate B.
    prompt_vector = embed(create_prompt(role, candidateA, candidateB, examples))
    vector_a = embed(f"Candidate A: {candidateA}")
    vector_b = embed(f"Candidate B: {candidateB}")

    score_a = similarity_to(prompt_vector, vector_a)
    score_b = similarity_to(prompt_vector, vector_b)

    if score_a > score_b:
        return "A"
    return "B"

def main(train_file='train_dataset.csv', test_file='test_dataset.csv',
         num_examples=5, random_state=42):
    """Evaluate the few-shot predictor on the test set and print its accuracy.

    Args:
        train_file: CSV file the few-shot examples are sampled from.
        test_file: CSV file containing the pairs to predict.
        num_examples: Maximum number of few-shot examples to sample; capped
            at the number of available training rows.
        random_state: Seed used when sampling the few-shot examples.
    """
    # Load the data first so a missing or malformed file fails before the
    # (comparatively expensive) model load.
    train_data, test_data = load_dataset(train_file, test_file)

    if test_data.empty:
        # Nothing to score; avoids dividing by zero below.
        print("Test set is empty; no predictions were made.")
        return

    # Load BERT model and tokenizer
    model = BertModel.from_pretrained('bert-base-uncased')
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Select a few examples for few-shot learning. ``DataFrame.sample`` raises
    # when asked for more rows than exist, so cap the request.
    few_shot_examples = train_data.sample(
        n=min(num_examples, len(train_data)),
        random_state=random_state,
    )

    # Make predictions on the test set
    correct_predictions = 0
    for _, sample in test_data.iterrows():
        prediction = few_shot_prediction(
            model, tokenizer,
            sample['role'],
            sample['candidateATranscript'],
            sample['candidateBTranscript'],
            few_shot_examples,
        )
        correct_predictions += int(prediction == sample['winner'])

    accuracy = correct_predictions / len(test_data)
    print(f"Few-shot learning accuracy: {accuracy:.4f}")

# Run the evaluation only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()