In [None]:
# Install required libraries (Colab)
!pip install -q transformers sentence-transformers

In [None]:
import numpy as np
import torch
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer

In [None]:
def load_model_and_tokenizer(model_name_or_path: str):
    """Load a causal language model and its tokenizer for local inference.

    The model is placed on CUDA when ``torch.cuda.is_available()`` reports a
    GPU and on the CPU otherwise.

    Args:
        model_name_or_path: Identifier or path passed unchanged to the
            ``from_pretrained`` loaders of the tokenizer and the model.

    Returns:
        A ``(model, tokenizer, device)`` tuple. The model has already been
        moved to ``device`` and switched to eval mode.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Loading model '{model_name_or_path}' on device: {device}")

    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

    # Half precision on the GPU, full precision on the CPU for compatibility.
    on_gpu = device.type == "cuda"
    model = AutoModelForCausalLM.from_pretrained(
        model_name_or_path,
        torch_dtype=torch.float16 if on_gpu else torch.float32,
    )
    model.to(device)
    model.eval()

    return model, tokenizer, device


def generate_answer(model, tokenizer, device, question: str, max_new_tokens: int = 128) -> str:
    """Generate an answer from the local LLM for the given question.

    The question is wrapped as ``"Question: <question>\\nAnswer:"``, tokenized,
    and passed to ``model.generate`` with nucleus sampling (``top_p=0.9``,
    ``temperature=0.7``). Only the newly generated tokens are decoded, so the
    prompt never appears in the returned text.

    Args:
        model: Object exposing ``generate(**inputs, ...)``.
        tokenizer: Callable tokenizer that also provides ``decode`` and
            ``eos_token_id``.
        device: Device the tokenized inputs are moved to before generation.
        question: Question text inserted into the prompt.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded continuation with special tokens skipped and surrounding
        whitespace stripped.
    """
    prompt = f"Question: {question}\nAnswer:"
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    prompt_length = inputs["input_ids"].shape[-1]

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            top_p=0.9,
            temperature=0.7,
            eos_token_id=tokenizer.eos_token_id,
        )

    # The generated sequence starts with the prompt tokens. Slice them off by
    # length instead of string-matching the decoded prompt: decoding does not
    # always reproduce the prompt text exactly, and a failed prefix match would
    # leave the whole prompt inside the answer.
    new_token_ids = output_ids[0][prompt_length:]
    return tokenizer.decode(new_token_ids, skip_special_tokens=True).strip()


# Loaded embedding models keyed by model name, so repeated calls (for example
# once per example in an evaluation loop) do not rebuild the model every time.
_EMBED_MODEL_CACHE = {}


def _get_embed_model(model_name: str):
    """Return the cached SentenceTransformer for ``model_name``, loading it on first use."""
    if model_name not in _EMBED_MODEL_CACHE:
        _EMBED_MODEL_CACHE[model_name] = SentenceTransformer(model_name)
    return _EMBED_MODEL_CACHE[model_name]


def cosine_similarity_embeddings(reference: str, hypothesis: str, model_name: str = "sentence-transformers/all-MiniLM-L6-v2") -> float:
    """Compute cosine similarity between reference and hypothesis sentence embeddings.

    Args:
        reference: Reference text.
        hypothesis: Text to compare against the reference.
        model_name: Name passed to ``SentenceTransformer``; the loaded model is
            cached per name and reused on later calls.

    Returns:
        ``dot(ref, hyp) / (||ref|| * ||hyp||)`` as a Python float, or ``0.0``
        when either embedding has zero norm.
    """
    embed_model = _get_embed_model(model_name)
    ref_emb, hyp_emb = embed_model.encode([reference, hypothesis])
    num = float(np.dot(ref_emb, hyp_emb))
    den = float(np.linalg.norm(ref_emb) * np.linalg.norm(hyp_emb))
    # Guard the division: a zero-norm embedding has no direction to compare.
    if den == 0.0:
        return 0.0
    return num / den


def read_prompt_reference_pairs(path: str):
    """Read (question, reference) pairs from a file of two-line examples.

    Expected format per example:
      Prompt: <question text>
      <reference answer>

    Blank lines before, between, and after examples are ignored. The line
    directly after a ``Prompt:`` line is always taken as the reference, even
    when it is empty. Leading and trailing whitespace is stripped from every
    line, and a UTF-8 byte-order mark at the start of the file is tolerated.

    Args:
        path: Path of the UTF-8 text file to read.

    Returns:
        A list of ``(question, reference)`` string tuples in file order.

    Raises:
        ValueError: If a non-blank line where a prompt is expected does not
            start with ``Prompt:``, or if the last prompt has no reference line.
    """
    # utf-8-sig drops a leading BOM, which would otherwise be glued to the
    # first "Prompt:" and make the prefix check fail.
    with open(path, "r", encoding="utf-8-sig") as f:
        lines = [line.strip() for line in f]

    pairs = []
    total = len(lines)
    i = 0
    while i < total:
        prompt_line = lines[i]
        if not prompt_line:
            # Blank separator or trailing newline: not part of any example.
            i += 1
            continue
        if not prompt_line.startswith("Prompt:"):
            raise ValueError(f"Expected line starting with 'Prompt:' at line {i+1}, got: {prompt_line!r}")
        if i + 1 >= total:
            raise ValueError(
                f"Prompt at line {i+1} has no reference line; each example must have exactly two lines."
            )
        question = prompt_line[len("Prompt:"):].strip()
        pairs.append((question, lines[i + 1]))
        i += 2

    return pairs

In [None]:
# Upload an input file with two-line blocks (Prompt / Reference)
from google.colab import files

uploaded = files.upload()
# `uploaded` is keyed by filename. Fail with a clear message instead of an
# IndexError when it is empty (presumably when the upload dialog is dismissed).
if not uploaded:
    raise RuntimeError("No file was uploaded; re-run this cell and choose an input file.")
# Only the first uploaded file is used.
input_filename = next(iter(uploaded))
print('Uploaded file:', input_filename)

In [None]:
# Configure model and generation settings
model_name = 'google/gemma-2-2b-it'  # change if needed
max_new_tokens = 128
seed = 0  # fixed so sampled generations can be repeated across Restart & Run All

model, tokenizer, device = load_model_and_tokenizer(model_name)
pairs = read_prompt_reference_pairs(input_filename)
print(f'Loaded {len(pairs)} examples from', input_filename)

# generate_answer samples tokens (do_sample=True), so seed PyTorch's RNG before
# the loop; without it every run writes different predictions.
# NOTE(review): some GPU kernels may still be nondeterministic — confirm if
# bit-exact repeatability matters.
torch.manual_seed(seed)

output_path = 'predictions.txt'
with open(output_path, 'w', encoding='utf-8') as out_f:
    for idx, (question, reference) in enumerate(pairs, start=1):
        hypothesis = generate_answer(model, tokenizer, device, question, max_new_tokens=max_new_tokens)
        cosine_sim = cosine_similarity_embeddings(reference=reference, hypothesis=hypothesis)
        # One record per example, terminated by a blank line.
        out_f.write(
            f'Example {idx}\n'
            f'Prompt: {question}\n'
            f'Reference: {reference}\n'
            f'Answer: {hypothesis}\n'
            f'CosineSimilarity: {cosine_sim:.4f}\n'
            '\n'
        )

print('Wrote predictions to', output_path)

In [None]:
# Download the predictions file to your local machine
# Uses `output_path` from the generation cell (run that cell first) so the
# download follows the configured path instead of a second hard-coded name.
from google.colab import files
files.download(output_path)