In [27]:
import subprocess

# Ollama model tags; every prompt is run against each of these, in this order.
MODELS = ["llama3.2:3b", "tinyllama:latest"]


def call_ollama2(model, prompt):
    """Run ``prompt`` through ``model`` with the ``ollama`` CLI and return the reply.

    The prompt is passed as the positional argument of ``ollama run``
    (``ollama run MODEL [PROMPT]``). The CLI defines no ``-p`` option, so the
    earlier ``-p`` form made the command fail and produced an empty result.

    Args:
        model: Ollama model tag, e.g. ``"llama3.2:3b"``.
        prompt: Text to send to the model.

    Returns:
        The stripped stdout of the command on success. Failures never raise;
        they are reported as a bracketed marker instead:
        ``"[ERROR: Timeout]"`` when the command runs longer than 60 seconds,
        ``"[ERROR: <detail>]"`` for a non-zero exit status (detail is stderr,
        or the exit code when stderr is empty) or any other exception.
    """
    try:
        result = subprocess.run(
            ["ollama", "run", model, prompt],
            capture_output=True,
            text=True,
            encoding="utf-8",   # do not depend on the platform locale
            errors="replace",   # malformed bytes must not abort the call
            timeout=60,  # avoid hanging
        )
    except subprocess.TimeoutExpired:
        return "[ERROR: Timeout]"
    except Exception as e:
        return f"[ERROR: {e}]"

    # A failed run writes its diagnostics to stderr and leaves stdout empty;
    # surface that instead of returning a silent empty string.
    if result.returncode != 0:
        detail = (result.stderr or "").strip() or f"exit code {result.returncode}"
        return f"[ERROR: {detail}]"
    return result.stdout.strip()

def call_ollama(model, prompt) -> str:
    """Send ``prompt`` to ``model`` via ``ollama run`` on stdin and return the reply.

    Args:
        model: Ollama model tag, e.g. ``"llama3.2:3b"``.
        prompt: Text written (UTF-8 encoded) to the process's standard input.

    Returns:
        The command's stdout decoded as UTF-8 and stripped. Undecodable bytes
        are replaced rather than raising.
        ``"Unknown"`` if the command does not finish within 30 seconds.
        ``"[ERROR: <detail>]"`` if the command exits with a non-zero status
        (detail is stderr, or the exit code when stderr is empty), so a failed
        run is not mistaken for an empty answer.
    """
    try:
        result = subprocess.run(
            ["ollama", "run", model],
            input=prompt.encode("utf-8"),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            timeout=30,
        )
    except subprocess.TimeoutExpired:
        return "Unknown"

    # On failure stdout is typically empty and the reason is on stderr.
    if result.returncode != 0:
        detail = result.stderr.decode("utf-8", errors="replace").strip()
        if not detail:
            detail = f"exit code {result.returncode}"
        return f"[ERROR: {detail}]"
    return result.stdout.decode("utf-8", errors="replace").strip()
        
def evaluate_responses(prompts):
    """Query every model in ``MODELS`` with every prompt.

    Args:
        prompts: Iterable of prompt strings.

    Returns:
        A list with one dict per prompt, in input order. Each dict maps
        ``"prompt"`` to the prompt text and each model tag to the string
        returned by ``call_ollama`` for that model.
    """
    def _ask(model_tag, text):
        # Announce the call before making it so progress is visible.
        print(f"\nRunning prompt on {model_tag}...")
        return call_ollama(model_tag, text)

    table = []
    for text in prompts:
        answers = {"prompt": text}
        answers.update({tag: _ask(tag, text) for tag in MODELS})
        table.append(answers)
    return table

def print_side_by_side(results):
    """Print each prompt followed by every model's answer to it.

    Args:
        results: Iterable of dicts, each holding the prompt text under
            ``"prompt"`` and one answer string per model tag in ``MODELS``.
    """
    for r in results:
        # Green circle marks the prompt, blue circle marks each model's answer.
        print(f"\n🟢 Prompt: {r['prompt']}")
        for model in MODELS:
            print(f"\n🔵 {model}:\n{r[model]}")

def main(prompts_path="prompts.txt"):
    """Load prompts from a text file, evaluate them, and print the answers.

    Args:
        prompts_path: Path of a text file with one prompt per line. Blank
            lines are skipped and surrounding whitespace is removed.
            Defaults to ``"prompts.txt"`` in the working directory.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Read explicitly as UTF-8 so non-ASCII prompts are not garbled on
    # platforms whose default locale encoding is something else.
    with open(prompts_path, encoding="utf-8") as f:
        prompts = [line.strip() for line in f if line.strip()]

    results = evaluate_responses(prompts)
    print_side_by_side(results)

# Run the full comparison as soon as this cell executes.
main()



Running prompt on llama3.2:3b...

Running prompt on tinyllama:latest...

Running prompt on llama3.2:3b...

Running prompt on tinyllama:latest...

Running prompt on llama3.2:3b...

Running prompt on tinyllama:latest...

Running prompt on llama3.2:3b...

Running prompt on tinyllama:latest...

Running prompt on llama3.2:3b...

Running prompt on tinyllama:latest...

🟢 Prompt: What is the capital of Australia?

🔵 llama3.2:3b:


🔵 tinyllama:latest:
The capital of Australia is Canberra, located in the nation's Capital Territory.

🟢 Prompt: Explain the difference between TCP and UDP.

🔵 llama3.2:3b:


🔵 tinyllama:latest:
In computer networking, there are two main protocols for transmitting data:

1. Transmission Control Protocol (TCP) - This is used to establish a connection between two hosts, ensure data transmission reliability, and protect against packet loss and other communication errors. It provides a layer of abstraction for handling complex network traffic.

2. User Datagram Pro

In [29]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Reference translation(s): a list holding one token list per reference.
reference = ["the cat is on the mat".split()]

# Candidate tokens, standing in for text generated by an LLM.
candidate = "the cat the cat on the mat".split()

# Smoothing (method1) keeps short texts from scoring 0.
smoother = SmoothingFunction().method1

# Default weights are used, i.e. n-grams up to order 4.
score = sentence_bleu(
    references=reference,
    hypothesis=candidate,
    smoothing_function=smoother,
)

print(f"BLEU score: {score:.4f}")


BLEU score: 0.2056


In [31]:
try:
    from rouge_score import rouge_scorer
except ImportError:
    %pip install rouge-score
    from rouge_score import rouge_scorer
    
# Scorer covering ROUGE-1, ROUGE-2 and ROUGE-L, with stemming enabled.
scorer = rouge_scorer.RougeScorer(
    rouge_types=['rouge1', 'rouge2', 'rougeL'],
    use_stemmer=True,
)

# Human-written reference text and the generated candidate to compare with it.
reference = "The cat sat on the mat."
candidate = "The cat lay on the dirty rug."

# score() takes the reference first, then the candidate.
scores = scorer.score(target=reference, prediction=candidate)

# One header line per metric, followed by its precision / recall / F1.
for metric, score in scores.items():
    print(
        f"{metric}:",
        f"  Precision: {score.precision:.4f}",
        f"  Recall:    {score.recall:.4f}",
        f"  F1 score:  {score.fmeasure:.4f}",
        sep="\n",
    )

rouge1:
  Precision: 0.5714
  Recall:    0.6667
  F1 score:  0.6154
rouge2:
  Precision: 0.3333
  Recall:    0.4000
  F1 score:  0.3636
rougeL:
  Precision: 0.5714
  Recall:    0.6667
  F1 score:  0.6154
