In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen/Qwen3-0.6B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto"
)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/726 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.50G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Prompt 1 avg log prob: -11.3958
Prompt 2 avg log prob: -8.8229
Winner: Prompt 2


# Making LLMS RELIABLE




# 1.   Templatization of Prompt
# 2.   Comparing Prompts








#Templatization of Prompt

In [143]:
prompt = "What is a capital of India"
messages = [
    {"role": "user", "content": prompt}
]
for i in range(0,10):
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False # Switches between thinking and non-thinking modes. Default is True.
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    # conduct text completion
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=100
    )
    output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()

    # parsing thinking content
    try:
        # rindex finding 151668 (</think>)
        index = len(output_ids) - output_ids[::-1].index(151668)
    except ValueError:
        index = 0


    content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")


    print("content:", content)

content: India's capital is **New Delhi**.
content: India's capital is **Bhopal**.
content: The capital of India is **New Delhi**.
content: India does not have a capital in the sense of a "capital city" as it is a country, but it has a capital city, **Delhi**, which is the capital and largest city in India. The capital city is known as **Delhi**.
content: India has multiple capital cities, each with its own significance. The capital of India is **New Delhi**. It is a major political and economic center of the country.
content: The capital of India is **New Delhi**. It is a significant political, economic, and cultural center of the country.
content: India's capital is **New Delhi**.
content: The capital of India is **New Delhi**.
content: India's capital is **New Delhi**.
content: The capital of India is **New Delhi**.


In [162]:
text="""
<|im_start|>user
What is a capital of India <|im_end|>
<|im_start|>assistant
<think>

</think>

The capital of India is **"""

for i in range(0,10):
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=100
    )
    output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()

    # parsing thinking content
    try:
        # rindex finding 151668 (</think>)
        index = len(output_ids) - output_ids[::-1].index(151668)
    except ValueError:
        index = 0


    content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")


    print("content:", content)


content: New Delhi**.
content: New Delhi**.
content: New Delhi**.
content: New Delhi**.
content: New Delhi**.
content: New Delhi**.
content: New Delhi**.
content: New Delhi**.
content: New Delhi**.
content: New Delhi**.


In [131]:
from collections import defaultdict
import torch

def evaluate_prompts(model, tokenizer, questions, sample_data, answer_template,
                    base_template=None, verbose=True, top_k=5):
    """
    Evaluate multiple prompt questions by measuring probability of correct answers.

    Args:
        model: The language model
        tokenizer: The tokenizer
        questions: List of question templates (can contain {country} placeholder)
        sample_data: Dict of {input: expected_output} pairs
        answer_template: Template for the answer part (e.g., "The capital of {country} is ")
        base_template: Optional base template with {question} and {answer} placeholders
        verbose: Whether to print detailed results
        top_k: Number of top predictions to show

    Returns:
        Dict with results and best question
    """

    # Default template if none provided
    if base_template is None:
        base_template = """<|im_start|>user
{question}<|im_end|>
<|im_start|>assistant
<think>
</think>
{answer}"""

    # Store results for each question
    question_scores = defaultdict(list)
    question_details = defaultdict(list)

    for input_val, expected_output in sample_data.items():
        for question in questions:
            if verbose:
                print(f"\n{input_val} -> {expected_output}")
                print(f"Question: '{question}'")

            # Format the question (replace placeholders)
            formatted_question = question.format(country=input_val, input=input_val)

            # Format the answer template
            formatted_answer = answer_template.format(country=input_val, input=input_val)

            # Create full text
            text = base_template.format(question=formatted_question, answer=formatted_answer)

            # Tokenize and get model prediction
            model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
            correct_token = tokenizer(expected_output).to(model.device)['input_ids'][0]

            if verbose:
                print(f"Correct Token: {correct_token}")

            with torch.no_grad():
                outputs = model(**model_inputs)
                logits = outputs.logits[0, -1, :]
                probs = torch.softmax(logits, dim=-1)
                target_prob = probs[correct_token].item()

                # Store the probability for this question
                question_scores[question].append(target_prob)
                question_details[question].append({
                    'input': input_val,
                    'expected': expected_output,
                    'prob': target_prob
                })

                if verbose:
                    top_k_probs, top_k_indices = torch.topk(probs, top_k)
                    top_k_tokens = [tokenizer.decode([idx]) for idx in top_k_indices]

                    print(f"Target probability: {target_prob:.4f}")
                    for token, prob in zip(top_k_tokens, top_k_probs):
                        print(f"'{token}': {prob.item():.4f}")

    # Calculate average scores for each question
    question_averages = []
    for question, scores in question_scores.items():
        avg_score = sum(scores) / len(scores)
        question_averages.append((question, avg_score, scores))

    # Sort by average score (highest first)
    question_averages.sort(key=lambda x: x[1], reverse=True)

    # Analyze results for anomalies and insights
    analysis = _analyze_results(question_averages, question_details, sample_data)

    if True:
        print("\n" + "="*60)
        print("QUESTION RANKING BY AVERAGE PROBABILITY:")
        print("="*60)

        for i, (question, avg_score, scores) in enumerate(question_averages):
            print(f"\n{i+1}. Question: '{question}'")
            print(f"   Average probability: {avg_score:.4f}")
            print(f"   Individual scores: {[f'{s:.3f}' for s in scores]}")
            print(f"   Std deviation: {analysis['std_devs'][question]:.4f}")

            # Show anomalies for this question
            if question in analysis['anomalies']:
                print(f"   ⚠️  Anomalies: {analysis['anomalies'][question]}")

        if question_averages:
            print(f"\n🏆 BEST QUESTION: '{question_averages[0][0]}'")
            print(f"🏆 BEST AVERAGE SCORE: {question_averages[0][1]:.4f}")

        # Print detailed analysis
        print("\n" + "="*60)
        print("DETAILED ANALYSIS:")
        print("="*60)
        print(f"📊 Total evaluations: {len(sample_data) * len(questions)}")
        print(f"📈 Score range: {analysis['min_score']:.4f} - {analysis['max_score']:.4f}")
        print(f"📉 Most consistent question: {analysis['most_consistent_question']}")
        print(f"📈 Most variable question: {analysis['most_variable_question']}")

        if analysis['worst_performers']:
            print(f"\n🔴 Worst performing examples:")
            for item in analysis['worst_performers'][:3]:
                print(f"   {item['input']} -> {item['expected']}: {item['prob']:.4f} (Question: '{item['question'][:30]}...')")

        if analysis['best_performers']:
            print(f"\n🟢 Best performing examples:")
            for item in analysis['best_performers'][:3]:
                print(f"   {item['input']} -> {item['expected']}: {item['prob']:.4f} (Question: '{item['question'][:30]}...')")

        if analysis['problematic_inputs']:
            print(f"\n⚠️  Inputs that consistently perform poorly:")
            for input_val, avg_prob in analysis['problematic_inputs']:
                print(f"   {input_val}: avg prob {avg_prob:.4f}")

        if analysis['recommendations']:
            print(f"\n💡 RECOMMENDATIONS:")
            for rec in analysis['recommendations']:
                print(f"   • {rec}")

    # Always print summary even if not verbose
    if not verbose and question_averages:
        print(f"Best question: '{question_averages[0][0]}' (score: {question_averages[0][1]:.4f})")

    return {
        'rankings': question_averages,
        'detailed_scores': question_details,
        'best_question': question_averages[0][0] if question_averages else None,
        'best_score': question_averages[0][1] if question_averages else None,
        'analysis': analysis
    }

def _analyze_results(question_averages, question_details, sample_data):
    """Analyze results to find anomalies and provide insights"""
    import statistics

    analysis = {
        'anomalies': {},
        'std_devs': {},
        'worst_performers': [],
        'best_performers': [],
        'problematic_inputs': [],
        'most_consistent_question': None,
        'most_variable_question': None,
        'recommendations': []
    }

    all_scores = []

    # Calculate standard deviations and find anomalies
    for question, avg_score, scores in question_averages:
        std_dev = statistics.stdev(scores) if len(scores) > 1 else 0
        analysis['std_devs'][question] = std_dev
        all_scores.extend(scores)

        # Find anomalies (scores that are 2 std devs away from mean)
        if std_dev > 0:
            anomalies = []
            for i, score in enumerate(scores):
                if abs(score - avg_score) > 2 * std_dev:
                    input_key = list(sample_data.keys())[i]
                    anomalies.append(f"{input_key}({score:.3f})")
            if anomalies:
                analysis['anomalies'][question] = anomalies

    # Find most/least consistent questions
    if analysis['std_devs']:
        analysis['most_consistent_question'] = min(analysis['std_devs'], key=analysis['std_devs'].get)
        analysis['most_variable_question'] = max(analysis['std_devs'], key=analysis['std_devs'].get)

    # Collect all individual results for worst/best performers
    all_results = []
    for question, details in question_details.items():
        for detail in details:
            all_results.append({
                'question': question,
                'input': detail['input'],
                'expected': detail['expected'],
                'prob': detail['prob']
            })

    # Sort to find worst and best performers
    all_results.sort(key=lambda x: x['prob'])
    analysis['worst_performers'] = all_results[:5]
    analysis['best_performers'] = all_results[-5:]

    # Find problematic inputs (consistently low across all questions)
    input_scores = {}
    for result in all_results:
        if result['input'] not in input_scores:
            input_scores[result['input']] = []
        input_scores[result['input']].append(result['prob'])

    input_avg_scores = [(inp, sum(scores)/len(scores)) for inp, scores in input_scores.items()]
    input_avg_scores.sort(key=lambda x: x[1])
    analysis['problematic_inputs'] = input_avg_scores[:2]  # Bottom 2

    # Calculate overall stats
    if all_scores:
        analysis['min_score'] = min(all_scores)
        analysis['max_score'] = max(all_scores)
        overall_avg = sum(all_scores) / len(all_scores)

        # Generate recommendations
        if analysis['max_score'] < 0.5:
            analysis['recommendations'].append("Overall scores are low. Consider revising the answer template or using a different model.")

        if len(question_averages) > 1:
            best_score = question_averages[0][1]
            worst_score = question_averages[-1][1]
            if best_score - worst_score > 0.2:
                analysis['recommendations'].append("Large variation between questions. The prompt format significantly affects performance.")

        if analysis['most_variable_question'] and analysis['std_devs'][analysis['most_variable_question']] > 0.3:
            analysis['recommendations'].append(f"Question '{analysis['most_variable_question'][:30]}...' shows high variance. Check for ambiguous wording.")

        if len(analysis['problematic_inputs']) > 0:
            problematic = analysis['problematic_inputs'][0][0]
            analysis['recommendations'].append(f"Input '{problematic}' consistently performs poorly. Check if the expected output is correct.")

    return analysis



In [134]:
# Example usage:
if __name__ == "__main__":
    # Your data
    sample_data = {
        "India": "New",
        "United States": "Washington",
        "Germany": "Berlin",
        "France": "Paris",
        "Russia": "Mos",
    }

    questions=[
    "what is the capital of {country} ?",
    "what is the capital of {country}",
    "what is the capital of {country} ",
    "tell me the captial of {country}",
    "mention the captial of {country}",
    "capital of {country} ?"
    ]

    answer_template = "The capital of {country} is **"

    # Run evaluation
    results = evaluate_prompts(
        model=model,  # Your model
        tokenizer=tokenizer,  # Your tokenizer
        questions=questions,
        sample_data=sample_data,
        answer_template=answer_template,
        verbose=False
    )

    # Access results
    best_question = results['best_question']
    best_score = results['best_score']


QUESTION RANKING BY AVERAGE PROBABILITY:

1. Question: 'mention the captial of {country}'
   Average probability: 0.9531
   Individual scores: ['0.918', '0.996', '0.906', '0.965', '0.980']
   Std deviation: 0.0393

2. Question: 'capital of {country} ?'
   Average probability: 0.9375
   Individual scores: ['0.879', '0.996', '0.891', '0.945', '0.977']
   Std deviation: 0.0516

3. Question: 'tell me the captial of {country}'
   Average probability: 0.9344
   Individual scores: ['0.898', '0.996', '0.836', '0.961', '0.980']
   Std deviation: 0.0664

4. Question: 'what is the capital of {country}'
   Average probability: 0.9219
   Individual scores: ['0.863', '0.996', '0.828', '0.941', '0.980']
   Std deviation: 0.0734

5. Question: 'what is the capital of {country} '
   Average probability: 0.9141
   Individual scores: ['0.863', '0.996', '0.793', '0.938', '0.980']
   Std deviation: 0.0850

6. Question: 'what is the capital of {country} ?'
   Average probability: 0.9133
   Individual scores

In [139]:
math_data = {
    "2 + 3": "5",
    "4 + 1": "5",
    "6 - 2": "4",
    "3 + 2": "5",
    "7 - 3": "4"
}

math_questions = [
    "What is {input}?",
    "What is {input} ? ",
    "Calculate {input}",
    "The result of {input} is",
    "Solve: {input}",
    "{input} equals",

]

math_answer_template = "The answer of {input}="


results = evaluate_prompts(
        model=model,  # Your model
        tokenizer=tokenizer,  # Your tokenizer
        questions=math_questions,
        sample_data=math_data,
        answer_template=math_answer_template,
        verbose=False
    )

    # Access results
best_question = results['best_question']
best_score = results['best_score']


QUESTION RANKING BY AVERAGE PROBABILITY:

1. Question: 'What is {input}?'
   Average probability: 0.9062
   Individual scores: ['0.875', '0.898', '0.902', '0.930', '0.926']
   Std deviation: 0.0223

2. Question: 'What is {input} ? '
   Average probability: 0.8148
   Individual scores: ['0.809', '0.828', '0.762', '0.863', '0.812']
   Std deviation: 0.0367

3. Question: '{input} equals'
   Average probability: 0.7594
   Individual scores: ['0.766', '0.809', '0.676', '0.840', '0.707']
   Std deviation: 0.0683

4. Question: 'Calculate {input}'
   Average probability: 0.6996
   Individual scores: ['0.715', '0.750', '0.498', '0.805', '0.730']
   Std deviation: 0.1177

5. Question: 'Solve: {input}'
   Average probability: 0.6695
   Individual scores: ['0.668', '0.664', '0.617', '0.711', '0.688']
   Std deviation: 0.0347

6. Question: 'The result of {input} is'
   Average probability: 0.0214
   Individual scores: ['0.027', '0.029', '0.008', '0.028', '0.016']
   Std deviation: 0.0094

🏆 BEST Q