# Prompt Template Prompting

Prompt Template Prompting refers to a technique where predefined templates are used to construct effective prompts that guide large language models (LLMs) to generate responses tailored to specific use cases. The templates typically contain static text combined with dynamic input variables, allowing for consistent, reusable, and customizable prompts.

Prompt templates are widely used across various domains, such as:
* **Question Generation**: Templates can generate quiz questions by filling in variables related to topics.
* **Text Summarization**: Static instructions combined with variable documents or inputs allow flexible summarization.
* **Coding Assistance**: Dynamic prompts help LLMs generate code snippets for different programming tasks.

## References:

* [OpenAI Documentation for Prompt Engineering](https://platform.openai.com/docs/guides/prompt-engineering)

## Running this code on MyBinder.org

Note: remember that you will need to **adjust CONFIG** with **proper URL and API_KEY**!

[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/GenILab-FAU/prompt-eng/HEAD?urlpath=%2Fdoc%2Ftree%2Fprompt-eng%2Fprompt_template.ipynb)


In [1]:
import json
import re
import time
import statistics
from _pipeline import create_payload, model_req

# Load previous best prompts with improved error handling
def load_prompt_history():
    """Load the saved prompt history from ``best_prompts.json``.

    Returns:
        dict: The stored history, or an empty dict when the file is missing,
        does not contain valid JSON, or does not hold a JSON object at the
        top level.
    """
    try:
        with open("best_prompts.json", "r") as file:
            history = json.load(file)
    except (FileNotFoundError, json.JSONDecodeError):
        print("Creating new prompt history database")
        return {}

    # The history is indexed by string keys after loading, so any other JSON
    # value (a list, a number, a bare string, ...) is unusable: start fresh
    # instead of handing back something that fails on first assignment.
    if not isinstance(history, dict):
        print("Creating new prompt history database")
        return {}

    return history

# Enhanced domain-specific terminology with importance weights.
# Each key is a phrase that enhanced_score_response looks for in a response
# using a case-insensitive substring test; for every phrase found, its value
# is added to that response's "keyword_score" metric.
important_terms = {
    "functional requirements": 15,
    "non-functional requirements": 15,
    "real-time study support": 10,
    "personalized tutoring": 10,
    "exam preparation": 10,
    "user experience": 12,
    "response time": 8,
    "scalability": 8,
    "security": 12,
    "integration": 10,
    "data privacy": 12,
    "accessibility": 10
}

def enhanced_score_response(response, metrics_only=False):
    """
    Score a response on length, keyword coverage, readability and an AI critique.

    Args:
        response: Text of the model response to evaluate.
        metrics_only: When True, skip the AI critique request and compute the
            score from the locally calculated metrics only.

    Returns:
        Tuple ``(final_score, detailed_metrics, critique)``. ``critique`` is
        the critic model's reply, or the string "No critique requested" when
        ``metrics_only`` is True. ``final_score`` is an unbounded weighted sum,
        not a percentage.
    """
    # Initialize metrics dictionary
    metrics = {
        "word_count": len(response.split()),
        "keyword_matches": 0,
        "keyword_score": 0,
        "ai_critique_score": 0,
        "readability_score": 0
    }

    # Calculate keyword presence and weighted score. The response is
    # lower-cased once here rather than once per term.
    response_lower = response.lower()
    for term, weight in important_terms.items():
        if term.lower() in response_lower:
            metrics["keyword_matches"] += 1
            metrics["keyword_score"] += weight

    # Simple readability metric based on average sentence length:
    # 8-30 words per sentence scores 10, shorter scores 5, longer scores 6.
    # A response with no sentences keeps the initial score of 0.
    sentences = [s.strip() for s in re.split(r'[.!?]', response) if s.strip()]
    if sentences:
        avg_sentence_length = statistics.mean([len(s.split()) for s in sentences])
        if avg_sentence_length < 8:
            metrics["readability_score"] = 5
        elif avg_sentence_length > 30:
            metrics["readability_score"] = 6
        else:
            metrics["readability_score"] = 10

    # If only want metrics without AI critique (for faster iterations)
    if metrics_only:
        metrics["final_score"] = (
            metrics["word_count"] * 0.2 +
            metrics["keyword_score"] * 2 +
            metrics["readability_score"] * 5
        )
        return metrics["final_score"], metrics, "No critique requested"

    # AI-based multidimensional critique
    critic_prompt = f"""
    Perform a detailed evaluation of the following requirement analysis response for an AI-powered Study Companion Discord Bot.
    
    Evaluate based on these specific criteria:
    1. Clarity (1-10): Is the response well-structured, organized logically, and easy to understand?
    2. Completeness (1-10): Does it thoroughly cover both functional and non-functional requirements?
    3. Technical Accuracy (1-10): Are the statements factually correct and aligned with software engineering best practices?
    4. Specificity (1-10): Does it provide concrete, actionable details rather than vague statements?
    5. Feasibility (1-10): Are the requirements realistic and implementable?
    
    Response to evaluate:
    {response}
    
    Format your response exactly as follows:
    Clarity: [score]/10
    Completeness: [score]/10
    Technical Accuracy: [score]/10
    Specificity: [score]/10
    Feasibility: [score]/10
    Overall: [average score]/10
    
    Explanation: [brief justification for scores and specific improvement suggestions]
    """

    # Call the AI model to critique the response
    payload_critic = create_payload(
        target="ollama",
        model="llama3.2:latest",
        prompt=critic_prompt,
        temperature=0.4  # Lower temperature for more consistent evaluation
    )

    _, critic_feedback = model_req(payload_critic)

    # NOTE(review): model_req is defined outside this block; guard in case it
    # hands back a non-string on failure so the regex search cannot raise.
    critic_text = critic_feedback if isinstance(critic_feedback, str) else ""

    # Extract all scores using regex. The pattern tolerates markdown emphasis
    # around the label or colon (e.g. "**Clarity:** 8/10"), optional spaces
    # around the slash, and any letter case, because the critic model does not
    # always reproduce the requested format exactly.
    scores = {}
    for criterion in ["Clarity", "Completeness", "Technical Accuracy", "Specificity", "Feasibility", "Overall"]:
        pattern = rf"{re.escape(criterion)}\s*\**\s*:\s*\**\s*(\d+(?:\.\d+)?)\s*/\s*10"
        match = re.search(pattern, critic_text, re.IGNORECASE)
        if match:
            scores[criterion.lower()] = float(match.group(1))

    # Calculate average score if "Overall" wasn't extracted
    if "overall" not in scores and len(scores) > 0:
        scores["overall"] = sum(scores.values()) / len(scores)

    # Use overall score or default to 5 when nothing could be parsed
    critic_score = scores.get("overall", 5)
    metrics["ai_critique_score"] = critic_score * 10  # Scale to 0-100

    # Calculate final weighted score. The multipliers are raw coefficients,
    # not percentage shares: word_count is unbounded, so the total has no
    # fixed upper limit.
    metrics["final_score"] = (
        metrics["word_count"] * 0.2 +
        metrics["keyword_score"] * 2 +
        metrics["ai_critique_score"] * 3 +
        metrics["readability_score"] * 5
    )

    return metrics["final_score"], metrics, critic_feedback

def generate_prompt_variations(base_prompt, n=3):
    """Generate multiple variations of a prompt to test effectiveness.

    Args:
        base_prompt: The prompt text to create variations of.
        n: Number of variations to request from the model.

    Returns:
        list: The variation texts that could be extracted from the model's
        reply (at most ``n``). If none could be extracted, a one-element list
        containing ``base_prompt``.
    """
    # Build the "Format your response as" section from n so the requested
    # layout always agrees with the number of variations asked for.
    ordinals = ("First", "Second", "Third")
    format_blocks = []
    for i in range(1, n + 1):
        label = ordinals[i - 1] if i <= len(ordinals) else f"Number {i}"
        format_blocks.append(
            f"    VARIATION {i}:\n    [{label} complete prompt variation]"
        )
    format_section = "\n    \n".join(format_blocks)

    variation_prompt = f"""
    Create {n} variations of the following prompt. Each variation should maintain the same goal
    but experiment with different structures, emphasis, or approaches to potentially improve results.
    Make each variation distinctly different in structure or strategy.
    
    Original prompt:
    {base_prompt}
    
    Format your response as:
{format_section}
    """

    payload = create_payload(
        target="ollama",
        model="llama3.2:latest",
        prompt=variation_prompt,
        temperature=0.8,  # Higher temperature for more creative variations
        num_predict=2048
    )

    _, variations_text = model_req(payload)

    # Extract variations using regex. Each capture stops at the next
    # "VARIATION <number>:" header of any number (or at the end of the text),
    # so a skipped or misnumbered header cannot cause one variation to
    # swallow the ones after it.
    variations = []
    for i in range(1, n + 1):
        pattern = rf"VARIATION {i}:(.*?)(?=VARIATION \d+:|\Z)"
        match = re.search(pattern, variations_text, re.DOTALL)
        if match:
            text = match.group(1).strip()
            # An empty capture is not a usable prompt.
            if text:
                variations.append(text)

    # If extraction failed, fall back to the unmodified base prompt
    if not variations:
        variations = [base_prompt]
        print("Failed to generate distinct variations")

    return variations

# Main optimization process
def optimize_prompt_pipeline():
    """Run the full prompt optimization loop and return the winning result.

    Steps:
        1. Ask the model for a base prompt from a fixed meta-prompt.
        2. Generate variations of that base prompt.
        3. Execute and score every variation.
        4. Refine the best-scoring variation using its critique.
        5. Execute and score the refined prompt.
        6. Pick the highest-scoring candidate overall, record it in
           ``best_prompts.json`` and print a summary.

    Returns:
        dict: The winning candidate with keys ``"prompt"``, ``"response"``,
        ``"score"``, ``"metrics"`` and ``"critique"``.
    """
    # Load prompt history at the beginning of the function
    prompt_history = load_prompt_history()

    # Step 1: Generate initial meta-prompt
    meta_prompt = (
        "Create a highly structured prompt that extracts detailed functional and non-functional "
        "requirements for an AI-powered Study Companion Discord Bot. The structured prompt should "
        "elicit comprehensive, technically precise, and actionable requirements spanning user "
        "interaction, performance criteria, security measures, and integration capabilities. "
        "Include specific sections for real-time study support, personalized tutoring, and exam preparation. "
        "The prompt should guide the response to be well-organized with clear categorization."
    )

    payload_meta = create_payload(
        target="ollama",
        model="llama3.2:latest",
        prompt=meta_prompt,
        temperature=0.7
    )
    _, base_prompt = model_req(payload_meta)
    print("✓ Generated base prompt")

    # Step 2: Generate multiple prompt variations
    prompt_variations = generate_prompt_variations(base_prompt, n=3)
    print(f"✓ Created {len(prompt_variations)} prompt variations")

    # Step 3: Test all variations
    results = []

    for i, prompt in enumerate(prompt_variations):
        print(f"Testing variation {i+1}...")

        # Execute prompt
        payload_exec = create_payload(
            target="ollama",
            model="llama3.2:latest",
            prompt=prompt,
            temperature=0.9,  # Slightly higher for creative requirements
            num_ctx=4096,
            num_predict=2048
        )
        _, response = model_req(payload_exec)

        # Score the response
        score, metrics, critique = enhanced_score_response(response)

        results.append({
            "prompt": prompt,
            "response": response,
            "score": score,
            "metrics": metrics,
            "critique": critique
        })

        print(f"  Score: {score:.2f}")

    # Highest-scoring variation; on ties max() keeps the earliest one, which
    # is the same element a stable descending sort would put first.
    best_result = max(results, key=lambda candidate: candidate["score"])

    # Step 4: Refine the best prompt based on critique
    refinement_prompt = f"""
    Refine this prompt based on the following critique of the response it generated.
    
    Original prompt:
    {best_result["prompt"]}
    
    Response critique:
    {best_result["critique"]}
    
    Create an improved version of the prompt that addresses the weaknesses identified in the critique
    while maintaining its strengths. Focus especially on eliciting more specific, actionable, and
    comprehensive requirements.
    """

    payload_refine = create_payload(
        target="ollama",
        model="llama3.2:latest",
        prompt=refinement_prompt,
        temperature=0.6
    )
    _, refined_prompt = model_req(payload_refine)
    print("✓ Generated refined prompt")

    # Step 5: Test the refined prompt
    payload_refined_exec = create_payload(
        target="ollama",
        model="llama3.2:latest",
        prompt=refined_prompt,
        temperature=0.9,
        num_ctx=4096,
        num_predict=2048
    )
    _, refined_response = model_req(payload_refined_exec)

    refined_score, refined_metrics, refined_critique = enhanced_score_response(refined_response)
    print(f"✓ Refined prompt score: {refined_score:.2f}")

    # Step 6: Compare and select the best overall. The refined candidate is
    # listed last, so it only wins when it strictly beats every variation.
    all_results = results + [{
        "prompt": refined_prompt,
        "response": refined_response,
        "score": refined_score,
        "metrics": refined_metrics,
        "critique": refined_critique
    }]

    champion = max(all_results, key=lambda candidate: candidate["score"])

    # Save to history
    timestamp = time.strftime("%Y%m%d_%H%M%S")
    prompt_history[timestamp] = {
        "prompt": champion["prompt"],
        "score": champion["score"],
        "metrics": champion["metrics"]
    }

    # Keep at most the 20 highest-scoring entries. Selection is by score
    # only, so a low-scoring new entry can be dropped straight away.
    # This happens BEFORE the file is opened: opening in "w" mode truncates
    # the file, so an error while pruning must not be able to wipe the
    # existing history.
    if len(prompt_history) > 20:
        sorted_history = sorted(
            prompt_history.items(),
            key=lambda x: x[1].get("score", 0),
            reverse=True
        )
        prompt_history = dict(sorted_history[:20])

    with open("best_prompts.json", "w") as file:
        json.dump(prompt_history, file, indent=4)

    # Output Results
    print("\n" + "="*50)
    print("OPTIMIZATION RESULTS")
    print("="*50)
    print(f"🏆 Best Score: {champion['score']:.2f}")
    print(f"📊 Metrics: {champion['metrics']}")
    print("\n✅ BEST OPTIMIZED PROMPT:")
    print("-"*50)
    print(champion["prompt"])
    print("-"*50)
    print("\n📜 GENERATED RESPONSE:")
    print("-"*50)
    # Show at most the first 500 characters of the response.
    response_text = champion["response"]
    if len(response_text) > 500:
        print(response_text[:500] + "...")
    else:
        print(response_text)
    print("-"*50)
    print("\n🔍 CRITIQUE:")
    print("-"*50)
    print(champion["critique"])
    print("="*50)

    return champion

# Run the optimization pipeline only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    optimize_prompt_pipeline()

{'model': 'llama3.2:latest', 'prompt': 'Create a highly structured prompt that extracts detailed functional and non-functional requirements for an AI-powered Study Companion Discord Bot. The structured prompt should elicit comprehensive, technically precise, and actionable requirements spanning user interaction, performance criteria, security measures, and integration capabilities. Include specific sections for real-time study support, personalized tutoring, and exam preparation. The prompt should guide the response to be well-organized with clear categorization.', 'stream': False, 'options': {'temperature': 0.7}}
✓ Generated base prompt
{'model': 'llama3.2:latest', 'prompt': '\n    Create 3 variations of the following prompt. Each variation should maintain the same goal\n    but experiment with different structures, emphasis, or approaches to potentially improve results.\n    Make each variation distinctly different in structure or strategy.\n    \n    Original prompt:\n    **AI-Power