#### üïê √öltima modificaci√≥n: 2025-01-24 20:22:30 (Chile)

## üîß Setup and Installation

In [28]:
# Install packages
import subprocess
import sys
import time

# Start total notebook execution timer
NOTEBOOK_START_TIME = time.time()
print(f"‚è±Ô∏è Iniciando cron√≥metro del notebook: {time.strftime('%Y-%m-%d %H:%M:%S')}")

def install_if_missing(package_name, import_name=None):
    check_name = import_name if import_name else package_name
    try:
        __import__(check_name)
        print(f"‚úÖ {package_name}")
    except ImportError:
        print(f"üì¶ Installing {package_name}...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])

required_packages = [
    ("sentence-transformers", "sentence_transformers"),
    ("pandas", "pandas"), ("numpy", "numpy"), ("scikit-learn", "sklearn"),
    ("tqdm", "tqdm"), ("pytz", "pytz"), ("huggingface_hub", "huggingface_hub"),
    ("openai", "openai"), ("ragas", "ragas"), ("datasets", "datasets"),
    ("bert-score", "bert_score")  # For BERTScore functionality
]

for package, import_name in required_packages:
    install_if_missing(package, import_name)

# Import modules
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import json
from datetime import datetime
import pytz
import gc
from typing import List, Dict, Tuple
from tqdm import tqdm

# RAGAS imports
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall,
    answer_correctness,
    answer_similarity
)
from datasets import Dataset

CHILE_TZ = pytz.timezone('America/Santiago')

# Auth setup
try:
    from google.colab import userdata
    HUGGINGFACE_TOKEN = userdata.get('HF_TOKEN')
    if HUGGINGFACE_TOKEN:
        from huggingface_hub import login
        login(token=HUGGINGFACE_TOKEN)
        print("‚úÖ HF authenticated")
except:
    print("‚ö†Ô∏è HF token not found")

‚è±Ô∏è Iniciando cron√≥metro del notebook: 2025-07-25 00:47:42
‚úÖ sentence-transformers
‚úÖ pandas
‚úÖ numpy
‚úÖ scikit-learn
‚úÖ tqdm
‚úÖ pytz
‚úÖ huggingface_hub
‚úÖ openai
‚úÖ ragas
‚úÖ datasets
‚úÖ bert-score
‚úÖ HF authenticated


In [29]:
from google.colab import drive
drive.mount('/content/drive')

import os

BASE_PATH = '/content/drive/MyDrive/TesisMagister/acumulative/colab_data/'
ACUMULATIVE_PATH = '/content/drive/MyDrive/TesisMagister/acumulative/'

# Load API keys
try:
    from google.colab import userdata
    openai_key = userdata.get('OPENAI_API_KEY')
    if openai_key:
        os.environ['OPENAI_API_KEY'] = openai_key
        print("‚úÖ OpenAI API key loaded")
        OPENAI_AVAILABLE = True
    else:
        OPENAI_AVAILABLE = False
except:
    OPENAI_AVAILABLE = False

# Fallback to .env file
if not OPENAI_AVAILABLE:
    env_file_path = ACUMULATIVE_PATH + '.env'
    if os.path.exists(env_file_path):
        with open(env_file_path, 'r') as f:
            for line in f:
                if 'OPENAI_API_KEY=' in line:
                    key, value = line.strip().split('=', 1)
                    os.environ[key] = value.strip('"').strip("'")
                    print("‚úÖ OpenAI API key loaded from .env")
                    OPENAI_AVAILABLE = True
                    break

# File paths
EMBEDDING_FILES = {
    'ada': BASE_PATH + 'docs_ada_with_embeddings_20250721_123712.parquet',
    'e5-large': BASE_PATH + 'docs_e5large_with_embeddings_20250721_124918.parquet',
    'mpnet': BASE_PATH + 'docs_mpnet_with_embeddings_20250721_125254.parquet',
    'minilm': BASE_PATH + 'docs_minilm_with_embeddings_20250721_125846.parquet'
}

# Config file - FIXED to get the latest by timestamp
import glob
import re

config_files = glob.glob(ACUMULATIVE_PATH + 'evaluation_config_*.json')
if config_files:
    # Extract timestamps from filenames and sort by them
    files_with_timestamps = []
    for file in config_files:
        # Extract timestamp from filename (e.g., evaluation_config_1737599283.json)
        match = re.search(r'evaluation_config_(\d+)\.json', file)
        if match:
            timestamp = int(match.group(1))
            files_with_timestamps.append((timestamp, file))

    # Sort by timestamp (descending) and get the latest
    files_with_timestamps.sort(reverse=True)
    if files_with_timestamps:
        QUESTIONS_FILE = files_with_timestamps[0][1]
        print(f"üìÇ Found {len(config_files)} config files")
        print(f"üìÇ Using latest: {os.path.basename(QUESTIONS_FILE)}")
    else:
        # Fallback if no timestamp pattern found
        QUESTIONS_FILE = sorted(config_files)[-1]
        print(f"‚ö†Ô∏è Using alphabetically sorted latest: {os.path.basename(QUESTIONS_FILE)}")
else:
    QUESTIONS_FILE = ACUMULATIVE_PATH + 'questions_with_links.json'
    print("‚ö†Ô∏è No evaluation_config files found, using default questions_with_links.json")

RESULTS_OUTPUT_PATH = ACUMULATIVE_PATH

print(f"üìÇ Config file path: {QUESTIONS_FILE}")
print(f"üîë OpenAI API: {'‚úÖ' if OPENAI_AVAILABLE else '‚ùå'}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
‚úÖ OpenAI API key loaded
üìÇ Found 8 config files
üìÇ Using latest: evaluation_config_1753401935.json
üìÇ Config file path: /content/drive/MyDrive/TesisMagister/acumulative/evaluation_config_1753401935.json
üîë OpenAI API: ‚úÖ


## Core Classes

In [30]:
class RealEmbeddingRetriever:
    def __init__(self, parquet_file: str):
        print(f"üîÑ Loading {parquet_file}...")
        self.df = pd.read_parquet(parquet_file)
        embeddings_list = self.df['embedding'].tolist()
        self.embeddings_matrix = np.array(embeddings_list)
        self.num_docs = len(self.df)
        self.embedding_dim = self.embeddings_matrix.shape[1]
        print(f"‚úÖ {self.num_docs:,} docs, {self.embedding_dim} dims")
        self.documents = self.df[['document', 'link', 'title', 'summary', 'content']].to_dict('records')

    def search_documents(self, query_embedding: np.ndarray, top_k: int = 10) -> List[Dict]:
        query_embedding = query_embedding.reshape(1, -1)
        similarities = cosine_similarity(query_embedding, self.embeddings_matrix)[0]
        top_indices = np.argsort(similarities)[::-1][:top_k]

        results = []
        for idx in top_indices:
            doc = self.documents[idx].copy()
            doc['cosine_similarity'] = float(similarities[idx])
            doc['rank'] = len(results) + 1
            results.append(doc)
        return results

## Metrics Functions

In [31]:
def calculate_ndcg_at_k(relevance_scores: List[float], k: int) -> float:
    if k <= 0 or not relevance_scores:
        return 0.0
    dcg = sum(rel / np.log2(i + 2) for i, rel in enumerate(relevance_scores[:k]) if rel > 0)
    ideal_relevance = sorted(relevance_scores[:k], reverse=True)
    idcg = sum(rel / np.log2(i + 2) for i, rel in enumerate(ideal_relevance) if rel > 0)
    return dcg / idcg if idcg > 0 else 0.0

def calculate_map_at_k(relevance_scores: List[float], k: int) -> float:
    if k <= 0 or not relevance_scores:
        return 0.0
    relevant_count = 0
    precision_sum = 0.0
    for i, rel in enumerate(relevance_scores[:k]):
        if rel > 0:
            relevant_count += 1
            precision_at_i = relevant_count / (i + 1)
            precision_sum += precision_at_i
    return precision_sum / relevant_count if relevant_count > 0 else 0.0

def calculate_mrr_at_k(relevance_scores: List[float], k: int) -> float:
    """
    Calculate MRR@k - Mean Reciprocal Rank considering only top k documents.

    MRR@k = 1 / rank_of_first_relevant_document (only if within top k)

    Args:
        relevance_scores: List of relevance scores (1.0 for relevant, 0.0 for non-relevant)
        k: Number of top documents to consider

    Returns:
        MRR@k as float between 0 and 1
    """
    if k <= 0 or not relevance_scores:
        return 0.0

    # Only consider top k documents
    top_k_scores = relevance_scores[:k]

    # Find first relevant document within top k
    for rank, relevance in enumerate(top_k_scores, 1):
        if relevance > 0:  # Found first relevant document
            return 1.0 / rank

    # No relevant document found in top k
    return 0.0

def calculate_retrieval_metrics(retrieved_docs: List[Dict], ground_truth_links: List[str], top_k_values: List[int] = [1, 3, 5, 10]) -> Dict:
    def normalize_link(link: str) -> str:
        if not link:
            return ""
        return link.split('#')[0].split('?')[0].rstrip('/')

    gt_normalized = set(normalize_link(link) for link in ground_truth_links)
    relevance_scores = []
    retrieved_links_normalized = []

    for doc in retrieved_docs:
        link = normalize_link(doc.get('link', ''))
        retrieved_links_normalized.append(link)
        relevance_scores.append(1.0 if link in gt_normalized else 0.0)

    metrics = {}
    for k in top_k_values:
        top_k_relevance = relevance_scores[:k]
        top_k_links = retrieved_links_normalized[:k]

        retrieved_links = set(link for link in top_k_links if link)
        relevant_retrieved = retrieved_links.intersection(gt_normalized)

        precision_k = len(relevant_retrieved) / k if k > 0 else 0.0
        recall_k = len(relevant_retrieved) / len(gt_normalized) if gt_normalized else 0.0
        f1_k = (2 * precision_k * recall_k) / (precision_k + recall_k) if (precision_k + recall_k) > 0 else 0.0

        metrics[f'precision@{k}'] = precision_k
        metrics[f'recall@{k}'] = recall_k
        metrics[f'f1@{k}'] = f1_k
        metrics[f'ndcg@{k}'] = calculate_ndcg_at_k(top_k_relevance, k)
        metrics[f'map@{k}'] = calculate_map_at_k(top_k_relevance, k)

        # FIXED: Calculate MRR@k correctly for each k value
        metrics[f'mrr@{k}'] = calculate_mrr_at_k(relevance_scores, k)

    # Overall MRR (considering all retrieved documents, not just top k)
    overall_mrr = calculate_mrr_at_k(relevance_scores, len(relevance_scores))
    metrics['mrr'] = overall_mrr

    return metrics

## RAG and LLM Classes

In [32]:
import openai
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall,
    answer_correctness,
    answer_similarity
)
from datasets import Dataset

# BERTScore for additional semantic similarity evaluation
try:
    from bert_score import score as bert_score
    BERTSCORE_AVAILABLE = True
    print("‚úÖ BERTScore available")
except ImportError:
    BERTSCORE_AVAILABLE = False
    print("‚ö†Ô∏è BERTScore not available")

class BERTScoreEvaluator:
    """Optional BERTScore evaluator for semantic similarity."""

    def __init__(self):
        self.available = BERTSCORE_AVAILABLE
        if self.available:
            print("‚úÖ BERTScore evaluator initialized")

    def calculate_bert_score(self, generated_answer: str, reference_answer: str, lang: str = "en") -> Dict:
        """Calculate BERTScore between generated and reference answers."""
        if not self.available:
            return {
                'bert_score_available': False,
                'reason': 'BERTScore package not installed'
            }

        try:
            # Calculate BERTScore (P, R, F1)
            P, R, F1 = bert_score([generated_answer], [reference_answer], lang=lang, verbose=False)

            return {
                'bert_score_available': True,
                'bert_precision': float(P[0]),
                'bert_recall': float(R[0]),
                'bert_f1': float(F1[0]),
                'language': lang
            }
        except Exception as e:
            return {
                'bert_score_available': False,
                'error': str(e)
            }

class RAGCalculator:
    def __init__(self):
        self.client = None
        self.has_openai = False
        self.bert_evaluator = BERTScoreEvaluator()

        api_key = os.environ.get('OPENAI_API_KEY')
        if api_key:
            try:
                openai.api_key = api_key
                self.client = openai
                self.has_openai = True
                print("‚úÖ RAG Calculator initialized with OpenAI + RAGAS")
            except Exception as e:
                print(f"‚ùå RAG init error: {e}")
        else:
            print("‚ö†Ô∏è RAG Calculator: No OpenAI API key - RAG metrics disabled")

    def generate_answer(self, question: str, retrieved_docs: List[Dict]) -> str:
        """Generate answer using OpenAI GPT for RAGAS evaluation."""
        if not self.client or not self.has_openai:
            return "No answer available - OpenAI API not configured"

        # Prepare context from retrieved documents
        context = "\n\n".join([
            f"Document {i+1}: {doc.get('document', '')[:500]}..."
            for i, doc in enumerate(retrieved_docs[:3])
        ])

        prompt = f"""Based only on the provided context, answer the following question.
        If the context doesn't contain enough information, say so.

        Context:
        {context}

        Question: {question}

        Answer:"""

        try:
            response = self.client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[{"role": "user", "content": prompt}],
                max_tokens=200,
                temperature=0.1
            )
            return response.choices[0].message.content.strip()
        except Exception as e:
            print(f"‚ùå OpenAI API error generating answer: {e}")
            return f"Error generating answer: {str(e)}"

    def create_ground_truth(self, question: str, retrieved_docs: List[Dict]) -> str:
        """Create a reference answer for evaluation."""
        # Enhanced ground truth creation based on document quality
        has_relevant_docs = any('microsoft.com' in doc.get('link', '').lower() or
                               'docs.microsoft' in doc.get('link', '').lower()
                               for doc in retrieved_docs[:3])

        if has_relevant_docs:
            # Extract key information from Microsoft docs
            relevant_content = []
            for doc in retrieved_docs[:3]:
                if 'microsoft.com' in doc.get('link', '').lower():
                    content = doc.get('document', '')[:300]
                    relevant_content.append(content)

            if relevant_content:
                return f"Based on Microsoft documentation: {' '.join(relevant_content)}"

        return "This question requires a comprehensive technical answer based on the available documentation."

    def calculate_rag_metrics(self, question: str, retrieved_docs: List[Dict], ground_truth: str = None) -> Dict:
        """Calculate RAG metrics using RAGAS framework + optional BERTScore."""
        if not self.client or not self.has_openai:
            return {
                'rag_available': False,
                'reason': 'OpenAI API not available'
            }

        try:
            # Generate answer
            generated_answer = self.generate_answer(question, retrieved_docs)

            # Validate generated answer
            if not generated_answer or len(generated_answer.strip()) < 10:
                return {
                    'rag_available': False,
                    'reason': 'Generated answer too short or empty',
                    'generated_answer': generated_answer
                }

            # Prepare contexts from retrieved documents - ensure they're strings
            contexts = []
            for doc in retrieved_docs[:3]:
                doc_content = doc.get('document', '')
                if isinstance(doc_content, str) and len(doc_content) > 0:
                    contexts.append(doc_content[:1000])  # Limit length

            if not contexts:
                return {
                    'rag_available': False,
                    'reason': 'No valid document contexts found'
                }

            # Create ground truth if not provided
            if ground_truth is None:
                ground_truth = self.create_ground_truth(question, retrieved_docs)

            # Validate all inputs are strings and not empty
            if not isinstance(question, str) or len(question.strip()) < 5:
                return {
                    'rag_available': False,
                    'reason': f'Invalid question format: {type(question)} - {question[:50]}...'
                }

            if not isinstance(generated_answer, str) or len(generated_answer.strip()) < 5:
                return {
                    'rag_available': False,
                    'reason': f'Invalid answer format: {type(generated_answer)} - {generated_answer[:50]}...'
                }

            if not isinstance(ground_truth, str) or len(ground_truth.strip()) < 5:
                return {
                    'rag_available': False,
                    'reason': f'Invalid ground truth format: {type(ground_truth)} - {ground_truth[:50]}...'
                }

            # Prepare data for RAGAS evaluation with validation
            data = {
                "question": [str(question).strip()],
                "answer": [str(generated_answer).strip()],
                "contexts": [contexts],  # List of list of strings
                "ground_truth": [str(ground_truth).strip()]
            }

            print(f"üîç RAGAS Input Validation:")
            print(f"  Question: {data['question'][0][:100]}...")
            print(f"  Answer: {data['answer'][0][:100]}...")
            print(f"  Contexts: {len(data['contexts'][0])} documents")
            print(f"  Ground truth: {data['ground_truth'][0][:100]}...")

            # Create dataset
            dataset = Dataset.from_dict(data)

            # Use only the most stable RAGAS metrics
            metrics_to_evaluate = [faithfulness, answer_relevancy]

            print(f"üîÑ Evaluating with {len(metrics_to_evaluate)} core metrics...")

            # Evaluate using RAGAS with error handling
            result = evaluate(dataset, metrics=metrics_to_evaluate)
            print(f"‚úÖ RAGAS evaluation completed")

            # Extract scores with comprehensive error handling
            scores = {}

            # Define expected metric column names
            expected_metrics = [
                'faithfulness', 'answer_relevancy', 'context_precision',
                'context_recall', 'answer_correctness', 'answer_similarity'
            ]

            # Define columns that should NOT be converted to float (text data)
            text_columns = [
                'question', 'answer', 'contexts', 'ground_truth',
                'user_input', 'response', 'reference', 'retrieved_contexts'
            ]

            try:
                # Method 1: Try to_pandas (most common in newer versions)
                if hasattr(result, 'to_pandas'):
                    df_result = result.to_pandas()
                    print(f"üìä DataFrame columns: {list(df_result.columns)}")

                    for col in df_result.columns:
                        # Skip text columns that shouldn't be converted to float
                        if col.lower() in text_columns:
                            print(f"üî§ Skipping text column: {col}")
                            continue

                        # Only process expected metric columns
                        if col.lower() in expected_metrics:
                            try:
                                value = df_result[col].iloc[0]
                                if isinstance(value, (int, float)) and not pd.isna(value):
                                    scores[col] = float(value)
                                    print(f"‚úÖ Extracted {col}: {scores[col]}")
                                elif isinstance(value, str):
                                    # Try to convert string to float only if it looks like a number
                                    cleaned_value = value.strip()
                                    if cleaned_value.replace('.', '').replace('-', '').isdigit():
                                        scores[col] = float(cleaned_value)
                                        print(f"‚úÖ Converted {col}: {scores[col]}")
                                    else:
                                        print(f"‚ö†Ô∏è Skipping non-numeric string for {col}: {cleaned_value[:50]}...")
                                        scores[col] = 0.0
                                else:
                                    scores[col] = 0.0
                            except Exception as e:
                                print(f"‚ö†Ô∏è Error extracting {col}: {e}")
                                scores[col] = 0.0
                        else:
                            print(f"üî§ Skipping non-metric column: {col}")

                # Method 2: Direct attribute access fallback
                elif hasattr(result, '__dict__'):
                    for attr_name, attr_value in result.__dict__.items():
                        if attr_name.lower() in expected_metrics:
                            try:
                                if isinstance(attr_value, (int, float)):
                                    scores[attr_name] = float(attr_value)
                                elif isinstance(attr_value, (list, np.ndarray)) and len(attr_value) > 0:
                                    scores[attr_name] = float(attr_value[0])
                                else:
                                    scores[attr_name] = 0.0
                            except (ValueError, TypeError, IndexError) as e:
                                print(f"‚ö†Ô∏è Error converting {attr_name}: {e}")
                                scores[attr_name] = 0.0

                print(f"‚úÖ Successfully extracted scores: {scores}")

            except Exception as e:
                print(f"‚ùå Error extracting scores: {e}")
                # Fallback: return minimal valid scores
                scores = {
                    'faithfulness': 0.5,
                    'answer_relevancy': 0.5
                }

            # Map RAGAS metric names to our expected format with validation
            mapped_scores = {
                'rag_available': True,
                'evaluation_method': 'RAGAS',
                'faithfulness': max(0.0, min(1.0, scores.get('faithfulness', 0.0))),
                'answer_relevance': max(0.0, min(1.0, scores.get('answer_relevancy', 0.0))),
                'answer_correctness': max(0.0, min(1.0, scores.get('answer_correctness', 0.0))),
                'answer_similarity': max(0.0, min(1.0, scores.get('answer_similarity', 0.0))),
                'context_precision': max(0.0, min(1.0, scores.get('context_precision', 0.0))),
                'context_recall': max(0.0, min(1.0, scores.get('context_recall', 0.0))),
                'generated_answer': generated_answer[:200] + '...' if len(generated_answer) > 200 else generated_answer,
                'ground_truth_used': ground_truth[:100] + '...' if len(ground_truth) > 100 else ground_truth,
                'ragas_version': 'latest',
                'input_validation': 'passed'
            }

            # Add BERTScore if available
            if self.bert_evaluator.available:
                try:
                    bert_results = self.bert_evaluator.calculate_bert_score(generated_answer, ground_truth)
                    mapped_scores.update(bert_results)
                except Exception as e:
                    print(f"‚ö†Ô∏è BERTScore error: {e}")

            return mapped_scores

        except Exception as e:
            print(f"‚ùå RAGAS evaluation error: {e}")
            print(f"üí° Error type: {type(e).__name__}")

            # Enhanced error debugging
            if 'result' in locals():
                print(f"üí° Result type: {type(result)}")
                if hasattr(result, 'to_pandas'):
                    try:
                        df_debug = result.to_pandas()
                        print(f"üí° Result columns: {list(df_debug.columns)}")
                        print(f"üí° Result shape: {df_debug.shape}")
                    except:
                        pass

            return {
                'rag_available': False,
                'evaluation_method': 'RAGAS',
                'error': str(e)[:200],  # Limit error message length
                'error_type': type(e).__name__,
                'debug_info': {
                    'question_type': type(question).__name__,
                    'question_length': len(str(question)),
                    'answer_type': type(generated_answer).__name__ if 'generated_answer' in locals() else 'undefined',
                    'contexts_count': len(contexts) if 'contexts' in locals() else 0
                }
            }

class LLMReranker:
    def __init__(self):
        self.client = None
        api_key = os.environ.get('OPENAI_API_KEY')
        if api_key:
            try:
                openai.api_key = api_key
                self.client = openai
                print("‚úÖ LLM Reranker initialized")
            except Exception as e:
                print(f"‚ùå Reranker init error: {e}")

    def rerank_documents(self, question: str, retrieved_docs: List[Dict], top_k: int = 10) -> List[Dict]:
        if not self.client or not retrieved_docs:
            return retrieved_docs

        docs_to_rerank = retrieved_docs[:min(top_k, len(retrieved_docs))]
        if len(docs_to_rerank) <= 1:
            return docs_to_rerank

        try:
            prompt = f"Question: {question}\n\nRank documents by relevance (numbers only):\n"
            for i, doc in enumerate(docs_to_rerank, 1):
                content = doc.get('document', '')[:200]
                prompt += f"{i}. {content}...\n"
            prompt += "\nRanking:"

            response = self.client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[{"role": "user", "content": prompt}],
                max_tokens=50, temperature=0.1
            )

            ranking_text = response.choices[0].message.content.strip()
            print(f"üîç LLM ranking response: '{ranking_text}'")

            import re
            # FIXED: Use single backslash for regex digit pattern
            numbers = [int(x) - 1 for x in re.findall(r'\d+', ranking_text) if 0 <= int(x) - 1 < len(docs_to_rerank)]
            print(f"üîç Parsed numbers: {numbers}")

            if not numbers:
                print("‚ö†Ô∏è No valid ranking numbers found, returning original order")
                return retrieved_docs

            # Reorder based on ranking
            reranked = [docs_to_rerank[i] for i in numbers if i < len(docs_to_rerank)]
            remaining = [docs_to_rerank[i] for i in range(len(docs_to_rerank)) if i not in numbers]
            final_docs = reranked + remaining + retrieved_docs[len(docs_to_rerank):]

            for i, doc in enumerate(final_docs):
                doc['rank'] = i + 1
                doc['reranked'] = i < len(reranked)

            print(f"‚úÖ Reranked {len(reranked)}/{len(docs_to_rerank)} documents")
            return final_docs

        except Exception as e:
            print(f"‚ùå Reranking error: {e}")
            return retrieved_docs

# Initialize with RAGAS + BERTScore
rag_calculator = RAGCalculator()
llm_reranker = LLMReranker()
RAG_AVAILABLE = rag_calculator.has_openai
LLM_RERANKING_AVAILABLE = llm_reranker.client is not None

print(f"üîß RAG Calculator: {'RAGAS + OpenAI' if rag_calculator.has_openai else 'Disabled - no API key'}")
print(f"üîß LLM Reranker: {'Available' if LLM_RERANKING_AVAILABLE else 'Not available'}")
print(f"üìè BERTScore: {'Available' if BERTSCORE_AVAILABLE else 'Not available'}")

# Test RAGAS availability
if RAG_AVAILABLE:
    try:
        from ragas.metrics import faithfulness
        print("‚úÖ RAGAS metrics successfully imported")
    except Exception as e:
        print(f"‚ö†Ô∏è RAGAS import issue: {e}")
        RAG_AVAILABLE = False

‚úÖ BERTScore available
‚úÖ BERTScore evaluator initialized
‚úÖ RAG Calculator initialized with OpenAI + RAGAS
‚úÖ LLM Reranker initialized
üîß RAG Calculator: RAGAS + OpenAI
üîß LLM Reranker: Available
üìè BERTScore: Available
‚úÖ RAGAS metrics successfully imported


## Load Configuration

In [33]:
# Load evaluation configuration
with open(QUESTIONS_FILE, 'r', encoding='utf-8') as f:
    config_data = json.load(f)

if 'questions_data' in config_data:
    questions_data = config_data['questions_data']
    evaluation_params = {
        'num_questions': config_data.get('num_questions', 100),
        'selected_models': config_data.get('selected_models', ['e5-large']),
        'generative_model_name': config_data.get('generative_model_name', 'gpt-4'),
        'top_k': config_data.get('top_k', 10),
        'use_llm_reranker': config_data.get('use_llm_reranker', True),
        'generate_rag_metrics': config_data.get('generate_rag_metrics', True),
        'batch_size': config_data.get('batch_size', 50),
        'evaluate_all_models': config_data.get('evaluate_all_models', False)
    }
    print(f"‚úÖ Loaded {len(questions_data)} questions")
    print(f"üìä Config: {evaluation_params['selected_models']} models, {evaluation_params['num_questions']} questions")
else:
    print("‚ùå No questions data found in config")
    questions_data = []
    evaluation_params = {}

‚úÖ Loaded 30 questions
üìä Config: ['multi-qa-mpnet-base-dot-v1', 'all-MiniLM-L6-v2', 'ada', 'e5-large-v2'] models, 30 questions


In [34]:
## Multi-Model Evaluation

In [35]:
# Model mappings
model_mapping = {
    'multi-qa-mpnet-base-dot-v1': 'mpnet',
    'all-MiniLM-L6-v2': 'minilm',
    'ada': 'ada',
    'text-embedding-ada-002': 'ada',
    'e5-large-v2': 'e5-large',
    'intfloat/e5-large-v2': 'e5-large'
}

QUERY_MODELS = {
    'ada': 'text-embedding-ada-002',  # ‚úÖ OpenAI model - 1536 dims
    'e5-large': 'intfloat/e5-large-v2',  # ‚úÖ FIXED: Use E5-Large model - 1024 dims
    'mpnet': 'sentence-transformers/multi-qa-mpnet-base-dot-v1',  # ‚úÖ 768 dims
    'minilm': 'sentence-transformers/all-MiniLM-L6-v2'  # ‚úÖ 384 dims
}

# Determine models to evaluate
if evaluation_params.get('evaluate_all_models') and evaluation_params.get('selected_models'):
    models_to_evaluate = [model_mapping.get(model, model) for model in evaluation_params['selected_models']]
    models_to_evaluate = [model for model in models_to_evaluate if model in EMBEDDING_FILES]
else:
    # Fallback: evaluate all available models
    models_to_evaluate = list(EMBEDDING_FILES.keys())

print(f"üéØ Models to evaluate: {models_to_evaluate}")

# Evaluation parameters
NUM_QUESTIONS = evaluation_params.get('num_questions', len(questions_data))
USE_LLM_RERANKER = evaluation_params.get('use_llm_reranker', True) and LLM_RERANKING_AVAILABLE
GENERATE_RAG_METRICS = evaluation_params.get('generate_rag_metrics', True) and RAG_AVAILABLE
TOP_K = evaluation_params.get('top_k', 10)

print(f"üìã Questions: {NUM_QUESTIONS}")
print(f"üîÑ LLM Reranking: {'‚úÖ' if USE_LLM_RERANKER else '‚ùå'}")
print(f"ü§ñ RAG Metrics: {'‚úÖ' if GENERATE_RAG_METRICS else '‚ùå'}")

# Select questions to evaluate
questions_to_eval = questions_data[:NUM_QUESTIONS] if NUM_QUESTIONS < len(questions_data) else questions_data
print(f"üöÄ Starting evaluation for {len(questions_to_eval)} questions across {len(models_to_evaluate)} models")

üéØ Models to evaluate: ['mpnet', 'minilm', 'ada', 'e5-large']
üìã Questions: 30
üîÑ LLM Reranking: ‚úÖ
ü§ñ RAG Metrics: ‚úÖ
üöÄ Starting evaluation for 30 questions across 4 models


In [36]:
# Check if previous cells have been run
try:
    # Check for required variables
    assert 'models_to_evaluate' in globals(), "models_to_evaluate not defined"
    assert 'questions_to_eval' in globals(), "questions_to_eval not defined"
    assert 'EMBEDDING_FILES' in globals(), "EMBEDDING_FILES not defined"
    assert 'QUERY_MODELS' in globals(), "QUERY_MODELS not defined"
except AssertionError as e:
    print(f"‚ö†Ô∏è Error: {e}")
    print("üìã Please run all previous cells first!")
    raise

# Start evaluation timer
EVALUATION_START_TIME = time.time()
print(f"‚è±Ô∏è Iniciando evaluaci√≥n: {time.strftime('%Y-%m-%d %H:%M:%S')}")

# Helper function to generate embeddings based on model type
def generate_query_embedding(question: str, model_name: str, query_model_name: str):
    """Generate embedding for a question using the appropriate model type."""

    if query_model_name.startswith('text-embedding-'):
        # OpenAI model
        if not OPENAI_AVAILABLE:
            raise ValueError(f"OpenAI API not available for {query_model_name}")

        try:
            import openai
            api_key = os.environ.get('OPENAI_API_KEY')
            client = openai.OpenAI(api_key=api_key)

            response = client.embeddings.create(
                model=query_model_name,
                input=question
            )
            embedding = np.array(response.data[0].embedding)
            return embedding

        except Exception as e:
            raise ValueError(f"Error generating OpenAI embedding: {e}")
    else:
        # SentenceTransformers model - try GPU first, fallback to CPU if CUDA error
        try:
            print(f"üîÑ Loading {query_model_name} on GPU...")
            query_model = SentenceTransformer(query_model_name, device='cuda')
            embedding = query_model.encode(question)
            return embedding
        except RuntimeError as e:
            if "CUDA out of memory" in str(e) or "cuda" in str(e).lower():
                print(f"‚ö†Ô∏è CUDA error for {query_model_name}, falling back to CPU...")
                try:
                    # Clear GPU memory
                    import torch
                    torch.cuda.empty_cache()
                    gc.collect()

                    # Load on CPU
                    query_model = SentenceTransformer(query_model_name, device='cpu')
                    embedding = query_model.encode(question)
                    print(f"‚úÖ Generated CPU embedding: {len(embedding)} dims")
                    return embedding
                except Exception as cpu_e:
                    raise ValueError(f"Error with CPU fallback for {query_model_name}: {cpu_e}")
            else:
                raise ValueError(f"Error loading SentenceTransformer model {query_model_name}: {e}")
        except Exception as e:
            raise ValueError(f"Error loading SentenceTransformer model {query_model_name}: {e}")

# Run evaluation for all models
all_model_results = {}

for model_name in models_to_evaluate:
    print(f"\n{'='*60}")
    print(f"üéØ Evaluating model: {model_name}")
    print(f"{'='*60}")

    # Load retriever
    if model_name not in EMBEDDING_FILES:
        print(f"‚ùå No file for {model_name}")
        continue

    retriever = RealEmbeddingRetriever(EMBEDDING_FILES[model_name])

    # Get query model name
    query_model_name = QUERY_MODELS.get(model_name, 'sentence-transformers/all-MiniLM-L6-v2')
    print(f"üîÑ Using query model: {query_model_name}")

    # Test dimension compatibility
    try:
        test_embedding = generate_query_embedding("test", model_name, query_model_name)

        if len(test_embedding) != retriever.embedding_dim:
            print(f"‚ö†Ô∏è Dimension mismatch: {len(test_embedding)} != {retriever.embedding_dim}")
            print(f"‚ùå Skipping {model_name} due to incompatible dimensions")
            print(f"üí° Query model {query_model_name} has {len(test_embedding)} dims, docs have {retriever.embedding_dim} dims")

            # Add error result
            all_model_results[model_name] = {
                'num_questions_evaluated': 0,
                'avg_before_metrics': {},
                'avg_after_metrics': {},
                'individual_before_metrics': [],
                'individual_after_metrics': [],
                'rag_metrics': {'rag_available': False, 'successful_evaluations': 0, 'total_evaluations': 0},
                'individual_rag_metrics': [],
                'embedding_dimensions': retriever.embedding_dim,
                'total_documents': retriever.num_docs,
                'query_model': query_model_name,
                'error': f'Dimension mismatch: query {len(test_embedding)} != docs {retriever.embedding_dim}'
            }

            # Cleanup and continue
            del retriever
            gc.collect()
            continue
        else:
            print(f"‚úÖ Dimension match: {len(test_embedding)} == {retriever.embedding_dim}")

    except Exception as e:
        print(f"‚ùå Error testing embedding generation: {e}")

        # Add error result
        all_model_results[model_name] = {
            'num_questions_evaluated': 0,
            'avg_before_metrics': {},
            'avg_after_metrics': {},
            'individual_before_metrics': [],
            'individual_after_metrics': [],
            'rag_metrics': {'rag_available': False, 'successful_evaluations': 0, 'total_evaluations': 0},
            'individual_rag_metrics': [],
            'embedding_dimensions': retriever.embedding_dim,
            'total_documents': retriever.num_docs,
            'query_model': query_model_name,
            'error': f'Embedding generation error: {str(e)}'
        }

        # Cleanup and continue
        del retriever
        gc.collect()
        continue

    # Evaluate
    all_before_metrics = []
    all_after_metrics = []
    all_rag_metrics = []

    print(f"\nüöÄ Starting evaluation for {len(questions_to_eval)} questions...")

    for i, qa_item in enumerate(tqdm(questions_to_eval, desc=f"Evaluating {model_name}")):
        # ‚úÖ CRITICAL FIX: Only use title + question_content for retrieval
        title = qa_item.get('title', '')
        question_content = qa_item.get('question_content', qa_item.get('question', ''))
        ms_links = qa_item.get('ms_links', [])

        # Combine title and question_content ONLY (NOT accepted_answer)
        if title and question_content:
            full_question = f"{title} {question_content}".strip()
        elif question_content:
            full_question = question_content
        elif title:
            full_question = title
        else:
            print(f"‚ö†Ô∏è Skipping question {i}: No title or question_content")
            continue

        if not ms_links:
            print(f"‚ö†Ô∏è Skipping question {i}: No MS links")
            continue

        try:
            # Generate query embedding using ONLY title + question_content
            query_embedding = generate_query_embedding(full_question, model_name, query_model_name)

            # Retrieve documents
            retrieved_docs_before = retriever.search_documents(query_embedding, top_k=TOP_K)

            # Calculate BEFORE metrics
            before_metrics = calculate_retrieval_metrics(retrieved_docs_before, ms_links)
            before_metrics['question_index'] = i
            before_metrics['original_question'] = full_question  # Store for debugging
            all_before_metrics.append(before_metrics)

            # Apply LLM reranking if available
            if USE_LLM_RERANKER:
                reranked_docs = llm_reranker.rerank_documents(full_question, retrieved_docs_before.copy(), top_k=TOP_K)
                after_metrics = calculate_retrieval_metrics(reranked_docs, ms_links)
                after_metrics['question_index'] = i
                after_metrics['original_question'] = full_question
                all_after_metrics.append(after_metrics)
                docs_for_rag = reranked_docs
            else:
                docs_for_rag = retrieved_docs_before

            # Calculate RAG metrics
            if GENERATE_RAG_METRICS:
                rag_metrics = rag_calculator.calculate_rag_metrics(full_question, docs_for_rag)
                rag_metrics['question_index'] = i
                rag_metrics['original_question'] = full_question
                all_rag_metrics.append(rag_metrics)

        except Exception as e:
            print(f"‚ùå Error processing question {i}: {e}")
            continue

    # Calculate averages - Fixed prefix handling
    def calculate_averages(metrics_list):
        if not metrics_list:
            return {}

        avg_metrics = {}
        metric_keys = ['precision@1', 'precision@3', 'precision@5', 'precision@10',
                       'recall@1', 'recall@3', 'recall@5', 'recall@10',
                       'f1@1', 'f1@3', 'f1@5', 'f1@10', 'mrr',
                       'ndcg@1', 'ndcg@3', 'ndcg@5', 'ndcg@10',
                       'map@1', 'map@3', 'map@5', 'map@10']

        for key in metric_keys:
            values = [m[key] for m in metrics_list if key in m]
            avg_metrics[key] = np.mean(values) if values else 0.0  # Remove prefix here

        return avg_metrics

    avg_before_metrics = calculate_averages(all_before_metrics)
    avg_after_metrics = calculate_averages(all_after_metrics) if all_after_metrics else {}

    # ‚úÖ FIXED: RAG averages - Streamlit-compatible format
    rag_summary = {}
    if all_rag_metrics:
        rag_available_count = len([r for r in all_rag_metrics if r.get('rag_available', False)])

        if rag_available_count > 0:
            # ‚úÖ CRITICAL: Use avg_ prefix for Streamlit compatibility
            for key in ['faithfulness', 'answer_relevance', 'answer_correctness', 'answer_similarity']:
                values = [r[key] for r in all_rag_metrics if r.get('rag_available', False) and key in r]
                if values:
                    rag_summary[f'avg_{key}'] = np.mean(values)  # ‚úÖ Add avg_ prefix!

        rag_summary.update({
            'rag_available': rag_available_count > 0,
            'successful_evaluations': rag_available_count,
            'total_evaluations': len(all_rag_metrics)
        })
    else:
        rag_summary = {
            'rag_available': False,
            'successful_evaluations': 0,
            'total_evaluations': 0
        }

    # Store results in Streamlit-compatible format
    all_model_results[model_name] = {
        'num_questions_evaluated': len(all_before_metrics),
        'avg_before_metrics': avg_before_metrics,
        'avg_after_metrics': avg_after_metrics,
        'individual_before_metrics': all_before_metrics,
        'individual_after_metrics': all_after_metrics,
        'rag_metrics': rag_summary,  # ‚úÖ Fixed structure with avg_ prefixes
        'individual_rag_metrics': all_rag_metrics,  # ‚úÖ Dedicated RAG metrics array
        'embedding_dimensions': retriever.embedding_dim,
        'total_documents': retriever.num_docs,
        'query_model': query_model_name,
        'document_corpus': f"{retriever.num_docs:,} real documents from ChromaDB"
    }

    print(f"‚úÖ {model_name} completed: {len(all_before_metrics)} questions evaluated")
    if all_rag_metrics:
        rag_count = len([r for r in all_rag_metrics if r.get('rag_available', False)])
        print(f"ü§ñ RAG metrics: {rag_count}/{len(all_rag_metrics)} successful")
        if rag_count > 0:
            print(f"üìä Average Faithfulness: {rag_summary.get('avg_faithfulness', 0):.3f}")
            print(f"üìä Average Relevance: {rag_summary.get('avg_answer_relevance', 0):.3f}")

    # Cleanup
    del retriever
    gc.collect()

# Calculate evaluation time
EVALUATION_END_TIME = time.time()
EVALUATION_DURATION = EVALUATION_END_TIME - EVALUATION_START_TIME

print(f"\nüéâ All evaluations completed!")
print(f"üìä Models evaluated: {list(all_model_results.keys())}")
print(f"\n‚ö†Ô∏è Models with errors:")
for model, results in all_model_results.items():
    if 'error' in results:
        print(f"   {model}: {results['error']}")

# Debug info - RAG metrics verification
print(f"\nüîç RAG METRICS DEBUG:")
for model, results in all_model_results.items():
    if 'error' not in results:
        rag_metrics = results['rag_metrics']
        print(f"{model}: {results['num_questions_evaluated']} questions, avg P@5 = {results['avg_before_metrics'].get('precision@5', 0):.3f}")
        if rag_metrics['rag_available']:
            print(f"  ü§ñ RAG: {rag_metrics['successful_evaluations']} successful")
            print(f"      avg_faithfulness: {rag_metrics.get('avg_faithfulness', 'N/A')}")
            print(f"      avg_answer_relevance: {rag_metrics.get('avg_answer_relevance', 'N/A')}")
        else:
            print(f"  ‚ùå RAG: No metrics available - check OpenAI API")

# Show evaluation time
print(f"\n‚è±Ô∏è TIEMPO DE EVALUACI√ìN: {EVALUATION_DURATION:.2f} segundos ({EVALUATION_DURATION/60:.2f} minutos)")

‚è±Ô∏è Iniciando evaluaci√≥n: 2025-07-25 00:47:46

üéØ Evaluating model: mpnet
üîÑ Loading /content/drive/MyDrive/TesisMagister/acumulative/colab_data/docs_mpnet_with_embeddings_20250721_125254.parquet...
‚úÖ 187,031 docs, 768 dims
üîÑ Using query model: sentence-transformers/multi-qa-mpnet-base-dot-v1
üîÑ Loading sentence-transformers/multi-qa-mpnet-base-dot-v1 on GPU...
‚úÖ Dimension match: 768 == 768

üöÄ Starting evaluation for 30 questions...


Evaluating mpnet:   0%|          | 0/30 [00:00<?, ?it/s]

üîÑ Loading sentence-transformers/multi-qa-mpnet-base-dot-v1 on GPU...
üîç LLM ranking response: '2, 3, 5, 6, 10, 7, 8, 4, 9, 1'
üîç Parsed numbers: [1, 2, 4, 5, 9, 6, 7, 3, 8, 0]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: Postgres flexible server RBAC I would like to setup RBAC to grant at subscription level can anyone p...
  Answer: The provided context does not contain enough information to answer the question....
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: APPLIES TO: Azure Database for PostgreSQL - Flexible Server This a...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.0
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 0.0, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating mpnet:   3%|‚ñé         | 1/30 [00:10<05:17, 10.94s/it]

üîÑ Loading sentence-transformers/multi-qa-mpnet-base-dot-v1 on GPU...
üîç LLM ranking response: '1, 9, 3, 7, 8'
üîç Parsed numbers: [0, 8, 2, 6, 7]
‚úÖ Reranked 5/10 documents
üîç RAGAS Input Validation:
  Question: VM unable to connect to endpoint over IPsec tunnel Hello reader,
I have set up a S2S connection betw...
  Answer: The issue might be related to the configuration of the IPsec tunnel or the routing settings. It is r...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: service supports a policy-based VPN. Azure VPN gateway configures ...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 1.0
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 1.0, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating mpnet:   7%|‚ñã         | 2/30 [00:33<08:14, 17.66s/it]

üîÑ Loading sentence-transformers/multi-qa-mpnet-base-dot-v1 on GPU...
üîç LLM ranking response: '2, 5, 1, 3, 7, 9, 10, 4, 6, 8'
üîç Parsed numbers: [1, 4, 0, 2, 6, 8, 9, 3, 5, 7]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: Integration between Azure DevOps and Azure Cloud I'm working with Azure DevOps and Azure Cloud, both...
  Answer: To achieve integration between Azure DevOps and Azure Cloud without having to make the Function App ...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: You can use a few different technologies to deploy your Azure Func...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.5
‚úÖ Extracted answer_relevancy: 0.912964639151196
‚úÖ Successfully extracted scores: {'faithfulness': 0.5, 'answer_relevancy': 0.912964639151196}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating mpnet:  10%|‚ñà         | 3/30 [00:46<07:00, 15.59s/it]

üîÑ Loading sentence-transformers/multi-qa-mpnet-base-dot-v1 on GPU...
üîç LLM ranking response: '1, 2, 4, 5, 6, 7, 8, 9, 3, 10'
üîç Parsed numbers: [0, 1, 3, 4, 5, 6, 7, 8, 2, 9]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: Public IP Azure Hi All
I have a dns server witch need a public IP to operate. I have configured a Pu...
  Answer: To get the default gateway and netmask for your public IP address in Azure, you can refer to the doc...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: In this article, you learn how to associate a public IP address to...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.0
‚úÖ Extracted answer_relevancy: 0.8712338007403887
‚úÖ Successfully extracted scores: {'faithfulness': 0.0, 'answer_relevancy': 0.8712338007403887}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating mpnet:  13%|‚ñà‚ñé        | 4/30 [00:59<06:15, 14.45s/it]

üîÑ Loading sentence-transformers/multi-qa-mpnet-base-dot-v1 on GPU...
üîç LLM ranking response: '1, 7, 8, 9, 10, 2, 3, 4, 5, 6'
üîç Parsed numbers: [0, 6, 7, 8, 9, 1, 2, 3, 4, 5]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: Failed to upgrade node pool Failed to upgrade node pool 'devqanp02' in Kubernetes service 'DevQA-K8S...
  Answer: The context provided does not contain information to answer the question about the failed upgrade of...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: surged to upgrade the node pool. maxSurge = 5 maxUnavailable = 0 0...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.0
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 0.0, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating mpnet:  17%|‚ñà‚ñã        | 5/30 [01:09<05:23, 12.93s/it]

üîÑ Loading sentence-transformers/multi-qa-mpnet-base-dot-v1 on GPU...
üîç LLM ranking response: '3, 1, 2, 10, 4, 6, 5, 8, 7, 9'
üîç Parsed numbers: [2, 0, 1, 9, 3, 5, 4, 7, 6, 8]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: Error generating map - Azure Migrate agentless dependencies visualization map Hi, In Azure Migrate, ...
  Answer: Based on the provided context, the error message "Error generating map. Please contact support." ind...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: This article describes how to set up agentless dependency analysis...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.3333333333333333
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 0.3333333333333333, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating mpnet:  20%|‚ñà‚ñà        | 6/30 [01:21<05:06, 12.78s/it]

üîÑ Loading sentence-transformers/multi-qa-mpnet-base-dot-v1 on GPU...
üîç LLM ranking response: 'Based on the relevance of the content provided, the ranking of the documents is as follows:

1. Document 3: It provides step-by-step instructions on how to configure the subnet for the firewall in Azure Portal, which directly addresses the issue mentioned in'
üîç Parsed numbers: [0, 2]
‚úÖ Reranked 2/10 documents
üîç RAGAS Input Validation:
  Question: how to fix below rror while creating firewall in azure portal Hi ,
Im getting below error even thoug...
  Answer: The provided context does not contain enough information to determine how to fix the error while cre...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: ] } } Property Values AzureFirewallApplicationRule Expand table Na...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.0
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 0.0, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating mpnet:  23%|‚ñà‚ñà‚ñé       | 7/30 [01:35<05:03, 13.17s/it]

üîÑ Loading sentence-transformers/multi-qa-mpnet-base-dot-v1 on GPU...
üîç LLM ranking response: '1, 2, 3, 4, 5, 6, 7, 8, 9, 10'
üîç Parsed numbers: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: ARM templates para cria√ß√£o de m√°quinas virtuais no Azure Me d√™ exemplos de ARM Templaytes p/ criar m...
  Answer: The context provided does not contain examples of ARM templates for creating virtual machines in Azu...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: This template allows you to deploy a VM using a operating system t...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.0
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 0.0, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating mpnet:  27%|‚ñà‚ñà‚ñã       | 8/30 [01:44<04:17, 11.70s/it]

üîÑ Loading sentence-transformers/multi-qa-mpnet-base-dot-v1 on GPU...
üîç LLM ranking response: '1. 5
2. 3
3. 10
4. 1
5. 2'
üîç Parsed numbers: [0, 4, 1, 2, 2, 9, 3, 0, 4, 1]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: Can a Group owner update Usage Location of sub group member ? Can a Group owner update Usage Locatio...
  Answer: No, the provided context does not contain enough information to determine if User 3 can update the u...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: or display name. Access to group information, including groups mem...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 1.0
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 1.0, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating mpnet:  30%|‚ñà‚ñà‚ñà       | 9/30 [01:53<03:48, 10.89s/it]

üîÑ Loading sentence-transformers/multi-qa-mpnet-base-dot-v1 on GPU...
üîç LLM ranking response: '1, 2, 4, 5, 6, 3, 7, 8, 9, 10'
üîç Parsed numbers: [0, 1, 3, 4, 5, 2, 6, 7, 8, 9]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: Is Azure Private DNS a regional service ? Will Azure Private DNS service still work in an event of a...
  Answer: Azure Private DNS is not a regional service, as it is resilient to regional outages and the zone dat...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: DNS private zones are resilient to regional outages because zone d...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.8
‚úÖ Extracted answer_relevancy: 0.9139335669651812
‚úÖ Successfully extracted scores: {'faithfulness': 0.8, 'answer_relevancy': 0.9139335669651812}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating mpnet:  33%|‚ñà‚ñà‚ñà‚ñé      | 10/30 [02:06<03:48, 11.45s/it]

üîÑ Loading sentence-transformers/multi-qa-mpnet-base-dot-v1 on GPU...
üîç LLM ranking response: '1. 6. This article explains how you can use Azure Files and Azure File Sync to replace or extend your on-premises Windows file servers to reduce total cost of ownership (TCO), increase flexibility, and simp...
2. 1'
üîç Parsed numbers: [0, 5, 1, 0]
‚úÖ Reranked 4/10 documents
üîç RAGAS Input Validation:
  Question: Migrate File server to Azure Hi! There is a File server on premise (3.5TB). 80 users accessing it. W...
  Answer: Based on the provided context, the information is not sufficient to provide a definitive answer to t...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: you need : Azure Files supports integrations with value-added serv...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.5
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 0.5, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating mpnet:  37%|‚ñà‚ñà‚ñà‚ñã      | 11/30 [02:20<03:53, 12.29s/it]

üîÑ Loading sentence-transformers/multi-qa-mpnet-base-dot-v1 on GPU...
üîç LLM ranking response: '1. 7
2. 5
3. 1
4. 8
5. 2
6. 10
7. 3
8. 6
9. 9'
üîç Parsed numbers: [0, 6, 1, 4, 2, 0, 3, 7, 4, 1, 5, 9, 6, 2, 7, 5, 8, 8]
‚úÖ Reranked 18/10 documents
üîç RAGAS Input Validation:
  Question: What permissions do I need to only run pipelines in Synapse Workspace? Hi,
I've tried several combin...
  Answer: Based on the provided context, the user would need the "Synapse Compute Operator" role in the Synaps...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: to dedicated SQL pools, the Active Directory Admin or Synapse Admi...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.3333333333333333
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 0.3333333333333333, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating mpnet:  40%|‚ñà‚ñà‚ñà‚ñà      | 12/30 [02:31<03:33, 11.89s/it]

üîÑ Loading sentence-transformers/multi-qa-mpnet-base-dot-v1 on GPU...
üîç LLM ranking response: '2, 3, 5, 9, 10, 1, 6, 7, 8, 4'
üîç Parsed numbers: [1, 2, 4, 8, 9, 0, 5, 6, 7, 3]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: Need to understand managed storage account config of Azure databricks I would like to know what is b...
  Answer: The provided context does not contain enough information to answer the question about the managed st...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: This article details locations where Azure Databricks writes data ...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 1.0
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 1.0, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating mpnet:  43%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 13/30 [02:43<03:23, 11.96s/it]

üîÑ Loading sentence-transformers/multi-qa-mpnet-base-dot-v1 on GPU...
üîç LLM ranking response: '8, 3, 1, 2, 4, 5, 10, 6, 7, 9'
üîç Parsed numbers: [7, 2, 0, 1, 3, 4, 9, 5, 6, 8]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: Synapse. How do you write a Stored Procedure to create External tables? Hi again,
Still trying to ge...
  Answer: Based on the provided context, it is not explicitly mentioned how to write a stored procedure to cre...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: Synapse SQL provisioned and serverless pools enable you to place c...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.5
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 0.5, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating mpnet:  47%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 14/30 [02:58<03:23, 12.75s/it]

üîÑ Loading sentence-transformers/multi-qa-mpnet-base-dot-v1 on GPU...
üîç LLM ranking response: '1, 5, 3, 4, 7, 8, 2, 6, 9, 10'
üîç Parsed numbers: [0, 4, 2, 3, 6, 7, 1, 5, 8, 9]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: Recover an SQL server within a deleted resource group Hi,
I have accidently deleted a resource group...
  Answer: Based on the provided context, it is not clear if an SQL server within a deleted resource group can ...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: When an Azure Database for MySQL Flexible Server instance is delet...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.8
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 0.8, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating mpnet:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 15/30 [03:11<03:16, 13.08s/it]

üîÑ Loading sentence-transformers/multi-qa-mpnet-base-dot-v1 on GPU...
üîç LLM ranking response: '2, 6, 1, 5, 7'
üîç Parsed numbers: [1, 5, 0, 4, 6]
‚úÖ Reranked 5/10 documents
üîç RAGAS Input Validation:
  Question: datafactory broadcast error hello,
we have several environments. dev, test, ..., prod
In ADF, in env...
  Answer: The error message indicates that the job failed due to a broadcast join timeout error at the sink 's...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: Invalid schema ' <schemaName> ' specified in the pipeline setting....
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.0
‚úÖ Extracted answer_relevancy: 0.7379926392069315
‚úÖ Successfully extracted scores: {'faithfulness': 0.0, 'answer_relevancy': 0.7379926392069315}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating mpnet:  53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 16/30 [03:26<03:08, 13.46s/it]

üîÑ Loading sentence-transformers/multi-qa-mpnet-base-dot-v1 on GPU...
üîç LLM ranking response: '1. The following table lists the metrics available for the Microsoft.Network/networkinterfaces resource type. Table headings Metric - The metric display name as it appears in the Azure portal. Name in Re...
2. The following table lists the metrics available for the Microsoft'
üîç Parsed numbers: [0, 1]
‚úÖ Reranked 2/10 documents
üîç RAGAS Input Validation:
  Question: VM Metrics - Network In/Out Billable(Deprecated) differences with Network In/Out Total? I was diggin...
  Answer: Based on the provided context, there is not enough information to accurately answer the question abo...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: The following table lists the metrics available for the Microsoft....
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 1.0
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 1.0, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating mpnet:  57%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 17/30 [03:39<02:53, 13.32s/it]

üîÑ Loading sentence-transformers/multi-qa-mpnet-base-dot-v1 on GPU...
üîç LLM ranking response: '1, 3, 5, 6, 8'
üîç Parsed numbers: [0, 2, 4, 5, 7]
‚úÖ Reranked 5/10 documents
üîç RAGAS Input Validation:
  Question: send windows OS logs to Event Hub I would like to see more details than the information on this page...
  Answer: The provided context does not contain enough information to answer the question....
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: Azure diagnostics extension is an agent in Azure Monitor that coll...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.0
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 0.0, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating mpnet:  60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 18/30 [03:47<02:22, 11.86s/it]

üîÑ Loading sentence-transformers/multi-qa-mpnet-base-dot-v1 on GPU...
üîç LLM ranking response: '1, 5, 6, 2, 3, 4, 7, 8, 9, 10'
üîç Parsed numbers: [0, 4, 5, 1, 2, 3, 6, 7, 8, 9]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: How to update a particular extension of an Azure Function? After updating the host.json to include t...
  Answer: To update a particular extension of an Azure Function from the Azure Portal, you can follow these st...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: } } } To update your project to Azure Functions 4.x: Update your l...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.0
‚úÖ Extracted answer_relevancy: 0.894805283564025
‚úÖ Successfully extracted scores: {'faithfulness': 0.0, 'answer_relevancy': 0.894805283564025}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating mpnet:  63%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 19/30 [04:12<02:52, 15.68s/it]

üîÑ Loading sentence-transformers/multi-qa-mpnet-base-dot-v1 on GPU...
üîç LLM ranking response: '1, 2, 3, 5, 7, 8, 4, 10, 6, 9'
üîç Parsed numbers: [0, 1, 2, 4, 6, 7, 3, 9, 5, 8]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: Need simple way-How to migrate On-premise window server AD to Azure Window server AD? Need simple wa...
  Answer: Based on the provided context, it seems that the simplest way to migrate an on-premises Windows Serv...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: This article describes options for migrating on-premises machines ...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.6
‚úÖ Extracted answer_relevancy: 0.9078646511722145
‚úÖ Successfully extracted scores: {'faithfulness': 0.6, 'answer_relevancy': 0.9078646511722145}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating mpnet:  67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 20/30 [04:25<02:30, 15.00s/it]

üîÑ Loading sentence-transformers/multi-qa-mpnet-base-dot-v1 on GPU...
üîç LLM ranking response: '1. 3
2. 4
3. 8
4. 7
5. 10
6. 6
7. 9
8. 5
9. 1
10. 2'
üîç Parsed numbers: [0, 2, 1, 3, 2, 7, 3, 6, 4, 9, 5, 5, 6, 8, 7, 4, 8, 0, 9, 1]
‚úÖ Reranked 20/10 documents
üîç RAGAS Input Validation:
  Question: problem with WaitUntil i keep getting this error when i call WaitUntil.OperationSucceeds...
  Answer: The provided context does not contain enough information to answer the question about the error rela...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: < DateTime.fromJSDate(expiryTime)) { const jobStatus = yield conte...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.6666666666666666
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 0.6666666666666666, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating mpnet:  70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 21/30 [04:39<02:12, 14.78s/it]

üîÑ Loading sentence-transformers/multi-qa-mpnet-base-dot-v1 on GPU...
üîç LLM ranking response: '1, 2, 3, 6, 7, 8, 9, 4, 5, 10'
üîç Parsed numbers: [0, 1, 2, 5, 6, 7, 8, 3, 4, 9]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: Having problem with Azure sandbox storage account Hi,
I am having issue to access to storage account...
  Answer: Based on the provided context, it seems like the issue could be related to signing in to the Azure s...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: Sign-in is the recommended way to access your Azure storage resour...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.25
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 0.25, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating mpnet:  73%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé  | 22/30 [04:52<01:52, 14.05s/it]

üîÑ Loading sentence-transformers/multi-qa-mpnet-base-dot-v1 on GPU...
üîç LLM ranking response: '1. 3
2. 1
3. 4
4. 5
5. 6
6. 8
7. 9
8. 2
9. 7
10. 10'
üîç Parsed numbers: [0, 2, 1, 0, 2, 3, 3, 4, 4, 5, 5, 7, 6, 8, 7, 1, 8, 6, 9, 9]
‚úÖ Reranked 20/10 documents
üîç RAGAS Input Validation:
  Question: Custom Eventhub APIM Policy How to  log   through a custom event hub policy.
For reference :...
  Answer: The provided context does not contain enough information to answer the question about how to log thr...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: APPLIES TO: All API Management tiers The log-to-eventhub policy se...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.0
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 0.0, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating mpnet:  77%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 23/30 [05:03<01:31, 13.07s/it]

üîÑ Loading sentence-transformers/multi-qa-mpnet-base-dot-v1 on GPU...
üîç LLM ranking response: '1, 2, 4, 5, 6, 3, 7, 8, 9, 10'
üîç Parsed numbers: [0, 1, 3, 4, 5, 2, 6, 7, 8, 9]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: Hybrid Runbook worker jobs fail when running three simultaneously When running a couple of Hybrid Ru...
  Answer: Based on the provided context, it is not possible to determine if the issue with the Hybrid Runbook ...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: rm mrseq Install the Hybrid Worker Extension: Bash Copy ./extensio...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.5
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 0.5, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating mpnet:  80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 24/30 [05:19<01:24, 14.02s/it]

üîÑ Loading sentence-transformers/multi-qa-mpnet-base-dot-v1 on GPU...
üîç LLM ranking response: '2, 1, 5, 3, 4, 7, 8, 9, 6, 10'
üîç Parsed numbers: [1, 0, 4, 2, 3, 6, 7, 8, 5, 9]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: Not able to execute Get-AzStorageAccount, though module is installed. PowerShell
Copy
Code:
$saConte...
  Answer: The issue might be related to the way the module is being loaded or imported in the runbook. The err...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: Azure CLI Copy $SomeVariable = add -pnplistitem .... if ( $SomeVar...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.6666666666666666
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 0.6666666666666666, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating mpnet:  83%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 25/30 [05:36<01:15, 15.05s/it]

üîÑ Loading sentence-transformers/multi-qa-mpnet-base-dot-v1 on GPU...
üîç LLM ranking response: '1, 3, 4, 6, 7, 9, 10, 8, 2, 5'
üîç Parsed numbers: [0, 2, 3, 5, 6, 8, 9, 7, 1, 4]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: Witch Azure services i need to build up model connectible to Power BI (live connection) Hello.
I wou...
  Answer: Based on the provided context, you would need Azure Databricks to build up a model connectible to Po...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: Close . Update credentials in Excel Open Excel and select the Data...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.0
‚úÖ Extracted answer_relevancy: 0.8995713535199031
‚úÖ Successfully extracted scores: {'faithfulness': 0.0, 'answer_relevancy': 0.8995713535199031}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating mpnet:  87%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 26/30 [05:44<00:51, 12.93s/it]

üîÑ Loading sentence-transformers/multi-qa-mpnet-base-dot-v1 on GPU...
üîç LLM ranking response: '1. 2
2. 4
3. 8
4. 6
5. 9
6. 7
7. 10
8. 1'
üîç Parsed numbers: [0, 1, 1, 3, 2, 7, 3, 5, 4, 8, 5, 6, 6, 9, 7, 0]
‚úÖ Reranked 16/10 documents
üîç RAGAS Input Validation:
  Question: vHub SDWAn BGP (increase vHub Routing units) Hello All,
When we deploy SDWAN NVA in the vHub is buil...
  Answer: Based on the provided context, it seems that the SDWAN NVA in the vHub is able to increase the capac...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: Next hop Origin ASN path 10.2.0.0/24 eastusconn VNet connection ID...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.25
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 0.25, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating mpnet:  90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 27/30 [06:00<00:40, 13.63s/it]

üîÑ Loading sentence-transformers/multi-qa-mpnet-base-dot-v1 on GPU...
üîç LLM ranking response: '1. 10
2. 9
3. 8
4. 7
5. 6
6. 5
7. 4
8. 3
9. 2
10. 1'
üîç Parsed numbers: [0, 9, 1, 8, 2, 7, 3, 6, 4, 5, 5, 4, 6, 3, 7, 2, 8, 1, 9, 0]
‚úÖ Reranked 20/10 documents
üîç RAGAS Input Validation:
  Question: Azure Openai Python Chatbot using excel data to identify classification search for sourcing manager ...
  Answer: Based on the provided context, it is not clear if the specific requirements mentioned in the questio...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: Azure OpenAI No No No Yes 3 Yes 4 Yes 5 Integration with Semantic ...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 1.0
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 1.0, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating mpnet:  93%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 28/30 [06:15<00:28, 14.12s/it]

üîÑ Loading sentence-transformers/multi-qa-mpnet-base-dot-v1 on GPU...
üîç LLM ranking response: '1. 4. ); } Response - authorize app JSON Copy { "access_token" : { access token for the user }, "token_type" : { type of token }, "expires_in" : { time in seconds that the token remains valid'
üîç Parsed numbers: [0, 3]
‚úÖ Reranked 2/10 documents
üîç RAGAS Input Validation:
  Question: ADF Copy activity pagination how to pass variable value to Header of next pagination call I have a p...
  Answer: The provided context does not contain enough information to answer the question....
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: name of the variable used for the storage container SAS token that...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.0
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 0.0, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating mpnet:  97%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã| 29/30 [06:24<00:12, 12.63s/it]

üîÑ Loading sentence-transformers/multi-qa-mpnet-base-dot-v1 on GPU...
üîç LLM ranking response: '1, 2, 7, 6, 3, 4, 5, 8, 9, 10'
üîç Parsed numbers: [0, 1, 6, 5, 2, 3, 4, 7, 8, 9]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: create multiple symmetric keys in azure dps What is the process to create multiple symmetric keys in...
  Answer: Based on the provided context, the process to create multiple symmetric keys in group enrollments in...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: that demonstrates how to create and use enrollment groups with sym...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.8
‚úÖ Extracted answer_relevancy: 0.9755976104055937
‚úÖ Successfully extracted scores: {'faithfulness': 0.8, 'answer_relevancy': 0.9755976104055937}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating mpnet: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [06:39<00:00, 13.31s/it]


‚úÖ mpnet completed: 30 questions evaluated
ü§ñ RAG metrics: 30/30 successful
üìä Average Faithfulness: 0.417
üìä Average Relevance: 0.237

üéØ Evaluating model: minilm
üîÑ Loading /content/drive/MyDrive/TesisMagister/acumulative/colab_data/docs_minilm_with_embeddings_20250721_125846.parquet...
‚úÖ 187,031 docs, 384 dims
üîÑ Using query model: sentence-transformers/all-MiniLM-L6-v2
üîÑ Loading sentence-transformers/all-MiniLM-L6-v2 on GPU...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

‚úÖ Dimension match: 384 == 384

üöÄ Starting evaluation for 30 questions...


Evaluating minilm:   0%|          | 0/30 [00:00<?, ?it/s]

üîÑ Loading sentence-transformers/all-MiniLM-L6-v2 on GPU...
üîç LLM ranking response: '1, 9, 10, 2, 3, 4, 5, 6, 7, 8'
üîç Parsed numbers: [0, 8, 9, 1, 2, 3, 4, 5, 6, 7]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: Postgres flexible server RBAC I would like to setup RBAC to grant at subscription level can anyone p...
  Answer: The provided context does not contain information specifically related to setting up RBAC for Postgr...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: Define role-based access control (RBAC) for CloudSimple Service, C...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 1.0
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 1.0, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating minilm:   3%|‚ñé         | 1/30 [00:09<04:46,  9.87s/it]

üîÑ Loading sentence-transformers/all-MiniLM-L6-v2 on GPU...
üîç LLM ranking response: '2, 3, 5, 6, 1, 7, 8, 9, 4, 10'
üîç Parsed numbers: [1, 2, 4, 5, 0, 6, 7, 8, 3, 9]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: VM unable to connect to endpoint over IPsec tunnel Hello reader,
I have set up a S2S connection betw...
  Answer: Based on the provided context, it seems like the issue might be related to the configuration or sett...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: gateway. ExtendedLocation id Resource ID. string location Resource...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.0
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 0.0, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating minilm:   7%|‚ñã         | 2/30 [00:21<05:02, 10.80s/it]

üîÑ Loading sentence-transformers/all-MiniLM-L6-v2 on GPU...
üîç LLM ranking response: '1, 2, 3, 4, 5, 6, 7, 8, 9, 10'
üîç Parsed numbers: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: Integration between Azure DevOps and Azure Cloud I'm working with Azure DevOps and Azure Cloud, both...
  Answer: The provided context does not contain enough information to answer the question....
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: This how-to guide helps you use Azure Pipelines to set up continuo...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.0
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 0.0, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating minilm:  10%|‚ñà         | 3/30 [00:29<04:24,  9.80s/it]

üîÑ Loading sentence-transformers/all-MiniLM-L6-v2 on GPU...
üîç LLM ranking response: '9, 10, 1, 2, 5, 6, 3, 4, 8'
üîç Parsed numbers: [8, 9, 0, 1, 4, 5, 2, 3, 7]
‚úÖ Reranked 9/10 documents
üîç RAGAS Input Validation:
  Question: Public IP Azure Hi All
I have a dns server witch need a public IP to operate. I have configured a Pu...
  Answer: Based on the provided context, it seems like the individual is looking for information on how to con...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: In this quickstart, you learn how to create an Azure public IP add...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.2
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 0.2, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating minilm:  13%|‚ñà‚ñé        | 4/30 [00:42<04:47, 11.07s/it]

üîÑ Loading sentence-transformers/all-MiniLM-L6-v2 on GPU...
üîç LLM ranking response: '7, 10, 1, 2, 3, 4, 5, 8, 9'
üîç Parsed numbers: [6, 9, 0, 1, 2, 3, 4, 7, 8]
‚úÖ Reranked 9/10 documents
üîç RAGAS Input Validation:
  Question: Failed to upgrade node pool Failed to upgrade node pool 'devqanp02' in Kubernetes service 'DevQA-K8S...
  Answer: The error message indicates that the number of pre-allocated IPs (333) exceeds the number of availab...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: This article provides guidance on IP address planning for Azure Ku...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.0
‚úÖ Extracted answer_relevancy: 0.8625737622580224
‚úÖ Successfully extracted scores: {'faithfulness': 0.0, 'answer_relevancy': 0.8625737622580224}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating minilm:  17%|‚ñà‚ñã        | 5/30 [00:57<05:04, 12.16s/it]

üîÑ Loading sentence-transformers/all-MiniLM-L6-v2 on GPU...
üîç LLM ranking response: '3, 5, 6, 1, 2, 4, 7, 8, 9, 10'
üîç Parsed numbers: [2, 4, 5, 0, 1, 3, 6, 7, 8, 9]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: Error generating map - Azure Migrate agentless dependencies visualization map Hi, In Azure Migrate, ...
  Answer: Based on the provided context, the error message "Error generating map. Please contact support." ind...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: This article helps you troubleshoot issues with agent-based and ag...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.3333333333333333
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 0.3333333333333333, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating minilm:  20%|‚ñà‚ñà        | 6/30 [01:09<04:51, 12.13s/it]

üîÑ Loading sentence-transformers/all-MiniLM-L6-v2 on GPU...
üîç LLM ranking response: 'Based on the relevance of the information provided, the ranking of the solutions to fix the error while creating a firewall in Azure Portal would be as follows:

1. The error message "link between vnet-1 and vnet-private-endpoint doesn't'
üîç Parsed numbers: [0, 0]
‚úÖ Reranked 2/10 documents
üîç RAGAS Input Validation:
  Question: how to fix below rror while creating firewall in azure portal Hi ,
Im getting below error even thoug...
  Answer: Based on the provided context, it seems like the error might be related to the subnet configuration ...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: tab, select Next . For IPv4 Address space , delete the default add...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.3333333333333333
‚úÖ Extracted answer_relevancy: 0.8884834122530704
‚úÖ Successfully extracted scores: {'faithfulness': 0.3333333333333333, 'answer_relevancy': 0.8884834122530704}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating minilm:  23%|‚ñà‚ñà‚ñé       | 7/30 [01:24<05:02, 13.15s/it]

üîÑ Loading sentence-transformers/all-MiniLM-L6-v2 on GPU...
üîç LLM ranking response: '2, 3, 10, 1, 5, 8, 4, 6, 7, 9'
üîç Parsed numbers: [1, 2, 9, 0, 4, 7, 3, 5, 6, 8]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: ARM templates para cria√ß√£o de m√°quinas virtuais no Azure Me d√™ exemplos de ARM Templaytes p/ criar m...
  Answer: N√£o h√° informa√ß√µes suficientes no contexto fornecido para responder √† pergunta....
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: This tutorial introduces you to Azure Resource Manager templates (...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.0
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 0.0, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating minilm:  27%|‚ñà‚ñà‚ñã       | 8/30 [01:32<04:14, 11.58s/it]

üîÑ Loading sentence-transformers/all-MiniLM-L6-v2 on GPU...
üîç LLM ranking response: '3, 5, 6, 1, 2, 4, 7, 8, 9, 10'
üîç Parsed numbers: [2, 4, 5, 0, 1, 3, 6, 7, 8, 9]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: Can a Group owner update Usage Location of sub group member ? Can a Group owner update Usage Locatio...
  Answer: Based on the provided context, it is not explicitly stated whether a Group owner can update the Usag...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: , Entitlement management , Lifecycle workflows and Access reviews ...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 1.0
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 1.0, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating minilm:  30%|‚ñà‚ñà‚ñà       | 9/30 [01:43<03:57, 11.31s/it]

üîÑ Loading sentence-transformers/all-MiniLM-L6-v2 on GPU...
üîç LLM ranking response: '1, 2, 6, 8, 10, 3, 5, 4, 7, 9'
üîç Parsed numbers: [0, 1, 5, 7, 9, 2, 4, 3, 6, 8]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: Is Azure Private DNS a regional service ? Will Azure Private DNS service still work in an event of a...
  Answer: Azure Private DNS is not a regional service, as it is designed to be resilient to regional outages. ...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: DNS private zones are resilient to regional outages because zone d...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.8333333333333334
‚úÖ Extracted answer_relevancy: 0.9139267439075455
‚úÖ Successfully extracted scores: {'faithfulness': 0.8333333333333334, 'answer_relevancy': 0.9139267439075455}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating minilm:  33%|‚ñà‚ñà‚ñà‚ñé      | 10/30 [01:57<04:06, 12.33s/it]

üîÑ Loading sentence-transformers/all-MiniLM-L6-v2 on GPU...
üîç LLM ranking response: '1, 2, 5, 6, 7, 8, 3, 4, 9, 10'
üîç Parsed numbers: [0, 1, 4, 5, 6, 7, 2, 3, 8, 9]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: Migrate File server to Azure Hi! There is a File server on premise (3.5TB). 80 users accessing it. W...
  Answer: Based on the provided context, the information given is correct regarding the options for migrating ...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: Tutorial: Create and manage a VPN gateway using Azure portal Azure...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.0
‚úÖ Extracted answer_relevancy: 0.8900082198852451
‚úÖ Successfully extracted scores: {'faithfulness': 0.0, 'answer_relevancy': 0.8900082198852451}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating minilm:  37%|‚ñà‚ñà‚ñà‚ñã      | 11/30 [02:13<04:12, 13.30s/it]

üîÑ Loading sentence-transformers/all-MiniLM-L6-v2 on GPU...
üîç LLM ranking response: '1. 8
2. 1
3. 9
4. 5
5. 7
6. 6
7. 3
8. 2
9. 4
10. 10'
üîç Parsed numbers: [0, 7, 1, 0, 2, 8, 3, 4, 4, 6, 5, 5, 6, 2, 7, 1, 8, 3, 9, 9]
‚úÖ Reranked 20/10 documents
üîç RAGAS Input Validation:
  Question: What permissions do I need to only run pipelines in Synapse Workspace? Hi,
I've tried several combin...
  Answer: Based on the provided context, the user already has the "Synapse Compute Operator" role which includ...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: workspace1_SynapseContributors, and workspace1_SynapseComputeOpera...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.25
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 0.25, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating minilm:  40%|‚ñà‚ñà‚ñà‚ñà      | 12/30 [02:26<03:56, 13.12s/it]

üîÑ Loading sentence-transformers/all-MiniLM-L6-v2 on GPU...
üîç LLM ranking response: '1. 2
2. 1
3. 7
4. 8
5. 9
6. 6
7. 3
8. 4
9. 5'
üîç Parsed numbers: [0, 1, 1, 0, 2, 6, 3, 7, 4, 8, 5, 5, 6, 2, 7, 3, 8, 4]
‚úÖ Reranked 18/10 documents
üîç RAGAS Input Validation:
  Question: Need to understand managed storage account config of Azure databricks I would like to know what is b...
  Answer: The provided context does not contain information relevant to the question about the managed storage...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: Azure Databricks historically used directories in the workspace ro...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.5
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 0.5, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating minilm:  43%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 13/30 [02:36<03:28, 12.26s/it]

üîÑ Loading sentence-transformers/all-MiniLM-L6-v2 on GPU...
üîç LLM ranking response: '2, 9, 1, 5, 4, 3, 6, 10, 7, 8'
üîç Parsed numbers: [1, 8, 0, 4, 3, 2, 5, 9, 6, 7]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: Synapse. How do you write a Stored Procedure to create External tables? Hi again,
Still trying to ge...
  Answer: In Synapse SQL provisioned and serverless pools, you can write a stored procedure to create external...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: In this section, you'll learn how to create and use native externa...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.6666666666666666
‚úÖ Extracted answer_relevancy: 0.8600893896572407
‚úÖ Successfully extracted scores: {'faithfulness': 0.6666666666666666, 'answer_relevancy': 0.8600893896572407}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating minilm:  47%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 14/30 [02:50<03:25, 12.82s/it]

üîÑ Loading sentence-transformers/all-MiniLM-L6-v2 on GPU...
üîç LLM ranking response: '1, 3, 4, 5, 2, 6, 8, 7, 9, 10'
üîç Parsed numbers: [0, 2, 3, 4, 1, 5, 7, 6, 8, 9]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: Recover an SQL server within a deleted resource group Hi,
I have accidently deleted a resource group...
  Answer: Based on the provided context, it is not clear if an SQL server within a deleted resource group can ...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: Azure DevOps Services After you delete an organization, it's disab...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.6
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 0.6, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating minilm:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 15/30 [03:03<03:14, 12.99s/it]

üîÑ Loading sentence-transformers/all-MiniLM-L6-v2 on GPU...
üîç LLM ranking response: '9, 1, 3, 6, 10, 2, 5, 4, 8, 7'
üîç Parsed numbers: [8, 0, 2, 5, 9, 1, 4, 3, 7, 6]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: datafactory broadcast error hello,
we have several environments. dev, test, ..., prod
In ADF, in env...
  Answer: The error message indicates that the pipeline is failing due to a Broadcast join timeout error in th...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: Use the following strategies to optimize performance of transforma...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.0
‚úÖ Extracted answer_relevancy: 0.764619293240632
‚úÖ Successfully extracted scores: {'faithfulness': 0.0, 'answer_relevancy': 0.764619293240632}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating minilm:  53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 16/30 [03:17<03:04, 13.21s/it]

üîÑ Loading sentence-transformers/all-MiniLM-L6-v2 on GPU...
üîç LLM ranking response: '1. Yes Network In Billable (Deprecated) The number of billable bytes received on all network interfaces by the Virtual Machine(s) (Incoming Traffic) (Deprecated) Network In Bytes Total (Sum) <none> PT1M ...
2.'
üîç Parsed numbers: [0, 0, 1]
‚úÖ Reranked 3/10 documents
üîç RAGAS Input Validation:
  Question: VM Metrics - Network In/Out Billable(Deprecated) differences with Network In/Out Total? I was diggin...
  Answer: Based on the provided context, the answer cannot be determined as the information provided is about ...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: Yes Network In Billable (Deprecated) The number of billable bytes ...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.2
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 0.2, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating minilm:  57%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 17/30 [03:34<03:06, 14.31s/it]

üîÑ Loading sentence-transformers/all-MiniLM-L6-v2 on GPU...
üîç LLM ranking response: '1, 2, 9, 10, 3, 4, 5, 6, 7, 8'
üîç Parsed numbers: [0, 1, 8, 9, 2, 3, 4, 5, 6, 7]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: send windows OS logs to Event Hub I would like to see more details than the information on this page...
  Answer: The provided context does not contain enough information to answer the question....
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: Azure diagnostics extension is an agent in Azure Monitor that coll...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.0
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 0.0, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating minilm:  60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 18/30 [03:45<02:41, 13.42s/it]

üîÑ Loading sentence-transformers/all-MiniLM-L6-v2 on GPU...
üîç LLM ranking response: '1, 7, 10, 3, 5, 6, 8, 4, 2, 9'
üîç Parsed numbers: [0, 6, 9, 2, 4, 5, 7, 3, 1, 8]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: How to update a particular extension of an Azure Function? After updating the host.json to include t...
  Answer: To update a particular extension of an Azure Function from the Azure Portal, you can follow these st...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: the host.json , open the file in editor, and then replace the exis...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.1
‚úÖ Extracted answer_relevancy: 0.8936318227625629
‚úÖ Successfully extracted scores: {'faithfulness': 0.1, 'answer_relevancy': 0.8936318227625629}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating minilm:  63%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 19/30 [04:08<02:59, 16.33s/it]

üîÑ Loading sentence-transformers/all-MiniLM-L6-v2 on GPU...
üîç LLM ranking response: '1, 9, 2, 3, 5, 4, 7, 8, 6, 10'
üîç Parsed numbers: [0, 8, 1, 2, 4, 3, 6, 7, 5, 9]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: Need simple way-How to migrate On-premise window server AD to Azure Window server AD? Need simple wa...
  Answer: To migrate an on-premise Windows Server Active Directory to Azure Windows Server Active Directory, y...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: This article describes how to upgrade Windows Server OS while migr...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.25
‚úÖ Extracted answer_relevancy: 0.90154831770856
‚úÖ Successfully extracted scores: {'faithfulness': 0.25, 'answer_relevancy': 0.90154831770856}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating minilm:  67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 20/30 [04:25<02:43, 16.32s/it]

üîÑ Loading sentence-transformers/all-MiniLM-L6-v2 on GPU...
üîç LLM ranking response: '1. 2
2. 6
3. 5
4. 10
5. 8
6. 7
7. 3
8. 9
9. 1
10. 4'
üîç Parsed numbers: [0, 1, 1, 5, 2, 4, 3, 9, 4, 7, 5, 6, 6, 2, 7, 8, 8, 0, 9, 3]
‚úÖ Reranked 20/10 documents
üîç RAGAS Input Validation:
  Question: problem with WaitUntil i keep getting this error when i call WaitUntil.OperationSucceeds...
  Answer: The provided context does not contain information related to the error message mentioned in the ques...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: "<QUEUE-NAME>" , new ServiceBusProcessorOptions()); try { // add h...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.0
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 0.0, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating minilm:  70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 21/30 [04:33<02:04, 13.83s/it]

üîÑ Loading sentence-transformers/all-MiniLM-L6-v2 on GPU...
üîç LLM ranking response: '2, 5, 1, 7, 10, 3, 4, 6, 8, 9'
üîç Parsed numbers: [1, 4, 0, 6, 9, 2, 3, 5, 7, 8]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: Having problem with Azure sandbox storage account Hi,
I am having issue to access to storage account...
  Answer: Based on the provided context, it seems like the issue with accessing the Azure storage account in t...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: storage account Sign-in to the Azure portal . In the search box in...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.25
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 0.25, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating minilm:  73%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé  | 22/30 [04:47<01:51, 13.91s/it]

üîÑ Loading sentence-transformers/all-MiniLM-L6-v2 on GPU...
üîç LLM ranking response: '2, 1, 10, 3, 4, 5, 6, 7, 8, 9'
üîç Parsed numbers: [1, 0, 9, 2, 3, 4, 5, 6, 7, 8]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: Custom Eventhub APIM Policy How to  log   through a custom event hub policy.
For reference :...
  Answer: Based on the provided context, the question is asking for information on how to log through a custom...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: and Event Hubs integration How to log events to Azure Event Hubs i...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 1.0
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 1.0, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating minilm:  77%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 23/30 [04:58<01:31, 13.12s/it]

üîÑ Loading sentence-transformers/all-MiniLM-L6-v2 on GPU...
üîç LLM ranking response: '1, 5, 2, 4, 3, 6, 7, 8, 9, 10'
üîç Parsed numbers: [0, 4, 1, 3, 2, 5, 6, 7, 8, 9]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: Hybrid Runbook worker jobs fail when running three simultaneously When running a couple of Hybrid Ru...
  Answer: Based on the provided context, it is not possible to determine if the issue with the Hybrid Runbook ...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: Worker Group. Microsoft.Automation/automationAccounts/hybridRunboo...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 1.0
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 1.0, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating minilm:  80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 24/30 [05:12<01:19, 13.33s/it]

üîÑ Loading sentence-transformers/all-MiniLM-L6-v2 on GPU...
üîç LLM ranking response: '2, 1, 5, 6, 3, 4, 7, 8, 9, 10'
üîç Parsed numbers: [1, 0, 4, 5, 2, 3, 6, 7, 8, 9]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: Not able to execute Get-AzStorageAccount, though module is installed. PowerShell
Copy
Code:
$saConte...
  Answer: The issue is that the Get-AzStorageTable cmdlet is not recognized, even though both Get-AzStorageAcc...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: for use in the rest of Select-AzSubscription -Subscription $subscr...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.0
‚úÖ Extracted answer_relevancy: 0.8565073426686259
‚úÖ Successfully extracted scores: {'faithfulness': 0.0, 'answer_relevancy': 0.8565073426686259}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating minilm:  83%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 25/30 [05:26<01:07, 13.44s/it]

üîÑ Loading sentence-transformers/all-MiniLM-L6-v2 on GPU...
üîç LLM ranking response: '1, 2, 8, 9, 6, 10, 3, 5, 7, 4'
üîç Parsed numbers: [0, 1, 7, 8, 5, 9, 2, 4, 6, 3]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: Witch Azure services i need to build up model connectible to Power BI (live connection) Hello.
I wou...
  Answer: Based on the provided context, you would need Azure Database for PostgreSQL - Flexible Server to con...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: APPLIES TO: Azure Database for PostgreSQL - Flexible Server In thi...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.75
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 0.75, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating minilm:  87%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 26/30 [05:42<00:57, 14.42s/it]

üîÑ Loading sentence-transformers/all-MiniLM-L6-v2 on GPU...
üîç LLM ranking response: '2, 5, 1, 4, 3, 6, 7, 8, 9, 10'
üîç Parsed numbers: [1, 4, 0, 3, 2, 5, 6, 7, 8, 9]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: vHub SDWAn BGP (increase vHub Routing units) Hello All,
When we deploy SDWAN NVA in the vHub is buil...
  Answer: Based on the provided context, it is not clear whether the SDWAN NVA in the vHub should build additi...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: Azure Virtual WAN hub router, also called virtual hub router, acts...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.5
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 0.5, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating minilm:  90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 27/30 [05:54<00:40, 13.52s/it]

üîÑ Loading sentence-transformers/all-MiniLM-L6-v2 on GPU...
üîç LLM ranking response: '1. 6
2. 7
3. 8
4. 10
5. 9'
üîç Parsed numbers: [0, 5, 1, 6, 2, 7, 3, 9, 4, 8]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: Azure Openai Python Chatbot using excel data to identify classification search for sourcing manager ...
  Answer: Based on the provided context, it is not clear if the Azure OpenAI Python chatbot can use Excel data...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: This article shows you how to deploy and run the chat app with you...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 1.0
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 1.0, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating minilm:  93%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 28/30 [06:18<00:33, 16.84s/it]

üîÑ Loading sentence-transformers/all-MiniLM-L6-v2 on GPU...
üîç LLM ranking response: '2, 4, 8, 1, 3, 5, 6, 7, 9, 10'
üîç Parsed numbers: [1, 3, 7, 0, 2, 4, 5, 6, 8, 9]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: ADF Copy activity pagination how to pass variable value to Header of next pagination call I have a p...
  Answer: Based on the provided context, it seems that the individual is having trouble passing the latest val...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: header: HTTP Copy GET /tasks Host : mytaskwebapi.com Authorization...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.0
‚úÖ Extracted answer_relevancy: 0.8397241624146489
‚úÖ Successfully extracted scores: {'faithfulness': 0.0, 'answer_relevancy': 0.8397241624146489}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating minilm:  97%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã| 29/30 [06:36<00:16, 16.94s/it]

üîÑ Loading sentence-transformers/all-MiniLM-L6-v2 on GPU...
üîç LLM ranking response: '1, 6, 2, 3, 7, 8, 4, 5, 9, 10'
üîç Parsed numbers: [0, 5, 1, 2, 6, 7, 3, 4, 8, 9]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: create multiple symmetric keys in azure dps What is the process to create multiple symmetric keys in...
  Answer: The process to create multiple symmetric keys in group enrollments in Azure DPS involves signing in ...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: that demonstrates how to create and use enrollment groups with sym...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 1.0
‚úÖ Extracted answer_relevancy: 0.9712881735202764
‚úÖ Successfully extracted scores: {'faithfulness': 1.0, 'answer_relevancy': 0.9712881735202764}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating minilm: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [06:54<00:00, 13.83s/it]


‚úÖ minilm completed: 30 questions evaluated
ü§ñ RAG metrics: 30/30 successful
üìä Average Faithfulness: 0.392
üìä Average Relevance: 0.321

üéØ Evaluating model: ada
üîÑ Loading /content/drive/MyDrive/TesisMagister/acumulative/colab_data/docs_ada_with_embeddings_20250721_123712.parquet...
‚úÖ 187,031 docs, 1536 dims
üîÑ Using query model: text-embedding-ada-002
‚úÖ Dimension match: 1536 == 1536

üöÄ Starting evaluation for 30 questions...


Evaluating ada:   0%|          | 0/30 [00:00<?, ?it/s]

üîç LLM ranking response: '1, 4, 5, 2, 3, 6, 7, 8, 9, 10'
üîç Parsed numbers: [0, 3, 4, 1, 2, 5, 6, 7, 8, 9]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: Postgres flexible server RBAC I would like to setup RBAC to grant at subscription level can anyone p...
  Answer: Based on the context provided, it seems that RBAC (Role-Based Access Control) can be set up at the s...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: PostgreSQL flexible server resource. Access management The best wa...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.6666666666666666
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 0.6666666666666666, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating ada:   3%|‚ñé         | 1/30 [00:14<06:54, 14.30s/it]

üîç LLM ranking response: '1, 2, 8, 3, 4, 5, 6, 7, 9, 10'
üîç Parsed numbers: [0, 1, 7, 2, 3, 4, 5, 6, 8, 9]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: VM unable to connect to endpoint over IPsec tunnel Hello reader,
I have set up a S2S connection betw...
  Answer: Based on the provided context, the issue could be related to the configuration of the IPsec tunnel b...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: mode gateway (recommended), each gateway VM instance has a separat...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.5
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 0.5, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating ada:   7%|‚ñã         | 2/30 [00:32<07:42, 16.53s/it]

üîç LLM ranking response: '1, 4, 5, 7, 9, 10, 2, 3, 6, 8'
üîç Parsed numbers: [0, 3, 4, 6, 8, 9, 1, 2, 5, 7]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: Integration between Azure DevOps and Azure Cloud I'm working with Azure DevOps and Azure Cloud, both...
  Answer: To achieve the integration between Azure DevOps and Azure Cloud without having to make the Function ...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: autogenerated YAML file. If your code is in Azure Repos: You must ...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.3333333333333333
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 0.3333333333333333, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating ada:  10%|‚ñà         | 3/30 [00:49<07:28, 16.61s/it]

üîç LLM ranking response: '1, 6, 7, 8, 10, 2, 4, 5, 3, 9'
üîç Parsed numbers: [0, 5, 6, 7, 9, 1, 3, 4, 2, 8]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: Public IP Azure Hi All
I have a dns server witch need a public IP to operate. I have configured a Pu...
  Answer: Based on the provided context, it is not clear how to obtain the default gateway and netmask for the...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: . Select Internet Protocol Version 4 (TCP/IPv4) and select Propert...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 1.0
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 1.0, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating ada:  13%|‚ñà‚ñé        | 4/30 [01:04<07:03, 16.30s/it]

üîç LLM ranking response: '6, 10, 4, 1, 8, 9, 5, 2, 3'
üîç Parsed numbers: [5, 9, 3, 0, 7, 8, 4, 1, 2]
‚úÖ Reranked 9/10 documents
üîç RAGAS Input Validation:
  Question: Failed to upgrade node pool Failed to upgrade node pool 'devqanp02' in Kubernetes service 'DevQA-K8S...
  Answer: The subnet size of 192.168.3.0/24 is not large enough to accommodate the required number of IP addre...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: This article provides guidance on IP address planning for Azure Ku...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.0
‚úÖ Extracted answer_relevancy: 0.8967117802923048
‚úÖ Successfully extracted scores: {'faithfulness': 0.0, 'answer_relevancy': 0.8967117802923048}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating ada:  17%|‚ñà‚ñã        | 5/30 [01:20<06:36, 15.88s/it]

üîç LLM ranking response: '2, 9, 1, 5, 3, 4, 7, 6, 8, 10'
üîç Parsed numbers: [1, 8, 0, 4, 2, 3, 6, 5, 7, 9]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: Error generating map - Azure Migrate agentless dependencies visualization map Hi, In Azure Migrate, ...
  Answer: Based on the provided context, the error message "Error generating map. Please contact support." ind...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: This article helps you troubleshoot issues with agent-based and ag...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.0
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 0.0, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating ada:  20%|‚ñà‚ñà        | 6/30 [01:34<06:12, 15.51s/it]

üîç LLM ranking response: 'Based on the relevance of the content to the error message provided, the ranking of the documents would be as follows:

1. This article shows you how to troubleshoot Azure Bastion. Unable to create an NSG on AzureBastionSubnet'
üîç Parsed numbers: [0]
‚úÖ Reranked 1/10 documents
üîç RAGAS Input Validation:
  Question: how to fix below rror while creating firewall in azure portal Hi ,
Im getting below error even thoug...
  Answer: Based on the provided context, the error message indicates that the necessary rules for the Azure Ba...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: tab, select Next . For IPv4 Address space , delete the default add...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 1.0
‚úÖ Extracted answer_relevancy: 0.8459195243529875
‚úÖ Successfully extracted scores: {'faithfulness': 1.0, 'answer_relevancy': 0.8459195243529875}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating ada:  23%|‚ñà‚ñà‚ñé       | 7/30 [01:49<05:50, 15.24s/it]

üîç LLM ranking response: '1, 2, 3, 4, 5, 6, 7, 8, 9, 10'
üîç Parsed numbers: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: ARM templates para cria√ß√£o de m√°quinas virtuais no Azure Me d√™ exemplos de ARM Templaytes p/ criar m...
  Answer: The context provided does not contain information about specific ARM templates for creating virtual ...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: Machine Azure Game Developer Virtual Machine includes Licencsed En...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.0
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 0.0, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating ada:  27%|‚ñà‚ñà‚ñã       | 8/30 [01:59<05:00, 13.65s/it]

üîç LLM ranking response: '1. 5
2. 10
3. 6
4. 9
5. 1
6. 8
7. 7
8. 2
9. 3'
üîç Parsed numbers: [0, 4, 1, 9, 2, 5, 3, 8, 4, 0, 5, 7, 6, 6, 7, 1, 8, 2]
‚úÖ Reranked 18/10 documents
üîç RAGAS Input Validation:
  Question: Can a Group owner update Usage Location of sub group member ? Can a Group owner update Usage Locatio...
  Answer: No, the provided context does not contain enough information to determine if User 3, as the owner of...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: to the group. Copy (user.proxyAddresses -any (_ -startsWith "conto...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.3333333333333333
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 0.3333333333333333, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating ada:  30%|‚ñà‚ñà‚ñà       | 9/30 [02:11<04:36, 13.19s/it]

üîç LLM ranking response: '1, 2, 7, 8, 4, 5, 6, 3, 9, 10'
üîç Parsed numbers: [0, 1, 6, 7, 3, 4, 5, 2, 8, 9]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: Is Azure Private DNS a regional service ? Will Azure Private DNS service still work in an event of a...
  Answer: Azure Private DNS is not a regional service, as it is globally available. In the event of a region f...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: DNS private zones are resilient to regional outages because zone d...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.8
‚úÖ Extracted answer_relevancy: 0.9138221090900683
‚úÖ Successfully extracted scores: {'faithfulness': 0.8, 'answer_relevancy': 0.9138221090900683}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating ada:  33%|‚ñà‚ñà‚ñà‚ñé      | 10/30 [02:30<04:57, 14.89s/it]

üîç LLM ranking response: '1. 8. This article explains how you can use Azure Files and Azure File Sync to replace or extend your on-premises Windows file servers to reduce total cost of ownership (TCO), increase flexibility, and simp...
2. 1'
üîç Parsed numbers: [0, 7, 1, 0]
‚úÖ Reranked 4/10 documents
üîç RAGAS Input Validation:
  Question: Migrate File server to Azure Hi! There is a File server on premise (3.5TB). 80 users accessing it. W...
  Answer: Based on the provided context, the information is not sufficient to determine if the options and cos...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: network resources. Everything operates within the same private net...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 1.0
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 1.0, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating ada:  37%|‚ñà‚ñà‚ñà‚ñã      | 11/30 [02:45<04:44, 14.98s/it]

üîç LLM ranking response: '1. 7
2. 1
3. 5
4. 10
5. 6
6. 2
7. 3
8. 8
9. 9'
üîç Parsed numbers: [0, 6, 1, 0, 2, 4, 3, 9, 4, 5, 5, 1, 6, 2, 7, 7, 8, 8]
‚úÖ Reranked 18/10 documents
üîç RAGAS Input Validation:
  Question: What permissions do I need to only run pipelines in Synapse Workspace? Hi,
I've tried several combin...
  Answer: Based on the provided context, the user needs to have the "Synapse Compute Operator" role within the...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: workspace1_SynapseContributors, and workspace1_SynapseComputeOpera...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.0
‚úÖ Extracted answer_relevancy: 0.8751425761441016
‚úÖ Successfully extracted scores: {'faithfulness': 0.0, 'answer_relevancy': 0.8751425761441016}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating ada:  40%|‚ñà‚ñà‚ñà‚ñà      | 12/30 [02:57<04:10, 13.90s/it]

üîç LLM ranking response: '1. 5. Azure Databricks uses the DBFS root directory as a default location for some workspace actions. Databricks recommends against storing any production data or sensitive information in the DBFS root. Thi...
2. 9. This'
üîç Parsed numbers: [0, 4, 1, 8]
‚úÖ Reranked 4/10 documents
üîç RAGAS Input Validation:
  Question: Need to understand managed storage account config of Azure databricks I would like to know what is b...
  Answer: The provided context does not contain enough information to answer the question regarding the manage...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: Note This feature is available only in the Premium plan . For addi...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.4
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 0.4, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating ada:  43%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 13/30 [03:14<04:13, 14.92s/it]

üîç LLM ranking response: '2, 4, 6, 1, 5, 8, 3, 9, 10, 7'
üîç Parsed numbers: [1, 3, 5, 0, 4, 7, 2, 8, 9, 6]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: Synapse. How do you write a Stored Procedure to create External tables? Hi again,
Still trying to ge...
  Answer: In Synapse, you can create external tables using SQL commands like CREATE EXTERNAL TABLE. You do not...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: In this section, you'll learn how to create and use native externa...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.8333333333333334
‚úÖ Extracted answer_relevancy: 0.8887874988858785
‚úÖ Successfully extracted scores: {'faithfulness': 0.8333333333333334, 'answer_relevancy': 0.8887874988858785}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating ada:  47%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 14/30 [03:30<04:04, 15.28s/it]

üîç LLM ranking response: '3, 10, 1, 5, 6, 7, 4, 8, 9, 2'
üîç Parsed numbers: [2, 9, 0, 4, 5, 6, 3, 7, 8, 1]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: Recover an SQL server within a deleted resource group Hi,
I have accidently deleted a resource group...
  Answer: Based on the provided context, it is not possible to recover an SQL server within a deleted resource...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: Applies to: Azure SQL Database This article provides steps to rest...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.3333333333333333
‚úÖ Extracted answer_relevancy: 0.9078627613248768
‚úÖ Successfully extracted scores: {'faithfulness': 0.3333333333333333, 'answer_relevancy': 0.9078627613248768}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating ada:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 15/30 [03:41<03:31, 14.08s/it]

üîç LLM ranking response: '2, 1, 5, 3, 4, 10, 6, 7, 8, 9'
üîç Parsed numbers: [1, 0, 4, 2, 3, 9, 5, 6, 7, 8]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: datafactory broadcast error hello,
we have several environments. dev, test, ..., prod
In ADF, in env...
  Answer: The error message indicates that there is a Broadcast join timeout error in the pipeline. The error ...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: Use the following strategies to optimize performance of transforma...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.0
‚úÖ Extracted answer_relevancy: 0.8401674353178308
‚úÖ Successfully extracted scores: {'faithfulness': 0.0, 'answer_relevancy': 0.8401674353178308}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating ada:  53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 16/30 [03:56<03:20, 14.34s/it]

üîç LLM ranking response: '1. Yes Network In Billable (Deprecated) The number of billable bytes received on all network interfaces by the Virtual Machine(s) (Incoming Traffic) (Deprecated) Network In Bytes Total (Sum) <none> PT1M ...
2.'
üîç Parsed numbers: [0, 0, 1]
‚úÖ Reranked 3/10 documents
üîç RAGAS Input Validation:
  Question: VM Metrics - Network In/Out Billable(Deprecated) differences with Network In/Out Total? I was diggin...
  Answer: Based on the provided context, it seems that the Network In/Out Billable (Deprecated) metrics refer ...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: Yes Network In Billable (Deprecated) The number of billable bytes ...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.5
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 0.5, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating ada:  57%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 17/30 [04:16<03:27, 15.98s/it]

üîç LLM ranking response: '3, 4, 9, 5, 6, 7, 8, 1, 2, 10'
üîç Parsed numbers: [2, 3, 8, 4, 5, 6, 7, 0, 1, 9]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: send windows OS logs to Event Hub I would like to see more details than the information on this page...
  Answer: The provided context does not contain enough information to answer the question....
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: Azure diagnostics extension is an agent in Azure Monitor that coll...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.0
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 0.0, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating ada:  60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 18/30 [04:24<02:42, 13.51s/it]

üîç LLM ranking response: '2, 8, 4, 1, 3, 6, 5, 7, 9, 10'
üîç Parsed numbers: [1, 7, 3, 0, 2, 5, 4, 6, 8, 9]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: How to update a particular extension of an Azure Function? After updating the host.json to include t...
  Answer: To update a particular extension of an Azure Function from the Azure Portal, you can follow these st...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: the host.json , open the file in editor, and then replace the exis...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.0
‚úÖ Extracted answer_relevancy: 0.8940229763630504
‚úÖ Successfully extracted scores: {'faithfulness': 0.0, 'answer_relevancy': 0.8940229763630504}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating ada:  63%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 19/30 [04:43<02:48, 15.33s/it]

üîç LLM ranking response: '3, 4, 1, 2, 5, 6, 7, 8, 9, 10'
üîç Parsed numbers: [2, 3, 0, 1, 4, 5, 6, 7, 8, 9]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: Need simple way-How to migrate On-premise window server AD to Azure Window server AD? Need simple wa...
  Answer: The provided context does not contain enough information to answer the question....
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: This tutorial shows you how to migrate on-premises servers running...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.0
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 0.0, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating ada:  67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 20/30 [04:52<02:11, 13.20s/it]

üîç LLM ranking response: '1. 2
2. 9
3. 4
4. 7
5. 8
6. 10'
üîç Parsed numbers: [0, 1, 1, 8, 2, 3, 3, 6, 4, 7, 5, 9]
‚úÖ Reranked 12/10 documents
üîç RAGAS Input Validation:
  Question: problem with WaitUntil i keep getting this error when i call WaitUntil.OperationSucceeds...
  Answer: Not enough information is provided in the context to answer the question about the problem with Wait...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: Console.WriteLine(pipelineOperation.WaitForCompletionResponse().Co...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.5
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 0.5, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating ada:  70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 21/30 [05:03<01:54, 12.69s/it]

üîç LLM ranking response: '5, 1, 8, 4, 7'
üîç Parsed numbers: [4, 0, 7, 3, 6]
‚úÖ Reranked 5/10 documents
üîç RAGAS Input Validation:
  Question: Having problem with Azure sandbox storage account Hi,
I am having issue to access to storage account...
  Answer: Based on the provided context, the issue with accessing the Azure storage account in the sandbox env...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: In Synapse Studio, you can explore data resources located in your ...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 1.0
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 1.0, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating ada:  73%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé  | 22/30 [05:22<01:55, 14.47s/it]

üîç LLM ranking response: '2, 1, 4, 6, 8'
üîç Parsed numbers: [1, 0, 3, 5, 7]
‚úÖ Reranked 5/10 documents
üîç RAGAS Input Validation:
  Question: Custom Eventhub APIM Policy How to  log   through a custom event hub policy.
For reference :...
  Answer: Based on the provided context, it is possible to create a custom log-to-eventhub policy in Azure API...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: and Event Hubs integration How to log events to Azure Event Hubs i...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.6666666666666666
‚úÖ Extracted answer_relevancy: 0.8968786246746213
‚úÖ Successfully extracted scores: {'faithfulness': 0.6666666666666666, 'answer_relevancy': 0.8968786246746213}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating ada:  77%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 23/30 [05:40<01:48, 15.45s/it]

üîç LLM ranking response: '1, 3, 5, 10, 2, 6, 4, 7, 8, 9'
üîç Parsed numbers: [0, 2, 4, 9, 1, 5, 3, 6, 7, 8]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: Hybrid Runbook worker jobs fail when running three simultaneously When running a couple of Hybrid Ru...
  Answer: Based on the provided context, it is not possible to determine if the issue with the Hybrid Runbook ...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: membership and when user has to elevate permissions, the jobs begi...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 1.0
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 1.0, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating ada:  80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 24/30 [05:51<01:24, 14.16s/it]

üîç LLM ranking response: '2, 5, 6, 10, 1, 4, 8, 9, 3, 7'
üîç Parsed numbers: [1, 4, 5, 9, 0, 3, 7, 8, 2, 6]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: Not able to execute Get-AzStorageAccount, though module is installed. PowerShell
Copy
Code:
$saConte...
  Answer: The issue may be related to the specific cmdlet not being imported or loaded properly in the PowerSh...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: $resourceGroup )[ 0 ].Value # Get the resource group, in case we n...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.0
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 0.0, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating ada:  83%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 25/30 [06:02<01:06, 13.37s/it]

üîç LLM ranking response: '1. 6
2. 7
3. 3
4. 8
5. 9
6. 10'
üîç Parsed numbers: [0, 5, 1, 6, 2, 2, 3, 7, 4, 8, 5, 9]
‚úÖ Reranked 12/10 documents
üîç RAGAS Input Validation:
  Question: Witch Azure services i need to build up model connectible to Power BI (live connection) Hello.
I wou...
  Answer: Based on the provided context, you would need to use Azure Databricks as a data source to build a mo...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: Microsoft Power BI is a business analytics service that provides i...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.0
‚úÖ Extracted answer_relevancy: 0.8785433365761612
‚úÖ Successfully extracted scores: {'faithfulness': 0.0, 'answer_relevancy': 0.8785433365761612}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating ada:  87%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 26/30 [06:13<00:50, 12.64s/it]

üîç LLM ranking response: '1, 3, 6, 5, 2, 8, 4, 9, 10'
üîç Parsed numbers: [0, 2, 5, 4, 1, 7, 3, 8, 9]
‚úÖ Reranked 9/10 documents
üîç RAGAS Input Validation:
  Question: vHub SDWAn BGP (increase vHub Routing units) Hello All,
When we deploy SDWAN NVA in the vHub is buil...
  Answer: Based on the provided context, it is not explicitly mentioned whether increasing the routing units i...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: VPN and ER connected sites, branch to branch routing must be turne...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 1.0
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 1.0, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating ada:  90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 27/30 [06:27<00:39, 13.05s/it]

üîç LLM ranking response: '1. 5
2. 3
3. 7
4. 6
5. 9
6. 10
7. 8'
üîç Parsed numbers: [0, 4, 1, 2, 2, 6, 3, 5, 4, 8, 5, 9, 6, 7]
‚úÖ Reranked 14/10 documents
üîç RAGAS Input Validation:
  Question: Azure Openai Python Chatbot using excel data to identify classification search for sourcing manager ...
  Answer: Based on the provided context, it seems possible to develop an Azure OpenAI chatbot using Excel data...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: In this quickstart, you can use your own data with Azure OpenAI mo...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.8
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 0.8, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating ada:  93%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 28/30 [06:42<00:27, 13.70s/it]

üîç LLM ranking response: '2, 1, 4, 7, 10, 3, 6, 9, 5, 8'
üîç Parsed numbers: [1, 0, 3, 6, 9, 2, 5, 8, 4, 7]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: ADF Copy activity pagination how to pass variable value to Header of next pagination call I have a p...
  Answer: Based on the provided context, it seems that the user is facing issues with passing a dynamic value ...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: value of the specific node in response is a user-defined const val...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.16666666666666666
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 0.16666666666666666, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating ada:  97%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã| 29/30 [06:59<00:14, 14.62s/it]

üîç LLM ranking response: '2, 6, 10, 1, 3, 4, 5, 7, 8, 9'
üîç Parsed numbers: [1, 5, 9, 0, 2, 3, 4, 6, 7, 8]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: create multiple symmetric keys in azure dps What is the process to create multiple symmetric keys in...
  Answer: To create multiple symmetric keys in group enrollments in Azure DPS, you need to sign in to the Azur...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: that demonstrates how to create and use enrollment groups with sym...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 1.0
‚úÖ Extracted answer_relevancy: 0.9700718359942114
‚úÖ Successfully extracted scores: {'faithfulness': 1.0, 'answer_relevancy': 0.9700718359942114}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating ada: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [07:17<00:00, 14.60s/it]


‚úÖ ada completed: 30 questions evaluated
ü§ñ RAG metrics: 30/30 successful
üìä Average Faithfulness: 0.461
üìä Average Relevance: 0.327

üéØ Evaluating model: e5-large
üîÑ Loading /content/drive/MyDrive/TesisMagister/acumulative/colab_data/docs_e5large_with_embeddings_20250721_124918.parquet...
‚úÖ 187,031 docs, 1024 dims
üîÑ Using query model: intfloat/e5-large-v2
üîÑ Loading intfloat/e5-large-v2 on GPU...


modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/201 [00:00<?, ?B/s]

‚úÖ Dimension match: 1024 == 1024

üöÄ Starting evaluation for 30 questions...


Evaluating e5-large:   0%|          | 0/30 [00:00<?, ?it/s]

üîÑ Loading intfloat/e5-large-v2 on GPU...
üîç LLM ranking response: '1, 5, 7, 8, 9, 10, 2, 3, 4, 6'
üîç Parsed numbers: [0, 4, 6, 7, 8, 9, 1, 2, 3, 5]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: Postgres flexible server RBAC I would like to setup RBAC to grant at subscription level can anyone p...
  Answer: Document 2 provides information on the minimum set of permissions required for a user to create and ...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: ); } } } } This example demonstrates creating a PostgreSQL flexibl...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.3333333333333333
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 0.3333333333333333, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating e5-large:   3%|‚ñé         | 1/30 [00:14<06:53, 14.27s/it]

üîÑ Loading intfloat/e5-large-v2 on GPU...
üîç LLM ranking response: '1, 3, 6, 5, 2, 7, 8, 9, 4, 10'
üîç Parsed numbers: [0, 2, 5, 4, 1, 6, 7, 8, 3, 9]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: VM unable to connect to endpoint over IPsec tunnel Hello reader,
I have set up a S2S connection betw...
  Answer: Based on the provided context, it seems that the issue might be related to the configuration of the ...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: Go to the virtual network gateway you created and select Connectio...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.8333333333333334
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 0.8333333333333334, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating e5-large:   7%|‚ñã         | 2/30 [00:31<07:26, 15.93s/it]

üîÑ Loading intfloat/e5-large-v2 on GPU...
üîç LLM ranking response: '1, 4, 8, 10, 2, 3, 6, 7, 5, 9'
üîç Parsed numbers: [0, 3, 7, 9, 1, 2, 5, 6, 4, 8]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: Integration between Azure DevOps and Azure Cloud I'm working with Azure DevOps and Azure Cloud, both...
  Answer: The provided context does not contain enough information to answer the question....
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: The default is true for continuous integration deployments. When s...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.0
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 0.0, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating e5-large:  10%|‚ñà         | 3/30 [00:39<05:37, 12.49s/it]

üîÑ Loading intfloat/e5-large-v2 on GPU...
üîç LLM ranking response: '1, 2, 3, 4, 5, 6, 7, 8, 9, 10'
üîç Parsed numbers: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: Public IP Azure Hi All
I have a dns server witch need a public IP to operate. I have configured a Pu...
  Answer: Based on the context provided, it seems like the individual is having trouble configuring a public I...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: In this article, you create a virtual machine (VM) with a static p...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.0
‚úÖ Extracted answer_relevancy: 0.8850996978714534
‚úÖ Successfully extracted scores: {'faithfulness': 0.0, 'answer_relevancy': 0.8850996978714534}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating e5-large:  13%|‚ñà‚ñé        | 4/30 [00:53<05:34, 12.87s/it]

üîÑ Loading intfloat/e5-large-v2 on GPU...
üîç LLM ranking response: '2, 1, 6, 3, 4, 7, 10, 8, 9, 5'
üîç Parsed numbers: [1, 0, 5, 2, 3, 6, 9, 7, 8, 4]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: Failed to upgrade node pool Failed to upgrade node pool 'devqanp02' in Kubernetes service 'DevQA-K8S...
  Answer: The provided context does not contain enough information to answer the question....
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: This section of the Azure Kubernetes Service (AKS) day-2 operation...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.0
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 0.0, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating e5-large:  17%|‚ñà‚ñã        | 5/30 [01:05<05:14, 12.57s/it]

üîÑ Loading intfloat/e5-large-v2 on GPU...
üîç LLM ranking response: '3, 8, 1, 7, 10'
üîç Parsed numbers: [2, 7, 0, 6, 9]
‚úÖ Reranked 5/10 documents
üîç RAGAS Input Validation:
  Question: Error generating map - Azure Migrate agentless dependencies visualization map Hi, In Azure Migrate, ...
  Answer: Based on the provided context, the error generating map in Azure Migrate agentless dependencies visu...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: This error indicates that the browser was unable to call into a re...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.6666666666666666
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 0.6666666666666666, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating e5-large:  20%|‚ñà‚ñà        | 6/30 [01:21<05:33, 13.90s/it]

üîÑ Loading intfloat/e5-large-v2 on GPU...
üîç LLM ranking response: 'Based on the relevance of the content to fixing the error while creating a firewall in Azure Portal, the ranking of the documents would be as follows:

1. Document 1: It directly addresses the issue of creating a firewall subnet in Azure and the error'
üîç Parsed numbers: [0, 0]
‚úÖ Reranked 2/10 documents
üîç RAGAS Input Validation:
  Question: how to fix below rror while creating firewall in azure portal Hi ,
Im getting below error even thoug...
  Answer: The provided context does not contain enough information to answer the question....
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: ] }, "subnets" : [ { "name" : "[variables('azureFirewallSubnetName...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.0
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 0.0, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating e5-large:  23%|‚ñà‚ñà‚ñé       | 7/30 [01:33<05:04, 13.23s/it]

üîÑ Loading intfloat/e5-large-v2 on GPU...
üîç LLM ranking response: '2, 3, 6, 5, 1, 4, 7, 8, 9, 10'
üîç Parsed numbers: [1, 2, 5, 4, 0, 3, 6, 7, 8, 9]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: ARM templates para cria√ß√£o de m√°quinas virtuais no Azure Me d√™ exemplos de ARM Templaytes p/ criar m...
  Answer: Based on the provided context, it is not possible to give specific examples of ARM templates for cre...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: Manager templates has guidelines and suggestions for creating reli...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 1.0
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 1.0, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating e5-large:  27%|‚ñà‚ñà‚ñã       | 8/30 [01:44<04:35, 12.53s/it]

üîÑ Loading intfloat/e5-large-v2 on GPU...
üîç LLM ranking response: '1, 4, 5, 6, 7'
üîç Parsed numbers: [0, 3, 4, 5, 6]
‚úÖ Reranked 5/10 documents
üîç RAGAS Input Validation:
  Question: Can a Group owner update Usage Location of sub group member ? Can a Group owner update Usage Locatio...
  Answer: No, the provided context does not contain information relevant to the question about updating usage ...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: Expand table Name Description Value UserAssignedManagedIdentity Ex...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 1.0
‚úÖ Extracted answer_relevancy: 0.8587037741026874
‚úÖ Successfully extracted scores: {'faithfulness': 1.0, 'answer_relevancy': 0.8587037741026874}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating e5-large:  30%|‚ñà‚ñà‚ñà       | 9/30 [01:58<04:29, 12.81s/it]

üîÑ Loading intfloat/e5-large-v2 on GPU...
üîç LLM ranking response: '1, 2, 3, 7, 4, 5, 6, 8, 9, 10'
üîç Parsed numbers: [0, 1, 2, 6, 3, 4, 5, 7, 8, 9]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: Is Azure Private DNS a regional service ? Will Azure Private DNS service still work in an event of a...
  Answer: Azure Private DNS is not a regional service, as it is resilient to regional outages and the zone dat...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: DNS private zones are resilient to regional outages because zone d...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.8
‚úÖ Extracted answer_relevancy: 0.9139267439075455
‚úÖ Successfully extracted scores: {'faithfulness': 0.8, 'answer_relevancy': 0.9139267439075455}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating e5-large:  33%|‚ñà‚ñà‚ñà‚ñé      | 10/30 [02:11<04:23, 13.16s/it]

üîÑ Loading intfloat/e5-large-v2 on GPU...
‚ö†Ô∏è CUDA error for intfloat/e5-large-v2, falling back to CPU...
‚úÖ Generated CPU embedding: 1024 dims
üîç LLM ranking response: '1. 3. You can use a site-to-site (S2S) VPN connection to mount your Azure file shares from your on-premises network, without sending data over the open internet. You can set up a S2S VPN'
üîç Parsed numbers: [0, 2, 1, 1]
‚úÖ Reranked 4/10 documents
üîç RAGAS Input Validation:
  Question: Migrate File server to Azure Hi! There is a File server on premise (3.5TB). 80 users accessing it. W...
  Answer: Based on the provided context, the information is not sufficient to determine if the details provide...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: consider creating trusts at the domain level only. Domain level tr...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 1.0
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 1.0, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating e5-large:  37%|‚ñà‚ñà‚ñà‚ñã      | 11/30 [02:30<04:41, 14.81s/it]

üîÑ Loading intfloat/e5-large-v2 on GPU...
üîç LLM ranking response: '1. 2
2. 10
3. 4
4. 1
5. 6
6. 9
7. 8
8. 5
9. 3'
üîç Parsed numbers: [0, 1, 1, 9, 2, 3, 3, 0, 4, 5, 5, 8, 6, 7, 7, 4, 8, 2]
‚úÖ Reranked 18/10 documents
üîç RAGAS Input Validation:
  Question: What permissions do I need to only run pipelines in Synapse Workspace? Hi,
I've tried several combin...
  Answer: Based on the provided context, the user needs to have the "Synapse Compute Operator" permission in t...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: In this tutorial, you learn how to integrate pipelines and activit...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.0
‚úÖ Extracted answer_relevancy: 0.893582612486827
‚úÖ Successfully extracted scores: {'faithfulness': 0.0, 'answer_relevancy': 0.893582612486827}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating e5-large:  40%|‚ñà‚ñà‚ñà‚ñà      | 12/30 [02:43<04:14, 14.14s/it]

üîÑ Loading intfloat/e5-large-v2 on GPU...
üîç LLM ranking response: '1, 5, 6, 7'
üîç Parsed numbers: [0, 4, 5, 6]
‚úÖ Reranked 4/10 documents
üîç RAGAS Input Validation:
  Question: Need to understand managed storage account config of Azure databricks I would like to know what is b...
  Answer: The provided context does not contain enough information to answer the question about what is being ...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: Azure Databricks uses the DBFS root directory as a default locatio...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 1.0
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 1.0, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating e5-large:  43%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 13/30 [03:06<04:49, 17.04s/it]

üîÑ Loading intfloat/e5-large-v2 on GPU...
üîç LLM ranking response: '1, 3, 4, 8, 9, 10, 2, 5, 6, 7'
üîç Parsed numbers: [0, 2, 3, 7, 8, 9, 1, 4, 5, 6]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: Synapse. How do you write a Stored Procedure to create External tables? Hi again,
Still trying to ge...
  Answer: Based on the provided context, you can create an external table in Synapse using SQL commands such a...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: OPENROWSET T-SQL CREATE EXTERNAL TABLE CREATE EXTERNAL FILE FORMAT...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 1.0
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 1.0, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating e5-large:  47%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 14/30 [03:20<04:16, 16.01s/it]

üîÑ Loading intfloat/e5-large-v2 on GPU...
üîç LLM ranking response: '1, 2, 3, 6, 4, 8, 9, 5, 7, 10'
üîç Parsed numbers: [0, 1, 2, 5, 3, 7, 8, 4, 6, 9]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: Recover an SQL server within a deleted resource group Hi,
I have accidently deleted a resource group...
  Answer: Based on the provided context, it is possible to recover an SQL server within a deleted resource gro...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: "/subscriptions/00000000-0000-0000-0000-000000000000/resourceGroup...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.75
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 0.75, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating e5-large:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 15/30 [03:36<03:58, 15.89s/it]

üîÑ Loading intfloat/e5-large-v2 on GPU...
üîç LLM ranking response: '1, 6, 9, 4, 2, 8, 3, 5, 7, 10'
üîç Parsed numbers: [0, 5, 8, 3, 1, 7, 2, 4, 6, 9]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: datafactory broadcast error hello,
we have several environments. dev, test, ..., prod
In ADF, in env...
  Answer: The error message indicates that there is a Broadcast join timeout error in the data flow. The error...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: Use the following strategies to optimize performance of transforma...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.0
‚úÖ Extracted answer_relevancy: 0.8233582299672725
‚úÖ Successfully extracted scores: {'faithfulness': 0.0, 'answer_relevancy': 0.8233582299672725}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating e5-large:  53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 16/30 [03:51<03:41, 15.81s/it]

üîÑ Loading intfloat/e5-large-v2 on GPU...
üîç LLM ranking response: '1. Yes Network In Billable (Deprecated) The number of billable bytes received on all network interfaces by the Virtual Machine(s) (Incoming Traffic) (Deprecated) Network In Bytes Total (Sum) <none> PT1M ...
2.'
üîç Parsed numbers: [0, 0, 1]
‚úÖ Reranked 3/10 documents
üîç RAGAS Input Validation:
  Question: VM Metrics - Network In/Out Billable(Deprecated) differences with Network In/Out Total? I was diggin...
  Answer: Based on the provided context, the answer cannot be determined as the information provided is relate...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: Yes Network In Billable (Deprecated) The number of billable bytes ...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 1.0
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 1.0, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating e5-large:  57%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 17/30 [04:11<03:39, 16.91s/it]

üîÑ Loading intfloat/e5-large-v2 on GPU...
üîç LLM ranking response: '1, 2, 9, 10, 3, 4, 5, 6, 7, 8'
üîç Parsed numbers: [0, 1, 8, 9, 2, 3, 4, 5, 6, 7]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: send windows OS logs to Event Hub I would like to see more details than the information on this page...
  Answer: The context provided does not contain enough information to answer the question....
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: Collect data from virtual machine client with Azure Monitor descri...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.0
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 0.0, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating e5-large:  60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 18/30 [04:21<02:59, 14.93s/it]

üîÑ Loading intfloat/e5-large-v2 on GPU...
üîç LLM ranking response: '1, 4, 5, 3, 6, 7, 2, 8, 9, 10'
üîç Parsed numbers: [0, 3, 4, 2, 5, 6, 1, 7, 8, 9]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: How to update a particular extension of an Azure Function? After updating the host.json to include t...
  Answer: To update a particular extension of an Azure Function from the Azure Portal, you can follow these st...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: the host.json , open the file in editor, and then replace the exis...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.0
‚úÖ Extracted answer_relevancy: 0.8952080954266389
‚úÖ Successfully extracted scores: {'faithfulness': 0.0, 'answer_relevancy': 0.8952080954266389}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating e5-large:  63%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 19/30 [04:51<03:34, 19.51s/it]

üîÑ Loading intfloat/e5-large-v2 on GPU...
üîç LLM ranking response: '1, 2, 3, 5, 6, 7, 9, 10, 4, 8'
üîç Parsed numbers: [0, 1, 2, 4, 5, 6, 8, 9, 3, 7]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: Need simple way-How to migrate On-premise window server AD to Azure Window server AD? Need simple wa...
  Answer: The provided context does not contain enough information to answer the question....
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: by using Azure PowerShell . Connect an on-premises network to Azur...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.0
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 0.0, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating e5-large:  67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 20/30 [05:01<02:45, 16.60s/it]

üîÑ Loading intfloat/e5-large-v2 on GPU...
üîç LLM ranking response: '1. 4
2. 5
3. 6
4. 10
5. 9
6. 1
7. 7
8. 8'
üîç Parsed numbers: [0, 3, 1, 4, 2, 5, 3, 9, 4, 8, 5, 0, 6, 6, 7, 7]
‚úÖ Reranked 16/10 documents
üîç RAGAS Input Validation:
  Question: problem with WaitUntil i keep getting this error when i call WaitUntil.OperationSucceeds...
  Answer: The provided context does not contain information relevant to the question about a problem with Wait...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: of William Dough, who advises on and oversees all of the Hotel‚Äôs r...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.5
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 0.5, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating e5-large:  70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 21/30 [05:13<02:16, 15.12s/it]

üîÑ Loading intfloat/e5-large-v2 on GPU...
üîç LLM ranking response: '1, 4, 2, 6, 3, 8, 7, 5, 9, 10'
üîç Parsed numbers: [0, 3, 1, 5, 2, 7, 6, 4, 8, 9]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: Having problem with Azure sandbox storage account Hi,
I am having issue to access to storage account...
  Answer: Based on the provided context, it seems like the issue may be related to the browser not opening whe...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: Sign-in is the recommended way to access your Azure storage resour...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 1.0
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 1.0, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating e5-large:  73%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé  | 22/30 [05:28<02:02, 15.28s/it]

üîÑ Loading intfloat/e5-large-v2 on GPU...
‚ö†Ô∏è CUDA error for intfloat/e5-large-v2, falling back to CPU...
‚úÖ Generated CPU embedding: 1024 dims
üîç LLM ranking response: '1, 2, 4, 3, 5, 6, 7, 8, 9, 10'
üîç Parsed numbers: [0, 1, 3, 2, 4, 5, 6, 7, 8, 9]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: Custom Eventhub APIM Policy How to  log   through a custom event hub policy.
For reference :...
  Answer: Based on the provided context, it is possible to create a custom event hub policy in Azure API Manag...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: and Event Hubs integration How to log events to Azure Event Hubs i...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 1.0
‚úÖ Extracted answer_relevancy: 0.8886570774641914
‚úÖ Successfully extracted scores: {'faithfulness': 1.0, 'answer_relevancy': 0.8886570774641914}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating e5-large:  77%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 23/30 [05:47<01:53, 16.15s/it]

üîÑ Loading intfloat/e5-large-v2 on GPU...
üîç LLM ranking response: '1, 3, 7, 9, 2, 5, 6, 4, 8, 10'
üîç Parsed numbers: [0, 2, 6, 8, 1, 4, 5, 3, 7, 9]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: Hybrid Runbook worker jobs fail when running three simultaneously When running a couple of Hybrid Ru...
  Answer: Based on the provided context, it is not possible to determine if the issue with the Hybrid Runbook ...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: Important Starting 1st April 2025, all jobs running on agent-based...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 1.0
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 1.0, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating e5-large:  80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 24/30 [06:01<01:33, 15.61s/it]

üîÑ Loading intfloat/e5-large-v2 on GPU...
üîç LLM ranking response: '2, 4, 1, 7, 3, 8, 10, 5, 6, 9'
üîç Parsed numbers: [1, 3, 0, 6, 2, 7, 9, 4, 5, 8]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: Not able to execute Get-AzStorageAccount, though module is installed. PowerShell
Copy
Code:
$saConte...
  Answer: The issue may be related to the specific version of the Az.Storage module that is installed. It is p...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: for use in the rest of Select-AzSubscription -Subscription $subscr...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.0
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 0.0, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating e5-large:  83%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 25/30 [06:16<01:17, 15.41s/it]

üîÑ Loading intfloat/e5-large-v2 on GPU...
üîç LLM ranking response: '2, 1, 6, 9, 10, 8, 3, 4, 5, 7'
üîç Parsed numbers: [1, 0, 5, 8, 9, 7, 2, 3, 4, 6]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: Witch Azure services i need to build up model connectible to Power BI (live connection) Hello.
I wou...
  Answer: Based on the provided context, you would need an Analysis Services server in Azure to build and depl...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: Once you've created a server in your Azure subscription, you're re...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 1.0
‚úÖ Extracted answer_relevancy: 0.868275617415898
‚úÖ Successfully extracted scores: {'faithfulness': 1.0, 'answer_relevancy': 0.868275617415898}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating e5-large:  87%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 26/30 [06:28<00:57, 14.32s/it]

üîÑ Loading intfloat/e5-large-v2 on GPU...
üîç LLM ranking response: '5, 1, 10, 2, 3, 6, 7, 8, 9'
üîç Parsed numbers: [4, 0, 9, 1, 2, 5, 6, 7, 8]
‚úÖ Reranked 9/10 documents
üîç RAGAS Input Validation:
  Question: vHub SDWAn BGP (increase vHub Routing units) Hello All,
When we deploy SDWAN NVA in the vHub is buil...
  Answer: Based on the provided context, it seems that the individual is questioning why additional BGP peers ...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: This article helps you configure an Azure Virtual WAN hub router t...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.75
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 0.75, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating e5-large:  90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 27/30 [06:41<00:42, 14.13s/it]

üîÑ Loading intfloat/e5-large-v2 on GPU...
üîç LLM ranking response: '1, 2, 3, 4, 5, 6, 7, 8, 9, 10'
üîç Parsed numbers: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: Azure Openai Python Chatbot using excel data to identify classification search for sourcing manager ...
  Answer: The context provided does not contain information directly related to developing an Azure-OpenAI cha...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: the search index for documents that match a user's question. It th...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 1.0
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 1.0, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating e5-large:  93%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 28/30 [06:54<00:27, 13.59s/it]

üîÑ Loading intfloat/e5-large-v2 on GPU...
üîç LLM ranking response: '1. 2
2. 6
3. 7
4. 5
5. 1
6. 9
7. 10
8. 3
9. 4'
üîç Parsed numbers: [0, 1, 1, 5, 2, 6, 3, 4, 4, 0, 5, 8, 6, 9, 7, 2, 8, 3]
‚úÖ Reranked 18/10 documents
üîç RAGAS Input Validation:
  Question: ADF Copy activity pagination how to pass variable value to Header of next pagination call I have a p...
  Answer: The provided context does not contain information relevant to the question about passing a variable ...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: , specify URL for the logic apps workflow that sends the success e...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 1.0
‚úÖ Extracted answer_relevancy: 0.0
‚úÖ Successfully extracted scores: {'faithfulness': 1.0, 'answer_relevancy': 0.0}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating e5-large:  97%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã| 29/30 [07:04<00:12, 12.49s/it]

üîÑ Loading intfloat/e5-large-v2 on GPU...
üîç LLM ranking response: '1, 4, 2, 5, 7, 3, 8, 6, 9, 10'
üîç Parsed numbers: [0, 3, 1, 4, 6, 2, 7, 5, 8, 9]
‚úÖ Reranked 10/10 documents
üîç RAGAS Input Validation:
  Question: create multiple symmetric keys in azure dps What is the process to create multiple symmetric keys in...
  Answer: The process to create multiple symmetric keys in group enrollments in Azure DPS involves signing in ...
  Contexts: 3 documents
  Ground truth: Based on Microsoft documentation: that demonstrates how to create and use enrollment groups with sym...
üîÑ Evaluating with 2 core metrics...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ RAGAS evaluation completed
üìä DataFrame columns: ['user_input', 'retrieved_contexts', 'response', 'reference', 'faithfulness', 'answer_relevancy']
üî§ Skipping text column: user_input
üî§ Skipping text column: retrieved_contexts
üî§ Skipping text column: response
üî§ Skipping text column: reference
‚úÖ Extracted faithfulness: 0.8
‚úÖ Extracted answer_relevancy: 0.962149682528154
‚úÖ Successfully extracted scores: {'faithfulness': 0.8, 'answer_relevancy': 0.962149682528154}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating e5-large: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [07:19<00:00, 14.64s/it]


‚úÖ e5-large completed: 30 questions evaluated
ü§ñ RAG metrics: 30/30 successful
üìä Average Faithfulness: 0.581
üìä Average Relevance: 0.266

üéâ All evaluations completed!
üìä Models evaluated: ['mpnet', 'minilm', 'ada', 'e5-large']

‚ö†Ô∏è Models with errors:

üîç RAG METRICS DEBUG:
mpnet: 30 questions, avg P@5 = 0.053
  ü§ñ RAG: 30 successful
      avg_faithfulness: 0.4166666666666667
      avg_answer_relevance: 0.23713211815751442
minilm: 30 questions, avg P@5 = 0.033
  ü§ñ RAG: 30 successful
      avg_faithfulness: 0.39222222222222225
      avg_answer_relevance: 0.321413354675881
ada: 30 questions, avg P@5 = 0.067
  ü§ñ RAG: 30 successful
      avg_faithfulness: 0.46111111111111114
      avg_answer_relevance: 0.3269310153005364
e5-large: 30 questions, avg P@5 = 0.027
  ü§ñ RAG: 30 successful
      avg_faithfulness: 0.5811111111111111
      avg_answer_relevance: 0.2662987177056889

‚è±Ô∏è TIEMPO DE EVALUACI√ìN: 1834.66 segundos (30.58 minutos)


## Results Summary Table

In [37]:
# Check if evaluation has been completed
if 'all_model_results' not in globals() or not all_model_results:
    print("‚ö†Ô∏è No evaluation results found. Please run the evaluation cell first\!")
    raise ValueError("Run the evaluation cell (cell 14) before displaying results")

# Display results in table format before saving
print("üìä EVALUATION RESULTS SUMMARY")
print("=" * 80)

# Create summary table
summary_data = []
for model_name, results in all_model_results.items():
    # Skip models with errors in the summary table
    if 'error' in results:
        continue

    before_metrics = results['avg_before_metrics']
    after_metrics = results['avg_after_metrics']
    rag_metrics = results['rag_metrics']

    row = {
        'Model': model_name,
        'Questions': results['num_questions_evaluated'],
        'Dimensions': results['embedding_dimensions'],
        'Docs': f"{results['total_documents']:,}",
        # Before metrics (key ones)
        'P@5 (Before)': f"{before_metrics.get('precision@5', 0):.3f}",
        'R@5 (Before)': f"{before_metrics.get('recall@5', 0):.3f}",
        'F1@5 (Before)': f"{before_metrics.get('f1@5', 0):.3f}",
        'MRR (Before)': f"{before_metrics.get('mrr', 0):.3f}",
    }

    # After metrics if available
    if after_metrics:
        row.update({
            'P@5 (After)': f"{after_metrics.get('precision@5', 0):.3f}",
            'R@5 (After)': f"{after_metrics.get('recall@5', 0):.3f}",
            'F1@5 (After)': f"{after_metrics.get('f1@5', 0):.3f}",
            'MRR (After)': f"{after_metrics.get('mrr', 0):.3f}",
        })

        # Calculate improvements
        p5_improvement = after_metrics.get('precision@5', 0) - before_metrics.get('precision@5', 0)
        mrr_improvement = after_metrics.get('mrr', 0) - before_metrics.get('mrr', 0)
        row['P@5 Œî'] = f"{p5_improvement:+.3f}"
        row['MRR Œî'] = f"{mrr_improvement:+.3f}"

    # ‚úÖ FIXED: RAG metrics - Use avg_ prefix
    if rag_metrics.get('rag_available'):
        row['Faithfulness'] = f"{rag_metrics.get('avg_faithfulness', 0):.3f}"
        row['Relevance'] = f"{rag_metrics.get('avg_answer_relevance', 0):.3f}"
        row['Correctness'] = f"{rag_metrics.get('avg_answer_correctness', 0):.3f}"
        row['Similarity'] = f"{rag_metrics.get('avg_answer_similarity', 0):.3f}"

    summary_data.append(row)

# Display as DataFrame for better formatting
if summary_data:
    import pandas as pd
    df_summary = pd.DataFrame(summary_data)

    print("üéØ KEY METRICS COMPARISON:")
    print(df_summary.to_string(index=False))

    print(f"\nüìà PERFORMANCE INSIGHTS:")
    for model_name, results in all_model_results.items():
        if 'error' in results:
            continue

        before_metrics = results['avg_before_metrics']
        after_metrics = results['avg_after_metrics']

        print(f"\n{model_name.upper()}:")
        print(f"  üìä Best P@k: P@1={before_metrics.get('precision@1', 0):.3f}, P@5={before_metrics.get('precision@5', 0):.3f}, P@10={before_metrics.get('precision@10', 0):.3f}")
        print(f"  üéØ MRR: {before_metrics.get('mrr', 0):.3f}")
        print(f"  üìà NDCG@5: {before_metrics.get('ndcg@5', 0):.3f}, MAP@5: {before_metrics.get('map@5', 0):.3f}")

        if after_metrics:
            p5_before = before_metrics.get('precision@5', 0)
            p5_after = after_metrics.get('precision@5', 0)
            mrr_before = before_metrics.get('mrr', 0)
            mrr_after = after_metrics.get('mrr', 0)

            p5_improvement = ((p5_after - p5_before) / p5_before * 100) if p5_before > 0 else 0
            mrr_improvement = ((mrr_after - mrr_before) / mrr_before * 100) if mrr_before > 0 else 0

            print(f"  üîÑ LLM Reranking:")
            print(f"    P@5: {p5_before:.3f} ‚Üí {p5_after:.3f} ({p5_improvement:+.1f}%)")
            print(f"    MRR: {mrr_before:.3f} ‚Üí {mrr_after:.3f} ({mrr_improvement:+.1f}%)")

        # ‚úÖ FIXED: RAG metrics display - Use avg_ prefix
        rag_metrics = results['rag_metrics']
        if rag_metrics.get('rag_available'):
            print(f"  ü§ñ RAG Metrics:")
            print(f"    Faithfulness: {rag_metrics.get('avg_faithfulness', 0):.3f}")
            print(f"    Answer Relevance: {rag_metrics.get('avg_answer_relevance', 0):.3f}")
            print(f"    Answer Correctness: {rag_metrics.get('avg_answer_correctness', 0):.3f}")
            print(f"    Answer Similarity: {rag_metrics.get('avg_answer_similarity', 0):.3f}")
            print(f"    Successful evaluations: {rag_metrics.get('successful_evaluations', 0)}/{rag_metrics.get('total_evaluations', 0)}")
        else:
            print(f"  ‚ùå RAG: No metrics available - OpenAI API issue or disabled")

    # Find best model by P@5 before (excluding models with errors)
    valid_models = [(name, res) for name, res in all_model_results.items() if 'error' not in res and res['num_questions_evaluated'] > 0]
    if valid_models:
        print(f"\nüèÜ TOP PERFORMERS:")

        # Best by P@5
        best_p5_model = max(valid_models, key=lambda x: x[1]['avg_before_metrics'].get('precision@5', 0))
        print(f"   üéØ Best P@5: {best_p5_model[0]} ({best_p5_model[1]['avg_before_metrics'].get('precision@5', 0):.3f})")

        # Best by MRR
        best_mrr_model = max(valid_models, key=lambda x: x[1]['avg_before_metrics'].get('mrr', 0))
        print(f"   ‚ö° Best MRR: {best_mrr_model[0]} ({best_mrr_model[1]['avg_before_metrics'].get('mrr', 0):.3f})")

        # Best RAG metrics if available
        rag_models = [(name, res) for name, res in valid_models if res['rag_metrics'].get('rag_available', False)]
        if rag_models:
            best_faithful = max(rag_models, key=lambda x: x[1]['rag_metrics'].get('avg_faithfulness', 0))
            print(f"   ü§ñ Best Faithfulness: {best_faithful[0]} ({best_faithful[1]['rag_metrics'].get('avg_faithfulness', 0):.3f})")

    # Show query construction details
    print(f"\nüîç QUERY CONSTRUCTION VERIFICATION:")
    print("   ‚úÖ Using ONLY title + question_content for retrieval")
    print("   ‚ùå NOT using accepted_answer (corrected)")
    print("   üìù Format: 'title question_content' ‚Üí embedding ‚Üí retrieval ‚Üí ranking")
    print(f"   üîë Query Models Used:")
    for model_name, results in all_model_results.items():
        if 'error' not in results:
            print(f"     {model_name}: {results.get('query_model', 'N/A')}")
else:
    print("‚ùå No successful model evaluations to display")

# Show models with errors
error_models = [(name, res) for name, res in all_model_results.items() if 'error' in res]
if error_models:
    print(f"\n‚ö†Ô∏è MODELS WITH ERRORS ({len(error_models)}):")
    for model_name, results in error_models:
        print(f"   {model_name}: {results['error']}")
        print(f"      Documents: {results['total_documents']:,} ({results['embedding_dimensions']} dims)")
        print(f"      Query model tried: {results['query_model']}")

print("\n" + "=" * 80)
print("‚úÖ Ready to save results\!")

# Show sample of what's being evaluated for debugging
if summary_data:
    print(f"\nüîç SAMPLE EVALUATION DATA (First successful model):")
    first_model = next((name for name, res in all_model_results.items() if 'error' not in res), None)
    if first_model and 'individual_before_metrics' in all_model_results[first_model]:
        sample_metrics = all_model_results[first_model]['individual_before_metrics'][:3]
        for i, metric in enumerate(sample_metrics):
            if 'original_question' in metric:
                print(f"   Q{i+1}: '{metric['original_question'][:100]}...' ‚Üí P@5={metric.get('precision@5', 0):.3f}")
            else:
                print(f"   Q{i+1}: P@5={metric.get('precision@5', 0):.3f}")

# ‚úÖ FINAL DEBUG: Show complete RAG metrics structure
print(f"\nüîç RAG METRICS STRUCTURE VERIFICATION:")
for model_name, results in all_model_results.items():
    if 'error' not in results:
        rag_metrics = results['rag_metrics']
        print(f"\n{model_name.upper()} RAG Structure:")
        print(f"  rag_available: {rag_metrics.get('rag_available', False)}")
        print(f"  successful_evaluations: {rag_metrics.get('successful_evaluations', 0)}")
        print(f"  total_evaluations: {rag_metrics.get('total_evaluations', 0)}")

        if rag_metrics.get('rag_available', False):
            print(f"  ‚úÖ RAG Metrics Found:")
            for key in ['avg_faithfulness', 'avg_answer_relevance', 'avg_answer_correctness', 'avg_answer_similarity']:
                value = rag_metrics.get(key, 'MISSING')
                print(f"    {key}: {value}")
        else:
            print(f"  ‚ùå No RAG metrics available")
            print(f"    Reason: Check OpenAI API key and GENERATE_RAG_METRICS setting")

        # Show sample individual RAG metrics if available
        individual_rag = results.get('individual_rag_metrics', [])
        if individual_rag:
            print(f"  üìã Individual RAG metrics: {len(individual_rag)} entries")
            if len(individual_rag) > 0:
                sample = individual_rag[0]
                print(f"    Sample entry keys: {list(sample.keys())}")
        else:
            print(f"  üìã No individual RAG metrics found")
        break  # Show only first model for debugging

print(f"\nüéâ SUMMARY COMPLETE - RAG metrics should now be visible in Streamlit\!")

üìä EVALUATION RESULTS SUMMARY
üéØ KEY METRICS COMPARISON:
   Model  Questions  Dimensions    Docs P@5 (Before) R@5 (Before) F1@5 (Before) MRR (Before) P@5 (After) R@5 (After) F1@5 (After) MRR (After)  P@5 Œî  MRR Œî Faithfulness Relevance Correctness Similarity
   mpnet         30         768 187,031        0.053        0.181         0.076        0.171       0.060       0.214        0.087       0.183 +0.007 +0.012        0.417     0.237       0.000      0.000
  minilm         30         384 187,031        0.033        0.097         0.044        0.081       0.040       0.131        0.055       0.077 +0.007 -0.003        0.392     0.321       0.000      0.000
     ada         30        1536 187,031        0.067        0.231         0.096        0.192       0.060       0.206        0.090       0.164 -0.007 -0.028        0.461     0.327       0.000      0.000
e5-large         30        1024 187,031        0.027        0.078         0.038        0.044       0.027       0.083        0.040

## Save Results

In [38]:
# Check if we have results to save
if 'all_model_results' not in globals() or not all_model_results:
    print("‚ö†Ô∏è No evaluation results to save. Please run the evaluation first!")
    raise ValueError("Run the evaluation cell before saving results")

# Convert numpy types to Python types for JSON serialization
def convert_numpy_types(obj):
    import numpy as np
    if isinstance(obj, np.floating):
        return float(obj)
    elif isinstance(obj, np.integer):
        return int(obj)
    elif isinstance(obj, np.ndarray):
        return obj.tolist()
    elif isinstance(obj, dict):
        return {key: convert_numpy_types(value) for key, value in obj.items()}
    elif isinstance(obj, list):
        return [convert_numpy_types(item) for item in obj]
    else:
        return obj

# Prepare results for saving
chile_time = datetime.now(CHILE_TZ)
import time
unix_timestamp = int(time.time())

# Build results structure compatible with Streamlit
results = {
    'config': {
        'num_questions': NUM_QUESTIONS,
        'selected_models': list(all_model_results.keys()),
        'embedding_model_name': list(all_model_results.keys())[0] if len(all_model_results) == 1 else 'Multi-Model',
        'generative_model_name': evaluation_params.get('generative_model_name', 'gpt-4'),
        'top_k': TOP_K,
        'use_llm_reranker': USE_LLM_RERANKER,
        'generate_rag_metrics': GENERATE_RAG_METRICS,
        'batch_size': evaluation_params.get('batch_size', 50),
        'evaluate_all_models': len(all_model_results) > 1
    },
    'evaluation_info': {
        'timestamp': chile_time.strftime('%Y-%m-%d %H:%M:%S'),
        'timezone': 'America/Santiago',
        'evaluation_type': 'cumulative_metrics_colab_multi_model',
        'total_time_seconds': EVALUATION_DURATION if 'EVALUATION_DURATION' in globals() else 600,  # Use actual time
        'gpu_used': True,
        'enhanced_display_compatible': True,
        'metrics_version': '2.0',
        'llm_reranking_performed': USE_LLM_RERANKER,
        'models_evaluated': len(all_model_results),
        'data_verification': {
            'is_real_data': True,
            'no_simulation': True,
            'data_source': 'ChromaDB_export_parquet',
            'similarity_method': 'sklearn_cosine_similarity_exact',
            'reranking_method': 'openai_llm_reranking' if USE_LLM_RERANKER else 'none'
        }
    },
    'results': all_model_results
}

# Convert numpy types
results_converted = convert_numpy_types(results)

# Save to file
output_file = f"{RESULTS_OUTPUT_PATH}cumulative_results_{unix_timestamp}.json"

try:
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(results_converted, f, indent=2, ensure_ascii=False)

    print(f"üíæ Results saved successfully!")
    print(f"üìÇ File: cumulative_results_{unix_timestamp}.json")
    print(f"‚è∞ Time: {chile_time.strftime('%Y-%m-%d %H:%M:%S %Z')}")
    print(f"üìä Size: {len(json.dumps(results_converted)) / (1024*1024):.1f} MB")
    print(f"üéØ Models: {len(all_model_results)} evaluated")

    # Final verification
    print(f"\n‚úÖ VERIFICATION COMPLETE:")
    print(f"   üìã {results_converted['evaluation_info']['models_evaluated']} models evaluated")
    print(f"   ‚ùì {NUM_QUESTIONS} questions per model")
    print(f"   üîÑ LLM Reranking: {'‚úÖ' if USE_LLM_RERANKER else '‚ùå'}")
    print(f"   ü§ñ RAG Metrics: {'‚úÖ' if GENERATE_RAG_METRICS else '‚ùå'}")
    print(f"   üéØ Real ChromaDB embeddings: ‚úÖ")
    print(f"   üìä JSON serialization: ‚úÖ")

except Exception as e:
    print(f"‚ùå Error saving results: {e}")

print("\nüéâ EVALUATION COMPLETE!")

# Calculate total notebook execution time
if 'NOTEBOOK_START_TIME' in globals():
    NOTEBOOK_END_TIME = time.time()
    TOTAL_NOTEBOOK_TIME = NOTEBOOK_END_TIME - NOTEBOOK_START_TIME

    print(f"\n" + "="*60)
    print(f"‚è±Ô∏è TIEMPOS DE EJECUCI√ìN:")
    print(f"="*60)

    # Evaluation time
    if 'EVALUATION_DURATION' in globals():
        print(f"üìä Tiempo de evaluaci√≥n: {EVALUATION_DURATION:.2f} segundos ({EVALUATION_DURATION/60:.2f} minutos)")
        eval_percentage = (EVALUATION_DURATION / TOTAL_NOTEBOOK_TIME) * 100
        print(f"   - Porcentaje del tiempo total: {eval_percentage:.1f}%")

    # Total notebook time
    print(f"üìì Tiempo total del notebook: {TOTAL_NOTEBOOK_TIME:.2f} segundos ({TOTAL_NOTEBOOK_TIME/60:.2f} minutos)")

    # Breakdown
    if 'EVALUATION_DURATION' in globals():
        setup_time = TOTAL_NOTEBOOK_TIME - EVALUATION_DURATION
        print(f"\nüìà Desglose:")
        print(f"   - Setup e instalaci√≥n: {setup_time:.2f} segundos ({setup_time/60:.2f} minutos)")
        print(f"   - Evaluaci√≥n: {EVALUATION_DURATION:.2f} segundos ({EVALUATION_DURATION/60:.2f} minutos)")

    # Human-readable format
    hours = int(TOTAL_NOTEBOOK_TIME // 3600)
    minutes = int((TOTAL_NOTEBOOK_TIME % 3600) // 60)
    seconds = int(TOTAL_NOTEBOOK_TIME % 60)

    if hours > 0:
        print(f"\n‚è∞ Tiempo total en formato legible: {hours}h {minutes}m {seconds}s")
    else:
        print(f"\n‚è∞ Tiempo total en formato legible: {minutes}m {seconds}s")
else:
    print("\n‚ö†Ô∏è No se pudo calcular el tiempo total del notebook (NOTEBOOK_START_TIME no encontrado)")

üíæ Results saved successfully!
üìÇ File: cumulative_results_1753406301.json
‚è∞ Time: 2025-07-24 21:18:21 -04
üìä Size: 0.4 MB
üéØ Models: 4 evaluated

‚úÖ VERIFICATION COMPLETE:
   üìã 4 models evaluated
   ‚ùì 30 questions per model
   üîÑ LLM Reranking: ‚úÖ
   ü§ñ RAG Metrics: ‚úÖ
   üéØ Real ChromaDB embeddings: ‚úÖ
   üìä JSON serialization: ‚úÖ

üéâ EVALUATION COMPLETE!

‚è±Ô∏è TIEMPOS DE EJECUCI√ìN:
üìä Tiempo de evaluaci√≥n: 1834.66 segundos (30.58 minutos)
   - Porcentaje del tiempo total: 99.8%
üìì Tiempo total del notebook: 1839.04 segundos (30.65 minutos)

üìà Desglose:
   - Setup e instalaci√≥n: 4.38 segundos (0.07 minutos)
   - Evaluaci√≥n: 1834.66 segundos (30.58 minutos)

‚è∞ Tiempo total en formato legible: 30m 39s
