In [1]:
!pip3 install -r requirements.txt



In [2]:
import os
import sys
import time
import json
import threading
import statistics
import traceback
from collections import Counter, defaultdict
from pathlib import Path
from datetime import datetime
from urllib.parse import quote

# Data Processing and Analysis
import numpy as np
import pandas as pd
from scipy import stats

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Progress Bars and UI
from tqdm import tqdm

# Natural Language Processing
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import string

# Machine Learning Datasets
from datasets import load_dataset

# HTTP Requests and Elasticsearch
import requests
from elasticsearch import Elasticsearch, helpers

# System Monitoring and Performance
import psutil
import resource

# Concurrent Processing
import concurrent.futures
from multiprocessing import Pool, cpu_count

# Random and Math
import random
import math

# Dataset load 

In [3]:
# Dataset Configuration
DATASET_CONFIG = {
    'wikipedia': {
        'name': 'wikimedia/wikipedia',
        'version': '20231101.en',
        'text_field': 'text',
        'id_field': 'id',
        'split': 'train'
    },
    'news': {
        'name': 'custom_news',  # Placeholder for news dataset
        'version': None,
        'text_field': 'text',
        'id_field': 'id',
        'split': 'train'
    }
}

# User Configuration - MODIFY THESE VALUES
SELECTED_DATASET = 'wikipedia'  # Options: 'wikipedia', 'news'
MAX_DOCUMENTS = 50000           # Number of documents to process
INDEX_NAME = "esindex-v1.0"    # Elasticsearch index name
BATCH_SIZE = 100              # Batch size for ES indexing

# Display current configuration
print("üîß Current Configuration:")
print(f"   Dataset: {SELECTED_DATASET}")
print(f"   Max Documents: {MAX_DOCUMENTS}")
print(f"   Index Name: {INDEX_NAME}")
print(f"   Batch Size: {BATCH_SIZE}")
print("-" * 50)

üîß Current Configuration:
   Dataset: wikipedia
   Max Documents: 50000
   Index Name: esindex-v1.0
   Batch Size: 100
--------------------------------------------------


In [4]:
# Define the local directory where you want to save the data
local_path = os.path.join(os.getcwd(), "local_wikipedia_data")

# 1. Download and save the data to the specified local_path
ds = load_dataset(
    "wikimedia/wikipedia",
    "20231101.en",
    cache_dir=local_path  
)

print(f"Dataset successfully downloaded/loaded to: {local_path}")


Resolving data files:   0%|          | 0/41 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/41 [00:00<?, ?it/s]

Dataset successfully downloaded/loaded to: /home/san22chit/Documents/IIITH/Sem3/IRE/Assignments/IndexingAndRetrieval/local_wikipedia_data


# Part 1

## Preprocessing (Hugging Face Dataset)

In [5]:
def load_selected_dataset(dataset_key, max_docs=None):
    """Load the specified dataset with configuration"""
    
    if dataset_key not in DATASET_CONFIG:
        raise ValueError(f"Dataset '{dataset_key}' not found. Available: {list(DATASET_CONFIG.keys())}")
    
    config = DATASET_CONFIG[dataset_key]
    
    print(f"üìÅ Loading dataset: {dataset_key}")
    
    if dataset_key == 'wikipedia':
        # Use existing Wikipedia dataset
        dataset = ds  # Your existing loaded dataset
        split_data = dataset[config['split']]
        
    elif dataset_key == 'news':
        # Load news dataset (implement based on your news source)
        print("‚ö†Ô∏è  Loading sample news data (replace with actual news dataset)")
        # For demo, create sample news data
        sample_news = [
            {
                'id': f'news_{i}',
                'text': f"Sample news article {i}: This is a technology news article about artificial intelligence and machine learning developments in the industry. The latest research shows significant progress in natural language processing.",
                'title': f"News Article {i}"
            }
            for i in range(min(max_docs or 1000, 1000))
        ]
        
        # Convert to dataset-like structure
        class SimpleDataset:
            def __init__(self, data):
                self.data = data
            
            def __iter__(self):
                return iter(self.data)
            
            def __getitem__(self, idx):
                return self.data[idx]
            
            def __len__(self):
                return len(self.data)
        
        split_data = SimpleDataset(sample_news)
    
    print(f"‚úÖ Dataset loaded: {len(split_data) if hasattr(split_data, '__len__') else 'Unknown size'} total documents")
    
    return split_data, config

# Load the selected dataset
selected_data, dataset_config = load_selected_dataset(SELECTED_DATASET, MAX_DOCUMENTS)

üìÅ Loading dataset: wikipedia
‚úÖ Dataset loaded: 6407814 total documents


In [6]:
# Download NLTK resources if not already present
nltk.download('punkt_tab')
nltk.download('stopwords')


# Get English stopwords and initialize stemmer
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
punct_table = str.maketrans('', '', string.punctuation)

def preprocess(text):
    # Lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(punct_table)
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords and stem
    tokens = [stemmer.stem(word) for word in tokens if word not in stop_words and word.isalpha()]
    return tokens


[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/san22chit/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/san22chit/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
def get_word_counts(split, preprocess_func=None, max_docs=10000):
    word_count = Counter()
    for idx, item in enumerate(ds[split]):
        text = item['text']
        if preprocess_func:
            tokens = preprocess_func(text)
        else:
            tokens = word_tokenize(text)
        word_count.update(tokens)
        if idx+1 >= max_docs:
            break
    return word_count

def plot_word_freq(counter, title, filename, top_n=30):
    most_common = counter.most_common(top_n)
    words, counts = zip(*most_common)
    plt.figure(figsize=(14,7))
    plt.bar(words, counts)
    plt.title(title)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig(filename)
    plt.close()

In [8]:
def preprocess_selected_dataset(dataset, config, max_docs, preprocess_func=None):
    """Preprocess the selected dataset with limits"""
    
    print(f"üîÑ Preprocessing {max_docs} documents from {SELECTED_DATASET} dataset...")
    
    original_counter = Counter()
    processed_counter = Counter()
    processed_documents = []
    
    count = 0
    for item in tqdm(dataset, desc="Processing documents", total=max_docs):
        if count >= max_docs:
            break
            
        # Extract text based on config
        text = item[config['text_field']]
        doc_id = item[config['id_field']]
        
        # Original tokenization
        original_tokens = word_tokenize(text)
        original_counter.update(original_tokens)
        
        # Processed tokens
        if preprocess_func:
            processed_tokens = preprocess_func(text)
            processed_counter.update(processed_tokens)
        else:
            processed_tokens = original_tokens
        
        # Store processed document
        processed_documents.append({
            'id': doc_id,
            'original_text': text,
            'processed_tokens': processed_tokens,
            'title': item.get('title', '')
        })
        
        count += 1
    
    print(f"‚úÖ Processed {count} documents")
    print(f"   Original vocabulary: {len(original_counter)} unique tokens")
    print(f"   Processed vocabulary: {len(processed_counter)} unique tokens")
    
    return processed_documents, original_counter, processed_counter

# Run preprocessing on selected dataset
processed_docs, orig_counts, proc_counts = preprocess_selected_dataset(
    selected_data, 
    dataset_config, 
    MAX_DOCUMENTS, 
    preprocess
)



# Generate plots for selected dataset
plot_word_freq(
    orig_counts, 
    f"Word Frequency Before Preprocessing ({SELECTED_DATASET} - {MAX_DOCUMENTS} docs)",
    f"freq_before_{SELECTED_DATASET}_{MAX_DOCUMENTS}.png"
)

plot_word_freq(
    proc_counts, 
    f"Word Frequency After Preprocessing ({SELECTED_DATASET} - {MAX_DOCUMENTS} docs)",
    f"freq_after_{SELECTED_DATASET}_{MAX_DOCUMENTS}.png"
)

print(f"üìä Plots saved for {SELECTED_DATASET} dataset with {MAX_DOCUMENTS} documents")
# Save processed documents to CSV
out_dir = Path("dataset")
out_dir.mkdir(parents=True, exist_ok=True)
out_path = out_dir / "preprocessed_dataset.csv"

try:
    df_pre = pd.DataFrame(processed_docs)
    # Normalize processed_tokens to a space-joined string for CSV storage
    df_pre['processed_tokens'] = df_pre['processed_tokens'].apply(lambda t: " ".join(t) if isinstance(t, (list, tuple)) else str(t))
    # Add token count for convenience
    df_pre['token_count'] = df_pre['processed_tokens'].apply(lambda s: len(s.split()) if s else 0)
    df_pre.to_csv(out_path, index=False, encoding='utf-8')
    print(f"üíæ Saved {len(df_pre)} processed documents to: {out_path}")
except Exception as e:
    print(f"‚ùå Failed to save processed dataset: {e}")

üîÑ Preprocessing 50000 documents from wikipedia dataset...


Processing documents: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50000/50000 [06:43<00:00, 123.94it/s]


‚úÖ Processed 50000 documents
   Original vocabulary: 985682 unique tokens
   Processed vocabulary: 607460 unique tokens
üìä Plots saved for wikipedia dataset with 50000 documents
üíæ Saved 50000 processed documents to: dataset/preprocessed_dataset.csv


In [9]:
import requests
import json
from urllib.parse import quote

class WorkingElasticsearch:
    """Working Elasticsearch client using requests library"""
    
    def __init__(self, host="http://localhost:9200"):
        self.host = host.rstrip('/')
        
    def ping(self):
        try:
            response = requests.get(f"{self.host}/", timeout=10)
            return response.status_code == 200
        except Exception as e:
            print(f"Ping error: {e}")
            return False
    
    def info(self):
        try:
            response = requests.get(f"{self.host}/", timeout=10)
            return response.json() if response.status_code == 200 else None
        except:
            return None
    
    def delete_index(self, index_name):
        try:
            response = requests.delete(f"{self.host}/{quote(index_name)}")
            return response.status_code in [200, 404]
        except:
            return False
    
    def create_index(self, index_name, body):
        try:
            response = requests.put(f"{self.host}/{quote(index_name)}", json=body, timeout=30)
            return response.status_code in [200, 201]
        except Exception as e:
            print(f"Create index error: {e}")
            return False
    
    def index_exists(self, index_name):
        try:
            response = requests.head(f"{self.host}/{quote(index_name)}")
            return response.status_code == 200
        except:
            return False
    
    def bulk_index(self, docs):
        """Bulk index documents"""
        try:
            bulk_data = []
            for doc in docs:
                # Add index action
                bulk_data.append(json.dumps({
                    "index": {
                        "_index": doc["_index"],
                        "_id": doc["_id"]
                    }
                }))
                # Add document data
                doc_data = {k: v for k, v in doc.items() if not k.startswith('_')}
                bulk_data.append(json.dumps(doc_data))
            
            bulk_body = '\n'.join(bulk_data) + '\n'
            headers = {'Content-Type': 'application/x-ndjson'}
            
            response = requests.post(
                f"{self.host}/_bulk",
                data=bulk_body,
                headers=headers,
                timeout=60
            )
            
            if response.status_code == 200:
                result = response.json()
                errors = [item for item in result.get('items', []) if 'error' in item.get('index', {})]
                if errors:
                    print(f"‚ö†Ô∏è  {len(errors)} indexing errors occurred")
                return len(errors) == 0
            return False
            
        except Exception as e:
            print(f"Bulk index error: {e}")
            return False
    
    def refresh_index(self, index_name):
        try:
            response = requests.post(f"{self.host}/{quote(index_name)}/_refresh")
            return response.status_code == 200
        except:
            return False
    
    def search(self, index_name, body):
        try:
            response = requests.post(f"{self.host}/{quote(index_name)}/_search", json=body, timeout=30)
            return response.json() if response.status_code == 200 else None
        except Exception as e:
            print(f"Search error: {e}")
            return None
    
    def get_index_stats(self, index_name):
        try:
            response = requests.get(f"{self.host}/{quote(index_name)}/_stats")
            return response.json() if response.status_code == 200 else None
        except:
            return None

# Create Elasticsearch client
print("üîå Connecting to Elasticsearch...")
es = WorkingElasticsearch()

if es.ping():
    print("‚úÖ Elasticsearch connection successful!")
    info = es.info()
    if info:
        print(f"   Version: {info['version']['number']}")
        print(f"   Cluster: {info['cluster_name']}")
else:
    print("‚ùå Cannot connect to Elasticsearch")
    print("   Make sure Docker container is running: docker ps")

üîå Connecting to Elasticsearch...
‚úÖ Elasticsearch connection successful!
   Version: 8.11.0
   Cluster: docker-cluster


In [10]:
def optimized_bulk_indexing(processed_documents, index_name, es_client, batch_size=10):
    """
    Optimized bulk indexing that handles large documents properly
    """
    
    if not es_client.ping():
        print("‚ùå No Elasticsearch connection")
        return False
    
    print(f"üîß Optimized indexing for large documents...")
    print(f"   Batch size: {batch_size}")
    print(f"   Total documents: {len(processed_documents)}")
    
    # Delete and recreate index with optimized settings
    if es_client.index_exists(index_name):
        es_client.delete_index(index_name)
        print(f"üóëÔ∏è  Deleted existing index")
    
    # Optimized mapping for large documents
    mapping = {
        "mappings": {
            "properties": {
                "id": {"type": "keyword"},
                "text": {
                    "type": "text",
                    "analyzer": "standard",
                    "index_options": "docs",  # Don't store positions/frequencies
                    "norms": False  # Disable scoring norms to save space
                },
                "title": {"type": "text"},
                "token_count": {"type": "integer"}
            }
        },
        "settings": {
            "number_of_shards": 1,
            "number_of_replicas": 0,
            "refresh_interval": "30s",  # Reduce refresh frequency
            "index": {
                "max_result_window": 10000,
                "mapping": {
                    "total_fields": {"limit": 1000}
                },
                "blocks": {
                    "read_only_allow_delete": False
                }
            }
        }
    }
    
    if not es_client.create_index(index_name, mapping):
        print(f"‚ùå Failed to create optimized index")
        return False
    
    print(f"‚úÖ Created optimized index: {index_name}")
    
    # Prepare documents with size optimization
    optimized_docs = []
    skipped = 0
    
    for i, doc in enumerate(processed_documents):
        try:
            text = doc['original_text']
            
            # Aggressive text size limiting for bulk operations
            max_size = 10000  # 10KB per document for bulk operations
            if len(text) > max_size:
                text = text[:max_size] + "... [truncated]"
            
            # Clean document ID
            doc_id = str(doc['id']).replace('/', '_').replace(' ', '_')[:50]
            
            optimized_doc = {
                "_index": index_name,
                "_id": doc_id,
                "id": doc_id,
                "text": text,
                "title": str(doc.get('title', ''))[:200],
                "token_count": len(doc['processed_tokens'])
            }
            
            optimized_docs.append(optimized_doc)
            
        except Exception as e:
            print(f"‚ö†Ô∏è  Skipping doc {i}: {e}")
            skipped += 1
    
    print(f"üìã Prepared {len(optimized_docs)} documents (skipped {skipped})")
    
    # Bulk index with very small batches and error handling
    total_success = 0
    total_batches = (len(optimized_docs) + batch_size - 1) // batch_size
    
    print(f"üîÑ Indexing in {total_batches} batches of {batch_size}...")
    
    for i in tqdm(range(0, len(optimized_docs), batch_size), desc="Bulk indexing"):
        batch = optimized_docs[i:i + batch_size]
        batch_num = (i // batch_size) + 1
        
        try:
            # Create bulk request
            bulk_lines = []
            for doc in batch:
                # Index action
                bulk_lines.append(json.dumps({
                    "index": {
                        "_index": doc["_index"],
                        "_id": doc["_id"]
                    }
                }))
                # Document
                doc_data = {k: v for k, v in doc.items() if not k.startswith('_')}
                bulk_lines.append(json.dumps(doc_data))
            
            bulk_body = '\n'.join(bulk_lines) + '\n'
            
            # Send bulk request with retries
            max_retries = 3
            for retry in range(max_retries):
                try:
                    response = requests.post(
                        f"{es_client.host}/_bulk",
                        data=bulk_body,
                        headers={'Content-Type': 'application/x-ndjson'},
                        timeout=120  # Longer timeout
                    )
                    
                    if response.status_code == 200:
                        result = response.json()
                        
                        # Count successful indexings
                        batch_success = 0
                        batch_errors = 0
                        
                        for item in result.get('items', []):
                            if 'error' in item.get('index', {}):
                                batch_errors += 1
                                # Only show first error per batch
                                if batch_errors == 1:
                                    error = item['index']['error']
                                    print(f"‚ùå Batch {batch_num} error: {error.get('type', 'unknown')}")
                            else:
                                batch_success += 1
                        
                        total_success += batch_success
                        
                        if batch_errors == 0:
                            break  # Success, no need to retry
                        elif batch_success > 0:
                            print(f"‚ö†Ô∏è  Batch {batch_num}: {batch_success} success, {batch_errors} errors")
                            break  # Partial success, move on
                        else:
                            print(f"‚ùå Batch {batch_num}: All failed, retry {retry + 1}/{max_retries}")
                            if retry == max_retries - 1:
                                print(f"üíÄ Batch {batch_num}: Giving up after {max_retries} retries")
                    else:
                        print(f"‚ùå Batch {batch_num}: HTTP {response.status_code}")
                        if retry == max_retries - 1:
                            print(f"Response: {response.text[:200]}")
                    
                    if retry < max_retries - 1:
                        time.sleep(2)  # Wait before retry
                        
                except requests.exceptions.Timeout:
                    print(f"‚è∞ Batch {batch_num}: Timeout (retry {retry + 1}/{max_retries})")
                    if retry < max_retries - 1:
                        time.sleep(5)
                except Exception as e:
                    print(f"‚ùå Batch {batch_num}: Exception {e}")
                    break
                    
        except Exception as e:
            print(f"‚ùå Critical error in batch {batch_num}: {e}")
    
    print(f"‚úÖ Indexing complete: {total_success}/{len(optimized_docs)} documents")
    
    # Refresh and get stats
    es_client.refresh_index(index_name)
    
    stats = es_client.get_index_stats(index_name)
    if stats and 'indices' in stats:
        actual_count = stats['indices'][index_name]['total']['docs']['count']
        size_mb = stats['indices'][index_name]['total']['store']['size_in_bytes'] / (1024 * 1024)
        print(f"üìä Index stats: {actual_count} documents, {size_mb:.2f} MB")
        
        # Success if we have at least 70% of documents
        success_rate = actual_count / len(processed_docs) if processed_docs else 0
        return success_rate >= 0.7
    
    return total_success >= len(optimized_docs) * 0.7

# Run optimized indexing
print(f"\n{'='*60}")
print(f"üöÄ RUNNING OPTIMIZED BULK INDEXING")
print(f"{'='*60}")

import time  # Add this import

if es and es.ping():
    start_time = time.time()
    
    optimized_success = optimized_bulk_indexing(
        processed_docs, 
        INDEX_NAME, 
        es, 
        batch_size=5  # Very small batches for large documents
    )
    
    end_time = time.time()
    duration = end_time - start_time
    
    print(f"\nüìä Indexing Results:")
    print(f"   Success: {'‚úÖ' if optimized_success else '‚ùå'}")
    print(f"   Duration: {duration:.2f} seconds")
    print(f"   Index: {INDEX_NAME}")
    
    if optimized_success:
        print(f"\nüéâ ESIndex-v1.0 Successfully Created!")
        indexing_success = True
    else:
        print(f"\n‚ö†Ô∏è  Indexing completed with reduced success rate")
        indexing_success = optimized_success
        
else:
    print("‚ùå No Elasticsearch connection")
    indexing_success = False


üöÄ RUNNING OPTIMIZED BULK INDEXING
üîß Optimized indexing for large documents...
   Batch size: 5
   Total documents: 50000
üóëÔ∏è  Deleted existing index
‚úÖ Created optimized index: esindex-v1.0
üìã Prepared 50000 documents (skipped 0)
üîÑ Indexing in 10000 batches of 5...


Bulk indexing: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10000/10000 [01:39<00:00, 100.18it/s]


‚úÖ Indexing complete: 50000/50000 documents
üìä Index stats: 50000 documents, 124.48 MB

üìä Indexing Results:
   Success: ‚úÖ
   Duration: 101.31 seconds
   Index: esindex-v1.0

üéâ ESIndex-v1.0 Successfully Created!


In [11]:
def test_optimized_index(es_client, index_name):
    """Test the optimized index with various queries"""
    
    if not es_client.ping():
        print("‚ùå No ES connection for testing")
        return
    
    print(f"\nüß™ Testing optimized index: {index_name}")
    print("="*40)
    
    # Basic stats
    try:
        stats = es_client.get_index_stats(index_name)
        if stats and 'indices' in stats:
            doc_count = stats['indices'][index_name]['total']['docs']['count']
            print(f"üìä Total documents in index: {doc_count}")
        else:
            print("‚ùå Could not get index stats")
            return
    except:
        print("‚ùå Index may not exist")
        return
    
    # Test queries
    test_queries = [
        ("Simple match", {"query": {"match": {"text": "anarchism"}}, "size": 3}),
        ("Title search", {"query": {"match": {"title": "anarchism"}}, "size": 3}),
        ("Multi-word", {"query": {"match": {"text": "political philosophy"}}, "size": 3}),
        ("Range query", {"query": {"range": {"token_count": {"gte": 100}}}, "size": 5})
    ]
    
    for query_name, query in test_queries:
        try:
            print(f"\nüîç {query_name}:")
            results = es_client.search(index_name, query)
            
            if results and 'hits' in results:
                total = results['hits']['total']['value']
                hits = results['hits']['hits']
                
                print(f"   Results: {total} total")
                for i, hit in enumerate(hits[:2], 1):
                    score = hit.get('_score', 0)
                    title = hit['_source'].get('title', 'No title')
                    tokens = hit['_source'].get('token_count', 0)
                    text_preview = hit['_source']['text'][:80] + "..."
                    
                    print(f"   {i}. {title} (score: {score:.3f}, tokens: {tokens})")
                    print(f"      {text_preview}")
            else:
                print(f"   ‚ùå No results or error")
                
        except Exception as e:
            print(f"   ‚ùå Query error: {e}")

# Test the optimized index
if indexing_success:
    test_optimized_index(es, INDEX_NAME)


üß™ Testing optimized index: esindex-v1.0
üìä Total documents in index: 50000

üîç Simple match:
   Results: 25 total
   1. Anarchism (score: 12.796, tokens: 3970)
      Anarchism is a political philosophy and movement that is skeptical of all justif...
   2. Ayn Rand (score: 12.796, tokens: 3615)
      Alice O'Connor (born Alisa Zinovyevna Rosenbaum; , 1905¬†‚Äì March 6, 1982), better...

üîç Title search:
   Results: 2 total
   1. Anarchism (score: 13.429, tokens: 3970)
      Anarchism is a political philosophy and movement that is skeptical of all justif...
   2. Anarchism in Mexico (score: 9.610, tokens: 917)
      Anarchism in Mexico, the anarchist movement in Mexico, extends from Plotino Rhod...

üîç Multi-word:
   Results: 4801 total
   1. Anarchism (score: 10.843, tokens: 3970)
      Anarchism is a political philosophy and movement that is skeptical of all justif...
   2. Ayn Rand (score: 10.843, tokens: 3615)
      Alice O'Connor (born Alisa Zinovyevna Rosenbaum; , 1905¬

# Metrics Testing

In [12]:
import time
import psutil
import statistics
import threading
import concurrent.futures
from collections import defaultdict
import numpy as np
import matplotlib.pyplot as plt
import json
from datetime import datetime

class PerformanceMetrics:
    """Comprehensive performance metrics collector for search systems"""
    
    def __init__(self, system_name="ESIndex-v1.0"):
        self.system_name = system_name
        self.query_times = []
        self.memory_usage = []
        self.throughput_data = []
        self.functional_metrics = {}
        self.start_time = time.time()
        
    def reset_metrics(self):
        """Reset all collected metrics"""
        self.query_times = []
        self.memory_usage = []
        self.throughput_data = []
        self.functional_metrics = {}
        self.start_time = time.time()
        
    def record_query_time(self, query_time_ms):
        """Record individual query response time"""
        self.query_times.append(query_time_ms)
        
    def record_memory_usage(self):
        """Record current system memory usage"""
        process = psutil.Process()
        memory_mb = process.memory_info().rss / 1024 / 1024
        self.memory_usage.append({
            'timestamp': time.time() - self.start_time,
            'memory_mb': memory_mb,
            'memory_percent': process.memory_percent()
        })
        return memory_mb
        
    def calculate_latency_percentiles(self):
        """Calculate A: Latency percentiles (p95, p99)"""
        if not self.query_times:
            return None
            
        sorted_times = sorted(self.query_times)
        n = len(sorted_times)
        
        percentiles = {
            'p50': np.percentile(sorted_times, 50),
            'p90': np.percentile(sorted_times, 90), 
            'p95': np.percentile(sorted_times, 95),
            'p99': np.percentile(sorted_times, 99),
            'mean': statistics.mean(sorted_times),
            'min': min(sorted_times),
            'max': max(sorted_times),
            'total_queries': n
        }
        
        return percentiles
        
    def calculate_throughput(self, duration_seconds):
        """Calculate B: Throughput (queries/second)"""
        if not self.query_times or duration_seconds <= 0:
            return 0
            
        return len(self.query_times) / duration_seconds
        
    def get_memory_footprint(self):
        """Calculate C: Memory footprint statistics"""
        if not self.memory_usage:
            return None
            
        memory_values = [m['memory_mb'] for m in self.memory_usage]
        
        return {
            'peak_memory_mb': max(memory_values),
            'average_memory_mb': statistics.mean(memory_values),
            'min_memory_mb': min(memory_values),
            'memory_growth_mb': memory_values[-1] - memory_values[0] if len(memory_values) > 1 else 0
        }

# Initialize performance metrics
perf_metrics = PerformanceMetrics("ESIndex-v1.0")
print("üöÄ Performance Metrics System Initialized")
print(f"   System: {perf_metrics.system_name}")
print(f"   Start Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

üöÄ Performance Metrics System Initialized
   System: ESIndex-v1.0
   Start Time: 2025-10-28 17:44:09


In [13]:
def generate_diverse_query_set():
    """
    Generate a diverse query set that tests various system properties
    Based on information retrieval best practices and system stress testing
    """
    
    print("üß† Generating Diverse Query Set...")
    print("   Rationale: Testing different search patterns and system behaviors")
    
    # Query categories with justification
    query_categories = {
        'single_term': {
            'queries': ['anarchism', 'philosophy', 'politics', 'government', 'society'],
            'purpose': 'Test basic term matching and TF-IDF scoring',
            'system_property': 'Core search functionality, index lookup efficiency'
        },
        
        'multi_term': {
            'queries': [
                'political philosophy movement',
                'artificial intelligence research', 
                'computer science technology',
                'social economic theory',
                'historical cultural development'
            ],
            'purpose': 'Test multi-term coordination and ranking',
            'system_property': 'Query processing complexity, Boolean operations'
        },
        
        'phrase_queries': {
            'queries': [
                '"political philosophy"',
                '"artificial intelligence"', 
                '"social movement"',
                '"economic theory"',
                '"cultural development"'
            ],
            'purpose': 'Test exact phrase matching and position indexing',
            'system_property': 'Positional indexing, phrase query optimization'
        },
        
        'long_queries': {
            'queries': [
                'anarchism political philosophy movement skeptical authority hierarchical power structures',
                'artificial intelligence machine learning natural language processing computer science research',
                'social economic political cultural historical development theory practice implementation',
                'government authority power control society individual freedom liberty rights democracy'
            ],
            'purpose': 'Test system performance with complex queries',
            'system_property': 'Query parser efficiency, memory usage, scoring complexity'
        },
        
        'rare_terms': {
            'queries': ['epistemology', 'ontology', 'phenomenology', 'hermeneutics', 'dialectics'],
            'purpose': 'Test handling of low-frequency terms',
            'system_property': 'Index efficiency for rare terms, IDF calculation'
        },
        
        'common_terms': {
            'queries': ['the', 'and', 'of', 'to', 'in'],
            'purpose': 'Test system behavior with high-frequency terms',
            'system_property': 'Stopword handling, performance with common terms'
        },
        
        'boolean_queries': {
            'queries': [
                'anarchism AND philosophy',
                'politics OR government', 
                'society NOT authority',
                '(political AND philosophy) OR (social AND movement)'
            ],
            'purpose': 'Test Boolean query processing',
            'system_property': 'Boolean logic implementation, query optimization'
        },
        
        'range_queries': {
            'queries': [
                'token_count:[100 TO 500]',
                'token_count:[1000 TO *]',
                'token_count:[* TO 100]'
            ],
            'purpose': 'Test numeric range operations',
            'system_property': 'Numeric indexing, range query performance'
        },
        
        'wildcard_fuzzy': {
            'queries': [
                'politic*',
                'philosoph*', 
                'govern*',
                'anarch~2'  # Fuzzy search
            ],
            'purpose': 'Test pattern matching and fuzzy search',
            'system_property': 'Term expansion, fuzzy matching algorithms'
        },
        
        'empty_no_results': {
            'queries': [
                'xyzabc123nonexistent',
                'qqqqwwwweeeerrrr',
                'zzzzzaaaabbbbcccc'
            ],
            'purpose': 'Test system behavior with no matches',
            'system_property': 'Graceful handling of empty results, error conditions'
        }
    }
    
    # Flatten queries with metadata
    all_queries = []
    for category, info in query_categories.items():
        for query in info['queries']:
            all_queries.append({
                'query': query,
                'category': category,
                'purpose': info['purpose'],
                'system_property': info['system_property']
            })
    
    print(f"‚úÖ Generated {len(all_queries)} diverse queries across {len(query_categories)} categories")
    
    # Print justification
    print("\nüìã QUERY SET JUSTIFICATION:")
    for category, info in query_categories.items():
        print(f"   {category}: {info['purpose']}")
    
    return all_queries, query_categories

# Generate the diverse query set
diverse_queries, query_categories = generate_diverse_query_set()

print(f"\nüéØ Query Distribution:")
for category in query_categories:
    count = len([q for q in diverse_queries if q['category'] == category])
    print(f"   {category}: {count} queries")

üß† Generating Diverse Query Set...
   Rationale: Testing different search patterns and system behaviors
‚úÖ Generated 43 diverse queries across 10 categories

üìã QUERY SET JUSTIFICATION:
   single_term: Test basic term matching and TF-IDF scoring
   multi_term: Test multi-term coordination and ranking
   phrase_queries: Test exact phrase matching and position indexing
   long_queries: Test system performance with complex queries
   rare_terms: Test handling of low-frequency terms
   common_terms: Test system behavior with high-frequency terms
   boolean_queries: Test Boolean query processing
   range_queries: Test numeric range operations
   wildcard_fuzzy: Test pattern matching and fuzzy search
   empty_no_results: Test system behavior with no matches

üéØ Query Distribution:
   single_term: 5 queries
   multi_term: 5 queries
   phrase_queries: 5 queries
   long_queries: 4 queries
   rare_terms: 5 queries
   common_terms: 5 queries
   boolean_queries: 4 queries
   range_queries: 

In [14]:
def measure_system_throughput(es_client, index_name, query_set, duration_seconds=30):
    """
    Measure B: System throughput in queries/second
    Tests both read operations and mixed workloads
    """
    
    print("üöÄ MEASURING SYSTEM THROUGHPUT (Metric B)")
    print("="*50)
    
    # Prepare test queries (cycle through diverse set)
    test_queries = []
    for query_info in query_set[:20]:  # Use first 20 diverse queries
        query_text = query_info['query'].replace('"', '')  # Simplify for throughput test
        test_queries.append({
            "query": {"match": {"text": query_text}}, 
            "size": 5  # Smaller result set for faster processing
        })
    
    print(f"üìä Throughput test configuration:")
    print(f"   Duration: {duration_seconds} seconds")
    print(f"   Query pool size: {len(test_queries)} unique queries")
    print(f"   Target: Maximum queries/second")
    
    # Single-threaded throughput test
    print(f"\nüßµ Single-threaded throughput test...")
    single_thread_results = []
    start_time = time.time()
    query_count = 0
    
    perf_metrics.record_memory_usage()
    
    while (time.time() - start_time) < duration_seconds:
        # Cycle through queries
        query = test_queries[query_count % len(test_queries)]
        
        try:
            query_start = time.time()
            result = es_client.search(index_name, query)
            query_end = time.time()
            
            single_thread_results.append({
                'query_time': (query_end - query_start) * 1000,
                'result_count': result['hits']['total']['value'] if result else 0,
                'timestamp': query_end - start_time
            })
            
            query_count += 1
            
            # Record memory every 50 queries
            if query_count % 50 == 0:
                perf_metrics.record_memory_usage()
                
        except Exception as e:
            print(f"‚ùå Throughput query failed: {e}")
            
    single_thread_duration = time.time() - start_time
    single_thread_qps = len(single_thread_results) / single_thread_duration
    
    print(f"‚úÖ Single-threaded results:")
    print(f"   Queries executed: {len(single_thread_results)}")
    print(f"   Duration: {single_thread_duration:.2f} seconds")
    print(f"   üéØ Throughput: {single_thread_qps:.2f} queries/second")
    print(f"   Average query time: {statistics.mean([r['query_time'] for r in single_thread_results]):.2f} ms")
    
    # Multi-threaded throughput test
    print(f"\nüîÄ Multi-threaded throughput test (4 threads)...")
    
    def worker_thread(thread_id, duration, results_list):
        """Worker thread for concurrent throughput testing"""
        thread_start = time.time()
        thread_query_count = 0
        
        while (time.time() - thread_start) < duration:
            query = test_queries[thread_query_count % len(test_queries)]
            
            try:
                query_start = time.time()
                result = es_client.search(index_name, query)
                query_end = time.time()
                
                results_list.append({
                    'thread_id': thread_id,
                    'query_time': (query_end - query_start) * 1000,
                    'result_count': result['hits']['total']['value'] if result else 0,
                    'timestamp': query_end - thread_start
                })
                
                thread_query_count += 1
                
            except Exception as e:
                # Silently handle errors in stress test
                pass
    
    # Run concurrent threads
    multi_thread_results = []
    threads = []
    thread_count = 4
    
    start_time = time.time()
    
    for i in range(thread_count):
        thread = threading.Thread(
            target=worker_thread, 
            args=(i, duration_seconds, multi_thread_results)
        )
        thread.start()
        threads.append(thread)
    
    # Wait for all threads to complete
    for thread in threads:
        thread.join()
    
    multi_thread_duration = time.time() - start_time
    multi_thread_qps = len(multi_thread_results) / multi_thread_duration
    
    print(f"‚úÖ Multi-threaded results:")
    print(f"   Threads: {thread_count}")
    print(f"   Queries executed: {len(multi_thread_results)}")
    print(f"   Duration: {multi_thread_duration:.2f} seconds")
    print(f"   üéØ Throughput: {multi_thread_qps:.2f} queries/second")
    print(f"   Speedup: {multi_thread_qps/single_thread_qps:.2f}x")
    
    # Calculate final throughput metrics
    throughput_metrics = {
        'single_thread_qps': single_thread_qps,
        'multi_thread_qps': multi_thread_qps,
        'speedup_factor': multi_thread_qps/single_thread_qps if single_thread_qps > 0 else 0,
        'thread_count': thread_count,
        'test_duration': duration_seconds,
        'total_queries': len(single_thread_results) + len(multi_thread_results)
    }
    
    return throughput_metrics, single_thread_results, multi_thread_results

# Run throughput measurements
if es and es.ping() and indexing_success:
    throughput_metrics, single_results, multi_results = measure_system_throughput(
        es, INDEX_NAME, diverse_queries, duration_seconds=30
    )
else:
    print("‚ùå Cannot measure throughput - ES not available or indexing failed")
    throughput_metrics = None

üöÄ MEASURING SYSTEM THROUGHPUT (Metric B)
üìä Throughput test configuration:
   Duration: 30 seconds
   Query pool size: 20 unique queries
   Target: Maximum queries/second

üßµ Single-threaded throughput test...
‚úÖ Single-threaded results:
   Queries executed: 4298
   Duration: 30.02 seconds
   üéØ Throughput: 143.19 queries/second
   Average query time: 6.97 ms

üîÄ Multi-threaded throughput test (4 threads)...
‚úÖ Multi-threaded results:
   Threads: 4
   Queries executed: 9132
   Duration: 30.03 seconds
   üéØ Throughput: 304.13 queries/second
   Speedup: 2.12x


In [15]:
def measure_system_latency(es_client, index_name, query_set, warmup_runs=5):
    """
    Measure A: System response time with p95 and p99 percentiles
    """
    
    print("‚è±Ô∏è  MEASURING SYSTEM LATENCY (Metric A)")
    print("="*50)
    
    # Warmup phase to eliminate cold start effects
    print(f"üî• Warmup phase: {warmup_runs} runs...")
    warmup_query = {"query": {"match": {"text": "test"}}, "size": 1}
    
    for i in range(warmup_runs):
        try:
            es_client.search(index_name, warmup_query)
        except:
            pass
    
    print("üèÉ Starting latency measurements...")
    
    latency_results = []
    category_latencies = defaultdict(list)
    
    # Record memory before testing
    perf_metrics.record_memory_usage()
    
    for i, query_info in enumerate(tqdm(query_set, desc="Measuring latency")):
        query_text = query_info['query']
        category = query_info['category']
        
        try:
            # Construct appropriate ES query based on type
            if category == 'boolean_queries':
                # Skip complex boolean for now, use simple match
                es_query = {"query": {"match": {"text": query_text.replace(' AND ', ' ').replace(' OR ', ' ').replace(' NOT ', ' ')}}, "size": 10}
            elif category == 'range_queries':
                if 'token_count' in query_text:
                    es_query = {"query": {"range": {"token_count": {"gte": 100, "lte": 1000}}}, "size": 10}
                else:
                    es_query = {"query": {"match": {"text": query_text}}, "size": 10}
            elif category == 'wildcard_fuzzy':
                # Simplify wildcards for basic implementation
                clean_query = query_text.replace('*', '').replace('~2', '')
                es_query = {"query": {"match": {"text": clean_query}}, "size": 10}
            else:
                # Standard match query
                clean_query = query_text.replace('"', '')  # Remove quotes for basic match
                es_query = {"query": {"match": {"text": clean_query}}, "size": 10}
            
            # Measure query time
            start_time = time.time()
            results = es_client.search(index_name, es_query)
            end_time = time.time()
            
            # Calculate latency in milliseconds
            latency_ms = (end_time - start_time) * 1000
            
            # Record results
            latency_results.append({
                'query': query_text,
                'category': category,
                'latency_ms': latency_ms,
                'result_count': results['hits']['total']['value'] if results else 0,
                'has_results': results is not None and results['hits']['total']['value'] > 0
            })
            
            category_latencies[category].append(latency_ms)
            perf_metrics.record_query_time(latency_ms)
            
            # Record memory periodically
            if i % 10 == 0:
                perf_metrics.record_memory_usage()
                
        except Exception as e:
            print(f"‚ùå Query failed: {query_text[:30]}... Error: {e}")
            # Record failed query with high latency
            latency_results.append({
                'query': query_text,
                'category': category,
                'latency_ms': 5000,  # 5 second penalty for failed queries
                'result_count': 0,
                'has_results': False,
                'error': str(e)
            })
    
    # Calculate overall percentiles
    percentiles = perf_metrics.calculate_latency_percentiles()
    
    print(f"\nüìä LATENCY RESULTS:")
    print(f"   Total queries tested: {len(latency_results)}")
    print(f"   Mean latency: {percentiles['mean']:.2f} ms")
    print(f"   Median (p50): {percentiles['p50']:.2f} ms")
    print(f"   90th percentile (p90): {percentiles['p90']:.2f} ms")
    print(f"   üéØ 95th percentile (p95): {percentiles['p95']:.2f} ms")
    print(f"   üéØ 99th percentile (p99): {percentiles['p99']:.2f} ms")
    print(f"   Min latency: {percentiles['min']:.2f} ms")
    print(f"   Max latency: {percentiles['max']:.2f} ms")
    
    # Category-wise analysis
    print(f"\nüìã LATENCY BY QUERY CATEGORY:")
    for category, latencies in category_latencies.items():
        if latencies:
            avg_latency = statistics.mean(latencies)
            p95_latency = np.percentile(latencies, 95)
            print(f"   {category:20} | Avg: {avg_latency:6.2f} ms | p95: {p95_latency:6.2f} ms | Count: {len(latencies)}")
    
    return latency_results, percentiles, category_latencies

# Run latency measurements
if es and es.ping() and indexing_success:
    latency_results, percentiles, category_latencies = measure_system_latency(
        es, INDEX_NAME, diverse_queries, warmup_runs=5
    )
else:
    print("‚ùå Cannot measure latency - ES not available or indexing failed")
    latency_results = []
    percentiles = None

‚è±Ô∏è  MEASURING SYSTEM LATENCY (Metric A)
üî• Warmup phase: 5 runs...
üèÉ Starting latency measurements...


Measuring latency: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 43/43 [00:00<00:00, 66.43it/s]


üìä LATENCY RESULTS:
   Total queries tested: 43
   Mean latency: 14.90 ms
   Median (p50): 14.24 ms
   90th percentile (p90): 24.52 ms
   üéØ 95th percentile (p95): 26.57 ms
   üéØ 99th percentile (p99): 27.42 ms
   Min latency: 3.70 ms
   Max latency: 27.59 ms

üìã LATENCY BY QUERY CATEGORY:
   single_term          | Avg:  13.83 ms | p95:  14.47 ms | Count: 5
   multi_term           | Avg:  18.92 ms | p95:  20.78 ms | Count: 5
   phrase_queries       | Avg:  16.61 ms | p95:  19.50 ms | Count: 5
   long_queries         | Avg:  26.31 ms | p95:  27.10 ms | Count: 4
   rare_terms           | Avg:  12.65 ms | p95:  14.86 ms | Count: 5
   common_terms         | Avg:  12.76 ms | p95:  13.38 ms | Count: 5
   boolean_queries      | Avg:  19.50 ms | p95:  26.23 ms | Count: 4
   range_queries        | Avg:  11.74 ms | p95:  14.42 ms | Count: 3
   wildcard_fuzzy       | Avg:   9.03 ms | p95:  14.77 ms | Count: 4
   empty_no_results     | Avg:   4.06 ms | p95:   4.46 ms | Count: 3





In [16]:
def measure_memory_footprint(es_client, index_name):
    """
    Measure C: Memory footprint of the system
    Includes process memory, index size, and memory growth during operations
    """
    
    print("üíæ MEASURING MEMORY FOOTPRINT (Metric C)")
    print("="*50)
    
    # Get current process memory
    process = psutil.Process()
    process_memory = process.memory_info()
    
    # Get system memory
    system_memory = psutil.virtual_memory()
    
    # Get Elasticsearch index stats
    index_stats = es_client.get_index_stats(index_name)
    
    memory_metrics = {
        'process_memory_mb': process_memory.rss / 1024 / 1024,
        'process_memory_percent': process.memory_percent(),
        'system_total_gb': system_memory.total / 1024 / 1024 / 1024,
        'system_available_gb': system_memory.available / 1024 / 1024 / 1024,
        'system_usage_percent': system_memory.percent
    }
    
    if index_stats and 'indices' in index_stats:
        index_info = index_stats['indices'][index_name]['total']
        memory_metrics.update({
            'index_size_mb': index_info['store']['size_in_bytes'] / 1024 / 1024,
            'index_document_count': index_info['docs']['count'],
            'index_deleted_docs': index_info['docs']['deleted'],
            'avg_doc_size_kb': (index_info['store']['size_in_bytes'] / index_info['docs']['count']) / 1024 if index_info['docs']['count'] > 0 else 0
        })
    
    # Calculate memory efficiency
    if 'index_size_mb' in memory_metrics and 'index_document_count' in memory_metrics:
        docs_per_mb = memory_metrics['index_document_count'] / memory_metrics['index_size_mb']
        memory_metrics['documents_per_mb'] = docs_per_mb
    
    # Get memory usage history from performance metrics
    memory_history = perf_metrics.get_memory_footprint()
    if memory_history:
        memory_metrics.update(memory_history)
    
    print(f"üìä MEMORY FOOTPRINT RESULTS:")
    print(f"   üñ•Ô∏è  Process Memory:")
    print(f"       Current: {memory_metrics['process_memory_mb']:.2f} MB")
    print(f"       System %: {memory_metrics['process_memory_percent']:.2f}%")
    
    if memory_history:
        print(f"       Peak: {memory_history['peak_memory_mb']:.2f} MB")
        print(f"       Growth: {memory_history['memory_growth_mb']:.2f} MB")
    
    print(f"   üíø Index Storage:")
    if 'index_size_mb' in memory_metrics:
        print(f"       Index size: {memory_metrics['index_size_mb']:.2f} MB")
        print(f"       Documents: {memory_metrics['index_document_count']:,}")
        print(f"       Avg doc size: {memory_metrics['avg_doc_size_kb']:.2f} KB")
        print(f"       Efficiency: {memory_metrics['documents_per_mb']:.1f} docs/MB")
    
    print(f"   üåê System Memory:")
    print(f"       Total: {memory_metrics['system_total_gb']:.2f} GB")
    print(f"       Available: {memory_metrics['system_available_gb']:.2f} GB")
    print(f"       Usage: {memory_metrics['system_usage_percent']:.1f}%")
    
    # Memory efficiency assessment
    if 'index_size_mb' in memory_metrics:
        size_mb = memory_metrics['index_size_mb']
        doc_count = memory_metrics['index_document_count']
        
        print(f"\nüéØ MEMORY EFFICIENCY ASSESSMENT:")
        if size_mb < 100:
            print(f"   ‚úÖ Excellent: Small index size ({size_mb:.1f} MB)")
        elif size_mb < 500:
            print(f"   üëç Good: Moderate index size ({size_mb:.1f} MB)")
        else:
            print(f"   ‚ö†Ô∏è  Large: Significant index size ({size_mb:.1f} MB)")
        
        if memory_metrics['documents_per_mb'] > 50:
            print(f"   ‚úÖ Excellent: High storage efficiency ({memory_metrics['documents_per_mb']:.1f} docs/MB)")
        elif memory_metrics['documents_per_mb'] > 25:
            print(f"   üëç Good: Decent storage efficiency ({memory_metrics['documents_per_mb']:.1f} docs/MB)")
        else:
            print(f"   ‚ö†Ô∏è  Low: Storage efficiency could be improved ({memory_metrics['documents_per_mb']:.1f} docs/MB)")
    
    return memory_metrics

# Measure memory footprint
if es and es.ping() and indexing_success:
    memory_metrics = measure_memory_footprint(es, INDEX_NAME)
else:
    print("‚ùå Cannot measure memory - ES not available or indexing failed")
    memory_metrics = None

üíæ MEASURING MEMORY FOOTPRINT (Metric C)
üìä MEMORY FOOTPRINT RESULTS:
   üñ•Ô∏è  Process Memory:
       Current: 2522.67 MB
       System %: 16.09%
       Peak: 2591.62 MB
       Growth: -69.12 MB
   üíø Index Storage:
       Index size: 140.90 MB
       Documents: 50,000
       Avg doc size: 2.89 KB
       Efficiency: 354.9 docs/MB
   üåê System Memory:
       Total: 15.31 GB
       Available: 2.14 GB
       Usage: 86.0%

üéØ MEMORY EFFICIENCY ASSESSMENT:
   üëç Good: Moderate index size (140.9 MB)
   ‚úÖ Excellent: High storage efficiency (354.9 docs/MB)


In [17]:
def measure_functional_metrics(es_client, index_name, test_queries):
    """
    Measure D: Functional metrics like precision, recall, and ranking measures
    """
    
    print("üéØ MEASURING FUNCTIONAL METRICS (Metric D)")
    print("="*50)
    
    # Define ground truth for evaluation (simplified)
    # In a real system, you'd have human-annotated relevance judgments
    ground_truth = {
        'anarchism': {
            'relevant_docs': ['12'],  # We know doc 12 is about Anarchism
            'highly_relevant': ['12'],
            'expected_top_result': 'Anarchism'
        },
        'political philosophy': {
            'relevant_terms': ['anarchism', 'philosophy', 'political'],
            'min_expected_results': 10
        },
        'artificial intelligence': {
            'relevant_terms': ['artificial', 'intelligence', 'technology'],
            'min_expected_results': 5
        }
    }
    
    functional_results = {}
    
    print("üîç Testing search quality...")
    
    for query_text, truth in ground_truth.items():
        print(f"\nüìù Query: '{query_text}'")
        
        try:
            # Execute search
            es_query = {"query": {"match": {"text": query_text}}, "size": 20}
            results = es_client.search(index_name, es_query)
            
            if not results or 'hits' not in results:
                print(f"   ‚ùå No results returned")
                continue
                
            hits = results['hits']['hits']
            total_results = results['hits']['total']['value']
            
            print(f"   üìä Total results: {total_results}")
            
            # Calculate metrics based on available ground truth
            metrics = {}
            
            if 'relevant_docs' in truth:
                # Precision and Recall calculation
                retrieved_docs = [hit['_id'] for hit in hits]
                relevant_docs = truth['relevant_docs']
                
                true_positives = len(set(retrieved_docs) & set(relevant_docs))
                false_positives = len(set(retrieved_docs) - set(relevant_docs))
                false_negatives = len(set(relevant_docs) - set(retrieved_docs))
                
                precision = true_positives / len(retrieved_docs) if retrieved_docs else 0
                recall = true_positives / len(relevant_docs) if relevant_docs else 0
                f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
                
                metrics.update({
                    'precision': precision,
                    'recall': recall,
                    'f1_score': f1_score,
                    'true_positives': true_positives,
                    'false_positives': false_positives,
                    'false_negatives': false_negatives
                })
                
                print(f"   üéØ Precision: {precision:.3f}")
                print(f"   üìà Recall: {recall:.3f}")
                print(f"   üîó F1-Score: {f1_score:.3f}")
            
            if 'expected_top_result' in truth:
                # Ranking quality - check if expected result is in top positions
                top_titles = []
                for hit in hits[:5]:
                    title = hit['_source'].get('title', '')
                    top_titles.append(title)
                
                expected = truth['expected_top_result']
                if any(expected.lower() in title.lower() for title in top_titles):
                    rank_position = next(i for i, title in enumerate(top_titles) if expected.lower() in title.lower())
                    metrics['expected_result_rank'] = rank_position + 1
                    print(f"   üèÜ Expected result '{expected}' found at rank {rank_position + 1}")
                else:
                    metrics['expected_result_rank'] = None
                    print(f"   ‚ùå Expected result '{expected}' not in top 5")
            
            if 'min_expected_results' in truth:
                # Coverage - minimum expected results
                min_expected = truth['min_expected_results']
                metrics['coverage'] = total_results >= min_expected
                print(f"   üìä Coverage: {'‚úÖ' if metrics['coverage'] else '‚ùå'} ({total_results} >= {min_expected})")
            
            # Score distribution analysis
            if hits:
                scores = [hit['_score'] for hit in hits[:10]]
                metrics.update({
                    'top_score': max(scores),
                    'score_range': max(scores) - min(scores),
                    'score_std': statistics.stdev(scores) if len(scores) > 1 else 0
                })
                
                print(f"   üìà Score range: {metrics['score_range']:.3f}")
                print(f"   üìä Score std dev: {metrics['score_std']:.3f}")
            
            # Relevance at different cut-offs (simplified)
            cutoffs = [1, 3, 5, 10]
            for k in cutoffs:
                if len(hits) >= k:
                    # Simple relevance: results with score > threshold are considered relevant
                    score_threshold = 5.0  # Adjust based on your score ranges
                    relevant_at_k = sum(1 for hit in hits[:k] if hit['_score'] > score_threshold)
                    precision_at_k = relevant_at_k / k
                    metrics[f'precision_at_{k}'] = precision_at_k
                    print(f"   P@{k}: {precision_at_k:.3f}")
            
            functional_results[query_text] = metrics
            
        except Exception as e:
            print(f"   ‚ùå Error testing query '{query_text}': {e}")
            functional_results[query_text] = {'error': str(e)}
    
    # Calculate overall functional metrics
    overall_metrics = {}
    
    # Average precision across all queries
    precisions = [m.get('precision', 0) for m in functional_results.values() if 'precision' in m]
    if precisions:
        overall_metrics['mean_average_precision'] = statistics.mean(precisions)
    
    # Average F1 score
    f1_scores = [m.get('f1_score', 0) for m in functional_results.values() if 'f1_score' in m]
    if f1_scores:
        overall_metrics['mean_f1_score'] = statistics.mean(f1_scores)
    
    # Coverage rate
    coverage_results = [m.get('coverage', False) for m in functional_results.values() if 'coverage' in m]
    if coverage_results:
        overall_metrics['coverage_rate'] = sum(coverage_results) / len(coverage_results)
    
    print(f"\nüèÜ OVERALL FUNCTIONAL METRICS:")
    if 'mean_average_precision' in overall_metrics:
        print(f"   üìä Mean Average Precision: {overall_metrics['mean_average_precision']:.3f}")
    if 'mean_f1_score' in overall_metrics:
        print(f"   üéØ Mean F1-Score: {overall_metrics['mean_f1_score']:.3f}")
    if 'coverage_rate' in overall_metrics:
        print(f"   üìà Coverage Rate: {overall_metrics['coverage_rate']:.3f}")
    
    # Functional quality assessment
    print(f"\nüéñÔ∏è  FUNCTIONAL QUALITY ASSESSMENT:")
    if overall_metrics.get('mean_average_precision', 0) > 0.7:
        print(f"   ‚úÖ Excellent: High precision search results")
    elif overall_metrics.get('mean_average_precision', 0) > 0.5:
        print(f"   üëç Good: Decent precision search results")
    else:
        print(f"   ‚ö†Ô∏è  Needs improvement: Low precision search results")
    
    return functional_results, overall_metrics

# Measure functional metrics
if es and es.ping() and indexing_success:
    functional_results, overall_functional_metrics = measure_functional_metrics(es, INDEX_NAME, diverse_queries)
else:
    print("‚ùå Cannot measure functional metrics - ES not available or indexing failed")
    functional_results = None
    overall_functional_metrics = None

üéØ MEASURING FUNCTIONAL METRICS (Metric D)
üîç Testing search quality...

üìù Query: 'anarchism'
   üìä Total results: 25
   üéØ Precision: 0.050
   üìà Recall: 1.000
   üîó F1-Score: 0.095
   üèÜ Expected result 'Anarchism' found at rank 1
   üìà Score range: 0.000
   üìä Score std dev: 0.000
   P@1: 1.000
   P@3: 1.000
   P@5: 1.000
   P@10: 1.000

üìù Query: 'political philosophy'
   üìä Total results: 4801
   üìä Coverage: ‚úÖ (4801 >= 10)
   üìà Score range: 0.000
   üìä Score std dev: 0.000
   P@1: 1.000
   P@3: 1.000
   P@5: 1.000
   P@10: 1.000

üìù Query: 'artificial intelligence'
   üìä Total results: 1200
   üìä Coverage: ‚úÖ (1200 >= 5)
   üìà Score range: 0.000
   üìä Score std dev: 0.000
   P@1: 1.000
   P@3: 1.000
   P@5: 1.000
   P@10: 1.000

üèÜ OVERALL FUNCTIONAL METRICS:
   üìä Mean Average Precision: 0.050
   üéØ Mean F1-Score: 0.095
   üìà Coverage Rate: 1.000

üéñÔ∏è  FUNCTIONAL QUALITY ASSESSMENT:
   ‚ö†Ô∏è  Needs improvement: Low precis

In [18]:
def generate_comprehensive_report():
    """Generate comprehensive performance report with all metrics A, B, C, D"""
    
    print("\n" + "="*80)
    print("üìä COMPREHENSIVE PERFORMANCE REPORT - ESIndex-v1.0")
    print("="*80)
    
    report = {
        'system_name': 'ESIndex-v1.0',
        'dataset': SELECTED_DATASET,
        'document_count': MAX_DOCUMENTS,
        'index_name': INDEX_NAME,
        'timestamp': datetime.now().isoformat(),
        'metrics': {}
    }
    
    # Metric A: Latency
    if percentiles:
        print(f"\nüÖ∞Ô∏è  METRIC A: SYSTEM RESPONSE TIME (LATENCY)")
        print(f"   üìä Query Performance:")
        print(f"      ‚Ä¢ Mean Latency: {percentiles['mean']:.2f} ms")
        print(f"      ‚Ä¢ 95th Percentile (p95): {percentiles['p95']:.2f} ms ‚≠ê")
        print(f"      ‚Ä¢ 99th Percentile (p99): {percentiles['p99']:.2f} ms ‚≠ê")
        print(f"      ‚Ä¢ Total Queries: {percentiles['total_queries']}")
        report['metrics']['latency'] = percentiles
    
    # Metric B: Throughput
    if throughput_metrics:
        print(f"\nüÖ±Ô∏è  METRIC B: SYSTEM THROUGHPUT")
        print(f"   üöÄ Query Performance:")
        print(f"      ‚Ä¢ Single-threaded: {throughput_metrics['single_thread_qps']:.2f} queries/second ‚≠ê")
        print(f"      ‚Ä¢ Multi-threaded: {throughput_metrics['multi_thread_qps']:.2f} queries/second ‚≠ê")
        print(f"      ‚Ä¢ Speedup Factor: {throughput_metrics['speedup_factor']:.2f}x")
        print(f"      ‚Ä¢ Thread Count: {throughput_metrics['thread_count']}")
        report['metrics']['throughput'] = throughput_metrics
    
    # Metric C: Memory
    if memory_metrics:
        print(f"\nüÖ≤  METRIC C: MEMORY FOOTPRINT")
        print(f"   üíæ Memory Usage:")
        print(f"      ‚Ä¢ Process Memory: {memory_metrics['process_memory_mb']:.2f} MB ‚≠ê")
        print(f"      ‚Ä¢ Index Size: {memory_metrics.get('index_size_mb', 0):.2f} MB ‚≠ê")
        print(f"      ‚Ä¢ Storage Efficiency: {memory_metrics.get('documents_per_mb', 0):.1f} docs/MB")
        if 'peak_memory_mb' in memory_metrics:
            print(f"      ‚Ä¢ Peak Memory: {memory_metrics['peak_memory_mb']:.2f} MB")
        report['metrics']['memory'] = memory_metrics
    
    # Metric D: Functional
    if overall_functional_metrics:
        print(f"\nüÖ≥  METRIC D: FUNCTIONAL METRICS")
        print(f"   üéØ Search Quality:")
        if 'mean_average_precision' in overall_functional_metrics:
            print(f"      ‚Ä¢ Mean Average Precision: {overall_functional_metrics['mean_average_precision']:.3f} ‚≠ê")
        if 'mean_f1_score' in overall_functional_metrics:
            print(f"      ‚Ä¢ Mean F1-Score: {overall_functional_metrics['mean_f1_score']:.3f} ‚≠ê")
        if 'coverage_rate' in overall_functional_metrics:
            print(f"      ‚Ä¢ Coverage Rate: {overall_functional_metrics['coverage_rate']:.3f}")
        
        # Functional assessment
        precision = overall_functional_metrics.get('mean_average_precision', 0)
        
        report['metrics']['functional'] = overall_functional_metrics
    
    # Save comprehensive report
    report_filename = f"results/performance_report_{SELECTED_DATASET}_{MAX_DOCUMENTS}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
    with open(report_filename, 'w') as f:
        json.dump(report, f, indent=2, default=str)
    
    print(f"\nüíæ Report saved to: {report_filename}")
    
    return report

# Generate comprehensive report
final_report = generate_comprehensive_report()

print(f"\nüéØ ESIndex-v1.0 Performance Evaluation Complete!")
print(f"   Ready for comparison with SelfIndex implementation")
print(f"   All metrics (A,B,C,D) successfully measured")


üìä COMPREHENSIVE PERFORMANCE REPORT - ESIndex-v1.0

üÖ∞Ô∏è  METRIC A: SYSTEM RESPONSE TIME (LATENCY)
   üìä Query Performance:
      ‚Ä¢ Mean Latency: 14.90 ms
      ‚Ä¢ 95th Percentile (p95): 26.57 ms ‚≠ê
      ‚Ä¢ 99th Percentile (p99): 27.42 ms ‚≠ê
      ‚Ä¢ Total Queries: 43

üÖ±Ô∏è  METRIC B: SYSTEM THROUGHPUT
   üöÄ Query Performance:
      ‚Ä¢ Single-threaded: 143.19 queries/second ‚≠ê
      ‚Ä¢ Multi-threaded: 304.13 queries/second ‚≠ê
      ‚Ä¢ Speedup Factor: 2.12x
      ‚Ä¢ Thread Count: 4

üÖ≤  METRIC C: MEMORY FOOTPRINT
   üíæ Memory Usage:
      ‚Ä¢ Process Memory: 2522.67 MB ‚≠ê
      ‚Ä¢ Index Size: 140.90 MB ‚≠ê
      ‚Ä¢ Storage Efficiency: 354.9 docs/MB
      ‚Ä¢ Peak Memory: 2591.62 MB

üÖ≥  METRIC D: FUNCTIONAL METRICS
   üéØ Search Quality:
      ‚Ä¢ Mean Average Precision: 0.050 ‚≠ê
      ‚Ä¢ Mean F1-Score: 0.095 ‚≠ê
      ‚Ä¢ Coverage Rate: 1.000


FileNotFoundError: [Errno 2] No such file or directory: 'results/performance_report_wikipedia_50000_20251028_174510.json'

## Elastic Search Indexing

In [None]:
# !docker run -d --name elasticsearch -p 9200:9200 -e "discovery.type=single-node" -e "xpack.security.enabled=false" docker.elastic.co/elasticsearch/elasticsearch:8.11.0
# !docker run -d --name elasticsearch -p 9200:9200 -e "discovery.type=single-node" -e "xpack.security.enabled=false" docker.elastic.co/elasticsearch/elasticsearch:8.11.0



# ! docker run -d --name elasticsearch -p 9200:9200 -v esdata:/usr/share/elasticsearch/data -e "discovery.type=single-node" -e "xpack.security.enabled=false" docker.elastic.co/elasticsearch/elasticsearch:8.11.0