In [3]:
# Kaggle-optimized installation
!pip install pinecone sentence-transformers rank-bm25 langchain
# Most other packages are pre-installed on Kaggle

Collecting pinecone
  Downloading pinecone-7.3.0-py3-none-any.whl.metadata (9.5 kB)
Collecting rank-bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Collecting pinecone-plugin-assistant<2.0.0,>=1.6.0 (from pinecone)
  Downloading pinecone_plugin_assistant-1.7.0-py3-none-any.whl.metadata (28 kB)
Collecting pinecone-plugin-interface<0.0.8,>=0.0.7 (from pinecone)
  Downloading pinecone_plugin_interface-0.0.7-py3-none-any.whl.metadata (1.2 kB)
Collecting packaging>=20.9 (from huggingface-hub>=0.20.0->sentence-transformers)
  Downloading packaging-24.2-py3-none-any.whl.metadata (3.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvi

In [5]:
import os
import logging
import json
import time
from typing import List, Dict, Optional, Tuple, Union
from pathlib import Path
import numpy as np
import pandas as pd

# Core packages
from sentence_transformers import SentenceTransformer

# Hugging Face for text generation
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import torch

# NLTK for simple tokenization
import nltk
try:
    nltk.download('punkt', quiet=True)
except:
    pass

# LangChain for text splitting
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

# BM25 for keyword search
from rank_bm25 import BM25Okapi

# Pinecone (you can also replace this with FAISS for fully local)
from pinecone import Pinecone, ServerlessSpec

# Evaluation
from sklearn.metrics.pairwise import cosine_similarity

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

print("✅ All libraries imported (Hugging Face only)!")

✅ All libraries imported (Hugging Face only)!


In [7]:
# Configuration
CONFIG = {
    "embedding_model": "BAAI/bge-small-en",
    "generation_model": "Qwen/Qwen2.5-3B-Instruct",  # Your preference
    
    "index_name": "hybrid-rag-langchain",
    
    # LangChain RecursiveCharacterTextSplitter parameters
    "chunk_size": 300,       # characters per chunk
    "chunk_overlap": 50,     # overlapping characters (20% overlap)
    "separators": ["\n\n", "\n", " ", ""],  # Hierarchy of separators
    "retrieval_k": 10
}

# Set your Pinecone API key (only one API key needed now!)
PINECONE_API_KEY = "pcsk_6m2PRg_1qqfqLoS7ZEfXyacwJzrjwkKaQUA5aW3VQjV7wVoMfLH7S8MYZPG2sD5QaVeSE"

# Check if GPU is available
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"🔧 Using device: {device}")

# Verify API key
if not PINECONE_API_KEY or PINECONE_API_KEY == "your_pinecone_api_key_here":
    print("⚠️ Please set your PINECONE_API_KEY")
else:
    print("✅ Pinecone API key configured")

print(f"🎯 Models selected:")
print(f"  Embedding: {CONFIG['embedding_model']}")
print(f"  Generation: {CONFIG['generation_model']}")

🔧 Using device: cuda
✅ Pinecone API key configured
🎯 Models selected:
  Embedding: BAAI/bge-small-en
  Generation: Qwen/Qwen2.5-3B-Instruct


In [8]:
# Initialize embedding model
print(f"🔄 Loading embedding model: {CONFIG['embedding_model']}")
embedding_model = SentenceTransformer(CONFIG['embedding_model'])
embedding_dimension = embedding_model.get_sentence_embedding_dimension()

# Initialize LangChain text splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CONFIG['chunk_size'],
    chunk_overlap=CONFIG['chunk_overlap'],
    separators=CONFIG['separators'],
    length_function=len,  # Use character count
    is_separator_regex=False
)

print(f"✅ Embedding model loaded! Dimension: {embedding_dimension}")
print(f"✅ LangChain text splitter initialized:")
print(f"  - Chunk size: {CONFIG['chunk_size']} characters")
print(f"  - Overlap: {CONFIG['chunk_overlap']} characters")
print(f"  - Separators: {CONFIG['separators']}")

🔄 Loading embedding model: BAAI/bge-small-en


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✅ Embedding model loaded! Dimension: 384
✅ LangChain text splitter initialized:
  - Chunk size: 300 characters
  - Overlap: 50 characters
  - Separators: ['\n\n', '\n', ' ', '']


In [9]:
# Initialize Hugging Face text generation pipeline
print(f"🔄 Loading generation model: {CONFIG['generation_model']}")

try:
    # Try GPU first
    generator = pipeline(
        "text-generation",
        model=CONFIG['generation_model'],
        device=0 if device == "cuda" else -1,
        torch_dtype=torch.float16 if device == "cuda" else torch.float32,
        max_length=1024,
        do_sample=True,
        temperature=0.8,
        top_p=0.9,
        pad_token_id=50256,  # Common padding token
        trust_remote_code=True
    )
    print("✅ Text generation model loaded!")
    
except Exception as e:
    print(f"⚠️ Trying simpler configuration: {e}")
    try:
        generator = pipeline(
            "text-generation",
            model=CONFIG['generation_model'],
            device=-1,  # Force CPU
            max_length=512,
            trust_remote_code=True
        )
        print("✅ Text generation model loaded on CPU!")
    except Exception as e2:
        print(f"❌ Model loading failed: {e2}")
        print("💡 Try a smaller model like 'TinyLlama/TinyLlama-1.1B-Chat-v1.0'")
        generator = None

# Test the generator
if generator:
    test_prompt = "Question: What is AI? Answer:"
    try:
        test_output = generator(test_prompt, max_length=50, num_return_sequences=1)
        print(f"🧪 Generator test: {test_output[0]['generated_text'][len(test_prompt):].strip()[:50]}...")
    except:
        print("⚠️ Generator test failed, but model is loaded")

🔄 Loading generation model: Qwen/Qwen2.5-3B-Instruct


config.json:   0%|          | 0.00/661 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


✅ Text generation model loaded!
🧪 Generator test: Artificial intelligence (AI) is the simulation of ...


pinecone

In [10]:
# Initialize Pinecone
print("🔄 Initializing Pinecone...")
pc = Pinecone(api_key=PINECONE_API_KEY)

# Check if index exists
existing_indexes = [idx.name for idx in pc.list_indexes()]

if CONFIG['index_name'] not in existing_indexes:
    print(f"🔄 Creating Pinecone index: {CONFIG['index_name']}")
    pc.create_index(
        name=CONFIG['index_name'],
        dimension=embedding_dimension,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )
    print("⏳ Waiting for index to be ready...")
    time.sleep(10)

# Connect to index
index = pc.Index(CONFIG['index_name'])
print(f"✅ Connected to Pinecone index: {CONFIG['index_name']}")

# Initialize global variables for BM25
documents_corpus = []
document_metadata = []
bm25_index = None

print("✅ All components initialized!")

🔄 Initializing Pinecone...
✅ Connected to Pinecone index: hybrid-rag-langchain
✅ All components initialized!


# LangChain Document Chunking

In [12]:
def chunk_documents(documents: List[str], doc_ids: Optional[List[str]] = None) -> List[Dict]:
    """
    Chunk documents using LangChain RecursiveCharacterTextSplitter
    """
    if doc_ids and len(doc_ids) != len(documents):
        raise ValueError("doc_ids length must match documents length")
    
    all_chunks = []
    
    for i, doc_text in enumerate(documents):
        doc_id = doc_ids[i] if doc_ids else f"doc_{i}"
        
        # Create LangChain Document object
        doc = Document(
            page_content=doc_text,
            metadata={"doc_id": doc_id, "original_length": len(doc_text)}
        )
        
        # Split using LangChain text splitter
        split_docs = text_splitter.split_documents([doc])
        
        # Convert to our format
        for chunk_idx, split_doc in enumerate(split_docs):
            chunk_text = split_doc.page_content
            word_count = len(chunk_text.split())
            char_count = len(chunk_text)
            
            all_chunks.append({
                "id": f"{doc_id}_chunk_{chunk_idx}",
                "text": chunk_text,
                "metadata": {
                    "doc_id": doc_id,
                    "chunk_idx": chunk_idx,
                    "total_chunks": len(split_docs),
                    "word_count": word_count,
                    "char_count": char_count,
                    "original_length": len(doc_text)
                }
            })
    
    # Calculate statistics
    total_chunks = len(all_chunks)
    avg_words = sum(c['metadata']['word_count'] for c in all_chunks) / total_chunks if total_chunks else 0
    avg_chars = sum(c['metadata']['char_count'] for c in all_chunks) / total_chunks if total_chunks else 0
    
    print(f"✅ Created {total_chunks} LangChain chunks from {len(documents)} documents")
    print(f"📊 Average per chunk: {avg_words:.1f} words, {avg_chars:.0f} characters")
    
    return all_chunks



embedding generation

In [13]:
def vectorize_chunks(chunks: List[Dict]) -> List[Dict]:
    """Generate embeddings for chunks using BAAI/bge-small-en"""
    if not chunks:
        return []
    
    print(f"🔄 Generating embeddings for {len(chunks)} chunks...")
    
    # Extract texts
    texts = [chunk["text"] for chunk in chunks]
    
    # Generate embeddings in batch
    embeddings = embedding_model.encode(
        texts,
        show_progress_bar=True,
        convert_to_numpy=True,
        normalize_embeddings=True  # Important for cosine similarity
    )
    
    # Add embeddings to chunks
    for chunk, embedding in zip(chunks, embeddings):
        chunk["embedding"] = embedding.tolist()
    
    print("✅ Embeddings generated successfully")
    return chunks



storing in pinecone

In [14]:
def store_in_pinecone(chunks: List[Dict]) -> bool:
    """Store chunks in Pinecone vector database"""
    if not chunks:
        return False
    
    try:
        print(f"🔄 Storing {len(chunks)} chunks in Pinecone...")
        
        vectors = []
        for chunk in chunks:
            vector = {
                "id": chunk["id"],
                "values": chunk["embedding"],
                "metadata": {
                    "text": chunk["text"][:1000],  # Pinecone metadata limit
                    "doc_id": chunk["metadata"]["doc_id"],
                    "chunk_idx": chunk["metadata"]["chunk_idx"],
                    "word_count": chunk["metadata"]["word_count"],
                    "char_count": chunk["metadata"]["char_count"]
                }
            }
            vectors.append(vector)
        
        # Batch upsert
        batch_size = 100
        for i in range(0, len(vectors), batch_size):
            batch = vectors[i:i + batch_size]
            index.upsert(vectors=batch)
        
        print("✅ Successfully stored in Pinecone")
        return True
        
    except Exception as e:
        print(f"❌ Failed to store in Pinecone: {e}")
        return False


BM25 indexing


In [15]:
def build_bm25_index(chunks: List[Dict]) -> bool:
    """Build BM25 index for keyword-based retrieval"""
    global documents_corpus, document_metadata, bm25_index
    
    try:
        print("🔄 Building BM25 index...")
        
        # Store documents and metadata
        documents_corpus = [chunk["text"] for chunk in chunks]
        document_metadata = [
            {
                "id": chunk["id"],
                "doc_id": chunk["metadata"]["doc_id"],
                "chunk_idx": chunk["metadata"]["chunk_idx"]
            }
            for chunk in chunks
        ]
        
        # Tokenize documents for BM25
        tokenized_docs = [doc.lower().split() for doc in documents_corpus]
        
        # Build BM25 index
        bm25_index = BM25Okapi(tokenized_docs)
        
        print(f"✅ BM25 index built with {len(documents_corpus)} documents")
        return True
        
    except Exception as e:
        print(f"❌ Failed to build BM25 index: {e}")
        return False


Semantic search from the vectorDB

In [17]:
def semantic_search(query: str, top_k: int = 10) -> List[Dict]:
    """Semantic search using Pinecone"""
    try:
        # Generate query embedding
        query_embedding = embedding_model.encode([query], normalize_embeddings=True)[0]
        
        # Search Pinecone
        results = index.query(
            vector=query_embedding.tolist(),
            top_k=top_k,
            include_metadata=True
        )
        
        # Format results
        semantic_results = []
        for match in results.matches:
            semantic_results.append({
                "id": match.id,
                "score": match.score,
                "text": match.metadata.get("text", ""),
                "doc_id": match.metadata.get("doc_id", ""),
                "source": "semantic"
            })
        
        return semantic_results
        
    except Exception as e:
        print(f"❌ Semantic search failed: {e}")
        return []


keyword search via BM25 


In [18]:
def keyword_search(query: str, top_k: int = 10) -> List[Dict]:
    """Keyword-based search using BM25"""
    if bm25_index is None:
        print("⚠️ BM25 index not built")
        return []
    
    try:
        # Tokenize query
        query_tokens = query.lower().split()
        
        # Get BM25 scores
        scores = bm25_index.get_scores(query_tokens)
        
        # Get top-k results
        top_indices = np.argsort(scores)[::-1][:top_k]
        
        # Format results
        keyword_results = []
        for idx in top_indices:
            if scores[idx] > 0:  # Only include positive scores
                keyword_results.append({
                    "id": document_metadata[idx]["id"],
                    "score": float(scores[idx]),
                    "text": documents_corpus[idx],
                    "doc_id": document_metadata[idx]["doc_id"],
                    "source": "keyword"
                })
        
        return keyword_results
        
    except Exception as e:
        print(f"❌ Keyword search failed: {e}")
        return []


RRF to combine bm25 and semantic

In [19]:
def reciprocal_rank_fusion(semantic_results: List[Dict], keyword_results: List[Dict], k: int = 60) -> List[Dict]:
    """Combine semantic and keyword results using Reciprocal Rank Fusion"""
    
    # Create score dictionaries
    rrf_scores = {}
    
    # Add semantic search scores
    for rank, result in enumerate(semantic_results):
        doc_id = result["id"]
        rrf_scores[doc_id] = rrf_scores.get(doc_id, 0) + 1 / (k + rank + 1)
    
    # Add keyword search scores
    for rank, result in enumerate(keyword_results):
        doc_id = result["id"]
        rrf_scores[doc_id] = rrf_scores.get(doc_id, 0) + 1 / (k + rank + 1)
    
    # Create combined results
    all_results = {}
    
    # Add all results to dictionary
    for result in semantic_results + keyword_results:
        doc_id = result["id"]
        if doc_id not in all_results:
            all_results[doc_id] = result.copy()
            all_results[doc_id]["rrf_score"] = rrf_scores[doc_id]
            all_results[doc_id]["sources"] = [result["source"]]
        else:
            if result["source"] not in all_results[doc_id]["sources"]:
                all_results[doc_id]["sources"].append(result["source"])
    
    # Sort by RRF score
    fused_results = list(all_results.values())
    fused_results.sort(key=lambda x: x["rrf_score"], reverse=True)
    
    return fused_results



hugginface respone generation

In [20]:
def generate_response_hf(query: str, context_docs: List[Dict], max_new_tokens: int = 800) -> str:
    """Generate response using Hugging Face model with retrieved context"""
    if not context_docs:
        return "I couldn't find relevant information to answer your question."
    
    if generator is None:
        return "Text generation model not available. Please check model loading."
    
    # Prepare context (increased limits)
    context_parts = []
    for i, doc in enumerate(context_docs[:5]):  # Use top 5 docs
        context_parts.append(f"Context {i+1}: {doc['text'][:800]}")  # 800 chars per doc
    
    context = "\n".join(context_parts)
    
    # Clean, simple prompt that won't confuse the model
    prompt = f"""Context: {context}

Question: {query}

Answer:"""
    
    try:
        # Generate response
        response = generator(
            prompt,
            max_new_tokens=max_new_tokens,
            num_return_sequences=1,
            truncation=True,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=generator.tokenizer.eos_token_id if generator.tokenizer.eos_token_id else 50256
        )
        
        # Extract generated text
        generated_text = response[0]['generated_text']
        
        # Extract only the answer part (after "Answer:")
        if "Answer:" in generated_text:
            answer = generated_text.split("Answer:")[-1].strip()
        else:
            answer = generated_text[len(prompt):].strip()
        
        # Clean up the answer - remove any extra instructions
        if "You are an AI assistant" in answer:
            answer = answer.split("You are an AI assistant")[0].strip()
        if "Task:" in answer:
            answer = answer.split("Task:")[0].strip()
        
        # Allow longer responses (up to 10 lines)
        answer_lines = answer.split('\n')
        answer = '\n'.join(answer_lines[:10]) if len(answer_lines) > 1 else answer
        
        # Ensure minimum length
        if not answer or len(answer) < 20:
            return "I need more context to provide a comprehensive answer."
        
        return answer
        
    except Exception as e:
        print(f"❌ Generation failed: {e}")
        return "Sorry, I encountered an error while generating the response."

evaluation using cosine similarity

In [21]:
def evaluate_response_simple(query: str, response: str, context_docs: List[Dict]) -> Dict:
    """Simple evaluation without generating follow-up questions"""
    
    # Simple metrics
    response_length = len(response.split())
    context_used = len(context_docs)
    
    # Check if response contains key terms from query
    query_words = set(query.lower().split())
    response_words = set(response.lower().split())
    word_overlap = len(query_words.intersection(response_words)) / len(query_words) if query_words else 0
    
    # Semantic similarity between query and response
    try:
        query_embedding = embedding_model.encode([query])
        response_embedding = embedding_model.encode([response])
        semantic_similarity = cosine_similarity(query_embedding, response_embedding)[0][0]
    except:
        semantic_similarity = 0.0
    
    # Context relevance (average similarity between query and context docs)
    context_relevance = 0.0
    if context_docs:
        try:
            context_texts = [doc['text'] for doc in context_docs[:3]]
            context_embeddings = embedding_model.encode(context_texts)
            query_embedding = embedding_model.encode([query])
            relevance_scores = cosine_similarity(query_embedding, context_embeddings)[0]
            context_relevance = np.mean(relevance_scores)
        except:
            context_relevance = 0.0
    
    return {
        "query": query,
        "response": response,
        "response_length": response_length,
        "context_docs_used": context_used,
        "word_overlap_score": word_overlap,
        "semantic_similarity": float(semantic_similarity),
        "context_relevance": float(context_relevance),
        "overall_score": (word_overlap + semantic_similarity + context_relevance) / 3
    }



Pipeline


In [22]:
def rag_chat_flexible(query: str, method: str = "hybrid", top_k: int = 10) -> Dict:
    """
    Flexible RAG pipeline - supports any retrieval method
    
    Args:
        query: Question to ask
        method: "semantic", "keyword", "hybrid" (default: "hybrid")
        top_k: Number of chunks to retrieve
    """
    print(f"🔄 Processing query with {method.upper()} retrieval: {query}")
    
    try:
        # Route to appropriate retrieval method
        if method == "semantic":
            retrieved_docs = semantic_search(query, top_k)
        elif method == "keyword":
            retrieved_docs = keyword_search(query, top_k)
        elif method == "hybrid":
            sem_results = semantic_search(query, top_k)
            key_results = keyword_search(query, top_k)
            retrieved_docs = reciprocal_rank_fusion(sem_results, key_results)
        else:
            raise ValueError(f"Unknown method: {method}. Use 'semantic', 'keyword', or 'hybrid'")
        
        # Generate response
        response = generate_response_hf(query, retrieved_docs)
        evaluation = evaluate_response_simple(query, response, retrieved_docs)
        
        return {
            "query": query,
            "answer": response,
            "retrieval_method": method,
            "num_docs_retrieved": len(retrieved_docs),
            "retrieved_docs": retrieved_docs[:3],
            "doc_sources": list(set([doc["doc_id"] for doc in retrieved_docs])),
            "evaluation": evaluation
        }
        
    except Exception as e:
        print(f"❌ Chat pipeline failed: {e}")
        return {
            "query": query,
            "answer": f"Error: {e}",
            "retrieval_method": method,
            "num_docs_retrieved": 0,
            "retrieved_docs": [],
            "doc_sources": [],
            "evaluation": {}
        }



testing using docs

In [23]:
def add_documents_to_system(documents: List[str], doc_ids: Optional[List[str]] = None) -> bool:
    """
    Complete pipeline to add documents using LangChain text splitter
    """
    try:
        print(f"📄 Adding {len(documents)} documents with LangChain chunking...")
        
        # Step 1: LangChain chunking
        chunks = chunk_documents(documents, doc_ids)
        
        if not chunks:
            print("❌ No chunks created")
            return False
        
        # Step 2: Generate embeddings
        chunks_with_embeddings = vectorize_chunks(chunks)
        
        # Step 3: Store in Pinecone
        pinecone_success = store_in_pinecone(chunks_with_embeddings)
        
        # Step 4: Build BM25 index for keyword search
        bm25_success = build_bm25_index(chunks_with_embeddings)
        
        success = pinecone_success and bm25_success
        
        if success:
            print("✅ Successfully added all documents with LangChain chunking")
            print(f"📊 Total chunks in system: {len(chunks_with_embeddings)}")
            
            # Show chunking statistics
            total_words = sum(c['metadata']['word_count'] for c in chunks_with_embeddings)
            total_chars = sum(c['metadata']['char_count'] for c in chunks_with_embeddings)
            avg_words = total_words / len(chunks_with_embeddings)
            avg_chars = total_chars / len(chunks_with_embeddings)
            
            print(f"📊 Chunking stats: {avg_words:.1f} avg words, {avg_chars:.0f} avg chars per chunk")
        else:
            print("❌ Failed to add some documents")
        
        return success
        
    except Exception as e:
        print(f"❌ Error adding documents: {e}")
        return False

# Load your parsed text files
def load_extracted_text_files(directory_path: str = "extracted_texts") -> List[str]:
    """Load text files from your document processing pipeline"""
    from pathlib import Path
    
    texts_dir = Path(directory_path)
    if not texts_dir.exists():
        print(f"❌ Directory not found: {directory_path}")
        return []
    
    # Find all extracted text files
    text_files = list(texts_dir.glob("*_extracted_text.txt"))
    
    documents = []
    doc_ids = []
    
    for file_path in text_files:
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read().strip()
            
            if content:  # Only add non-empty files
                # Extract document name from filename
                doc_name = file_path.stem.replace("_extracted_text", "")
                
                documents.append(content)
                doc_ids.append(doc_name)
                
        except Exception as e:
            print(f"⚠️ Error reading {file_path}: {e}")
    
    print(f"📄 Loaded {len(documents)} text files from {directory_path}")
    if documents:
        total_words = sum(len(doc.split()) for doc in documents)
        avg_words = total_words / len(documents)
        print(f"📊 Average document length: {avg_words:.1f} words")
    
    return documents, doc_ids

# Example usage - load and add your documents
try:
    # Load your extracted text files
    my_documents, my_doc_ids = load_extracted_text_files("/kaggle/input/dataset1")
    
    if my_documents:
        # Add to RAG system with word-based chunking
        success = add_documents_to_system(my_documents, my_doc_ids)
        
        if success:
            print("🎉 Ready to chat with your documents!")
        else:
            print("❌ Setup failed")
    else:
        print("⚠️ No documents found. Please check the 'extracted_texts' directory.")
        
except Exception as e:
    print(f"❌ Error: {e}")

📄 Loaded 1 text files from /kaggle/input/dataset1
📊 Average document length: 1999.0 words
📄 Adding 1 documents with LangChain chunking...
✅ Created 61 LangChain chunks from 1 documents
📊 Average per chunk: 36.4 words, 227 characters
🔄 Generating embeddings for 61 chunks...


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Embeddings generated successfully
🔄 Storing 61 chunks in Pinecone...
✅ Successfully stored in Pinecone
🔄 Building BM25 index...
✅ BM25 index built with 61 documents
✅ Successfully added all documents with LangChain chunking
📊 Total chunks in system: 61
📊 Chunking stats: 36.4 avg words, 227 avg chars per chunk
🎉 Ready to chat with your documents!


In [29]:
def ask_question(question: str):
    """Simple interface to ask questions about your aviation documents"""
    print(f"\n🔍 Question: {question}")
    print("="*60)
    
    # Use hybrid retrieval (best performing method)
    result = rag_chat_flexible(question)
    
    # Display results
    print(f"📋 Answer:")
    print(f"   {result['answer']}")
    print(f"\n📄 Source Documents: {', '.join(result['doc_sources'])}")
    print(f"📊 Confidence Score: {result['evaluation'].get('overall_score', 0):.3f}")
    print(f"📈 Retrieved {result['num_docs_retrieved']} relevant chunks")
    
    return None

# Easy testing:
ask_question("Why was flight AC869 canceled?")
ask_question("What GPS issues occurred?")
ask_question("What maintenance was required?")


🔍 Question: Why was flight AC869 canceled?
🔄 Processing query with HYBRID retrieval: Why was flight AC869 canceled?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

📋 Answer:
   Flight AC869 was canceled due to extraordinary circumstances that prevented it from operating as planned. Specifically, the aircraft arrived at YHZ (likely an airport code for Toronto Pearson International Airport) with a Maintenance Engineering Limitation (MEL), which means there were specific issues or limitations with the aircraft's systems that required maintenance. This MEL prevented the aircraft from being made available for operation in London Heathrow as scheduled. Despite these challenges, efforts were made to protect passengers by rerouting them to other flights within existing capacity and providing an upgrade to ensure they were not stranded for more than eight hours. To summarize, the cancellation of flight AC869 was primarily due to unresolved maintenance issues affecting the aircraft's operational readiness.

📄 Source Documents: claims_686be326f6267c89ac0cad27_686be326f6267c89ac0cad27_evidence
📊 Confidence Score: 0.792
📈 Retrieved 16 relevant chunks

🔍 Quest

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

📋 Answer:
   Based on the context provided, there were several GPS-related issues that occurred:

1. **Industry-wide GPS Outage**: The text in both Context 2 and Context 3 mentions "GPS OUTAGE" at 58.7 PMMIMA (presumably May 22, 2025). This indicates a widespread GPS failure affecting air Canada and possibly other airlines as it is described as an "industry-wide" issue.

📄 Source Documents: claims_686be326f6267c89ac0cad27_686be326f6267c89ac0cad27_evidence
📊 Confidence Score: 0.759
📈 Retrieved 16 relevant chunks

🔍 Question: What maintenance was required?
🔄 Processing query with HYBRID retrieval: What maintenance was required?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

📋 Answer:
   According to the context provided, when the pilots started up aircraft systems, a GPS 1 and 2 Fail would pop up, requiring Maintenance intervention to put on the MEL (Minimum Equipment List). This indicates that maintenance action was necessary due to the GPS issue.

So, the specific maintenance required was:

📄 Source Documents: claims_686be326f6267c89ac0cad27_686be326f6267c89ac0cad27_evidence
📊 Confidence Score: 0.740
📈 Retrieved 15 relevant chunks


comparing retrieval methods

In [None]:
"""
def compare_retrieval_methods(query: str, verbose: bool = True) -> Dict:
    '''Compare all three retrieval methods for a single query'''
    
    methods = ["semantic", "keyword", "hybrid"]
    results = {}
    
    for method in methods:
        if verbose:
            print(f"\n🔍 Testing {method.upper()} retrieval:")
        
        # Get response using specific method
        if method == "semantic":
            sem_results = semantic_search(query, CONFIG['retrieval_k'])
            retrieved_docs = sem_results
        elif method == "keyword":
            key_results = keyword_search(query, CONFIG['retrieval_k'])
            retrieved_docs = key_results
        elif method == "hybrid":
            sem_results = semantic_search(query, CONFIG['retrieval_k'])
            key_results = keyword_search(query, CONFIG['retrieval_k'])
            retrieved_docs = reciprocal_rank_fusion(sem_results, key_results)
        
        # Generate response
        response_text = generate_response_hf(query, retrieved_docs)
        evaluation = evaluate_response_simple(query, response_text, retrieved_docs)
        
        response = {
            "query": query,
            "answer": response_text,
            "retrieval_method": method,
            "num_docs_retrieved": len(retrieved_docs),
            "retrieved_docs": retrieved_docs[:3],
            "doc_sources": list(set([doc["doc_id"] for doc in retrieved_docs])),
            "evaluation": evaluation
        }
        
        # Store results
        results[method] = {
            "response": response,
            "score": response["evaluation"].get("overall_score", 0)
        }
        
        if verbose:
            print(f"  Answer: {response['answer'][:150]}...")
            print(f"  Sources: {response['doc_sources']}")
            print(f"  Score: {results[method]['score']:.3f}")
    
    # Rank methods
    ranked_methods = sorted(results.items(), key=lambda x: x[1]['score'], reverse=True)
    
    comparison_result = {
        "query": query,
        "method_results": results,
        "rankings": ranked_methods,
        "best_method": ranked_methods[0][0],
        "best_score": ranked_methods[0][1]['score']
    }
    
    if verbose:
        print(f"\n🏆 BEST METHOD: {comparison_result['best_method'].upper()} (Score: {comparison_result['best_score']:.3f})")
    
    return comparison_result

# Example usage (uncomment to test):
# comparison = compare_retrieval_methods("Why was flight AC869 canceled?")
"""
