## **Extract Hadith Data**

In [10]:
import fitz  # PyMuPDF
import re
import json
import pandas as pd
from datetime import datetime

def extract_hadiths_from_pdf(pdf_path):
    """Extract hadiths using PyMuPDF (more reliable)"""
    
    print(f"📖 Opening PDF with PyMuPDF: {pdf_path}")
    
    # Open PDF
    doc = fitz.open(pdf_path)
    total_pages = len(doc)
    print(f"📄 Total pages: {total_pages}")
    
    # Extract all text with page tracking
    all_text = ""
    page_map = {}
    current_pos = 0
    
    for page_num in range(total_pages):
        page = doc[page_num]
        page_text = page.get_text()
        
        page_start = current_pos
        page_end = current_pos + len(page_text)
        
        # Map character positions to page numbers
        for pos in range(page_start, page_end):
            page_map[pos] = page_num + 1  # 1-indexed
        
        all_text += page_text + "\n"
        current_pos = page_end + 1
    
    doc.close()
    
    print(f"✅ Extracted {len(all_text)} characters from PDF")
    
    # Show a sample of the text
    print("\n📝 Sample of extracted text (first 500 chars):")
    print(all_text[:500])
    print("...")
    
    # Try multiple patterns
    print("\n🔍 Searching for Hadith patterns...")
    
    patterns = [
        r'Hadith\s+No\.\s*(\d+)',              # Standard: Hadith No. 3
        r'Hadīth\s+No\.\s*(\d+)',              # With macron
        r'Hadith\s+No\s+(\d+)',                # Without period
        r'(?i)hadith\s*no\.?\s*(\d+)',        # Case insensitive
        r'Hadīth\s+No\s+(\d+)',                # Macron without period
    ]
    
    best_matches = []
    best_pattern = None
    
    for pattern in patterns:
        matches = list(re.finditer(pattern, all_text))
        print(f"  Pattern '{pattern}': Found {len(matches)} matches")
        if len(matches) > len(best_matches):
            best_matches = matches
            best_pattern = pattern
    
    if len(best_matches) == 0:
        print("\n❌ No hadith headers found!")
        print("\n💡 Let's search for 'Hadith' (case insensitive):")
        sample_matches = re.finditer(r'(?i)hadith', all_text)
        for i, m in enumerate(list(sample_matches)[:10]):
            start = max(0, m.start() - 20)
            end = min(len(all_text), m.end() + 50)
            print(f"  Match {i+1}: ...{all_text[start:end]}...")
        return []
    
    print(f"\n✅ Using pattern: '{best_pattern}' ({len(best_matches)} matches)")
    
    # Extract hadiths
    hadiths = []
    
    for i, match in enumerate(best_matches):
        hadith_num = match.group(1)
        start_pos = match.end()
        
        # Find where this hadith ends
        if i < len(best_matches) - 1:
            end_pos = best_matches[i + 1].start()
        else:
            end_pos = len(all_text)
        
        # Extract text
        hadith_text = all_text[start_pos:end_pos].strip()
        
        # Skip if too short
        if len(hadith_text) < 50:
            print(f"  ⚠️ Skipping Hadith {hadith_num}: Too short")
            continue
        
        # Get page range
        start_page = page_map.get(match.start(), 'Unknown')
        end_page = page_map.get(end_pos - 1, start_page)
        
        page_range = f"{start_page}" if start_page == end_page else f"{start_page}-{end_page}"
        
        hadith_data = {
            'hadith_no': int(hadith_num),
            'hadith_text': hadith_text,
            'page_start': start_page,
            'page_end': end_page,
            'page_range': page_range,
            'char_count': len(hadith_text),
            'word_count': len(hadith_text.split()),
            'extracted_at': datetime.now().isoformat()
        }
        
        hadiths.append(hadith_data)
        print(f"  ✓ Hadith {hadith_num}: Pages {page_range}, {hadith_data['word_count']} words")
    
    return hadiths

# Rest of the save functions remain the same...

def save_to_json(hadiths, output_path='hadiths.json'):
    """Save hadiths to JSON file"""
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(hadiths, f, ensure_ascii=False, indent=2)
    print(f"\n✅ Saved {len(hadiths)} hadiths to {output_path}")

def save_to_csv(hadiths, output_path='hadiths.csv'):
    """Save hadiths to CSV file"""
    df = pd.DataFrame(hadiths)
    df.to_csv(output_path, index=False, encoding='utf-8')
    print(f"✅ Saved {len(hadiths)} hadiths to {output_path}")

# def save_to_excel(hadiths, output_path='hadiths.xlsx'):
#     """Save hadiths to Excel file"""
#     df = pd.DataFrame(hadiths)
#     df.to_excel(output_path, index=False, engine='openpyxl')
#     print(f"✅ Saved {len(hadiths)} hadiths to {output_path}")

In [11]:

# Main execution
if __name__ == "__main__":
    print("=" * 70)
    print("🚀 EXTRACTING HADITHS FROM KITAB SULAYM IBN QAYS")
    print("=" * 70 + "\n")
    
    # Extract hadiths
    hadiths = extract_hadiths_from_pdf('kitab_sulaim_ibn_qays_al-hilaali.pdf')
    
    # Save in multiple formats
    save_to_json(hadiths)
    save_to_csv(hadiths)
    # save_to_excel(hadiths)
    
    # Print statistics
    print("\n" + "=" * 70)
    print("📊 EXTRACTION STATISTICS")
    print("=" * 70)
    print(f"Total Hadiths: {len(hadiths)}")
    print(f"Shortest Hadith: {min(h['word_count'] for h in hadiths)} words")
    print(f"Longest Hadith: {max(h['word_count'] for h in hadiths)} words")
    print(f"Average Length: {sum(h['word_count'] for h in hadiths) // len(hadiths)} words")
    
    # Show first 3 hadiths as sample
    print("\n📖 Sample Hadiths:")
    for hadith in hadiths[:3]:
        print(f"\n  Hadith No. {hadith['hadith_no']} (Page {hadith['page_range']})")
        print(f"  Preview: {hadith['hadith_text'][:150]}...")
    
    print("\n" + "=" * 70)
    print("✅ EXTRACTION COMPLETE!")
    print("=" * 70)

🚀 EXTRACTING HADITHS FROM KITAB SULAYM IBN QAYS

📖 Opening PDF with PyMuPDF: kitab_sulaim_ibn_qays_al-hilaali.pdf
📄 Total pages: 255
✅ Extracted 562017 characters from PDF

📝 Sample of extracted text (first 500 chars):
 
 
KITAB-E-SULAYM 
IBN 
QAYS AL HILALI 
 
Imam Ja’far Al Sadiq (a.s) said: 
“If anyone from our Shia and devotees does not have the book of Sulaym 
ibn Qays al Hilali, then he does not have any of our things, and he does not 
know any of our matters.  
This is the first book of Shia and is one of the secrets of Ale-Muhammad 
(a.s)” 
 
Introduction 
 
BISMILLAHIR RAHMANIR RAHEEM 
 
Wa Sallallahu ‘Ala Muhammad Wa Alahit Tayyabin al Muntakhabin 
 
And May Allah shower his blessings on the Holy Pro
...

🔍 Searching for Hadith patterns...
  Pattern 'Hadith\s+No\.\s*(\d+)': Found 66 matches
  Pattern 'Hadīth\s+No\.\s*(\d+)': Found 0 matches
  Pattern 'Hadith\s+No\s+(\d+)': Found 0 matches
  Pattern '(?i)hadith\s*no\.?\s*(\d+)': Found 91 matches
  Pattern 'Hadīth\s+No\s+(\d+)'

In [13]:
import json

def update_hadiths_json(input_file='hadiths.json', output_file='hadiths_updated.json'):
    """Add section_type to existing hadiths.json and remove is_preface"""
    
    print(f"📖 Reading {input_file}...")
    
    # Load existing JSON
    with open(input_file, 'r', encoding='utf-8') as f:
        hadiths = json.load(f)
    
    print(f"✅ Loaded {len(hadiths)} entries")
    
    # Update each entry
    updated_count = 0
    for hadith in hadiths:
        # Determine section_type based on hadith_no
        if hadith['hadith_no'] == 0:
            hadith['section_type'] = 'preface'
        else:
            hadith['section_type'] = 'hadith'
        
        # Remove is_preface key if it exists
        if 'is_preface' in hadith:
            del hadith['is_preface']
        
        updated_count += 1
    
    print(f"✅ Updated {updated_count} entries")
    
    # Save updated JSON
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(hadiths, f, ensure_ascii=False, indent=2)
    
    print(f"✅ Saved to {output_file}")
    
    # Show summary
    print("\n📊 Summary:")
    preface_count = sum(1 for h in hadiths if h['section_type'] == 'preface')
    hadith_count = sum(1 for h in hadiths if h['section_type'] == 'hadith')
    print(f"  Preface entries: {preface_count}")
    print(f"  Hadith entries: {hadith_count}")
    print(f"  Total: {len(hadiths)}")
    
    # Show sample
    print("\n📄 Sample entries:")
    for i, hadith in enumerate(hadiths[:3]):
        print(f"\n  Entry {i+1}:")
        print(f"    Hadith No: {hadith['hadith_no']}")
        print(f"    Section Type: {hadith['section_type']}")
        print(f"    Page Range: {hadith.get('page_range', 'N/A')}")
        print(f"    Preview: {hadith['hadith_text'][:80]}...")

if __name__ == "__main__":
    print("=" * 70)
    print("🔧 UPDATING HADITHS.JSON")
    print("=" * 70 + "\n")
    
    update_hadiths_json()
    
    print("\n" + "=" * 70)
    print("✅ UPDATE COMPLETE!")
    print("=" * 70)
    print("\n💡 Your updated file is: hadiths_updated.json")
    print("💡 To replace original: rename hadiths_updated.json → hadiths.json")

🔧 UPDATING HADITHS.JSON

📖 Reading hadiths.json...
✅ Loaded 92 entries
✅ Updated 92 entries
✅ Saved to hadiths_updated.json

📊 Summary:
  Preface entries: 1
  Hadith entries: 91
  Total: 92

📄 Sample entries:

  Entry 1:
    Hadith No: 0
    Section Type: preface
    Page Range: N/A
    Preview: Imam Ja’far Al Sadiq (a.s) said: “If anyone from our Shia and devotees does not ...

  Entry 2:
    Hadith No: 1
    Section Type: hadith
    Page Range: 6-9
    Preview: Sulaym says: “I heard Salman al-Farsi saying: “I was sitting with the Holy 
Prop...

  Entry 3:
    Hadith No: 2
    Section Type: hadith
    Page Range: 9-11
    Preview: Sulaym said: “I was told by Hadhrat Ali ibn Abu Talib (AS). He said: “I was 
goi...

✅ UPDATE COMPLETE!

💡 Your updated file is: hadiths_updated.json
💡 To replace original: rename hadiths_updated.json → hadiths.json


## **Train the JSON Data**

In [21]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain.schema import Document
from pinecone import Pinecone as PineconeBaseClient
from dotenv import load_dotenv
import json
import os

load_dotenv()

pc = PineconeBaseClient(api_key=os.getenv("PINECONE_API_KEY"))
INDEX_NAME = os.getenv("PINECONE_INDEX_NAME_SQ_V2")

def load_structured_hadiths(json_path='hadiths_updated.json'):
    """Load hadiths from JSON"""
    with open(json_path, 'r', encoding='utf-8') as f:
        hadiths = json.load(f)
    print(f"📖 Loaded {len(hadiths)} hadiths from {json_path}")
    return hadiths

def create_documents_from_hadiths(hadiths):
    """Convert hadiths to LangChain documents"""
    documents = []
    
    for hadith in hadiths:
        # CRITICAL: Put full text in page_content (no size limit)
        # Only put small reference data in metadata
        doc = Document(
            page_content=hadith['hadith_text'],  # ← Full text here (unlimited)
            metadata={
                'hadith_no': hadith['hadith_no'],
                'section_type': hadith.get('section_type', 'hadith'),
                'page_start': hadith.get('page_start', 'N/A'),
                'page_end': hadith.get('page_end', 'N/A'),
                'page_range': hadith.get('page_range', 'N/A'),
                'word_count': hadith.get('word_count', 0),
                'source': 'Kitab Sulaym ibn Qays'
                # ← No hadith_text in metadata! Only references
            }
        )
        documents.append(doc)
    
    print(f"✅ Created {len(documents)} documents")
    return documents

def store_in_pinecone(documents, INDEX_NAME):
    """Store documents in Pinecone"""
    
    embeddings = HuggingFaceEmbeddings(
        model_name='BAAI/bge-large-en-v1.5',
        model_kwargs={"device": "cuda"}
    )
    
    print("🔄 Creating embeddings and storing in Pinecone...")
    print(f"   This may take a few minutes for {len(documents)} documents...")
    
    # Store in batches to avoid timeouts
    pinecone_store = PineconeVectorStore.from_documents(
        documents=documents,
        embedding=embeddings,
        index_name=INDEX_NAME,
        batch_size=50  # ← Process in smaller batches
    )
    
    print(f"✅ Stored {len(documents)} hadiths in Pinecone index: {INDEX_NAME}")
    
    return pinecone_store

# Main execution
if __name__ == "__main__":
    print("=" * 70)
    print("🚀 TRAINING RAG WITH STRUCTURED HADITHS")
    print("=" * 70 + "\n")
    
    # Load structured hadiths
    hadiths = load_structured_hadiths()
    
    # Convert to documents
    documents = create_documents_from_hadiths(hadiths)
    
    # Show sample
    print("\n📄 Sample Document:")
    print(f"Hadith No: {documents[1].metadata['hadith_no']}")
    print(f"Section Type: {documents[1].metadata['section_type']}")
    print(f"Page Range: {documents[1].metadata['page_range']}")
    print(f"Text Preview: {documents[1].page_content[:150]}...")
    
    # Check metadata size
    import sys
    metadata_size = sys.getsizeof(str(documents[1].metadata))
    print(f"\n📊 Metadata size: {metadata_size} bytes (limit: 40960 bytes)")
    
    if metadata_size > 40000:
        print("⚠️ WARNING: Metadata too large!")
    else:
        print("✅ Metadata size OK")
    
    # Store in Pinecone
    print("\n" + "=" * 70)
    vector_store = store_in_pinecone(documents, INDEX_NAME)
    
    # Test retrieval
    print("\n🔍 Testing retrieval...")
    retriever = vector_store.as_retriever(search_kwargs={"k": 3})
    test_results = retriever.get_relevant_documents("attack on Fatimah house")
    
    print(f"\nRetrieved {len(test_results)} hadiths:")
    for doc in test_results:
        print(f"\n  Hadith No. {doc.metadata['hadith_no']} (Pages {doc.metadata['page_range']})")
        print(f"  Section: {doc.metadata['section_type']}")
        print(f"  Preview: {doc.page_content[:100]}...")
    
    print("\n" + "=" * 70)
    print("✅ TRAINING COMPLETE!")
    print("=" * 70)

🚀 TRAINING RAG WITH STRUCTURED HADITHS

📖 Loaded 92 hadiths from hadiths_updated.json
✅ Created 92 documents

📄 Sample Document:
Hadith No: 1
Section Type: hadith
Page Range: 6-9
Text Preview: Sulaym says: “I heard Salman al-Farsi saying: “I was sitting with the Holy 
Prophet (SAW) while he was in that period of illness in which he passed 
a...

📊 Metadata size: 199 bytes (limit: 40960 bytes)
✅ Metadata size OK

🔄 Creating embeddings and storing in Pinecone...
   This may take a few minutes for 92 documents...


PineconeApiException: (400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'Date': 'Sun, 26 Oct 2025 18:15:04 GMT', 'Content-Type': 'application/json', 'Content-Length': '115', 'Connection': 'keep-alive', 'x-pinecone-request-latency-ms': '1778', 'x-pinecone-request-id': '1677343850328051571', 'x-envoy-upstream-service-time': '73', 'server': 'envoy'})
HTTP response body: {"code":3,"message":"Metadata size is 49917 bytes, which exceeds the limit of 40960 bytes per vector","details":[]}


In [22]:
import json
import sys

# Load your JSON
with open('hadiths_updated.json', 'r', encoding='utf-8') as f:
    hadiths = json.load(f)

# Find the largest hadith
largest = max(hadiths, key=lambda h: len(h['hadith_text']))

print(f"Largest hadith: No. {largest['hadith_no']}")
print(f"Text length: {len(largest['hadith_text'])} characters")
print(f"Text size: {sys.getsizeof(largest['hadith_text'])} bytes")
print(f"\nMetadata without text:")
metadata = {k: v for k, v in largest.items() if k != 'hadith_text'}
print(f"Size: {sys.getsizeof(str(metadata))} bytes")

Largest hadith: No. 25
Text length: 49778 characters
Text size: 99630 bytes

Metadata without text:
Size: 240 bytes


In [23]:
from langchain_huggingface import HuggingFaceEmbeddings
from pinecone import Pinecone as PineconeBaseClient
from dotenv import load_dotenv
import json
import os
from tqdm import tqdm
import sys

load_dotenv()

pc = PineconeBaseClient(api_key=os.getenv("PINECONE_API_KEY"))
INDEX_NAME = os.getenv("PINECONE_INDEX_NAME_SQ_V2")

def load_hadiths(json_path='hadiths_updated.json'):
    """Load hadiths from JSON"""
    with open(json_path, 'r', encoding='utf-8') as f:
        hadiths = json.load(f)
    print(f"📖 Loaded {len(hadiths)} hadiths")
    return hadiths

def store_in_pinecone_manual(hadiths, INDEX_NAME):
    """Store hadiths directly in Pinecone (bypass LangChain)"""
    
    # Initialize embeddings model
    print("🔄 Initializing embedding model...")
    embeddings = HuggingFaceEmbeddings(
        model_name='BAAI/bge-large-en-v1.5',
        model_kwargs={"device": "cuda"}
    )
    
    # Get Pinecone index
    index = pc.Index(INDEX_NAME)
    
    print(f"🔄 Processing {len(hadiths)} hadiths...")
    
    vectors_to_upsert = []
    
    for hadith in tqdm(hadiths, desc="Creating embeddings"):
        # Create embedding for the text
        text = hadith['hadith_text']
        embedding = embeddings.embed_query(text)
        
        # Create MINIMAL metadata (NO TEXT!)
        metadata = {
            'hadith_no': int(hadith['hadith_no']),
            'section_type': hadith.get('section_type', 'hadith'),
            'page_range': hadith.get('page_range', 'N/A'),
            'word_count': int(hadith.get('word_count', 0)),
            'source': 'Kitab Sulaym ibn Qays'
        }
        
        # Verify metadata size
        metadata_size = sys.getsizeof(str(metadata))
        if metadata_size > 40000:
            print(f"⚠️ Hadith {hadith['hadith_no']} metadata too large: {metadata_size} bytes")
            continue
        
        # Create vector ID
        vector_id = f"hadith_{hadith['hadith_no']}"
        
        # Add to batch
        vectors_to_upsert.append({
            'id': vector_id,
            'values': embedding,
            'metadata': metadata
        })
        
        # Upsert in batches of 50
        if len(vectors_to_upsert) >= 50:
            try:
                index.upsert(vectors=vectors_to_upsert)
                print(f"✅ Uploaded batch of {len(vectors_to_upsert)}")
            except Exception as e:
                print(f"❌ Error uploading batch: {e}")
            vectors_to_upsert = []
    
    # Upsert remaining vectors
    if vectors_to_upsert:
        try:
            index.upsert(vectors=vectors_to_upsert)
            print(f"✅ Uploaded final batch of {len(vectors_to_upsert)}")
        except Exception as e:
            print(f"❌ Error uploading final batch: {e}")
    
    print(f"✅ Stored {len(hadiths)} hadiths in Pinecone")
    
    # Verify
    stats = index.describe_index_stats()
    print(f"📊 Index stats: {stats['total_vector_count']} vectors")

if __name__ == "__main__":
    print("=" * 70)
    print("🚀 STORING HADITHS IN PINECONE (MANUAL METHOD)")
    print("=" * 70 + "\n")
    
    # Load hadiths
    hadiths = load_hadiths()
    
    # Check metadata size for largest hadith
    print("\n📊 Checking metadata size for largest hadith...")
    largest = max(hadiths, key=lambda h: len(h['hadith_text']))
    sample_metadata = {
        'hadith_no': largest['hadith_no'],
        'section_type': 'hadith',
        'page_range': largest.get('page_range', 'N/A'),
        'word_count': largest.get('word_count', 0),
        'source': 'Kitab Sulaym ibn Qays'
    }
    size = sys.getsizeof(str(sample_metadata))
    print(f"✅ Metadata size: {size} bytes (limit: 40960)")
    print(f"✅ Safe to proceed!\n")
    
    # Store in Pinecone
    store_in_pinecone_manual(hadiths, INDEX_NAME)
    
    print("\n" + "=" * 70)
    print("✅ STORAGE COMPLETE!")
    print("=" * 70)
    print("\n💡 Next: Use the HadithRAG class for retrieval")

🚀 STORING HADITHS IN PINECONE (MANUAL METHOD)

📖 Loaded 92 hadiths

📊 Checking metadata size for largest hadith...
✅ Metadata size: 172 bytes (limit: 40960)
✅ Safe to proceed!

🔄 Initializing embedding model...
🔄 Processing 92 hadiths...


Creating embeddings:  53%|█████▎    | 49/92 [00:03<00:02, 16.23it/s]

✅ Uploaded batch of 50


Creating embeddings: 100%|██████████| 92/92 [00:08<00:00, 10.79it/s]


✅ Uploaded final batch of 42
✅ Stored 92 hadiths in Pinecone
📊 Index stats: 246 vectors

✅ STORAGE COMPLETE!

💡 Next: Use the HadithRAG class for retrieval


In [28]:
from langchain_huggingface import HuggingFaceEmbeddings
from pinecone import Pinecone as PineconeBaseClient
from dotenv import load_dotenv
import json
import os

load_dotenv()

class HadithRAG:
    def __init__(self, json_path='hadiths_updated.json'):
        """Initialize RAG system"""
        
        # Load full hadith texts from JSON
        print("📖 Loading hadiths from JSON...")
        with open(json_path, 'r', encoding='utf-8') as f:
            hadiths_list = json.load(f)
            # Create lookup dict by hadith_no
            self.hadiths = {h['hadith_no']: h for h in hadiths_list}
        
        print(f"✅ Loaded {len(self.hadiths)} hadiths")
        
        # Initialize embeddings
        print("🔄 Loading embedding model...")
        self.embeddings = HuggingFaceEmbeddings(
            model_name='BAAI/bge-large-en-v1.5',
            model_kwargs={"device": "cuda"}
        )
        
        # Connect to Pinecone
        pc = PineconeBaseClient(api_key=os.getenv("PINECONE_API_KEY"))
        self.index = pc.Index(os.getenv("PINECONE_INDEX_NAME_SQ_V2"))
        
        print("✅ RAG system ready!")
    
    def retrieve(self, query, k=10):
        """Retrieve relevant hadiths with full text"""
        
        # Create query embedding
        query_embedding = self.embeddings.embed_query(query)
        
        # Search Pinecone
        results = self.index.query(
            vector=query_embedding,
            top_k=k,
            include_metadata=True
        )
        
        # Lookup full texts from JSON
        retrieved_hadiths = []
        for match in results['matches']:
            hadith_no = match['metadata']['hadith_no']
            
            # Get full hadith data from JSON
            if hadith_no in self.hadiths:
                hadith_data = self.hadiths[hadith_no]
                
                retrieved_hadiths.append({
                    'hadith_no': hadith_no,
                    'page_range': match['metadata'].get('page_range', 'N/A'),  # ← Use .get()
                    'section_type': match['metadata'].get('section_type', 'hadith'),  # ← Use .get()
                    'text': hadith_data['hadith_text'],
                    'score': match['score']
                })
        
        return retrieved_hadiths
    
    def format_context(self, retrieved_hadiths):
        """Format hadiths for LLM context"""
        formatted = []
        
        for h in retrieved_hadiths:
            if h['section_type'] == 'preface':
                header = f"[Preface, Pages {h['page_range']}]"
            else:
                header = f"[Hadith No. {h['hadith_no']}, Pages {h['page_range']}]"
            
            formatted.append(f"{header}\n{h['text']}\n---")
        
        return "\n\n".join(formatted)
    
    def query(self, question, k=10):
        """Complete query pipeline"""
        print(f"\n🔎 Query: {question}\n")
        
        # Retrieve relevant hadiths
        retrieved = self.retrieve(question, k=k)
        print(f"📚 Retrieved {len(retrieved)} relevant hadiths")
        
        # Format context
        context = self.format_context(retrieved)
        
        return context, retrieved

# Test usage
if __name__ == "__main__":
    # Initialize RAG
    rag = HadithRAG()
    
    # Test query
    query = "Few people attacked the house of Lady Fatimah (SA) and burned the door"
    context, results = rag.query(query, k=5)
    
    print("\n" + "=" * 70)
    print("📋 RETRIEVED HADITHS:")
    print("=" * 70)
    
    for r in results:
        print(f"\n  Hadith No. {r['hadith_no']} (Pages {r['page_range']})")
        print(f"  Relevance: {r['score']:.4f}")
        print(f"  Preview: {r['text'][:150]}...")
    
    print("\n" + "=" * 70)
    print("📝 FORMATTED CONTEXT (for LLM):")
    print("=" * 70)
    print(context[:1000] + "...")

📖 Loading hadiths from JSON...
✅ Loaded 92 hadiths
🔄 Loading embedding model...
✅ RAG system ready!

🔎 Query: Few people attacked the house of Lady Fatimah (SA) and burned the door

📚 Retrieved 5 relevant hadiths

📋 RETRIEVED HADITHS:

  Hadith No. 1.0 (Pages 6-9)
  Relevance: 0.5763
  Preview: Sulaym says: “I heard Salman al-Farsi saying: “I was sitting with the Holy 
Prophet (SAW) while he was in that period of illness in which he passed 
a...

  Hadith No. 67.0 (Pages 230-236)
  Relevance: 0.5727
  Preview: Sulaym says: “I was present at Ali (AS), at the time when Ziyad ibn ‘Ubayd 
returned after having achieved victory in the Battle of Jamal. The house w...

  Hadith No. 67.0 (Pages 230-236)
  Relevance: 0.5727
  Preview: Sulaym says: “I was present at Ali (AS), at the time when Ziyad ibn ‘Ubayd 
returned after having achieved victory in the Battle of Jamal. The house w...

  Hadith No. 67.0 (Pages 230-236)
  Relevance: 0.5727
  Preview: Sulaym says: “I was present at Ali (AS), at 

In [30]:
from hadith_rag import HadithRAG
import google.generativeai as genai
import os
from system_prompt import system_prompt as input_system_prompt
from dotenv import load_dotenv

load_dotenv()

# Configure Gemini
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))

# System prompt
system_prompt = input_system_prompt

def get_answer(query, rag, model):
    """Get answer from RAG + Gemini"""
    
    # Retrieve context
    context, retrieved = rag.query(query, k=12)
    
    # Create prompt
    prompt = system_prompt.format(context=context, question=query)
    
    # Get response from Gemini
    print("🤔 Generating response with Gemini...\n")
    
    response = model.generate_content(prompt)
    
    return response.text, retrieved

if __name__ == "__main__":
    print("=" * 80)
    print("🚀 INITIALIZING HADITH RAG SYSTEM WITH GEMINI")
    print("=" * 80 + "\n")
    
    # Initialize RAG
    rag = HadithRAG()
    
    # Initialize Gemini model
    model = genai.GenerativeModel('gemini-1.5-pro')
    
    # Test queries
    queries = [
        "Tell me about Imam Ali talking to the sun as per what the Holy Prophet told him and how did the sun reply to Imam Ali. Also quote the entire incident and give hadith no and page number from the book.",
        "Few people attacked the house of Lady Fatimah (SA) and burned the door. What was the entire incident? Explain in details. Also provide references like hadith no and page."
    ]
    
    for query in queries:
        print("\n" + "=" * 80)
        print(f"Q: {query}")
        print("=" * 80 + "\n")
        
        answer, retrieved = get_answer(query, rag, model)
        
        print("RETRIEVED HADITHS:")
        for r in retrieved:
            print(f"  • Hadith No. {r['hadith_no']} (Pages {r['page_range']}) - Score: {r['score']:.4f}")
        
        print("\n" + "-" * 80)
        print("ANSWER:")
        print("-" * 80)
        print(answer)
        print("\n" + "=" * 80 + "\n")

ImportError: cannot import name 'HadithRAG' from 'hadith_rag' (d:\Sadiq\rag-for-hq-books\hadith_rag.py)