In [1]:


import io
import os
import re

import faiss
import fitz  # PyMuPDF
import numpy as np
import pytesseract
from IPython.display import display, Markdown
from PIL import Image
from sentence_transformers import SentenceTransformer


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from typing import List, Dict


In [14]:
from langchain.tools import Tool
from langchain.agents import initialize_agent, AgentType
from langchain.chat_models import ChatOpenAI


In [15]:
from dotenv import load_dotenv
import os

# Load variables from a local .env file into the process environment.
load_dotenv()

# Fetch the API key. Never print it: notebook outputs are saved with the file
# and would leak the secret to anyone the notebook is shared with.
openai_api_key = os.getenv("OPENAI_API_KEY")
if not openai_api_key:
    # Non-fatal on purpose: the PDF-extraction cells do not need the key.
    print("⚠️ OPENAI_API_KEY is not set; OpenAI-backed cells will fail until it is configured.")



In [16]:
from openai import OpenAI

# Module-level OpenAI client. RAGSystem.expand_query and
# RAGSystem.generate_answer look up this global name at call time, so this
# cell must run before either method is used.
client = OpenAI(api_key=openai_api_key)
# Optional sanity check of the key (lists the model ids it can access):
# models = client.models.list()
# print("✅ Models:", [m.id for m in models.data])


In [17]:
import os
import faiss
from sentence_transformers import SentenceTransformer
from openai import OpenAI   # ↳ v1 SDK

In [18]:
class PDFProcessor:
    """Extract the text layer of PDF files with PyMuPDF and save it to disk.

    Every PDF gets its own sub-folder ``<output_dir>/<pdf base name>/`` that
    holds ``<pdf base name>_extracted_text.txt``.
    """

    def __init__(self, output_dir="Titantranscriptextracted"):
        """Remember the root output folder and create it if it does not exist.

        Args:
            output_dir: Folder under which one sub-folder per PDF is created.
        """
        self.output_dir = output_dir
        os.makedirs(self.output_dir, exist_ok=True)

    def extract_text(self, pdf_path):
        """Return the text of every page of ``pdf_path`` and save it as UTF-8.

        Each page is prefixed with a ``--- Page N ---`` marker line. Errors
        while opening or reading the PDF are reported on stdout instead of
        being raised (best-effort extraction); whatever pages were read before
        the error are still returned and saved.

        Args:
            pdf_path: Path of the PDF file to read.

        Returns:
            The concatenated page text, or ``""`` when no page could be read.
            In that case no output file is written, so a previous successful
            extraction is not overwritten with an empty file.
        """
        base_name = os.path.splitext(os.path.basename(pdf_path))[0]
        pdf_output_dir = os.path.join(self.output_dir, base_name)
        os.makedirs(pdf_output_dir, exist_ok=True)

        # Collect per-page blocks and join once instead of repeated `+=`.
        page_blocks = []

        print(f"📄 Processing file: {pdf_path}")

        try:
            with fitz.open(pdf_path) as doc:
                for page_number, page in enumerate(doc, start=1):
                    page_text = page.get_text()
                    page_blocks.append(f"\n--- Page {page_number} ---\n{page_text}")
                    print(f"✅ Page {page_number}: Text length = {len(page_text)}")
        except Exception as e:
            print(f"[Failed to open PDF]: {e}")

        text = "".join(page_blocks)

        if not page_blocks:
            # Nothing was read (open failed or the document has no pages):
            # do not clobber an existing extraction with an empty file.
            print(f"⚠️ No pages extracted from {pdf_path}; no text file written.")
            return text

        # 💾 Save final extracted text
        output_text_path = os.path.join(pdf_output_dir, f"{base_name}_extracted_text.txt")
        with open(output_text_path, "w", encoding="utf-8") as f:
            f.write(text)
        print(f"💾 Saved text to: {output_text_path}")

        return text

In [19]:
# ── Extract management names from first 3 pages ──────
def _names_after_heading(page_text, keywords):
    """Collect name-like lines that directly follow a keyword heading line.

    ``keywords`` must already be lower-cased. Capturing starts after the first
    line containing a keyword and stops at the first blank line or at a line
    made only of capital letters and spaces (treated as the next heading).
    """
    names = []
    capture = False
    for line in page_text.split("\n"):
        stripped = line.strip()

        # A heading line switches capturing on and is never a name itself.
        if any(keyword in stripped.lower() for keyword in keywords):
            capture = True
            continue

        if not capture:
            continue

        if stripped == "" or re.match(r"^[A-Z ]{5,}$", stripped):
            break
        # Keep lines that look like "Name [Surname] [- Title]": 2 to 8 words.
        if 2 <= len(stripped.split()) <= 8:
            names.append(stripped)
    return names


def extract_management_names(pdf_path, keywords=("management", "participants", "executives"), max_pages=3):
    """Heuristically read management/participant names from the first pages of a PDF.

    Args:
        pdf_path: Path of the PDF to scan (opened with PyMuPDF).
        keywords: Heading keywords that introduce the list of names. Matching
            is case-insensitive; a single string is accepted as one keyword.
        max_pages: Number of leading pages to scan (default 3).

    Returns:
        The stripped name lines from the first scanned page that yields any,
        or ``[]`` when nothing is found or the PDF cannot be read (the error
        is printed rather than raised).
    """
    if isinstance(keywords, str):
        keywords = [keywords]
    # Lines are compared lower-cased, so the keywords must be lower-cased too;
    # empty keywords are dropped because "" is contained in every line.
    lowered = [keyword.lower() for keyword in keywords if keyword]

    try:
        with fitz.open(pdf_path) as doc:
            for page_num in range(min(max_pages, len(doc))):
                names = _names_after_heading(doc[page_num].get_text(), lowered)
                if names:
                    print(f"✅ Found likely management names on page {page_num+1}: {names}")
                    return names
    except Exception as e:
        print(f"[Error extracting names]: {e}")
    return []



In [20]:
from typing import List

class RAGSystem:
    """Speaker-aware retrieval-augmented question answering over transcript text files.

    Pipeline: ``load_and_chunk_texts`` splits ``.txt`` files into one chunk per
    speaker turn, ``embed_and_index`` embeds the chunks and stores them in a
    FAISS L2 index, ``query`` retrieves the nearest chunks and
    ``generate_answer`` asks the chat model to answer from them.

    Relies on names defined elsewhere in the notebook: ``SentenceTransformer``,
    ``faiss`` and the module-level OpenAI ``client``.
    """

    # Line prefixes that mark the start of a new speaker turn.
    DEFAULT_PREFIXES = ("Moderator:", "Analyst:", "Management:", "Operator:", "Speaker:", "Mr.", "Ms.", "Mrs.")

    # Number of most recent user/assistant messages replayed to the chat model.
    MAX_HISTORY = 2

    SYSTEM_PROMPT = (
        "You are a financial assistant. Use the speaker context to answer accurately. "
        "Mention speaker names if useful. Say 'Not in transcript' if info is missing."
    )

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """Load the sentence-embedding model and start with empty state.

        Args:
            model_name: Name passed to ``SentenceTransformer``.
        """
        self.embed_model = SentenceTransformer(model_name)
        self.text_chunks = []
        self.chunk_metadata = []  # One {"speaker", "role"} dict per chunk
        self.vectors = None
        self.index = None
        self.chat_history = []  # Recent user/assistant messages (no system prompt)

    def load_and_chunk_texts(self, text_folder: str, management_names: List[str] = None) -> None:
        """Read every ``.txt`` file under ``text_folder`` and split it into speaker turns.

        A new chunk starts at each line beginning with a known speaker prefix.
        Chunks are appended to ``self.text_chunks`` with parallel entries in
        ``self.chunk_metadata``; calling this twice therefore adds the files twice.

        Args:
            text_folder: Folder that is walked recursively.
            management_names: Optional management names. Both the full name
                and the first name, followed by a colon, are recognised as
                speaker prefixes and tagged with role ``"Management"``.
        """
        # Blank entries are ignored (they used to raise IndexError in the role check).
        cleaned_names = [name.strip() for name in (management_names or []) if name and name.strip()]
        management_labels = set()
        for name in cleaned_names:
            management_labels.add(name)             # "Ajoy Chawla:"
            management_labels.add(name.split()[0])  # "Ajoy:"

        prefixes = tuple(self.DEFAULT_PREFIXES) + tuple(label + ":" for label in sorted(management_labels))

        for root, dirs, files in os.walk(text_folder):
            dirs.sort()  # deterministic traversal -> reproducible chunk order
            for fname in sorted(files):
                if not fname.endswith(".txt"):
                    continue
                with open(os.path.join(root, fname), "r", encoding="utf-8") as f:
                    lines = f.read().split("\n")
                self._chunk_lines(lines, prefixes, management_labels)

        print(f"📄 Loaded {len(self.text_chunks)} speaker-based chunks.")

        print("\n🔍 Sample Chunks with Metadata:")
        for i in range(min(3, len(self.text_chunks))):
            print(f"\nChunk #{i+1}:")
            print(f"Text: {self.text_chunks[i][:200]}...")
            print(f"Metadata: {self.chunk_metadata[i]}")

    def _chunk_lines(self, lines, prefixes, management_labels) -> None:
        """Split one file's lines into speaker turns and store them with metadata."""
        parts = []
        speaker = ""
        role = ""

        for raw_line in lines:
            line = raw_line.strip()
            if line.startswith(prefixes):
                # A new speaker turn begins: store the turn collected so far.
                self._append_chunk(parts, speaker, role)
                parts = [line]
                # Text before the first colon is the speaker label; a prefix
                # line without a colon yields the whole line as the label.
                speaker = line.split(":", 1)[0].strip()
                role = "Management" if speaker in management_labels else "Unknown"
            else:
                parts.append(line)

        self._append_chunk(parts, speaker, role)

    def _append_chunk(self, parts, speaker, role) -> None:
        """Store the space-joined lines as one chunk unless they are blank."""
        chunk = " ".join(parts).strip()
        if chunk:
            self.text_chunks.append(chunk)
            self.chunk_metadata.append({"speaker": speaker, "role": role})

    def embed_and_index(self) -> None:
        """Embed all loaded chunks and build a FAISS ``IndexFlatL2`` over them.

        Raises:
            ValueError: If no chunks have been loaded yet.
        """
        if not self.text_chunks:
            raise ValueError("No text chunks loaded; call load_and_chunk_texts() first.")

        print("🔄 Embedding chunks …")
        self.vectors = self.embed_model.encode(self.text_chunks, convert_to_numpy=True)
        self.index = faiss.IndexFlatL2(self.vectors.shape[1])
        self.index.add(self.vectors)
        print(f"📦 FAISS index ready with {self.index.ntotal} vectors.")

    def expand_query(self, query: str) -> str:
        """Use GPT to rewrite the query with related phrases.

        Returns the original ``query`` when the API call fails or the model
        returns an empty expansion.
        """
        try:
            response = client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": "You are a helpful assistant that expands search queries by adding synonyms or related terms."},
                    {"role": "user", "content": f"Original query: {query}\n\nExpand this query with synonyms or related phrases:"}
                ],
                temperature=0.5
            )
            expanded = response.choices[0].message.content.strip()
            return expanded or query
        except Exception as e:
            print(f"[Query expansion error] {e}")
            return query

    def query(self, question: str, top_k: int = 3) -> List[dict]:
        """Return up to ``top_k`` chunks closest to the (expanded) question.

        Args:
            question: Natural-language question; expanded with ``expand_query``.
            top_k: Maximum number of chunks to return.

        Returns:
            A list of ``{"chunk": str, "metadata": dict}`` in index order.

        Raises:
            RuntimeError: If ``embed_and_index`` has not been called.
        """
        # Check before expanding so no API call is spent on a doomed query.
        if self.index is None:
            raise RuntimeError("Index not built; call embed_and_index() first.")

        expanded_question = self.expand_query(question)
        print(f"🔍 Expanded Query: {expanded_question}")
        q_vec = self.embed_model.encode([expanded_question], convert_to_numpy=True)

        k = min(top_k, len(self.text_chunks))
        if k <= 0:
            return []

        _, idxs = self.index.search(q_vec, k)
        # FAISS pads missing neighbours with -1; used as a Python index that
        # would silently return the last chunk, so such ids are dropped.
        return [
            {
                "chunk": self.text_chunks[i],
                "metadata": self.chunk_metadata[i]
            }
            for i in idxs[0]
            if 0 <= i < len(self.text_chunks)
        ]

    def generate_answer(self, question: str, top_k: int = 3) -> str:
        """Answer ``question`` from retrieved chunks, with short conversation memory.

        The request always starts with the system prompt, followed by the
        remembered messages and the new user prompt. History is only updated
        after a successful call and trimmed to ``MAX_HISTORY`` messages.

        Returns:
            The model's answer, or ``"[OpenAI error] ..."`` if the call fails.
        """
        retrieved = self.query(question, top_k=top_k)

        context = ""
        for item in retrieved:
            # Chunks before the first speaker line carry empty metadata.
            speaker = item["metadata"].get("speaker") or "Unknown"
            role = item["metadata"].get("role") or "Unknown"
            chunk = item["chunk"]
            context += f"[{role}] {speaker}:\n{chunk}\n\n"

        user_message = {
            "role": "user",
            "content": f"Context:\n{context}\n\nQuestion: {question}\nAnswer:",
        }
        remembered = [m for m in self.chat_history if m.get("role") != "system"]
        messages = [{"role": "system", "content": self.SYSTEM_PROMPT}] + remembered + [user_message]

        try:
            resp = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=messages,
                temperature=0.9
            )
            answer = resp.choices[0].message.content.strip()
        except Exception as e:
            # History is left untouched so a failed call leaves no orphan messages.
            return f"[OpenAI error] {e}"

        remembered.append(user_message)
        remembered.append({"role": "assistant", "content": answer})
        # `[-0:]` would keep everything, hence the explicit zero case.
        self.chat_history = remembered[-self.MAX_HISTORY:] if self.MAX_HISTORY > 0 else []

        return answer


In [26]:
# ── LangChain ReAct Agent with Trace ──────────────────

def build_react_agent_with_trace(rag: RAGSystem):
    """Wrap a RAGSystem's methods as LangChain tools and return a verbose ReAct agent.

    Three tools are exposed: query expansion, index search (top 3 chunks) and
    answer generation (top 3 chunks). The agent is a zero-shot ReAct agent
    driven by ``gpt-4o-mini``; ``verbose=True`` prints the full
    thought/action trace to the console.
    """

    def search_index(q):
        return rag.query(q, top_k=3)

    def generate_answer(q):
        return rag.generate_answer(q, top_k=3)

    # (name, callable, description) for every tool offered to the agent.
    tool_specs = (
        (
            "ExpandQueryTool",
            rag.expand_query,
            "Expands search query with synonyms or related terms.Use this tool only for vague queries",
        ),
        (
            "SearchIndexTool",
            search_index,
            "Searches the FAISS index and returns top relevant chunks.",
        ),
        (
            "GenerateAnswerTool",
            generate_answer,
            "Generates an answer from the retrieved chunks and original question.",
        ),
    )
    tools = [
        Tool(name=tool_name, func=tool_func, description=tool_description)
        for tool_name, tool_func, tool_description in tool_specs
    ]

    llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.7)
    return initialize_agent(
        tools=tools,
        llm=llm,
        agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
        verbose=True,  # full trace output to console
    )



In [27]:
# ── Example Execution ──────────────────────────────────
# Build the retrieval pipeline: load the embedding model, split every .txt
# file under "Titantranscriptextracted" into speaker-based chunks, then embed
# the chunks and build the FAISS index. No management names are passed here,
# so only the built-in speaker prefixes are used for chunking.
rag = RAGSystem()
rag.load_and_chunk_texts("Titantranscriptextracted")
rag.embed_and_index()

📄 Loaded 17 speaker-based chunks.

🔍 Sample Chunks with Metadata:

Chunk #1:
Text: --- Page 1 ---  Titan Company Limited `INTEGRITY` #193  Veerasandra  Electronics City P.O.  Off Hosur Main Road, Bangalore 560100 India. Tel: 9180 6704 7000 Fax: 9180 6704 6262 Registered Office 3, Si...
Metadata: {'speaker': '', 'role': ''}

Chunk #2:
Text: Moderator:  Ladies and gentlemen, good day and welcome to Titan Company Limited's Q4 & FY'25 Earnings Conference Call. As a reminder, all participant lines will be in the listen-only mode and there wi...
Metadata: {'speaker': 'Moderator', 'role': 'Unknown'}

Chunk #3:
Text: Moderator: Thank you very much. We will now begin the question-and-answer session. First question is from the line of Manoj Menon from ICICI Securities. Please go ahead. Manoj Menon: Hi. first of all,...
Metadata: {'speaker': 'Moderator', 'role': 'Unknown'}
🔄 Embedding chunks …
📦 FAISS index ready with 17 vectors.


In [28]:
# Wrap the indexed RAG system in a ReAct agent and ask one question.
# The agent is built with verbose=True, so its reasoning trace is printed
# before the final answer.
agent = build_react_agent_with_trace(rag)
response = agent.run("How did jewellery business perform?")
print("\n🧾 Final Answer:\n", response)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mTo understand how the jewellery business performed, I may need to look for recent reports or analyses related to the jewellery industry, such as sales trends, market growth, and consumer behavior. Since the query is somewhat vague, I should expand it to include more specific terms related to jewellery business performance. 

Action: ExpandQueryTool  
Action Input: "jewellery business performance"  [0m
Observation: [36;1m[1;3mjewelry company financial results[0m
Thought:[32;1m[1;3mThe expanded query provided relevant terms such as "jewelry company financial results," which can help in understanding the performance of the jewellery business. Now, I should search for specific financial results or performance reports related to jewellery companies.

Action: SearchIndexTool  
Action Input: "jewelry company financial results"  [0m🔍 Expanded Query: jewelry company earnings report, jewelry company fiscal performance, jewelry b

In [76]:
# Instantiate a fresh RAG system (rebinds `rag`, so chunks, index and chat
# history start empty). NOTE(review): this repeats the "Example Execution"
# cell above — consider keeping only one of them.
rag = RAGSystem()

# Load the extracted transcript text and split it into speaker-based chunks
rag.load_and_chunk_texts("Titantranscriptextracted")  # Your text folder

# Embed the chunks and build the FAISS index
rag.embed_and_index()



📄 Loaded 17 speaker-based chunks.

🔍 Sample Chunks with Metadata:

Chunk #1:
Text: --- Page 1 ---  Titan Company Limited `INTEGRITY` #193  Veerasandra  Electronics City P.O.  Off Hosur Main Road, Bangalore 560100 India. Tel: 9180 6704 7000 Fax: 9180 6704 6262 Registered Office 3, Si...
Metadata: {'speaker': '', 'role': ''}

Chunk #2:
Text: Moderator:  Ladies and gentlemen, good day and welcome to Titan Company Limited's Q4 & FY'25 Earnings Conference Call. As a reminder, all participant lines will be in the listen-only mode and there wi...
Metadata: {'speaker': 'Moderator', 'role': 'Unknown'}

Chunk #3:
Text: Moderator: Thank you very much. We will now begin the question-and-answer session. First question is from the line of Manoj Menon from ICICI Securities. Please go ahead. Manoj Menon: Hi. first of all,...
Metadata: {'speaker': 'Moderator', 'role': 'Unknown'}
🔄 Embedding chunks …
📦 FAISS index ready with 17 vectors.


In [73]:

# Ask a question directly (no agent): retrieves the top chunks for the
# expanded query and lets the chat model answer from them.
response = rag.generate_answer("What are the risk for the business ?")
print("Answer:", response)

🔍 Expanded Query: What are the potential hazards for the business?
Answer: The risks for the business include the following:

1. **Market Expansion vs. Cannibalization**: Ajoy Chawla mentioned the concern of expanding the market for Lab-Grown Diamonds (LGDs) potentially cannibalizing the existing natural diamond business. Understanding customer sentiment is crucial to avoid negatively impacting current sales.
  
2. **Customer Uncertainty**: Customers are reportedly unsure about LGDs, which could affect demand. There is a risk that the market for LGDs does not expand as anticipated, leading to fluctuations in customer acquisition and retention.

3. **Pricing Volatility**: Natural diamond prices have shown fluctuations, influenced by various factors such as international demand and market conditions. Price volatility in gold and diamonds can impact margins significantly, as highlighted by Ashok Sonthalia.

4. **Consumer Behavior Changes**: The potential for a shift in consumer preference

In [67]:

# Ask a follow-up question. "it" can only be resolved from the previous
# exchange kept in rag.chat_history, so this cell depends on an earlier
# generate_answer call on the same `rag` instance.
# NOTE(review): the execution counts are out of order (67 here, 73 and 76
# above) — re-run top to bottom to confirm the saved output is reproducible.
response = rag.generate_answer("who spoke about it ?")
print("Answer:", response)

Answer: The discussions about hedging gains and gold price were primarily addressed by Ashok Sonthalia. The commentary on jewelry growth outlook and related topics was provided by Ajoy Chawla.


## Next step: try a vector database that supports metadata filtering

Great question. You've built a solid **basic RAG system** with:

* ✅ Text chunking
* ✅ FAISS-based retrieval
* ✅ SentenceTransformer embeddings
* ✅ GPT-based answer generation

Now, to **level it up** step by step, here’s a structured roadmap of enhancements — from low complexity to high sophistication.

---

## 🔰 LEVEL 1: Chunking & Preprocessing Enhancements

### 1. **Smarter Chunking**

* Current: Speaker-based
* ✅ Upgrade to **dynamic chunking**:

  * Fixed-size with overlap (e.g., 100 words + 20 overlap)
  * Sentence/window-based chunking
  * Scene/topic segmentation via LLM

### 2. **Metadata Tagging**

* Add metadata like:

  * Speaker name
  * Section (Q\&A, Management Commentary)
  * Timestamp/page number
* Helps in **filtered retrieval** later

---

## 🧠 LEVEL 2: Better Retrieval

### 3. **Switch to Cosine Similarity**

* Use `faiss.IndexFlatIP` and L2-normalize the vectors (e.g., with `faiss.normalize_L2`) so that the inner product equals cosine similarity
* Cosine is better for semantic similarity than Euclidean in most NLP tasks

### 4. **Hybrid Search**

* Combine:

  * 🔍 Keyword/BM25 (e.g., `whoosh`, `elasticsearch`)
  * 🤖 Vector-based search
* Ranks results using both lexical + semantic relevance

### 5. **Query Expansion**

* Use LLM or WordNet to add synonyms to query
* Boosts recall for narrow or ambiguous questions

---

## 🚀 LEVEL 3: Answer Generation Improvements

### 6. **Multi-turn Memory**

* Add **chat history** context to prompt
* Makes follow-up questions work better

### 7. **Answer Attribution**

* Add source info to each chunk (e.g., page, speaker)
* Let the model cite where the answer came from

### 8. **Response Validation**

* Use another LLM call to **validate or double-check** answer quality
* Flag hallucinations or vague responses

---

## 🧩 LEVEL 4: Architecture & Scalability

### 9. **Use LangChain or LangGraph**

* Manage prompt templates, memory, agent workflows, retries
* Build event-driven or conditional routing logic

### 10. **Knowledge Graph Augmented RAG**

* Extract entities & relations → store in Neo4j
* Allow graph-based lookup to complement text retrieval

### 11. **Query Classification & Routing**

* First classify query type:

  * Numerical (→ Finetuned tool)
  * Analytical (→ RAG)
  * Entity-based (→ KG)
* Route accordingly (multi-retriever setup)

---

## 🤖 LEVEL 5: Fully Agentic RAG

### 12. **Planning + Tool Use (ReAct)**

* Use LLM to decide:

  * Step 1: retrieve
  * Step 2: summarize
  * Step 3: synthesize answer
* Great for long docs or complex tasks

### 13. **Multi-agent setup**

* Separate agents for:

  * Retrieval
  * Question understanding
  * Answer generation
* All orchestrated via a graph or router

---

## 🛡️ LEVEL 6: Hardening & Monitoring

### 14. **Evaluation & Feedback Loop**

* Use `Ragas` or a human eval loop (note: `ReLoRA` is a low-rank training technique, not an evaluation tool)
* Tune chunking / embedding / prompt based on real queries

### 15. **Access Logging, Rate Limiting, Monitoring**

* Track latency, failures, feedback
* Prepare for scale/production

---

## ✅ Summary Table

| Category                 | Upgrade Ideas                                          |
| ------------------------ | ------------------------------------------------------ |
| Chunking & Preprocessing | Dynamic chunking, metadata, speaker-role detection     |
| Retrieval                | Cosine, hybrid, keyword+vector, query rewrite          |
| Answer Generation        | Chat history, validation, attribution                  |
| Architecture             | LangChain/LangGraph, multi-retrievers, knowledge graph |
| Agentic Systems          | ReAct, planning, multi-agent workflows                 |
| Productionization        | Evaluation loop, monitoring, feedback-based refinement |

---

Let me know your current goals (e.g., quality, speed, production-readiness), and I’ll prioritize suggestions accordingly.
