In [1167]:
from dotenv import load_dotenv
from pathlib import Path
from config import VECTORSTORE_PATH, CHUNK_SIZE, CHUNK_OVERLAP
import re
import os
import uuid
import tempfile
import numpy as np
import pandas as pd
import datetime  # To get the current date for the report

# Docx generation
from docx import Document
from docx.shared import Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH

# LangChain core
from langchain.schema import Document as LangChainDocument
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders import PyPDFLoader
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain.evaluation import load_evaluator
from langchain.retrievers import MultiVectorRetriever

# Docling & Utilities
from docling.document_converter import DocumentConverter
from utils.sitemap import get_sitemap_urls

In [1168]:
# Load environment variables (load_dotenv() is called with no arguments)
load_dotenv()
# Get OpenAI API key from the environment; this is None when the variable is not set
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
# Initialize the LLM shared by the cells below (model "gpt-4o-mini", temperature=0)
llm = ChatOpenAI(model="gpt-4o-mini", api_key=OPENAI_API_KEY, temperature=0)

In [1169]:
# Load once at startup
def get_embedding_function(api_key: str = None):
    """
    Build the OpenAI embeddings client used by the vector store helpers.

    :param api_key: Optional explicit API key. When None, the OPENAI_API_KEY
                    environment variable is read instead.
    :return: OpenAIEmbeddings object configured for "text-embedding-ada-002"
    :raises ValueError: If neither the argument nor the environment yields a non-empty key
    """
    resolved_key = os.environ.get("OPENAI_API_KEY") if api_key is None else api_key

    if resolved_key:
        return OpenAIEmbeddings(
            model="text-embedding-ada-002",
            openai_api_key=resolved_key
        )

    # Reached for a missing key as well as for an explicitly passed empty string
    raise ValueError("OpenAI API key not found. Set OPENAI_API_KEY in .env or pass it directly.")


# assign globally: built once here (no explicit key, so the environment variable is used)
# and passed to the pipeline/vectorstore helpers in later cells
embedding_function = get_embedding_function()


**Document Conversion and Chunking Pipeline**

In [1170]:
def extract_clean_text_from_docling(file_path: Path) -> str:
    """
    Run a file through Docling and return its exported text, whitespace-normalized.

    :param file_path: Path to the file to convert
    :return: Exported text with newlines and whitespace runs collapsed to single
             spaces, surrounding whitespace stripped, and "$ <digit>" tightened to "$<digit>"
    """
    exported = DocumentConverter().convert(file_path).document.export_to_text()

    # Cleanup passes, applied in this order:
    #   1. a newline plus any whitespace following it -> one space
    #   2. any remaining whitespace run -> one space (then strip the ends)
    #   3. "$ 5" -> "$5"
    without_newlines = re.sub(r'\n\s*', ' ', exported)
    single_spaced = re.sub(r'\s+', ' ', without_newlines).strip()
    return re.sub(r'\$ (\d)', r'$\1', single_spaced)

**Load and Split Document**

In [1171]:
def chunk_docling_file(file_path: Path, chunk_size: int = 1000, chunk_overlap: int = 200) -> list:
    """
    Extract a file's cleaned text via Docling and cut it into overlapping chunks.

    :param file_path: Path to the file to process
    :param chunk_size: Maximum number of characters per chunk
    :param chunk_overlap: Number of characters shared between neighbouring chunks
    :return: List of langchain Document objects, each carrying
             {"source": str(file_path)} as metadata
    """
    cleaned_text = extract_clean_text_from_docling(file_path)
    pieces = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    ).split_text(cleaned_text)

    documents = []
    for piece in pieces:
        # Every chunk records the originating file so results can be traced back
        documents.append(LangChainDocument(page_content=piece, metadata={"source": str(file_path)}))
    return documents

**Split Text Function**

In [1172]:
def split_text(text, chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP):
    """
    Cut a string into overlapping chunks with RecursiveCharacterTextSplitter.

    :param text: The input string to split
    :param chunk_size: Max characters per chunk (defaults to CHUNK_SIZE from config)
    :param chunk_overlap: Overlap between chunks (defaults to CHUNK_OVERLAP from config)
    :return: List of chunked text strings
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    return RecursiveCharacterTextSplitter(**splitter_options).split_text(text)


**Clean Chunk Text Function**

In [1173]:
#Improves LLM performance by removing token noise
def clean_chunk_text(text):
    """
    Normalize whitespace in a chunk and tighten "$ <digit>" into "$<digit>".

    Newlines (with any whitespace after them) and whitespace runs become single
    spaces, and leading/trailing whitespace is stripped.
    """
    # Newline handling first, then general whitespace collapsing, then strip
    collapsed = re.sub(r'\s+', ' ', re.sub(r'\n\s*', ' ', text)).strip()
    # Fix formatting like "$ 5B" -> "$5B"
    return re.sub(r'\$ (\d)', r'$\1', collapsed)

**Convert and Chunk Document Function**

In [1174]:
def convert_and_chunk_docling(file_path: Path) -> list[LangChainDocument]:
    """
    Convert a file with Docling and return it as embedding-ready chunks.

    Thin wrapper around chunk_docling_file, which already performs the Docling
    extraction itself. Extracting the text here as well would convert the
    document a second time only to discard the result.

    :param file_path: Path to the file to process
    :return: List of langchain Document objects produced by chunk_docling_file
    """
    return chunk_docling_file(file_path)

**Deduplicate Chunks Function**

In [1175]:
def deduplicate_chunks(chunks: list[LangChainDocument]) -> tuple[list[LangChainDocument], list[str]]:
    """
    Drop chunks whose page_content repeats an earlier chunk.

    Each chunk is keyed by a UUID5 (NAMESPACE_DNS) of its page_content; the
    first chunk seen for a key is kept and input order is preserved.

    :param chunks: List of LangChain Document objects
    :return: A tuple of (unique Document objects, their UUID strings), index-aligned
    """
    first_seen = {}  # chunk id -> first chunk with that content (dicts keep insertion order)
    for candidate in chunks:
        candidate_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, candidate.page_content))
        first_seen.setdefault(candidate_id, candidate)

    return list(first_seen.values()), list(first_seen.keys())

**Clean Filename Function**

In [1176]:
def clean_filename(name):
    """
    Replace every character other than ASCII letters, digits, "_" and "-" with "_".

    :param name: Raw name (e.g. a file stem) to sanitize
    :return: Sanitized string of the same length
    """
    disallowed = re.compile(r'[^a-zA-Z0-9_\-]')
    return disallowed.sub('_', name)

**Create Vectorstore Function**

In [1177]:
def create_vectorstore(chunks: list[LangChainDocument], embedding_function, collection_name: str,
                       vectorstore_path=VECTORSTORE_PATH):
    """
    Embed Document chunks into a Chroma collection and persist it.

    :param chunks: A list of langchain Document objects (expected to be deduplicated by the caller)
    :param embedding_function: An OpenAI-compatible embedding function
    :param collection_name: Collection name; sanitized with clean_filename before use
    :param vectorstore_path: Directory where the vector store is persisted
    :return: A Chroma vector store object
    """
    store_options = dict(
        documents=chunks,
        embedding=embedding_function,
        collection_name=clean_filename(collection_name),
        persist_directory=vectorstore_path,
    )
    store = Chroma.from_documents(**store_options)
    store.persist()
    # The message reports the name as given by the caller, not the sanitized one
    print(f"Saved vectorstore for: {collection_name}")
    return store


**Create Vectorstore from Text Function**

In [1178]:
def create_vectorstore_from_texts(text: str, embedding_function, collection_name, vectorstore_path=VECTORSTORE_PATH):
    """
    Chunk a raw string, wrap each chunk in a Document, and store them via create_vectorstore.

    :param text: Raw input text
    :param embedding_function: Embedding function (should be passed in)
    :param collection_name: Name of the vector DB collection
    :param vectorstore_path: Storage location
    :return: Chroma vectorstore object
    """
    documents = []
    for piece in split_text(text):
        # No metadata is attached here, unlike the file-based pipeline
        documents.append(LangChainDocument(page_content=piece))
    return create_vectorstore(documents, embedding_function, collection_name, vectorstore_path)

**Load Vectorstore Function**

In [1179]:
def load_vectorstore(collection_name, vectorstore_path=VECTORSTORE_PATH):
    """
    Open a Chroma collection from the persistence directory.

    :param collection_name: Collection name; sanitized with clean_filename, matching create_vectorstore
    :param vectorstore_path: Directory the collection was persisted to
    :return: Chroma vector store object bound to a freshly built embedding function
    """
    embeddings = get_embedding_function()
    safe_name = clean_filename(collection_name)
    return Chroma(
        collection_name=safe_name,
        embedding_function=embeddings,
        persist_directory=vectorstore_path,
    )


**Load Multiple Vectorstores Function** *(unfinished)*


In [1180]:
def load_multiple_vectorstores(collection_names: list[str], embedding_function, vectorstore_path=VECTORSTORE_PATH):
    """
    Load several Chroma collections and expose them as one combined retriever.

    Each collection is opened as an MMR retriever and the per-collection
    retrievers are merged with MergerRetriever. MultiVectorRetriever is not
    suitable here: it has no `retrievers` argument, because it indexes several
    vectors per document on a single vector store rather than combining retrievers.

    :param collection_names: List of collection names; each is sanitized with
                             clean_filename, as create_vectorstore and load_vectorstore do
    :param embedding_function: Embedding function used to create the vectorstores
    :param vectorstore_path: Base directory for Chroma persistence
    :return: MergerRetriever over all requested collections
    :raises ValueError: If collection_names is empty
    """
    # Local import: MergerRetriever is not among the notebook's top-level imports
    from langchain.retrievers import MergerRetriever

    if not collection_names:
        raise ValueError("collection_names must contain at least one collection name.")

    retrievers = [
        Chroma(
            collection_name=clean_filename(name),
            persist_directory=vectorstore_path,
            embedding_function=embedding_function,
        ).as_retriever(search_type="mmr")  # or "similarity"
        for name in collection_names
    ]

    return MergerRetriever(retrievers=retrievers)

**Pipeline from File Function**

In [1181]:
def pipeline_from_file_docling(
        file_path: Path,
        embedding_function,
        collection_name: str,
        vector_store_path: str = "db"
):
    """
    Chunk one file with Docling, deduplicate the chunks, and persist them to Chroma.

    Progress is reported with print() at each stage.

    :param file_path: Path to the file to ingest
    :param embedding_function: Embedding function handed to create_vectorstore
    :param collection_name: Name of the collection to create
    :param vector_store_path: Persistence directory (defaults to "db", not VECTORSTORE_PATH)
    :return: The Chroma vector store returned by create_vectorstore
    """
    print(f"\nProcessing: {file_path.name}")

    # Extract + chunk
    chunks = chunk_docling_file(file_path)  # or load_and_split_docling_text
    print(f"Chunks created: {len(chunks)}")

    # Deduplicate; the generated ids are not needed by this pipeline
    unique_chunks, _ = deduplicate_chunks(chunks)
    print(f"Unique chunks: {len(unique_chunks)}")
    print(f"{file_path.name}: {len(unique_chunks)} chunks created")

    # Embed + persist
    store = create_vectorstore(unique_chunks, embedding_function, collection_name, vector_store_path)

    print(f"Vectorstore created: {collection_name}")
    return store


**Batch Pipeline from Files Function**

In [1182]:
def batch_pipeline_from_files_docling(file_paths: list, embedding_function, vectorstore_path=VECTORSTORE_PATH):
    """
    Run pipeline_from_file_docling over several files, one collection per file.

    The collection name is the file's stem. A failure on one file is recorded
    in the results and does not stop the remaining files.

    :param file_paths: List of file paths to process
    :param embedding_function: Embedding function (OpenAI-compatible)
    :param vectorstore_path: Directory to persist Chroma vectorstore
    :return: List of (collection_name, status) results
    """
    outcomes = []
    for raw_path in file_paths:
        path = Path(raw_path)
        name = path.stem
        try:
            pipeline_from_file_docling(
                file_path=path,
                embedding_function=embedding_function,
                collection_name=name,
                vector_store_path=vectorstore_path
            )
        except Exception as e:
            # Best-effort batch: keep the error text as this file's status
            status = f"❌ Failed: {e}"
        else:
            status = "✅ Success"
        outcomes.append((name, status))
    return outcomes


**Retrievers for Vectorstore**

In [1183]:
def get_mmr_retriever(vectorstore, k=5, lambda_mult=0.5):
    """
    Wrap a vector store in a retriever that uses Maximal Marginal Relevance (MMR).

    Args:
        vectorstore: Object exposing as_retriever(search_type=..., search_kwargs=...)
        k (int): Top-k documents to return
        lambda_mult (float): Relevance-diversity balance. 1.0 = relevance only, 0.0 = diversity only.

    Returns:
        Whatever vectorstore.as_retriever returns for the MMR configuration
    """
    mmr_settings = dict(k=k, lambda_mult=lambda_mult)
    return vectorstore.as_retriever(search_type="mmr", search_kwargs=mmr_settings)


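**Query Vectorstore with MMR**

MMR builds the result set one chunk at a time, each time picking the candidate that maximizes:

$$\text{MMR} = \arg\max_{d \notin D}\left[\lambda \cdot \text{Relevance}(d, q) - (1 - \lambda) \cdot \text{Redundancy}(d, D)\right]$$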


#### 📌 Definitions:
- **d** — A candidate document or chunk
- **q** — The user query
- **D** — Set of already selected documents
- **λ (lambda)** — Trade-off parameter between relevance and diversity (range: 0 to 1)
- **Relevance(d, q)** — Similarity between document `d` and the query `q` (often using cosine similarity)
- **Redundancy(d, D)** — Maximum similarity between document `d` and any already selected document in `D`

#### 🎯 Tuning λ:
- **λ → 1.0**: Focuses on returning the most relevant chunks, even if they are redundant
- **λ → 0.0**: Focuses on diversity, returning different perspectives or less similar content
- **λ = 0.5**: Balanced trade-off between relevance and novelty

MMR is commonly used in **RAG pipelines** to improve the variety and informativeness of context passed to a language model.


In [1184]:
def query_vectorstore_mmr(query, vectorstore, k=5, lambda_mult=0.5):
    """
    Fetch the text of the top-k chunks for a query using an MMR retriever.

    Args:
        query (str): User query
        vectorstore: Chroma or other vector DB loaded with documents
        k (int): Number of top results to return
        lambda_mult (float): Balance between relevance (1.0) and diversity (0.0). Default = 0.5

    Returns:
        List of page_content strings, in the order the retriever returned them
    """
    mmr_retriever = get_mmr_retriever(vectorstore, k=k, lambda_mult=lambda_mult)

    texts = []
    for document in mmr_retriever.get_relevant_documents(query):
        texts.append(document.page_content)
    return texts


Testing retrieval of relevant chunks


## 🧠 Query Intent Classification Integration

### High-Level Overview
We improved our query handling system by integrating automatic **intent classification**. This allows the LLM to intelligently detect what kind of response the user expects — whether it's a **summary**, a **direct answer**, **data extraction**, or a **news check** — and adapt the output accordingly. This removes the need for hardcoded `if-else` logic and makes the assistant more flexible and intelligent.

### Low-Level Breakdown
- Introduced a new **intent classification function** using an LLM and a structured system prompt.
- The `classify_query_intent()` function feeds the query into a prompt asking the model to return one of three options: `summary`, `specific_question`, or `data_extraction`. (The `news_check` intent mentioned above is not handled by the code below.)
- This prompt was previously defined as a global variable; we refactored it into a reusable **function-based format**, making it more modular and easier to move (e.g., into a `config.py` file).
- The `respond_to_query()` function now uses this classified intent to route the query to the appropriate **LangChain prompt template**, ensuring the output matches the user's expectations.

In [1185]:
def intent_prompt():
    """
    Build the single-message intent classification prompt.

    The template takes one variable, {query}, and lists three options:
    summary, specific_question, data_extraction.

    :return: ChatPromptTemplate created from the template text
    """
    template = """
    You are an intent classifier for a competitive intelligence system. Determine what kind of response the user is expecting from the query below.

    Options:
    - summary
    - specific_question
    - data_extraction

    Query: {query}

    Respond with only one of the options above.
    """
    return ChatPromptTemplate.from_template(template)


# This is conceptual, you'll need to adapt it to your actual classify_query_intent function.
# It assumes a similar structure to other prompt templates you might be using.

def classify_query_intent(query, llm):
    """
    Ask the LLM to label a query as 'summary', 'specific_question' or 'data_extraction'.

    The model reply is lowercased and searched for the intent names, so extra
    words around the label are tolerated. Anything unrecognized falls back to
    'specific_question'.

    :param query: The user's query text
    :param llm: Chat model that can be piped after a ChatPromptTemplate
    :return: One of "data_extraction", "summary", "specific_question"
    """
    system_instructions = """
        You are an intelligent router that classifies user queries into specific intents.
        The possible intents are: 'summary', 'specific_question', 'data_extraction'.

        'summary': Use if the user wants a broad overview or general summary of the document.
        'specific_question': Use if the user is asking a direct, factual question that can be answered concisely.
        'data_extraction': Use if the user is asking for specific, structured information that can be extracted into sections,
                           like an 'executive summary', 'key takeaways', 'financial highlights', 'company overview', 'product lines', etc.

        Return only the intent word.
        """
    classification_prompt = ChatPromptTemplate.from_messages([
        ("system", system_instructions),
        ("human", "Query: {query}\nIntent:")
    ])

    # Prompt -> LLM, then normalize the reply text
    reply = (classification_prompt | llm).invoke({"query": query}).content.strip().lower()

    # Checked in this order; the first intent name found in the reply wins
    for known_intent in ("data_extraction", "summary", "specific_question"):
        if known_intent in reply:
            return known_intent

    return "specific_question"  # Default fallback


# def respond_to_query(vectorstore, query, llm, k=5, lambda_mult=0.5):
#     """
#     Responds to the user query using MMR-based document retrieval and an LLM.
#
#     Detects query intent and routes to the appropriate response format:
#     - summary: strategic overview
#     - specific_question: direct answer
#
#     Args:
#         vectorstore: Chroma vector DB
#         query (str): The user’s input or question
#         llm: An OpenAI-compatible language model
#         k (int): Number of top docs to retrieve
#         lambda_mult (float): Relevance-diversity tradeoff for MMR
#
#     Returns:
#         str: The LLM’s response
#     """
#     # Step 1: Detect intent
#     intent = classify_query_intent(query, llm)
#     print(f"🔍 Detected intent: {intent}")
#
#     # Step 2: Retrieve relevant chunks
#     chunks = query_vectorstore_mmr(query, vectorstore, k=k, lambda_mult=lambda_mult)
#     context = "\n\n".join(chunks)
#
#     # Step 3: Route to proper prompt
#     prompt_map = {
#         "summary": ChatPromptTemplate.from_template("""
#             You are a competitive intelligence analyst. Based on the following context, write a strategic summary.
#             Focus on company background, products, and key insights.
#
#             Context:
#             {context}
#         """),
#         "specific_question": ChatPromptTemplate.from_template("""
#             You are a helpful research assistant. Use the context to directly and concisely answer the user’s question.
#
#             User question: {query}
#             Context:
#             {context}
#         """),
#         # Add more routing options as needed
#     }
#
#     prompt = prompt_map.get(intent, prompt_map["specific_question"])
#     chain = prompt | llm
#
#     if intent == "summary":
#         return chain.invoke({"context": context}).content
#     else:
#         return chain.invoke({"context": context, "query": query}).content


def respond_to_query(vectorstore, query, llm, k=5, lambda_mult=0.5):
    """
    Answer a query from retrieved context, using a prompt chosen by detected intent.

    Flow: classify the query's intent, retrieve chunks with MMR, join them into
    one context string, then run the intent's prompt through the LLM.

    Intents and their prompts:
    - summary: strategic overview (the prompt receives only the context)
    - specific_question: direct answer (receives context and query)
    - data_extraction: plain-text extraction of requested sections (receives context and query)

    Args:
        vectorstore: Chroma vector DB
        query (str): The user's input or question
        llm: An OpenAI-compatible language model
        k (int): Number of top docs to retrieve
        lambda_mult (float): Relevance-diversity tradeoff for MMR

    Returns:
        str: The LLM's response
    """
    # Intent first: it decides which prompt is used further down
    intent = classify_query_intent(query, llm)
    print(f"🔍 Detected intent: {intent}")

    # Retrieved chunks are separated by blank lines in the context
    context = "\n\n".join(query_vectorstore_mmr(query, vectorstore, k=k, lambda_mult=lambda_mult))

    prompt_map = {
        "summary": ChatPromptTemplate.from_template("""
            You are a competitive intelligence analyst. Based on the following context, write a strategic summary.
            Focus on company background (use company name), products, and key insights.

            Context:
            {context}
        """),
        "specific_question": ChatPromptTemplate.from_template("""
            You are a helpful research assistant. Use the context to directly and concisely answer the user’s question.

            User question: {query}
            Context:
            {context}
        """),
        "data_extraction": ChatPromptTemplate.from_template("""
        You are an expert data extractor. From the provided context, extract the information requested by the user.

        **Output Instructions:**
        -   **Format:** Provide the output as plain text. Do NOT use any Markdown formatting (e.g., no bolding with **, no headings with ###, no bullet points with - or * or numbers).
        -   **Paragraphs:** Keep paragraphs short and concise, typically 2-4 sentences long.
        -   **Content:** Only provide the extracted information. Do NOT include any conversational filler, introductory phrases, or concluding remarks.
        -   **Use Company Name:** If the query asks for company background, include the company name from the context.

        User request: {query}
        Context:
        {context}
    """)
        # Add more routing options as needed
    }

    # Unknown intents fall back to the specific_question prompt
    selected_prompt = prompt_map.get(intent, prompt_map["specific_question"])

    # Only the summary prompt omits the query; every other route passes it along
    chain_inputs = {"context": context}
    if intent != "summary":
        chain_inputs["query"] = query

    return (selected_prompt | llm).invoke(chain_inputs).content

**Cosine Similarity Test**

In [1186]:
def cosine_similarity(vec1, vec2):
    """
    Cosine similarity between two vectors: dot(vec1, vec2) / (|vec1| * |vec2|).

    :param vec1: First vector (any sequence accepted by numpy)
    :param vec2: Second vector, same length as vec1
    :return: Similarity value; 0.0 when either vector has zero norm, since the
             ratio is undefined there and would otherwise evaluate to nan with
             a numpy RuntimeWarning
    """
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if norm_product == 0:
        return 0.0
    return np.dot(vec1, vec2) / norm_product


# Sanity check: embed two single words with the shared embedding_function and compare them
similarity = cosine_similarity(embedding_function.embed_query("cat"), embedding_function.embed_query("dog"))
print("Cosine Similarity:", similarity)

Cosine Similarity: 0.862957645542653


**Testing Single File Input**

In [1187]:
# NOTE(review): absolute, machine-specific path; this cell only runs where that file exists
file = Path("/Users/ralstonraphael/Desktop/flash-report-generator/ingestion/pdf_data/2025-q1-earnings-transcript.pdf")
# No vector_store_path is passed, so pipeline_from_file_docling uses its own default ("db")
vectorstore = pipeline_from_file_docling(file, embedding_function, "earnings_2025_q1")
# Ask one question against the freshly built store; respond_to_query also prints the detected intent
summary = respond_to_query(
    vectorstore,
    query="what company is this about?",
    llm=llm
)
print(summary)


Processing: 2025-q1-earnings-transcript.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Chunks created: 64
Unique chunks: 64
2025-q1-earnings-transcript.pdf: 64 chunks created
Saved vectorstore for: earnings_2025_q1
Vectorstore created: earnings_2025_q1
🔍 Detected intent: specific_question
The company being discussed is Google, as indicated by the mention of Sundar Pichai, who is the CEO of Google.


**Testing Batch File Inputs**

In [1188]:
# NOTE(review): absolute, machine-specific paths; one CSV, one DOCX and one PDF input
file_paths = [
    Path("/Users/ralstonraphael/Desktop/flash-report-generator/ingestion/csv_data/alphabet_full_profile.csv"),
    Path("/Users/ralstonraphael/Desktop/flash-report-generator/ingestion/docx_data/alphabet_ai_losses_article.docx"),
    Path("/Users/ralstonraphael/Desktop/flash-report-generator/ingestion/pdf_data/2025-q1-earnings-transcript.pdf")
]

# vectorstore_path is left at its default (VECTORSTORE_PATH); each file becomes a collection named after its stem
results = batch_pipeline_from_files_docling(file_paths, embedding_function)

# One line per file: "<collection name>: <status>"
for name, status in results:
    print(f"{name}: {status}")

Parameter `strict_text` has been deprecated and will be ignored.



Processing: alphabet_full_profile.csv
Chunks created: 2
Unique chunks: 2
alphabet_full_profile.csv: 2 chunks created


Parameter `strict_text` has been deprecated and will be ignored.


Saved vectorstore for: alphabet_full_profile
Vectorstore created: alphabet_full_profile

Processing: alphabet_ai_losses_article.docx
Chunks created: 3
Unique chunks: 3
alphabet_ai_losses_article.docx: 3 chunks created
Saved vectorstore for: alphabet_ai_losses_article
Vectorstore created: alphabet_ai_losses_article

Processing: 2025-q1-earnings-transcript.pdf


Parameter `strict_text` has been deprecated and will be ignored.


Chunks created: 64
Unique chunks: 64
2025-q1-earnings-transcript.pdf: 64 chunks created
Saved vectorstore for: 2025-q1-earnings-transcript
Vectorstore created: 2025-q1-earnings-transcript
alphabet_full_profile: ✅ Success
alphabet_ai_losses_article: ✅ Success
2025-q1-earnings-transcript: ✅ Success


In [1189]:
#Pydantic model for structured response

In [1190]:
# Reopen the transcript collection; the name matches the PDF's file stem used by the batch run
vs = load_vectorstore(collection_name="2025-q1-earnings-transcript")


In [1191]:
# summary = respond_to_query(vs, query="Generate a Flash report of this company", llm=llm)
# print(summary)


In [1192]:
# Load the vectorstore (assuming 'vs' is already loaded from previous cells for '2025-q1-earnings-transcript')
# If not, ensure it's loaded:
# vs = load_vectorstore(collection_name="2025-q1-earnings-transcript")

print("--- Extracting Report Sections ---")

# Each section below is one respond_to_query call (intent classification + MMR retrieval + LLM answer)
executive_summary = respond_to_query(
    vs,
    query="Provide a concise executive summary of the company's performance, focusing on overall results, key business changes, and outlook.",
    llm=llm
)
print(f"\nExecutive Summary:\n{executive_summary}\n")

key_takeaways = respond_to_query(
    vs,
    query="List the key takeaways and important strategic points from the document",
    llm=llm
)
print(f"\nKey Takeaways:\n{key_takeaways}\n")


financial_highlights = respond_to_query(
    vs,
    query="Summarize the main financial highlights and important figures, including revenues, profits, and cash flow.",
    llm=llm
)
print(f"\nFinancial Highlights:\n{financial_highlights}\n")

# Prepare data for the report
report_title = "Q1-2025 Norstella Quarterly Market Participant Update" # You might extract this dynamically too
report_date = datetime.date.today().strftime("%B %d, %Y") # Gets current date

# Keys match the ones create_flash_report reads via data.get(...)
report_data = {
    "report_title": report_title,
    "report_date": report_date,
    "executive_summary": executive_summary,
    "key_takeaways": key_takeaways,
    "financial_highlights": financial_highlights,
}

# Generate the DOCX report
# NOTE: only the banner is printed here; the file itself is written by the
# create_flash_report(...) call in a later cell.
print("\n--- Generating DOCX Report ---")

--- Extracting Report Sections ---
🔍 Detected intent: data_extraction

Executive Summary:
The company's performance has shown strong demand, particularly in Google Cloud, where customer demand has consistently outstripped capacity. To address this, the company is focused on ramping up its capabilities to meet customer needs while continuing to invest in long-term innovation. Recent strategic changes include the consolidation of teams, which enhances efficiency and accelerates the speed of bringing products to market. Overall, the company is committed to driving productivity and innovation responsibly, as reflected in its ongoing results.

🔍 Detected intent: data_extraction

Key Takeaways:
In Q1, Google experienced broad-based strength across various ad verticals, with Finance leading due to strong performance in Insurance. Retail, Healthcare, and Travel also contributed significantly to growth. For Q2, it is too early to make definitive comments, but potential impacts from the macro en

In [1193]:
def create_flash_report(data: dict, output_filename="Norstella_Flash_Report.docx"):
    """
    Generates a Flash Report in DOCX format based on the provided data.

    Layout: a "Norstella" page header, a centered title, the report date, an
    EXECUTIVE SUMMARY paragraph, bulleted KEY TAKEAWAYS and FINANCIAL HIGHLIGHTS
    sections, and a confidentiality footer. The document is saved to
    output_filename and a confirmation line is printed.

    Args:
        data (dict): A dictionary containing the extracted information.
                     Expected keys:
                     - "executive_summary": str
                     - "key_takeaways": list of str or single str
                     - "financial_highlights": list of str or single str
                     - "report_title": str (e.g., "Q4-24 Norstella Quarterly Market Participant Update")
                     - "report_date": str (e.g., "February 19th, 2025")
                     Missing or empty values are replaced with "not available" text.
        output_filename (str): The name of the output DOCX file.
    """
    document = Document()

    # --- Header (Simple text for now) ---
    section = document.sections[0]
    header = section.header
    # Ensure there's at least one paragraph in the header
    if not header.paragraphs:
        header.add_paragraph()
    paragraph = header.paragraphs[0]
    header_run = paragraph.add_run("Norstella")
    header_run.font.size = Pt(12)
    header_run.bold = True
    paragraph.alignment = WD_ALIGN_PARAGRAPH.LEFT

    # --- Main Content ---

    # Add Report Title
    title_paragraph = document.add_paragraph()
    title_run = title_paragraph.add_run(data.get("report_title", "Quarterly Market Participant Update"))
    title_run.font.size = Pt(24)
    title_run.bold = True
    title_paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
    document.add_paragraph(f"Report Date: {data.get('report_date', 'Date Not Available')}", style='Intense Quote')

    # Add Executive Summary
    document.add_heading("EXECUTIVE SUMMARY", level=1)
    # `or` also covers a key that is present but None/empty, which would otherwise break re.sub
    executive_summary_text = data.get("executive_summary") or "Executive summary not available."
    document.add_paragraph(_strip_llm_boilerplate(executive_summary_text))

    # Helper function to add bullet points, cleaning up LLM boilerplate
    def add_bullet_points(doc, heading_text, content_key):
        doc.add_heading(heading_text, level=1)
        content = data.get(content_key)
        if not content:
            doc.add_paragraph(f"{heading_text.lower().replace('highlights', 'information').replace('key ', '')} not available.")
            return

        if isinstance(content, str):
            content = _strip_llm_boilerplate(content)
            cleaned_points = _split_into_points(content)
            if not cleaned_points:  # Fallback if splitting didn't yield points
                doc.add_paragraph(content)
            else:
                for point in cleaned_points:
                    doc.add_paragraph(point, style='List Bullet')
        elif isinstance(content, list):
            for item in content:
                doc.add_paragraph(item, style='List Bullet')

    add_bullet_points(document, "KEY TAKEAWAYS", "key_takeaways")
    add_bullet_points(document, "FINANCIAL HIGHLIGHTS", "financial_highlights")

    # --- Footer ---
    section = document.sections[0]
    footer = section.footer
    # Ensure there's at least one paragraph in the footer
    if not footer.paragraphs:
        footer.add_paragraph()
    footer.paragraphs[0].text = "Confidential - Norstella Internal Report"
    footer.paragraphs[0].alignment = WD_ALIGN_PARAGRAPH.CENTER

    document.save(output_filename)
    print(f"Report '{output_filename}' generated successfully.")


def _strip_llm_boilerplate(text):
    """Remove the 'Strategic Summary' / 'Company Background' / 'Products' / 'Key Insights' scaffolding from LLM output."""
    text = re.sub(r'### Strategic Summary.*?(Company Background:.*?)?|\*\*Company Background:\*\*', '', text, flags=re.DOTALL | re.IGNORECASE).strip()
    text = re.sub(r'\*\*Products:.*|\*\*Key Insights:.*', '', text, flags=re.DOTALL | re.IGNORECASE).strip()
    return text


def _split_into_points(text):
    """Split text into non-empty lines, removing a leading list marker ("- ", "* ", "• ", "1. ", "2) ") from each."""
    # A marker must be followed by whitespace, so content that merely starts with a
    # digit, "-" or "*" ("2025 revenue", "-5% decline", "**Bold** text") is left intact.
    list_marker = r'^\s*(?:[\*\-\u2022]+|\d+[.)])\s+'
    points = []
    for line in text.split('\n'):
        point = re.sub(list_marker, '', line.strip()).strip()
        if point:  # Only keep lines that still have content
            points.append(point)
    return points

In [1194]:
# Write the report from the report_data dict assembled in the section-extraction cell
create_flash_report(report_data, "Norstella_Q1_2025_Flash_Report.docx")

Report 'Norstella_Q1_2025_Flash_Report.docx' generated successfully.


Add Q&A Recognition

Instead of building the whole DOCX myself, I can populate an existing template by adding placeholder variables where I want the information inserted.
Need to go to the SharePoint and access the Flash Reports.
use