In [5]:
# Imports & Path Setup
import os
import json
from pprint import pprint
from pathlib import Path
import shutil # shutil for deleting directories

# Configuration
# When True, the previously generated extractor, chunker, and embedder outputs
# are deleted by the steps further down so that those steps run again.
# The downloader step has no such override: existing HTML files are kept.
FORCE_RERUN = False

# Path Setup
# The current working directory is treated as the project root, so start the
# notebook from the folder that should contain 'data', 'chunks', and
# 'sample_outputs'.
root_dir = Path(os.getcwd())

# Directories for downloaded data, extracted text, chunks, and saved responses
data_dir = root_dir.joinpath("data")
extracted_dir = data_dir.joinpath("extracted")
chunks_dir = root_dir.joinpath("chunks")
sample_outputs_dir = root_dir.joinpath("sample_outputs")

# Single-file artifacts: all chunked data, and all embedded chunks
chunks_file = chunks_dir.joinpath("all_chunks.json")
embedded_file = data_dir.joinpath("embedded_chunks.json")

# Create every output directory that is missing; existing ones are left as-is.
for _output_dir in (data_dir, extracted_dir, chunks_dir, sample_outputs_dir):
    _output_dir.mkdir(parents=True, exist_ok=True)

# Expected Files
# Tickers and filing years for which a 10-K is expected
companies = ["GOOGL", "MSFT", "NVDA"]
years = [2022, 2023, 2024]

# One "<TICKER>_<YEAR>_10K" stem per filing, ordered by company and then year.
_filing_stems = [f"{ticker}_{year}_10K" for ticker in companies for year in years]

# Paths of the downloaded HTML filings and of their extracted-text counterparts
expected_htmls = [data_dir / f"{stem}.html" for stem in _filing_stems]
expected_txts = [extracted_dir / f"{stem}.txt" for stem in _filing_stems]

# Helper Function
def files_exist(filepaths):
    """Return True only if every Path in ``filepaths`` exists on disk.

    Stops at the first missing path. An empty iterable yields True.
    """
    for candidate in filepaths:
        if not candidate.exists():
            return False
    return True

print(f"--- Starting Main Pipeline (FORCE_RERUN = {FORCE_RERUN}) ---")

# Step 1: Downloader
# This step checks if all required HTML files exist. If not, it runs the downloader.
# The downloader itself has logic to skip individual files if they already exist.
if not files_exist(expected_htmls):
    print("Missing HTMLs. Running downloader...")
    %run downloader.ipynb
    try:
        run_downloader() # Calling the main function from downloader.ipynb
    except NameError:
        print("Please make sure `run_downloader()` is defined in downloader.ipynb and you run that notebook first.")
else:
    print("All 10-K HTML files found. Skipping downloader.")

# Step 2: Extractor
# If FORCE_RERUN is True, it deletes existing extracted text files to force re-extraction.
if FORCE_RERUN and extracted_dir.exists():
    print("FORCE_RERUN is True: Deleting existing extracted .txt files to force extraction.")
    shutil.rmtree(extracted_dir, ignore_errors=True) # Delete the entire extracted directory
    extracted_dir.mkdir(parents=True, exist_ok=True) # Recreate an empty extracted directory

# Check if all expected extracted text files exist. If not, run the extractor.
if not files_exist(expected_txts):
    print("Running extractor to generate .txt files...")
    %run extractor.ipynb
    try:
        run_extractor() # Call the main function from extractor.ipynb
    except NameError:
        print("Please make sure `run_extractor()` is defined in extractor.ipynb and you run that notebook first.")
else:
    print("All extracted .txt files found. Skipping extractor.")

# Step 3: Chunker
# If FORCE_RERUN is True, it deletes the existing chunks file to force re-chunking.
if FORCE_RERUN and chunks_file.exists():
    print("FORCE_RERUN is True: Deleting existing chunks file to force chunking.")
    if chunks_dir.exists():
        shutil.rmtree(chunks_dir, ignore_errors=True) # Delete the entire chunks directory
    chunks_dir.mkdir(parents=True, exist_ok=True) # Recreate an empty chunks directory

# Check if the 'all_chunks.json' file exists. If not, run the chunker.
if not chunks_file.exists():
    print("No chunks file found. Running chunker...")
    %run chunker.ipynb
    try:
        run_chunker() # Call the main function from chunker.ipynb
    except NameError:
        print(" Please make sure `run_chunker()` is defined in chunker.ipynb and you run that notebook first.")
else:
    print("Chunks already exist. Skipping chunker.")

# Step 4: Embedder
# If FORCE_RERUN is True, it deletes the existing embedded chunks file to force re-embedding.
if FORCE_RERUN and embedded_file.exists():
    print("FORCE_RERUN is True: Deleting existing embeddings file to force embedding.")
    os.remove(embedded_file) # Delete the embedded_chunks.json file

# Check if the 'embedded_chunks.json' file exists. If not, run the embedder.
if not embedded_file.exists():
    print("Running embedder to generate vector embeddings...")
    %run embedder.ipynb
    try:
        run_embedder() # Call the main function from embedder.ipynb
    except NameError:
        print(" Please make sure `run_embedder()` is defined in embedder.ipynb and you run that notebook first.")
else:
    print(" Embeddings already exist. Skipping embedder.")

# Step 5: Agent Query Engine
# Execute agent_query_engine.ipynb with the IPython `%run` magic. Unlike the
# earlier steps this one is unconditional: it runs on every pass of the cell.
# The query loop further down calls `agent_query`, which this notebook is
# expected to define (the notebook itself is not visible here - confirm).
print("\n Loading RAG Agent...")
%run agent_query_engine.ipynb

#  Step 6: Ask Your Questions & Save Outputs
# One (name, query, filename) row per query type. Each row is expanded into a
# dict with the keys "name", "query", and "filename", in that order.
test_queries = [
    {"name": label, "query": question, "filename": out_name}
    for label, question, out_name in (
        (
            "Basic Metrics",
            "What was Microsoft's total revenue in 2023?",
            "microsoft_revenue_2023.json",
        ),
        (
            "YoY Comparison",
            "How did NVIDIA's data center revenue grow from 2022 to 2023?",
            "nvidia_datacenter_growth_2022_2023.json",
        ),
        (
            "Cross-Company Analysis",
            "Which company had the highest operating margin in 2023?",
            "highest_operating_margin_2023.json",
        ),
        (
            "Segment Analysis",
            "What percentage of Google's revenue came from cloud in 2023?",
            "google_cloud_revenue_percentage_2023.json",
        ),
        (
            "AI Strategy",
            "Compare AI investments mentioned by all three companies (Google, Microsoft, NVIDIA) in their 2024 10-Ks.",
            "ai_investments_comparison_2024.json",
        ),
    )
]

# Iterate through each query, execute it, print the result, and save it as
# JSON under sample_outputs_dir. A failure on one query is reported and the
# loop moves on to the next query.
for query_info in test_queries:
    query_name = query_info["name"]
    query_string = query_info["query"]
    output_file_path = sample_outputs_dir / query_info["filename"]
    not_saved_message = (
        f"\n Could not save response to {output_file_path} "
        f"because no response was generated for '{query_name}'."
    )

    print(f"\n--- Testing with a {query_name} query ---")
    print(f" Querying agent: {query_string}")

    response = None  # Stays None whenever the query could not be run

    # Look the function up separately from calling it, so that only a missing
    # `agent_query` is reported as "not found". A NameError raised inside the
    # function is a query failure and is reported as such below.
    try:
        query_fn = agent_query
    except NameError:
        print(" Error: `agent_query` function not found. Make sure `agent_query_engine.ipynb` ran successfully.")
        print(not_saved_message)
        continue

    try:
        response = query_fn(query_string)
    except Exception as e:
        # Deliberately broad: one failing query must not abort the remaining ones.
        print(f" Error during query execution for '{query_name}': {e}")
        print(not_saved_message)
        continue

    # Print decomposed sub-queries if available. Only a dict response is
    # inspected, so other response types cannot fail on the key lookup.
    sub_queries = response.get("sub_queries") if isinstance(response, dict) else None
    if sub_queries:
        print("\n--- Decomposed Sub-queries: ---")
        for sq in sub_queries:
            print(f"  - {sq}")
        print("--------------------------------------------------")

    # Serialize exactly once and reuse the text for both display and saving.
    # A non-serializable response is then reported a single time and never
    # leaves a half-written output file behind.
    try:
        serialized = json.dumps(response, indent=2)
    except (TypeError, ValueError) as e:
        print(f" Error serializing response for '{query_name}': {e}")
        continue

    # Print the final answer in a human-readable format
    print("\n Final Answer:")
    print(serialized)

    # Step 7: Save to sample_outputs
    if not response:  # Nothing worth saving (None or an empty response)
        print(not_saved_message)
        continue

    try:
        with open(output_file_path, "w", encoding="utf-8") as f:
            f.write(serialized)
    except OSError as e:
        print(f"\n Error saving response to {output_file_path}: {e}")
    else:
        print(f"\n Response saved to {output_file_path}")

print("\n--- Main Pipeline Finished ---")


--- Starting Main Pipeline (FORCE_RERUN = False) ---
All 10-K HTML files found. Skipping downloader.
All extracted .txt files found. Skipping extractor.
Chunks already exist. Skipping chunker.
 Embeddings already exist. Skipping embedder.

 Loading RAG Agent...

--- Testing with a simple query ---

--- Decomposed Sub-queries for 'What was Microsoft's total revenue in 2023?': ---
  1. What was Microsoft's total revenue in 2023?
--------------------------------------------------

--- All Retrieved Chunks (before deduplication): ---
  Company: MSFT, Year: 2022, Score: 0.7022, Excerpt: ings, was as follows: (In millions) Year Ended June 30, 2022 2021 2020 Server products and cloud ser...
  Company: MSFT, Year: 2022, Score: 0.6913, Excerpt: th fiscal year 2021 included: • Microsoft Cloud (formerly commercial cloud) revenue increased 32% to...
  Company: MSFT, Year: 2024, Score: 0.6869, Excerpt: d cloud services revenue growth Revenue from Windows Commercial products and cloud services, comp