In [None]:
from __future__ import annotations
import argparse
import json
import re
import uuid
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Tuple

import duckdb
import pandas as pd
import matplotlib.pyplot as plt

# -------- Embeddings / Vector store (Chroma via LangChain) --------
from langchain_core.documents import Document
from langchain_community.vectorstores.utils import filter_complex_metadata
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_chroma import Chroma
from langchain_text_splitters import (
    MarkdownHeaderTextSplitter,
    RecursiveCharacterTextSplitter,
)

# -------- Parsers --------
import pymupdf4llm  # PDF -> Markdown
from langchain_community.document_loaders import (
    UnstructuredPDFLoader,
    UnstructuredWordDocumentLoader,
)
import pdfplumber  # table extraction
from pypdf import PdfReader  # quick full-text as-needed
from docx import Document as DocxDocument
from dateutil import parser as dateparser

### **Configuration**

In [1]:
# Locations of the input reports and of everything the notebook persists.
# NOTE(review): these are absolute, machine-specific paths; adjust them before
# running the notebook on another machine.
DEFAULT_DATA_DIR = "/home/ssever/InsightViewer/data/reports"
STORAGE_DIR = "/home/ssever/InsightViewer/storage"
# Disabled: STORAGE_DIR is a plain str, which has no .mkdir(); wrap it in
# Path(...) if the directory should be created from this cell.
#STORAGE_DIR.mkdir(parents=True, exist_ok=True)

# Chroma persistence directory and the collection the chunks are stored in.
CHROMA_DIR = STORAGE_DIR + "/chroma"
COLLECTION_NAME = "filings"

# DuckDB database file for extracted metrics.
DUCKDB_PATH = STORAGE_DIR + "/metrics.duckdb"
#PLOTS_DIR = STORAGE_DIR + "/plots"
#PLOTS_DIR.mkdir(parents=True, exist_ok=True)

# Sentence-transformers model name. It is passed verbatim as `model_name` by
# get_chroma(), so it must not carry stray whitespace (the previous value had a
# trailing space).
EMBED_MODEL = "all-MiniLM-L12-v2"
# Chunking parameters (units depend on the splitter that consumes them).
CHUNK_SIZE = 400
CHUNK_OVERLAP = 60

### **Metadata extraction**

In [None]:
# ---- Patterns applied to document text ----
# Form type following the word "Form" (10-K / 10-Q / 8-K, hyphen or whitespace optional).
RE_FORM = re.compile(r"\bForm\s+(10[-\s]?K|10[-\s]?Q|8[-\s]?K)\b", re.I)
# Literal "Annual Report".
RE_AR   = re.compile(r"\b(Annual\s+Report)\b", re.I)
# Free-text date following "fiscal year ended" / "quarterly period ended".
RE_FY_ENDED  = re.compile(r"\bfiscal\s+year\s+ended\s+([A-Za-z0-9, ]+)\b", re.I)
RE_Q_ENDED   = re.compile(r"\bquarterly\s+period\s+ended\s+([A-Za-z0-9, ]+)\b", re.I)
# Compact codes: "FY23" / "FY2023" (captures the last two digits) and "Q1".."Q4".
RE_FY_CODE   = re.compile(r"\bFY(?:20)?(\d{2})\b", re.I)
RE_Q_CODE    = re.compile(r"\bQ([1-4])\b", re.I)
# Capitalised name ending in a corporate suffix (case-sensitive).
# The pattern ends with (?!\w) rather than \b: \b can never match between the
# "." of "Inc." and a following space, so "Apple Inc. ..." was not recognised.
# After the purely alphabetic suffixes (?!\w) behaves exactly like \b.
RE_COMPANY   = re.compile(r"\b([A-Z][A-Za-z&.,()\- ]{2,}(?:Corporation|Company|Inc\.|Incorporated|PLC))(?!\w)")

# ---- Patterns applied to file names ----
# Leading 1-6 letter token followed by a space, underscore or hyphen.
TICKER_PREFIX_RE = re.compile(r"^([A-Za-z]{1,6})[ _-]", re.I)
# Four-digit 20xx year or an FY code anywhere in the name.
FILENAME_YEAR_RE = re.compile(r"(20\d{2}|FY(?:20)?\d{2})", re.I)
# Filing / document kind token anywhere in the name.
FILING_NAME_RE  = re.compile(r"(10[-_]?K|10[-_]?Q|8-K|AR|Annual[_-]?Report|PressRelease|Slides|Transcript|Outlook)", re.I)

### **Table extraction + tidy facts**

In [None]:
# Canonical metric key -> row-label synonyms that best_metric_match() compares
# table labels against. Synonym order is kept as-is.
TARGET_METRICS = dict(
    revenue=[
        "revenue",
        "net sales",
        "total revenue",
        "sales",
    ],
    net_income=[
        "net income",
        "net income attributable to",
        "profit for the year",
        "net earnings",
        "consolidated net income",
        "net profit",
    ],
)


def scan_units_scale(text: str) -> Tuple[Optional[str], Optional[int]]:
    """Guess the currency unit and numeric scale announced in a piece of text.

    Args:
        text: Free text (e.g. a page or table caption). ``None`` or an empty
            string is accepted and yields ``(None, None)``.

    Returns:
        ``(units, scale)`` where ``units`` is ``"USD"`` when the text contains
        ``$``, ``usd`` or ``dollars`` (case-insensitive) and ``None``
        otherwise, and ``scale`` is the multiplier implied by an
        "in millions" / "in thousands" / "in 000s" / "in billions" phrase, or
        ``None`` when no such phrase is present.
    """
    if not text:
        return None, None

    t = text.lower()
    units = "USD" if any(token in t for token in ("$", "usd", "dollars")) else None

    # Millions and thousands keep their original precedence; billions is
    # checked last so any text that matched before still gives the same scale.
    if "in millions" in t:
        scale: Optional[int] = 1_000_000
    elif "in thousands" in t or "in 000s" in t:
        scale = 1_000
    elif "in billions" in t:
        scale = 1_000_000_000
    else:
        scale = None
    return units, scale


def parse_number(cell: Any) -> Optional[float]:
    """Parse a table cell into a float, or return ``None`` if it holds no number.

    Handling, in order:
      * ``None``, blank cells and lone dash placeholders ("-", "—", "–") -> ``None``.
      * Thousands separators (",") are removed.
      * The Unicode minus sign (U+2212) is treated as an ASCII "-".
      * Currency symbols ($, €, £) and "%" are removed *before* the
        parenthesis check, so accounting negatives such as "$(1,234)" or
        "(12.5)%" keep their sign.
      * A value fully wrapped in parentheses is negated.
      * Any remaining character other than digits, "." and "-" is dropped.

    Args:
        cell: Any value; it is converted with ``str()``.

    Returns:
        The parsed float, or ``None`` when nothing numeric remains or the
        remaining text is not a valid float.
    """
    if cell is None:
        return None

    s = str(cell).strip()
    if s in ("", "-", "—", "–"):
        return None

    s = s.replace(",", "").replace("\u2212", "-")
    # Strip symbols first; otherwise a leading "$" or trailing "%" hides the
    # surrounding parentheses and the sign is lost.
    s = re.sub(r"[$€£%]", "", s).strip()

    negative = s.startswith("(") and s.endswith(")")
    if negative:
        s = s[1:-1]

    s = re.sub(r"[^\d.\-]", "", s)
    if not s or s in (".", "-"):
        return None
    try:
        val = float(s)
    except ValueError:
        return None
    return -val if negative else val


def best_metric_match(label: str, threshold: int = 80) -> Optional[str]:
    """Map a table row label to a key of ``TARGET_METRICS`` by fuzzy matching.

    The label is lower-cased and its whitespace collapsed, then compared with
    every synonym list using rapidfuzz's ``token_sort_ratio``.

    Args:
        label: Raw row label.
        threshold: Minimum score a synonym must reach to count as a match.

    Returns:
        The metric key whose best synonym scored highest (the earliest key in
        ``TARGET_METRICS`` wins ties), or ``None`` if no synonym reaches
        ``threshold``.
    """
    from rapidfuzz import fuzz, process

    normalized = re.sub(r"\s+", " ", label.lower()).strip()

    winner: Optional[str] = None
    winner_score = None
    for metric_key, aliases in TARGET_METRICS.items():
        hit = process.extractOne(normalized, aliases, scorer=fuzz.token_sort_ratio)
        if not hit or hit[1] < threshold:
            continue
        # Strictly greater: on equal scores the first metric seen is kept.
        if winner_score is None or hit[1] > winner_score:
            winner, winner_score = metric_key, hit[1]
    return winner


def extract_pdf_tables_with_provenance(path: Path) -> List[Dict[str, Any]]:
    """Extract every table from a PDF with pdfplumber, tagged with its origin.

    The first row of a table is promoted to column headers when at least two
    of its cells contain a letter. Empty cells (``None`` / NaN) are treated as
    empty text for that decision; previously they were stringified to "None"
    and counted as text, so a numeric first row with blank cells could be
    consumed as a header.

    Args:
        path: PDF file to read (converted with ``str()`` before opening).

    Returns:
        One dict per table with at least two rows:
        ``{"page": 1-based page number, "table_id": 1-based index of the table
        on that page, "df": DataFrame}``. Tables whose first row was not
        promoted keep pandas' default integer column labels.
    """

    def header_text(cell: Any) -> str:
        # Missing cells become "", never the literal strings "None"/"nan".
        if cell is None or (isinstance(cell, float) and cell != cell):
            return ""
        return str(cell)

    out: List[Dict[str, Any]] = []
    with pdfplumber.open(str(path)) as pdf:
        for p_idx, page in enumerate(pdf.pages):
            try:
                tables = page.extract_tables() or []
            except Exception:
                # Best effort: a page that fails table extraction contributes
                # no tables instead of aborting the whole document.
                tables = []
            for t_idx, table in enumerate(tables):
                if not table or len(table) < 2:
                    continue
                df = pd.DataFrame(table)
                header = [header_text(c) for c in df.iloc[0].tolist()]
                header_has_text = sum(bool(re.search(r"[A-Za-z]", c)) for c in header) >= 2
                if header_has_text:
                    df.columns = header
                    df = df.iloc[1:].reset_index(drop=True)
                out.append({
                    "page": p_idx + 1,
                    "table_id": t_idx + 1,
                    "df": df,
                })
    return out


def melt_wide_years(df: pd.DataFrame) -> pd.DataFrame:
    """Reshape a wide "label | year | year | ..." table into tidy rows.

    Steps:
      1. Pick the label column: the first column in which at least
         ``max(2, 20% of rows)`` cells contain a letter (column 0 if none
         qualifies). Missing cells (``None`` / NaN / ``pd.NA``) count as empty
         text; previously they were stringified to "None"/"nan" and counted as
         letters, so an all-empty leading column could be chosen.
      2. Treat every other column whose header contains a 19xx/20xx token as a
         year column.
      3. Melt to one row per (label, year column) and parse each raw cell with
         ``parse_number``.

    Columns are always addressed by position, so duplicate header names and
    repeated years are supported.

    Args:
        df: Table as produced by table extraction; headers may be strings or
            default integer labels.

    Returns:
        DataFrame with columns ``["label", "year", "value"]``. ``year`` is the
        matched 4-digit token as a string and ``value`` is whatever
        ``parse_number`` returns for the cell. Labels of missing cells are
        ``""``. The frame is empty when ``df`` is empty or no year column is
        found.
    """
    result_columns = ["label", "year", "value"]
    if df.empty:
        return pd.DataFrame(columns=result_columns)

    def cell_text(value: Any) -> str:
        # Missing cells become "", never the literal strings "None"/"nan".
        missing = (
            value is None
            or value is pd.NA
            or (isinstance(value, float) and value != value)
        )
        return "" if missing else str(value)

    ncols = df.shape[1]
    year_token = re.compile(r"(19|20)\d{2}")

    # 1) pick a likely label column
    label_idx = 0
    min_text_cells = max(2, int(0.2 * len(df)))
    for i in range(ncols):
        texts = df.iloc[:, i].map(cell_text)
        text_count = texts.str.contains(r"[A-Za-z]", regex=True, na=False).sum()
        if text_count >= min_text_cells:
            label_idx = i
            break

    # 2) detect year columns by header text: (position, year token)
    year_cols = []
    for i in range(ncols):
        if i == label_idx:
            continue
        m = year_token.search(str(df.columns[i]))
        if m:
            year_cols.append((i, m.group(0)))

    if not year_cols:
        return pd.DataFrame(columns=result_columns)

    # 3) compact frame: one label column plus uniquely named year columns
    wide = pd.DataFrame({"label": df.iloc[:, label_idx].map(cell_text)})
    year_by_col = {}
    for i, year in year_cols:
        base = f"y_{year}"
        colname = base
        k = 2
        # The same year may appear more than once; keep every occurrence.
        while colname in year_by_col:
            colname = f"{base}_{k}"
            k += 1
        wide[colname] = df.iloc[:, i]
        year_by_col[colname] = year

    # 4) melt to tidy, 5) map temp names back to years and parse the numbers
    tidy = wide.melt(id_vars=["label"], var_name="year_col", value_name="raw_value")
    tidy["year"] = tidy["year_col"].map(year_by_col)
    tidy["value"] = tidy["raw_value"].apply(parse_number)

    # 6) final clean-up
    return tidy[result_columns].reset_index(drop=True)

In [None]:
def get_chroma(collection_name: str = COLLECTION_NAME, persist_dir: str = CHROMA_DIR):
    """Build a Chroma vector store handle backed by sentence-transformer embeddings.

    Args:
        collection_name: Name of the Chroma collection to use.
        persist_dir: Directory passed to Chroma as ``persist_directory``.

    Returns:
        A ``Chroma`` instance whose embedding function is a
        ``SentenceTransformerEmbeddings`` created with ``EMBED_MODEL``. A new
        embeddings object is constructed on every call.
    """
    return Chroma(
        collection_name=collection_name,
        persist_directory=persist_dir,
        embedding_function=SentenceTransformerEmbeddings(model_name=EMBED_MODEL),
    )


def upsert_chunks_to_chroma(
    chunks,
    collection_name: str = COLLECTION_NAME,
    persist_dir: str = CHROMA_DIR,
):
    """Add text chunks to the Chroma collection.

    Args:
        chunks: Sequence of mappings, each with a ``"text"`` entry (document
            body) and a ``"metadata"`` entry. A falsy value short-circuits.
        collection_name: Target collection, forwarded to ``get_chroma``.
        persist_dir: Persistence directory, forwarded to ``get_chroma``.

    Returns:
        ``0`` when ``chunks`` is empty; otherwise the number of documents
        handed to ``add_documents``.

    NOTE(review): no explicit ids are passed to ``add_documents``, so
    presumably re-ingesting the same chunks appends duplicates rather than
    updating them — confirm against the vector store's behavior.
    """
    if not chunks:
        return 0

    store = get_chroma(collection_name, persist_dir)

    # Wrap each chunk in a Document, then filter the complex metadata.
    documents = []
    for chunk in chunks:
        documents.append(
            Document(page_content=chunk["text"], metadata=chunk["metadata"])
        )
    documents = filter_complex_metadata(documents)

    store.add_documents(documents)

    # Stores that still expose .persist() get an explicit flush; an
    # AttributeError (e.g. the method is absent) is ignored on purpose.
    try:
        store.persist()
    except AttributeError:
        pass

    return len(documents)