# Example Reports

In [None]:
# Test files for the pilot analysis
pilot_analyse_dateien = [
    # Primary sources (earnings call)
    '01_31_2023__LSEG_StreetEvents__AMD_OQ_-_Event_Brief_of_Advanced_Micro_Devices_Inc_conference_call__Jan__31__2023___5_00PM_ET__Research_Department.pdf',
    '01_31_2023__LSEG_StreetEvents__AMD_OQ_-_Event_Transcript_of_Advanced_Micro_Devices_Inc_conference_call__Jan__31__2023___5_00PM_ET__Research_Department.pdf',
    
    # Subsequent analyst reports
    '02_01_2023__Invest_Heroes__AMD___________________________4_____2022_______Dmitrii_Novichkov.pdf', # in Russian?
    '02_01_2023__Rosenblatt_Securities__Inc___ABC_News__AI__Bandwidth_and_Compute_in_the_Last_Week__Mr__Hans_Mosesmann.pdf', # lots of text, well suited
    '02_03_2023__Invest_Heroes__AMD__financial_results_for_Q4_2022__Dmitrii_Novichkov.pdf', # English version of the Russian report
    '02_06_2023__Marktfeld__Advanced_Micro_Devices_Inc__Peer_Group_Analytics_-_Q4_FY2022__Michail_Paraskevopoulos.pdf', # only charts of various metrics
    '02_09_2023__PriceTarget_Research__Advanced_Micro_Devices_Inc_Investment_Status_Report__John_Lafferty.pdf' # charts and text, possibly suitable
]

# Information Extraction from the Rosenblatt Securities Report

In [87]:
import os, json, time
from dotenv import load_dotenv
from openai import OpenAI

# ----------------- Setup -----------------
# Presumably loads variables from a local .env file into the environment — confirm against python-dotenv.
load_dotenv()
# Read the API key from the environment; os.getenv returns None when the variable is unset.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# Module-level client; extract_financial_info_with_evidence reads this global.
client = OpenAI(api_key=OPENAI_API_KEY)

def extract_financial_info_with_evidence(text_content: str, model: str = "gpt-4o-mini") -> dict:
    """
    Ask the chat model to split ``text_content`` into classified information snippets.

    Uses the module-level ``client`` and requests a JSON object response.

    Args:
        text_content: Raw text of one chunk to analyse.
        model: Chat model name passed to the API (defaults to the model used so far).

    Returns:
        The parsed JSON object returned by the model. The prompt asks for a
        ``"snippets"`` list with ``summary``, ``quote`` and ``type`` per entry,
        but the structure is not validated here.

    Raises:
        ValueError: If the model returns no content, or content that is not valid
            JSON (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    system_prompt = """
You are an expert financial analyst and information extractor.
Your only task is to identify distinct information snippets from the provided text.

For each snippet:
- Create a short, concise, fact-based statement in English that summarizes the information.
- Include the exact original text span as quote.
- Classify as one of: Statement, Forecast, Assumption, Evaluation.

Output JSON:
{
  "snippets": [
    {
      "summary": "short fact-based snippet",
      "quote": "exact original text snippet",
      "type": "Statement | Forecast | Assumption | Evaluation"
    }
  ]
}
Keep summaries short. Use only what is stated in the text. Do not merge multiple facts.
"""
    user_prompt = f"""
Return JSON for the following text:
{text_content}
"""

    resp = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt.strip()},
            {"role": "user", "content": user_prompt}
        ],
        temperature=0.0,
        response_format={"type": "json_object"}
    )
    content = resp.choices[0].message.content
    # Without this guard json.loads(None) would raise an opaque TypeError.
    if not content:
        raise ValueError("Model returned an empty response; no JSON to parse.")
    return json.loads(content)

def process_chunks_file(input_path: str, output_path: str):
    """
    Run snippet extraction over every chunk of an OCR chunk file and save the results.

    Reads a JSON document from ``input_path``, sends each non-empty chunk's
    ``content`` to ``extract_financial_info_with_evidence`` and writes one JSON
    report to ``output_path``. A chunk whose extraction raises is recorded as an
    item holding ``chunk_id`` and ``error``; processing then continues after a
    one-second pause.

    Args:
        input_path: Path of the JSON file; its ``"chunks"`` entries are read via
            ``chunk_id`` and ``content``.
        output_path: Path of the JSON report to write (UTF-8, indented).
    """
    with open(input_path, "r", encoding="utf-8") as source:
        payload = json.load(source)

    chunk_list = payload.get("chunks", [])
    report = {
        "source_file": payload.get("document_info", {}).get("filename"),
        "processed_at": time.strftime("%Y-%m-%dT%H:%M:%S"),
        "model": "gpt-4o-mini",
        "items": []
    }
    collected = report["items"]

    for chunk in chunk_list:
        cid = chunk.get("chunk_id")
        body = chunk.get("content", "")
        if body:
            try:
                found = extract_financial_info_with_evidence(body)
                for snippet in found.get("snippets", []):
                    collected.append({
                        "chunk_id": cid,
                        "summary": snippet.get("summary"),
                        "quote": snippet.get("quote"),
                        "type": snippet.get("type")
                    })
            except Exception as exc:
                # Keep going: record the failure for this chunk instead of aborting the run.
                collected.append({
                    "chunk_id": cid,
                    "error": str(exc)
                })
                # Short pause after a failure (e.g. a rate limit) before the next chunk.
                time.sleep(1.0)

    with open(output_path, "w", encoding="utf-8") as sink:
        json.dump(report, sink, ensure_ascii=False, indent=2)

# ----------------- Execution -----------------
# Input: the "_ocr_chunks.json" file for the Rosenblatt report; output: the "_extracted.json" report in the working directory.
input_path = r"../mistral_OCR/02_01_2023__Rosenblatt_Securities__Inc___ABC_News__AI__Bandwidth_and_Compute_in_the_Last_Week__Mr__Hans_Mosesmann_ocr_chunks.json"
output_path = r"02_01_2023__Rosenblatt_Securities__Inc___ABC_News__AI__Bandwidth_and_Compute_in_the_Last_Week__Mr__Hans_Mosesmann_extracted.json"

# Runs one model call per non-empty chunk, then confirms the output location.
process_chunks_file(input_path, output_path)
print("Fertig:", output_path)


Fertig: 02_01_2023__Rosenblatt_Securities__Inc___ABC_News__AI__Bandwidth_and_Compute_in_the_Last_Week__Mr__Hans_Mosesmann_extracted.json


## Highlight Information Snippets

In [52]:
# Optimized quote alignment with anchor indexing, caching, and reduced comparisons
from functools import lru_cache
from difflib import SequenceMatcher
from html import escape
from IPython.display import HTML, display
import pandas as pd
import re

# Optional dependency: remember whether the caas_jupyter_tools dataframe viewer is importable.
try:
    from caas_jupyter_tools import display_dataframe_to_user
    HAS_CaaS_DF = True
except Exception:
    # Any import failure disables the viewer; show_highlight_and_table then uses display().
    HAS_CaaS_DF = False

# ---------- Preprocessing and indexing ----------

def _normalize_ws(s: str) -> str:
    return re.sub(r"\s+", " ", s)

def _strip_soft(s: str) -> str:
    """Return ``s`` with whitespace runs collapsed to one space and letters lowercased.

    Punctuation is left untouched so the normalized form can still be searched exactly.
    """
    collapsed = _normalize_ws(s)
    return collapsed.lower()

def build_anchor_index(text: str, k: int = 10, stride: int = 5):
    """
    Build a light k-gram index over a normalized copy of ``text``.

    The normalized copy is lowercased, has every whitespace run collapsed to a
    single space, and drops leading whitespace. Each normalized character keeps
    the offset of the raw character it came from, so matches found on the
    normalized text can be mapped back to ``text``.

    Args:
        text: Raw text to index.
        k: Length of the indexed character grams.
        stride: Step between indexed gram start positions in the normalized text.

    Returns:
        dict with keys:
          ``norm``    - the normalized text,
          ``raw``     - the original text,
          ``raw_pos`` - list where ``raw_pos[i]`` is the raw offset of ``norm[i]``,
          ``k``       - the gram length used,
          ``table``   - mapping gram -> list of start positions in ``norm``.
    """
    raw = text
    norm_chars = []
    raw_pos = []
    for i, ch in enumerate(raw):
        if ch.isspace():
            # Emit one space per whitespace run, and none before the first real character.
            if norm_chars and norm_chars[-1] != " ":
                norm_chars.append(" ")
                raw_pos.append(i)
        else:
            # str.lower() can expand one character into several (U+0130 -> "i" + U+0307).
            # Record the raw offset once per produced character so that
            # len(norm) == len(raw_pos) always holds and later offsets do not shift.
            for low in ch.lower():
                norm_chars.append(low)
                raw_pos.append(i)
    norm = "".join(norm_chars)

    table = {}
    n = len(norm)
    if n >= k:
        for i in range(0, n - k + 1, stride):
            gram = norm[i:i+k]
            table.setdefault(gram, []).append(i)
    return {"norm": norm, "raw": raw, "raw_pos": raw_pos, "k": k, "table": table}

# Cache of anchor indexes, keyed by the text content itself.
_ANCHOR_CACHE = {}

def get_anchor_index(text: str):
    """
    Return the anchor index for ``text``, building it on first use.

    The cache is keyed by the string value rather than ``id(text)``: equal texts
    held in different string objects share one index, and the lookup no longer
    depends on the cached entry keeping the original object alive.

    Args:
        text: Raw text to index.

    Returns:
        The dict produced by ``build_anchor_index(text, k=10, stride=4)``.
    """
    idx = _ANCHOR_CACHE.get(text)
    if idx is None:
        idx = build_anchor_index(text, k=10, stride=4)
        _ANCHOR_CACHE[text] = idx
    return idx

def _candidate_positions_from_quote(idx, quote_norm: str, max_candidates: int = 200):
    k = idx["k"]
    table = idx["table"]
    norm = idx["norm"]

    if len(quote_norm) < k:
        # fallback to simple search for the whole quote_norm
        pos = []
        start = 0
        while True:
            j = norm.find(quote_norm, start)
            if j == -1:
                break
            pos.append(j)
            start = j + 1
        return pos[:max_candidates]

    # sample a few grams from the quote
    grams = []
    step = max(1, len(quote_norm) // 5)
    for off in range(0, len(quote_norm) - k + 1, step):
        grams.append(quote_norm[off:off+k])
        if len(grams) >= 5:
            break

    candidates = []
    for g in grams:
        hits = table.get(g, [])
        candidates.extend(hits)
    # de-duplicate and sort
    candidates = sorted(set(candidates))
    return candidates[:max_candidates]

# ---------- Fast approximate locate ----------

@lru_cache(maxsize=2048)
def locate_quote_fast(text: str, quote: str, max_error: int = 3, min_ratio: float = 0.83):
    """
    Locate ``quote`` inside ``text``, tolerating case, whitespace and small wording differences.

    Strategies, tried in order:
    1) exact substring search (score 1.0)
    2) search on the lowercased, whitespace-collapsed text (score 0.98)
    3) fuzzy match with difflib over small windows around anchor-index candidates

    Results are memoized per argument tuple through ``lru_cache``.

    Args:
        text: Full text to search in.
        quote: Text span to locate.
        max_error: Allowed deviation, in characters, between the length of a
            candidate segment and the length of ``quote`` during fuzzy matching.
        min_ratio: Minimum ``SequenceMatcher`` ratio for a fuzzy match to be accepted.

    Returns:
        ``(start, end, score, matched_text)`` with offsets into ``text``, or
        ``(None, None, 0.0, "")`` when nothing is found.
    """
    no_match = (None, None, 0.0, "")
    if not quote or not text:
        return no_match

    # 1) exact
    idx = text.find(quote)
    if idx != -1:
        return idx, idx + len(quote), 1.0, quote

    # Whitespace-only text normalizes to an empty string with an empty position
    # map; indexing into that map below would raise IndexError.
    if not text.strip():
        return no_match

    # 2) case insensitive with whitespace normalization
    idx_data = get_anchor_index(text)
    norm_text = idx_data["norm"]
    raw_pos = idx_data["raw_pos"]
    quote_norm = _strip_soft(quote)

    j = norm_text.find(quote_norm)
    if j != -1:
        start_raw = raw_pos[j]
        end_raw = raw_pos[min(j + len(quote_norm) - 1, len(raw_pos) - 1)] + 1
        return start_raw, end_raw, 0.98, text[start_raw:end_raw]

    # 3) anchor-indexed fuzzy search in windows
    cand_norm_positions = _candidate_positions_from_quote(idx_data, quote_norm, max_candidates=120)
    if not cand_norm_positions:
        # fallback to very light scan at the beginning
        cand_norm_positions = [0]

    best = no_match
    # Shortest segment length to try; independent of the window, so computed once.
    min_len = max(1, len(quote) - max_error)

    for p in cand_norm_positions:
        # map norm position to raw start guess
        raw_start_guess = raw_pos[min(p, len(raw_pos) - 1)]
        # choose a raw window around the guess
        window_start = max(0, raw_start_guess - 120)
        window_end = min(len(text), raw_start_guess + max(len(quote) + 120, 200))
        window_text = text[window_start:window_end]

        max_len = min(len(window_text), len(quote) + max_error)
        # Try only three lengths; dict.fromkeys drops duplicates while keeping
        # order, which avoids scoring the same segment twice.
        lengths = tuple(dict.fromkeys((min_len, (min_len + max_len) // 2, max_len)))

        # stride greater than 1 to reduce comparisons
        for i in range(0, len(window_text) - min_len + 1, 2):
            for seg_len in lengths:
                if i + seg_len > len(window_text):
                    continue
                seg = window_text[i:i + seg_len]
                score = SequenceMatcher(None, _strip_soft(seg), quote_norm).ratio()
                if score > best[2]:
                    best = (window_start + i, window_start + i + seg_len, score, seg)

    if best[2] >= min_ratio:
        return best
    return no_match

# ---------- Public API matching the previous helpers (with types and colors) ----------

def align_quotes_to_spans(text: str, extracted: dict, max_error: int = 3, min_ratio: float = 0.83):
    """
    Turn extracted quotes into character spans of ``text``.

    Accepts either a ``{"snippets": [...]}`` payload (``summary``/``quote``/``type``
    per entry) or a ``{"facts": [...]}`` payload (``label`` or ``category``, an
    ``evidence`` dict holding ``quote``, and ``type``). Blank quotes and quotes
    that ``locate_quote_fast`` cannot place are dropped.

    Args:
        text: Text the quotes are searched in.
        extracted: Extraction payload in one of the two shapes above.
        max_error: Forwarded to ``locate_quote_fast``.
        min_ratio: Forwarded to ``locate_quote_fast``.

    Returns:
        List of span dicts (``start``, ``end``, ``label``, ``summary``, ``type``,
        ``score``, ``matched_text``, ``original_quote``) sorted by start, then end.
    """
    is_mapping = isinstance(extracted, dict)
    if is_mapping and "snippets" in extracted:
        # "type" is optional per snippet and defaults to "Statement".
        candidates = [
            {
                "summary": snip.get("summary",""),
                "quote": snip.get("quote",""),
                "type": snip.get("type","Statement")
            }
            for snip in extracted.get("snippets", [])
        ]
    elif is_mapping and "facts" in extracted:
        candidates = []
        for fact in extracted.get("facts", []):
            evidence = fact.get("evidence") or {}
            candidates.append({
                "summary": fact.get("label","") or fact.get("category",""),
                "quote": evidence.get("quote",""),
                "type": fact.get("type","Statement")
            })
    else:
        candidates = []

    located = []
    for entry in candidates:
        quote_text = entry.get("quote") or ""
        if quote_text.strip():
            start, end, score, matched = locate_quote_fast(
                text, quote_text, max_error=max_error, min_ratio=min_ratio
            )
            if start is not None:
                caption = entry.get("summary") or "snippet"
                located.append({
                    "start": start,
                    "end": end,
                    "label": caption,
                    "summary": caption,
                    "type": entry.get("type","Statement"),
                    "score": score,
                    "matched_text": matched,
                    "original_quote": quote_text
                })
    located.sort(key=lambda span: (span["start"], span["end"]))
    return located

def build_highlight_html(text: str, spans):
    """
    Render ``text`` as HTML with every valid span wrapped in a colored ``<mark>``.

    Spans outside ``0 <= start < end <= len(text)`` are ignored. Where spans
    overlap, the first span in ``spans`` that covers a segment colors it. The
    result starts with a small stylesheet and a color legend per snippet type.

    Args:
        text: The text to render; it is HTML-escaped.
        spans: Iterable of dicts with ``start``, ``end`` and ``score``, and
            optionally ``summary`` and ``type``.

    Returns:
        An HTML string, or ``"<div>No spans found</div>"`` when no span is valid.
    """
    # Background color per snippet type.
    palette = {
        "Statement": "#fff2b2",   # soft yellow
        "Forecast":  "#cdeffd",   # soft blue
        "Assumption":"#e9d5ff",   # soft purple
        "Evaluation":"#c8f7c5",   # soft green
    }

    usable = []
    for span in spans:
        lo, hi = int(span["start"]), int(span["end"])
        if 0 <= lo < hi <= len(text):
            usable.append(span.copy())
    if not usable:
        return "<div>No spans found</div>"

    # Cut the text at every span boundary so each piece is covered uniformly.
    cut_points = {0, len(text)}
    for span in usable:
        cut_points.update((span["start"], span["end"]))
    edges = sorted(cut_points)

    pieces = []
    for left, right in zip(edges, edges[1:]):
        fragment = escape(text[left:right])
        owner = next(
            (sp for sp in usable if sp["start"] <= left and right <= sp["end"]),
            None,
        )
        if not owner:
            pieces.append(fragment)
            continue
        kind = owner.get("type","Statement")
        shade = palette.get(kind, "#fff2b2")
        tooltip = escape(f'{owner.get("summary","snippet")}  type {kind}  score {owner["score"]:.2f}')
        pieces.append(f'<mark title="{tooltip}" style="background:{shade}; padding:0.15em 0.2em; border-radius:0.2em;">{fragment}</mark>')

    # One swatch plus label per type.
    legend_items = [
        f'<span style="display:inline-flex; align-items:center; gap:0.4rem; margin-right:0.8rem; margin-bottom:0.4rem;">'
        f'<span style="display:inline-block; width:0.9rem; height:0.9rem; background:{shade}; border-radius:0.2rem; border:1px solid rgba(0,0,0,0.08);"></span>'
        f'<span class="mono">{escape(kind)}</span>'
        f'</span>'
        for kind, shade in palette.items()
    ]
    css = """
    <style>
      .facts-table { margin-top: 0.75rem; }
      .small { font-size: 0.9em; color: #555; }
      .mono { font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace; }
      .legend { display:flex; flex-wrap:wrap; gap:0.4rem; margin:0.5rem 0 0.75rem 0; }
    </style>
    """
    legend_html = '<div class="legend">' + "".join(legend_items) + "</div>"
    return css + legend_html + "<div>" + "".join(pieces) + "</div>"

def snippets_dataframe_with_alignment(extracted: dict, spans: list) -> pd.DataFrame:
    """
    Build one table row per aligned span.

    Args:
        extracted: Extraction payload; accepted for interface compatibility and not read here.
        spans: Span dicts as produced by ``align_quotes_to_spans``.

    Returns:
        DataFrame with the columns ``type``, ``summary``, ``original_quote``,
        ``matched_text``, ``match_score`` (rounded to 3 decimals), ``char_start``
        and ``char_end``. The columns are present even when ``spans`` is empty,
        so column access on the result never depends on matches being found.
    """
    columns = [
        "type", "summary", "original_quote", "matched_text",
        "match_score", "char_start", "char_end",
    ]
    rows = [
        {
            "type": sp.get("type","Statement"),
            "summary": sp.get("summary",""),
            "original_quote": sp.get("original_quote",""),
            "matched_text": sp.get("matched_text",""),
            "match_score": round(sp.get("score", 0.0), 3),
            "char_start": sp.get("start", None),
            "char_end": sp.get("end", None)
        }
        for sp in spans
    ]
    # Passing the columns explicitly keeps the schema stable for an empty row list.
    return pd.DataFrame(rows, columns=columns)

def show_highlight_and_table(report_text: str, extracted: dict, max_error: int = 3, min_ratio: float = 0.83, table_name: str = "Snippets with alignment"):
    """
    Display the highlighted report text followed by the table of aligned snippets.

    Args:
        report_text: Text the quotes are aligned against and that gets rendered.
        extracted: Extraction payload passed to ``align_quotes_to_spans``.
        max_error: Forwarded to ``align_quotes_to_spans``.
        min_ratio: Forwarded to ``align_quotes_to_spans``.
        table_name: Title used when the CaaS dataframe viewer is available.
    """
    aligned = align_quotes_to_spans(report_text, extracted, max_error=max_error, min_ratio=min_ratio)
    display(HTML(build_highlight_html(report_text, aligned)))

    table = snippets_dataframe_with_alignment(extracted, aligned)
    if not HAS_CaaS_DF:
        # No viewer available: fall back to the regular notebook rendering.
        display(table)
        return
    display_dataframe_to_user(table_name, table)

print("Optimized functions loaded. Use show_highlight_and_table(report_text, extracted).")
# Automatically load the last entry from the JSON file and visualize it
import os
import json

output_path = "extracted_financial_info.json"  # adjust the path if needed
if not os.path.exists(output_path):
    print(f"Datei {output_path} nicht gefunden.")
else:
    with open(output_path, "r", encoding="utf-8") as f:
        all_data = json.load(f)
    if not (isinstance(all_data, list) and len(all_data) > 0):
        print(f"Keine Einträge in {output_path} gefunden.")
    else:
        # The file is expected to hold a list of records; only the newest one is shown.
        last_entry = all_data[-1]
        report_text = last_entry.get("original_text", "")
        extracted = last_entry.get("extracted", {})
        print(f"Letzter Eintrag geladen aus {output_path}.")
        show_highlight_and_table(report_text, extracted, max_error=3, min_ratio=0.83)

Optimized functions loaded. Use show_highlight_and_table(report_text, extracted).
Letzter Eintrag geladen aus extracted_financial_info.json.


Unnamed: 0,type,summary,original_quote,matched_text,match_score,char_start,char_end
0,Statement,Target price is $100.00.,Target Price | $100.00,Target Price | $\$ 100.00,0.936,4,29
1,Statement,52-week price range is $54.57 to $132.96.,52 Week Price Range | $54.57 to $132.96,2 Week Price Range | $\$ 54.57$ to $\$ 132,0.864,36,78
2,Statement,Shares outstanding are 1.61 billion.,Shares Outstanding | 1.61 Billion,Shares Outstanding | 1.61 Billion,1.0,87,120
3,Statement,Dividend is $0.00.,Dividend | $0.00,Dividend | $\$ 0.0,0.857,124,143
4,Statement,Sector is Technology.,Sector | Technology,Sector | Technology,1.0,166,185
5,Evaluation,Sector rating is MARKET WEIGHT.,Sector Rating | MARKET WEIGHT,Sector Rating | MARKET WEIGHT,1.0,204,233
6,Evaluation,Financial strength rating is MEDIUM-HIGH.,Financial Strength Rating | MEDIUM-HIGH,Financial Strength Rating | MEDIUM-HIGH,1.0,307,346
7,Statement,Debt/Capital ratio is 8.9%.,Debt/Capital Ratio | $8.9 %,Debt/Capital Ratio | $8.9,0.963,364,391
8,Statement,Return on equity is 10.7%.,Return on Equity | $10.7 %,Return on Equity | $10.7,0.962,398,424
9,Statement,Net margin is 5.6%.,Net Margin | $5.6 %,Net Margin | $5.6 \,0.947,432,451


# DSRAG Pipeline

In [None]:
!pip install dsrag

In [89]:
from dsrag.llm import OpenAIChatAPI
from dsrag.reranker import CohereReranker
from dsrag.create_kb import create_kb_from_file
from dsrag.knowledge_base import KnowledgeBase
from dotenv import load_dotenv
# Presumably makes the API keys from a local .env file available to the clients below — confirm.
load_dotenv()


# Chat model handed to the knowledge base as its auto-context model.
llm = OpenAIChatAPI(model='gpt-4o-mini')
# Reranker handed to the knowledge base.
reranker = CohereReranker()

# Knowledge base for the January 2023 AMD earnings call, stored under ./my_kb_storage.
kb = KnowledgeBase(kb_id="AMD_Jan_2023_Earnings_Call", reranker=reranker, auto_context_model=llm, storage_directory="./my_kb_storage")

In [38]:
# Print the storage directory configured on the knowledge base.
print(kb.storage_directory)

./my_kb_storage


In [90]:
# Add the earnings-call "Event Brief" PDF to the knowledge base; the file name doubles as the document id.
file_path = "../data/Auswahl/01_31_2023__LSEG_StreetEvents__AMD_OQ_-_Event_Brief_of_Advanced_Micro_Devices_Inc_conference_call__Jan__31__2023___5_00PM_ET__Research_Department.pdf"
kb.add_document(
    doc_id="01_31_2023__LSEG_StreetEvents__AMD_OQ_-_Event_Brief_of_Advanced_Micro_Devices_Inc_conference_call__Jan__31__2023___5_00PM_ET__Research_Department.pdf", 
    file_path=file_path,
    # NOTE(review): the unit of chunk_size (characters vs. tokens) is defined by dsrag — confirm.
    chunk_size=200,
    # Semantic sectioning is switched off for this document.
    semantic_sectioning_config={"use_semantic_sectioning": False}
    )

In [91]:
# Add the earnings-call "Event Transcript" PDF to the knowledge base; the file name doubles as the document id.
file_path = "../data/Auswahl/01_31_2023__LSEG_StreetEvents__AMD_OQ_-_Event_Transcript_of_Advanced_Micro_Devices_Inc_conference_call__Jan__31__2023___5_00PM_ET__Research_Department.pdf"
kb.add_document(
    doc_id="01_31_2023__LSEG_StreetEvents__AMD_OQ_-_Event_Transcript_of_Advanced_Micro_Devices_Inc_conference_call__Jan__31__2023___5_00PM_ET__Research_Department.pdf", 
    file_path=file_path,
    # NOTE(review): the unit of chunk_size (characters vs. tokens) is defined by dsrag — confirm.
    chunk_size=200,
    # Semantic sectioning is switched off for this document.
    semantic_sectioning_config={"use_semantic_sectioning": False}
)

In [92]:
from dsrag.knowledge_base import KnowledgeBase

# Re-open the knowledge base by id from the same storage directory; this rebinds `kb`
# without passing a reranker or auto-context model.
kb = KnowledgeBase("AMD_Jan_2023_Earnings_Call", storage_directory="./my_kb_storage")
# Example query: look up the reported revenue figure.
search_queries = ["Revenue $5.6b"]
results = kb.query(search_queries)
# Print every returned segment as-is.
for segment in results:
    print(segment)

{'doc_id': '01_31_2023__LSEG_StreetEvents__AMD_OQ_-_Event_Brief_of_Advanced_Micro_Devices_Inc_conference_call__Jan__31__2023___5_00PM_ET__Research_Department.pdf', 'chunk_start': 12, 'chunk_end': 14, 'score': np.float64(1.2810939131539474), 'content': "Document context: the following excerpt is from a document titled 'Q4 2022 Advanced Micro Devices Inc Earnings Call'. This document is about: the financial results and outlook for Advanced Micro Devices Inc. (AMD) for the fourth quarter of 2022, including revenue, net income, and segment performance, as well as expectations for 2023.\n\nCo. reported 4Q22 revenue of $5.6b, net income of $1.1b and diluted EPS of $0.69. Expects 1Q23 revenue to be approx. $5.3b plus\nor minus $300m.\nFINANCIAL DATA\nA. 2022 revenue = $23.6b.B. 4Q22 revenue = $5.6b.\nC. 2022 net income = $5.5b.\nD. 4Q22 net income = $1.1b.\nE. 2022 diluted EPS = $3.50.\nF. 4Q22 diluted EPS = $0.69.\nG. 2022 YoverY revenue growth = 44%.", 'segment_page_start': 2, 'segment_page

In [94]:
import json
from openai import OpenAI
from dsrag.knowledge_base import KnowledgeBase

# Load the OpenAI client (the API key must be set in the environment)
client = OpenAI()

# Load the knowledge base
kb = KnowledgeBase("AMD_Jan_2023_Earnings_Call", storage_directory="./my_kb_storage")

# Load the extraction results written by process_chunks_file
with open("02_01_2023__Rosenblatt_Securities__Inc___ABC_News__AI__Bandwidth_and_Compute_in_the_Last_Week__Mr__Hans_Mosesmann_extracted.json", "r", encoding="utf-8") as f:
    data = json.load(f)

system_prompt = """You are an expert financial analyst.
Evaluate whether the provided Knowledge Base evidence supports the snippet.
Return one of the following labels:
- Supported (claim clearly appears in the evidence)
- Partially Supported (evidence is related but not complete)
- Not Supported (evidence does not support the claim)
- Contradicted (evidence explicitly contradicts the claim)

Also explain briefly why you chose this label."""

# Items recorded for failed chunks hold "error" instead of "quote", and a quote may
# be missing or empty; item["quote"] would raise KeyError on those, so drop them first.
checkable_items = [item for item in data["items"] if item.get("quote")]

# Check the first 10 snippets that carry a quote
for i, item in enumerate(checkable_items[:10], start=1):
    quote = item["quote"]
    results = kb.query([quote])

    print(f"\nSnippet {i}: {quote}")
    if not results:
        print(" → Keine Treffer in KB")
        continue

    # Use only the first 2 hits as evidence for the model
    evidence_texts = [seg.get("content", seg.get("text", str(seg))) for seg in results[:2]]
    evidence_combined = "\n---\n".join(evidence_texts)

    user_prompt = f"""
Snippet (claim):
{quote}

Evidence from Knowledge Base:
{evidence_combined}
"""

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=0
    )

    evaluation = response.choices[0].message.content
    print(" → GPT Evaluation:", evaluation)



Snippet 1: Advanced Micro Devices is the number-two player in x86-based microprocessors, behind Intel.
 → GPT Evaluation: Label: Partially Supported

Explanation: The evidence provided discusses AMD's advancements in processor technology and its performance in the supercomputer market, which indicates that AMD is a significant player in the microprocessor space. However, it does not explicitly confirm that AMD is the number-two player in x86-based microprocessors behind Intel. While the context suggests AMD's competitiveness, it lacks direct comparison or ranking against Intel in the x86 market, making the support for the claim incomplete.

Snippet 2: with the 2008 acquisition of ATI -- a top player in graphic processors.
 → Keine Treffer in KB

Snippet 3: In 2021, Advanced Micro Devices acquired Xilinx, expanding its presence in embedded computing and data center.
 → GPT Evaluation: Label: Supported

Explanation: The evidence clearly supports the claim that Advanced Micro Devices (AM