# Example Reports

In [None]:
# Testdateien für die Pilotanalyse
pilot_analyse_dateien = [
    # Primärquellen (Earnings Call)
    '01_31_2023__LSEG_StreetEvents__AMD_OQ_-_Event_Brief_of_Advanced_Micro_Devices_Inc_conference_call__Jan__31__2023___5_00PM_ET__Research_Department.pdf',
    '01_31_2023__LSEG_StreetEvents__AMD_OQ_-_Event_Transcript_of_Advanced_Micro_Devices_Inc_conference_call__Jan__31__2023___5_00PM_ET__Research_Department.pdf',
    
    # Nachfolgende Analysten-Reports
    '02_01_2023__Invest_Heroes__AMD___________________________4_____2022_______Dmitrii_Novichkov.pdf', # auf russisch?
    '02_01_2023__Rosenblatt_Securities__Inc___ABC_News__AI__Bandwidth_and_Compute_in_the_Last_Week__Mr__Hans_Mosesmann.pdf', # viel Text, gut geeignet
    '02_03_2023__Invest_Heroes__AMD__financial_results_for_Q4_2022__Dmitrii_Novichkov.pdf', # englische Version zum russischen
    '02_06_2023__Marktfeld__Advanced_Micro_Devices_Inc__Peer_Group_Analytics_-_Q4_FY2022__Michail_Paraskevopoulos.pdf', # nur Graphen über verschiedene Metriken
    '02_09_2023__PriceTarget_Research__Advanced_Micro_Devices_Inc_Investment_Status_Report__John_Lafferty.pdf' # Graphen und Text, vielleicht geeignet
]

# Information extraction from Rosenblatt_Securities Text

In [None]:
import os, json, time, asyncio
from dotenv import load_dotenv
from openai import AsyncOpenAI
from tqdm import tqdm

# ----------------- Setup -----------------
load_dotenv()
client = AsyncOpenAI(api_key=os.getenv("OPENAI_API_KEY"))

MAX_CONCURRENT = 10  # Anzahl paralleler Anfragen

system_prompt = """
You are an expert financial text cleaner and extractor.

Your task:
1. Split the provided input into single sentences.
   - For each: save clean "text", the exact "quote" (original snippet), classify as Statement, Evaluation, Forecast (multiple possible).
   - Add "source": "text".
2. Detect tables in the input.
   - For each key/value row: treat as one entry.
   - Save "text" as "Key: Value".
   - Save "quote" as the original row text.
   - Classify as Statement, Evaluation, Forecast.
   - Add "source": "table".
   - If value is a range or number, keep it as is in "text" (do not parse separately).
   - Add the header of the paragraph where the quote comes from.
3. If there is other structured content (e.g. headings, lists, metadata), include them as entries with "source": "other".
4. Output JSON only in this format:

{
  "items": [
    {
      "text": "...",
      "quote": "...",
      "types": ["Statement","Evaluation", "Forecast"],
      "source": "text|table|other",
      "paragraph": "header of the paragraph"
    }
  ]
}
"""

# ----------------- GPT Call -----------------
async def extract_with_gpt(text: str) -> dict:
    resp = await client.chat.completions.create(
        model="gpt-5-nano",
        messages=[
            {"role": "system", "content": system_prompt.strip()},
            {"role": "user", "content": text}
        ],
        response_format={"type": "json_object"}
    )
    return json.loads(resp.choices[0].message.content)

# ----------------- Single Chunk -----------------
async def process_chunk(chunk, sem, pbar):
    cid = chunk.get("chunk_id")
    page = chunk.get("page_number")
    content = chunk.get("content", "")

    if not content:
        pbar.update(1)
        return []

    async with sem:  # Rate-Limiting
        try:
            extracted = await extract_with_gpt(content)
            out = []
            for item in extracted.get("items", []):
                item["chunk_id"] = cid
                item["page"] = page
                out.append(item)
        except Exception as e:
            out = [{
                "chunk_id": cid,
                "page": page,
                "error": str(e)
            }]
        finally:
            pbar.update(1)
        return out

# ----------------- Main -----------------
async def process_chunks_file_parallel(input_path: str, output_path: str):
    with open(input_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    chunks = data.get("chunks", [])
    results = {
        "source_file": data.get("document_info", {}).get("filename"),
        "processed_at": time.strftime("%Y-%m-%dT%H:%M:%S"),
        "model": "gpt-5-nano",
        "items": []
    }

    sem = asyncio.Semaphore(MAX_CONCURRENT)
    pbar = tqdm(total=len(chunks), desc="Processing", unit="chunk")

    tasks = [process_chunk(ch, sem, pbar) for ch in chunks]
    all_results = await asyncio.gather(*tasks, return_exceptions=True)

    pbar.close()

    for res in all_results:
        if isinstance(res, list):
            results["items"].extend(res)
        elif isinstance(res, Exception):
            results["items"].append({"error": str(res)})

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

    print(f"✅ Done: {output_path}")

# ----------------- Run -----------------
if __name__ == "__main__":
    input_path = "../mistral_OCR/02_01_2023__Rosenblatt_Securities__Inc___ABC_News__AI__Bandwidth_and_Compute_in_the_Last_Week__Mr__Hans_Mosesmann_ocr_chunks.json"
    output_path = "02_01_2023__Rosenblatt_Securities__Inc___ABC_News__AI__Bandwidth_and_Compute_in_the_Last_Week__Mr__Hans_Mosesmann_extracted.json"
    await process_chunks_file_parallel(input_path, output_path)

Processing: 100%|██████████| 21/21 [02:42<00:00,  7.76s/chunk]

✅ Done: 02_01_2023__Rosenblatt_Securities__Inc___ABC_News__AI__Bandwidth_and_Compute_in_the_Last_Week__Mr__Hans_Mosesmann_extracted.json





# DSRAG Pipeline

In [4]:
!pip install dsrag



In [2]:
from dsrag.llm import OpenAIChatAPI
from dsrag.reranker import CohereReranker
from dsrag.create_kb import create_kb_from_file
from dsrag.knowledge_base import KnowledgeBase
from dotenv import load_dotenv
load_dotenv()

from dsrag.knowledge_base import KnowledgeBase

llm = OpenAIChatAPI(model='gpt-4o-mini')
reranker = CohereReranker()

kb = KnowledgeBase(kb_id="AMD_Jan_2023_Earnings_Call", reranker=reranker, auto_context_model=llm, storage_directory="./my_kb_storage")

In [3]:
print(kb.storage_directory)

./my_kb_storage


In [3]:
kb.add_document(
    doc_id="2022 10-Q-3Q22.pdf",
    file_path="assets/2022 10-Q-3Q22.pdf",
    chunk_size=200,
    semantic_sectioning_config={"use_semantic_sectioning": False}
)

kb.add_document(
    doc_id="2022 10-Q-anual.pdf",
    file_path="assets/2022 10-Q-anual.pdf",
    chunk_size=200,
    semantic_sectioning_config={"use_semantic_sectioning": False}
)

kb.add_document(
    doc_id="2022 Q4 - 8-K.pdf",
    file_path="assets/2022 Q4 - 8-K.pdf",
    chunk_size=200,
    semantic_sectioning_config={"use_semantic_sectioning": False}
)

kb.add_document(
    doc_id="2022 Q4 - Presentation.pdf",
    file_path="assets/2022 Q4 - Presentation.pdf",
    chunk_size=200,
    semantic_sectioning_config={"use_semantic_sectioning": False}
)

kb.add_document(
    doc_id="01_31_2023_LSEG_Brief.pdf",
    file_path="assets/01_31_2023_LSEG_Brief.pdf",
    chunk_size=200,
    semantic_sectioning_config={"use_semantic_sectioning": False}
)

kb.add_document(
    doc_id="01_32_2023_LSEG_Transcript.pdf",
    file_path="assets/01_32_2023_LSEG_Transcript.pdf",
    chunk_size=200,
    semantic_sectioning_config={"use_semantic_sectioning": False}
)



In [4]:
import json
from tqdm import tqdm
from dsrag.knowledge_base import KnowledgeBase

# JSON einlesen
with open("02_01_2023_Rosenblatt_Export.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Die ersten 10 Sätze aus items holen
sentences = [
    item.get("text", item.get("quote", "")) 
    for item in data["items"]
]
# nur nicht-leere Strings
sentences = [s for s in sentences if s]
# Dict für Anfragen + Ergebnisse
query_results = {}

# Jede Anfrage ausführen und Ergebnisse speichern
for i, sentence in enumerate(tqdm(sentences, desc="Querying", unit="sent"), start=1):
    results = kb.query([sentence])
    query_results[f"sentence_{i}"] = {
        "query": sentence,
        "results": [str(segment) for segment in results]
    }

# Optional: JSON speichern
with open("query_results.json", "w", encoding="utf-8") as f:
    json.dump(query_results, f, ensure_ascii=False, indent=2)

print("Fertig, Ergebnisse in query_results.json gespeichert.")


Querying: 100%|██████████| 294/294 [04:32<00:00,  1.08sent/s]

Fertig, Ergebnisse in query_results.json gespeichert.





# LLM Analysis

In [9]:
import json
import ast
import re
from openai import OpenAI

client = OpenAI()

qr_path = "query_results.json"
with open(qr_path, "r", encoding="utf-8") as f:
    query_results = json.load(f)

def parse_result_str(s: str) -> dict:
    s = re.sub(r"np\.float64\(([^)]+)\)", r"\1", s)
    return ast.literal_eval(s)

def segment_to_text(seg: dict) -> str:
    for k in ("content", "text", "body"):
        if k in seg and isinstance(seg[k], str):
            return seg[k]
    return json.dumps(seg, ensure_ascii=False)

system_prompt = """You are an expert financial analyst.
Evaluate whether the provided Knowledge Base evidence supports the snippet.
Return a JSON object with two keys:
- evaluation: one of [Supported, Partially Supported, Not Supported, Contradicted, No Evidence]
- reason: a short explanation.
"""

evaluations = {}

for key in list(query_results.keys()):  # nur 5 Elemente
    entry = query_results[key]
    quote = entry.get("query", "")
    raw_results = entry.get("results", [])[:5]

    if not raw_results:
        evaluations[key] = {
            "query": quote,
            "evaluation": "No Evidence",
            "reason": "Keine Evidenz gefunden",
            "evidence": []
        }
        print(f"\n{key}: {quote}\n → Keine Evidenz gefunden")
        continue

    parsed = []
    for r in raw_results:
        try:
            parsed.append(parse_result_str(r))
        except Exception:
            parsed.append({"text": str(r)})

    ev_texts = [segment_to_text(seg) for seg in parsed]
    evidence_combined = "\n---\n".join(ev_texts)

    user_prompt = f"""Snippet:
{quote}

Evidence from Knowledge Base:
{evidence_combined}
"""

    resp = client.chat.completions.create(
        model="gpt-5-nano",
        response_format={"type": "json_object"},  # garantiert JSON
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ]
    )

    parsed_eval = json.loads(resp.choices[0].message.content)

    evaluations[key] = {
        "query": quote,
        "evidence": ev_texts,
        "evaluation": parsed_eval.get("evaluation", "Unknown"),
        "reason": parsed_eval.get("reason", "")
    }

    print(f"\n{key}: {quote}")
    print(" → Label:", evaluations[key]["evaluation"])
    print(" → Reason:", evaluations[key]["reason"])

# JSON mit Bewertungen speichern
with open("evaluations.json", "w", encoding="utf-8") as f:
    json.dump(evaluations, f, ensure_ascii=False, indent=2)

print("\nFertig, Bewertungen in evaluations.json gespeichert")



sentence_1: Advanced Micro Devices is the number-two player in x86-based microprocessors, behind Intel, and -- with the 2008 acquisition of ATI -- a top player in graphic processors.
 → Label: Not Supported
 → Reason: The knowledge base states Intel has been the market share leader for microprocessors but does not specify AMD's ranking (second) or reference the 2008 ATI acquisition or AMD's position in graphics processors. Therefore, the snippet's claims are not supported by the provided KB.

sentence_2: In 2021, Advanced Micro Devices acquired Xilinx, expanding its presence in embedded computing and data center.
 → Label: Contradicted
 → Reason: Knowledge Base states the Xilinx acquisition was completed on February 14, 2022 (not 2021), though it confirms expansion into embedded computing and data center segments via the acquired Xilinx portfolio.

sentence_3: Analyst's Notes
 → Keine Evidenz gefunden

sentence_4: Analysis by Jim Kelleher, CFA, February 1, 2023
 → Keine Evidenz gefund

# Evaluation

In [14]:
# Ready to run notebook cell that reads your two files
# and builds all requested overviews, including cross tables

# Configure file paths here
EVAL_PATH = "evaluations.json"
ROSENBLATT_PATH = "02_01_2023_Rosenblatt_Export.json"

import json
import pandas as pd
from pathlib import Path

# ---------- load evaluations.json ----------
with open(Path(EVAL_PATH), "r", encoding="utf-8") as f:
    evaluations = json.load(f)

# evaluations.json is expected to be a dict keyed by sentence ids
eval_rows = []
for sid, payload in evaluations.items():
    eval_rows.append({
        "sentence_id": sid,
        "evaluation": payload.get("evaluation", ""),
        "reason": payload.get("reason", "")
    })
df_eval = pd.DataFrame(eval_rows).sort_values("sentence_id").reset_index(drop=True)

# counts of evaluation categories with fixed buckets
all_eval_categories = ["Supported", "Partially Supported", "Not Supported", "Contradicted", "No Evidence"]
eval_counts_actual = (
    df_eval["evaluation"]
    .value_counts(dropna=False)
    .rename_axis("evaluation")
    .reset_index(name="count")
)
eval_counts = (
    pd.DataFrame({"evaluation": all_eval_categories})
    .merge(eval_counts_actual, on="evaluation", how="left")
    .fillna({"count": 0})
    .astype({"count": int})
)

# ---------- load 02_01_2023_Rosenblatt_Export.json ----------
with open(Path(ROSENBLATT_PATH), "r", encoding="utf-8") as f:
    rosenblatt = json.load(f)

items = rosenblatt.get("items", [])
df_items = pd.DataFrame(items) if items else pd.DataFrame(columns=["text","quote","types","source","chunk_id","page"])
df_items = df_items.reset_index(drop=True)

# source overview for text other table with fixed buckets
all_sources = ["text", "other", "table"]
if not df_items.empty and "source" in df_items:
    source_counts_actual = (
        df_items["source"].value_counts(dropna=False)
        .rename_axis("source")
        .reset_index(name="count")
    )
else:
    source_counts_actual = pd.DataFrame(columns=["source","count"])
source_counts = (
    pd.DataFrame({"source": all_sources})
    .merge(source_counts_actual, on="source", how="left")
    .fillna({"count": 0})
    .astype({"count": int})
)

# type overview Statement Evaluation Forcast with fixed buckets
if not df_items.empty and "types" in df_items:
    df_types_long = df_items.explode("types")
else:
    df_types_long = pd.DataFrame(columns=["types"])
all_types = ["Statement", "Evaluation", "Forcast"]  # keep exact label
if not df_types_long.empty and "types" in df_types_long:
    type_counts_actual = (
        df_types_long["types"].value_counts(dropna=False)
        .rename_axis("type")
        .reset_index(name="count")
    )
else:
    type_counts_actual = pd.DataFrame(columns=["type","count"])
type_counts = (
    pd.DataFrame({"type": all_types})
    .merge(type_counts_actual, on="type", how="left")
    .fillna({"count": 0})
    .astype({"count": int})
)

# ---------- cross overviews: evaluation x source and evaluation x type ----------
# align evaluations with items by order: sentence_1 -> items[0], etc
# if lengths mismatch, align by the min length
min_len = min(len(df_eval), len(df_items))
if min_len > 0:
    df_merge = pd.concat(
        [
            df_eval.loc[:min_len - 1, ["sentence_id", "evaluation"]].reset_index(drop=True),
            df_items.loc[:min_len - 1, ["source", "types"]].reset_index(drop=True)
        ],
        axis=1
    )
    df_merge = df_merge.explode("types")
else:
    df_merge = pd.DataFrame(columns=["sentence_id","evaluation","source","types"])

# ensure categories are present even if zero by reindexing after crosstab
def ensure_crosstab_buckets(ct, index_labels, column_labels):
    # add missing index
    for lab in index_labels:
        if lab not in ct.index:
            ct.loc[lab] = 0
    # add missing columns
    for lab in column_labels:
        if lab not in ct.columns:
            ct[lab] = 0
    # sort to fixed order
    ct = ct.reindex(index=index_labels)
    ct = ct.reindex(columns=column_labels)
    return ct.astype(int)

# evaluation x source
if not df_merge.empty:
    eval_by_source = pd.crosstab(df_merge["evaluation"], df_merge["source"])
else:
    eval_by_source = pd.DataFrame()

eval_by_source = ensure_crosstab_buckets(
    eval_by_source, all_eval_categories, all_sources
)

# evaluation x type
if not df_merge.empty:
    eval_by_type = pd.crosstab(df_merge["evaluation"], df_merge["types"])
else:
    eval_by_type = pd.DataFrame()

eval_by_type = ensure_crosstab_buckets(
    eval_by_type, all_eval_categories, all_types
)

# ---------- display ----------
print("Evaluation counts")
display(eval_counts)

print("Source counts")
display(source_counts)

print("Type counts")
display(type_counts)

print("Evaluation x Source")
display(eval_by_source)

print("Evaluation x Type")
display(eval_by_type)



Evaluation counts


Unnamed: 0,evaluation,count
0,Supported,42
1,Partially Supported,54
2,Not Supported,9
3,Contradicted,3
4,No Evidence,139


Source counts


Unnamed: 0,source,count
0,text,213
1,other,25
2,table,56


Type counts


Unnamed: 0,type,count
0,Statement,265
1,Evaluation,16
2,Forcast,0


Evaluation x Source


source,text,other,table
evaluation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Supported,31,4,7
Partially Supported,38,4,15
Not Supported,6,1,2
Contradicted,2,0,1
No Evidence,110,9,31


Evaluation x Type


types,Statement,Evaluation,Forcast
evaluation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Supported,34,2,0
Partially Supported,53,3,0
Not Supported,7,0,0
Contradicted,2,0,0
No Evidence,122,11,0
