In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from openai import OpenAI

openai_api_key = "EMPTY" # replace with your inference server api key
openai_api_base = "http://150.239.209.43:8008/v1" # replace with your inference server endpoint


client = OpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base,
)

models = client.models.list()
teacher_model = models.data[0].id

# Test the connection with a simple completion
response = client.chat.completions.create(
    model=teacher_model,
    messages=[{"role": "user", "content": "Hello!"}],
    temperature=0.0,
    max_tokens=10
)
completion = response.choices[0].message.content

print(f"Connection successful! {teacher_model}: {completion}")

Connection successful! meta-llama/Llama-3.3-70B-Instruct: Hello. How can I help you today?


In [3]:
from datasets import load_dataset

seed_data = load_dataset("json", data_files="seed_data/financial_call_transcripts.jsonl", split="train")

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
seed_data[0]

{'conversation_id': 'c47a92e006b54d014a79b447528c55a7',
 'pdf_path': 'seed_data/financial_call_transcripts/c47a92e006b54d014a79b447528c55a7.pdf'}

In [5]:
import os
from instructlab.sdg.pipeline import Pipeline, PipelineContext
from blocks import *

ctx = PipelineContext(client=client, model_family="llama", model_id=teacher_model, batch_size=0)
skills_pipe = Pipeline.from_file(ctx, os.path.join(os.getcwd(), "flows/grounded_summary_extraction.yaml"))

In [6]:
seed_data = seed_data.select(range(10))

In [7]:
seed_data

Dataset({
    features: ['conversation_id', 'pdf_path'],
    num_rows: 10
})

In [8]:
generated_data = skills_pipe.generate(seed_data)

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Map: 100%|██████████| 10/10 [00:17<00:00,  1.75s/ examples]
Map: 100%|██████████| 10/10 [00:00<00:00, 3113.81 examples/s]


In [9]:
generated_data[0]

{'conversation_id': 'c47a92e006b54d014a79b447528c55a7',
 'pdf_path': 'seed_data/financial_call_transcripts/c47a92e006b54d014a79b447528c55a7.pdf',
 'conversation': "## Conversation ID: c47a92e006b54d014a79b447528c55a7\n\nGood morning. My name is Natalia, and I will be your conference operator today.\n\nAlso, please note that we will discuss certain non-GAAP financial measures in this call. Reconciliations on a GAAP basis for these measures are included in today's press release\n\nIn January, we made major progress in completing our transformation into a meaningfully less leveraged and much more focused consumer products company\n\nAs the results of these -- as a result of these steps early this year, we are on track to achieve our leverage target of approximately 3.5x at the end of this fiscal year.\n\nand we began to see the effects of that late this November. We now must work closely with our retail customers to drive this business forward, despite the recent headwinds in the new hous

In [10]:
import yaml
import json

def normalize_model_output(model_output: dict) -> dict:
    def parse_named_entities(raw_text):
        try:
            parsed = yaml.safe_load(raw_text)
            return {
                "organizations": parsed.get("organizations", []) if isinstance(parsed, dict) else [],
                "people": parsed.get("people", []) if isinstance(parsed, dict) else [],
                "locations": parsed.get("locations", []) if isinstance(parsed, dict) else [],
                "dates": parsed.get("dates", []) if isinstance(parsed, dict) else [],
            }
        except Exception:
            return {
                "organizations": None,
                "people": None,
                "locations": None,
                "dates": None,
            }

    result = {
        "summary": model_output.get("summary", None),
        "keywords": None,
        "named_entities": {
            "organizations": None,
            "people": None,
            "locations": None,
            "dates": None,
        },
        "sentiment": model_output.get("sentiment", None),
    }

    # Parse keywords
    try:
        if isinstance(model_output.get("keywords"), str):
            result["keywords"] = [kw.strip() for kw in model_output["keywords"].split(",") if kw.strip()]
    except Exception:
        result["keywords"] = None

    # Parse named_entities
    result["named_entities"] = parse_named_entities(model_output.get("named_entities", ""))

    return result

In [11]:
model_output = {
    'summary': 'Spectrum Brands reported Q1 results in line with expectations...',
    'keywords': 'Spectrum Brands, deleveraging, GAAP, non-GAAP, fiscal year guidance, HHI, Wi-Fi-enabled Halo Smart Locks, advertising investments, balance sheet strength, free cash flow',
    'named_entities': "organizations:\n  - Spectrum Brands\n  - Consumer Electronics Show\npeople:\n  - Natalia\nlocations:\ndates:\n  - January\n  - November\n  - June\n  - September\n  - Q1\n  - fiscal '19",
    'sentiment': 'Positive'
}

structured_json = normalize_model_output(model_output)
print(json.dumps(structured_json, indent=2))

{
  "summary": "Spectrum Brands reported Q1 results in line with expectations...",
  "keywords": [
    "Spectrum Brands",
    "deleveraging",
    "GAAP",
    "non-GAAP",
    "fiscal year guidance",
    "HHI",
    "Wi-Fi-enabled Halo Smart Locks",
    "advertising investments",
    "balance sheet strength",
    "free cash flow"
  ],
  "named_entities": {
    "organizations": [
      "Spectrum Brands",
      "Consumer Electronics Show"
    ],
    "people": [
      "Natalia"
    ],
    "locations": null,
    "dates": [
      "January",
      "November",
      "June",
      "September",
      "Q1",
      "fiscal '19"
    ]
  },
  "sentiment": "Positive"
}
