In [35]:
from pydantic import BaseModel, Field


# Pydantic
class Animal(BaseModel):
    """Animal to tell user."""

    # Both fields are required (`...` = no default). The descriptions are part
    # of the schema generated from this model.
    name: str = Field(..., description="The name of the animal")
    sound: str = Field(..., description="The sound the animal makes")

class Joke(BaseModel):
    """Joke to tell user."""

    # Nested model: the animal the joke is about.
    animal: Animal = Field(..., description="The animal in the joke")
    # The two halves of the joke; both are required (`...` = no default).
    setup: str = Field(..., description="The setup of the joke")
    punchline: str = Field(..., description="The punchline to the joke")

# Wrap the chat model so that its reply is returned as a `Joke` instance.
# NOTE: `LLM` is created in the configuration cell further down this notebook;
# that cell must have been run before this one.
structured_llm = LLM.with_structured_output(Joke)

result = structured_llm.invoke("Tell me a joke about cats")
# Serialize through pydantic directly instead of `json.dumps(result.model_dump())`:
# this cell no longer depends on `json` being imported by another cell, and it
# matches how the PRD summary is printed elsewhere in the notebook.
result_json = result.model_dump_json(indent=2)
print(result_json)

{
  "animal": {
    "name": "cat",
    "sound": "meow"
  },
  "setup": "Why was the cat sitting on the computer?",
  "punchline": "Because it wanted to keep an eye on the mouse!"
}


In [None]:
import os
import json
from dotenv import load_dotenv
load_dotenv()

from typing import List, Dict, Any

from langchain_openai import AzureChatOpenAI
from langchain_openai import ChatOpenAI
from langchain_ollama import ChatOllama
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field
from markitdown import MarkItDown

def _env_or_streamlit_secret(env_name, secret_name):
    """Return a credential from the environment, falling back to Streamlit secrets.

    Args:
        env_name: Name of the environment variable to read first.
        secret_name: Key looked up in `st.secrets` when the environment
            variable is unset or empty.

    Returns:
        The credential string, or None when neither source provides one.
    """
    value = os.environ.get(env_name)
    if value:
        return value
    try:
        # `st` (Streamlit) is not imported by this notebook's import cell, so
        # the bare `st.secrets` lookup raised NameError whenever the
        # environment variable was missing. Keep the fallback for contexts
        # where `st` does exist, but degrade to None otherwise.
        return st.secrets.get(secret_name)
    except NameError:
        return None


GITHUB_TOKEN = _env_or_streamlit_secret("GITHUB_TOKEN", "GITHUB_TOKEN")
# NOTE(review): the secrets key "OPEN_AI_API" differs from the environment
# variable name "OPEN_AI_API_KEY" -- confirm which spelling is intended.
OPEN_AI_API_KEY = _env_or_streamlit_secret("OPEN_AI_API_KEY", "OPEN_AI_API")

# Chat model shared by the cells of this notebook (structured-output demo and
# PRD extraction both go through this object).
LLM = ChatOpenAI(
    openai_api_key=OPEN_AI_API_KEY,  # read from the environment above
    model_name="gpt-4.1-nano",
    temperature=1,
)

# LLM = AzureChatOpenAI(
#     azure_endpoint="https://models.inference.ai.azure.com",
#     azure_deployment="gpt-4.1-nano",
#     openai_api_version="2025-03-01-preview", 
#     model_name="gpt-4.1-nano",
#     temperature=1,
#     api_key=GITHUB_TOKEN,
# )

# Pydantic
class Metadata(BaseModel):
    """Metadata of the document."""

    # Every field defaults to an empty string, so a document that lacks one of
    # these attributes still validates.
    document_title: str = Field(default="", description="The name of the document")
    document_date: str = Field(default="", description="The date of the document")
    document_status: str = Field(default="", description="The status of the document")
    document_author: str = Field(default="", description="The author of the document")

class Person(BaseModel):
    """Person in the document beside the author."""

    # The name identifies the person and stays required.
    name: str = Field(description="The name of the person")
    # Role and email are frequently absent from a document. Default them to ""
    # (the same convention `Metadata` uses) so that a reply omitting these keys
    # still validates instead of failing the whole parse.
    role: str = Field(description="The role of the person", default="")
    email: str = Field(description="The email of the person", default="")

class Problem(BaseModel):
    """Problem statement of the document."""

    # The class docstring becomes the schema description for this section, so
    # it must describe the problem section rather than "meeting minutes".
    vision_opportunity: str = Field(description="The Vision & Opportunity of the document")
    target_use_case: str = Field(description="The Target Use Case of the document")

class Solution(BaseModel):
    """Solution to the problem"""

    # All three fields are required (`...` = no default).
    goals: str = Field(..., description="The goals of the solution")
    conceptual_model: str = Field(..., description="The conceptual model of the solution")
    # One list entry per requirement.
    requirements: List[str] = Field(..., description="The requirements of the solution")

class Summary(BaseModel):
    # Top-level extraction result: one nested model per section, all required.
    metadata: Metadata = Field(..., description="The metadata of the document")
    person: Person = Field(..., description="The person in the document")
    problem: Problem = Field(..., description="The problem in the document")
    solution: Solution = Field(..., description="The solution in the document")

# Parser bound to `Summary`: it supplies the format instructions embedded in
# the prompt and is the final step of the extraction chain in `extract_prd`.
summary_parser = PydanticOutputParser(pydantic_object=Summary)

def extract_prd(prd: str) -> Summary:
    """Extract a structured summary from the text of a PRD.

    Builds a prompt from the PRD text plus the format instructions produced by
    `summary_parser`, sends it through the module-level `LLM`, and parses the
    reply with `summary_parser`.

    Args:
        prd: The PRD text to analyze.

    Returns:
        The result of running the chain, i.e. the output of `summary_parser`
        (which is bound to the `Summary` model).
    """
    # The literal opens with exactly three quotes. A fourth quote here would
    # become the first character of the prompt sent to the model.
    prd_template = """
    You are an expert in analyzing Product Requirements Documents (PRDs).
    Your task is to extract software requirements from the provided PRD text.
    
    PRD text: {prd}
    
    For each PRD text, provide the following information:
    {format_instructions}
    
    If a section doesn't contain any requirements, output nothing for that section.
    """

    prd_prompt_template = PromptTemplate(
        input_variables=["prd"],
        template=prd_template,
        # Pre-filled once so callers only have to supply the PRD text.
        partial_variables={"format_instructions": summary_parser.get_format_instructions()},
    )

    # prompt -> chat model -> parser
    chain = prd_prompt_template | LLM | summary_parser
    output = chain.invoke(input={"prd": prd})
    return output

def convert_to_md(input_file):
    """Convert `input_file` with MarkItDown and return the result's text content."""
    # A fresh converter is created for every call.
    conversion = MarkItDown().convert(input_file)
    return conversion.text_content

# End-to-end run: convert the PDF to text, extract the structured summary,
# and print it as indented JSON.
# "Test.pdf" is resolved relative to the notebook's working directory.
text = convert_to_md("Test.pdf")
summary = extract_prd(text)
print(summary.model_dump_json(indent=2))

CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


{
  "metadata": {
    "document_title": "Proposed Teamfood Homepage Changes",
    "document_date": "2024-12-25",
    "document_status": "DRAFT/REVIEWED",
    "document_author": "Ian Chiu"
  },
  "person": {
    "name": "Ian Chiu",
    "role": "",
    "email": ""
  },
  "problem": {
    "vision_opportunity": "Address user confusion on first landing and improve onboarding for new users.",
    "target_use_case": "New users landing on the homepage who are unfamiliar with the product."
  },
  "solution": {
    "goals": "Reduce user confusion and help people get started on the product faster. Propose cheap short-term fixes that can be implemented quickly.",
    "conceptual_model": "Simplified homepage with clear messaging, guidance, and a single call-to-action for creating the first workspace or table.",
    "requirements": [
      "[P0] When no workspaces or tables are present, display information about product use and provide links to help materials.",
      "[P0] When no workspaces or tab