In [1]:
# Enable IPython's autoreload extension so edits to imported modules are picked up live.
%load_ext autoreload
# Mode 2 of the autoreload magic.
%autoreload 2


In [2]:
from notte.env import NotteEnv, NotteEnvConfig
from pydantic import BaseModel, Field

from notte.sdk.types import ScrapeParams

# One story entry; used as the item type of `TopArticlesSchema.top`.
class ArticleSchema(BaseModel):
    title: str
    points: int
    by: str  # presumably the submitting user's name — TODO confirm
    commentsURL: str  # presumably the link to the story's comment thread — TODO confirm

# Container for a list of ArticleSchema entries, capped at five items.
# `max_length=5` replaces `max_items=5`: Pydantic v2 (whose `model_*` API this notebook
# uses) only accepts `max_items` as a deprecated alias and warns about it.
class TopArticlesSchema(BaseModel):
    top: list[ArticleSchema] = Field(..., max_length=5, description="Top 5 stories")


# Build the environment from the config returned by `disable_perception()` and start it.
# NOTE(review): `dev_mode` is accessed without call parentheses — confirm it is a
# property and not a method that should be invoked as `.dev_mode()`.
env = NotteEnv(NotteEnvConfig().disable_perception().dev_mode)
# NOTE(review): no matching shutdown call for this env is visible in the notebook — confirm cleanup.
await env.start()

# Minimal JSON schema: an object with one required string property, `page_summary`.
format_json_schema = dict(
    type="object",
    properties=dict(page_summary=dict(type="string")),
    required=["page_summary"],
)

# Scrape parameters carrying only the `format_json_schema` response format;
# no `instructions` argument is passed.
scrape_params = ScrapeParams(response_format=format_json_schema)


# Schema with medical fields that do not relate to the news page scraped in this cell.
# NOTE(review): not referenced by any visible cell — presumably meant for trying
# extraction against a non-matching schema; confirm or remove.
class IrrelevantSchema(BaseModel):
    medical_diagnosis: str
    treatment: str
    prognosis: str

data = await env.scrape(
    url='https://news.ycombinator.com',
    # response_format=TopArticlesSchema,
    # response_format=format_json_schema,
    instructions="Extract the top 5 stories from Hacker News"
)





[32m2025-02-18 00:28:16.398[0m | [1mINFO    [0m | [36mnotte.env[0m:[36m__init__[0m:[36m155[0m - [1m🔧 Custom notte-env config: 
{
  "max_steps": 20,
  "processing_type": "dom",
  "browser": {
    "headless": false,
    "disable_web_security": false,
    "goto_timeout": 10000,
    "goto_retry_timeout": 1000,
    "retry_timeout": 1000,
    "step_timeout": 1000,
    "short_wait_timeout": 500,
    "screenshot": true,
    "empty_page_max_retry": 5,
    "verbose": true
  },
  "scraping": {
    "type": "simple",
    "rendering": {
      "type": "markdown",
      "include_ids": false,
      "include_attributes": [
        "role",
        "alt",
        "src",
        "aria_label",
        "name",
        "value",
        "title",
        "href",
        "placeholder",
        "tabindex",
        "type",
        "aria_expanded"
      ],
      "max_len_per_attribute": 60,
      "include_text": true,
      "include_links": true,
      "prune_dom_tree": true,
      "verbose": true
    },

In [None]:
# Dump `data.data` via `model_dump()` for inspection.
data.data.model_dump()

In [None]:
# Validate the structured scrape payload against TopArticlesSchema.
# NOTE(review): the scrape call in the setup cell passes no `response_format` —
# confirm the payload is expected to match this schema.
TopArticlesSchema.model_validate(data.data.structured.data)

In [None]:
# Display the JSON schema generated for TopArticlesSchema.
TopArticlesSchema.model_json_schema()

In [None]:
# Display the JSON schema of the response-format model held by `scrape_params`.
# Uses `scrape_params` (assigned in the setup cell) instead of `params`, which is only
# assigned in a later cell and therefore raises NameError on a fresh top-to-bottom run.
# NOTE(review): assumes ScrapeParams exposes `response_format` as a model class — confirm.
scrape_params.response_format.model_json_schema()

In [None]:
# Inspect the `structured.data` attribute of the scrape result.
data.data.structured.data

In [None]:
# Display the TopArticlesSchema JSON schema again (same expression as an earlier cell).
TopArticlesSchema.model_json_schema()

In [None]:


import json  # local import: `json` is not imported by any other visible cell

# Second way: create a model from a JSON schema string.
schema_str = '''
{
    "title": "ArticleSchema",
    "type": "object",
    "properties": {
        "title": {"type": "string"},
        "points": {"type": "integer"},
        "by": {"type": "string"},
        "commentsURL": {"type": "string"}
    },
    "required": ["title", "points", "by", "commentsURL"]
}
'''

# Parse the JSON schema string; this flat ArticleSchema is the schema whose fields
# the instance below supplies.
schema = json.loads(schema_str)

# Nested variant (TopArticlesSchema with a `$defs` entry), kept for reference only.
# It previously overwrote `schema`, which discarded the parsed string and produced a
# schema requiring a `top` field that the instance below does not provide.
nested_schema = {'$defs': {'ArticleSchema': {'properties': {'title': {'title': 'Title',
     'type': 'string'},
    'points': {'title': 'Points', 'type': 'integer'},
    'by': {'title': 'By', 'type': 'string'},
    'commentsURL': {'title': 'Commentsurl', 'type': 'string'}},
   'required': ['title', 'points', 'by', 'commentsURL'],
   'title': 'ArticleSchema',
   'type': 'object'}},
 'properties': {'top': {'description': 'Top 5 stories',
   'items': {'$ref': '#/$defs/ArticleSchema'},
   'maxItems': 5,
   'title': 'Top',
   'type': 'array'}},
 'required': ['top'],
 'title': 'TopArticlesSchema',
 'type': 'object'}

# Create a Pydantic model from the parsed schema.
# NOTE(review): `create_model_from_json_schema` is neither defined nor imported in the
# visible cells — confirm where it comes from before running this cell.
DynamicModel = create_model_from_json_schema(
    "DynamicArticleSchema",
    schema
)

# Instantiate the generated model with one value per field declared in `schema_str`.
instance = DynamicModel(
    title="Some Title",
    points=42,
    by="user123",
    commentsURL="https://example.com"
)

instance

In [None]:
from notte.sdk.types import ScrapeParams
# JSON schema for an object with two required string properties: `title` and `content`.
schema_dict = dict(
    title="ArticleSchema",
    type="object",
    properties=dict(
        title=dict(type="string"),
        content=dict(type="string"),
    ),
    required=["title", "content"],
)

# Wrap the schema dict in ScrapeParams, then call the resulting `response_format`
# with the two declared fields plus one undeclared keyword (`yo`).
params = ScrapeParams(response_format=schema_dict)
params.response_format(title="ArticleSchema", content="Content", yo="uaieau")

In [None]:
from notte.data.space import StructuredData

# Parse a small JSON document with `success` and `data` keys into StructuredData.
StructuredData.model_validate_json('{"success": true, "data": {"some": "value"}}')

In [None]:
from pydantic import BaseModel, Field
from notte.env import NotteEnv, NotteEnvConfig

# Define the schema for flight data.
# One flight result; used as the item type of `FlightSearchResultsSchema.flights`.
class FlightSchema(BaseModel):
    airline: str
    departure: str  # presumably the origin airport/city — TODO confirm
    arrival: str  # presumably the destination airport/city — TODO confirm
    price: float
    duration: str
    departure_time: str
    arrival_time: str
    currency: str  # presumably the currency of `price` — TODO confirm

# Define a schema for a list of flight search results, capped at five entries.
# `max_length=5` replaces `max_items=5`: Pydantic v2 (whose `model_*` API this notebook
# uses) only accepts `max_items` as a deprecated alias and warns about it.
class FlightSearchResultsSchema(BaseModel):
    flights: list[FlightSchema] = Field(..., max_length=5, description="Top 5 flight results")

# Initialize the environment and start scraping
env = NotteEnv(NotteEnvConfig.simple())

# Scrape data from a Google Flights-like URL (assuming the structure fits)
async with NotteEnv(NotteEnvConfig.simple()) as env:
    url = 'https://www.google.com/travel/flights/search?tfs=CBwQAhooEgoyMDI1LTA2LTA3agwIAhIIL20vMDg5NjZyDAgDEggvbS8wNXF0ahooEgoyMDI1LTA3LTE2agwIAxIIL20vMDVxdGpyDAgCEggvbS8wODk2NkABSAFwAYIBCwj___________8BmAEB'  # Placeholder for a real Google Flights search URL
    data = await env.scrape(url=url, response_format=FlightSearchResultsSchema)
    print(data)

# Example usage: Run the scraping function
# response = await scrape_flights()
# print(response)


In [None]:
# Print the structured part of the scrape result as JSON indented by four spaces.
print(data.data.structured.model_dump_json(indent=4))

In [None]:
content = """
[{'role': 'system', 'content': 'You are extracting content on behalf of a user.\nIf a user asks you to extract a \'list\' of information, or \'all\' information,\nYOU MUST EXTRACT ALL OF THE INFORMATION THAT THE USER REQUESTS.\n\nAlways prioritize using the provided content to answer the question.\nDo not miss any important information.\nDo not make up an answer.\nDo not hallucinate.\nIn case you can\'t find the information and the string is required, instead of \'N/A\' or \'Not speficied\', return an empty string: \'\', if it\'s not a string and you can\'t find the information, return null.\nBe concise and follow the schema always if provided.\nIf the document provided is not relevant to the prompt nor to the final user schema, return null.\nHere are the urls the user provided of which he wants to extract information from:\nhttps://news.ycombinator.com/\n\nHere is the user schema you should follow for your output:\n```json\n{\n  "$defs": {\n    "ArticleSchema": {\n      "properties": {\n        "title": {\n          "title": "Title",\n          "type": "string"\n        },\n        "points": {\n          "title": "Points",\n          "type": "integer"\n        },\n        "by": {\n          "title": "By",\n          "type": "string"\n        },\n        "commentsURL": {\n          "title": "Commentsurl",\n          "type": "string"\n        }\n      },\n      "required": [\n        "title",\n        "points",\n        "by",\n        "commentsURL"\n      ],\n      "title": "ArticleSchema",\n      "type": "object"\n    }\n  },\n  "properties": {\n    "top": {\n      "description": "Top 5 stories",\n      "items": {\n        "$ref": "#/$defs/ArticleSchema"\n      },\n      "maxItems": 5,\n      "title": "Top",\n      "type": "array"\n    }\n  },\n  "required": [\n    "top"\n  ],\n  "title": "TopArticlesSchema",\n  "type": "object"\n}\n```\n\nToday is: 2025-02-17 18:12:32\n\nAdditional instructions:\nno additional instructions\n\nTransform the following content into 
structured JSON output based on the provided schema if any and the following user request:\n\n```markdown\n1. [You\'re not a senior engineer until you\'ve worked on a legacy project](https://www.infobip.com/developers/blog/seniors-working-on-a-legacy-project) ([infobip.com](from?site=infobip.com))  \n126 points by [tonkkatonka](user?id=tonkkatonka)[4 hours ago](item?id=43047341) | [hide](hide?id=43047341&goto=news) | [87 comments](item?id=43047341)  \n2. [Hk, a new Git hook manager from jdx](https://hk.jdx.dev/about.html) ([jdx.dev](from?site=jdx.dev))  \n17 points by [DrBenCarson](user?id=DrBenCarson)[48 minutes ago](item?id=43080535) | [hide](hide?id=43080535&goto=news) | [4 comments](item?id=43080535)  \n3. [Fluoxetine promotes metabolic defenses to protect from sepsis-induced lethality](https://www.science.org/doi/10.1126/sciadv.adu4034) ([science.org](from?site=science.org))  \n81 points by [bookofjoe](user?id=bookofjoe)[4 hours ago](item?id=43078537) | [hide](hide?id=43078537&goto=news) | [42 comments](item?id=43078537)  \n4. [Open Source projects could sell SBoM fragments](https://www.thomas-huehn.com/open-source-projects-could-sell-sbom-fragments/) ([thomas-huehn.com](from?site=thomas-huehn.com))  \n11 points by [Tomte](user?id=Tomte)[1 hour ago](item?id=43080378) | [hide](hide?id=43080378&goto=news) | [4 comments](item?id=43080378)  \n5. [0+0 > 0: C++ thread-local storage performance](https://yosefk.com/blog/cxx-thread-local-storage-performance.html) ([yosefk.com](from?site=yosefk.com))  \n73 points by [ingve](user?id=ingve)[5 hours ago](item?id=43077675) | [hide](hide?id=43077675&goto=news) | [30 comments](item?id=43077675)  \n6. [Umami is a simple, fast, privacy-focused alternative to Google Analytics](https://github.com/umami-software/umami) ([github.com/umami-software](from?site=github.com/umami-software))  \n207 points by [ksec](user?id=ksec)[10 hours ago](item?id=43040507) | [hide](hide?id=43040507&goto=news) | [108 comments](item?id=43040507)  \n7. 
[Homemade polarimetric synthetic aperture radar drone](https://hforsten.com/homemade-polarimetric-synthetic-aperture-radar-drone.html) ([hforsten.com](from?site=hforsten.com))  \n467 points by [picture](user?id=picture)[15 hours ago](item?id=43073808) | [hide](hide?id=43073808&goto=news) | [42 comments](item?id=43073808)  \n8. [All Kindles can now be jailbroken](https://kindlemodding.org/jailbreaking/WinterBreak/) ([kindlemodding.org](from?site=kindlemodding.org))  \n976 points by [lumerina](user?id=lumerina)[15 hours ago](item?id=43073969) | [hide](hide?id=43073969&goto=news) | [334 comments](item?id=43073969)  \n9. [Making Markets on Kalshi](https://rlafuente.com/post?post=2025-2-16-marketmaking-on-kalshi) ([rlafuente.com](from?site=rlafuente.com))  \n5 points by [andes314](user?id=andes314)[1 hour ago](item?id=43073377) | [hide](hide?id=43073377&goto=news) | [2 comments](item?id=43073377)  \n10. [Uchū – Color palette for internet lovers](https://uchu.style) ([uchu.style](from?site=uchu.style))  \n571 points by [NetOpWibby](user?id=NetOpWibby)[18 hours ago](item?id=43072338) | [hide](hide?id=43072338&goto=news) | [232 comments](item?id=43072338)  \n11. [Reflections on AGI from 1879](https://www.learningfromexamples.com/p/reflections-on-superintelligence) ([learningfromexamples.com](from?site=learningfromexamples.com))  \n60 points by [benbreen](user?id=benbreen)[7 hours ago](item?id=43053403) | [hide](hide?id=43053403&goto=news) | [8 comments](item?id=43053403)  \n12. [How do modern compilers choose which variables to put in registers?](https://langdev.stackexchange.com/questions/4325/how-do-modern-compilers-choose-which-variables-to-put-in-registers) ([langdev.stackexchange.com](from?site=langdev.stackexchange.com))  \n262 points by [azeemba](user?id=azeemba)[14 hours ago](item?id=43048073) | [hide](hide?id=43048073&goto=news) | [39 comments](item?id=43048073)  \n13. 
[The secret ingredients of word2vec (2016)](https://www.ruder.io/secret-word2vec/) ([ruder.io](from?site=ruder.io))  \n151 points by [todsacerdoti](user?id=todsacerdoti)[12 hours ago](item?id=43075347) | [hide](hide?id=43075347&goto=news) | [14 comments](item?id=43075347)  \n14. [kartoffels v0.7: Cellular Automata, Statistics, 32-bit RISC-V](https://pwy.io/posts/kartoffels-v0.7/) ([pwy.io](from?site=pwy.io))  \n4 points by [Patryk27](user?id=Patryk27)[17 minutes ago](item?id=43080858) | [hide](hide?id=43080858&goto=news) | [discuss](item?id=43080858)  \n15. [My Time at MIT](http://muratbuffalo.blogspot.com/2025/02/my-time-at-mit.html) ([muratbuffalo.blogspot.com](from?site=muratbuffalo.blogspot.com))  \n126 points by [rrampage](user?id=rrampage)[12 hours ago](item?id=43075113) | [hide](hide?id=43075113&goto=news) | [30 comments](item?id=43075113)  \n16. [Does or did COBOL default to 1875-05-20 for corrupt or missing dates?](https://retrocomputing.stackexchange.com/questions/31288/does-or-did-cobol-default-to-1875-05-20-for-corrupt-or-missing-dates) ([retrocomputing.stackexchange.com](from?site=retrocomputing.stackexchange.com))  \n214 points by [SeenNotHeard](user?id=SeenNotHeard)[17 hours ago](item?id=43073149) | [hide](hide?id=43073149&goto=news) | [329 comments](item?id=43073149)  \n17. [Espargos: ESP32-based WiFi sensing array](https://espargos.net/) ([espargos.net](from?site=espargos.net))  \n10 points by [leoedin](user?id=leoedin)[3 hours ago](item?id=43079023) | [hide](hide?id=43079023&goto=news) | [1 comment](item?id=43079023)  \n18. [When Not to Obey Orders (2019)](https://warontherocks.com/2019/07/when-not-to-obey-orders/) ([warontherocks.com](from?site=warontherocks.com))  \n204 points by [throwaway19577](user?id=throwaway19577)[20 hours ago](item?id=43071286) | [hide](hide?id=43071286&goto=news) | [99 comments](item?id=43071286)  \n19. 
[San Francisco homelessness: Park ranger helps one person at a time](https://sfstandard.com/2025/02/08/golden-gate-park-ranger-homelessness/) ([sfstandard.com](from?site=sfstandard.com))  \n277 points by [NaOH](user?id=NaOH)[16 hours ago](item?id=43073292) | [hide](hide?id=43073292&goto=news) | [401 comments](item?id=43073292)  \n20. [Show HN: Tools for Math Research](https://sugaku.net/) ([sugaku.net](from?site=sugaku.net))  \n5 points by [rfurmani](user?id=rfurmani)[2 hours ago](item?id=43054506) | [hide](hide?id=43054506&goto=news) | [1 comment](item?id=43054506)  \n21. [“A calculator app? Anyone could make that”](https://chadnauseam.com/coding/random/calculator-app) ([chadnauseam.com](from?site=chadnauseam.com))  \n1640 points by [pie_flavor](user?id=pie_flavor)[1 day ago](item?id=43066953) | [hide](hide?id=43066953&goto=news) | [401 comments](item?id=43066953)  \n22. [Show HN: Inscribed, create stop motion animation and slide powered by Excalidraw](https://inscribed.app/) ([inscribed.app](from?site=inscribed.app))  \n10 points by [chunza2542](user?id=chunza2542)[4 hours ago](item?id=43078555) | [hide](hide?id=43078555&goto=news) | [1 comment](item?id=43078555)  \n23. [Now you can run Ruby on Rails in the browser using WebAssembly](https://web.dev/blog/ruby-on-rails-on-webassembly) ([web.dev](from?site=web.dev))  \n14 points by [danielwetan](user?id=danielwetan)[1 hour ago](item?id=43079791) | [hide](hide?id=43079791&goto=news) | [4 comments](item?id=43079791)  \n24. [Mistral Saba](https://mistral.ai/en/news/mistral-saba) ([mistral.ai](from?site=mistral.ai))  \n89 points by [stephen37](user?id=stephen37)[3 hours ago](item?id=43079046) | [hide](hide?id=43079046&goto=news) | [7 comments](item?id=43079046)  \n25. 
[Step-Video-T2V: The Practice, Challenges, and Future of Video Foundation Model](https://arxiv.org/abs/2502.10248) ([arxiv.org](from?site=arxiv.org))  \n16 points by [limoce](user?id=limoce)[7 hours ago](item?id=43077074) | [hide](hide?id=43077074&goto=news) | [3 comments](item?id=43077074)  \n26. [The lottery of the snakebite antivenom industry](https://www.theguardian.com/global-development/2025/feb/13/its-a-cowboy-show-out-there-the-deadly-lottery-of-the-snakebite-antivenom-industry) ([theguardian.com](from?site=theguardian.com))  \n54 points by [n1b0m](user?id=n1b0m)[12 hours ago](item?id=43036560) | [hide](hide?id=43036560&goto=news) | [17 comments](item?id=43036560)  \n27. [Extreme supersonic winds measured on a planet outside our solar system](https://phys.org/news/2025-01-extreme-supersonic-planet-solar.html) ([phys.org](from?site=phys.org))  \n66 points by [PaulHoule](user?id=PaulHoule)[16 hours ago](item?id=43050447) | [hide](hide?id=43050447&goto=news) | [30 comments](item?id=43050447)  \n28. [Gold Is Worth More in New York](https://www.bloomberg.com/opinion/articles/2025-02-13/gold-is-worth-more-in-new-york) ([bloomberg.com](from?site=bloomberg.com))  \n117 points by [ioblomov](user?id=ioblomov)[21 hours ago](item?id=43040129) | [hide](hide?id=43040129&goto=news) | [113 comments](item?id=43040129)  \n29. [X users are unable to post “Signal.me” links](https://www.disruptionist.com/p/elon-musks-x-blocks-links-to-signal) ([disruptionist.com](from?site=disruptionist.com))  \n656 points by [confusing3478](user?id=confusing3478)[8 hours ago](item?id=43076710) | [hide](hide?id=43076710&goto=news) | [579 comments](item?id=43076710)  \n30. 
[Physics Informed Neural Networks](https://nchagnet.pages.dev/blog/physics-informed-neural-networks/) ([nchagnet.pages.dev](from?site=nchagnet.pages.dev))  \n79 points by [nchagnet](user?id=nchagnet)[19 hours ago](item?id=43071775) | [hide](hide?id=43071775&goto=news) | [8 comments](item?id=43071775)  \n[More](?p=2)\n\n\n```\n'}]
2025-02-17 18:12:33.292 | INFO     | notte.llms.engine:structured_completion:51 - LLM response: 
"""



# Print the text stored in `content`; since that literal is not a raw string, its
# backslash escapes (e.g. \n) are rendered as real characters in the output.
print(content)