In [1]:
import os
import requests
import json
from typing import List
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display, update_display
from openai import OpenAI

# Initialize and constants

# load_dotenv(override=True)
# api_key = os.getenv('OPENAI_API_KEY')

# if api_key and api_key.startswith('sk-proj-') and len(api_key)>10:
#     print("API key looks good so far")
# else:
#     print("There might be a problem with your API key? Please visit the troubleshooting notebook!")
    
# MODEL = 'gpt-4o-mini'
# openai = OpenAI()

In [38]:
# Some websites only respond properly when the request carries browser-like
# headers, so every fetch sends this desktop-Chrome User-Agent.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/117.0.0.0 Safari/537.36"
    ),
}

class Website:
    """A scraped web page: its title, visible text, and the hrefs of its links.

    Attributes:
        url: The address that was requested.
        title: Text of the page's <title>, or 'No title found'.
        body: Raw response payload (bytes) as returned by `requests`.
        links: Non-empty `href` values of every <a> tag, in document order.
            Values are kept exactly as written in the page, so they may be
            relative paths or fragments such as '#main'.
        text: Visible body text with script/style/img/input elements removed.
    """
    url: str
    title: str
    body: bytes
    links: List[str]
    text: str

    def __init__(self, url, timeout=10):
        """Fetch `url` and extract its title, text and links.

        Args:
            url: Address of the page to download.
            timeout: Seconds to wait for the server before giving up. Without
                a timeout `requests` can block indefinitely on a stalled host.

        Raises:
            requests.RequestException: If the request fails or times out.
        """
        self.url = url
        response = requests.get(url, headers=headers, timeout=timeout)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')

        # `soup.title.string` is None when <title> is empty or contains child
        # tags; fall back to the placeholder instead of storing None.
        title_text = soup.title.string if soup.title else None
        self.title = str(title_text) if title_text else 'No title found'

        if soup.body:
            # Drop elements that carry no readable content before extracting text.
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""

        # Keep only anchors that actually have a non-empty href.
        hrefs = (anchor.get('href') for anchor in soup.find_all('a'))
        self.links = [href for href in hrefs if href]

    def get_contents(self):
        """Return the page title and text formatted as a prompt fragment."""
        return f"webpage Title:\n{self.title}\nwebpage Content:\n{self.text}\n\n"

In [39]:
# Scrape the Anthropic landing page once; `ed` is reused by later cells
# to preview the prompts built from it.
ed = Website("https://anthropic.com")
# print(ed.get_contents())
# print("webpage Links:\n",ed.links)
# Bare last expression: display the raw list of hrefs found on the page.
ed.links

['#main',
 '#footer',
 'https://www.anthropic.com/',
 'https://www.anthropic.com/claude',
 'https://www.anthropic.com/claude-code',
 'https://www.anthropic.com/max',
 'https://www.anthropic.com/team',
 'https://www.anthropic.com/enterprise',
 'https://www.anthropic.com/pricing',
 'https://claude.ai/download',
 'https://claude.ai/',
 'https://www.anthropic.com/news/claude-character',
 'https://www.anthropic.com/api',
 'https://docs.anthropic.com/',
 'https://www.anthropic.com/pricing#api',
 'https://console.anthropic.com/',
 'https://docs.anthropic.com/en/docs/welcome',
 'https://www.anthropic.com/solutions/agents',
 'https://www.anthropic.com/solutions/code-modernization',
 'https://www.anthropic.com/solutions/coding',
 'https://www.anthropic.com/solutions/customer-support',
 'https://www.anthropic.com/solutions/education',
 'https://www.anthropic.com/solutions/financial-services',
 'https://www.anthropic.com/solutions/government',
 'https://www.anthropic.com/customers',
 'https://www.

In [28]:
# System prompt for the link-selection step: states the task and shows the
# exact JSON shape the model is expected to answer with.
link_system_prompt = (
    "You are provided with a list of links found on a webpage. "
    "You are able to decide which of the links would be most relevant to include in a brochure about the company, "
    "such as links to an About page, or a Company page, or Careers/Jobs pages.\n"
    "You should respond in JSON as in this example:"
    "\n"
    "{\n"
    '    "links": [\n'
    '        {"type": "about page", "url": "https://full.url/goes/here/about"},\n'
    '        {"type": "careers page", "url": "https://another.full.url/careers"}\n'
    "    ]\n"
    "}\n"
)

In [29]:
# Print (rather than echo) the prompt so its embedded newlines render as line breaks.
print(link_system_prompt)

You are provided with a list of links found on a webpage. You are able to decide which of the links would be most relevant to include in a brochure about the company, such as links to an About page, or a Company page, or Careers/Jobs pages.
You should respond in JSON as in this example:
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
}



In [40]:
def get_links_user_prompt(website):
    """Build the user message that lists a page's links for the model to filter.

    Args:
        website: Object exposing `url` (str) and `links` (iterable of str).

    Returns:
        str: The instructions followed by one link per line.
    """
    instructions = (
        f"Here is the list of links on the website of {website.url} - "
        "please decide which of these are relevant web links for a brochure about the company, "
        "respond with the full https URL in JSON format. "
        "Do not include Terms of Service, Privacy, email links.\n"
    )
    heading = "Links (some might be relative links):\n"
    link_lines = "\n".join(website.links)
    return instructions + heading + link_lines

In [41]:
# Preview the user prompt generated for the page scraped into `ed`.
print(get_links_user_prompt(ed))

Here is the list of links on the website of https://anthropic.com - please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. Do not include Terms of Service, Privacy, email links.
Links (some might be relative links):
#main
#footer
https://www.anthropic.com/
https://www.anthropic.com/claude
https://www.anthropic.com/claude-code
https://www.anthropic.com/max
https://www.anthropic.com/team
https://www.anthropic.com/enterprise
https://www.anthropic.com/pricing
https://claude.ai/download
https://claude.ai/
https://www.anthropic.com/news/claude-character
https://www.anthropic.com/api
https://docs.anthropic.com/
https://www.anthropic.com/pricing#api
https://console.anthropic.com/
https://docs.anthropic.com/en/docs/welcome
https://www.anthropic.com/solutions/agents
https://www.anthropic.com/solutions/code-modernization
https://www.anthropic.com/solutions/coding
https://www.anthropic.com/solutions/customer-support
htt

In [42]:
def messages_for(website):
    """Assemble the chat messages (system, then user) for the link-selection request.

    Args:
        website: Scraped page passed through to `get_links_user_prompt`.

    Returns:
        list[dict]: Two messages with "role" and "content" keys.
    """
    system_message = {"role": "system", "content": link_system_prompt}
    user_message = {"role": "user", "content": get_links_user_prompt(website)}
    return [system_message, user_message]

In [33]:
# Inspect the complete message list that would be sent for the page in `ed`.
messages_for(ed)

[{'role': 'system',
  'content': 'You are provided with a list of links found on a webpage. You are able to decide which of the links would be most relevant to include in a brochure about the company, such as links to an About page, or a Company page, or Careers/Jobs pages.\nYou should respond in JSON as in this example:\n{\n    "links": [\n        {"type": "about page", "url": "https://full.url/goes/here/about"},\n        {"type": "careers page", "url": "https://another.full.url/careers"}\n    ]\n}\n'},
 {'role': 'user',
  'content': 'Here is the list of links on the website of https://anthropic.com - please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. Do not include Terms of Service, Privacy, email links.\nLinks (some might be relative links):\n#main\n#footer\nhttps://www.anthropic.com/\nhttps://www.anthropic.com/claude\nhttps://www.anthropic.com/claude-code\nhttps://www.anthropic.com/max\nhttps://www.a

In [64]:
# OpenAI-compatible client pointed at a local Ollama server; the api_key is
# just a placeholder string here.
ollama_via_openai = OpenAI(base_url='http://localhost:11434/v1', api_key='ollama')


def get_links(url, model="llama3.2", client=None):
    """Scrape `url` and ask a chat model which of its links belong in a brochure.

    Args:
        url: Address of the page whose links should be classified.
        model: Name of the chat model to query. Defaults to the local
            "llama3.2" model.
        client: OpenAI-compatible client to send the request with. Defaults
            to the module-level `ollama_via_openai` client. The author notes
            results are more accurate with GPT; to try that, pass a hosted
            client and model, e.g. `client=OpenAI(), model='gpt-4o-mini'`.

    Returns:
        dict: The parsed JSON reply. The "links" key is always present and is
        a list of dicts that each carry a non-empty string "url"; anything
        else the model put under "links" is dropped.

    Raises:
        json.JSONDecodeError: If the model's reply is not valid JSON.
    """
    if client is None:
        client = ollama_via_openai

    website = Website(url)
    response = client.chat.completions.create(
        model=model,
        messages=messages_for(website),
        response_format={"type": "json_object"},
    )

    # The message content may be None/empty; treat that as "no links"
    # instead of letting json.loads fail with a TypeError.
    content = response.choices[0].message.content
    parsed = json.loads(content) if content else {}
    if not isinstance(parsed, dict):
        parsed = {}

    # Smaller models do not always honour the requested schema, so normalise
    # the result to the shape callers index into: {"links": [{"url": ...}, ...]}.
    raw_links = parsed.get("links")
    if not isinstance(raw_links, list):
        raw_links = []
    parsed["links"] = [
        entry
        for entry in raw_links
        if isinstance(entry, dict) and isinstance(entry.get("url"), str) and entry["url"]
    ]
    return parsed

In [65]:
# Scrape the Anthropic landing page again and display the links found on it.
# NOTE(review): the variable is named `huggingface` but holds the Anthropic
# site, and this repeats the earlier `ed` cell — consider renaming or removing.
huggingface = Website("https://anthropic.com")
huggingface.links
# get_links_user_prompt(huggingface)

['#main',
 '#footer',
 'https://www.anthropic.com/',
 'https://www.anthropic.com/claude',
 'https://www.anthropic.com/claude-code',
 'https://www.anthropic.com/max',
 'https://www.anthropic.com/team',
 'https://www.anthropic.com/enterprise',
 'https://www.anthropic.com/pricing',
 'https://claude.ai/download',
 'https://claude.ai/',
 'https://www.anthropic.com/news/claude-character',
 'https://www.anthropic.com/api',
 'https://docs.anthropic.com/',
 'https://www.anthropic.com/pricing#api',
 'https://console.anthropic.com/',
 'https://docs.anthropic.com/en/docs/welcome',
 'https://www.anthropic.com/solutions/agents',
 'https://www.anthropic.com/solutions/code-modernization',
 'https://www.anthropic.com/solutions/coding',
 'https://www.anthropic.com/solutions/customer-support',
 'https://www.anthropic.com/solutions/education',
 'https://www.anthropic.com/solutions/financial-services',
 'https://www.anthropic.com/solutions/government',
 'https://www.anthropic.com/customers',
 'https://www.

In [66]:
# End-to-end check: scrape the page and ask the model which links are relevant.
get_links("https://anthropic.com")

{'links': [{'type': 'Company information', 'url': 'https://www.anthropic.com'},
  {'type': 'Team information', 'url': 'https://www.anthropic.com/team'},
  {'type': "About founders (Claude's page)",
   'url': 'https://www.anthropic.com/claude'},
  {'type': 'Company research and publications',
   'url': 'https://www.anthropic.com/research'},
  {'type': 'Economic index and analysis',
   'url': 'https://www.anthropic.com/economic-index'},
  {'type': "Claude's personal projects and writings",
   'url': 'https://claude.ai/'},
  {'type': 'API documentation and pricing',
   'url': 'https://docs.anthropic.com/'},
  {'type': 'Console for managing Anthropic services',
   'url': 'https://console.anthropic.com/'},
  {'type': 'News and blog articles', 'url': 'https://www.anthropic.com/news'},
  {'type': 'Careers and job information',
   'url': 'https://www.anthropic.com/careers'},
  {'type': 'Jobs page with a list of open positions',
   'url': 'https://x.com/AnthropicAI'},
  {'type': 'Contact sales 

Second step: make the brochure!

In [67]:
def get_all_details(url):
    """Collect the text of a landing page plus every link the model selected.

    Args:
        url: Address of the company's landing page.

    Returns:
        str: The landing page contents followed by one section per selected
        link, each introduced by the link's "type" label.

    Links that cannot be fetched are reported and skipped, so a single broken
    or invented URL in the model's answer does not abort the whole run.
    """
    from urllib.parse import urljoin  # local import keeps this cell self-contained

    result = "Landing page:\n"
    result += Website(url).get_contents()
    # NOTE: get_links() scrapes `url` again, so the landing page is fetched twice.
    links = get_links(url)
    print("Found links:", links)

    for link in links.get("links") or []:
        # The model's JSON is not guaranteed to follow the schema; ignore
        # entries that do not provide a usable URL.
        if not isinstance(link, dict) or not link.get("url"):
            continue
        # Resolve relative links against the landing page; absolute URLs pass through.
        page_url = urljoin(url, link["url"])
        try:
            page = Website(page_url)
        except requests.RequestException as exc:
            print(f"Skipping {page_url}: {exc}")
            continue
        result += f"\n\n{link.get('type', 'related page')}\n"
        result += page.get_contents()
    return result

In [68]:
# Print the combined text of the landing page and each selected link
# (this performs several HTTP requests plus one model call).
print(get_all_details("https://anthropic.com"))

Found links: {'links': [{'type': 'company page', 'url': 'https://www.anthropic.com'}, {'type': 'about the company', 'url': 'https://www.anthropic.com/company'}, {'type': 'careers/jobs page', 'url': 'https://www.anthropic.com/careers'}, {'type': 'services/solutions page', 'url': 'https://www.anthropic.com/solutions/agents'}, {'type': 'research/reports page', 'url': 'https://www.anthropic.com/research'}, {'type': 'news blog page', 'url': 'https://www.anthropic.com/news'}, {'type': 'product/pricing page for Claude AI', 'url': 'https://claude.ai/login?redirect_uri=https%3A%2F%2Fla.du%2Flogin'}, {'type': 'product download page for Claude AI', 'url': 'https://claude.ai/download'}]}
Landing page:
webpage Title:
Home \ Anthropic
webpage Content:
Skip to main content
Skip to footer
Claude
Product
Claude
Claude Code
Plans
Max plan
Team plan
Enterprise plan
Explore pricing
Download apps
Claude log in
News
Claude’s Character
API
Build with Claude
API overview
Developer docs
Explore pricing
Console