In [6]:
import os
import requests
import json
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from openai import OpenAI
from IPython.display import Markdown, display, update_display

In [3]:
load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')

if api_key and api_key.startswith('sk-proj-') and len(api_key)>10:
    print("API key looks good so far")
else:
    print("There might be a problem with your API key? Please visit the troubleshooting notebook!")
    
MODEL = 'gpt-4o-mini'
openai = OpenAI()

API key looks good so far


In [4]:
# A class to represent a Webpage

# Some websites need you to use proper headers when fetching them:
headers = {
 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

class Website:
    """
    A utility class to represent a Website that we have scraped, now with links
    """

    def __init__(self, url):
        self.url = url
        response = requests.get(url, headers=headers)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')
        self.title = soup.title.string if soup.title else "No title found"
        if soup.body:
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""
        links = [link.get('href') for link in soup.find_all('a')]
        self.links = [link for link in links if link]

    def get_contents(self):
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

In [8]:
link_system_prompt = "You are provided with a list of links found on webpage. \
You are able to decide which of the links would be most relevant to include in a brochure about the company, \
such as links to an about page, or a Company page, Careers/jobs pages\n"

link_system_prompt += "You should respond in JSON as in this example:"

link_system_prompt += """
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page": "url": "https://another.full.url/careers"}
    ]
}
"""


In [9]:
print(link_system_prompt)

You are provided with a list of links found on webpage. You are able to decide which of the links would be most relevant to include in a brochure about the company, such as links to an about page, or a Company page, Careers/jobs pages
You should respond in JSON as in this example:
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page": "url": "https://another.full.url/careers"}
    ]
}



In [10]:
def get_links_user_prompt(website):
    user_prompt = f"Here is the list of the links on the website of {website.url}"
    user_prompt += "please decide which of these are relevant web links for a brochre about the company, respond with the full https URL in JSON format. \
        Do not include Terms of service, Privacy, email links.\n"
    user_prompt += "Links (some might be rellative links):\n"
    user_prompt += "\n".join(website.links)
    return user_prompt

In [11]:
print(get_links_user_prompt(ed))

Here is the list of the links on the website of https://edwarddonner.complease decide which of these are relevant web links for a brochre about the company, respond with the full https URL in JSON format.         Do not include Terms of service, Privacy, email links.
Links (some might be rellative links):
https://edwarddonner.com/
https://edwarddonner.com/outsmart/
https://edwarddonner.com/about-me-and-about-nebula/
https://edwarddonner.com/posts/
https://edwarddonner.com/
https://news.ycombinator.com
https://nebula.io/?utm_source=ed&utm_medium=referral
https://www.prnewswire.com/news-releases/wynden-stark-group-acquires-nyc-venture-backed-tech-startup-untapt-301269512.html
https://patents.google.com/patent/US20210049536A1/
https://www.linkedin.com/in/eddonner/
https://edwarddonner.com/2025/01/23/llm-workshop-hands-on-with-agents-resources/
https://edwarddonner.com/2025/01/23/llm-workshop-hands-on-with-agents-resources/
https://edwarddonner.com/2024/12/21/llm-resources-superdatascience

In [14]:
def get_links(url):
    website = Website(url)
    response = openai.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": link_system_prompt},
            {"role": "user", "content": get_links_user_prompt(website)}
        ],
        response_format= {"type": "json_object"}
    )
    result = response.choices[0].message.content
    return json.loads(result)

In [15]:
huggingface = Website("https://huggingface.co")
huggingface.links

['/',
 '/models',
 '/datasets',
 '/spaces',
 '/posts',
 '/docs',
 '/enterprise',
 '/pricing',
 '/login',
 '/join',
 '/blog/inference-providers',
 '/deepseek-ai/DeepSeek-R1',
 '/deepseek-ai/Janus-Pro-7B',
 '/mistralai/Mistral-Small-24B-Instruct-2501',
 '/deepseek-ai/DeepSeek-V3',
 '/hexgrad/Kokoro-82M',
 '/models',
 '/spaces/deepseek-ai/Janus-Pro-7B',
 '/spaces/tencent/Hunyuan3D-2',
 '/spaces/lllyasviel/iclight-v2',
 '/spaces/Qwen/Qwen2.5-Max-Demo',
 '/spaces/ReverseImageSearch/Reverse-Image-Search2',
 '/spaces',
 '/datasets/open-thoughts/OpenThoughts-114k',
 '/datasets/fka/awesome-chatgpt-prompts',
 '/datasets/cognitivecomputations/dolphin-r1',
 '/datasets/ServiceNow-AI/R1-Distill-SFT',
 '/datasets/bespokelabs/Bespoke-Stratos-17k',
 '/datasets',
 '/join',
 '/pricing#endpoints',
 '/pricing#spaces',
 '/pricing',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/allenai',
 '/facebook',
 '/amazon',
 '/google',
 '/Intel',
 '/m

In [16]:
get_links("https://huggingface.co")

{'links': [{'type': 'about page', 'url': 'https://huggingface.co/huggingface'},
  {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'},
  {'type': 'enterprise page', 'url': 'https://huggingface.co/enterprise'},
  {'type': 'pricing page', 'url': 'https://huggingface.co/pricing'},
  {'type': 'blog page', 'url': 'https://huggingface.co/blog'},
  {'type': 'documentation page', 'url': 'https://huggingface.co/docs'}]}

In [17]:
def get_all_details(url):
    result = "Landing page:\n"
    result += Website(url).get_contents()
    links = get_links(url)
    print("Found links:", links)
    for link in links["links"]:
        result += f"\n\n{link['type']}\n"
        result += Website(link["url"]).get_contents()
    return result

In [19]:
# print(get_all_details("https://huggingface.co"))

In [20]:
system_prompt = "You are an assistant that analyzes the contents of several relevant pages from a company website \
and creates a short brochure about the company for prospective customers, investors and recruits. Respond in markdown.\
Include details of company culture, customers and careers/jobs if you have the information."

def get_brochure_user_prompt(company_name, url):
    user_prompt = f"You are looking at a company called: {company_name}\n"
    user_prompt += f"Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\n"
    user_prompt += get_all_details(url)
    user_prompt = user_prompt[:5_000] # Truncate if more than 5,000 characters
    return user_prompt

In [21]:
get_brochure_user_prompt("HuggingFace", "https://huggingface.co")

Found links: {'links': [{'type': 'about page', 'url': 'https://huggingface.co/huggingface'}, {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'}, {'type': 'company page', 'url': 'https://www.linkedin.com/company/huggingface/'}, {'type': 'pricing page', 'url': 'https://huggingface.co/pricing'}, {'type': 'enterprise page', 'url': 'https://huggingface.co/enterprise'}, {'type': 'blog', 'url': 'https://huggingface.co/blog'}]}


'You are looking at a company called: HuggingFace\nHere are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\nLanding page:\nWebpage Title:\nHugging Face – The AI community building the future.\nWebpage Contents:\nHugging Face\nModels\nDatasets\nSpaces\nPosts\nDocs\nEnterprise\nPricing\nLog In\nSign Up\nNEW\nWelcome to Inference Providers on the Hub 🔥\nsmolagents - a smol library to build great agents\nUse models from the HF Hub in LM Studio\nThe AI community building the future.\nThe platform where the machine learning community collaborates on models, datasets, and applications.\nTrending on\nthis week\nModels\ndeepseek-ai/DeepSeek-R1\nUpdated\n6 days ago\n•\n1.54M\n•\n7.28k\ndeepseek-ai/Janus-Pro-7B\nUpdated\n6 days ago\n•\n223k\n•\n2.67k\nmistralai/Mistral-Small-24B-Instruct-2501\nUpdated\n4 days ago\n•\n30.1k\n•\n634\ndeepseek-ai/DeepSeek-V3\nUpdated\n13 days ago\n•\n1.05M\n•\n3.22k\nhexgrad/Kokor

In [25]:
def create_brochure(company_name, url):
    response = openai.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": get_brochure_user_prompt(company_name, url)}
          ],
    )
    result = response.choices[0].message.content
    display(Markdown(result))

In [26]:
create_brochure("HuggingFace", "https://huggingface.co")

Found links: {'links': [{'type': 'about page', 'url': 'https://huggingface.co/huggingface'}, {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'}, {'type': 'enterprise page', 'url': 'https://huggingface.co/enterprise'}, {'type': 'pricing page', 'url': 'https://huggingface.co/pricing'}, {'type': 'blog', 'url': 'https://huggingface.co/blog'}, {'type': 'community discussions', 'url': 'https://discuss.huggingface.co'}, {'type': 'GitHub', 'url': 'https://github.com/huggingface'}, {'type': 'LinkedIn', 'url': 'https://www.linkedin.com/company/huggingface/'}]}


# Hugging Face Brochure

### Welcome to Hugging Face – The AI Community Building the Future!

#### **Company Overview**
Hugging Face is a pioneering company at the forefront of artificial intelligence and machine learning innovation. Our mission is to create a platform where the machine learning community can collaborate, share, and build models, datasets, and applications that will influence the future of technology. With over 1 million models and 250,000 datasets available, Hugging Face is the go-to hub for AI enthusiasts, researchers, and enterprises alike.

---

#### **Our Offerings**

- **Models**: Explore and utilize a vast selection of state-of-the-art machine learning models for various tasks including NLP, computer vision, and more.
- **Datasets**: Access extensive datasets tailored for a range of AI and machine learning projects.
- **Spaces**: Host and collaborate on ML projects effortlessly with our integrated tools.
- **Enterprise Solutions**: Tailored services to support businesses with advanced security and dedicated support.

---

#### **Why Choose Hugging Face?**

- **Collaborative Community**: Join our thriving community of over 50,000 organizations including industry leaders like Google, Amazon, and Meta.
- **Open Source Commitment**: We champion open source, providing various ML tools and a collaborative environment to help drive innovation in the AI landscape.
- **Diverse Applications**: Whether you're interested in text generation, audio processing, or image analysis, we support a myriad of applications across all modalities, including text, image, video, audio, and 3D.

---

#### **Company Culture**
At Hugging Face, we foster a culture of collaboration, creativity, and innovation. Our team is composed of passionate individuals from diverse backgrounds who come together to tackle some of the biggest challenges in AI. We believe in supporting one another through collaborative projects and knowledge sharing, fostering an inclusive environment where everyone can thrive.

---

#### **Careers at Hugging Face**
We are always on the lookout for talented individuals who are passionate about artificial intelligence and machine learning. Join us to work in an engaging environment that values innovation and creativity. Check out our [Careers Page](https://huggingface.co/jobs) for current job openings and become a part of our dynamic team!

---

#### **Join Us!**
Become part of an exciting community that’s shaping the future of AI. Whether you're a researcher, an industry professional, or simply an enthusiast, Hugging Face welcomes you to collaborate and innovate.

---

**Connect with Us:**
- Website: [Hugging Face](https://huggingface.co)
- Follow us on [Twitter](https://twitter.com/huggingface), [LinkedIn](https://linkedin.com/company/hugging-face), and [Discord](https://discord.com/invite/huggingface).

---

### Hugging Face – Let's Build the Future Together!