In [1]:
import os
import requests
import json
from typing import List
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display, update_display
from openai import OpenAI

In [47]:
load_dotenv(override = True)
api_key = os.getenv("OPENAI_API_KEY")

if api_key and api_key.startswith('sk-proj-') and len(api_key)>10:
    print("API key looks good so far")
else:
    print("There might be a problem with your API key? Please visit the troubleshooting notebook!")

MODEL = 'gpt-4o-mini'
openai = OpenAI()


API key looks good so far


In [48]:
# A class to represent a Webpage

# Some websites need you to use proper headers when fetching them:
headers = {
 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

class Website:
    """
    A utility class to represent a Website that we have scraped, now with links
    """

    def __init__(self, url):
        self.url = url
        response = requests.get(url, headers=headers)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')
        self.title = soup.title.string if soup.title else "No title found"
        if soup.body:
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""
        links = [link.get('href') for link in soup.find_all('a')]
        self.links = [link.get('href') for link in soup.find_all('a') if link.get("href") ]

    def get_contents(self):
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

In [49]:
ed = Website("https://edwarddonner.com")
ed.links


['https://edwarddonner.com/',
 'https://edwarddonner.com/connect-four/',
 'https://edwarddonner.com/outsmart/',
 'https://edwarddonner.com/about-me-and-about-nebula/',
 'https://edwarddonner.com/posts/',
 'https://edwarddonner.com/',
 'https://news.ycombinator.com',
 'https://nebula.io/?utm_source=ed&utm_medium=referral',
 'https://www.prnewswire.com/news-releases/wynden-stark-group-acquires-nyc-venture-backed-tech-startup-untapt-301269512.html',
 'https://patents.google.com/patent/US20210049536A1/',
 'https://www.linkedin.com/in/eddonner/',
 'https://edwarddonner.com/2025/01/23/llm-workshop-hands-on-with-agents-resources/',
 'https://edwarddonner.com/2025/01/23/llm-workshop-hands-on-with-agents-resources/',
 'https://edwarddonner.com/2024/12/21/llm-resources-superdatascience/',
 'https://edwarddonner.com/2024/12/21/llm-resources-superdatascience/',
 'https://edwarddonner.com/2024/11/13/llm-engineering-resources/',
 'https://edwarddonner.com/2024/11/13/llm-engineering-resources/',
 'ht

In [50]:
link_system_prompt = "You are provided with a list of links found on a webpage. \
You are able to decide which of the links would be most relevant to include in a brochure about the company, \
such as links to an About page, or a Company page, or Careers/Jobs pages.\n"
link_system_prompt += "You should respond in JSON as in this example:"
link_system_prompt += """
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page": "url": "https://another.full.url/careers"}
    ]
}
"""

In [51]:
print (link_system_prompt)


You are provided with a list of links found on a webpage. You are able to decide which of the links would be most relevant to include in a brochure about the company, such as links to an About page, or a Company page, or Careers/Jobs pages.
You should respond in JSON as in this example:
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page": "url": "https://another.full.url/careers"}
    ]
}



In [52]:
def get_links_user_prompt(website):
    user_prompt = f"Here is the list of links on the website of {website.url} - "
    user_prompt += "please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. \
Do not include Terms of Service, Privacy, email links.\n"
    user_prompt += "Links (some might be relative links):\n"
    user_prompt += "\n".join(website.links)
    return user_prompt
    

In [53]:
print(get_links_user_prompt)





<function get_links_user_prompt at 0x0000020698526DE0>


In [57]:
def get_links(url):
    website = Website(url)
    response = openai.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": link_system_prompt},
            {"role": "user", "content": get_links_user_prompt(website)}
      ],
        response_format={"type": "json_object"}
    )
    result = response.choices[0].message.content
    return json.loads(result)

In [58]:
huggingface = Website("https://huggingface.co")
huggingface.links

['/',
 '/models',
 '/datasets',
 '/spaces',
 '/posts',
 '/docs',
 '/enterprise',
 '/pricing',
 '/login',
 '/join',
 '/spaces',
 '/models',
 '/Qwen/QwQ-32B',
 '/deepseek-ai/DeepSeek-R1',
 '/microsoft/Phi-4-multimodal-instruct',
 '/Wan-AI/Wan2.1-T2V-14B',
 '/CohereForAI/aya-vision-8b',
 '/models',
 '/spaces/Wan-AI/Wan2.1',
 '/spaces/nanotron/ultrascale-playbook',
 '/spaces/ASLP-lab/DiffRhythm',
 '/spaces/Qwen/QwQ-32B-Demo',
 '/spaces/black-forest-labs/FLUX.1-dev',
 '/spaces',
 '/datasets/facebook/natural_reasoning',
 '/datasets/Congliu/Chinese-DeepSeek-R1-Distill-data-110k',
 '/datasets/FreedomIntelligence/medical-o1-reasoning-SFT',
 '/datasets/GeneralReasoning/GeneralThought-195K',
 '/datasets/KodCode/KodCode-V1',
 '/datasets',
 '/join',
 '/pricing#endpoints',
 '/pricing#spaces',
 '/pricing',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/allenai',
 '/facebook',
 '/amazon',
 '/google',
 '/Intel',
 '/microsoft',
 '/gramm

In [59]:
get_links("https://huggingface.co")



{'links': [{'type': 'about page', 'url': 'https://huggingface.co/huggingface'},
  {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'},
  {'type': 'blog page', 'url': 'https://huggingface.co/blog'},
  {'type': 'company page',
   'url': 'https://www.linkedin.com/company/huggingface/'},
  {'type': 'community page', 'url': 'https://discuss.huggingface.co'}]}

SECOND STEP 

In [60]:
def get_all_details(url):
    result = "Landing page:\n"
    result += Website(url).get_contents()
    links = get_links(url)
    print("Found links:", links)
    for link in links["links"]:
        result += f"\n\n{link['type']}\n"
        result += Website(link["url"]).get_contents()
    return result

In [64]:
print(get_all_details("https://huggingface.co"))kshvCL


SyntaxError: invalid syntax (741817616.py, line 1)

In [63]:
system_prompt = "You are an assistant that analyzes the contents of several relevant pages from a company website \
and creates a short brochure about the company for prospective customers, investors and recruits. Respond in markdown.\
Include details of company culture, customers and careers/jobs if you have the information."

In [65]:
def get_brochure_user_prompt(company_name, url):
    user_prompt = f"You are looking at a company called: {company_name}\n"
    user_prompt += f"Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\n"
    user_prompt += get_all_details(url)
    user_prompt = user_prompt[:5_000] # Truncate if more than 5,000 characters
    return user_prompt
    

In [66]:
def create_brochure(company_name, url):
    response = openai.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": get_brochure_user_prompt(company_name, url)}
          ],
    )
    result = response.choices[0].message.content
    display(Markdown(result))

In [67]:
create_brochure("HuggingFace", "https://huggingface.co")


Found links: {'links': [{'type': 'about page', 'url': 'https://huggingface.co/huggingface'}, {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'}, {'type': 'enterprise page', 'url': 'https://huggingface.co/enterprise'}, {'type': 'pricing page', 'url': 'https://huggingface.co/pricing'}, {'type': 'blog page', 'url': 'https://huggingface.co/blog'}, {'type': 'community page', 'url': 'https://discuss.huggingface.co'}, {'type': 'github page', 'url': 'https://github.com/huggingface'}, {'type': 'twitter page', 'url': 'https://twitter.com/huggingface'}, {'type': 'linkedin page', 'url': 'https://www.linkedin.com/company/huggingface/'}]}


# Hugging Face Brochure

Welcome to Hugging Face, the AI community building the future. We are dedicated to creating a collaborative platform where machine learning enthusiasts, researchers, and organizations come together to innovate and advance the field of artificial intelligence.

---

## Our Mission

At Hugging Face, we strive to democratize AI technology, making it accessible for everyone. We believe in the power of community collaboration, fostering an environment where developers can share models, datasets, and applications.

---

## Community & Collaboration

- **1M+ Models**: Browse through a vast array of machine learning models across different domains including text, image, video, and audio.
- **250k+ Datasets**: Access and contribute to an extensive collection of datasets tailored for various machine learning tasks.
- **Spaces**: Collaborate with developers globally on innovative applications and projects utilizing state-of-the-art technology.

### Trending Models This Week:
- Qwen/QwQ-32B
- deepseek-ai/DeepSeek-R1
- microsoft/Phi-4-multimodal-instruct
- Wan-AI/Wan2.1-T2V-14B
- CohereForAI/aya-vision-8b

---

## Customers & Partnerships

More than **50,000 organizations** trust and use Hugging Face, including notable companies such as:
- Google
- Microsoft
- Amazon
- Grammarly
- Meta

---

## Careers at Hugging Face

Join us in shaping the future of AI! We are always on the lookout for passionate individuals who are eager to learn and grow within the field of machine learning and artificial intelligence. Embrace an environment that thrives on openness, innovation, and collaboration. 

**Current Opportunities:**
- Data Scientists
- Machine Learning Engineers
- Product Managers
- Community Managers

Explore our job openings and apply to be part of a vibrant and diverse team!

---

## Why Choose Hugging Face?

- **Open Source Commitment**: We are committed to building the foundation of ML tooling with the community. Our open-source projects include Transformers, Diffusers, and Safetensors.
- **Enterprise Solutions**: We offer paid compute and enterprise solutions to efficiently scale your AI projects with top-notch security and support.
- **Cutting-edge Tools**: Utilize our state-of-the-art tools and resources aimed at accelerating your machine learning projects.

---

Join us at Hugging Face and be a part of the revolution in artificial intelligence. Together, let's build a smarter future!

For more information, visit our website: [Hugging Face](https://huggingface.co) 

Connect with us:
- **GitHub**
- **Twitter**
- **LinkedIn**
- **Discord**

---

In [None]:
def stream_brochure(company_name, url):
    stream = openai.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": get_brochure_user_prompt(company_name, url)}
          ],
        stream=True
    )
    
    response = ""
    display_handle = display(Markdown(""), display_id=True)
    for chunk in stream:
        response += chunk.choices[0].delta.content or ''
        response = response.replace("```","").replace("markdown", "")
        update_display(Markdown(response), display_id=display_handle.display_id)


In [70]:
stream_brochure("HuggingFace", "https://huggingface.co")

Found links: {'links': [{'type': 'about page', 'url': 'https://huggingface.co/huggingface'}, {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'}, {'type': 'blog page', 'url': 'https://huggingface.co/blog'}, {'type': 'company page', 'url': 'https://www.linkedin.com/company/huggingface/'}]}


# Hugging Face Brochure

## **About Us**
Hugging Face is an innovative platform dedicated to powering the AI community and building the future of machine learning (ML). We offer a collaborative atmosphere where users can create, discover, and collaborate on a diverse array of models, datasets, and applications. With over 1 million models and 250,000 datasets available, we support a wide range of ML tasks across text, image, video, audio, and even 3D modalities.

## **Our Vision**
The mission of Hugging Face is to democratize machine learning by providing accessible tools and resources for everyone, from startups to large enterprises. We are committed to fostering an inclusive AI community that encourages collaboration and innovation.

## **Company Culture**
At Hugging Face, we embrace diversity, openness, and collaboration. Our culture is centered around the idea of community-driven development, and we believe that the best advancements in AI come from shared knowledge and collective effort. We prioritize creating a supportive environment for our employees, encouraging creativity, learning, and growth.

### **Core Values:**
- **Collaboration:** We thrive on collective innovation and share knowledge within our community.
- **Open Source:** Our tools and models are built with the idea that everyone should have access to the resources needed to advance AI.
- **Inclusivity:** We welcome individuals from all backgrounds and expertise levels.

## **Our Customers**
More than 50,000 organizations, including major players like Amazon, Google, Microsoft, and Meta, utilize Hugging Face’s offerings. From non-profits to tech giants, our platform supports a variety of users looking to leverage AI for their specific needs.

## **Careers at Hugging Face**
Hugging Face is continually on the lookout for passionate individuals who share our mission to democratize AI. We offer a range of career opportunities in various domains, including engineering, research, product management, and community engagement. 

### **Why Join Us?**
- **Impactful Work:** Be part of a team that is shaping the future of AI.
- **Flexible Environment:** We encourage a healthy work-life balance, allowing employees to perform at their best.
- **Community Engagement:** Collaborate with experts and enthusiasts alike in a vibrant and diverse community.

## **Get Involved**
Join the Hugging Face community today to explore models, collaborate on projects, and share your work. Whether you are looking to further your career in AI, find innovative solutions for your organization, or simply curious about machine learning, Hugging Face has something to offer for everyone.

---

For more information:
- **Website:** [Hugging Face](https://huggingface.co)
- **Follow Us:** [Twitter](https://twitter.com/huggingface), [LinkedIn](https://linkedin.com/company/huggingface), [Discord](https://discord.gg/huggingface)

Together, let’s build the future of AI!