Create a product that builds a Brochure for a company to be used for prospective clients, investors and potential recruits.


In [2]:
# imports
# If these fail, please check you're running from an 'activated' environment with (llms) in the command prompt

import os
import requests
import json
from typing import List
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display, update_display
from openai import OpenAI

In [3]:
# Load environment variables, sanity-check the API key, and create the client

load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')

# Only the key's presence, prefix and length are inspected here.
_key_looks_ok = bool(api_key) and api_key.startswith('sk-proj-') and len(api_key) > 10
print(
    "API key looks good so far"
    if _key_looks_ok
    else "There might be a problem with your API key? Please visit the troubleshooting notebook!"
)

MODEL = 'gpt-4o-mini'
openai = OpenAI()

API key looks good so far


In [4]:
# A class to represent a Webpage

# Some websites need you to use proper headers when fetching them:
headers = {
 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

class Website:
    """
    A scraped web page: its title, visible body text and the hrefs of its links.

    Attributes set by the constructor:
        url:   the address that was fetched
        body:  raw response bytes
        title: text of the <title> tag, or "No title found"
        text:  body text with script/style/img/input elements removed
        links: every non-empty href found on an <a> tag, in document order
    """

    def __init__(self, url, timeout=10):
        """
        Fetch `url` and parse it.

        Args:
            url: address of the page to download.
            timeout: seconds to wait for the server before requests gives up;
                without one a stalled server would block the notebook forever.

        Raises:
            requests.RequestException: if the page cannot be fetched.
        """
        self.url = url
        response = requests.get(url, headers=headers, timeout=timeout)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')
        # soup.title.string is None when <title> is empty or has nested tags,
        # so fall back to the placeholder in that case as well.
        title = soup.title.string if soup.title else None
        self.title = title if title else "No title found"
        if soup.body:
            # Drop elements that carry no readable content before extracting text.
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""
        hrefs = (anchor.get('href') for anchor in soup.find_all('a'))
        self.links = [href for href in hrefs if href]

    def get_contents(self):
        """Return the title and text formatted as a block for inclusion in a prompt."""
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

Filter the relevant links using GPT-4o mini

In [6]:
# System prompt for the link-selection call. The example below shows the model
# the exact reply shape we parse, so it must itself be valid JSON.
link_system_prompt = "You are provided with a list of links found on a webpage. \
You are able to decide which of the links would be most relevant to include in a brochure about the company, \
such as links to an About page, or a Company page, or Careers/Jobs pages.\n"
link_system_prompt += "You should respond in JSON as in this example:"
link_system_prompt += """
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
}
"""

In [None]:
def get_links_user_prompt(website):
    """
    Build the user prompt asking the model to pick brochure-relevant links.

    Args:
        website: object exposing `url` (str) and `links` (iterable of href strings).

    Returns:
        The prompt text: instructions, a JSON example of the expected reply,
        then the page's links one per line (duplicates removed, order kept).
    """
    user_prompt = f"Here is the list of links on the website of {website.url} - "
    user_prompt += "please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. \
Do not include Terms of Service, Privacy, email links.\n"
    user_prompt += "You should respond in JSON as in this example:"
    user_prompt += """
    {
        "links": [
            {"type": "about page", "url": "https://full.url/goes/here/about"},
            {"type": "careers page", "url": "https://another.full.url/careers"},
            {"type": "contact page", "url": "https://full.url/goes/here/contact"},
            {"type": "services page", "url": "https://full.url/goes/here/services"},
            {"type": "blog page", "url": "https://full.url/goes/here/blog"}
        ]
    }
"""
    # The newline closing the example above keeps this heading on its own line.
    user_prompt += "Links (some might be relative links):\n"
    # Navigation menus repeat the same href many times; send each one once.
    unique_links = list(dict.fromkeys(website.links))
    user_prompt += "\n".join(unique_links)
    return user_prompt

In [8]:
def get_links(url):
    """
    Scrape the page at `url` and ask the model which of its links belong in a brochure.

    The request asks for a JSON object reply; the reply text is parsed and returned.
    """
    page = Website(url)
    conversation = [
        {"role": "system", "content": link_system_prompt},
        {"role": "user", "content": get_links_user_prompt(page)},
    ]
    completion = openai.chat.completions.create(
        model=MODEL,
        messages=conversation,
        response_format={"type": "json_object"},
    )
    reply_text = completion.choices[0].message.content
    return json.loads(reply_text)

In [9]:
# Anthropic has made their site harder to scrape, so I'm using HuggingFace..
# Scrape the landing page and show every href found on it; relative links
# are kept as found.

huggingface = Website("https://huggingface.co")
huggingface.links

['/',
 '/models',
 '/datasets',
 '/spaces',
 '/posts',
 '/docs',
 '/enterprise',
 '/pricing',
 '/login',
 '/join',
 '/blog/fireworks-ai',
 '/spaces',
 '/models',
 '/deepseek-ai/DeepSeek-R1',
 '/Zyphra/Zonos-v0.1-hybrid',
 '/agentica-org/DeepScaleR-1.5B-Preview',
 '/microsoft/OmniParser-v2.0',
 '/Zyphra/Zonos-v0.1-transformer',
 '/models',
 '/spaces/lllyasviel/LuminaBrush',
 '/spaces/black-forest-labs/FLUX.1-dev',
 '/spaces/m-ric/open_Deep-Research',
 '/spaces/tencent/Hunyuan3D-2',
 '/spaces/agents-course/First_agent_template',
 '/spaces',
 '/datasets/open-r1/OpenR1-Math-220k',
 '/datasets/open-thoughts/OpenThoughts-114k',
 '/datasets/saiyan-world/Goku-MovieGenBench',
 '/datasets/Anthropic/EconomicIndex',
 '/datasets/fka/awesome-chatgpt-prompts',
 '/datasets',
 '/join',
 '/pricing#endpoints',
 '/pricing#spaces',
 '/pricing',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/allenai',
 '/facebook',
 '/amazon',
 '/google',
 

In [10]:
# Live call: scrapes the page, then asks the model to choose the brochure-worthy links.
get_links("https://huggingface.co")

{'links': [{'type': 'about page', 'url': 'https://huggingface.co/about'},
  {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'},
  {'type': 'enterprise page', 'url': 'https://huggingface.co/enterprise'},
  {'type': 'pricing page', 'url': 'https://huggingface.co/pricing'},
  {'type': 'blog page', 'url': 'https://huggingface.co/blog'},
  {'type': 'company page', 'url': 'https://huggingface.co/huggingface'},
  {'type': 'community page', 'url': 'https://discuss.huggingface.co'},
  {'type': 'social media page', 'url': 'https://twitter.com/huggingface'},
  {'type': 'linkedin page',
   'url': 'https://www.linkedin.com/company/huggingface/'}]}

### Creating the brochure using an LLM

In [11]:
def get_all_details(url):
    """
    Collect the text of the landing page and of every link the model selected.

    Args:
        url: address of the company's landing page.

    Returns:
        One string: the landing page contents followed by a section per
        selected link, each headed by the link's type.

    Pages that cannot be fetched are reported and skipped so that a single
    bad link does not abort the whole run.
    """
    from urllib.parse import urljoin

    result = "Landing page:\n"
    result += Website(url).get_contents()
    links = get_links(url)
    print("Found links:", links)
    # The model's reply is free-form JSON; tolerate a missing "links" key.
    for link in links.get("links", []):
        # The prompt asks for full URLs, but the input list contains relative
        # hrefs; resolve against the landing page (absolute URLs pass through).
        page_url = urljoin(url, link["url"])
        try:
            page = Website(page_url)
        except requests.RequestException as error:
            print(f"Skipping {page_url}: {error}")
            continue
        result += f"\n\n{link['type']}\n"
        result += page.get_contents()
    return result

In [12]:
# Live call: prints the landing page text plus the text of each page the model selected.
print(get_all_details("https://huggingface.co"))

Found links: {'links': [{'type': 'about page', 'url': 'https://huggingface.co/about'}, {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'}, {'type': 'enterprise page', 'url': 'https://huggingface.co/enterprise'}, {'type': 'pricing page', 'url': 'https://huggingface.co/pricing'}, {'type': 'blog page', 'url': 'https://huggingface.co/blog'}, {'type': 'community discussion page', 'url': 'https://discuss.huggingface.co'}, {'type': 'GitHub page', 'url': 'https://github.com/huggingface'}, {'type': 'Twitter page', 'url': 'https://twitter.com/huggingface'}, {'type': 'LinkedIn page', 'url': 'https://www.linkedin.com/company/huggingface/'}]}
Landing page:
Webpage Title:
Hugging Face – The AI community building the future.
Webpage Contents:
Hugging Face
Models
Datasets
Spaces
Posts
Docs
Enterprise
Pricing
Log In
Sign Up
NEW
Welcome Fireworks.ai on the Hub 🎆
Welcome to Inference Providers on the Hub 🔥
smolagents - a smol library to build great agents
The AI community 

In [13]:
# System prompt for the brochure-writing call. Each continued line ends with a
# space before the backslash so sentences do not run together when joined.
system_prompt = "You are an assistant that analyzes the contents of several relevant pages from a company website \
and creates a short brochure about the company for prospective customers, investors and recruits. Respond in markdown. \
Include details of company culture, customers and careers/jobs if you have the information."


In [15]:
def get_brochure_user_prompt(company_name, url, max_chars=5_000):
    """
    Build the user prompt containing the scraped site contents for the brochure call.

    Args:
        company_name: name to present to the model.
        url: address of the company's landing page.
        max_chars: upper bound on the prompt length in characters; anything
            beyond it is cut off. Pass None to disable truncation.

    Returns:
        The prompt text, at most `max_chars` characters long.
    """
    user_prompt = f"You are looking at a company called: {company_name}\n"
    user_prompt += "Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\n"
    user_prompt += get_all_details(url)
    # Truncate so an oversized site does not blow up the request size.
    return user_prompt[:max_chars]

In [16]:
# Preview the user prompt that will be sent for the brochure (makes live scraping and model calls).
get_brochure_user_prompt("HuggingFace", "https://huggingface.co")

Found links: {'links': [{'type': 'about page', 'url': 'https://huggingface.co/huggingface'}, {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'}, {'type': 'enterprise page', 'url': 'https://huggingface.co/enterprise'}, {'type': 'pricing page', 'url': 'https://huggingface.co/pricing'}, {'type': 'blog page', 'url': 'https://huggingface.co/blog'}, {'type': 'documentation page', 'url': 'https://huggingface.co/docs'}, {'type': 'community page', 'url': 'https://discuss.huggingface.co'}, {'type': 'GitHub page', 'url': 'https://github.com/huggingface'}, {'type': 'LinkedIn page', 'url': 'https://www.linkedin.com/company/huggingface/'}, {'type': 'Twitter page', 'url': 'https://twitter.com/huggingface'}]}


"You are looking at a company called: HuggingFace\nHere are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\nLanding page:\nWebpage Title:\nHugging Face ‚Äì The AI community building the future.\nWebpage Contents:\nHugging Face\nModels\nDatasets\nSpaces\nPosts\nDocs\nEnterprise\nPricing\nLog In\nSign Up\nNEW\nWelcome Fireworks.ai on the Hub üéÜ\nWelcome to Inference Providers on the Hub üî•\nsmolagents - a smol library to build great agents\nThe AI community building the future.\nThe platform where the machine learning community collaborates on models, datasets, and applications.\nExplore AI Apps\nor\nBrowse 1M+ models\nTrending on\nthis week\nModels\ndeepseek-ai/DeepSeek-R1\nUpdated\n9 days ago\n‚Ä¢\n4.06M\n‚Ä¢\n9.29k\nZyphra/Zonos-v0.1-hybrid\nUpdated\n2 days ago\n‚Ä¢\n14.6k\n‚Ä¢\n878\nagentica-org/DeepScaleR-1.5B-Preview\nUpdated\n7 days ago\n‚Ä¢\n9.68k\n‚Ä¢\n402\nmicrosoft/OmniParser-v2.0\nUpdat

In [17]:
def create_brochure(company_name, url):
    """
    Generate a brochure for the company and render it in the notebook.

    Sends the system prompt and the scraped-content user prompt to the chat
    model, then displays the reply as Markdown. Nothing is returned.
    """
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": get_brochure_user_prompt(company_name, url)},
    ]
    completion = openai.chat.completions.create(model=MODEL, messages=conversation)
    brochure_markdown = completion.choices[0].message.content
    display(Markdown(brochure_markdown))

In [18]:
# Generate the brochure and render it as Markdown below.
create_brochure("HuggingFace", "https://huggingface.co")

Found links: {'links': [{'type': 'about page', 'url': 'https://huggingface.co/'}, {'type': 'models page', 'url': 'https://huggingface.co/models'}, {'type': 'datasets page', 'url': 'https://huggingface.co/datasets'}, {'type': 'spaces page', 'url': 'https://huggingface.co/spaces'}, {'type': 'enterprise page', 'url': 'https://huggingface.co/enterprise'}, {'type': 'pricing page', 'url': 'https://huggingface.co/pricing'}, {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'}, {'type': 'blog page', 'url': 'https://huggingface.co/blog'}, {'type': 'community page', 'url': 'https://discuss.huggingface.co'}, {'type': 'GitHub page', 'url': 'https://github.com/huggingface'}, {'type': 'Twitter page', 'url': 'https://twitter.com/huggingface'}, {'type': 'LinkedIn page', 'url': 'https://www.linkedin.com/company/huggingface/'}]}


# Hugging Face Brochure

---

## Company Overview

**Welcome to Hugging Face**  
Hugging Face is an innovative platform at the forefront of artificial intelligence, where the machine learning community comes together to create, discover, and collaborate on models, datasets, and applications. Our mission is to empower developers and researchers to build the future of AI with open-source tools and cutting-edge solutions.

---

## Our Offerings

- **Models**: Explore over **1 million machine learning models** including state-of-the-art transformers, diffusion models, and more tailored for various applications such as text, image, and audio processing.
  
- **Datasets**: Access and contribute to a rich library of **over 250,000 datasets**, helping fuel the growth of research and development in AI.

- **Spaces**: Host and collaborate on projects using **over 400,000 applications** which provide environments for developing AI solutions.

- **Enterprise Solutions**: Access premium services tailored for over 50,000 organizations with features such as advanced security, dedicated support, and customized infrastructure.

---

## Company Culture

At Hugging Face, we foster a vibrant community built on collaboration and openness. Our culture encourages innovation through shared knowledge and collective growth, drawing participation from various stakeholders ranging from individual developers to major enterprises like Google, Amazon, and Microsoft.

### Key Cultural Values:
- **Collaboration**: We believe in the power of community. Our platform encourages contributions from everyone, making AI accessible and collaborative.
- **Innovation**: We're at the cutting edge of machine learning technologies and promote continuous learning and improvement.
- **Inclusivity**: Hugging Face values diverse perspectives and embraces a culture of acceptance, allowing everyone to contribute to and benefit from our advancements.

---

## Our Customers

We serve a wide array of customers, from enthusiastic hobbyists to industry giants. Notable organizations leveraging Hugging Face technology include:
- **Meta**
- **Google**
- **Amazon Web Services**
- **Microsoft**

Over **50,000 organizations** use our platform, showcasing its essential role in the global AI ecosystem.

---

## Careers at Hugging Face

Join a community of innovators at Hugging Face! We are constantly on the lookout for passionate individuals to help us push the boundaries of machine learning. Whether you are a developer, researcher, or business professional, we offer exciting careers in various roles including:

- Data Scientists
- Software Engineers
- Business Development
- Community Managers

### Why Work With Us?
- **Cutting-Edge Projects**: Work on groundbreaking AI solutions.
- **Flexible Work Environment**: Enjoy the flexibility of remote work, promoting a balanced lifestyle.
- **Growth Opportunities**: Engage with an environment that supports professional development and continuous learning.

---

## Join Us!

Ready to be part of an extraordinary journey in AI?  
**[Sign Up](#)** or **[Explore Career Opportunities](#)** at Hugging Face today!

---

*Hugging Face – Building the future of AI, together.*  

---