In [81]:
import json
import os
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from IPython.display import display, Markdown, update_display
from openai import OpenAI

In [69]:
# Load environment variables (e.g. from a local .env file) into the process.
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")

# Fail fast with an actionable message instead of a late, less obvious client
# error when the key has not been configured.
if not api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set - add it to your .env file or environment "
        "before running this notebook."
    )

# Shared client and model name used by every API call below.
openai = OpenAI(api_key=api_key)
MODEL = "gpt-4o-mini"

In [37]:
# Browser-like User-Agent sent with every page request.
headers = {
 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

class Website:
    """A fetched web page reduced to its title, visible text and links.

    Attributes:
        url: The URL that was requested.
        body: Raw response bytes.
        title: Text of the <title> tag, or "No title found".
        text: Text of <body> with script/style/img/input elements removed,
            one fragment per line; "" when the page has no <body>.
        links: Non-empty href values of every <a> tag, in document order
            (may be relative, may contain duplicates).
    """

    def __init__(self, url, timeout=30):
        """Download and parse `url`.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds to wait for the server before giving up.
                Without a timeout a stalled server would block the kernel
                indefinitely.

        Raises:
            requests.RequestException: If the request fails or times out.
        """
        self.url = url
        response = requests.get(url, headers=headers, timeout=timeout)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')
        # `.string` is None when <title> is empty or contains nested markup,
        # so fall back in that case as well as when the tag is missing.
        title = soup.title.string if soup.title else None
        self.title = title if title else "No title found"
        if soup.body:
            # Drop elements that contribute no readable text.
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""
        links = [link.get('href') for link in soup.find_all("a", href=True)]
        # href="" passes the href=True filter; discard those empty values.
        self.links = [link for link in links if link]

    def get_contents(self):
        """Return the title and text formatted as a block for an LLM prompt."""
        return f"website title: {self.title}\nWebsite text: {self.text}\n\n"


In [45]:
# Fetch and parse the page once so the following cells can inspect it.
ed = Website("https://edwarddonner.com")

In [46]:
# Show the href values collected from the page's <a> tags.
ed.links

['https://edwarddonner.com/',
 'https://edwarddonner.com/outsmart/',
 'https://edwarddonner.com/about-me-and-about-nebula/',
 'https://edwarddonner.com/posts/',
 'https://edwarddonner.com/',
 'https://news.ycombinator.com',
 'https://nebula.io/?utm_source=ed&utm_medium=referral',
 'https://www.prnewswire.com/news-releases/wynden-stark-group-acquires-nyc-venture-backed-tech-startup-untapt-301269512.html',
 'https://patents.google.com/patent/US20210049536A1/',
 'https://www.linkedin.com/in/eddonner/',
 'https://edwarddonner.com/2024/12/21/llm-resources-superdatascience/',
 'https://edwarddonner.com/2024/12/21/llm-resources-superdatascience/',
 'https://edwarddonner.com/2024/11/13/llm-engineering-resources/',
 'https://edwarddonner.com/2024/11/13/llm-engineering-resources/',
 'https://edwarddonner.com/2024/10/16/from-software-engineer-to-ai-data-scientist-resources/',
 'https://edwarddonner.com/2024/10/16/from-software-engineer-to-ai-data-scientist-resources/',
 'https://edwarddonner.com/

In [41]:
# System prompt for the link-selection call: describes the task and shows the
# exact JSON shape the model should reply with.
link_system_prompt = (
    "You are provided with a list of links found on a webpage. "
    "You are able to decide which of the links would be most relevant to include in a brochure about the company, "
    "such as links to an About page, or a Company page, or Careers/Jobs pages.\n"
    "You should respond in JSON as in this example:"
    "\n"
    "{   \n"
    '    "links": [\n'
    '        {"type": "About page", "url": "https://full.url/goes/here/about"},\n'
    '        {"type": "Careers page", "url": "https://another.full.url/careers"}\n'
    "    ]\n"
    "}\n"
)

In [43]:
# Print the assembled prompt to check its final layout.
print(link_system_prompt)

You are provided with a list of links found on a webpage. You are able to decide which of the links would be most relevant to include in a brochure about the company, such as links to an About page, or a Company page, or Careers/Jobs pages.
You should respond in JSON as in this example:
{   
    "links": [
        {"type": "About page", "url": "https://full.url/goes/here/about"},
        {"type": "Careers page", "url": "https://another.full.url/careers"}
    ]
}



In [44]:
def get_links_user_prompt(website):
    """Build the user message asking the model to pick brochure-worthy links.

    Args:
        website: Object exposing `url` (str) and `links` (iterable of str).

    Returns:
        The prompt text: instructions followed by one link per line.
    """
    # Adjacent literals are used instead of a backslash continuation inside
    # the string, which used to leak the source indentation into the prompt.
    return (
        f"Here is the list of links on the website of {website.url} - "
        "please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. "
        "Do not include Terms of Service, Privacy, email links.\n"
        "Links (some might be relative links):\n"
        + "\n".join(website.links)
    )


In [47]:
# Bare expression: displays the prompt's repr, with newlines shown as \n.
get_links_user_prompt(ed)

'Here is the list of links on the website of https://edwarddonner.com - please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format.     Do not include Terms of Service, Privacy, email links.\nLinks (some might be relative links):\nhttps://edwarddonner.com/\nhttps://edwarddonner.com/outsmart/\nhttps://edwarddonner.com/about-me-and-about-nebula/\nhttps://edwarddonner.com/posts/\nhttps://edwarddonner.com/\nhttps://news.ycombinator.com\nhttps://nebula.io/?utm_source=ed&utm_medium=referral\nhttps://www.prnewswire.com/news-releases/wynden-stark-group-acquires-nyc-venture-backed-tech-startup-untapt-301269512.html\nhttps://patents.google.com/patent/US20210049536A1/\nhttps://www.linkedin.com/in/eddonner/\nhttps://edwarddonner.com/2024/12/21/llm-resources-superdatascience/\nhttps://edwarddonner.com/2024/12/21/llm-resources-superdatascience/\nhttps://edwarddonner.com/2024/11/13/llm-engineering-resources/\nhttps://edwarddonn

In [48]:
# Printed form: shows the prompt with its line breaks rendered.
print(get_links_user_prompt(ed))

Here is the list of links on the website of https://edwarddonner.com - please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format.     Do not include Terms of Service, Privacy, email links.
Links (some might be relative links):
https://edwarddonner.com/
https://edwarddonner.com/outsmart/
https://edwarddonner.com/about-me-and-about-nebula/
https://edwarddonner.com/posts/
https://edwarddonner.com/
https://news.ycombinator.com
https://nebula.io/?utm_source=ed&utm_medium=referral
https://www.prnewswire.com/news-releases/wynden-stark-group-acquires-nyc-venture-backed-tech-startup-untapt-301269512.html
https://patents.google.com/patent/US20210049536A1/
https://www.linkedin.com/in/eddonner/
https://edwarddonner.com/2024/12/21/llm-resources-superdatascience/
https://edwarddonner.com/2024/12/21/llm-resources-superdatascience/
https://edwarddonner.com/2024/11/13/llm-engineering-resources/
https://edwarddonner.com/2024/11/1

In [70]:
def get_links(url):
    """Ask the model which links on the page at `url` belong in a brochure.

    Fetches the page, sends its links to the chat model with a JSON-object
    response format, and returns the reply parsed from JSON.
    """
    page = Website(url)
    conversation = [
        {"role": "system", "content": link_system_prompt},
        {"role": "user", "content": get_links_user_prompt(page)},
    ]
    completion = openai.chat.completions.create(
        model=MODEL,
        messages=conversation,
        response_format={"type": "json_object"},
    )
    return json.loads(completion.choices[0].message.content)

In [71]:
# Fetch a second site and display its scraped hrefs.
huggingface = Website("https://huggingface.co")
huggingface.links

['/',
 '/models',
 '/datasets',
 '/spaces',
 '/posts',
 '/docs',
 '/enterprise',
 '/pricing',
 '/login',
 '/join',
 '/deepseek-ai/DeepSeek-R1',
 '/hexgrad/Kokoro-82M',
 '/openbmb/MiniCPM-o-2_6',
 '/MiniMaxAI/MiniMax-Text-01',
 '/deepseek-ai/DeepSeek-V3',
 '/models',
 '/spaces/hexgrad/Kokoro-TTS',
 '/spaces/JeffreyXiang/TRELLIS',
 '/spaces/lllyasviel/iclight-v2',
 '/spaces/Kwai-Kolors/Kolors-Virtual-Try-On',
 '/spaces/black-forest-labs/FLUX.1-dev',
 '/spaces',
 '/datasets/fka/awesome-chatgpt-prompts',
 '/datasets/NovaSky-AI/Sky-T1_data_17k',
 '/datasets/HumanLLMs/Human-Like-DPO-Dataset',
 '/datasets/DAMO-NLP-SG/multimodal_textbook',
 '/datasets/BIOMEDICA/biomedica_webdataset',
 '/datasets',
 '/join',
 '/pricing#endpoints',
 '/pricing#spaces',
 '/pricing',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/allenai',
 '/facebook',
 '/amazon',
 '/google',
 '/Intel',
 '/microsoft',
 '/grammarly',
 '/Writer',
 '/docs/transformer

In [72]:
# Calls the model; `hf` holds the parsed JSON returned by get_links.
hf = get_links("https://huggingface.co")

In [73]:
# Display the entries stored under the "links" key of the model's reply.
hf["links"]

[{'type': 'About page', 'url': 'https://huggingface.co/huggingface'},
 {'type': 'Careers page', 'url': 'https://apply.workable.com/huggingface/'},
 {'type': 'Enterprise page', 'url': 'https://huggingface.co/enterprise'},
 {'type': 'Models page', 'url': 'https://huggingface.co/models'},
 {'type': 'Datasets page', 'url': 'https://huggingface.co/datasets'},
 {'type': 'Spaces page', 'url': 'https://huggingface.co/spaces'},
 {'type': 'Documentation page', 'url': 'https://huggingface.co/docs'},
 {'type': 'Blog page', 'url': 'https://huggingface.co/blog'},
 {'type': 'Community forum', 'url': 'https://discuss.huggingface.co'},
 {'type': 'GitHub page', 'url': 'https://github.com/huggingface'},
 {'type': 'Twitter page', 'url': 'https://twitter.com/huggingface'},
 {'type': 'LinkedIn page',
  'url': 'https://www.linkedin.com/company/huggingface/'}]

In [74]:
def get_all_details(url):
    """Collect the landing page plus every model-selected page as one text blob.

    Args:
        url: Landing page of the company website.

    Returns:
        A string with the landing page contents followed by a titled section
        for each selected link that could be fetched.
    """
    result = "Landing Page\n"
    result += Website(url).get_contents()
    result += "Links\n"
    links = get_links(url)
    # The reply is model-generated JSON, so a missing "links" key is treated
    # as "no links" rather than raising KeyError.
    for link in links.get("links", []):
        href = link.get("url")
        if not href:
            continue
        # The model may return a relative href even though full URLs were
        # requested; urljoin resolves it and leaves absolute URLs unchanged.
        page_url = urljoin(url, href)
        try:
            page = Website(page_url)
        except requests.RequestException as exc:
            # One unreachable or malformed link should not abort the brochure.
            print(f"Skipping {page_url}: {exc}")
            continue
        result += f"\n\n{link.get('type', 'Related page')}\n"
        result += page.get_contents()
    return result

In [63]:
# System prompt for brochure generation. Built from adjacent literals so each
# sentence keeps its separating space (the earlier backslash continuation
# produced "markdown.Include").
system_prompt = (
    "You are an assistant that analyzes the contents of several relevant pages from a company website "
    "and creates a short brochure about the company for prospective customers, investors and recruits. "
    "Respond in markdown. "
    "Include details of company culture, customers and careers/jobs if you have the information."
)

In [75]:
def get_brochure_user_prompt(company_name, url, max_chars=5_000):
    """Build the user message for the brochure request.

    Args:
        company_name: Name to present to the model.
        url: Landing page of the company website.
        max_chars: Maximum length of the returned prompt; longer text is cut
            off at this many characters. Pass None to disable truncation.

    Returns:
        The prompt: a short instruction followed by the scraped page contents.
    """
    user_prompt = f"You are looking at a company called: {company_name}\n"
    user_prompt += "Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\n"
    user_prompt += get_all_details(url)
    # Slicing with None returns the whole string, so None means "no limit".
    return user_prompt[:max_chars]

In [76]:
# Preview the prompt sent to the model; building it fetches the pages and
# calls get_links along the way.
get_brochure_user_prompt("HuggingFace", "https://huggingface.co")

'You are looking at a company called: HuggingFace\nHere are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\nLanding Page\nwebsite title: Hugging Face – The AI community building the future.\nWebsite text: Hugging Face\nModels\nDatasets\nSpaces\nPosts\nDocs\nEnterprise\nPricing\nLog In\nSign Up\nThe AI community building the future.\nThe platform where the machine learning community collaborates on models, datasets, and applications.\nTrending on\nthis week\nModels\ndeepseek-ai/DeepSeek-R1\nUpdated\nabout 5 hours ago\n•\n5.57k\n•\n1.12k\nhexgrad/Kokoro-82M\nUpdated\n4 days ago\n•\n29.4k\n•\n2.15k\nopenbmb/MiniCPM-o-2_6\nUpdated\nabout 10 hours ago\n•\n24.1k\n•\n713\nMiniMaxAI/MiniMax-Text-01\nUpdated\n4 days ago\n•\n3.18k\n•\n447\ndeepseek-ai/DeepSeek-V3\nUpdated\n23 days ago\n•\n173k\n•\n2.12k\nBrowse 400k+ models\nSpaces\nRunning\non\nZero\n1.31k\n❤️\nKokoro TTS\nNow in 5 languages!\nRunning\non\nZe

In [77]:
def create_brochure(company_name, url):
    """Generate a brochure for the company and render it as Markdown.

    Sends the brochure system prompt plus the scraped site contents to the
    chat model and displays the reply in the cell output. Returns nothing.
    """
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": get_brochure_user_prompt(company_name, url)},
    ]
    completion = openai.chat.completions.create(model=MODEL, messages=conversation)
    brochure = completion.choices[0].message.content
    display(Markdown(brochure))

In [78]:
# Use the huggingface.co domain, the same address the other cells in this
# notebook fetch, so link resolution is based on the same host.
create_brochure("HuggingFace", "https://huggingface.co")

# Hugging Face Brochure

## Welcome to Hugging Face

**The AI Community Building the Future**  
Join a vibrant machine learning community collaborating on advanced models, datasets, and applications. At Hugging Face, we are dedicated to participating in and leading the progression of artificial intelligence and machine learning technologies.

---

## Our Offerings

- **Models:** Access over **400,000 machine learning models** designed to cater to various applications in AI.
- **Datasets:** Discover and utilize more than **100,000 datasets** optimized for text, image, and audio analysis.
- **Spaces:** Explore and run more than **150,000 applications**, from virtual try-ons to 3D generation.

### Collaboration Made Easy
Our platform allows users to host unlimited public models, datasets, and applications, ensuring easier collaboration within the ML community.

### Scalable Solutions
For enterprise needs, we offer **Compute and Enterprise solutions**:
- **Compute:** Deploy optimized inference endpoints and utilize powerful GPU resources, starting at **$0.60/hour**.
- **Enterprise:** Tailored services starting at **$20/user/month**, providing advanced security, access controls, and dedicated support for teams over **50,000 organizations**.

---

## Culture at Hugging Face

At Hugging Face, we pride ourselves on fostering an inclusive and collaborative culture driven by innovation and open-source collaboration. Our community is grounded in shared learning, empowerment, and a collective passion for advancing machine learning technologies. 

### Join Our Team
We are always looking for talented and passionate individuals to join our team. As a part of our workforce, you will have the opportunity to contribute to impactful projects while working in a dynamic environment that values creativity and individual growth.

---

## Our Clients

We proudly serve a diverse range of clients, from startups to industry giants. Notable organizations using Hugging Face models and solutions include:
- **Amazon Web Services**
- **Google**
- **Microsoft**
- **Intel**
- **Grammarly**

---

## Get Started with Hugging Face

Explore our offerings, collaborate with the community, and start transforming your projects with state-of-the-art machine learning technologies. Sign up today and build your ML profile!

🔗 [Visit our Website](https://huggingface.co)

---

### Connect with Us

Follow us on:
- [Twitter](https://twitter.com/huggingface)
- [LinkedIn](https://www.linkedin.com/company/huggingface)
- [Discord](https://discord.gg/huggingface)

Join the conversation today and be a part of the future of AI!

In [79]:
def stream_brochure(company_name, url):
    """Generate a brochure and render it incrementally as the reply streams in.

    Args:
        company_name: Name to present to the model.
        url: Landing page of the company website.

    The Markdown output area is updated in place after every received chunk.
    Returns nothing.
    """
    stream = openai.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": get_brochure_user_prompt(company_name, url)}
          ],
        stream=True
    )

    # Keep the unmodified text separately: stripping is re-applied to the full
    # accumulated reply each time, so a fence split across chunks is still
    # recognised once its second half arrives.
    raw = ""
    display_handle = display(Markdown(""), display_id=True)
    for chunk in stream:
        # Some stream chunks can carry no choices; there is nothing to render.
        if not chunk.choices:
            continue
        raw += chunk.choices[0].delta.content or ''
        # Remove only code-fence markers. Stripping the bare word "markdown"
        # would also delete it from ordinary brochure text.
        cleaned = raw.replace("```markdown", "").replace("```", "")
        update_display(Markdown(cleaned), display_id=display_handle.display_id)

In [82]:
# Same brochure request, rendered incrementally as the response streams in.
stream_brochure("HuggingFace", "https://huggingface.co")


# Hugging Face Brochure

## About Us
Welcome to **Hugging Face**, where we are dedicated to building the future of AI together. As a vibrant and collaborative community, we provide a platform for machine learning enthusiasts and professionals to connect over models, datasets, and innovative applications.

### Our Mission
At Hugging Face, we believe in the power of collaboration in machine learning. We aim to accelerate ML development and create open-source solutions that leverage the collective intelligence of our community.

## Our Offerings
### Models
Browse through over **400,000 models** ranging from state-of-the-art machine learning implementations to innovative tools designed for varied applications. 

### Datasets
Access and share from our extensive repository of **100,000+ datasets** tailored for tasks in computer vision, NLP, and more.

### Applications
Explore our **150,000+ applications** and discover new ways to utilize ML in real life. Whether you're into text, audio, video, or 3D, we've got you covered.

### Enterprise Solutions
Hugging Face Enterprise provides advanced platforms for organizations, ensuring robust security and dedicated support tailored to business needs. Starting at **$20/user/month**, we are trusted by over **50,000 organizations**, including industry leaders like Meta, AWS, Google, Microsoft, and more.

## Company Culture
At Hugging Face, inclusivity and community spirit are at the core of our culture. Our team is composed of passionate AI enthusiasts and professionals who share a commitment to continuous learning and innovation. We foster an environment where collaboration thrives, and everyone has the opportunity to contribute and make an impact.

## Career Opportunities
Join us in shaping the future of AI! We are always on the lookout for talented individuals who are eager to dive into the world of machine learning. Explore a range of positions across various teams, from engineering to community engagement.

### Current Openings
- Software Engineers
- Data Scientists
- Community Managers
- Product Designers

## Connect with Us
Join our thriving community by getting involved! Follow us on social media, engage with our forums, or check out our blog for the latest news in AI and ML.

- [GitHub](https://github.com/huggingface)
- [Twitter](https://twitter.com/huggingface)
- [LinkedIn](https://www.linkedin.com/company/hugging-face)

### Together, Let's Build the Future of AI!

