In [1]:
###Building Brochure for a URL, based on important sub links in URL and not only main page..


In [2]:
import os
import requests
import json
from typing import List
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display, update_display
from openai import OpenAI

In [3]:
load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')

In [4]:
MODEL = 'gpt-4o-mini'
openai = OpenAI()

In [5]:
headers = {
 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}
###Change here is that I add links as a variable to be accessed later, use for loop i.e get all links in the webpage then add all links into a list
class Website:
    """
    A utility class to represent a Website that we have scraped, now with links
    """

    def __init__(self, url):
        self.url = url
        response = requests.get(url, headers=headers)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')
        self.title = soup.title.string if soup.title else "No title found"
        if soup.body:
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""
        links = [link.get('href') for link in soup.find_all('a')]
        self.links = [link for link in links if link]

    def get_contents(self):
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

In [6]:
ps = Website("https://prith27.github.io/")
ps.links

['mailto:prithvi.seshadri01@gmail.com',
 'tel:+919940654174',
 'https://www.linkedin.com/in/prithvi-seshadri-b736631b3/',
 'https://scholar.google.com/citations?user=aP4C_hoAAAAJ&hl=en',
 '#',
 'https://ieeexplore.ieee.org/document/10395841',
 'https://onlinelibrary.wiley.com/doi/10.1002/9781119905172.ch6',
 'https://ai.plainenglish.io/a-beginners-guide-to-training-a-yolov5-object-detection-model-91adffe99f79',
 'https://ai.plainenglish.io/building-accurate-object-detection-models-with-retinanet-a-comprehensive-step-by-step-guide-b8a35f435285',
 'https://iopscience.iop.org/article/10.1088/1742-6596/2115/1/012038']

In [7]:
###ASK gpt for identifying relevant links from above

In [8]:
link_system_prompt = "You are provided with a list of links found on a webpage. \
You are able to decide which of the links would be most relevant to include in a brochure about the company, \
such as links to an About page, or a Company page, or Careers/Jobs pages.\n"
link_system_prompt += "You should respond in JSON as in this example:"
link_system_prompt += """
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page": "url": "https://another.full.url/careers"}
    ]
}
"""

In [9]:
print(link_system_prompt)

You are provided with a list of links found on a webpage. You are able to decide which of the links would be most relevant to include in a brochure about the company, such as links to an About page, or a Company page, or Careers/Jobs pages.
You should respond in JSON as in this example:
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page": "url": "https://another.full.url/careers"}
    ]
}



In [10]:
def get_links_user_prompt(website):
    user_prompt = f"Here is the list of links on the website of {website.url} - "
    user_prompt += "please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. \
Do not include Terms of Service, Privacy, email links.\n"
    user_prompt += "Links (some might be relative links):\n"
    user_prompt += "\n".join(website.links)
    return user_prompt

In [12]:
print(get_links_user_prompt(ps))

Here is the list of links on the website of https://prith27.github.io/ - please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. Do not include Terms of Service, Privacy, email links.
Links (some might be relative links):
mailto:prithvi.seshadri01@gmail.com
tel:+919940654174
https://www.linkedin.com/in/prithvi-seshadri-b736631b3/
https://scholar.google.com/citations?user=aP4C_hoAAAAJ&hl=en
#
https://ieeexplore.ieee.org/document/10395841
https://onlinelibrary.wiley.com/doi/10.1002/9781119905172.ch6
https://ai.plainenglish.io/a-beginners-guide-to-training-a-yolov5-object-detection-model-91adffe99f79
https://ai.plainenglish.io/building-accurate-object-detection-models-with-retinanet-a-comprehensive-step-by-step-guide-b8a35f435285
https://iopscience.iop.org/article/10.1088/1742-6596/2115/1/012038


In [13]:
def get_links(url):
    website = Website(url)
    response = openai.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": link_system_prompt},
            {"role": "user", "content": get_links_user_prompt(website)}
      ],
        response_format={"type": "json_object"}
    )
    result = response.choices[0].message.content
    return json.loads(result)

In [14]:
get_links("https://prith27.github.io/")

{'links': []}

In [15]:
###Apparently my website has no relevant links haha, will try Hugginface lol

In [16]:
get_links("https://huggingface.co") ###Damn looks relevant

{'links': [{'type': 'about page', 'url': 'https://huggingface.co/huggingface'},
  {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'},
  {'type': 'enterprise page', 'url': 'https://huggingface.co/enterprise'},
  {'type': 'pricing page', 'url': 'https://huggingface.co/pricing'},
  {'type': 'blog page', 'url': 'https://huggingface.co/blog'},
  {'type': 'community discussion page',
   'url': 'https://discuss.huggingface.co'},
  {'type': 'GitHub page', 'url': 'https://github.com/huggingface'},
  {'type': 'Twitter page', 'url': 'https://twitter.com/huggingface'},
  {'type': 'LinkedIn page',
   'url': 'https://www.linkedin.com/company/huggingface/'}]}

In [17]:
def get_all_details(url):  ###Here i am appending all website url details and info one below other, will understand if you see print statement
    result = "Landing page:\n"
    result += Website(url).get_contents()
    links = get_links(url)
    print("Found links:", links)
    for link in links["links"]:
        result += f"\n\n{link['type']}\n"
        result += Website(link["url"]).get_contents()
    return result

In [18]:
print(get_all_details("https://huggingface.co"))

Found links: {'links': [{'type': 'about page', 'url': 'https://huggingface.co/huggingface'}, {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'}, {'type': 'company page', 'url': 'https://www.linkedin.com/company/huggingface/'}, {'type': 'blog', 'url': 'https://huggingface.co/blog'}, {'type': 'community page', 'url': 'https://discuss.huggingface.co'}]}
Landing page:
Webpage Title:
Hugging Face – The AI community building the future.
Webpage Contents:
Hugging Face
Models
Datasets
Spaces
Posts
Docs
Enterprise
Pricing
Log In
Sign Up
The AI community building the future.
The platform where the machine learning community collaborates on models, datasets, and applications.
Trending on
this week
Models
deepseek-ai/DeepSeek-V3-Base
Updated
3 days ago
•
6.34k
•
1.04k
deepseek-ai/DeepSeek-V3
Updated
3 days ago
•
40k
•
846
Qwen/QVQ-72B-Preview
Updated
8 days ago
•
41.1k
•
419
black-forest-labs/FLUX.1-dev
Updated
Aug 16, 2024
•
1.19M
•
7.64k
answerdotai/ModernBERT-base
Update

In [19]:
###Prompts
system_prompt = "You are an assistant that analyzes the contents of several relevant pages from a company website \
and creates a short brochure about the company for prospective customers, investors and recruits. Respond in markdown.\
Include details of company culture, customers and careers/jobs if you have the information."

def get_brochure_user_prompt(company_name, url):
    user_prompt = f"You are looking at a company called: {company_name}\n"
    user_prompt += f"Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\n"
    user_prompt += get_all_details(url)
    user_prompt = user_prompt[:5_000] # Truncate if more than 5,000 characters
    return user_prompt

In [20]:
get_brochure_user_prompt("HuggingFace", "https://huggingface.co")

Found links: {'links': [{'type': 'about page', 'url': 'https://huggingface.co/huggingface'}, {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'}, {'type': 'enterprise page', 'url': 'https://huggingface.co/enterprise'}, {'type': 'pricing page', 'url': 'https://huggingface.co/pricing'}, {'type': 'docs page', 'url': 'https://huggingface.co/docs'}, {'type': 'blog page', 'url': 'https://huggingface.co/blog'}, {'type': 'community forum', 'url': 'https://discuss.huggingface.co'}, {'type': 'GitHub page', 'url': 'https://github.com/huggingface'}, {'type': 'Twitter page', 'url': 'https://twitter.com/huggingface'}, {'type': 'LinkedIn page', 'url': 'https://www.linkedin.com/company/huggingface/'}]}


'You are looking at a company called: HuggingFace\nHere are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\nLanding page:\nWebpage Title:\nHugging Face – The AI community building the future.\nWebpage Contents:\nHugging Face\nModels\nDatasets\nSpaces\nPosts\nDocs\nEnterprise\nPricing\nLog In\nSign Up\nThe AI community building the future.\nThe platform where the machine learning community collaborates on models, datasets, and applications.\nTrending on\nthis week\nModels\ndeepseek-ai/DeepSeek-V3-Base\nUpdated\n3 days ago\n•\n6.34k\n•\n1.04k\ndeepseek-ai/DeepSeek-V3\nUpdated\n3 days ago\n•\n40k\n•\n846\nQwen/QVQ-72B-Preview\nUpdated\n8 days ago\n•\n41.1k\n•\n419\nblack-forest-labs/FLUX.1-dev\nUpdated\nAug 16, 2024\n•\n1.19M\n•\n7.64k\nanswerdotai/ModernBERT-base\nUpdated\n7 days ago\n•\n57.6k\n•\n561\nBrowse 400k+ models\nSpaces\nRunning\non\nZero\n2.37k\n🏢\nTRELLIS\nScalable and Versatile 3D Generati

In [22]:
def create_brochure(company_name, url):
    response = openai.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": get_brochure_user_prompt(company_name, url)}
          ],
    )
    result = response.choices[0].message.content
    display(Markdown(result))

In [23]:
create_brochure("HuggingFace", "https://huggingface.com")

Found links: {'links': [{'type': 'about page', 'url': 'https://huggingface.com/huggingface'}, {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'}, {'type': 'enterprise page', 'url': 'https://huggingface.com/enterprise'}, {'type': 'pricing page', 'url': 'https://huggingface.com/pricing'}, {'type': 'blog page', 'url': 'https://huggingface.com/blog'}, {'type': 'discuss page', 'url': 'https://discuss.huggingface.co'}, {'type': 'GitHub page', 'url': 'https://github.com/huggingface'}, {'type': 'LinkedIn page', 'url': 'https://www.linkedin.com/company/huggingface/'}, {'type': 'Twitter page', 'url': 'https://twitter.com/huggingface'}]}


# Hugging Face Company Brochure

## Overview
**Hugging Face** is at the forefront of the AI community, dedicated to building the future of machine learning. We provide a platform where developers and researchers collaborate to create state-of-the-art models, share datasets, and develop innovative applications in an open-source ecosystem.

## Company Culture
Hugging Face thrives on the principles of collaboration, innovation, and democratization of technology. Our culture revolves around community engagement, where every team member contributes to a collective mission of advancing machine learning. We believe in open-source tooling that enables everyone to access cutting-edge technology and foster creativity in AI development.

**Core Values:**
- **Collaboration:** We work together with the machine learning community to share ideas and advancements.
- **Accessibility:** We aim to make machine learning tools available to all, fostering diversity and innovation.
- **Continuous Learning:** We support our team in pursuing knowledge and growth within the industry.

## Our Offerings
- **Models:** Access to a library of over 400,000 machine learning models.
- **Datasets:** Browse and collaborate on over 100,000 datasets for various applications.
- **Spaces:** Create and showcase applications using our robust infrastructure.
- **Enterprise Solutions:** Tailored services that provide advanced security and support for businesses.

## Who We Serve
With a user base of **more than 50,000 organizations**, including industry giants like Google, Microsoft, and Amazon Web Services, Hugging Face has established itself as a trusted platform for AI development. We cater to enterprises seeking to enhance their AI capabilities with top-tier solutions, as well as individual developers and researchers.

## Careers at Hugging Face
We are always looking for passionate individuals to join our team. Working at Hugging Face means being part of a vibrant community where innovative ideas flourish and every voice is heard.

**Why Join Us?**
- **Impactful Work:** Contribute to projects that democratize machine learning and make a difference in the tech world.
- **Flexible Environment:** Enjoy a supportive atmosphere that encourages creativity and growth.
- **Diverse Team:** Work alongside a talented group of individuals from various backgrounds, united by a common mission.

Explore our current job openings and become part of a community that is shaping the future of AI! 

For more information, visit our [Careers Page](https://huggingface.co/jobs).

---

Join Hugging Face and be part of the AI revolution today!