In [16]:
import requests 
from bs4 import BeautifulSoup

headers = {
    "User-Agent": "Mozilla/5.0"
}

def scrape_website(url):
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    title = soup.title.string if soup.title else "No title found"

    # Remove scripts and irrelevant tags
    for tag in soup(["script", "style", "img", "input"]):
        tag.decompose()

    text = soup.get_text(separator="\n", strip=True)
    links = [a.get('href') for a in soup.find_all('a') if a.get('href')]

    return title, text, links

# Try it out
title, text, links = scrape_website("https://huggingface.co")
print("Title:", title)
print("Links found:", links[:5])


Title: Hugging Face – The AI community building the future.
Links found: ['/', '/models', '/datasets', '/spaces', '/posts']


In [17]:
import ollama

def pick_relevant_links(links, website_name):
    prompt = f"""
Here is a list of links from {website_name}:
{links}

Which of these would be most useful for writing a company brochure? 
Pick only relevant links (e.g. About, Team, Products, Careers), and return only the exact links.
    """

    response = ollama.chat(
        model='llama3.2:1b',
        messages=[
            {"role": "user", "content": prompt}
        ]
    )

    return response['message']['content']


In [18]:
useful_links = pick_relevant_links(links, "huggingface.co")
print("Useful Links:\n", useful_links)

Useful Links:
 Here are the relevant links that could be useful for writing a company brochure:

1. /docs/huggingface_hub
2. /about
3. /team
4. /products
5. /careers


In [19]:
from urllib.parse import urljoin

def scrape_useful_pages(base_url, useful_links_text):
    pages = {}
    for line in useful_links_text.strip().splitlines():
        link = line.strip()
        full_url = urljoin(base_url, link)
        try:
            title, text, _ = scrape_website(full_url)
            pages[full_url] = {"title": title, "text": text}
        except Exception as e:
            print(f"❌ Failed to scrape {full_url}: {e}")
    return pages


In [20]:
base_url = "https://huggingface.co"
scraped_pages = scrape_useful_pages(base_url, useful_links)
print("Pages scraped:", list(scraped_pages.keys()))


Pages scraped: ['https://huggingface.co/Here are the relevant links that could be useful for writing a company brochure:', 'https://huggingface.co', 'https://huggingface.co/1. /docs/huggingface_hub', 'https://huggingface.co/2. /about', 'https://huggingface.co/3. /team', 'https://huggingface.co/4. /products', 'https://huggingface.co/5. /careers']


In [21]:
combined_content = ""
for url, data in scraped_pages.items():
    combined_content += f"\n## From {url}\n"
    combined_content += data['text'][:3000]  # Optional: limit very long pages


In [22]:
def generate_brochure_with_ollama(content, company_name):
    prompt = f"""
You're a professional brochure writer.

Using the following content scraped from {company_name}'s website, write a brochure in **Markdown** format.

It should highlight the company’s mission, products, team, and career opportunities.

Use clear headings and bullet points where appropriate.

Here is the content:
{content}
"""
    response = ollama.chat(
        model='llama3.2:1b',
        messages=[{"role": "user", "content": prompt}]
    )
    return response['message']['content']


In [23]:
brochure_md = generate_brochure_with_ollama(combined_content, "Hugging Face")
print(brochure_md)


# Hugging Face: Where AI Meets Community

## About Us
------------

Hugging Face is a leading platform for machine learning (ML) and artificial intelligence (AI). We're a community-driven organization that's passionate about building the future of ML.

## Mission
----------

Our mission is to make AI accessible, inclusive, and collaborative. We believe that ML should be used to solve real-world problems, not just in research or academia.

## Models
--------

We have an incredible library of models, covering a wide range of tasks such as:

* Vision: Object detection, segmentation, classification, and generation
* Natural Language Processing (NLP): Text classification, sentiment analysis, language translation, and more
* Music Generation: Generating high-quality music using state-of-the-art models

## Datasets
------------

We offer a vast collection of public datasets, including:

* Computer Vision: Image classification, object detection, segmentation, and generation
* NLP: Sentiment an