In [1]:
import requests
import json
from typing import List
from bs4 import BeautifulSoup
from IPython.display import Markdown, display, update_display
import ollama

In [14]:
# Ollama model tag passed as `model=` to every ollama.chat call in this notebook.
MODEL = "mistral:7b"

In [22]:
# HTTP request headers used when fetching pages; the User-Agent is a
# desktop Chrome (Windows) browser string.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/117.0.0.0 Safari/537.36"
    ),
}

class Website:
    """A web page fetched once and reduced to its title, text and links.

    Attributes (all set in ``__init__``):
        url:   the URL that was requested.
        body:  the raw response content returned by ``requests``.
        title: text of the ``<title>`` tag, or "No title found".
        text:  text of ``<body>`` with script/style/img/input elements
               removed, one fragment per line; "" when there is no body.
        links: every non-empty ``href`` found on ``<a>`` tags, exactly as
               written in the page (so they may be relative).
    """

    def __init__(self, url):
        """Download ``url`` and parse it with BeautifulSoup.

        Args:
            url: address of the page to fetch.

        Raises:
            Whatever ``requests.get`` raises on connection problems; HTTP
            error statuses are not checked here.
        """
        self.url = url
        # Must be passed by keyword: the second positional parameter of
        # requests.get is `params`, which would send the User-Agent as a
        # query string instead of as a request header.
        response = requests.get(url, headers=headers)
        self.body = response.content
        soup = BeautifulSoup(self.body, "html.parser")

        # `.string` is None when <title> is empty or contains nested markup,
        # so fall back in that case as well as when the tag is missing.
        title_tag = soup.title
        if title_tag is not None and title_tag.string:
            self.title = title_tag.string
        else:
            self.title = "No title found"

        if soup.body:
            # Drop elements that carry no readable text before extracting it.
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""

        # Keep only anchors that actually have a non-empty href.
        links = [link.get('href') for link in soup.find_all('a')]
        self.links = [link for link in links if link]

    def get_contents(self):
        """Return the page title and text formatted as one prompt-ready string."""
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

In [23]:
# System prompt for the link-selection call: states the task, then shows an
# example of the JSON structure the model should reply with.
link_system_prompt = (
    "You are provided with a list of links found on a webpage. "
    "You are able to decide which of the links would be most relevant to include in a brochure about the company, "
    "such as links to an About page, or a Company page, or Careers/Jobs pages.\n"
    "You should respond in JSON as in this example:"
    """
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
}
"""
)

In [24]:
def get_links_user_prompt(website):
    """Build the user message asking the model to pick brochure-worthy links.

    Args:
        website: object exposing ``url`` (str) and ``links`` (iterable of
            str), such as a ``Website`` instance.

    Returns:
        The instruction text followed by the links, one per line.
    """
    intro = f"Here is the list of links on the website of {website.url} - "
    instructions = (
        "please decide which of these are relevant web links for a brochure about the company, "
        "respond with the full https URL in JSON format. "
        "Do not include Terms of Service, Privacy, email links.\n"
    )
    listing_header = "Links (some might be relative links):\n"
    link_lines = "\n".join(website.links)
    return "".join([intro, instructions, listing_header, link_lines])

In [8]:
def get_links(url):
    """Ask the model which links on the page at ``url`` belong in a brochure.

    Fetches the page with ``Website``, sends its links to ``ollama.chat``
    together with ``link_system_prompt``, and decodes the reply as JSON.

    Args:
        url: address of the page whose links should be classified.

    Returns:
        The decoded JSON reply. The system prompt asks for the shape
        ``{"links": [{"type": ..., "url": ...}, ...]}``; that shape is not
        validated here.

    Raises:
        json.JSONDecodeError: if the model reply is not valid JSON.
    """
    website = Website(url)
    response = ollama.chat(
        model=MODEL,
        messages=[
            {"role": "system", "content": link_system_prompt},
            {"role": "user", "content": get_links_user_prompt(website)},
        ],
        # Ask Ollama for a JSON-only reply so json.loads below is not handed
        # surrounding prose or markdown code fences.
        format="json",
    )
    result = response['message']['content']
    return json.loads(result)



In [25]:
def get_all_details(url):
    """Collect the text of the landing page and of every model-selected link.

    Args:
        url: address of the company's landing page.

    Returns:
        One string: a "Landing page:" section followed by one section per
        link returned by ``get_links``, each headed by the link's type.
    """
    result = "Landing page:\n"
    result += Website(url).get_contents()
    links = get_links(url)
    print("Found links:", links)
    for link in links["links"]:
        # Section heading is the label the model assigned to this link.
        result += f"\n\n{link['type']}\n"
        result += Website(link["url"]).get_contents()
    return result


In [27]:
# System prompt for the brochure-writing call. Built from adjacent literals
# so each sentence keeps its separating space (a bare line continuation
# previously glued "markdown." to "Include").
system_prompt = (
    "You are an assistant that analyzes the contents of several relevant pages from a company website "
    "and creates a short brochure about the company for prospective customers, investors and recruits. "
    "Respond in markdown. "
    "Include details of company culture, customers and careers/jobs if you have the information."
)

In [28]:
def get_brochure_user_prompt(company_name, url, max_chars=5_000):
    """Build the user message for the brochure-writing call.

    Args:
        company_name: name of the company, inserted into the prompt.
        url: landing-page address handed to ``get_all_details``.
        max_chars: maximum length of the returned prompt in characters;
            anything beyond it is cut off. ``None`` disables truncation.
            Defaults to 5_000, the limit that used to be hard-coded.

    Returns:
        The instruction text followed by the scraped page contents,
        truncated to ``max_chars`` characters.
    """
    user_prompt = f"You are looking at a company called: {company_name}\n"
    user_prompt += "Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\n"
    user_prompt += get_all_details(url)
    # Cap the prompt size (presumably to stay within the model's context
    # budget -- confirm the right limit for the chosen model).
    return user_prompt[:max_chars]

In [29]:
def create_brochure(company_name, url):
    """Generate a brochure for the company and render it in the notebook.

    Sends ``system_prompt`` plus the prompt from ``get_brochure_user_prompt``
    to ``ollama.chat`` and displays the reply as Markdown. Returns nothing.

    Args:
        company_name: name of the company to describe.
        url: address of the company's landing page.
    """
    chat_messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": get_brochure_user_prompt(company_name, url)},
    ]
    reply = ollama.chat(model=MODEL, messages=chat_messages)
    brochure_markdown = reply['message']['content']
    display(Markdown(brochure_markdown))


In [30]:
# Build and display a brochure from the Hugging Face home page.
create_brochure("HuggingFace", "https://huggingface.co")

Found links: {'links': [{'type': 'about page', 'url': 'https://huggingface.co'}, {'type': 'company page', 'url': 'https://www.linkedin.com/company/huggingface/'}, {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'}, {'type': 'enterprise page', 'url': 'https://endpoints.huggingface.co'}, {'type': 'docs page', 'url': 'https://huggingface.co/docs'}, {'type': 'blog page', 'url': 'https://huggingface.co/blog'}, {'type': 'discussion forum', 'url': 'https://discuss.huggingface.co/'}, {'type': 'github repository', 'url': 'https://github.com/huggingface'}, {'type': 'twitter page', 'url': 'https://twitter.com/huggingface'}]}


 # Hugging Face: The AI Community Building the Future

Welcome to Hugging Face, a leading platform for collaboration on models, datasets, and applications in machine learning.

## Company Culture
Hugging Face is a vibrant community of developers, researchers, and enthusiasts dedicated to advancing AI technology. We believe in open-source and collaborative work, fostering an environment where everyone can contribute and benefit from the collective knowledge.

## Customers
Our platform serves over 50,000 organizations across various industries, including tech giants like Meta, Amazon, Google, Microsoft, Intel, Grammarly, and Writer.

## Services for You

### Prospective Users
Explore our vast library of 1 million+ models and 250k+ datasets. Discover AI applications spanning text, image, video, audio, and even 3D modalities. Collaborate with the community to build your portfolio and advance in ML.

### Investors
Invest in a promising future with Hugging Face. Our platform offers paid compute and enterprise solutions, providing advanced infrastructure for AI development and enterprise-grade security, access controls, and dedicated support.

### Recruits
Join our team of innovators shaping the future of AI. Contribute to our open-source projects like Transformers, Diffusers, Tokenizers, and many more. Together, we can build the foundation for machine learning tooling.

### Key Open Source Projects

1. Transformers: State-of-the-art ML for PyTorch, TensorFlow, JAX
2. Diffusers: State-of-the-art Diffusion models in PyTorch
3. Tokenizers: Fast tokenizers optimized for research & production
4. TRL: Train transformers LMs with reinforcement learning
5. Datasets: Access & share datasets for any ML tasks
6. Text Generation Inference: Serve language models with TGI optimized toolkit
7. Accelerate: Train PyTorch models with multi-GPU, TPU, mixed precision
8. System theme: Website themes for AI projects
9. Transformers.js: State-of-the-art ML running directly in your browser
10. smolagents: Smol library to build great agents in Python
11. PEFT: Parameter-efficient finetuning for large language models

Join Hugging Face today and be part of the AI community building the future!
Sign Up [here](http://huggingface.co/signup)