In [41]:
import json
import os
from typing import List
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from IPython.display import Markdown, display, update_display
from openai import OpenAI

In [42]:
# Load settings from a local .env file; override=True lets values from .env
# replace variables that are already present in the process environment.
load_dotenv(override=True)

# Looked up for reference only: the client below is constructed without arguments.
api_key = os.environ.get("OPENAI_API_KEY")

# Chat model used by the completion requests made further down.
MODEL = "gpt-4o-mini"

# Shared OpenAI client used by the helper functions defined in later cells.
openai = OpenAI()

In [43]:
# Request headers sent with every page fetch in Website._scrape_website.
# The User-Agent mimics a desktop Chrome browser -- presumably so that sites
# which reject the default python-requests agent still respond (TODO confirm).
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

class Website:
    """
    Utility class representing a scraped website, including links and content.

    The page is fetched and parsed once, at construction time.

    Attributes:
        url: The address that was fetched.
        timeout: Seconds allowed for the HTTP request before it is aborted.
        title: Text of the page's <title>, or "No title found".
        text: Body text with script/style/img/input tags removed.
        links: Non-empty href values of the page's <a> tags, unmodified
            (they may be relative paths, anchors, or duplicates).
    """

    def __init__(self, url, timeout=10):
        """
        Fetch and parse ``url`` immediately.

        Args:
            url (str): Page to scrape.
            timeout (float): Seconds passed to ``requests.get``. Without a
                timeout a stalled server would block the notebook forever.
        """
        self.url = url
        self.timeout = timeout
        self.title, self.text, self.links = self._scrape_website()

    def _scrape_website(self):
        """Download the page and return a ``(title, text, links)`` tuple."""
        # Fetch and parse the webpage
        response = requests.get(self.url, headers=HEADERS, timeout=self.timeout)
        soup = BeautifulSoup(response.content, 'html.parser')

        title = self._extract_title(soup)

        # Extract the webpage text, excluding irrelevant tags
        text = self._extract_text(soup)

        # Keep every anchor that actually carries an href
        links = [link.get('href') for link in soup.find_all('a') if link.get('href')]

        return title, text, links

    def _extract_title(self, soup):
        """Return the page title, or a placeholder when none is usable."""
        title_tag = soup.title
        # .string is None for an empty <title> or one containing nested markup,
        # so check it as well as the tag itself.
        if title_tag and title_tag.string:
            return title_tag.string
        return "No title found"

    def _extract_text(self, soup):
        """Return the body text of ``soup``, or "" when there is no <body>."""
        if soup.body:
            # Remove irrelevant tags like script, style, img, input
            for tag in soup.body(["script", "style", "img", "input"]):
                tag.decompose()
            return soup.body.get_text(separator="\n", strip=True)
        return ""

    def get_contents(self):
        """Return the title and text formatted as one prompt-ready string."""
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

In [44]:
# Scrape the Appwrite landing page and show the raw hrefs that were collected.
ap = Website("https://appwrite.io/")
ap.links

['#main',
 '/',
 'https://cloud.appwrite.io',
 '/',
 '/products/auth',
 '/docs/products/databases',
 '/products/storage',
 '/products/functions',
 '/products/messaging',
 '/docs/apis/realtime',
 '/blog/post/case-study-undo',
 '/blog/category/case-studies',
 '/docs',
 '/pricing',
 'https://github.com/appwrite/appwrite',
 'https://cloud.appwrite.io',
 'https://cloud.appwrite.io/register',
 'https://cloud.appwrite.io',
 '/docs',
 '/pricing',
 'https://github.com/appwrite/appwrite',
 '/blog/post/introducing-new-compute-capabilities-appwrite-functions',
 'https://cloud.appwrite.io',
 '/discord',
 'https://github.com/appwrite/appwrite',
 'https://twitter.com/appwrite',
 'https://www.youtube.com/@Appwrite',
 'https://github.com/appwrite/appwrite',
 'https://twitter.com/appwrite',
 'https://twitter.com/appwrite',
 '/docs/quick-starts/flutter',
 '/docs/quick-starts/nextjs',
 '/docs/quick-starts/react',
 '/docs/quick-starts/sveltekit',
 '/docs/quick-starts/nuxt',
 '/docs/quick-starts/vue',
 '/do

In [45]:
# System prompt for the link-selection request made in get_links. It lists the
# page types wanted for a brochure, the kinds of links to drop, and three worked
# examples of the {"links": [{"type": ..., "url": ...}]} JSON shape to return.
link_system_prompt = """You have been provided with a list of links extracted from a webpage.
Your task is to categorize these links based on their relevance to a company brochure.

The goal is to identify important pages such as:
- **'About Us'** or **'Company Overview'** pages  
- **'Careers'** or **'Job Opportunities'** pages  
- **'Contact'** pages  
- **'Products'** or **'Services'** pages (if applicable)  

### **Guidelines for Filtering Links:**
- **Exclude invalid links**, including:  
  - `mailto:` email links (e.g., `mailto:contact@company.com`)  
  - JavaScript-based links (e.g., `javascript:void(0)`)  
  - Anchors within the same page (e.g., `#section1`)  
  - Non-web URLs (e.g., `ftp://`, `tel:`, `data:`)  
- Ensure each URL is a **fully qualified web address** (`https://` or `http://`).  
- Ignore unrelated links such as Terms of Service, Privacy Policy, and login pages.  

### **Examples of Proper Classification:**

#### **Example 1**
{
    "links": [
        {"type": "about us", "url": "https://example.com/about-us"},
        {"type": "careers", "url": "https://example.com/jobs"},
        {"type": "contact", "url": "https://example.com/contact"},
        {"type": "services", "url": "https://example.com/our-services"}
    ]
}

#### **Example 2**
{
    "links": [
        {"type": "company overview", "url": "https://business.com/who-we-are"},
        {"type": "careers", "url": "https://business.com/careers"},
        {"type": "products", "url": "https://business.com/our-products"},
        {"type": "contact", "url": "https://business.com/contact-us"}
    ]
}

#### **Example 3**
{
    "links": [
        {"type": "about us", "url": "https://techstartup.io/our-story"},
        {"type": "careers", "url": "https://techstartup.io/work-with-us"},
        {"type": "products", "url": "https://techstartup.io/solutions"},
        {"type": "contact", "url": "https://techstartup.io/contact"}
    ]
}

Now, using the same format, classify the following links and respond in **valid JSON**:
"""


In [46]:
def get_links_user_prompt(website):
    """
    Build the user message asking the model to pick brochure-worthy links.

    Args:
        website: Object exposing ``url`` (str) and ``links`` (iterable of str).

    Returns:
        str: The instructions followed by the site's links, one per line.
    """
    # Join up front so the f-string below needs no newline escape.
    link_lines = "\n".join(website.links)
    return f"""You have been provided with a list of links extracted from the website: {website.url}.

Your task is to identify which of these links are most relevant for inclusion in a company brochure. Focus on pages that provide key company information, such as:
- "About Us" or "Company Overview" pages
- "Careers" or "Job Opportunities" pages
- "Contact" pages
- "Products" or "Services" pages (if applicable)

**Guidelines:**
- Exclude links related to Terms of Service, Privacy Policies, and email addresses.
- If a link is relative (e.g., "/about"), assume it belongs to the website's domain and convert it to a full URL.
- Provide only the relevant links, following the expected structured format.

Here is the list of links extracted from the website:
{link_lines}
"""


In [47]:
def get_links(url):
    """
    Fetches and classifies relevant links from a given website URL.

    Scrapes ``url``, sends the extracted hrefs to the chat model with a JSON
    response format, and parses the reply.

    Args:
        url (str): The website URL to scrape links from.

    Returns:
        dict: The model's parsed JSON reply. A ``"links"`` key is always
        present; it is an empty list when the model returned no content or
        omitted the key, so callers can iterate without checking first.
    """
    website = Website(url)
    response = openai.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": link_system_prompt},
            {"role": "user", "content": get_links_user_prompt(website)},
        ],
        response_format={"type": "json_object"},
    )
    result = response.choices[0].message.content
    classified = json.loads(result) if result else {}
    # Guarantee the documented shape even for an empty or incomplete reply;
    # previously an empty reply produced {} and broke callers indexing "links".
    classified.setdefault("links", [])
    return classified

In [48]:
# Try the link classifier on the Appwrite landing page.
get_links("https://appwrite.io/")

{'links': [{'type': 'about us', 'url': 'https://appwrite.io/company'},
  {'type': 'careers', 'url': 'https://appwrite.careers'},
  {'type': 'contact', 'url': 'https://appwrite.io/contact-us'},
  {'type': 'products', 'url': 'https://appwrite.io/products/auth'},
  {'type': 'products', 'url': 'https://appwrite.io/products/storage'},
  {'type': 'products', 'url': 'https://appwrite.io/products/functions'},
  {'type': 'products', 'url': 'https://appwrite.io/products/messaging'}]}

In [49]:
# Try the link classifier on a second site to compare the selection.
get_links("https://huggingface.co")

{'links': [{'type': 'about us', 'url': 'https://huggingface.co/huggingface'},
  {'type': 'careers', 'url': 'https://apply.workable.com/huggingface/'},
  {'type': 'products', 'url': 'https://huggingface.co/models'},
  {'type': 'products', 'url': 'https://huggingface.co/datasets'},
  {'type': 'products', 'url': 'https://huggingface.co/spaces'},
  {'type': 'contact', 'url': 'https://www.linkedin.com/company/huggingface/'}]}

In [50]:
def get_all_details(url):
    """
    Fetches the main page content and details of relevant subpages from the website.

    Subpages that cannot be fetched are skipped with a printed message, so one
    bad link does not discard the content already gathered.

    Args:
        url (str): The main website URL.

    Returns:
        str: A formatted string containing content from the landing page and relevant subpages.
    """
    result = "Landing page:\n"
    result += Website(url).get_contents()
    # NOTE: get_links scrapes `url` again internally, so the landing page is
    # downloaded twice per call.
    links = get_links(url)
    # .get tolerates a reply that is empty or lacks the "links" key.
    for link in links.get("links", []):
        link_url = link.get("url")
        if not link_url:
            continue
        # The model is asked for absolute URLs but may still hand back a
        # relative one; urljoin leaves absolute URLs untouched.
        page_url = urljoin(url, link_url)
        try:
            page = Website(page_url)
        except requests.RequestException as exc:
            print(f"Skipping {page_url}: {exc}")
            continue
        result += f"\n\n{link.get('type', 'page')}\n"
        result += page.get_contents()
    return result

In [51]:
# System prompt for the brochure-writing request made in create_brochure: it
# names the sections to cover, asks for markdown output, and sets the default tone.
system_prompt = """You are a skilled assistant specializing in analyzing company websites and crafting engaging, informative brochures. 

Your task is to create a compelling, well-structured brochure that highlights key aspects of the company for prospective customers, investors, and job seekers. 

### **Key Focus Areas:**
- **Company Overview** – What does the company do? What makes it unique?
- **Company Culture** – Insights into values, work environment, and mission.
- **Products & Services** – A brief overview of offerings (if available).
- **Customers & Clients** – Who does the company serve? Any notable partners?
- **Careers & Opportunities** – Available job roles, work benefits, and reasons to join.

### **Output Format:**
- Respond in **markdown** for easy readability.
- Structure the content with **headings, bullet points, and concise sections**.

#### **Customizable Tone:**
By default, keep the brochure **professional and informative**. However, if a more humorous or entertaining style is required, adapt the tone while keeping key information intact.

Use the provided website content to generate an insightful and engaging brochure.
"""

In [52]:
def get_brochure_user_prompt(company_name, url, max_chars=5_000):
    """
    Generates a structured user prompt for creating a company brochure.

    Args:
        company_name (str): The name of the company.
        url (str): The website URL of the company.
        max_chars (int | None): Maximum length of the returned prompt in
            characters; anything beyond it is cut off. Defaults to 5,000.
            Pass ``None`` to disable truncation.

    Returns:
        str: A well-formatted prompt containing company details for brochure generation.
    """
    user_prompt = f"You are looking at a company called: {company_name}\n"
    user_prompt += "Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\n"
    user_prompt += get_all_details(url)
    # Cap the prompt size; slicing with None returns the full string.
    return user_prompt[:max_chars]

In [53]:
def create_brochure(company_name, url):
    """
    Generate a brochure for a company and render it as Markdown in the notebook.

    Args:
        company_name (str): The name of the company.
        url (str): The website URL of the company.

    Returns:
        None: The brochure is displayed, not returned.
    """
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": get_brochure_user_prompt(company_name, url)},
    ]
    completion = openai.chat.completions.create(model=MODEL, messages=messages)
    brochure_markdown = completion.choices[0].message.content
    display(Markdown(brochure_markdown))

In [56]:
# Generate and render the brochure for Appwrite.
create_brochure("Appwrite", "https://appwrite.io/")

# Appwrite Brochure

## **Company Overview**
Welcome to **Appwrite** – the ultimate open-source backend server designed to simplify the development process for modern applications. With Appwrite, developers can build and scale robust, secure applications within minutes, empowering teams to work like they are hundreds strong. 

### **What Makes Us Unique?**
- **Open-Source Platform:** Fully customizable to fit your needs.
- **Flexible Integration:** Supports frameworks and languages of your choice.
- **Developer-Focused:** A strong community with rich support and resources.

## **Company Culture**
At Appwrite, we foster a collaborative and innovative work environment, driven by values that prioritize:

- **Community Engagement:** We believe in the power of collaboration and knowledge sharing.
- **Open Communication:** Our team ethos encourages open dialogue and feedback.
- **Supportive Learning:** Continuous growth and development opportunities for all employees.

### **Our Mission**
To empower developers around the world by providing tools that simplify backend processes, enabling them to focus on building user-centric applications.

## **Products & Services**
Appwrite offers a comprehensive suite of tools tailored for modern application development:

- **Auth:**  
  Secure authentication with over 30 login methods.

- **Databases:**  
  Scalable, robust, and customizable database solutions.

- **Storage:**  
  Advanced file management with encryption and image transformations.

- **Functions:**  
  Deploy and scale serverless functions effortlessly.

- **Messaging:**  
  A unified messaging service for real-time communication.

- **Realtime:**  
  Monitor live updates and events across your application.

## **Customers & Clients**
Appwrite serves a diverse range of developers, from startups to leading organizations. Our platform has garnered recognition and trust from:

- Several high-profile tech leaders and innovators.
- A thriving community with over 45.1K stars on GitHub.
- Engaged users in our Discord channel, boasting over 17k members.

## **Careers & Opportunities**
Join the exciting journey at Appwrite and be part of a team dedicated to innovation! 

### **Available Roles**
- Software Developers
- Community Managers
- Marketing Specialists

### **Work Benefits**
- **Flexible Working Hours:** Balance your work-life commitments.
- **Remote Work Options:** Work from anywhere while contributing to a global community.
- **Growth Opportunities:** Skill enhancement through workshops and community events.

### **Why Join Us?**
- Be part of an inspiring team working on groundbreaking technology.
- Contribute to an open-source platform loved by developers around the world.
- Experience a culture of support and collaboration.

---

**Start building your backend with Appwrite today!**  
Connect with us at [Appwrite Website](https://appwrite.io) | Follow us on [GitHub](https://github.com/appwrite) | Join our [Discord Community](https://discord.gg/appwrite)  

Let’s shape the future of app development together!

In [55]:
# Generate and render the brochure for Hugging Face.
create_brochure("HuggingFace", "https://huggingface.co")

# Hugging Face Brochure

---

## **Company Overview**
**Welcome to Hugging Face!**  
We are an AI community dedicated to shaping the future of machine learning. Our platform serves as the hub where individuals and organizations collaborate on developing models, datasets, and applications that drive innovation in the AI landscape.  

### **What Makes Us Unique?**
- **Open Collaboration:** We foster a culture of shared knowledge, allowing users to contribute and utilize community-driven resources.
- **Diverse Offerings:** From advanced models to extensive datasets, we provide tools that cater to various AI needs.
- **Accessibility:** Our interface is designed for users at all levels, ensuring that everyone can participate in AI development.

---

## **Company Culture**
At Hugging Face, our mission is clear: build an inclusive and supportive environment that empowers creators, researchers, and businesses alike.

### **Core Values**
- **Community-Driven:** We believe in the power of collaboration.
- **Innovation:** Continuous improvement and cutting-edge research are at our core.
- **Transparency:** Open access to our tools and resources is fundamental to our operations.

### **The Work Environment**
- **Flexible & Inclusive:** Emphasizing a work-life balance and diversity amongst team members.
- **Supportive Team:** Our culture encourages growth, experimentation, and knowledge sharing.

---

## **Products & Services**
### **Our Offerings Include:**
- **Models:** Access over 1 million models tailored for various machine learning applications.
- **Datasets:** Explore a vast repository of over 250,000 datasets to enhance your projects.
- **Spaces:** A platform for running applications and models seamlessly.
- **Enterprise Solutions:** Advanced options for organizations including GPU Compute and dedicated support.

---

## **Customers & Clients**
We proudly serve over **50,000 organizations**, including notable enterprises such as:
- **Meta**
- **Amazon Web Services**
- **Google**
- **Intel**
- **Microsoft**

This diverse clientele showcases our platform's capacity to meet the needs of both startups and established corporations.

---

## **Careers & Opportunities**
Join our dynamic team at Hugging Face! We are continuously looking for passionate individuals to contribute to our mission.

### **Current Job Openings:**
- Machine Learning Engineers
- Data Scientists
- DevOps Specialists

### **Why Work With Us?**
- **Competitive Benefits:** Enjoy comprehensive health plans, flexible working hours, and opportunities for professional development.
- **Impactful Work:** Contribute to projects that have a direct effect on the AI community.
- **Collaborative Atmosphere:** Work alongside leading experts and innovators in the field.

---

**Ready to join the AI revolution?**  
**Visit us at [Hugging Face](https://huggingface.co) to learn more, explore opportunities, or dive into our resources!** 

---

Let's build the future, together!  🌟 