In [204]:
# Import the libraries 
from openai import OpenAI
import os 
from dotenv import load_dotenv 
import requests
from bs4 import BeautifulSoup 
import json 
from IPython.display import Markdown, display, update_display


In [18]:
# Configuration: read settings from a local .env file into the process
# environment, then look up the OpenAI key from that environment.
load_dotenv(override=True)
api_key = os.environ.get('OPENAI_API_KEY')

# Model name sent with every chat-completion request in this notebook, and
# the client object those requests go through.
MODEL = 'gpt-4o-mini'
openai = OpenAI()

In [183]:
# A class to represent a Webpage

# Some websites need you to use proper headers when fetching them:
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

class Website:
    """
    A scraped web page: its title, visible body text and outgoing links.

    Attributes:
        url: The address that was requested.
        body: Raw bytes of the HTTP response.
        title: Text of the <title> tag, or "No title found" when the page
            has no usable title.
        text: Text of <body> with script, style, img and input elements
            removed; "" when the page has no <body>.
        links: Every non-empty href found on an <a> tag, exactly as written
            in the page (so entries may be relative).
    """
    def __init__(self, url, timeout=30):
        """
        Fetch and parse `url`.

        Args:
            url: Address of the page to download.
            timeout: Seconds to wait for the server before giving up.
                requests has no default timeout, so without one an
                unresponsive host would block the notebook indefinitely.

        Raises:
            requests.RequestException: If the page cannot be fetched
                (including when the timeout expires).
        """
        self.url = url
        response = requests.get(url, headers=headers, timeout=timeout)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')
        # soup.title.string is None when <title> is empty or wraps other
        # tags; fall back to the placeholder rather than storing None.
        page_title = soup.title.string if soup.title else None
        self.title = page_title if page_title else "No title found"
        if soup.body:
            # Drop elements that carry no readable text before extracting it.
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""
        hrefs = (anchor.get('href') for anchor in soup.find_all('a'))
        self.links = [href for href in hrefs if href]

    def get_contents(self):
        """Return the title and text formatted as one block for use in a prompt."""
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"
# Example usage: scrape the BBC home page. Constructing Website performs a
# live HTTP request, so this cell needs network access when it runs.
link = 'https://www.bbc.com'
Web = Website(link) 
# Uncomment any of the lines below to inspect what was scraped:
#print(Web.title) 
#print(Web.text) 
#print(Web.get_contents())
        
        

In [184]:
# System prompt for the link-filtering request: describes the task and shows
# the model the JSON shape expected in its answer.
system_prompt_link_filtering = (
    "You are an intelligent assistant. You will be given a list of URLs from a specific website. "
    "Your task is to select the links that are most useful for creating a brochure that represents the website's purpose, offerings, and key content. "
    "Focus on links that highlight the company's services, products, mission, contact information, or any visually rich or informative sections. "
    "You should respond in JSON as in this example:"
)
# The example must itself be valid JSON, since it is the template the model
# is asked to imitate (each link object is {"type": ..., "url": ...}).
system_prompt_link_filtering += """
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
}
"""

# Define a user-prompt for link filtering
def user_prompt_link_filtering(link):
    """Build the user message asking the model to pick brochure-worthy links.

    Scrapes `link` with Website and embeds the page title, the page address
    and the scraped hrefs in the prompt. The address is included because the
    hrefs are passed through as written in the page and may be relative,
    while the prompt asks the model to answer with full URLs.

    Args:
        link: URL of the company's landing page.

    Returns:
        The prompt text as a single string.
    """
    website = Website(link)

    return (
        f"We are creating a brochure for the company titled: '{website.title}'.\n\n"
        "Your task is to select the most relevant and helpful links from the list below to assist in designing the brochure.\n"
        "Focus on pages that showcase the company's mission, services, products, key information, and visually rich or promotional content.\n\n"
        f"The links were collected from {website.url}; some of them may be relative, so resolve those against that address.\n"
        "respond with the full https URL in JSON format. Do not include Terms of Service, Privacy, email links.\n"
        f"Here are the available links:\n{website.links}\n\n"
        "Please choose the links that would be most helpful for this purpose."
    )


In [185]:
def message_link_filtering(link):
    """Assemble the chat messages for the link-filtering request.

    Args:
        link: URL of the page whose links are to be filtered.

    Returns:
        A two-item list: the system message, then the user message built
        for `link`.
    """
    system_message = {'role': 'system', 'content': system_prompt_link_filtering}
    user_message = {'role': 'user', 'content': user_prompt_link_filtering(link)}
    return [system_message, user_message]

# Example usage: 
#link = 'https://www.bbc.com'
#message_link_filtering(link)

In [207]:
def filtered_links(link):
    """Ask the model which links on the page at `link` belong in a brochure.

    Args:
        link: URL of the page whose links should be filtered.

    Returns:
        The model's reply decoded from JSON.
    """
    request = {
        'model': MODEL,
        'messages': message_link_filtering(link),
        # Request a JSON object so the reply can be decoded below.
        'response_format': {'type': 'json_object'},
    }
    completion = openai.chat.completions.create(**request)
    reply = completion.choices[0].message.content
    return json.loads(reply)

#links = filtered_links(link)
#links 

In [202]:
def get_all_contents(url):
    """Collect the text of the landing page and of each model-selected subpage.

    The model's JSON reply is not schema-checked anywhere, so this function
    tolerates a missing "links" list and entries lacking "url" or "type".
    Selected URLs are resolved against `url`, so a relative answer such as
    "/about" still works, and a subpage that cannot be fetched is skipped
    with a printed notice instead of aborting the whole brochure.

    Args:
        url: Address of the company's landing page.

    Returns:
        One string: the landing page contents, followed by a section per
        successfully fetched subpage, each headed by the link's type.

    Raises:
        requests.RequestException: If the landing page itself cannot be fetched.
    """
    # Local import keeps this cell self-contained; urljoin is standard library.
    from urllib.parse import urljoin

    result = "Landing page:\n"
    result += Website(url).get_contents()
    links = filtered_links(url)
    entries = links.get("links", []) if isinstance(links, dict) else []
    for link in entries:
        if not isinstance(link, dict) or not link.get("url"):
            continue
        # urljoin returns absolute URLs unchanged and resolves relative ones.
        page_url = urljoin(url, link["url"])
        try:
            page = Website(page_url)
        except requests.RequestException as error:
            print(f"Skipping {page_url}: {error}")
            continue
        result += f"\n\n{link.get('type', 'page')}\n"
        result += page.get_contents()
    return result

In [192]:
# System prompt for the brochure-writing request. Adjacent string literals are
# used instead of backslash continuations so that every sentence break keeps
# its space (the continuation form joined "markdown." and "Include").
system_prompt = (
    "You are an assistant that analyzes the contents of several relevant pages from a company website "
    "and creates a short brochure about the company for prospective customers, investors and recruits. Respond in markdown. "
    "Include details of company culture, customers and careers/jobs if you have the information."
)

# Or uncomment the lines below for a more humorous brochure - this demonstrates how easy it is to incorporate 'tone':

# system_prompt = (
#     "You are an assistant that analyzes the contents of several relevant pages from a company website "
#     "and creates a short humorous, entertaining, jokey brochure about the company for prospective customers, investors and recruits. Respond in markdown. "
#     "Include details of company culture, customers and careers/jobs if you have the information."
# )


In [196]:
def get_brochure_user_prompt(company_name, url, max_chars=5_000):
    """Build the user message for the brochure-writing request.

    Args:
        company_name: Name of the company, quoted in the prompt's first line.
        url: Landing page address; its contents and those of the selected
            subpages are appended via get_all_contents.
        max_chars: Maximum length of the returned prompt. The text is cut
            off at this many characters; pass None to disable truncation.
            Defaults to 5,000.

    Returns:
        The prompt text, at most `max_chars` characters long.
    """
    user_prompt = f"You are looking at a company called: {company_name}\n"
    user_prompt += "Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\n"
    user_prompt += get_all_contents(url)
    # Slicing with None as the bound leaves the string untouched.
    return user_prompt[:max_chars]

#get_brochure_user_prompt('bbc_news',link)

In [200]:
def create_brochure(company_name, url):
    """Generate a brochure for a company and render it as Markdown.

    Args:
        company_name: Name of the company, passed on to the user prompt.
        url: Address of the company's landing page.

    Returns:
        None. The model's reply is shown with IPython's display().
    """
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": get_brochure_user_prompt(company_name, url)},
    ]
    completion = openai.chat.completions.create(model=MODEL, messages=conversation)
    brochure_markdown = completion.choices[0].message.content
    display(Markdown(brochure_markdown))
    

In [203]:
# Build and render a brochure for Hugging Face. Running this cell fetches pages over HTTP and sends requests to the OpenAI API.
create_brochure("HuggingFace", "https://huggingface.co")

# Hugging Face Company Brochure

## Welcome to Hugging Face

At **Hugging Face**, we are building the future of artificial intelligence through community collaboration and innovative technologies aimed at enhancing machine learning. Join us in our mission to create an inclusive and collaborative environment that welcomes everyone from researchers and engineers to businesses and hobbyists.

---

## Our Offerings

### **Explore AI Models & Datasets**
- **Models**: Access over **1 million models** from leading developers to leverage in your AI projects, spanning various modalities - text, image, video, and more.
- **Datasets**: Browse through more than **250,000 datasets** tailored for machine learning tasks.
- **Spaces**: Discover applications created by the community with capabilities to generate any application or interact with AI agents.

### **Enterprise Solutions**
Hugging Face provides robust enterprise-grade solutions with features like:
- Advanced security protocols
- Dedicated support
- Access controls
- Optimized compute resources starting at $20/user/month

Trusted by over **50,000 organizations**, including major players such as Google, Amazon, Microsoft, and Grammarly.

---

## Company Culture

At Hugging Face, we foster a **collaborative and inclusive culture** that prioritizes open-source development and community engagement. We value transparency, innovation, and creativity in tackling challenges within the machine learning space. We believe that AI should be accessible to all, and we actively encourage contributions from developers and practitioners worldwide.

---

## Join Us! 

### **Careers**
Looking to make an impact in the AI community? Hugging Face is always on the lookout for passionate individuals to join our team. Explore opportunities across various functions including engineering, product management, and community relations on our **Jobs** page.

### **Community Engagement**
Connect with like-minded individuals via our [forum](https://huggingface.co/forum), or follow us on social media platforms like [Twitter](https://twitter.com/huggingface), [LinkedIn](https://www.linkedin.com/company/huggingface/), and [Discord](https://discord.com/invite/huggingface) to stay updated on the latest trends, blog posts, and community events.

---

## Get Involved

Whether you're an AI enthusiast, a business seeking enterprise solutions, or a developer eager to contribute to the open-source landscape, we welcome you to explore and engage with us. Together, let's shape the future of AI!

**Sign Up** today at [Hugging Face](https://huggingface.co) and take your first step towards innovating the world of machine learning.

In [206]:
# Build and render a brochure for BBC News. Running this cell fetches pages over HTTP and sends requests to the OpenAI API.
create_brochure("BBC NEWS", "https://www.bbc.com/")

# BBC News Brochure

---

## Welcome to BBC News

The British Broadcasting Corporation, known as BBC, is a renowned global news organization providing trusted news coverage, in-depth analysis, and engaging features across various sectors. Our commitment to delivering accurate and impartial journalism has solidified our role as a leader in the media landscape.

---

## What We Cover

At BBC News, we bring you coverage from around the globe, including:

- **Breaking News**: Stay informed with the latest updates on critical global events.
- **International News**: In-depth reporting from regions such as Africa, Asia, Europe, and the Americas.
- **Sports**: Up-to-date news and live coverage of major sporting events, including the Premier League and international competitions.
- **Business & Innovation**: Insights into market trends, banking, technology, and sustainable practices affecting businesses today.
- **Culture & Arts**: Features on music, film, literature, and artistic expressions from diverse cultures worldwide.

---

## Company Culture

At the BBC, we foster a culture rooted in integrity, creativity, and inclusivity. Our team is comprised of diverse individuals from various backgrounds, who collaborate to bring unparalleled news insights to our audiences. We prioritize the well-being of our employees and encourage professional growth through ongoing training and developmental opportunities.

---

## Join Our Team

Are you passionate about journalism and media? The BBC is always on the lookout for dedicated individuals to join our vibrant team. Whether you are an experienced journalist, a digital media expert, or are interested in supporting roles, there is a place for you at the BBC. Visit our Careers page to explore current job openings and team initiatives.

---

## Connect With Us

Stay updated with the latest from BBC News by following us on our social media platforms and subscribing to our newsletters. Your source for trust, innovation, and world-class journalism is right here—welcome to the BBC!

---

**Contact Us:**  
Email: info@bbc.co.uk  
Website: [bbc.co.uk/news](https://www.bbc.co.uk/news)  
Social Media: [Twitter](https://twitter.com/BBCNews) | [Facebook](https://www.facebook.com/bbcnews) | [Instagram](https://www.instagram.com/bbcnews)  

--- 

We look forward to welcoming you to the BBC community!

# 📄 Web Brochure Assistant using OpenAI GPT

This project creates a **smart assistant** that builds brochures for companies based on the structure and content of their websites. It uses web scraping, prompt engineering, and the OpenAI GPT model to intelligently select which web pages are most useful for a brochure.

---

## 🔍 What It Does

Given a company URL, this system:

1. Scrapes the main content and all subpage links using `BeautifulSoup`.
2. Crafts a **system prompt** and a **user prompt** with structured instructions.
3. Calls the **OpenAI API** with those prompts to:
   - Understand the site context.
   - Choose only the most relevant subpages (e.g., About, Services, Contact).
4. Optionally gathers content from the selected links.
5. Constructs a brochure prompt or markdown-ready summary using the filtered information.

---

## 💡 Key Concepts

### 🤖 Agentic Thinking

This architecture mirrors the **agentic workflow**:
- The assistant receives context (`Website` class).
- It makes decisions (`link filtering`).
- It takes actions (`fetch content`, `build brochure`).
This is similar to how **AI agents** plan and act across multiple steps.

### 📬 OpenAI API Usage

The system sends **chat messages** to GPT, one system message and one user message per request:
- `system_prompt`: Sets behavior (e.g., "You are a helpful assistant for filtering links").
- `user_prompt`: Gives specific input (e.g., site links and goal).

Each brochure takes two OpenAI API calls:
- One to choose the relevant links after the site is parsed.
- One to generate the final brochure summary.

Content for each selected subpage is gathered with ordinary HTTP requests, not API calls.

You can choose between:
- **One-shot prompting**: Provide a single example of the expected output, as the link-filtering prompt does (fast, concise).
- **Few-shot prompting**: Provide multiple examples for richer understanding (more robust).

---

## 🧰 How to Use

1. Clone this repo and add your OpenAI API key to a `.env` file:
    ```env
    OPENAI_API_KEY=your_key_here
    ```

2. Run the notebook:
    ```python
    from openai import OpenAI
    from Website import Website

    get_brochure_user_prompt("BBC", "https://www.bbc.com")
    ```

3. Modify the prompt, model (e.g., `gpt-4o`), or link parser as needed.

---

## 📈 Future Extensions

- Add visual brochure generation using HTML/CSS.
- Integrate vector search for knowledge-grounded generation.
- Use LangChain/Autogen for autonomous multi-step workflows.
- Add caching and retries for stability.

---

## 💼 Positioning

This tool can serve as:
- A **sales assistant** for agencies building company brochures.
- A **content strategy generator** for marketing teams.
- A **scouting tool** for partnerships and investor research.
- A **knowledge organizer** for product managers.

---

## 👨‍💻 Requirements

- Python 3.8+
- `openai`, `beautifulsoup4`, `requests`, `python-dotenv`

---

## 📣 Contact

For questions, suggestions, or collaboration ideas, reach out to [Your Name] at [your-email].


## 🧠 Creative Applications of This Architecture

Here are several **creative reuses** of this architecture in different contexts:

1. **Investor Brief Generator**  
   Instead of brochures, filter links and content relevant for an investor pitch—e.g., financials, team, roadmap.

2. **Job Application Analyzer**  
   Feed a job description URL, and filter the company’s pages to generate a tailored cover letter with OpenAI.

3. **Academic Profile Builder**  
   Use university department pages to filter professor/research pages and generate a summary for students.

4. **Real Estate Guide**  
   Input a city/neighborhood website, and the system filters useful pages (schools, amenities, transport) for house hunters.

5. **Conference Preparation Agent**  
   Given a conference website, generate brochures or summaries for attendees by filtering schedules, keynotes, and venue info.

6. **Competitor Research Assistant**  
   Scrape a competitor’s site and summarize the offerings, USPs, and positioning using filtered pages.

