# Company Brochure Generator
The idea is to create a brochure generator for companies. Once the company name and main website are provided, the tool scans the site and picks out the links and content most relevant to a brochure. With the help of prompt engineering and a simple Gradio interface, the goal is to build an easy-to-use tool that can generate polished, professional brochures directly from the company’s existing web content.

In [None]:
# imports

import os
import requests
import json
import re
from typing import List
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display, update_display
import ollama

For this brochure generator, we will use the Qwen3:4B model. 

To demonstrate the process, we will use Enroot Earth as a sample website. 

You can check out the website at https://www.enroot.earth/

In [None]:
# Constants

# Model name passed as `model=` to the ollama.chat() calls below.
MODEL = 'qwen3:4b'
# Landing page of the sample company used by the demo cells.
WEBSITE_URL = 'https://www.enroot.earth/'
# Company name inserted into the brochure prompt.
COMPANY_NAME = 'Enroot Earth'

In [None]:
# A class to scrape a webpage

# Some websites need you to use proper headers when fetching them:
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

from urllib.parse import urlparse, urljoin

class Website:
    """
    A utility class to scrape a website, with filtered links

    Attributes set by the constructor:
        url:   the URL that was fetched.
        body:  the raw response bytes.
        title: text of the <title> element, or "No title found".
        text:  text of <body> with script/style/img/input elements removed
               ("" when the page has no <body>).
        links: hrefs of the page's <a> elements after ``filter_links``.
    """

    # List of file extensions to exclude (common downloadable file types)
    EXCLUDED_EXTENSIONS = {".pdf", ".doc", ".docx", ".xls", ".xlsx", ".ppt", ".pptx", ".zip", ".rar", ".exe", ".dmg", ".tar", ".gz", ".7z", ".mp3", ".mp4", ".avi", ".mkv", ".iso"}

    # Hosts whose links are always kept. Matched against the whole host name
    # (or a parent domain of it), never as a substring.
    SOCIAL_MEDIA_DOMAINS = ("facebook.com", "twitter.com", "linkedin.com", "instagram.com", "t.me", "youtube.com")

    # Headers sent when the caller does not pass any. Bound here because the
    # ``headers`` parameter of __init__ shadows the module-level dict.
    DEFAULT_HEADERS = headers

    def __init__(self, url, headers=None, timeout=30):
        """
        Fetch ``url`` and extract its title, visible text and links.

        Args:
            url: Address of the page to fetch.
            headers: Optional request headers; ``DEFAULT_HEADERS`` is used
                when None so the browser-like User-Agent is actually sent.
            timeout: Seconds passed to ``requests.get`` so an unresponsive
                server cannot block forever.

        Raises:
            requests.RequestException: if the request fails or times out.
        """
        self.url = url
        if headers is None:
            headers = self.DEFAULT_HEADERS
        response = requests.get(url, headers=headers, timeout=timeout)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')

        # ``.string`` is None for an empty <title> or one with nested tags.
        title_text = soup.title.string if soup.title else None
        self.title = title_text if title_text else "No title found"

        if soup.body:
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""

        # Extract and filter links
        self.links = self.filter_links([link.get('href') for link in soup.find_all('a') if link.get('href')])

    def filter_links(self, links):
        """
        Filters out links pointing to downloadable files and keeps only web pages, emails,
        and social media links.

        Args:
            links: iterable of href strings (absolute or relative).

        Returns:
            list[str]: mailto and social-media links unchanged, every other
            kept link resolved against ``self.url``. Input order is preserved.
        """
        excluded_suffixes = tuple(self.EXCLUDED_EXTENSIONS)
        filtered_links = []
        for link in links:
            try:
                parsed_link = urlparse(link)
                # hostname is lower-cased and excludes any port / userinfo
                host = parsed_link.hostname or ""
            except ValueError:
                # Malformed href (e.g. a broken IPv6 literal): skip it
                continue

            # Keep email links
            if parsed_link.scheme == "mailto":
                filtered_links.append(link)
                continue

            # Keep social media links
            if any(host == domain or host.endswith("." + domain) for domain in self.SOCIAL_MEDIA_DOMAINS):
                filtered_links.append(link)
                continue

            # Drop non-web schemes such as tel:, javascript: or data:
            if parsed_link.scheme and parsed_link.scheme not in ("http", "https"):
                continue

            # Ignore links with excluded extensions
            if parsed_link.path.lower().endswith(excluded_suffixes):
                continue

            # Convert relative URLs to absolute URLs
            filtered_links.append(urljoin(self.url, link))

        return filtered_links

    def get_contents(self):
        """Return the page title and text formatted as one prompt-ready string."""
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

In [None]:
# Scrape the sample landing page and print the links kept by Website.filter_links.
web = Website(WEBSITE_URL)
web_links = web.links
print(web_links)

First, we'll use the Qwen3:4B model via Ollama to figure out which links on the website are relevant. We pass the model the list of links found on the page, and it decides which ones are most suitable for the brochure and returns them in a structured JSON format.

To help it respond accurately, we'll use a __"one-shot prompting"__ approach, where an example of the expected response is included directly in the prompt.

In [None]:
# System prompt for the link-selection call. It carries a one-shot example of
# the expected reply; the example has to be valid JSON itself, otherwise the
# model is being shown malformed syntax to imitate.
link_system_prompt = "You are provided with a list of links found on a webpage. \
You are able to decide which of the links would be most relevant to include in a brochure about the company, \
such as links to an About page, or a Company page, or Careers/Jobs pages.\n"
link_system_prompt += "You should respond in JSON as in this example:"
link_system_prompt += """
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
}
"""
print(link_system_prompt)

In [None]:
def get_links_user_prompt(website):
    """
    Build the user message for the link-selection call.

    Args:
        website: object exposing ``url`` (str) and ``links`` (list of str).

    Returns:
        str: an intro naming the site, the selection instructions, and the
        links listed one per line.
    """
    intro = f"Here is the list of links on the website of {website.url} - "
    # The runs of spaces between sentences are part of the prompt text and are
    # kept exactly as they were.
    instructions = (
        "Please decide which of these are relevant web links for a brochure about the company, "
        "    respond with the full https URL in JSON format. "
        "    Do not include Terms of Service, Privacy, email links.\n"
    )
    listing_header = "Links (some might be relative links):\n"
    listing = "\n".join(website.links)
    return "".join([intro, instructions, listing_header, listing])

In [None]:
def get_links(url):
    """
    Ask the model which links on the page at ``url`` belong in a brochure.

    The page is scraped with ``Website``, its links are sent to the model
    together with ``link_system_prompt``, and the reply is parsed as JSON.

    Args:
        url: Landing page whose links should be classified.

    Returns:
        dict: always shaped ``{"links": [<dict>, ...]}``. The list is empty
        when the reply contains no usable JSON, so callers can iterate the
        result without further checks.
    """
    website = Website(url)
    response = ollama.chat(
        model=MODEL,
        messages=[
            {"role": "system", "content": link_system_prompt},
            {"role": "user", "content": get_links_user_prompt(website)}
        ]
    )
    return _parse_links_response(response['message']['content'])


def _parse_links_response(raw):
    """Turn raw model output into a ``{"links": [...]}`` dict (empty list on failure)."""
    result = (raw or "").strip()

    # Remove <think>...</think> blocks
    result = re.sub(r"<think>.*?</think>", "", result, flags=re.DOTALL).strip()

    # Remove ```json and ``` if present
    result = re.sub(r"^```json|```$", "", result.strip(), flags=re.MULTILINE).strip()

    # Try the whole reply first; if the model wrapped the JSON in prose, fall
    # back to the outermost {...} span.
    candidates = [result]
    start, end = result.find("{"), result.rfind("}")
    if 0 <= start < end:
        candidates.append(result[start:end + 1])

    parsed = None
    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        break

    # A bare JSON list is accepted as the list of links.
    if isinstance(parsed, list):
        parsed = {"links": parsed}

    if not isinstance(parsed, dict) or not isinstance(parsed.get("links"), list):
        print("⚠️ Failed to parse JSON. Raw model output:\n", result)
        return {"links": []}

    # Downstream code calls .get("url") on every entry, so keep only objects.
    parsed["links"] = [entry for entry in parsed["links"] if isinstance(entry, dict)]
    return parsed

In [None]:
# Ask the model which links are brochure-relevant. Note that this rebinds
# web_links from the scraped list of hrefs to the parsed JSON that
# get_links() returns.
web_links = get_links(WEBSITE_URL)
print(web_links)

Now that we have the relevant links, we'll move on to generating the brochure. We'll start by extracting the content from each of the selected web pages. Once the content is ready, we'll ask the Qwen3:4B model to create a brochure using the gathered information.

In [None]:
def get_all_details(url, links_dict):
    """
    Collect the text of the landing page and of every selected link.

    Args:
        url: Landing page of the company.
        links_dict: Model output shaped like ``{"links": [{"url": ...}, ...]}``.
            Anything else is treated as "no extra links".

    Returns:
        str: "Landing page:" followed by ``Website.get_contents()`` of the
        landing page and of each linked page that could be fetched.

    Raises:
        requests.RequestException: only if the landing page itself cannot be
        fetched. Failures on individual links are reported and skipped.
    """
    result = "Landing page:\n"
    result += Website(url).get_contents()

    print("Found links:", links_dict)

    # Get the actual list of links from the dictionary
    links = links_dict.get("links") if isinstance(links_dict, dict) else None

    for link_info in links or []:
        link_url = link_info.get("url") if isinstance(link_info, dict) else None
        if not link_url:
            continue
        try:
            # The model may echo a relative link; resolve it against the
            # landing page (absolute URLs pass through urljoin unchanged).
            result += Website(urljoin(url, link_url)).get_contents()
        except requests.RequestException as err:
            # One dead or malformed link should not abort the whole brochure.
            print(f"Skipping {link_url}: {err}")

    return result

In [None]:
# Show the combined page contents that will be fed into the brochure prompt.
print(get_all_details(WEBSITE_URL, web_links))

In [None]:
# System message for the brochure-writing calls: sets the assistant's role,
# the intended audience and the output format.
system_prompt = (
    "You are an assistant that analyzes the contents of several relevant pages from a company website "
    "and creates a short brochure about the company for prospective customers, investors and recruits. "
    "Respond in markdown.Include details of company culture, customers and careers/jobs if you have the information."
)

In [None]:
def get_brochure_user_prompt(company_name, url, links, max_chars=5_000):
    """
    Build the user message for the brochure-writing call.

    Args:
        company_name: Name shown to the model as the company being described.
        url: Landing page of the company.
        links: Link selection as returned by ``get_links``.
        max_chars: Upper bound on the length of the returned prompt, applied
            to the whole prompt (instructions included). Defaults to 5,000;
            pass ``None`` to disable truncation.

    Returns:
        str: the instructions followed by the scraped page contents, cut to
        at most ``max_chars`` characters.
    """
    user_prompt = f"You are looking at a company called: {company_name}\n"
    user_prompt += "Here are the contents of its landing page and other relevant pages; \
    Use this information to build a short brochure of the company in markdown.\n"
    user_prompt += get_all_details(url, links)
    # Slicing with None returns the full string, so None means "no limit".
    return user_prompt[:max_chars]

In [None]:
# Preview the prompt for the sample company (as the last expression in a
# notebook cell, the returned string is shown as the cell output).
get_brochure_user_prompt(COMPANY_NAME, WEBSITE_URL, web_links)

In [None]:
def create_brochure(company_name, url, links):
    """
    Generate the brochure in a single, non-streaming model call.

    Args:
        company_name: Name of the company to describe.
        url: Landing page of the company.
        links: Link selection forwarded to ``get_brochure_user_prompt``.

    Returns:
        str: the model's reply with any <think>...</think> section removed
        and surrounding whitespace stripped.
    """
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": get_brochure_user_prompt(company_name, url, links)},
    ]
    reply = ollama.chat(model=MODEL, messages=conversation)
    raw_text = reply['message']['content'].strip()

    # Drop the model's reasoning section before returning the brochure.
    think_section = re.compile(r"<think>.*?</think>", flags=re.DOTALL)
    return think_section.sub("", raw_text).strip()

In [None]:
# Generate the brochure for the sample company and render it as Markdown.
brochure = create_brochure(COMPANY_NAME, WEBSITE_URL, web_links)
display(Markdown(brochure))

## A minor improvement
With a small adjustment, we can change this so that the results are streamed back, with the familiar typewriter animation.

In [None]:
def stream_brochure(company_name, url):
    """
    Generate the brochure and render it live in the notebook as it streams in.

    The relevant links are selected with ``get_links``, then the model reply
    is streamed and a single Markdown display is updated in place.

    Args:
        company_name: Name of the company to describe.
        url: Landing page of the company.

    Returns:
        None. The brochure is shown through the display handle.
    """
    links = get_links(url)
    stream = ollama.chat(
        model=MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": get_brochure_user_prompt(company_name, url, links)}
        ],
        stream=True
    )

    raw_text = ""
    shown = ""
    live_output = display(Markdown(""), display_id=True)

    for chunk in stream:
        raw_text += chunk.get("message", {}).get("content", "") or ""

        # Strip the reasoning from the accumulated text rather than per chunk:
        # a chunk may hold a tag together with real content, or both tags at
        # once. A block that is still open is hidden up to the end of the text.
        visible = re.sub(r"<think>.*?(?:</think>|\Z)", "", raw_text, flags=re.DOTALL).strip()

        # Nothing new to show while the model is still thinking.
        if visible != shown:
            shown = visible
            live_output.update(Markdown(shown))

In [None]:
# Stream a brochure for the sample company; the output updates in place.
stream_brochure(COMPANY_NAME, WEBSITE_URL)

## Finally
We'll build a simple and interactive user interface using Gradio. This will allow anyone to easily input a company name and website, and instantly generate a polished brochure. The goal is to make the entire process seamless and user-friendly, requiring no technical expertise from the end user.

In [None]:
import gradio as gr

In [None]:
def stream_brochure(company_name, url):
    """
    Generator variant of the brochure streamer, for use as a UI callback.

    Args:
        company_name: Name of the company to describe.
        url: Landing page of the company.

    Yields:
        str: the brochure text produced so far (cumulative, not a delta),
        with the model's <think>...</think> reasoning removed.
    """
    links = get_links(url)
    stream = ollama.chat(
        model=MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": get_brochure_user_prompt(company_name, url, links)}
        ],
        stream=True
    )
    raw_text = ""
    result_so_far = ""

    for chunk in stream:
        raw_text += chunk.get("message", {}).get("content", "") or ""

        # Strip the reasoning from the accumulated text rather than per chunk:
        # a chunk may hold a tag together with real content, or both tags at
        # once. A block that is still open is hidden up to the end of the text.
        visible = re.sub(r"<think>.*?(?:</think>|\Z)", "", raw_text, flags=re.DOTALL)

        # Only emit when there is something new to show.
        if visible != result_so_far:
            result_so_far = visible
            yield result_so_far

In [None]:
# Gradio front end: a row with an input column (company name, URL, button)
# and a wider output column that renders the brochure as Markdown.
with gr.Blocks(title="Company Brochure Generator") as demo:
    gr.Markdown("# Company Brochure Generator")
    
    with gr.Row():
        # Inputs; scale=1 here versus scale=2 for the output column.
        with gr.Column(scale=1):
            company_name = gr.Textbox(label="Company name:")
            url = gr.Textbox(label="Landing page URL (including http:// or https://)")
            generate_btn = gr.Button("Generate Brochure")
        
        with gr.Column(scale=2):
            output = gr.Markdown(label="Brochure:")
    
    # stream_brochure is a generator that yields the text accumulated so far,
    # so the Markdown output is meant to refresh as the reply streams in.
    # NOTE(review): streaming from a generator callback may depend on the
    # installed Gradio version's queue settings - confirm.
    generate_btn.click(
        fn=stream_brochure,
        inputs=[company_name, url],
        outputs=output
    )

# Start the app.
demo.launch()