In [1]:
import os
import requests
import json
from typing import List
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display, update_display
from openai import OpenAI

In [2]:
OLLAMA_API = "http://localhost:11434/api/chat"
HEADERS = {"Content-Type": "application/json"}
MODEL = "llama3.2"

In [3]:
# A class to represent a Webpage

# Some websites need you to use proper headers when fetching them:
headers = {
 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

class Website:
    """
    A utility class to represent a Website that we have scraped, now with links
    """

    def __init__(self, url):
        self.url = url
        response = requests.get(url, headers=headers)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')
        self.title = soup.title.string if soup.title else "No title found"
        if soup.body:
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""
        links = [link.get('href') for link in soup.find_all('a')]
        self.links = [link for link in links if link]

    def get_contents(self):
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

In [None]:
ed = Website("https://alammarcookware.com")
print(ed.get_contents())

Webpage Title:
Al Ammar Cook Ware – Al Ammar Cook Ware
Webpage Contents:
Home
Shop
Bulk Order
Contact Us
Login / Register
Search
0
items
/
₨
0
Menu
Login / Register
Discover Premium Cookware at Alammar
Explore our collection of high-quality steel cookware, engineered with precision and crafted for lasting performance. Durable, hygienic, and stylish—perfectly designed for every modern kitchen.
Shop now
Popular of The Week
The most popular products from the collection
-23%
Artisan Wave Glass
Premium Glasses
₨
1,149
–
₨
6,299
Select options
This product has multiple variants. The options may be chosen on the product page
Quick view
Add to wishlist
-9%
Classic Steel Glass
Premium Glasses
₨
3,500
Original price was: ₨ 3,500.
₨
3,199
Current price is: ₨ 3,199.
Select options
This product has multiple variants. The options may be chosen on the product page
Quick view
Add to wishlist
-23%
Curved Barrel Glass
Premium Glasses
₨
1,149
–
₨
6,299
Select options
This product has multiple variants. T

In [37]:

link_system_prompt = """You are provided with a list of links found on a webpage. You are able to decide which of the links would be most relevant to include in a brochure about the company, such as links to an About page, or a Company page, or Careers/Jobs pages.

CRITICAL: You must respond with valid JSON only. Follow these formatting rules strictly:
- Use double quotes for all strings
- No spaces around colons or within quotes
- No trailing commas
- Ensure all URLs are complete and properly formatted
- No extra spaces before or after URLs
- No line breaks within string values

You must respond in this EXACT JSON format:
{"links":[{"type":"about page","url":"https://full.url/goes/here/about"},{"type":"careers page","url":"https://another.full.url/careers"}]}

Example of correct response:
{"links":[{"type":"about page","url":"https://example.com/about"},{"type":"careers page","url":"https://example.com/careers"},{"type":"contact page","url":"https://example.com/contact"}]}

Return ONLY the JSON object, no additional text or explanations."""

In [38]:
print(link_system_prompt)

You are provided with a list of links found on a webpage. You are able to decide which of the links would be most relevant to include in a brochure about the company, such as links to an About page, or a Company page, or Careers/Jobs pages.

CRITICAL: You must respond with valid JSON only. Follow these formatting rules strictly:
- Use double quotes for all strings
- No spaces around colons or within quotes
- No trailing commas
- Ensure all URLs are complete and properly formatted
- No extra spaces before or after URLs
- No line breaks within string values

You must respond in this EXACT JSON format:
{"links":[{"type":"about page","url":"https://full.url/goes/here/about"},{"type":"careers page","url":"https://another.full.url/careers"}]}

Example of correct response:
{"links":[{"type":"about page","url":"https://example.com/about"},{"type":"careers page","url":"https://example.com/careers"},{"type":"contact page","url":"https://example.com/contact"}]}

Return ONLY the JSON object, no ad

In [39]:
def get_links_user_prompt(website: Website):
    user_prompt = f"Here is the list of links on the website of {website.url} - "
    user_prompt += "please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. \
Do not include Terms of Service, Privacy, email links.\n"
    user_prompt += "Links (some might be relative links):\n"
    user_prompt += "\n".join(website.links)
    return user_prompt

In [32]:
print(get_links_user_prompt(ed))

Here is the list of links on the website of https://alammarcookware.com/ - please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. Do not include Terms of Service, Privacy, email links.
Links (some might be relative links):
https://alammarcookware.com/
https://alammarcookware.com/shop/
https://alammarcookware.com/bulk-order/
https://alammarcookware.com/contact-us/
https://alammarcookware.com/
https://alammarcookware.com/my-account/
#
https://alammarcookware.com/cart/
#
https://alammarcookware.com/
https://alammarcookware.com/my-account/
https://alammarcookware.com/shop/
https://alammarcookware.com/product/artisan-wave-glass/
https://alammarcookware.com/product/artisan-wave-glass/
https://alammarcookware.com/product/artisan-wave-glass/
https://alammarcookware.com/product-category/glass/
https://alammarcookware.com/product/artisan-wave-glass/
https://alammarcookware.com/product/artisan-wave-glass/
https://alamm

In [45]:
import re

def clean_json_response(response_text):
    """Clean common JSON formatting issues"""
    if not response_text:
        return "{}"
    
    # Remove any text before the first {
    start_idx = response_text.find('{')
    if start_idx == -1:
        return "{}"
    
    # Remove any text after the last }
    end_idx = response_text.rfind('}')
    if end_idx == -1:
        return "{}"
    
    cleaned = response_text[start_idx:end_idx + 1]
    
    # Fix specific issues seen in your response:
    # 1. Fix spaced quotes like " url":" -> "url":
    cleaned = re.sub(r'"\s+(\w+)"\s*:\s*"', r'"\1":"', cleaned)
    
    # 2. Fix URLs with leading spaces like " https://..." -> "https://..."
    cleaned = re.sub(r':\s*"\s+([^"]*)"', r':"\1"', cleaned)
    
    # 3. Remove trailing commas before closing braces/brackets
    cleaned = re.sub(r',\s*}', '}', cleaned)
    cleaned = re.sub(r',\s*]', ']', cleaned)
    
    # 4. Fix any remaining whitespace issues in key-value pairs
    cleaned = re.sub(r'"\s*:\s*"', '":"', cleaned)
    
    return cleaned
def get_links(url):
    website = Website(url)
    ollama_via_openai = OpenAI(base_url='http://localhost:11434/v1', api_key='ollama')
    response = ollama_via_openai.chat.completions.create(
        # response_format={"type": "json_object"},
    model=MODEL,
    messages=[
            {"role": "system", "content": link_system_prompt},
            {"role": "user", "content": get_links_user_prompt(website)}
      ],
)
    result = response.choices[0].message.content
    result_cleaned = clean_json_response(result)
    return json.loads(result_cleaned)





In [20]:
alAmmar = Website("https://www.kryptomind.com")
alAmmar.links

['https://kryptomind.com',
 'https://kryptomind.com/services/',
 'https://kryptomind.com/portfolio/',
 'https://kryptomind.com/blogs/',
 '#',
 'https://kryptomind.com/our-team/',
 'https://kryptomind.com/careers/',
 'https://kryptomind.com/contact-us/',
 'https://kryptomind.com/blockchain-training-2/',
 'https://kryptomind.com/staff-augmentation/',
 'https://kryptomind.com/services/',
 'https://kryptomind.com/portfolio/',
 'https://kryptomind.com/blogs/',
 '#',
 'https://kryptomind.com/our-team/',
 'https://kryptomind.com/careers/',
 'https://kryptomind.com/contact-us/',
 'https://kryptomind.com/blockchain-training-2/',
 'https://kryptomind.com/staff-augmentation/',
 'https://kryptomind.com/get-a-quote/',
 'https://kryptomind.com/services/',
 'https://kryptomind.com/services/',
 'https://kryptomind.com/our-team/',
 'https://kryptomind.com/portfolio/',
 'https://kryptomind.com/portfolio/',
 'https://itrate.co/blockchain-developers/all',
 'https://www.goodfirms.co/company/kryptomind-llc'

In [34]:
get_links("https://www.kryptomind.com")

{'links': [{'type': 'Our Team', 'url': 'https://kryptomind.com/our-team/'},
  {'type': 'Careers/Jobs', 'url': 'https://kryptomind.com/careers/'},
  {'type': 'Contact Us', 'url': 'https://kryptomind.com/contact-us/'},
  {'type': 'Privacy Policy', 'url': 'https://kryptomind.com/privacy-policy/'},
  {'type': 'Portfolio', 'url': 'https://kryptomind.com/portfolio/'},
  {'type': 'Blogs', 'url': 'https://kryptomind.com/blogs/'}]}

In [46]:
def get_all_details(url):
    result = "Landing page:\n"
    result += Website(url).get_contents()
    links = get_links(url)
    print("Found links:", links)
    for link in links["links"]:
        result += f"\n\n{link['type']}\n"
        result += Website(link["url"]).get_contents()
    return result

In [47]:
print(get_all_details("https://www.kryptomind.com"))

Found links: {'links': [{'type': 'about page', 'url': 'https://kryptomind.com/about'}, {'type': 'careers page', 'url': 'https://kryptomind.com/careers'}, {'type': 'contact page', 'url': 'https://kryptomind.com/contact-us'}, {'type': 'privacy policy', 'url': 'https://kryptomind.com/privacy-policy'}, {'type': 'blockchain training', 'url': 'https://kryptomind.com/blockchain-training-2'}, {'type': 'staff augmentation', 'url': 'https://kryptomind.com/staff-augmentation'}, {'type': 'facebook page', 'url': 'https://www.facebook.com/kryptomindllc'}, {'type': 'twitter page', 'url': 'https://twitter.com/Kryptomindllc'}, {'type': 'youtube channel', 'url': 'https://www.youtube.com/channel/UCrSmEYax1hzppPkHg4pc-Bw'}, {'type': 'instagram page', 'url': 'https://www.instagram.com/kryptomindllc'}, {'type': 'behance profile', 'url': 'https://www.behance.net/kryptomind'}, {'type': 'linkedin company page', 'url': 'https://www.linkedin.com/company/kryptomindpvtltd'}]}
Landing page:
Webpage Title:
KryptoMin

In [48]:
system_prompt = "You are an assistant that analyzes the contents of several relevant pages from a company website \
and creates a short brochure about the company for prospective customers, investors and recruits. Respond in markdown.\
Include details of company culture, customers and careers/jobs if you have the information."

# Or uncomment the lines below for a more humorous brochure - this demonstrates how easy it is to incorporate 'tone':

# system_prompt = "You are an assistant that analyzes the contents of several relevant pages from a company website \
# and creates a short humorous, entertaining, jokey brochure about the company for prospective customers, investors and recruits. Respond in markdown.\
# Include details of company culture, customers and careers/jobs if you have the information."


In [49]:
def get_brochure_user_prompt(company_name, url):
    user_prompt = f"You are looking at a company called: {company_name}\n"
    user_prompt += f"Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\n"
    user_prompt += get_all_details(url)
    user_prompt = user_prompt[:5_000] # Truncate if more than 5,000 characters
    return user_prompt

In [50]:
get_brochure_user_prompt("Kryptomind", "https://www.kryptomind.com")

Found links: {'links': [{'type': 'about page', 'url': 'https://kryptomind.com/'}, {'type': 'careers page', 'url': 'https://kryptomind.com/careers/'}, {'type': 'contact page', 'url': 'https://kryptomind.com/contact-us/'}, {'type': 'privacy policy page', 'url': 'https://kryptomind.com/privacy-policy/'}, {'type': 'blog page', 'url': 'https://kryptomind.com/blogs/'}, {'type': 'our team page', 'url': 'https://kryptomind.com/our-team/'}, {'type': 'portfolio page', 'url': 'https://kryptomind.com/portfolio/'}, {'type': 'services page', 'url': 'https://kryptomind.com/services/'}, {'type': 'careers jobs pages', 'url': 'https://kryptomind.com/careers/'}, {'type': 'blockchain training page', 'url': 'https://kryptomind.com/blockchain-training-2/'}, {'type': 'staff augmentation page', 'url': 'https://kryptomind.com/staff-augmentation/'}, {'type': 'facebook page', 'url': 'https://www.facebook.com/kryptomindllc/'}, {'type': 'twitter handle', 'url': 'https://twitter.com/Kryptomindllc'}, {'type': 'youtu

'You are looking at a company called: Kryptomind\nHere are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\nLanding page:\nWebpage Title:\nKryptoMind – Web3 Development Company\nWebpage Contents:\nServices\nPortfolio\nBlogs\nAbout Us\nOur Team\nCareers\nContact Us\nBlockchain Training\nStaff Augmentation\nMenu\nServices\nPortfolio\nBlogs\nAbout Us\nOur Team\nCareers\nContact Us\nBlockchain Training\nStaff Augmentation\nGet a Quote\nTransparent Text with Border\nWhere\nBlockchain\nMeets\nArtificial Intelligence\nAnimation for Kryptomind\n/*Sides square*/\nFFFFFF\nWhere\nBlockchain\nmeets\nAI\n.\nExcelling in\nBlockchain\nArtificial\xa0Intelligence\nMetaverse\xa0&\xa0VR\nDeFi\xa0&\xa0dApps\nSmart\xa0Contracts\nWeb\xa0Applications\nMobile\xa0Applications\nDesktop\xa0Applications\nDevelopment\nExcelling in\nDevelopment\n100\n+\nProjects\n100\n+\nClients\n1\nY+\nExperience\n1\n+\nTeam\nBitcoin\nEthereum\nP

In [51]:
def stream_brochure(company_name, url):
    ollama_via_openai = OpenAI(base_url=OLLAMA_API, api_key='ollama')
    stream = ollama_via_openai.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": get_brochure_user_prompt(company_name, url)}
          ],
        stream=True
    )
    
    response = ""
    display_handle = display(Markdown(""), display_id=True)
    for chunk in stream:
        response += chunk.choices[0].delta.content or ''
        response = response.replace("```","").replace("markdown", "")
        update_display(Markdown(response), display_id=display_handle.display_id)

In [57]:
stream_brochure("Kryptomind", "https://www.kryptomind.com")

Found links: {'links': ['https://kryptomind.com', 'https://kryptomind.com/about', 'https://kryptomind.com/contact-us', 'https://kryptomind.com/careers', 'https://kryptomind.com/portfolio', 'https://kryptomind.com/services', 'https://itrate.co/blockchain-developers/all', 'https://www.goodfirms.co/company/kryptomind-llc', 'https://www.google.com/maps/place/1221+College+Park+Dr+%23116,+Dover,%20DE+19904,+USA/@39.1795855,-75.5535667,18z/data=!4m6!3m5!1s0x89c77b154afaaaab:0x8abc7d1812992f57!8m2!3d39.1794836!4d-75.5522792!16s%2Fg%2F11s664rww_?entry=ttu', 'https://kryptomind.com/privacy-policy', 'https://kryptomind.com/blogs/', 'https://kryptomind.com/our-team/', 'tel:+1 914 290 4987', 'https://www.google.com/maps/place/KryptoMind+(Pvt)+Ltd./@31.4656057,74.257514,15z/data=!4m5!3m4!1s0x0:0xfe5b07aae7b0bc08!8m2!3d31.4656236!4d74.2574941', 'tel:+971 54 3297866', 'https://www.google.com/maps/place/KryptoMind+(Pvt)+Ltd./@31.4656057,74.257514,15z/data=!4m5!3m4!1s0x0:0xfe5b07aae7b0bc08!8m2!3d31.4656

TypeError: string indices must be integers, not 'str'