In [46]:
# general imports
import os
import requests
import ollama
import json
from typing import List
from bs4 import BeautifulSoup
from IPython.display import Markdown, display, update_display
import gradio as gr

In [31]:
# Ollama model name, passed as `model=` to every ollama.chat call in this notebook
model: str = "llama3.2"

In [32]:
# here we fetch a web page and return a tuple with the page content and a list of all its links
def webPage(url, timeout=30):
    """Download a web page and return its readable text together with its links.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server, forwarded to ``requests.get``
            so that an unresponsive site cannot block the notebook forever.

    Returns:
        A tuple ``(content, webLinks)``:
        * ``content`` is a string with the page title followed by the text of
          ``<body>`` (``script``, ``style``, ``img`` and ``input`` elements are
          removed first); the text part is empty when the page has no body.
        * ``webLinks`` is the list of non-empty ``href`` values of every ``<a>``
          tag, in document order. Values are returned as written in the page,
          so they may be relative.

    Raises:
        Whatever ``requests.get`` raises (for example on a connection failure
        or timeout) propagates to the caller.
    """
    header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"}

    response = requests.get(url, headers=header, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    # `.string` is None for an empty <title> (or one with nested tags); treat
    # that like a missing title instead of printing the literal text "None".
    webTitle = soup.title.string if soup.title and soup.title.string else "No title found"

    if soup.body:
        # Drop elements that carry no readable text before extracting it.
        for irrelevant in soup.body(["script", "style", "img", "input"]):
            irrelevant.decompose()
        webText = soup.body.get_text(separator="\n", strip=True)
    else:
        webText = ""

    # Anchors without an href yield None; keep only usable values so callers
    # can safely join or iterate over the result.
    hrefs = (anchor.get('href') for anchor in soup.find_all('a'))
    webLinks = [href for href in hrefs if href]
    return f"Webpage Title:\n{webTitle}\nWebpage Contents:\n{webText}\n\n", webLinks

In [33]:
# the system prompt, in which we define the exact JSON format we are expecting
# System prompt for the link-selection call. The JSON example below is the
# schema the rest of the notebook relies on (a "links" list whose items have
# "type" and "url"), so it must itself be valid JSON.
link_system_prompt = (
    "You are provided with a list of links found on a webpage. "
    "You are able to decide which of the links would be most relevant to include in a brochure about the company, "
    "such as links to an About page, or a Company page, or Careers/Jobs pages.\n"
    "You should respond in JSON as in this example:"
    """
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
}
"""
)

In [34]:
# this is the user prompt: we pass in the URL, collect the links found on that page,
# and ask the AI to decide which of them are the most relevant for the goal set in the
# system prompt, i.e. a company brochure
def get_links_user_prompt(url):
    """Build the user prompt asking the model to pick brochure-relevant links.

    Args:
        url: Address of the website whose links should be listed.

    Returns:
        A string containing the instructions followed by the links found on
        the page, one per line.
    """
    _, links = webPage(url)
    # An <a> tag without href produces None; str.join would raise TypeError on
    # it, so keep only non-empty entries.
    usable_links = [link for link in links if link]

    prompt = f"Here is the list of links on the website of {url} - "
    prompt += "please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. \
Do not include Terms of Service, Privacy, email links.\n"
    prompt += "Links (some might be relative links):\n"
    prompt += "\n".join(usable_links)
    return prompt

In [35]:
# the Ollama call that returns the relevant links as parsed JSON
def get_links(url):
    """Ask the model which links on ``url`` are worth using, and decode its reply.

    The link-selection system prompt and the user prompt built for ``url`` are
    sent to ``ollama.chat`` with JSON output requested; the content of the
    returned message is parsed with ``json.loads`` and handed back as-is.
    """
    conversation = [
        {"role": "system", "content": link_system_prompt},
        {"role": "user", "content": get_links_user_prompt(url)},
    ]
    reply = ollama.chat(model=model, messages=conversation, format='json')
    raw_json = reply["message"]["content"]
    return json.loads(raw_json)

In [36]:
# here we create the entire content we will be passing to the AI:
# the main page along with the content of all its relevant links
def get_all_details(url):
    """Collect the text of the landing page and of every relevant sub-page.

    Args:
        url: Address of the company's landing page.

    Returns:
        A single string starting with ``"Landing page:\\n"`` and the landing
        page content, followed by one section per relevant link (the link's
        ``type`` as heading, then that page's content). Links that cannot be
        processed are left out entirely.
    """
    result = "Landing page:\n"
    mainPageContent, _ = webPage(url)
    result += mainPageContent

    links = get_links(url)
    # The model's JSON is not guaranteed to follow the requested schema; fall
    # back to "no sub-pages" instead of failing on a missing/odd "links" key.
    relevantLinks = links.get("links") if isinstance(links, dict) else None
    if not isinstance(relevantLinks, list):
        relevantLinks = []

    for link in relevantLinks:
        try:
            heading = f"\n\n{link['type']}\n"
            subPageContent, _ = webPage(link["url"])
        except Exception:
            # Deliberate best-effort: a malformed entry or an unreachable page
            # must not abort the whole brochure, so this link is skipped.
            continue
        # Append only after the fetch succeeded, so a failed page does not
        # leave a heading with no content behind.
        result += heading + subPageContent
    return result

In [37]:
# depending on the tone we get a different answer
# System prompt for the brochure-writing call. Adjacent literals are used
# instead of backslash continuations so the spacing between sentences is
# explicit (previously "markdown." ran straight into "Include").
system_prompt = (
    "You are an assistant that analyzes the contents of several relevant pages from a company website "
    "and creates a short brochure about the company for prospective customers, investors and recruits. Respond in markdown. "
    "Include details of company culture, customers and careers/jobs if you have the information."
)

# Or uncomment the lines below for a more humorous brochure - this demonstrates how easy it is to incorporate 'tone':

# system_prompt = "You are an assistant that analyzes the contents of several relevant pages from a company website \
# and creates a short humorous, entertaining, jokey brochure about the company for prospective customers, investors and recruits. Respond in markdown.\
# Include details of company culture, customers and careers/jobs if you have the information."

In [38]:
def user_prompt(company_name, url, max_chars=5_000):
    """Build the user prompt for the brochure-writing call.

    Args:
        company_name: Name of the company, mentioned at the top of the prompt.
        url: Landing page whose content (plus relevant sub-pages) is appended.
        max_chars: Maximum length of the returned prompt; anything beyond it
            is cut off. ``None`` disables truncation.

    Returns:
        The prompt text, at most ``max_chars`` characters long.
    """
    # A distinct local name avoids shadowing this function inside its own body.
    prompt = f"You are looking at a company called: {company_name}\n"
    prompt += "Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\n"
    prompt += get_all_details(url)
    # Truncate so that very large websites do not produce an oversized prompt.
    return prompt[:max_chars]

In [39]:
# streaming the response and re-rendering it as Markdown on every chunk
# gives a stream-like experience in the Jupyter notebook
def createBrochureAndStreamMarkdown(company,url):
    """Generate a brochure for ``company`` and render it live as Markdown.

    The brochure is requested from ``ollama.chat`` in streaming mode; after
    every received chunk the text collected so far is re-rendered in the same
    notebook output area. Nothing is returned.

    Args:
        company: Company name used in the user prompt.
        url: Landing page of the company's website.
    """
    streamingResponse = ollama.chat(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt(company,url)}
        ],
        stream=True
    )

    rawResponse = ""
    display_handle = display(Markdown(""), display_id=True)
    for chunk in streamingResponse:
        rawResponse += chunk.get('message', {}).get('content', '') or ''
        # Strip code-fence markers only. Cleaning is recomputed from the raw
        # text each time so a "```markdown" fence split across chunks is still
        # recognised, while the word "markdown" inside the brochure is kept.
        response = rawResponse.replace("```markdown", "").replace("```", "")
        update_display(Markdown(response), display_id=display_handle.display_id)

In [44]:
#createBrochureAndStreamMarkdown("", "")

In [48]:
# iface = gr.Interface(fn=createBrochureAndStreamMarkdown,
#                      inputs=[gr.Text(), gr.Text()], 
#                      outputs=gr.MultimodalTextbox())
# iface.launch()