# Business Challenge
- Create a product that builds a Brochure for a company to be used for prospective clients, investors and potential recruits
- provide a company name and their primary website

In [1]:
import os
import requests
import json
from typing import List
from bs4 import BeautifulSoup
from IPython.display import Markdown, display, update_display 
from openai import OpenAI

In [29]:
#initializing constants
llama_url = 'http://localhost:11434/v1'
llama_api = "http://localhost:11434/api/chat"
Headers = {"Content-Type": "application/json"}
Model = "llama3.1"

In [10]:
# Utility Class - Expand on website class from ollam website summary project
class Website:

    def __init__(self, url):
        self.url = url

        response = requests.get(url)
        #return content from url 
        self.body = response.content
        soup = BeautifulSoup(response.content, 'html.parser')

        self.title = soup.title.string if soup.title else "No Title"

        if soup.body:
            # remove unneeded objects from scrape for summarization
            for irrelevant_objects in soup.body(["script", "style", "img", "input"]):
                irrelevant_objects.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""
        links = [link.get('href') for link in soup.find_all('a')]
        self.links = [link for link in links if link] #Build a new list with only the truthy (non-empty, non-None, non-zero) items from links

    def get_content(self):
        return f"Webpage Title:\n{self.title}\n Webpage Contents:\n{self.text}\n\n"

In [24]:
anthropic_website = Website("https://www.anthropic.com/")
anthropic_website.links

['#main',
 '#footer',
 'https://www.anthropic.com/',
 'https://www.anthropic.com/claude',
 'https://www.anthropic.com/claude-code',
 'https://www.anthropic.com/max',
 'https://www.anthropic.com/team',
 'https://www.anthropic.com/enterprise',
 'https://www.anthropic.com/pricing',
 'https://claude.ai/download',
 'https://claude.ai/',
 'https://www.anthropic.com/news/claude-character',
 'https://www.anthropic.com/api',
 'https://docs.anthropic.com/',
 'https://www.anthropic.com/pricing#api',
 'https://console.anthropic.com/',
 'https://docs.anthropic.com/en/docs/welcome',
 'https://www.anthropic.com/solutions/agents',
 'https://www.anthropic.com/solutions/code-modernization',
 'https://www.anthropic.com/solutions/coding',
 'https://www.anthropic.com/solutions/customer-support',
 'https://www.anthropic.com/solutions/education',
 'https://www.anthropic.com/solutions/financial-services',
 'https://www.anthropic.com/solutions/government',
 'https://www.anthropic.com/customers',
 'https://www.

### Links System Prompts

Use a call to llama to read the links on a webpage, and respond in structured JSON.  
It should decide which links are relevant, and replace relative links such as "/about" with "https://company.com/about".  
We will use "one shot prompting" in which we provide an example of how it should respond in the prompt.

This is an excellent use case for an LLM, because it requires nuanced understanding. Imagine trying to code this without LLMs by parsing and analyzing the webpage - it would be very hard!

Sidenote: there is a more advanced technique called "Structured Outputs" in which we require the model to respond according to a spec. 

In [18]:
# System Prompts
links_system_prompt = "You are provided with a list of links found on a webpage. \
You are able to decide which of the links would be most relevant to include in a brochure about the company, \
such as links to an About page, or a Company page, or Careers/Jobs pages.\n"
links_system_prompt += "You should respond in JSON as in this example:"
links_system_prompt += """
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
}
"""

In [19]:
# Create User Prompt
def get_links_for_user_prompt(website):
    user_prompt = f"Here are a list of links for {website.url}"
    user_prompt += "please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. \
    Do not include Terms of Service, Privacy, email links.\n"
    user_prompt += "Links (some might be relative links):\n"
    user_prompt += "\n".join(website.links)
    return user_prompt

In [23]:
print(get_links_for_user_prompt(anthropic_website))

Here are a list of links for https://www.anthropic.com/please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format.     Do not include Terms of Service, Privacy, email links.
Links (some might be relative links):
#main
#footer
https://www.anthropic.com/
https://www.anthropic.com/claude
https://www.anthropic.com/claude-code
https://www.anthropic.com/max
https://www.anthropic.com/team
https://www.anthropic.com/enterprise
https://www.anthropic.com/pricing
https://claude.ai/download
https://claude.ai/
https://www.anthropic.com/news/claude-character
https://www.anthropic.com/api
https://docs.anthropic.com/
https://www.anthropic.com/pricing#api
https://console.anthropic.com/
https://docs.anthropic.com/en/docs/welcome
https://www.anthropic.com/solutions/agents
https://www.anthropic.com/solutions/code-modernization
https://www.anthropic.com/solutions/coding
https://www.anthropic.com/solutions/customer-support
https://www.

In [30]:
#function that puts everything together and gets links - remember I can use OpenAI to call llama3.1
def get_links(url):
    website = Website(url)
    #calling local llama3.1 model
    ollama_via_openai = OpenAI(base_url=llama_url, api_key="ollama")

    response = ollama_via_openai.chat.completions.create(
        model= Model,
        messages=[
            {"role":"system", "content":links_system_prompt},
            {"role":"user", "content":get_links_for_user_prompt(website)}
            ],
        response_format= {"type":"json_object"}
    )

    results = response.choices[0].message.content
    return json.loads(results)



In [31]:
#using huggingface.co try get_links
get_links("https://huggingface.co/")

{'links': [{'type': 'home page', 'url': '/'},
  {'type': 'About page', 'url': '/brand'},
  {'type': 'Company page', 'url': '/huggingface'},
  {'type': 'Products/Services page', 'url': '/models'},
  {'type': 'Products/Services page', 'url': '/datasets'},
  {'type': 'Products/Services page', 'url': '/enterprise'},
  {'type': 'Pricing page', 'url': '/pricing'},
  {'type': 'Blog page', 'url': '/blog'},
  {'type': 'Forum page', 'url': 'https://discuss.huggingface.co'},
  {'type': 'Status page', 'url': 'https://status.huggingface.co/'},
  {'type': 'GitHub page', 'url': 'https://github.com/huggingface'},
  {'type': 'Twitter page', 'url': 'https://twitter.com/huggingface'},
  {'type': 'LinkedIn page',
   'url': 'https://www.linkedin.com/company/huggingface/'},
  {'type': 'Join/Careers page', 'url': '/join'},
  {'type': 'Careers/Jobs page',
   'url': 'https://apply.workable.com/huggingface/'}]}