In [None]:
import os
import requests
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display
from openai import OpenAI

In [20]:
#Load environment variables in a file called .env

load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")

#Check the key

if not api_key:
    print("No API key was found")
elif api_key[:8]!="sk-proj-":
    print("An API was found, but it doesn't start with sk-proj-; Please check you are using right key")
elif api_key.strip() !=api_key:
    print("An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them")
else:
    print("API key found and looks good so far!")

API key found and looks good so far!


In [21]:
openai = OpenAI()

In [22]:
# A class to represent a Webpage

class Website:
    """
    A utility class to represent a Website that we have scraped
    """
    url: str
    title: str
    test: str

    def __init__(self, url):
        """
        Create this Website object from the given url using BeautifulSoup library
        """
        self.url = url
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        self.title = soup.title.string if soup.title else "No title found"
        for irrelevant in soup.body(["script", "style", "img", "imput"]):
            irrelevant.decompose()
        self.text = soup.body.get_text(separator="\n", strip=True)

In [23]:
# Let's try one out

ed = Website("http://books.toscrape.com/")
print(ed.title)
print(ed.text)


    All products | Books to Scrape - Sandbox

Books to Scrape
We love being scraped!
Home
All products
Books
Travel
Mystery
Historical Fiction
Sequential Art
Classics
Philosophy
Romance
Womens Fiction
Fiction
Childrens
Religion
Nonfiction
Music
Default
Science Fiction
Sports and Games
Add a comment
Fantasy
New Adult
Young Adult
Science
Poetry
Paranormal
Art
Psychology
Autobiography
Parenting
Adult Fiction
Humor
Horror
History
Food and Drink
Christian Fiction
Business
Biography
Thriller
Contemporary
Spirituality
Academic
Self Help
Historical
Christian
Suspense
Short Stories
Novels
Health
Politics
Cultural
Erotica
Crime
All products
1000
results - showing
1
to
20
.
This is a demo website for web scraping purposes. Prices and ratings here were randomly assigned and have no real meaning.
A Light in the ...
£51.77
In stock
Add to basket
Tipping the Velvet
£53.74
In stock
Add to basket
Soumission
£50.10
In stock
Add to basket
Sharp Objects
£47.82
In stock
Add to basket
Sapiens: A Brief Hist

Types of Prompts

A system prompt that tells them what task they are performing and what tone they use

A user prompt the conversation starter that they should reply to 

In [24]:
# Defining System Prompt

system_prompt = """You are an assistant that analyzes the contents of a website \ 
and provides a short summary, ignoring text that might be navigating related. \ 
Respond in markdown."""

In [40]:
# Function that writes a User Prompt that asks for summaries of websites:

def user_prompt_for(website):
    user_prompt = f"You are looking at website titled {website.title}"
    user_prompt += "\n The contents of this website is as follows; \
Please provide a short summary of this website in markdown. \
If it included news or announcements, then summarize these too.\n\n"
    user_prompt += website.text
    return user_prompt

In [41]:
print(user_prompt_for(ed))

You are looking at website titled 
    All products | Books to Scrape - Sandbox

 The contents of this website is as follows; Please provide a short summary of this website in markdown. If it included news or announcements, then summarize these too.

Books to Scrape
We love being scraped!
Home
All products
Books
Travel
Mystery
Historical Fiction
Sequential Art
Classics
Philosophy
Romance
Womens Fiction
Fiction
Childrens
Religion
Nonfiction
Music
Default
Science Fiction
Sports and Games
Add a comment
Fantasy
New Adult
Young Adult
Science
Poetry
Paranormal
Art
Psychology
Autobiography
Parenting
Adult Fiction
Humor
Horror
History
Food and Drink
Christian Fiction
Business
Biography
Thriller
Contemporary
Spirituality
Academic
Self Help
Historical
Christian
Suspense
Short Stories
Novels
Health
Politics
Cultural
Erotica
Crime
All products
1000
results - showing
1
to
20
.
This is a demo website for web scraping purposes. Prices and ratings here were randomly assigned and have no real meaning.


In [27]:
# Messages

def messages_for(website):
    return [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt_for(website)}
    ]

In [28]:
messages_for(ed)

[{'role': 'system',
  'content': 'You are an assistant that analyzes the contents of a website \\ \nand provides a short summary, ignoring text that might be navigating related. \\ \nRespond in markdown.'},
 {'role': 'user',

In [36]:
# Call the OpenAI API.

def summarize(url):
    website = Website(url)
    response = openai.chat.completions.create(
        model = "gpt-4o-mini",
        messages = messages_for(website)
    )
    return response.choices[0].message.content

In [37]:
summarize("http://books.toscrape.com/")

'# Summary of "Books to Scrape - Sandbox"\n\n**Overview:**\nThe "Books to Scrape - Sandbox" website serves as a testing platform for web scraping, showcasing a variety of books across numerous genres. \n\n**Content:**\nThe site features a comprehensive list of products, categorized into genres such as:\n- Travel\n- Mystery\n- Historical Fiction\n- Sequential Art\n- Philosophy\n- Romance\n- Children\'s\n- Nonfiction\n- Science Fiction\n- Poetry\n- And many more...\n\nCurrently, there are 1000 products displayed, with pagination indicating that there are multiple pages of items to browse through. Each product includes a title, a price, and an "Add to basket" option.\n\n**Important Note:**\nAll prices and ratings on this demo site are randomly assigned and do not reflect real values, serving solely for testing purposes in web scraping.\n\nNo news or announcements are present on the site.'

In [38]:
# Functions to display this nicely in Jupyter output, using markdown

def display_summary(url):
    summary = summarize(url)
    display(Markdown(summary))

In [39]:
display_summary("http://books.toscrape.com/")

# Summary of "Books to Scrape - Sandbox"

"Books to Scrape" is a demo website designed for web scraping purposes. The site features a wide range of books across numerous genres, including:

- **Fiction**
- **Non-Fiction**
- **Fantasy**
- **Science Fiction**
- **Mystery**
- **Romance**
- **Children's Books**
- **Historical Fiction**
- **Self Help**
- **Biographies**
- **Academic**
- **Horror**
- **Poetry**

The website lists various books along with their prices and stock availability, although it is important to note that this information is randomly assigned and holds no real significance. The site displays a total of 1000 results, suggesting a robust catalog of books for demonstration purposes.

There are no news or announcements provided on the website.