In [1]:
# imports

import os
import requests
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display
from openai import OpenAI

# If you get an error running this cell, then please head over to the troubleshooting notebook

## Connecting to OpenAI

In [2]:
# Load environment variables from a file called .env

load_dotenv()
api_key = os.getenv('OPENAI_API_KEY')

# Check the key.
# The whitespace check runs before the prefix check: a key with leading
# whitespace also fails startswith("sk-proj-"), so checking the prefix first
# would report the wrong problem and hide the real one (stray whitespace).

if not api_key:
    print("No API key was found - please head over to the troubleshooting notebook in this folder to identify & fix!")
elif api_key.strip() != api_key:
    print("An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook")
elif not api_key.startswith("sk-proj-"):
    print("An API key was found, but it doesn't start sk-proj-; please check you're using the right key - see troubleshooting notebook")
else:
    print("API key found and looks good so far!")


API key found and looks good so far!


In [3]:
# Create the OpenAI client that the summarization code calls via openai.chat.completions.
# No arguments are passed here; presumably the client reads OPENAI_API_KEY from
# the environment loaded above - confirm against the openai package docs.
openai = OpenAI()

## Web Scraping for JavaScript Websites with Selenium

In [9]:
!pip install selenium
!pip install undetected-chromedriver
!pip install webdriver-manager

Collecting webdriver-manager
  Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl.metadata (12 kB)
Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl (27 kB)
Installing collected packages: webdriver-manager
Successfully installed webdriver-manager-4.0.2


In [34]:
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

class WebScraper:
    """Selenium-backed page reader that returns a page's title and main text.

    Each instance owns one Chrome WebDriver session. The session must be
    released with `close()`; the class also works as a context manager
    (`with WebScraper() as scraper: ...`) so the browser is shut down even if
    scraping raises.
    """

    def __init__(self):
        # Initialize the Chrome driver; ChromeDriverManager().install() supplies
        # the chromedriver location handed to the Service.
        self.driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

    def __enter__(self):
        """Return the scraper itself so it can be used in a `with` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the browser on leaving the `with` block; never suppresses exceptions."""
        self.close()
        return False

    def _text_of_first_match(self, by, value):
        """Return the text of the first element matching the locator, or "" if unavailable.

        Only the "element is not there / went away" conditions are treated as
        a miss. Anything else (including KeyboardInterrupt) propagates instead
        of being silently swallowed.
        """
        try:
            return self.driver.find_element(by, value).text
        except (NoSuchElementException, StaleElementReferenceException):
            return ""

    def extract_content(self, url):
        """Load `url` and return `(title, main_content)`.

        Args:
            url: Address of the page to open in the browser.

        Returns:
            A tuple of the page title and the text of the first locator below
            that yields non-empty text. `main_content` is "" only when every
            locator, including the final `<body>` fallback, yields nothing.
        """
        # Open the URL
        self.driver.get(url)

        # Extract the page title
        title = self.driver.title

        # Candidate containers for the main content, most specific first.
        # <body> is the last resort so a page that uses none of the common
        # containers still yields text instead of an empty string.
        locators = (
            (By.TAG_NAME, 'main'),
            (By.CLASS_NAME, 'main-content'),
            (By.ID, 'content'),
            (By.CLASS_NAME, 'content'),
            (By.TAG_NAME, 'body'),
        )

        main_content = ""
        for by, value in locators:
            main_content = self._text_of_first_match(by, value)
            if main_content:
                break

        return title, main_content

    def close(self):
        """Quit the WebDriver session and close the browser."""
        self.driver.quit()

## Define our system prompt 

In [12]:

# System message sent with every summarization request.
system_prompt = (
    "You are an assistant that analyzes the contents of a website "
    "and provides a short summary, ignoring text that might be navigation related. "
    "Respond in markdown."
)

## A function that writes a User Prompt that asks for summaries of websites:


In [55]:

def user_prompt_for(title, text):
    """Compose the user message asking for a markdown summary of one page.

    Args:
        title: Page title, interpolated into the opening sentence.
        text: Page text, appended verbatim after the instructions.

    Returns:
        The opening sentence, the summarization instructions, and `text`,
        concatenated in that order.
    """
    opening = f"You are looking at a website titled {title}"
    instructions = (
        "\nThe contents of this website is as follows; "
        "please provide a short summary of this website in markdown. "
        "If it includes news or announcements, then summarize these too.\n\n"
    )
    return opening + instructions + text

## Build messages object for OpenAI

In [54]:
def messages_for(title, text):
    """Build the chat payload for one page: the system message, then the user message.

    Args:
        title: Page title passed through to `user_prompt_for`.
        text: Page text passed through to `user_prompt_for`.

    Returns:
        A two-element list of `{"role": ..., "content": ...}` dicts.
    """
    system_message = {"role": "system", "content": system_prompt}
    user_message = {"role": "user", "content": user_prompt_for(title, text)}
    return [system_message, user_message]

In [56]:
def summarize(url):
    """Scrape `url` with a fresh WebScraper and return the model's summary.

    Args:
        url: Address of the page to summarize.

    Returns:
        The content of the first choice in the chat completion response.
    """
    scraper = WebScraper()
    try:
        title, content = scraper.extract_content(url)
    finally:
        # Always release the browser: every call starts its own Chrome
        # session, which would otherwise stay open after the function
        # returns or raises.
        scraper.close()

    response = openai.chat.completions.create(
        model = "gpt-4o-mini",
        messages = messages_for(title, content)
    )
    return response.choices[0].message.content

In [57]:
# As the cell's last expression, the returned markdown is echoed as a raw Python string (not rendered)
summarize("https://edwarddonner.com")

'# Summary of the Website - Edward Donner\n\nThe website features various resources and announcements related to artificial intelligence (AI) and data science. Key highlights include:\n\n- **January 23, 2025**: Information on an upcoming LLM Workshop focusing on hands-on experience with AI agents.\n- **December 21, 2024**: A welcoming announcement to a group referred to as SuperDataScientists.\n- **November 13, 2024**: Resources available for mastering AI and LLM engineering.\n- **October 16, 2024**: Resources for transitioning from a software engineer to an AI data scientist.\n\nThe content emphasizes educational resources and workshops aimed at enhancing skills in AI and data science.'

In [58]:
# A function to display this nicely in the Jupyter output, using markdown

def display_summary(url):
    """Summarize `url` and render the resulting markdown in the notebook output."""
    markdown_text = summarize(url)
    display(Markdown(markdown_text))

In [60]:
# Render the summary as formatted markdown in the cell output
display_summary("https://openai.com")

# Summary of OpenAI Website

The OpenAI website features a variety of resources and announcements related to artificial intelligence and its applications. Key areas of focus include:

### Recent Announcements and News
- **Introducing the Intelligence Age** (Feb 9, 2025): Discusses a new era in AI development.
- **OpenAI and CSU System Collaboration** (Feb 4, 2025): Partnership aimed at integrating AI technology into educational environments for 500,000 students and faculty.
- **Introducing ChatGPT Gov** (Jan 28, 2025): New features and applications of ChatGPT tailored for government use.
- **Announcing The Stargate Project** (Jan 21, 2025): A new initiative likely focused on advanced AI research and applications.
- **Partnership with Axios** (Jan 15, 2025): Collaboration to enhance AI’s role in the news industry.

### Product and Research Highlights
- **Deep Research Introduction** (Feb 2, 2025): A look at advancements in reasoning capabilities.
- **ChatGPT Practices**: Users can engage in vocabulary quizzes, message ChatGPT, and explore various research projects such as building custom AI tools for education and business.

The site serves as a hub for exploring AI advancements, partnerships, ongoing projects, and updates on OpenAI's initiatives.