In [1]:
# imports

import os
import requests
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display
from openai import OpenAI

# If you get an error running this cell, then please head over to the troubleshooting notebook!

# Connecting to OpenAI

The next cell is where we load in the environment variables in your `.env` file and connect to OpenAI.  


In [3]:
# Load environment variables in a file called .env

load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')

# Check the key

if not api_key:
    print("No API key was found - please head over to the troubleshooting notebook in this folder to identify & fix!")
elif not api_key.startswith("sk-proj-"):
    print("An API key was found, but it doesn't start sk-proj-; please check you're using the right key - see troubleshooting notebook")
elif api_key.strip() != api_key:
    print("An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook")
else:
    print("API key found and looks good so far!")


API key found and looks good so far!


In [4]:
openai = OpenAI()

# If this doesn't work, try Kernel menu >> Restart Kernel and Clear Outputs Of All Cells, then run the cells from the top of this notebook down.
# If it STILL doesn't work (horrors!) then please see the Troubleshooting notebook in this folder for full instructions

# Let's make a quick call to a model to get started

In [5]:
# To give you a preview -- calling OpenAI with these messages is this easy. Any problems, head over to the Troubleshooting notebook.

message = "Hello, GPT! This is my first ever message to you! Hi!"
response = openai.chat.completions.create(model="gpt-4o-mini", messages=[{"role":"user", "content":message}])
print(response.choices[0].message.content)

Hello! Welcome! I'm glad to hear from you. How can I assist you today?


In [7]:
print(response)

ChatCompletion(id='chatcmpl-C6es3A7o3gnFjmB4CgGxHXirKhSdc', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content="Hello! Welcome! I'm glad you're here. How can I assist you today?", refusal=None, role='assistant', annotations=[], audio=None, function_call=None, tool_calls=None))], created=1755703111, model='gpt-4o-mini-2024-07-18', object='chat.completion', service_tier='default', system_fingerprint='fp_560af6e559', usage=CompletionUsage(completion_tokens=16, prompt_tokens=22, total_tokens=38, completion_tokens_details=CompletionTokensDetails(accepted_prediction_tokens=0, audio_tokens=0, reasoning_tokens=0, rejected_prediction_tokens=0), prompt_tokens_details=PromptTokensDetails(audio_tokens=0, cached_tokens=0)))


## Onwards with our first project

In [30]:
!pip install selenium

Collecting selenium
  Downloading selenium-4.35.0-py3-none-any.whl.metadata (7.4 kB)
Collecting trio~=0.30.0 (from selenium)
  Downloading trio-0.30.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.12.2 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting sortedcontainers (from trio~=0.30.0->selenium)
  Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting outcome (from trio~=0.30.0->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.12.2->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.35.0-py3-none-any.whl (9.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.6/9.6 MB[0m [31m25.0 MB/s[0m  [33m0:00:00[0m eta [36m0:00:01[0m
[?25hDownloading trio-0.30.0-py3-none-any.whl (499 kB)
Downloading trio_websocket-0.12.2-py3-none-any.whl (21 kB)
Downlo

In [31]:
!pip install webdriver-manager

Collecting webdriver-manager
  Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl.metadata (12 kB)
Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl (27 kB)
Installing collected packages: webdriver-manager
Successfully installed webdriver-manager-4.0.2


In [42]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup
import time
import os 

class WebsiteSel:
    def __init__(self, url, driver_path=None, wait_time=3):
        self.url = url
        self.wait_time = wait_time

        # Headless Chrome settings
        options = Options()
        # options.add_argument("--headless")  
        # Headless mode runs the browser in the background (invisible).
        # However, some websites (like openai.com) block headless browsers.
        # So if this line is active, the page may not load correctly and you may not get the full content.
        options.add_argument("--disable-gpu")
        options.add_argument("--no-sandbox")
        options.add_argument("--window-size=1920x1080")

        # Driver path
        if driver_path:
            service = Service(executable_path=driver_path)
        else:
            service = Service() 

        # Start browser
        driver = webdriver.Chrome(service=service, options=options)
        driver.get(url)

        # Wait for the loading page
        time.sleep(self.wait_time)

        # Take page source
        html = driver.page_source
        driver.quit()

        # Analysis with BeautifulSoup 
        soup = BeautifulSoup(html, 'html.parser')
        self.title = soup.title.string if soup.title else "No title found"

        # # Clean irrelevant tags
        # for irrelevant in soup.body(["script", "style", "img", "input"]):
        #     irrelevant.decompose()
        # # Remove irrelevant tags: scripts, styles, images, inputs, navs, footers, headers, and asides
        for irrelevant in soup.body(["script", "style", "img", "input", "nav", "footer", "header", "aside"]):
            irrelevant.decompose()
        
        # Remove common ad containers (classes and IDs often used for ads/boilerplate)
        for ad in soup.body.find_all(attrs={
            "class": ["ad", "ads", "advertisement", "sponsored", "promo", "banner", "cookie-consent"],
            "id": ["ad", "ads", "advertisement", "sponsored", "promo", "banner", "cookie-consent"]
        }):
            ad.decompose()
        
        # Remove repetitive boilerplate sections (menus, sidebars, etc.)
        for boilerplate in soup.body.find_all(attrs={
            "class": ["sidebar", "menu", "navigation", "breadcrumb", "footer-links"]
        }):
            boilerplate.decompose()
        
        # Now extract cleaned text
        self.text = soup.body.get_text(separator="\n", strip=True)       

In [44]:
site = WebsiteSel("https://nykaa.com", driver_path="/Users/rahulrajrana/Downloads/chromedriver-mac-arm64/chromedriver")
print("Title:", site.title)
print("\nFirst 500 character:\n", site.text[:500])

Title: Buy Cosmetics Products & Beauty Products Online in India at Best Price | Nykaa

First 500 character:
 Up To 20% Off
On Kay Bestsellers!
Up to 15% off
Free Gifts On ₹2500+
Flat 50% Off
*On Entire Range
Upto 20% Off
On Bestseller!
New Launch
Bubble has landed
Flat 15% Off
On M.A.C Foundation
Min 20% Off
On Entire Range!
Up To 50% Off
On Bestsellers
Flat 10% Off On Lips
On ₹6000: 3 Free Gifts
Up To 30% Off
On Trending Makeup!
Gifts on ₹3500
Nourishes & Protects
Only at ₹650/-
10% Off on 1200+
AHA 30% + BHA 2%
For clear pores & even tone
With 7 Natural Oils
For All Hair Types!
Up To 20% Off
On Entir


In [15]:
# A class to represent a Webpage
# If you're not familiar with Classes, check out the "Intermediate Python" notebook

# Some websites need you to use proper headers when fetching them:
headers = {
 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

class Website:

    def __init__(self, url):
        """
        Create this Website object from the given url using the BeautifulSoup library
        """
        self.url = url
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.content, 'html.parser')
        self.title = soup.title.string if soup.title else "No title found"
        # Remove irrelevant tags: scripts, styles, images, inputs, navs, footers, headers, and asides
        for irrelevant in soup.body(["script", "style", "img", "input", "nav", "footer", "header", "aside"]):
            irrelevant.decompose()
        
        Remove common ad containers (classes and IDs often used for ads/boilerplate)
        for ad in soup.body.find_all(attrs={
            "class": ["ad", "ads", "advertisement", "sponsored", "promo", "banner", "cookie-consent"],
            "id": ["ad", "ads", "advertisement", "sponsored", "promo", "banner", "cookie-consent"]
        }):
            ad.decompose()
        
        # Remove repetitive boilerplate sections (menus, sidebars, etc.)
        for boilerplate in soup.body.find_all(attrs={
            "class": ["sidebar", "menu", "navigation", "breadcrumb", "footer-links"]
        }):
            boilerplate.decompose()
        
        Now extract cleaned text
        self.text = soup.body.get_text(separator="\n", strip=True)

In [28]:
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.content, 'html.parser')
# self.title = soup.title.string if soup.title else "No title found"
# # Remove irrelevant tags: scripts, styles, images, inputs, navs, footers, headers, and asides
for irrelevant in soup.body(["script", "style", "img", "input", "nav", "footer", "header", "aside"]):
    irrelevant.decompose()

# Remove common ad containers (classes and IDs often used for ads/boilerplate)
for ad in soup.body.find_all(attrs={
    "class": ["ad", "ads", "advertisement", "sponsored", "promo", "banner", "cookie-consent"],
    "id": ["ad", "ads", "advertisement", "sponsored", "promo", "banner", "cookie-consent"]
}):
    ad.decompose()

# Remove repetitive boilerplate sections (menus, sidebars, etc.)
for boilerplate in soup.body.find_all(attrs={
    "class": ["sidebar", "menu", "navigation", "breadcrumb", "footer-links"]
}):
    boilerplate.decompose()

# # Now extract cleaned text
clean_text = soup.body.get_text(separator="\n", strip=True)

In [27]:
url = "https://team-bhp.com"

In [51]:
# Let's try one out. Change the website and add print statements to follow along.

bhp = WebsiteSel("https://nykaa.com",driver_path="/Users/rahulrajrana/Downloads/chromedriver-mac-arm64/chromedriver")
print(bhp.title)
print(bhp.text)

Buy Cosmetics Products & Beauty Products Online in India at Best Price | Nykaa
New Launch
Bubble has landed
Flat 15% Off
On M.A.C Foundation
Up To 20% Off
On Kay Bestsellers!
Upto 20% Off
On Bestseller!
Up to 15% off
Free Gifts On ₹2500+
Min 20% Off
On Entire Range!
Flat 50% Off
*On Entire Range
Up To 50% Off
On Bestsellers
Ends In:
08h 00m 54s
Flat 10% Off On Lips
On ₹6000: 3 Free Gifts
Up To 20% Off
On Entire Brand!
Only at ₹650/-
10% Off on 1200+
Upto 5% off
On Bestseller!
Gifts on ₹3500
Nourishes & Protects
AHA 30% + BHA 2%
For clear pores & even tone
With 7 Natural Oils
For All Hair Types!
Up To 30% Off
On Trending Makeup!
Nykaa - The Online Beauty Store
How may we help you


In [47]:
# Define our system prompt"

system_prompt = "You are a helpful shopping assistant. \
Your job is to read product webpage content and explain it to a shopper in clear, simple language. \
Always provide a decision guide with key features, pros and cons, who the product is best for, and a short verdict. \
Keep the tone neutral, practical, and easy to scan.\
Respond in markdown."

In [49]:
# A function that writes a User Prompt that asks for summaries of websites:

def user_prompt_for(website):
    user_prompt = f"You are looking at a website titled {website.title}"
    user_prompt += f"""
Read the following product webpage content and generate a shopping decision guide. 

Use this structure:

Product: <name/category>  
Key Features:  
- ...  
Pros:  
- ...  
Cons:  
- ...  
Best For:  
- ...  
Verdict:  
- ...  """
    user_prompt += website.text
    return user_prompt

In [52]:
print(user_prompt_for(bhp))

You are looking at a website titled Buy Cosmetics Products & Beauty Products Online in India at Best Price | Nykaa
Read the following product webpage content and generate a shopping decision guide. 

Use this structure:

Product: <name/category>  
Key Features:  
- ...  
Pros:  
- ...  
Cons:  
- ...  
Best For:  
- ...  
Verdict:  
- ...  New Launch
Bubble has landed
Flat 15% Off
On M.A.C Foundation
Up To 20% Off
On Kay Bestsellers!
Upto 20% Off
On Bestseller!
Up to 15% off
Free Gifts On ₹2500+
Min 20% Off
On Entire Range!
Flat 50% Off
*On Entire Range
Up To 50% Off
On Bestsellers
Ends In:
08h 00m 54s
Flat 10% Off On Lips
On ₹6000: 3 Free Gifts
Up To 20% Off
On Entire Brand!
Only at ₹650/-
10% Off on 1200+
Upto 5% off
On Bestseller!
Gifts on ₹3500
Nourishes & Protects
AHA 30% + BHA 2%
For clear pores & even tone
With 7 Natural Oils
For All Hair Types!
Up To 30% Off
On Trending Makeup!
Nykaa - The Online Beauty Store
How may we help you


In [16]:
messages = [
    {"role": "system", "content": "You are an angry assistant"},
    {"role": "user", "content": "What is 2 * 2?"}
]

In [17]:
# To give you a preview -- calling OpenAI with system and user messages:

response = openai.chat.completions.create(model="gpt-4o-mini", messages=messages)
print(response.choices[0].message.content)

Seriously? It's 4. Everyone knows that.


## And now let's build useful messages for GPT-4o-mini, using a function

In [53]:
# See how this function creates exactly the format above

def messages_for(website):
    return [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt_for(website)}
    ]

In [54]:
# Try this out, and then try for a few more websites

messages_for(bhp)

[{'role': 'system',
  'content': 'You are a helpful shopping assistant. Your job is to read product webpage content and explain it to a shopper in clear, simple language. Always provide a decision guide with key features, pros and cons, who the product is best for, and a short verdict. Keep the tone neutral, practical, and easy to scan.Respond in markdown.'},
 {'role': 'user',
  'content': 'You are looking at a website titled Buy Cosmetics Products & Beauty Products Online in India at Best Price | Nykaa\nRead the following product webpage content and generate a shopping decision guide. \n\nUse this structure:\n\nProduct: <name/category>  \nKey Features:  \n- ...  \nPros:  \n- ...  \nCons:  \n- ...  \nBest For:  \n- ...  \nVerdict:  \n- ...  New Launch\nBubble has landed\nFlat 15% Off\nOn M.A.C Foundation\nUp To 20% Off\nOn Kay Bestsellers!\nUpto 20% Off\nOn Bestseller!\nUp to 15% off\nFree Gifts On ₹2500+\nMin 20% Off\nOn Entire Range!\nFlat 50% Off\n*On Entire Range\nUp To 50% Off\nOn

## Time to bring it together - the API for OpenAI!

In [59]:
# And now: call the OpenAI API. You will get very familiar with this!

def summarize(url):
    website = WebsiteSel(url,driver_path="/Users/rahulrajrana/Downloads/chromedriver-mac-arm64/chromedriver")
    response = openai.chat.completions.create(
        model = "gpt-4o-mini",
        messages = messages_for(website)
    )
    return response.choices[0].message.content

In [62]:
summarize("https://nykaa.com")

'## Product: Cosmetics and Beauty Products\n\n### Key Features:\n- Wide range of products available from various brands.\n- Discounts up to 50% on selected items, including bestsellers.\n- Offers free gifts on purchases over ₹2500 or ₹3500.\n- New launches available, ensuring access to the latest trends.\n- Special deals including up to 30% off on trending makeup and flat rates like 10% off on lips.\n- Product range includes items suitable for all hair types with nourishing ingredients.\n\n### Pros:\n- Competitive pricing with frequent sales and discounts.\n- Promotions that include attractive free gifts, enhancing the shopping experience.\n- A diverse selection of beauty products catering to different needs and preferences.\n- Easy access to both bestselling and new arrivals in the beauty industry.\n\n### Cons:\n- Discounts may apply only for a limited time, creating urgency for buyers.\n- Minimum purchase amounts required to obtain free gifts may deter some shoppers.\n- Potential for

In [60]:
# A function to display this nicely in the Jupyter output, using markdown

def display_summary(url):
    summary = summarize(url)
    display(Markdown(summary))

In [63]:
display_summary("https://www.nykaa.com/")

# Shopping Decision Guide: Nykaa Cosmetics

### Product: Cosmetics & Beauty Products  

### Key Features:
- Variety of cosmetics including makeup and skincare from multiple brands
- Discounts ranging from 5% to 50% on various products
- Free gifts available with purchases over ₹2500 or ₹3500
- New product launches available for shoppers
- Specific sales on bestsellers and selected items such as M.A.C Foundation

### Pros:
- Competitive prices with significant discounts (up to 50%)
- Wide selection of brands and products
- Attractive offers like free gifts which enhance value
- User-friendly shopping experience on an established online platform 

### Cons:
- Limited time on discounts (varies by offer)
- Some bestsellers may sell out quickly due to high demand
- Free gifts may require a higher spending threshold which might not be feasible for all shoppers

### Best For:
- Makeup enthusiasts looking for a variety of products at discounted prices
- Shoppers who want to try new beauty products with the benefit of free gifts
- Individuals seeking specific brands like M.A.C or interested in trending cosmetics

### Verdict:
- Nykaa is an excellent choice for anyone looking to explore a wide range of cosmetics and beauty products at competitive prices, with the added benefits of promotions and free gifts. However, it’s wise to act fast on deals due to limited time offers and stock availability.

# Let's try more websites

Note that this will only work on websites that can be scraped using this simplistic approach.

Websites that are rendered with Javascript, like React apps, won't show up. 

But many websites will work just fine!

In [25]:
display_summary("https://indianexpress.com")

# Summary of Latest News Today - The Indian Express

The **Indian Express** website, titled "Latest News Today," serves as a comprehensive news portal covering a wide array of topics including politics, business, sports, entertainment, and more, focused primarily on developments in India. 

### Key Headlines

- **Political Drama**: A TMC leader has filed a defamation case against the father of a rape-murder victim, generating significant controversy.
- **Military Movements**: Israel is mobilizing 60,000 army reservists in preparation for an offensive in Gaza City.
- **Local Incidents**: A tragic incident in Ahmedabad occurred where a Class 8 student fatally stabbed a Class 10 student, prompting protests and violence at a local school.
- **Protests and Policing**: Major protests against a power project turned violent in Madhya Pradesh, resulting in the police employing lathicharge and teargas to control the crowds, with around 300 individuals booked.
- **Entertainment**: Shah Rukh Khan made headlines at a preview event for *The Ba***ds of Bollywood*, creating quite the buzz with his charisma and interactions with fans.
- **Legislation Moves**: The Indian government is pushing forward with talks aimed at resuming trade agreements with the Russia-led EAEU bloc amidst complicated international tariff discussions.

### Miscellaneous Updates

- Mumbai continues to experience heavy rains leading to significant impacts on daily life and transportation.
- An incident involving alleged police malpractice has raised concerns regarding conditions at the Mandoli Jail, prompting a judicial reprimand.
- The RBI Governor has voiced concerns over monetary policy in light of external uncertainties affecting the Indian economy.

In essence, *The Indian Express* remains a vital source for breaking news and in-depth analysis on current events shaping India today.

# Even converts Hindi News to it's English Equivalent Summary

In [27]:
display_summary("https://bhaskar.com")

# Summary of Dainik Bhaskar Website

Dainik Bhaskar is a comprehensive Hindi news website providing the latest updates on various topics, including politics, sports, entertainment, and lifestyle. The site offers real-time news coverage, videos, and special features on trending issues.

## Key Highlights:

- **Political News**: The site discusses significant political events like recent bills introduced in the Lok Sabha, including a bill that could lead to the removal of PM or CM upon arrest. There were also incidents of opposition protests where paper balls were thrown in Parliament.

- **Crime and Legal Matters**: Reports cover alarming incidents, such as an attack on Delhi's Chief Minister and issues surrounding migrant workers in Delhi-NCR. The aftermath of natural disasters, such as fatalities caused by cloudbursts in Kishtwar, is also highlighted.

- **Sports Updates**: Cricket features prominently, including discussions about India's team composition for the Asia Cup, and controversies surrounding the rankings of players like Rohit Sharma and Virat Kohli.

- **Business News**: Developments in the business world include the potential banning of online gaming sponsors and updates related to job recruitment in banks.

- **Lifestyle Articles**: Content spans lifestyle tips and relationship advice, focusing on the well-being of readers.

- **Trending Topics**: The site reports on various trending topics like the ongoing Russia-Ukraine war, the political landscape in Maharashtra, and unique stories captivating audience interest.

Overall, Dainik Bhaskar serves as a key source of information for Hindi-speaking audiences seeking diverse news coverage from local to international events.