<a href="https://colab.research.google.com/github/silvia-denanni/DI-Bootcamp-nov25/blob/main/W8D3MiniProjectScrapingDynamicSite.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
!pip install selenium webdriver-manager # Install selenium and webdriver-manager.
# selenium is like a robot that can control web browsers
!apt-get update # This command updates a list of other software that your virtual computer knows about

# Add Google Chrome repository and key
!wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add -
!echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list

!apt-get update # Update apt-get again after adding repository
!apt install -y google-chrome-stable # Install Google Chrome Stable browser

import selenium
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService # Import ChromeService
from webdriver_manager.chrome import ChromeDriverManager # Import ChromeDriverManager
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pprint  # To tidy up
import os
import re # Import re for regex operations

# Container for every custom setting applied to the Chrome instance at start-up.
options = webdriver.ChromeOptions()
options.add_argument('--headless')  # No visible window; required because Colab has no display
options.add_argument("--no-sandbox")  # Bypass the OS security model, which can stop Chrome from starting in a restricted environment
options.add_argument("--disable-dev-shm-usage")  # Helps Chrome in environments with limited shared memory
options.add_argument("--window-size=1920,1080")  # Default window size for the headless session
options.add_argument("--disable-gpu")  # GPU use can be unstable in headless mode

# Location of the google-chrome-stable binary on the Colab virtual machine.
options.binary_location = "/usr/bin/google-chrome"

# Start Chrome; ChromeDriverManager downloads and installs a matching chromedriver.
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)

url = "https://books.toscrape.com/"
driver.get(url)

# "Waiter" object: polls the page for a condition for up to 10 seconds.
wait = WebDriverWait(driver, 10)

# Block until the elements with the HTML class product_pod are present on the page.
books = wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, 'product_pod')))

# Snapshot the page only after the wait has succeeded. Parsing page_source
# before the wait could capture a page that has not finished loading.
soup = BeautifulSoup(driver.page_source, 'html.parser')

# Star-rating class name -> label stored in the 'score' field.
# Defined once instead of being rebuilt for every book.
score_map = {
    'One': '1 star',
    'Two': '2 stars',
    'Three': '3 stars',
    'Four': '4 stars',
    'Five': '5 stars'
}


def parse_book(book_html):
    """Turn the outer HTML of one product_pod element into a record.

    Args:
        book_html: HTML string of a single book card (may be None or empty).

    Returns:
        dict with the keys 'title', 'score', 'price' and 'stock'. Every field
        falls back to 'N/A' when the corresponding markup is missing.
    """
    book_soup = BeautifulSoup(book_html or '', 'html.parser')

    # Title: prefer the title attribute of the <a> inside <h3>, fall back to the
    # link text, then to 'N/A'. Using .get() avoids a KeyError when the
    # attribute is absent.
    title_element = book_soup.find('h3')
    link = title_element.a if title_element else None
    title = 'N/A'
    if link is not None:
        title = (link.get('title') or link.get_text()).strip() or 'N/A'

    # Score: the rating word is whichever class is not 'star-rating'
    # (e.g. ['star-rating', 'Three']), regardless of its position.
    score = 'N/A'
    score_element = book_soup.find('p', class_='star-rating')
    if score_element:
        rating_words = [c for c in score_element.get('class', []) if c != 'star-rating']
        if rating_words:
            raw_score = rating_words[0]
            score = score_map.get(raw_score, raw_score + ' star')

    # Price: kept as displayed text (e.g. '£51.77').
    price_element = book_soup.find('p', class_='price_color')
    price = price_element.get_text().strip() if price_element else 'N/A'

    # Stock: match on the single 'availability' class so the lookup does not
    # depend on the exact class string/order and also picks up the text of
    # items that are not marked 'instock'.
    stock_element = book_soup.find('p', class_='availability')
    stock_text = stock_element.get_text().strip() if stock_element else 'N/A'

    # '(N available)' -> 'N available'; plain 'In stock' -> 'Available';
    # anything else is passed through unchanged.
    match = re.search(r'\((\d+) available\)', stock_text)
    if match:
        stock = f"{match.group(1)} available"
    elif 'In stock' in stock_text:
        stock = 'Available'
    else:
        stock = stock_text

    return {
        'title': title,
        'score': score,
        'price': price,
        'stock': stock
    }


# book_element is still a Selenium WebElement here; its outerHTML is handed to
# BeautifulSoup inside parse_book.
all_books_data = [
    parse_book(book_element.get_attribute('outerHTML'))
    for book_element in books
]

pprint.pprint(all_books_data) # Print all collected book data

Hit:1 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:2 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:3 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:4 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:5 https://dl.google.com/linux/chrome/deb stable InRelease
Get:6 https://cli.github.com/packages stable InRelease [3,917 B]
Hit:7 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Hit:10 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Fetched 3,917 B in 3s (1,407 B/s)
Reading package lists... Done
W: Target Packages (main/binary-amd64/Packages) is configured multiple times in /etc/apt/sources.list.d/google-chrome.list:3 and /etc/apt/sources.list.d/google-chrome.list:4
W: Target Packages (main/binary-all/Packages) is configured multiple times in /etc/apt/sources.list

In [20]:
import pandas as pd

# One table row per scraped book; the dictionary keys become the column names.
df_books = pd.DataFrame(data=all_books_data)

# Print a header, then show the first rows as a quick sanity check.
print("\n--- Extracted Book Data DataFrame ---")
preview_rows = df_books.head()
display(preview_rows)



--- Extracted Book Data DataFrame ---


Unnamed: 0,title,score,price,stock
0,A Light in the Attic,3 stars,£51.77,Available
1,Tipping the Velvet,1 star,£53.74,Available
2,Soumission,1 star,£50.10,Available
3,Sharp Objects,4 stars,£47.82,Available
4,Sapiens: A Brief History of Humankind,5 stars,£54.23,Available


In [34]:
def clean_books(frame: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of the scraped table with typed columns.

    - score: '3 stars' -> 3 (nullable integer; <NA> when no digit is present)
    - price: '£51.77' -> 51.77 (float; NaN when not parseable)
    - stock: True when the text says 'Available' / 'N available',
      False otherwise (including 'Unavailable' and 'N/A')

    The input frame is not modified, and running the function on an already
    cleaned frame gives the same result.
    """
    cleaned = frame.copy()

    # Score: pull out the digits. The previous pattern r'\star' meant
    # "whitespace followed by 'tar'", which turned '3 stars' into '3s' and
    # therefore into NaN after coercion.
    star_counts = cleaned['score'].astype(str).str.extract(r'(\d+)', expand=False)
    cleaned['score'] = pd.to_numeric(star_counts, errors='coerce').astype('Int64')

    # Price: drop the pound sign and parse the rest as a float.
    price_text = cleaned['price'].astype(str).str.replace('£', '', regex=False)
    cleaned['price'] = pd.to_numeric(price_text, errors='coerce')

    # Stock: real booleans instead of the strings 'True'/'False'. The word
    # boundary keeps 'Unavailable' from matching; case is ignored so that
    # '22 available' counts as in stock. Skipped when already boolean.
    if not pd.api.types.is_bool_dtype(cleaned['stock']):
        stock_text = cleaned['stock'].astype(str)
        cleaned['stock'] = stock_text.str.contains(r'\bavailable\b', case=False, regex=True)

    return cleaned


df_books = clean_books(df_books)

df_books.head()

Unnamed: 0,title,score,price,stock
0,A Light in the Attic,3,51.77,True
1,Tipping the Velvet,1,53.74,True
2,Soumission,1,50.1,True
3,Sharp Objects,4,47.82,True
4,Sapiens: A Brief History of Humankind,5,54.23,True


In [35]:
# Persist the table to disk as CSV, leaving out the row index column.
csv_filename = 'books_data.csv'
df_books.to_csv(path_or_buf=csv_filename, index=False)
print("\nData saved to " + csv_filename)


Data saved to books_data.csv


In [36]:
# Shut down the browser session started earlier, then confirm on stdout.
closed_message = "Selenium WebDriver closed."
driver.quit()
print(closed_message)



Selenium WebDriver closed.
