In [1]:
!pip -q install requests beautifulsoup4 pandas lxml

import re
import time
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup


In [2]:
BASE = "https://books.toscrape.com/"
HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

def get_star_from_classes(classes):
    # classes look like: ["star-rating","Three"]
    for c in classes:
        if c in {"One","Two","Three","Four","Five"}:
            return c
    return None

def scrape_books_all_pages(start_url=BASE):
    url = start_url
    all_rows = []

    while True:
        r = requests.get(url, headers=HEADERS, timeout=30)
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "lxml")

        for art in soup.select("article.product_pod"):
            title = art.select_one("h3 a")["title"].strip()
            price = art.select_one(".price_color").get_text(strip=True)
            availability = art.select_one(".availability").get_text(strip=True)
            star_cls = art.select_one(".star-rating")["class"]
            star = get_star_from_classes(star_cls)

            all_rows.append({
                "Title": title,
                "Price": price,
                "Availability": availability,
                "Star Rating": star
            })

        # pagination: <li class="next"><a href="catalogue/page-2.html">next</a></li>
        next_link = soup.select_one("li.next a")
        if not next_link:
            break
        url = urljoin(url, next_link.get("href"))
        # polite scraping pause
        time.sleep(0.4)

    return pd.DataFrame(all_rows)

df_books = scrape_books_all_pages(BASE)
df_books.head(), df_books.shape


(                                   Title    Price Availability Star Rating
 0                   A Light in the Attic  Â£51.77     In stock       Three
 1                     Tipping the Velvet  Â£53.74     In stock         One
 2                             Soumission  Â£50.10     In stock         One
 3                          Sharp Objects  Â£47.82     In stock        Four
 4  Sapiens: A Brief History of Humankind  Â£54.23     In stock        Five,
 (1000, 4))

In [3]:
df_books.to_csv("books.csv", index=False, encoding="utf-8")
print("Saved:", "books.csv")
df_books.sample(5).reset_index(drop=True)


Saved: books.csv


Unnamed: 0,Title,Price,Availability,Star Rating
0,A Short History of Nearly Everything,Â£52.40,In stock,Five
1,The Boys in the Boat: Nine Americans and Their...,Â£22.60,In stock,Four
2,Lumberjanes Vol. 3: A Terrible Plan (Lumberjan...,Â£19.92,In stock,Two
3,The Tipping Point: How Little Things Can Make ...,Â£10.02,In stock,Two
4,Unreasonable Hope: Finding Faith in the God Wh...,Â£46.33,In stock,Two


Q2

In [4]:
# If this apt step errors on your Colab runtime, just re-run the cell.
!apt-get -qq update
!apt-get -qq install -y chromium-browser chromium-chromedriver
!pip -q install selenium pandas

import os, pandas as pd, time, re
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

CHROME_BIN = "/usr/bin/chromium-browser"
CHROMEDRIVER_PATH = "/usr/bin/chromedriver"

def make_driver():
    from selenium.webdriver.chrome.options import Options
    options = Options()
    options.add_argument("--headless=new")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.binary_location = CHROME_BIN
    service = Service(CHROMEDRIVER_PATH)
    return webdriver.Chrome(service=service, options=options)


W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Preconfiguring packages ...
Selecting previously unselected package apparmor.
(Reading database ... 126718 files and directories currently installed.)
Preparing to unpack .../apparmor_3.0.4-2ubuntu2.4_amd64.deb ...
Unpacking apparmor (3.0.4-2ubuntu2.4) ...
Selecting previously unselected package squashfs-tools.
Preparing to unpack .../squashfs-tools_1%3a4.5-3build1_amd64.deb ...
Unpacking squashfs-tools (1:4.5-3build1) ...
Preparing to unpack .../libudev1_249.11-0ubuntu3.17_amd64.deb ...
Unpacking libudev1:amd64 (249.11-0ubuntu3.17) over (249.11-0ubuntu3.12) ...
Setting up libudev1:amd64 (249.11-0ubuntu3.17) ...
Selecting previously unselected package udev.
(Reading database ... 126918 files and directories currently installed.)
Preparing to unpack .../udev_249.11-0ubuntu3.17_amd64.deb ...
Unpacking 

In [5]:
URL = "https://www.imdb.com/chart/top/"

def maybe_accept_cookies(driver, timeout=5):
    candidates = [
        (By.XPATH, "//button[contains(., 'Accept')]"),
        (By.CSS_SELECTOR, "button[aria-label='Agree'], button[aria-label='Accept all']"),
        (By.ID, "consent-banner-accept"),
    ]
    for by, sel in candidates:
        try:
            btn = WebDriverWait(driver, timeout).until(EC.element_to_be_clickable((by, sel)))
            btn.click()
            time.sleep(0.5)
            return True
        except Exception:
            pass
    return False

def parse_top250(driver):
    rows = []

    # Strategy A: New layout (UL/IPCs)
    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "ul.ipc-metadata-list, li.ipc-metadata-list-summary-item"))
        )
        items = driver.find_elements(By.CSS_SELECTOR, "li.ipc-metadata-list-summary-item")
        if len(items) > 200:  # looks like Top 250 list
            for li in items:
                # "1. The Shawshank Redemption"
                title_text = li.find_element(By.CSS_SELECTOR, "h3.ipc-title__text").text
                # split out rank and title
                m = re.match(r"^\s*(\d+)\.\s+(.*)$", title_text)
                if m:
                    rank = int(m.group(1))
                    title = m.group(2).strip()
                else:
                    # fallback: rank might be elsewhere
                    rank = int(li.find_element(By.CSS_SELECTOR, "span.ipc-title__subtext").text.strip("#"))
                    title = li.find_element(By.CSS_SELECTOR, "a.ipc-title-link-wrapper").text.strip()

                # Year: try common patterns
                year = None
                for sel in [
                    "span.ipc-title__meta-year",
                    "span.cli-title-metadata-item",
                    "span[data-testid='title-year']",
                ]:
                    try:
                        year_text = li.find_element(By.CSS_SELECTOR, sel).text
                        year_match = re.search(r"(19|20)\d{2}", year_text)
                        if year_match:
                            year = int(year_match.group(0))
                            break
                    except Exception:
                        pass

                # Rating:
                rating = None
                for sel in [
                    "span.ipc-rating-star--rating",
                    "span[data-testid='rating-value']",
                    "span.sc-bde20123-1",  # older imdb CSS
                ]:
                    try:
                        rating = float(li.find_element(By.CSS_SELECTOR, sel).text.strip())
                        break
                    except Exception:
                        pass

                rows.append({
                    "Rank": rank,
                    "Movie Title": title,
                    "Year of Release": year,
                    "IMDB Rating": rating
                })
            if rows:
                return rows
    except Exception:
        pass

    # Strategy B: Older table layout
    try:
        tbody = WebDriverWait(driver, 8).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "table tbody"))
        )
        tr_list = tbody.find_elements(By.TAG_NAME, "tr")
        for i, tr in enumerate(tr_list, start=1):
            try:
                title_col = tr.find_element(By.CSS_SELECTOR, "td.titleColumn")
                title_full = title_col.text  # like "1. The Shawshank Redemption (1994)"
                # Rank:
                rank = int(title_full.split(".")[0].strip())
                # Title:
                title = title_col.find_element(By.TAG_NAME, "a").text.strip()
                # Year:
                year_txt = title_col.find_element(By.TAG_NAME, "span").text  # "(1994)"
                year = int(re.search(r"(19|20)\d{2}", year_txt).group(0))
                # Rating:
                rating_txt = tr.find_element(By.CSS_SELECTOR, "td.imdbRating strong").text
                rating = float(rating_txt)
                rows.append({
                    "Rank": rank,
                    "Movie Title": title,
                    "Year of Release": year,
                    "IMDB Rating": rating
                })
            except Exception:
                continue
        if rows:
            return rows
    except Exception:
        pass

    return rows

driver = make_driver()
driver.get(URL)
maybe_accept_cookies(driver)

data = parse_top250(driver)
driver.quit()

df_imdb = pd.DataFrame(data).sort_values("Rank").reset_index(drop=True)
print("Scraped rows:", len(df_imdb))
df_imdb.head(10)


WebDriverException: Message: Service /usr/bin/chromedriver unexpectedly exited. Status code was: 1


In [6]:
df_imdb.to_csv("imdb_top250.csv", index=False, encoding="utf-8")
print("Saved:", "imdb_top250.csv")
df_imdb.sample(10).sort_values("Rank").head(10).reset_index(drop=True)


NameError: name 'df_imdb' is not defined

Q3

In [7]:
import re, time, pandas as pd, requests
from bs4 import BeautifulSoup

HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}


In [8]:
URL = "https://www.timeanddate.com/weather/"

def parse_weather_table(soup):
    """Find the main world-cities weather table and return list of dicts."""
    rows = []
    # Typical table classes on timeanddate: zebra fw tb-theme or similar
    tables = soup.select("table.zebra, table.tb-wt, table.fw, table.tb-theme")
    if not tables:
        return rows

    # choose table that has 'City' / 'Weather' / 'Temp' headers
    target = None
    for t in tables:
        headers = " ".join(th.get_text(" ", strip=True).lower() for th in t.select("thead th"))
        if any(k in headers for k in ["city", "weather"]) and ("temp" in headers or "temperature" in headers):
            target = t
            break
    if target is None:
        # fallback: first big zebra table
        target = tables[0]

    for tr in target.select("tbody tr"):
        tds = tr.find_all("td")
        if len(tds) < 3:
            continue

        # Heuristics:
        # - First cell contains City link
        # - One cell contains temperature like "28 °C"
        # - One cell contains condition text like "Clear", "Cloudy", etc.
        city = None
        cond = None
        temp = None

        # City
        a = tr.find("a")
        if a:
            city = a.get_text(strip=True)

        # Temp (prefer something with ° or C/F)
        for td in tds:
            txt = td.get_text(" ", strip=True)
            if re.search(r"(-?\d+)\s*°", txt) or re.search(r"(-?\d+)\s*(?:C|F)\b", txt, re.I):
                temp = txt
                break

        # Condition (look for words like Clear/Cloudy/Rain/etc.) not containing degree symbol
        for td in tds:
            txt = td.get_text(" ", strip=True)
            if any(word in txt.lower() for word in ["clear","cloud","rain","sun","thunder","storm","snow","mist","fog","haze","drizzle","overcast"]):
                if "°" not in txt:
                    cond = txt
                    break

        if city:
            rows.append({
                "City Name": city,
                "Temperature": temp,
                "Weather Condition": cond
            })
    return rows

resp = requests.get(URL, headers=HEADERS, timeout=30)
resp.raise_for_status()
soup = BeautifulSoup(resp.text, "lxml")

rows = parse_weather_table(soup)
df_weather = pd.DataFrame(rows).drop_duplicates(subset=["City Name"]).reset_index(drop=True)

print("Cities scraped:", len(df_weather))
df_weather.head(10)


Cities scraped: 0


In [9]:
df_weather.to_csv("weather.csv", index=False, encoding="utf-8")
print("Saved:", "weather.csv")
df_weather.sample(min(10, len(df_weather))).reset_index(drop=True)


Saved: weather.csv
