In [21]:
#q1
!pip install requests beautifulsoup4 pandas

import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_books():
    base_url = "http://books.toscrape.com/catalogue/"
    current_page_url = base_url + "page-1.html"
    all_books_data = []

    while current_page_url:
        print(f"Scraping: {current_page_url}")
        try:
            response = requests.get(current_page_url)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")

            books = soup.find_all("article", class_="product_pod")

            for book in books:
                title = book.h3.a["title"]
                price = book.find("p", class_="price_color").text
                availability = book.find("p", class_="instock availability").text.strip()


                rating = book.find("p", class_="star-rating")["class"][1]

                all_books_data.append({
                    "Title": title,
                    "Price": price,
                    "Availability": availability,
                    "Star Rating": rating
                })

            next_button = soup.find("li", class_="next")
            if next_button:
                next_page_relative_url = next_button.a["href"]
                current_page_url = base_url + next_page_relative_url
            else:
                current_page_url = None
        except requests.exceptions.RequestException as e:
            print(f"An error occurred: {e}")
            break

    return all_books_data

books_data = scrape_books()

if books_data:
    df = pd.DataFrame(books_data)
    df.to_csv("books.csv", index=False)
    print("\nScraping complete! Data saved to books.csv.")
    print(df.head())
else:
    print("No data was scraped.")

Scraping: http://books.toscrape.com/catalogue/page-1.html
Scraping: http://books.toscrape.com/catalogue/page-2.html
Scraping: http://books.toscrape.com/catalogue/page-3.html
Scraping: http://books.toscrape.com/catalogue/page-4.html
Scraping: http://books.toscrape.com/catalogue/page-5.html
Scraping: http://books.toscrape.com/catalogue/page-6.html
Scraping: http://books.toscrape.com/catalogue/page-7.html
Scraping: http://books.toscrape.com/catalogue/page-8.html
Scraping: http://books.toscrape.com/catalogue/page-9.html
Scraping: http://books.toscrape.com/catalogue/page-10.html
Scraping: http://books.toscrape.com/catalogue/page-11.html
Scraping: http://books.toscrape.com/catalogue/page-12.html
Scraping: http://books.toscrape.com/catalogue/page-13.html
Scraping: http://books.toscrape.com/catalogue/page-14.html
Scraping: http://books.toscrape.com/catalogue/page-15.html
Scraping: http://books.toscrape.com/catalogue/page-16.html
Scraping: http://books.toscrape.com/catalogue/page-17.html
Scrapi

In [20]:
#q2

!pip install selenium
!apt-get update
!apt install chromium-chromedriver

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import pandas as pd

def setup_driver():
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')

    options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36')
    return webdriver.Chrome(options=options)

def scrape_imdb_robust():
    url = "https://www.imdb.com/chart/top/"
    driver = setup_driver()
    all_movies_data = []

    try:
        print("Fetching IMDB Top 250 page...")
        driver.get(url)


        wait = WebDriverWait(driver, 20)
        movie_elements = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div.ipc-metadata-list-summary-item__c")))

        print(f"Successfully located {len(movie_elements)} movie containers. Starting scrape...")

        for element in movie_elements:
            try:
                title_text = element.find_element(By.CSS_SELECTOR, "h3.ipc-title__text").text
                rank_str, title = title_text.split('. ', 1)

                metadata_items = element.find_elements(By.CSS_SELECTOR, "span.cli-title-metadata-item")
                year = metadata_items[0].text if metadata_items else "N/A"

                rating_span = element.find_element(By.CSS_SELECTOR, "span.ipc-rating-star")
                rating = rating_span.text.split()[0]

                all_movies_data.append({
                    "Rank": int(rank_str),
                    "Movie Title": title,
                    "Year of Release": year,
                    "IMDB Rating": float(rating)
                })
            except (NoSuchElementException, IndexError, ValueError):
                continue

    except TimeoutException:
        print("Scraping failed: The movie list did not load within the 20-second time limit.")
        print("The website structure has likely changed, or anti-scraping measures are blocking the script.")
    except Exception as e:
        print(f"A critical error occurred: {e}")
    finally:
        driver.quit()

    return all_movies_data

imdb_data = scrape_imdb_robust()

if imdb_data:
    df = pd.DataFrame(imdb_data)
    df.to_csv("imdb_top250.csv", index=False)
    print(f"\nScraping complete! Found {len(imdb_data)} movies. Data saved to imdb_top250.csv.")
    print(df.head())
else:
    print("\nNo data was scraped from IMDB.")

Hit:1 http://security.ubuntu.com/ubuntu jammy-security InRelease
Get:2 https://cli.github.com/packages stable InRelease [3,917 B]
Hit:3 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:6 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:8 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:11 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Fetched 3,917 B in 1s (2,870 B/s)
Reading package lists... Done
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (

In [1]:
#q3
import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_weather():
    url = "https://www.timeanddate.com/weather/"
    response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")

    all_weather_data = []


    table = soup.find("table", class_="zebra fw tb-theme")
    rows = table.find_all("tr")

    for row in rows[1:]:
        cols = row.find_all("td")
        if len(cols) >= 3:
            city = cols[0].text.strip()
            temperature = cols[1].text.strip()
            condition = cols[2].text.strip()

            all_weather_data.append({
                "City": city,
                "Temperature": temperature,
                "Condition": condition
            })

    return all_weather_data



weather_data = scrape_weather()

if weather_data:
    df = pd.DataFrame(weather_data)
    df.to_csv("weather.csv", index=False)
    print("✅ Scraping complete! Data saved to weather.csv")
    print(df.head())
else:
    print("⚠️ No data was scraped from Time and Date.")


✅ Scraping complete! Data saved to weather.csv
          City  Temperature Condition
0        Accra  Wed 3:45 pm          
1  Addis Ababa  Wed 6:45 pm          
2     Adelaide  Thu 1:15 am          
3      Algiers  Wed 4:45 pm          
4       Almaty  Wed 8:45 pm          
