In [12]:
!pip install requests
!pip install beautifulsoup4
!pip install pandas
!pip install selenium


Collecting selenium
  Downloading selenium-4.35.0-py3-none-any.whl.metadata (7.4 kB)
Collecting urllib3<3.0,>=2.5.0 (from urllib3[socks]<3.0,>=2.5.0->selenium)
  Downloading urllib3-2.5.0-py3-none-any.whl.metadata (6.5 kB)
Collecting trio~=0.30.0 (from selenium)
  Downloading trio-0.30.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.12.2 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting typing_extensions~=4.14.0 (from selenium)
  Downloading typing_extensions-4.14.1-py3-none-any.whl.metadata (3.0 kB)
Collecting outcome (from trio~=0.30.0->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.12.2->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.35.0-py3-none-any.whl (9.6 MB)
   ---------------------------------------- 0.0/9.6 MB ? eta -:--:--
   --- ------------------------------------ 0.8/9.6 MB 

Question 1

In [31]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Root of the paginated catalogue; page URLs are built as "<BASE_URL>page-<n>.html".
BASE_URL = "http://books.toscrape.com/catalogue/"
# Number of catalogue pages to request (pages 1..TOTAL_PAGES inclusive).
TOTAL_PAGES = 50

def get_data_from_page(page_num, timeout=30):
    """Scrape one catalogue listing page and return its books as records.

    Args:
        page_num: Page number; the URL requested is
            ``{BASE_URL}page-{page_num}.html``.
        timeout: Seconds to wait for the server before ``requests`` gives up,
            so a stalled connection cannot hang the notebook indefinitely.

    Returns:
        A list with one dict per ``<article class="product_pod">`` element,
        keyed by "Title", "Price", "Availability" and "Star Rating". A field
        whose element (or attribute) is missing holds the string 'NaN'.
        The list is empty when the response is not successful or the page
        contains no book articles.

    Raises:
        requests.exceptions.RequestException: if the request itself fails
            (connection error, timeout, ...).
    """
    current_page_url = f"{BASE_URL}page-{page_num}.html"

    response = requests.get(current_page_url, timeout=timeout)
    if not response.ok:
        # An error page has no book articles; skip parsing it.
        return []

    soup = BeautifulSoup(response.content, "html.parser")

    page_data = []
    for book in soup.find_all("article", class_="product_pod"):
        # Look the title link up step by step: chaining `book.h3.a` raises
        # AttributeError when the <h3> is absent.
        heading = book.find("h3")
        title_element = heading.find("a") if heading else None
        title = title_element.get("title", 'NaN') if title_element else 'NaN'

        price_element = book.find("p", class_="price_color")
        price = price_element.get_text(strip=True) if price_element else 'NaN'

        availability_element = book.find("p", class_="instock availability")
        availability = (
            availability_element.get_text(strip=True)
            if availability_element else 'NaN'
        )

        # The rating is read from the second CSS class of the star-rating
        # paragraph (presumably a word such as "Three" -- confirm on the site).
        rating_element = book.find("p", class_="star-rating")
        rating_classes = rating_element.get("class", []) if rating_element else []
        star_rating = rating_classes[1] if len(rating_classes) > 1 else 'NaN'

        page_data.append({
            "Title": title,
            "Price": price,
            "Availability": availability,
            "Star Rating": star_rating
        })

    return page_data

# Collect the records of every catalogue page into one flat list, then build
# the DataFrame once from that list.
all_data = []
for page in range(1, TOTAL_PAGES + 1):
    all_data.extend(get_data_from_page(page))

df = pd.DataFrame(all_data)

if not df.empty:
    # index=False keeps the auto-generated row index out of the file, matching
    # the other CSV exports in this notebook.
    df.to_csv('book_data.csv', index=False)

# Only the last top-level expression of a cell is rendered; a `df.head()`
# nested inside the `if` block would be evaluated and silently discarded.
df.head()

Question 2

In [29]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Configure a headless Chrome session that presents a desktop Chrome
# user-agent string (presumably to avoid being served a bot page -- confirm).
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")

driver = webdriver.Chrome(options=options)
movies_data = []
try:
    driver.get("https://www.imdb.com/chart/top/")

    # Wait up to 20 s for the chart list to be present before reading it.
    wait = WebDriverWait(driver, 20)
    list_container = wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "ul.ipc-metadata-list"))
    )
    movies = list_container.find_elements(By.TAG_NAME, "li")

    for movie_item in movies:
        # The heading text has the form "<rank>. <title>"; split on the first
        # ". " only, so titles that contain ". " themselves stay intact.
        title_text = movie_item.find_element(By.CSS_SELECTOR, "h3.ipc-title__text").text
        rank_str, title = title_text.split(". ", 1)

        # The first metadata item is read as the release year.
        metadata_items = movie_item.find_elements(By.CSS_SELECTOR, "span.cli-title-metadata-item")
        year_str = metadata_items[0].text

        # Keep only the first line of the rating element's text.
        rating_str = movie_item.find_element(By.CSS_SELECTOR, "span.ipc-rating-star").text.split("\n")[0]

        movies_data.append({
            "Rank": int(rank_str),
            "Movie Title": title,
            "Year of Release": int(year_str),
            "IMDB Rating": float(rating_str)
        })
finally:
    # Always shut the browser down, even when scraping raises; otherwise a
    # headless Chrome process is left running after every failed execution.
    driver.quit()

df = pd.DataFrame(movies_data)
df = df.sort_values(by="Rank").reset_index(drop=True)
df.to_csv("imdb_top250.csv", index=False, encoding='utf-8')

# Last expression of the cell, so the preview is actually rendered.
df.head()

Question 3

In [30]:
def extract_temp_as_float(temp):
    """Convert a temperature string such as "72 °F" or "-3°C" to a float.

    The "°C" / "°F" unit suffix, surrounding whitespace and (narrow)
    no-break spaces are removed, and a Unicode minus sign (U+2212) is
    normalised to ASCII "-" so that negative readings parse as well.

    Args:
        temp: Temperature text, e.g. "72\u00a0°F".

    Returns:
        The numeric part of ``temp`` as a float.

    Raises:
        ValueError: if what remains after cleaning is not a valid number.
    """
    # "\u00a0" and "\xa0" are the same character (NO-BREAK SPACE), so one
    # replace covers both spellings; "\u202f" is the narrow variant.
    value = temp.replace("\u00a0", "").replace("\u202f", "")
    value = value.replace("°C", "").replace("°F", "")
    # float() only understands the ASCII hyphen-minus as a sign.
    value = value.replace("\u2212", "-")
    return float(value.strip())

url = "https://www.timeanddate.com/weather/"
# Bound the wait and fail loudly on an HTTP error instead of silently parsing
# an error page into an empty result set.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

# Walk every table cell in document order: a cell that contains a link starts
# a new city entry, and the next cell carrying the "rbi" class supplies that
# city's temperature and completes the record.
tds = soup.find_all('td')
results = []
current_city = None
weather_condition = ""

for i, td in enumerate(tds):

    if td.find('a'):
        current_city = td.get_text(strip=True)

        # The condition is taken from the alt text of the first image found
        # in the one or two cells that follow the city cell.
        weather_condition = ""
        for offset in range(1, 3):
            if i + offset < len(tds):
                img = tds[i + offset].find('img')
                if img and img.get("alt"):
                    weather_condition = img["alt"]
                    break

    elif 'rbi' in td.get('class', []) and current_city:
        temp_str = td.get_text(strip=True)
        try:
            temp_float = extract_temp_as_float(temp_str)
        except ValueError:
            # Non-numeric temperature text: keep the row with a missing value.
            temp_float = None
        results.append({
            "City Name": current_city,
            "Temperature": temp_float,
            "Weather Condition": weather_condition
        })
        # Reset so a later "rbi" cell is not attributed to the same city.
        current_city = None
        weather_condition = ""

df = pd.DataFrame(results)
df.to_csv('weather.csv', index=False)

# Last expression of the cell, so the preview is actually rendered.
df.head()
