In [1]:
#q1
import requests
from bs4 import BeautifulSoup
import pandas as pd

books_data = []
base_url = "https://books.toscrape.com/catalogue/page-{}.html"
page_num = 1

print("Scraping Books... (Paginating through all pages)")

while True:
    url = base_url.format(page_num)
    response = requests.get(url)

    if response.status_code != 200:
        # Check if the page exists; break when 404 is encountered (end of site)
        print(f"Finished. Last successful page: {page_num - 1}")
        break

    soup = BeautifulSoup(response.content, 'html.parser')
    books = soup.find_all('article', class_='product_pod')

    for book in books:
        # 1. Title [cite: 6]
        title = book.h3.a['title']
        
        # 2. Price [cite: 7]
        price = book.find('p', class_='price_color').text.strip().replace('£', '')
        
        # 3. Availability [cite: 8]
        availability = book.find('p', class_='instock availability').text.strip()
        
        # 4. Star Rating [cite: 9]
        rating_class = book.p['class'][1]
        rating_map = {'One': 'One', 'Two': 'Two', 'Three': 'Three', 'Four': 'Four', 'Five': 'Five'}
        star_rating = rating_map.get(rating_class, 'Unknown')
        
        books_data.append({
            'Title': title,
            'Price': price,
            'Availability': availability,
            'Star Rating': star_rating
        })

    page_num += 1

# Store in DataFrame and export to CSV [cite: 10]
df_books = pd.DataFrame(books_data)
df_books.to_csv('books.csv', index=False)

print(f"✅ Q1 Complete. Scraped {len(df_books)} books and saved to books.csv.")

Scraping Books... (Paginating through all pages)
Finished. Last successful page: 50
✅ Q1 Complete. Scraped 1000 books and saved to books.csv.


In [2]:
#q2
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
import pandas as pd

# NOTE: Set the path to your ChromeDriver or use Service for newer versions.
# This assumes ChromeDriver is in your system PATH or use a specific path/service.
# For minimal code, we assume ChromeDriver is accessible/Service is set up.
# driver = webdriver.Chrome(service=Service('/path/to/chromedriver')) 
driver = webdriver.Chrome() # Assumes driver is in PATH

url_imdb = "https://www.imdb.com/chart/top/"
driver.get(url_imdb)

# Find the table containing all movie entries
table = driver.find_element(By.CSS_SELECTOR, 'ul.ipc-metadata-list')
movies = table.find_elements(By.TAG_NAME, 'li')

imdb_data = []

for movie in movies:
    try:
        # 1. Rank & 2. Movie Title [cite: 14, 15]
        # Locate the title element which contains the rank and title
        title_element = movie.find_element(By.CSS_SELECTOR, 'h3.ipc-title__text')
        # e.g., "1. The Shawshank Redemption"
        
        # Split rank and title (split at the first '.')
        rank_title = title_element.text
        rank, title = rank_title.split('.', 1)
        rank = rank.strip()
        title = title.strip()

        # 3. Year of Release [cite: 16] (often under the title)
        # Use find_elements to handle cases where some items might not have a year or rating
        year_element = movie.find_elements(By.CSS_SELECTOR, 'span.cli-title-metadata-item:nth-child(1)')
        year = year_element[0].text if year_element else 'N/A'

        # 4. IMDB Rating [cite: 17]
        rating_element = movie.find_element(By.CSS_SELECTOR, 'span.ipc-rating-star--imdb')
        rating = rating_element.text.split('\n')[0].strip() # Takes the "8.9" part

        imdb_data.append({
            'Rank': rank,
            'Movie Title': title,
            'Year of Release': year,
            'IMDB Rating': rating
        })
    except Exception as e:
        # Skip if a list item doesn't conform to the expected structure (e.g., ads, breaks)
        continue

driver.quit()

# Store in DataFrame and export to CSV [cite: 18]
df_imdb = pd.DataFrame(imdb_data)
df_imdb.to_csv('imdb_top250.csv', index=False)

print(f"✅ Q2 Complete. Scraped {len(df_imdb)} movies and saved to imdb_top250.csv.")

✅ Q2 Complete. Scraped 0 movies and saved to imdb_top250.csv.


In [3]:
#q3
import requests
from bs4 import BeautifulSoup
import pandas as pd

url_weather = "https://www.timeanddate.com/weather/"
response = requests.get(url_weather)
soup = BeautifulSoup(response.content, 'html.parser')

# The cities and weather data are typically in a table on this page
# Find the main table for world cities (based on typical timeanddate structure)
table = soup.find('table', class_='wtt') # Use table class for stability

weather_data = []

if table:
    # Iterate over rows in the table body, skipping the header
    rows = table.find('tbody').find_all('tr')
    
    for row in rows:
        cells = row.find_all(['th', 'td'])
        
        if len(cells) >= 3: # Expect at least City, Temp, Condition
            # 1. City Name [cite: 22] (often in the first cell, sometimes a link)
            city_name = cells[0].text.strip()
            
            # 2. Temperature [cite: 23] (often in the second cell)
            temperature = cells[1].text.strip()
            
            # 3. Weather Condition [cite: 24] (often in the third cell)
            condition = cells[2].text.strip()

            weather_data.append({
                'City Name': city_name,
                'Temperature': temperature,
                'Weather Condition': condition
            })

# Store in DataFrame and export to CSV [cite: 25]
df_weather = pd.DataFrame(weather_data)
df_weather.to_csv('weather.csv', index=False)

print(f"✅ Q3 Complete. Scraped {len(df_weather)} weather records and saved to weather.csv.")

✅ Q3 Complete. Scraped 0 weather records and saved to weather.csv.
