<a href="https://colab.research.google.com/github/ronitjain235/codealpha/blob/main/task_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Task 1: Web Scraping with BeautifulSoup (Google Colab Version)

import requests
from bs4 import BeautifulSoup
import pandas as pd

# Paginated catalogue URL template; "{}" is replaced by the page number
base_url = "https://books.toscrape.com/catalogue/page-{}.html"

# One accumulator list per output column (four independent, empty lists)
book_titles, book_prices, book_availability, book_ratings = [], [], [], []

# Function to convert rating text into numbers
def get_rating(rating_class):
    """Translate a star-rating class value into its numeric rating.

    Args:
        rating_class: Container or string tested with ``in`` against the
            words "One" through "Five" (the scraping loop passes the
            ``class`` attribute of the star-rating ``<p>`` tag).

    Returns:
        The integer 1-5 for the first rating word, checked in ascending
        order, that is found in ``rating_class``; ``None`` if no word matches.
    """
    rating_words = ("One", "Two", "Three", "Four", "Five")
    # Position in the tuple (1-based) is the numeric rating for that word.
    matches = (
        score
        for score, word in enumerate(rating_words, start=1)
        if word in rating_class
    )
    return next(matches, None)

# Loop through all 50 catalogue pages and append every book's fields to the
# module-level lists. A single Session reuses the HTTP connection across pages.
with requests.Session() as session:
    for page in range(1, 51):
        print(f"🔄 Scraping page {page} ...")

        # The timeout keeps a stalled connection from hanging the cell forever.
        response = session.get(base_url.format(page), timeout=10)
        if response.status_code != 200:
            print(f"❌ Page {page} not found. Stopping...")
            break

        # Parse the raw bytes so BeautifulSoup detects the encoding from the
        # document itself. response.text uses the encoding requests guesses
        # from the HTTP headers, which can mis-decode non-ASCII text (the
        # stray "Â" before "£" that the price cleanup below strips).
        soup = BeautifulSoup(response.content, "html.parser")
        books = soup.find_all("article", class_="product_pod")

        for book in books:
            title = book.h3.a["title"]
            price = book.find("p", class_="price_color").text.strip()
            availability = book.find("p", class_="instock availability").text.strip()
            rating_class = book.find("p", class_="star-rating")["class"]

            # Clean price (remove £ and convert to float); the "Â" removal is
            # kept as a harmless safeguard against mis-decoded input.
            price_clean = float(price.replace("£", "").replace("Â", ""))

            # Get rating number (None if no rating word is present)
            rating = get_rating(rating_class)

            book_titles.append(title)
            book_prices.append(price_clean)
            book_availability.append(availability)
            book_ratings.append(rating)

# Build the results table by pairing each column header with its scraped list
df = pd.DataFrame(
    dict(
        zip(
            ["Title", "Price (£)", "Availability", "Rating (1-5)"],
            [book_titles, book_prices, book_availability, book_ratings],
        )
    )
)

# Persist the table as UTF-8 CSV, leaving out the DataFrame index column
df.to_csv("scraped_books_full.csv", encoding="utf-8", index=False)

# Report completion and how many rows were collected
print("✅ Scraping completed. Data saved to scraped_books_full.csv")
print(f"📊 Total books scraped: {len(df)}")


🔄 Scraping page 1 ...
🔄 Scraping page 2 ...
🔄 Scraping page 3 ...
🔄 Scraping page 4 ...
🔄 Scraping page 5 ...
🔄 Scraping page 6 ...
🔄 Scraping page 7 ...
🔄 Scraping page 8 ...
🔄 Scraping page 9 ...
🔄 Scraping page 10 ...
🔄 Scraping page 11 ...
🔄 Scraping page 12 ...
🔄 Scraping page 13 ...
🔄 Scraping page 14 ...
🔄 Scraping page 15 ...
🔄 Scraping page 16 ...
🔄 Scraping page 17 ...
🔄 Scraping page 18 ...
🔄 Scraping page 19 ...
🔄 Scraping page 20 ...
🔄 Scraping page 21 ...
🔄 Scraping page 22 ...
🔄 Scraping page 23 ...
🔄 Scraping page 24 ...
🔄 Scraping page 25 ...
🔄 Scraping page 26 ...
🔄 Scraping page 27 ...
🔄 Scraping page 28 ...
🔄 Scraping page 29 ...
🔄 Scraping page 30 ...
🔄 Scraping page 31 ...
🔄 Scraping page 32 ...
🔄 Scraping page 33 ...
🔄 Scraping page 34 ...
🔄 Scraping page 35 ...
🔄 Scraping page 36 ...
🔄 Scraping page 37 ...
🔄 Scraping page 38 ...
🔄 Scraping page 39 ...
🔄 Scraping page 40 ...
🔄 Scraping page 41 ...
🔄 Scraping page 42 ...
🔄 Scraping page 43 ...
🔄 Scraping page 44 .

In [2]:
# `google.colab` is provided by the Colab runtime; this cell is Colab-specific.
from google.colab import files
# Send the CSV written by the scraping cell to the user's browser as a download.
files.download("scraped_books_full.csv")



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>