Amazon Web Scraping Using Python

In [None]:
# import libraries 

from bs4 import BeautifulSoup
import requests
import time
import datetime

import smtplib

In [10]:


# Fetch one Amazon page and extract the headline product fields
# (title, MRP, discount, selling price, rating). Every field falls back to a
# readable placeholder string when its tag is missing from the page.
#
# NOTE(review): the selectors below (id="productTitle", "savingsPercentage", ...)
# look like product-detail-page selectors, while URL is a search-results link —
# confirm which page this cell is meant to scrape.
URL = 'https://www.amazon.in/s?k=car+toys&i=automotive&crid=337ZMVQ39AVLD&sprefix=car+toys+%2Cautomotive%2C542&ref=nb_sb_noss_2'
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"
}

# A timeout stops the cell from hanging forever on a stalled connection.
page = requests.get(URL, headers=headers, timeout=30)
if not page.ok:
    # Keep going (the fallbacks below still apply) but make it obvious that
    # the parsed HTML is an error page rather than the requested page.
    print(f"Warning: request returned HTTP {page.status_code}; parsed fields may be missing.")
soup = BeautifulSoup(page.content, "html.parser")

# Product Name
title_tag = soup.find(id='productTitle')
title = title_tag.get_text().strip() if title_tag else "Title not found"

# MRP / Original Price
mrp_tag = soup.find("span", class_="a-size-small aok-offscreen")
mrp_text = mrp_tag.get_text().strip() if mrp_tag else "MRP not found"

# Discount %
discount_tag = soup.find("span", class_="savingsPercentage")
discount = discount_tag.get_text().strip() if discount_tag else "No discount"

# Final selling price
price_whole_tag = soup.find("span", class_="a-price-whole")
price_symbol_tag = soup.find("span", class_="a-price-symbol")
if price_whole_tag and price_symbol_tag:
    # The whole-part text can end with the decimal separator (e.g. "888."),
    # so strip it to avoid a dangling "." in the displayed price.
    price_whole = price_whole_tag.get_text().strip().rstrip(".")
    final_price = f"{price_symbol_tag.get_text().strip()}{price_whole}"
else:
    final_price = "Price not found"

# Ratings
rating_tag = soup.find("span", class_="a-icon-alt")
rating = rating_tag.get_text().strip() if rating_tag else "No rating"

print("Product Name:", title)
print("MRP / Original Price:", mrp_text)
print("Discount %:", discount)
print("Final Price:", final_price)
print("Rating:", rating)


Product Name: Umadiya® Range Rover Defender 1:32 Scale Model Car Exclusive Alloy Metal Pull Back Die-cast Car Metal Pullback Toy car with Openable Doors & Light Music Best Gifts Toys Kids【Colors as Per Stock】
MRP / Original Price: M.R.P.: ₹3,000.00
Discount %: -70%
Final Price: ₹888.
Rating: 3.8 out of 5 stars


In [12]:
# Clean up the product name and prices: turn the scraped display strings into
# plain numeric text. Placeholder strings produced when a field was missing
# (e.g. "No rating") are passed through unchanged rather than mangled.
import re

title = title.strip()

# Final price: remove currency symbol, thousands separators, and the dangling
# decimal point left by the whole-part price tag ("₹888." -> "888")
final_price_clean = final_price.strip().replace("₹", "").replace(",", "").rstrip(".")

# MRP: prefer the off-screen accessibility text, e.g. "M.R.P.: ₹3,000.00"
mrp_tag = soup.find("span", class_="a-size-small aok-offscreen")
if mrp_tag:
    mrp_text = mrp_tag.get_text().strip()
else:
    # fallback: visible strike-through price
    mrp_span = soup.find("span", class_="a-price a-text-price")
    mrp_text = mrp_span.get_text().strip() if mrp_span else "MRP not found"

# Extract the first numeric token and drop thousands separators ("3,000.00" -> "3000.00")
mrp_matches = re.findall(r'\d[\d,]*\.?\d*', mrp_text)
mrp_clean = mrp_matches[0].replace(",", "") if mrp_matches else "MRP not found"

# Discount: remove % and the leading minus sign to keep the number only
discount_clean = discount.replace("%", "").replace("-", "").strip()

# Rating: keep only the leading number ("3.8 out of 5 stars" -> "3.8").
# If there is no leading number (e.g. the "No rating" placeholder or empty
# text), keep the original string instead of truncating it or raising.
rating_match = re.match(r'\d+(?:\.\d+)?', rating.strip())
rating_clean = rating_match.group() if rating_match else rating

print("Product Name:", title)
print("Final Price:", final_price_clean)
print("MRP:", mrp_clean)
print("Discount %:", discount_clean)
print("Rating:", rating_clean)


MRP: 3000.00
Product Name: Umadiya® Range Rover Defender 1:32 Scale Model Car Exclusive Alloy Metal Pull Back Die-cast Car Metal Pullback Toy car with Openable Doors & Light Music Best Gifts Toys Kids【Colors as Per Stock】
Final Price: 888.
MRP: 3000.00
Discount %: 70
Rating: 3.8


In [13]:
# Capture the current calendar date so every saved record can be tagged with
# the day on which it was collected.
import datetime

today = datetime.datetime.now().date()
print(today.isoformat())

2025-08-22


In [15]:
# Append the cleaned single-product record to AmazonWeb.csv, stamped with the
# collection date. Relies on title, final_price_clean, mrp_clean,
# discount_clean and rating_clean already being defined by the cleaning cell.
import csv
import datetime  # import the module (not the class) so `datetime.date` keeps working in other cells

# Cleaned data
title = title.strip()
price_clean = final_price_clean
today = datetime.date.today().strftime('%Y-%m-%d')  # e.g., 2025-08-22

# Header and data (column order must match)
header = ['Title', 'Final Price', 'MRP', 'Discount %', 'Rating', 'Date']
data = [title, price_clean, mrp_clean, discount_clean, rating_clean, today]

# Write to CSV (append mode so you don't overwrite previous entries)
file_path = 'AmazonWeb.csv'
with open(file_path, 'a', newline='', encoding='UTF8') as f:
    writer = csv.writer(f)

    # Write header only if file is empty (position 0 right after opening)
    if f.tell() == 0:
        writer.writerow(header)

    writer.writerow(data)

print("Data saved to CSV successfully!")


Data saved to CSV successfully!


In [7]:

# Multiple Products Scraping
#
# Walk the paginated search results, build one row per product card and write
# all rows to AmazonFullResults.csv. Per-product parsing lives in a helper so
# its temporaries do not overwrite the notebook-level variables (title,
# mrp_clean, ...) used by the single-product cells.

import requests
from bs4 import BeautifulSoup
import re
import csv
import time

# Base URL for Amazon search results; "{}" is replaced by the page number
BASE_URL = "https://www.amazon.in/s?i=toys&rh=n%3A1378242031&s=popularity-rank&fs=true&ref=lp_1378242031_sar&page={}"

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0 Safari/537.36",
    "Accept-Language": "en-IN,en;q=0.9"
}

MAX_PAGES = 5         # number of result pages to scrape (change as needed)
REQUEST_TIMEOUT = 30  # seconds to wait for a response before giving up
PAGE_DELAY = 2        # seconds to pause between page requests

header = ["Title", "Price", "MRP", "Discount", "Rating", "Age Range"]


def _parse_product(product):
    """Convert one search-result card into a row matching ``header``.

    Args:
        product: BeautifulSoup tag for a single search-result container.

    Returns:
        list[str]: [title, price, mrp, discount, rating, age_range]. Missing
        text fields become "N/A", missing numeric fields "0", and a missing
        discount "0% off".
    """
    # --- Title ---
    title_tag = product.find("h2")
    title = title_tag.get_text().strip() if title_tag else "N/A"

    # --- Price ---
    price_tag = product.find("span", class_="a-price-whole")
    if price_tag:
        # Drop thousands separators and, if present, a trailing decimal point.
        price = price_tag.get_text().strip().replace(",", "").rstrip(".")
    else:
        price = "0"

    # --- MRP --- first numeric token of the struck-through price text
    mrp_tag = product.find("span", class_="a-text-price")
    mrp_text = mrp_tag.get_text().strip() if mrp_tag else "0"
    mrp_numbers = re.findall(r"\d[\d,]*\.?\d*", mrp_text)
    mrp = mrp_numbers[0].replace(",", "") if mrp_numbers else "0"

    # --- Discount --- normalised to "<n>% off"
    discount = "0% off"
    discount_tag = product.find("span", string=re.compile(r"% off"))
    if discount_tag:
        discount_numbers = re.findall(r"\d+", discount_tag.get_text())
        if discount_numbers:
            discount = discount_numbers[0] + "% off"

    # --- Rating --- first token of e.g. "3.8 out of 5 stars"
    rating_tag = product.find("span", class_="a-icon-alt")
    rating_parts = rating_tag.get_text().split() if rating_tag else []
    rating = rating_parts[0] if rating_parts else "0"  # empty text no longer raises IndexError

    # --- Age Range ---
    age_text = "N/A"
    age_parent = product.find("div", {"data-cy": "product-details-recipe"})
    if age_parent:
        age_tag = age_parent.find("div", class_="a-row a-size-base a-color-base")
        if age_tag:
            age_text = age_tag.get_text().strip()

    return [title, price, mrp, discount, rating, age_text]


all_rows = []

# Loop through multiple pages
for page_num in range(1, MAX_PAGES + 1):
    print(f"🔎 Scraping page {page_num}...")
    url = BASE_URL.format(page_num)

    try:
        page = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        page.raise_for_status()
    except requests.RequestException as exc:
        # Stop paging but keep (and still save) whatever was collected so far.
        print(f"⚠️ Request for page {page_num} failed ({exc}), stopping.")
        break

    soup = BeautifulSoup(page.content, "html.parser")

    products = soup.find_all("div", {"data-component-type": "s-search-result"})
    if not products:
        print("⚠️ No more products found, stopping.")
        break

    all_rows.extend(_parse_product(product) for product in products)

    # Small delay between pages to avoid hammering the server
    time.sleep(PAGE_DELAY)

# Save results to CSV
with open("AmazonFullResults.csv", "w", newline="", encoding="UTF-8") as f:
    writer = csv.writer(f)
    writer.writerow(header)
    writer.writerows(all_rows)

print(f"✅ Scraped {len(all_rows)} products in total and saved to AmazonFullResults.csv")


🔎 Scraping page 1...
🔎 Scraping page 2...
🔎 Scraping page 3...
🔎 Scraping page 4...
🔎 Scraping page 5...
✅ Scraped 120 products in total and saved to AmazonFullResults.csv


In [16]:
import pandas as pd

# Read back the single-product export. The relative path matches the
# 'AmazonWeb.csv' location the export cell writes to, so the notebook does not
# depend on one user's absolute Windows directory.
df = pd.read_csv('AmazonWeb.csv')

print(df)

                                               Title  Final Price   MRP  \
0  Umadiya® Range Rover Defender 1:32 Scale Model...          888  3000   

   Discount %  Rating        Date  
0          70     3.8  22-08-2025  
