In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import numpy as np
from urllib.parse import urljoin
from datetime import datetime

In [None]:
def get_title(soup):
    """Return the product title from a parsed product page.

    Looks up the ``<span id="productTitle">`` element and returns its text
    with surrounding whitespace removed. When the element (or the soup
    itself) is missing, an empty string is returned instead.
    """
    try:
        # A missing element makes find() return None, so the attribute access
        # below raises AttributeError and we fall back to "".
        return soup.find("span", attrs={"id":'productTitle'}).text.strip()
    except AttributeError:
        return ""

In [None]:
def get_price(soup):
    """Extract the product price from a parsed product page.

    Candidate elements are tried in order and the first one with non-empty
    text wins:

    1. ``span.a-price span.a-offscreen``
    2. ``span#priceblock_ourprice``
    3. ``span#priceblock_dealprice``
    4. ``span#priceToPay``
    5. ``span.a-price-whole`` combined with ``span.a-price-fraction``
       (fraction defaults to ``"00"``)

    The chosen text must contain exactly one numeric amount; any currency
    symbol or other non-digit text around it is ignored and thousands
    separators (commas) are dropped.

    Returns:
        The price as a ``float``, or ``None`` when no candidate element is
        present or its text cannot be read as a single amount.
    """
    import re  # function-scope import: the notebook's import cell does not load `re`

    def _text_of(tag):
        # Stripped text of a tag, or "" when the tag is missing.
        return tag.text.strip() if tag is not None else ""

    try:
        price_text = _text_of(soup.select_one("span.a-price span.a-offscreen"))

        if not price_text:
            for element_id in ("priceblock_ourprice", "priceblock_dealprice", "priceToPay"):
                price_text = _text_of(soup.find("span", {"id": element_id}))
                if price_text:
                    break

        if not price_text:
            whole = _text_of(soup.select_one("span.a-price-whole"))
            if not whole:
                return None
            # The whole part may already end with the decimal separator
            # (e.g. "7,499."); drop it so we do not build "7499..00".
            whole = whole.replace(",", "").rstrip(". ")
            fraction = _text_of(soup.select_one("span.a-price-fraction")) or "00"
            price_text = f"{whole}.{fraction}"

        # Accept "<non-digits><amount><non-digits>" only, so text holding
        # several amounts (e.g. a price range) is rejected rather than guessed.
        match = re.fullmatch(r"\D*?(\d+(?:\.\d+)?)\D*", price_text.replace(",", ""))
        return float(match.group(1)) if match else None

    except (AttributeError, TypeError, ValueError):
        # Best-effort scraper: an unexpected page shape yields "no price".
        return None

In [None]:
def get_rating(soup):
    """Return the leading token of the first ``span.a-icon-alt`` text.

    For text such as ``"4.2 out of 5 stars"`` this is ``"4.2"``. The value
    is returned as a string; the caller is responsible for numeric
    conversion.

    Returns:
        The first whitespace-separated token of the element's text, or ``""``
        when the element is missing or its text is blank.
    """
    try:
        rating = soup.find("span", {"class": "a-icon-alt"})
        # split() without an argument ignores leading/trailing whitespace, so
        # text that starts with a space or newline still yields the number.
        tokens = rating.text.split()
    except AttributeError:
        # Element (or soup) missing. Only this is swallowed, so that
        # KeyboardInterrupt / SystemExit still stop a long scrape.
        return ""
    return tokens[0] if tokens else ""


In [None]:
def get_review_count(soup):
    """Return the number of customer reviews/ratings found on a product page.

    Several page locations are tried in order; the first one that yields a
    parseable integer wins:

    1. ``span#acrCustomerReviewText``
    2. ``#acrCustomerReviewLink span``
    3. the ``<span>`` inside ``a[data-hook="see-all-reviews-link-foot"]``
    4. any integer greater than 10 inside the text of
       ``a#acrCustomerReviewLink``

    A location whose text cannot be parsed no longer aborts the lookup; the
    next location is tried instead.

    Returns:
        The review count as an ``int``, or ``0`` when nothing usable is found
        or an unexpected error occurs (the error is printed).
    """

    def _leading_count(tag):
        # Parse the first whitespace-separated token of the tag's text as an
        # integer, ignoring commas and parentheses; None when not possible.
        if tag is None:
            return None
        tokens = tag.text.split()
        if not tokens:
            return None
        token = tokens[0].replace(",", "").replace("(", "").replace(")", "")
        try:
            return int(token)
        except ValueError:
            return None

    def _footer_span():
        # The count inside the "see all reviews" footer link, if that link exists.
        footer_link = soup.find("a", {"data-hook": "see-all-reviews-link-foot"})
        return footer_link.find("span") if footer_link is not None else None

    try:
        # Methods 1-3: lazily evaluated so later lookups run only when needed.
        lookups = (
            lambda: soup.find("span", {"id": "acrCustomerReviewText"}),
            lambda: soup.select_one("#acrCustomerReviewLink span"),
            _footer_span,
        )
        for lookup in lookups:
            count = _leading_count(lookup())
            if count is not None:
                return count

        # Method 4: check whether the rating link text mentions the review
        # count, e.g. "4.5 out of 5 stars    2,031 ratings".
        rating_link = soup.find("a", {"id": "acrCustomerReviewLink"})
        if rating_link is not None:
            for part in rating_link.text.split():
                cleaned = part.replace(",", "").replace("(", "").replace(")", "")
                if cleaned.isdigit():
                    num = int(cleaned)
                    # A rating never exceeds 10, so a larger number is taken
                    # to be the review count.
                    if num > 10:
                        return num

        return 0  # no review information found

    except Exception as e:
        print(f"  ‚ö†Ô∏è  Review scraping error: {e}")
        return 0

In [None]:
def get_availability(soup):
    """Return the availability text shown on a product page.

    Reads the ``.string`` of the first ``<span>`` inside
    ``<div id="availability">`` and strips surrounding whitespace. If any
    link in that chain is missing (``None``), ``"Not Available"`` is
    returned instead.
    """
    try:
        container = soup.find("div", attrs={'id':'availability'})
        # Any None along the chain raises AttributeError, handled below.
        return container.find("span").string.strip()
    except AttributeError:
        return "Not Available"

In [None]:
def get_discount(soup):
    """Return the discount percentage shown on a product page, as a string.

    Reads the first ``span.savingsPercentage`` element and removes the ``%``
    and ``-`` characters plus surrounding whitespace, so ``"-25%"`` becomes
    ``"25"``. The caller is responsible for numeric conversion.

    Returns:
        The cleaned discount text, or ``""`` when the element (or the soup
        itself) is missing.
    """
    try:
        discount = soup.find("span", {"class": "savingsPercentage"})
        if discount is None:
            return ""
        return discount.text.replace("%", "").replace("-", "").strip()
    except AttributeError:
        # Only a missing soup/attribute is swallowed, so KeyboardInterrupt /
        # SystemExit still stop a long scrape.
        return ""

In [None]:
def get_brand(soup):
    """Return the brand name shown on a product page.

    The byline link (``a#bylineInfo``) is tried first. Its text is cleaned
    as follows:

    * ``"Visit the <brand> Store"`` -> ``"<brand>"`` (only the leading
      "Visit the" and the trailing "Store" word are removed, so a brand
      whose name contains "Store" stays intact);
    * ``"Brand: <brand>"`` -> ``"<brand>"``.

    If the byline is missing or cleans down to nothing, the text of the
    first ``span.a-size-base.po-break-word`` element is used instead.

    Returns:
        The brand name, or ``""`` when none can be found.
    """
    visit_prefix = "Visit the"
    brand_prefix = "Brand:"
    store_suffix = "Store"

    try:
        brand = soup.find("a", {"id": "bylineInfo"})
        if brand is not None:
            name = brand.text.strip()
            if name.startswith(visit_prefix):
                name = name[len(visit_prefix):].strip()
                # Remove "Store" only as a whole trailing word.
                if name == store_suffix or name.endswith(" " + store_suffix):
                    name = name[:-len(store_suffix)].strip()
            elif name.startswith(brand_prefix):
                name = name[len(brand_prefix):].strip()
            if name:
                return name

        # Alternative method
        brand_alt = soup.find("span", {"class": "a-size-base po-break-word"})
        if brand_alt is not None:
            return brand_alt.text.strip()
        return ""
    except AttributeError:
        # Only a missing soup/attribute is swallowed, so KeyboardInterrupt /
        # SystemExit still stop a long scrape.
        return ""

In [None]:
if __name__ == '__main__':
    # Scrape Amazon.in search results for smartphones, visit every product
    # page found, extract its details with the get_* helpers defined above,
    # and save the collected rows to a CSV file.

    # Headers sent with every request.
    # NOTE(review): the User-Agent is an empty string here; presumably a real
    # browser User-Agent has to be filled in before running. Confirm.
    HEADERS = ({'User-Agent':'',
            'Accept-Language': 'en-US, en;q=0.5'})
    BASE_URL = "https://www.amazon.in"

    # Price range shown in the banner.
    # NOTE(review): MIN_PRICE / MAX_PRICE are only printed; scraped rows are
    # not filtered by them, so the CSV can contain products outside the range.
    MIN_PRICE = 0
    MAX_PRICE = 10000
    TOTAL_PAGES = 3
    PRICE_LABEL = "Under 10k"
    # Pause between consecutive HTTP requests, applied after failures as well.
    REQUEST_DELAY_SECONDS = 2

    print("="*60)
    print(f"üîç Amazon Smartphone Scraper - {PRICE_LABEL}")
    print(f"üí∞ Price Range: ‚Çπ{MIN_PRICE:,} - ‚Çπ{MAX_PRICE:,}")
    print(f"üìÑ Pages to Scrape: {TOTAL_PAGES}")
    print("="*60)

    product_links = []

    # Step 1: Collect all product links from the search result pages
    for page in range(1, TOTAL_PAGES + 1):
        # Wait before every page except the first, so a failed page (which
        # skips the rest of the loop body) is still followed by a pause.
        if page > 1:
            time.sleep(REQUEST_DELAY_SECONDS)

        print(f"\nüìÑ Scraping search page {page}/{TOTAL_PAGES}...")
        search_url = f"https://www.amazon.in/s?k=smartphones+under+10k&crid=3YMQSPRDU2DB&sprefix=smartphones+under+10%2Caps%2C380&ref=nb_sb_noss_2&page={page}"

        try:
            response = requests.get(search_url, headers=HEADERS, timeout=10)
            if response.status_code != 200:
                print(f"  ‚ö†Ô∏è  Page {page} failed with status code: {response.status_code}")
                continue
        except requests.exceptions.RequestException as e:
            print(f"  ‚ö†Ô∏è  Page {page} request failed: {e}")
            continue

        soup = BeautifulSoup(response.content, "html.parser")

        links = soup.find_all("a", attrs={'class':'a-link-normal s-no-outline'})

        # Keep only anchors that point at a product page, without the query string.
        page_links = []
        for link in links:
            href = link.get("href")
            if href and "/dp/" in href:
                page_links.append(href.split("?")[0])
        product_links.extend(page_links)

        # Report the product links actually collected, not every matched anchor.
        print(f"  ‚úì Found {len(page_links)} products on page {page}")

    # Step 2: De-duplicate while keeping first-seen order, so repeated runs
    # visit products (and write CSV rows) in a reproducible order.
    product_links = list(dict.fromkeys(product_links))
    print(f"\n‚úÖ Total unique products found: {len(product_links)}")
    print("="*60)

    data = {
        "title": [],
        "price": [],
        "rating": [],
        "reviews": [],
        "discount": [],
        "brand": [],
        "product_url": []
    }

    # Step 3: Scrape each product page
    print(f"\nüõí Starting to scrape {len(product_links)} products...")
    print("="*60)

    successful_scrapes = 0
    failed_scrapes = 0

    for idx, link in enumerate(product_links, start=1):
        product_url = urljoin(BASE_URL, link)
        print(f"üì¶ [{idx}/{len(product_links)}] Scraping product...")

        # Pause before every product request (the first one follows the last
        # search request); failures can no longer skip the delay.
        time.sleep(REQUEST_DELAY_SECONDS)

        try:
            product_page = requests.get(product_url, headers=HEADERS, timeout=10)
            if product_page.status_code != 200:
                print(f"  ‚ùå Failed - Status code: {product_page.status_code}")
                failed_scrapes += 1
                continue
        except Exception as e:
            print(f"  ‚ùå Failed - Request error: {e}")
            failed_scrapes += 1
            continue

        product_soup = BeautifulSoup(product_page.content, "html.parser")

        try:
            # Extract every field first so a failure cannot leave the
            # column lists with different lengths.
            title = get_title(product_soup)
            cleaned_price = get_price(product_soup)
            rating = get_rating(product_soup)
            reviews = get_review_count(product_soup)
            discount = get_discount(product_soup)
            brand = get_brand(product_soup)

            # Append the row
            data["title"].append(title)
            data["price"].append(cleaned_price)
            data["rating"].append(rating)
            data["reviews"].append(reviews)
            data["discount"].append(discount)
            data["brand"].append(brand)
            data["product_url"].append(product_url)

            # Success message with key details
            print(f"  ‚úì Success - {title[:50]}... | Price: ‚Çπ{cleaned_price} | Reviews: {reviews}")
            successful_scrapes += 1

        except Exception as e:
            print(f"  ‚ùå Failed - Data extraction error: {e}")
            failed_scrapes += 1

    # Step 4: Build the DataFrame
    print("\n" + "="*60)
    print("üìä Creating DataFrame...")
    amazon_df = pd.DataFrame.from_dict(data)

    # Convert to proper data types; unparseable values become NaN
    amazon_df["price"] = pd.to_numeric(amazon_df["price"], errors="coerce")
    amazon_df["rating"] = pd.to_numeric(amazon_df["rating"], errors="coerce")
    amazon_df["reviews"] = pd.to_numeric(amazon_df["reviews"], errors="coerce")
    amazon_df["discount"] = pd.to_numeric(amazon_df["discount"], errors="coerce")

    # Step 5: Save to CSV
    csv_filename = "amazon_smartphones_under_10k.csv"
    amazon_df.to_csv(csv_filename, index=False, encoding='utf-8-sig')

üîç Amazon Smartphone Scraper - Under 10k
üí∞ Price Range: ‚Çπ0 - ‚Çπ10,000
üìÑ Pages to Scrape: 3

üìÑ Scraping search page 1/3...
  ‚úì Found 25 products on page 1

üìÑ Scraping search page 2/3...
  ‚úì Found 22 products on page 2

üìÑ Scraping search page 3/3...
  ‚úì Found 22 products on page 3

‚úÖ Total unique products found: 50

üõí Starting to scrape 50 products...
üì¶ [1/50] Scraping product...
  ‚úì Success - Samsung Galaxy M07 Mobile (Black, 4GB RAM, 64GB St... | Price: ‚Çπ7499.0 | Reviews: 668
üì¶ [2/50] Scraping product...
  ‚úì Success - Tecno Spark GO 2, 4GB + 64GB, Segment 1st IP64 SGS... | Price: ‚Çπ7499.0 | Reviews: 170
üì¶ [3/50] Scraping product...
  ‚úì Success - Motorola g35 5G (Midnight Black, 128 GB) (8 GB RAM... | Price: ‚Çπ13138.0 | Reviews: 60
üì¶ [4/50] Scraping product...
  ‚úì Success - realme P3 Lite 5G (Jade Green, 4GB RAM, 128GB Stor... | Price: ‚Çπ11530.0 | Reviews: 1
üì¶ [5/50] Scraping product...
  ‚úì Success - Slide and Learn Multiplica