In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# PHASE 1: WEB SCRAPING 

In [2]:
# Snapdeal search-results URL for the keyword "SHOES", written one query
# parameter per line so individual parameters are easy to spot and edit.
base_url = (
    "https://www.snapdeal.com/search"
    "?keyword=SHOES"
    "&santizedKeyword="
    "&catId="
    "&categoryId=0"
    "&suggested=false"
    "&vertical=p"
    "&noOfResults=20"
    "&searchState="
    "&clickSrc=go_header"
    "&lastKeyword="
    "&prodCatId="
    "&changeBackToAll=false"
    "&foundInAll=false"
    "&categoryIdSearched="
    "&cityPageUrl="
    "&categoryUrl="
    "&url="
    "&utmContent="
    "&dealDetail="
    "&sort=plrty"
)

In [3]:
# HTTP headers passed to every requests.get call in this notebook.
# The User-Agent imitates a desktop Chrome browser -- presumably so the site
# serves its regular HTML page to the scraper (not verified here).
headers = {
    "User-Agent": " ".join((
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "AppleWebKit/537.36 (KHTML, like Gecko)",
        "Chrome/115.0.0.0 Safari/537.36",
    )),
}


In [14]:
# Scrape the first page

def scrape_snapdeal_first_page(url):
    """Scrape a single Snapdeal search-results page into product records.

    The page is fetched with the module-level ``headers`` and parsed with
    BeautifulSoup's ``html.parser``; every ``div.product-tuple-listing``
    element becomes one record.

    Parameters
    ----------
    url : str
        Full URL of the search-results page to fetch.

    Returns
    -------
    list[dict]
        One dict per product tile with the keys ``Name``, ``Price``, ``MRP``,
        ``Discount``, ``Rating`` and ``Reviews``. ``Name``, ``Price``,
        ``Rating`` and ``Reviews`` are ``None`` when the corresponding
        element is missing or cannot be parsed; ``MRP`` falls back to
        ``Price`` and ``Discount`` to ``"0%"``.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails, times out, or returns an HTTP error status.
    """
    # The timeout keeps the cell from hanging on a stalled connection, and
    # raise_for_status() surfaces 4xx/5xx replies instead of silently parsing
    # an error page into an empty result.
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    products_data = []
    for item in soup.find_all("div", class_="product-tuple-listing"):
        # Name
        title = item.find("p", class_="product-title")
        name = title.text.strip() if title else None

        # Price
        # NOTE(review): the recorded output shows values such as 'Rs.  807',
        # so the "Rs." prefix is not removed by these replacements --
        # presumably it is handled in a later cleaning step; confirm.
        price_tag = item.find("span", class_="product-price")
        price = (
            price_tag.text.replace("₹", "").replace(",", "").strip()
            if price_tag else None
        )

        # MRP: fall back to the selling price when no struck-through price exists.
        mrp_tag = item.find("span", class_="lfloat product-desc-price strike")
        mrp = (
            mrp_tag.text.replace("₹", "").replace(",", "").strip()
            if mrp_tag else price
        )

        # Discount
        discount_tag = item.find("div", class_="product-discount")
        discount = discount_tag.text.strip() if discount_tag else "0%"

        # Rating: the filled-stars width is a percentage, so 100% -> 5.0 stars.
        rating = None
        rating_tag = item.find("div", class_="filled-stars")
        if rating_tag and "style" in rating_tag.attrs:
            try:
                width_pct = (
                    rating_tag["style"].split(":")[1].split(";")[0].replace("%", "")
                )
                rating = round(float(width_pct) / 20, 1)
            except (IndexError, ValueError):
                # Unexpected style value: record "no rating" rather than
                # aborting the whole scrape.
                rating = None

        # Reviews: text looks like "(163)"; anything non-numeric stays None.
        reviews = None
        review_tag = item.find("p", class_="product-rating-count")
        if review_tag:
            text = review_tag.text.replace("(", "").replace(")", "").strip()
            if text.isdigit():
                reviews = int(text)

        products_data.append({
            "Name": name,
            "Price": price,
            "MRP": mrp,
            "Discount": discount,
            "Rating": rating,
            "Reviews": reviews
        })

    return products_data


In [15]:
# Reuse the search URL already stored in `base_url` instead of repeating the
# long literal, so the two cannot drift apart when the query is edited.
url = base_url
first_page_data = scrape_snapdeal_first_page(url)

# Quick sanity check: number of records, then a peek at the first five.
print(len(first_page_data))
first_page_data[:5]

20


[{'Name': "Campus SNIPER LIGHT GREY Men's Sports Running Shoes",
  'Price': 'Rs.  807',
  'MRP': 'Rs. 1949',
  'Discount': '59% Off',
  'Rating': 4.3,
  'Reviews': 163},
 {'Name': "ASIAN TITAAN-06 Off White Men's Sports Running Shoes",
  'Price': 'Rs.  717',
  'MRP': 'Rs. 1999',
  'Discount': '64% Off',
  'Rating': 4.3,
  'Reviews': 712},
 {'Name': "hotstyle Gray Men's Sports Running Shoes",
  'Price': 'Rs.  512',
  'MRP': 'Rs. 2249',
  'Discount': '77% Off',
  'Rating': 4.0,
  'Reviews': 826},
 {'Name': "PENNEN Black Men's Sports Running Shoes",
  'Price': 'Rs.  354',
  'MRP': 'Rs. 999',
  'Discount': '65% Off',
  'Rating': 3.7,
  'Reviews': 48},
 {'Name': "Bersache N-SPO-S-9197 White Men's Sports Running Shoes",
  'Price': 'Rs.  1061',
  'MRP': 'Rs. 4999',
  'Discount': '79% Off',
  'Rating': 4.1,
  'Reviews': 121}]

In [16]:
# Pin the column schema explicitly so the frame still has these six columns
# (in this order) when the scrape returned no records at all.
df1 = pd.DataFrame(
    first_page_data,
    columns=["Name", "Price", "MRP", "Discount", "Rating", "Reviews"],
)
df1

Unnamed: 0,Name,Price,MRP,Discount,Rating,Reviews
0,Campus SNIPER LIGHT GREY Men's Sports Running ...,Rs. 807,Rs. 1949,59% Off,4.3,163
1,ASIAN TITAAN-06 Off White Men's Sports Running...,Rs. 717,Rs. 1999,64% Off,4.3,712
2,hotstyle Gray Men's Sports Running Shoes,Rs. 512,Rs. 2249,77% Off,4.0,826
3,PENNEN Black Men's Sports Running Shoes,Rs. 354,Rs. 999,65% Off,3.7,48
4,Bersache N-SPO-S-9197 White Men's Sports Runni...,Rs. 1061,Rs. 4999,79% Off,4.1,121
5,Clymb Gray Men's Sports Running Shoes,Rs. 632,Rs. 2100,70% Off,4.1,538
6,Campus ZURIK PRO Blue Men's Sports Running Shoes,Rs. 827,Rs. 1999,59% Off,4.3,653
7,HotStyle (tm) Gray Men's Sports Running Shoes,Rs. 512,Rs. 2249,77% Off,4.4,19
8,ASIAN DOMINATOR-03 Navy Men's Sports Running S...,Rs. 839,Rs. 2499,66% Off,4.2,1266
9,Campus WELLS Black Men's Sports Running Shoes,Rs. 602,Rs. 899,33% Off,4.1,337


## Scrape all pages and export the data to a CSV file

In [20]:
# Loop to Scrape All Pages

# Module-level result list: scrape_snapdeal_category() fills and returns this
# same list, and later cells read it directly.
products = []


def _parse_product_tile(item):
    """Convert one ``div.product-tuple-listing`` element into a record dict."""
    # Name
    title_tag = item.find("p", class_="product-title")
    name = title_tag.text.strip() if title_tag else None

    # Price
    price_tag = item.find("span", class_="product-price")
    price = (
        price_tag.text.replace("₹", "").replace(",", "").strip()
        if price_tag else None
    )

    # MRP: fall back to the selling price when no struck-through price exists.
    mrp_tag = item.find("span", class_="lfloat product-desc-price strike")
    mrp = (
        mrp_tag.text.replace("₹", "").replace(",", "").strip()
        if mrp_tag else price
    )

    # Discount
    discount_tag = item.find("div", class_="product-discount")
    discount = discount_tag.text.strip() if discount_tag else "0%"

    # Rating (convert from % width to 0-5 scale, one decimal place to match
    # scrape_snapdeal_first_page).
    rating = None
    rating_tag = item.find("div", class_="filled-stars")
    if rating_tag and "style" in rating_tag.attrs:
        try:
            width_pct = (
                rating_tag["style"].split(":")[1].split(";")[0].replace("%", "")
            )
            rating = round(float(width_pct) / 20, 1)
        except (IndexError, ValueError):
            # Unexpected style value: record "no rating" rather than
            # aborting a long multi-page scrape.
            rating = None

    # Review count: text looks like "(163)"; anything non-numeric stays None.
    reviews = None
    review_tag = item.find("p", class_="product-rating-count")
    if review_tag:
        count_text = review_tag.text.replace("(", "").replace(")", "").strip()
        if count_text.isdigit():
            reviews = int(count_text)

    return {
        "Name": name,
        "Price": price,
        "MRP": mrp,
        "Discount": discount,
        "Rating": rating,
        "Reviews": reviews
    }


def scrape_snapdeal_category(base_url, max_pages=50):
    """Scrape consecutive Snapdeal search-result pages into ``products``.

    For ``page`` in ``1..max_pages`` the URL
    ``{base_url}&pageno={page}&start={(page - 1) * 20}`` is fetched once with
    the module-level ``headers`` and every ``div.product-tuple-listing``
    element is turned into a record.

    The loop ends early when a request fails (a message is printed) or when a
    page contains no product tiles ("Pagination finished." is printed). After
    each page that yielded tiles the function sleeps for two seconds.

    Parameters
    ----------
    base_url : str
        Search URL to which the ``pageno`` and ``start`` parameters are appended.
    max_pages : int, default 50
        Maximum number of pages to request.

    Returns
    -------
    list[dict]
        The module-level ``products`` list, holding one dict per product with
        the keys ``Name``, ``Price``, ``MRP``, ``Discount``, ``Rating`` and
        ``Reviews``. The list is emptied at the start of every call, so
        re-running the scrape replaces earlier results instead of appending
        duplicates to them.
    """
    products_per_page = 20

    # Reset in place (same list object) so that re-executing the calling cell
    # does not stack a second copy of every product onto the previous run.
    products.clear()

    for page in range(1, max_pages + 1):
        start = (page - 1) * products_per_page

        url = f"{base_url}&pageno={page}&start={start}"

        print(f"Scraping page {page} | start={start}")

        # Exactly one request per page, always with a timeout and status check.
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Request failed on page {page}: {e}")
            break

        soup = BeautifulSoup(response.text, "html.parser")

        items = soup.find_all("div", class_="product-tuple-listing")

        if not items:
            print("Pagination finished.")
            break

        products.extend(_parse_product_tile(item) for item in items)

        # Pause between pages to keep the request rate low.
        time.sleep(2)

    return products

In [21]:
# Upper bound on the number of result pages to request; the scraper stops
# earlier on a failed request or on the first page without product tiles.
MAX_PAGES = 80
data = scrape_snapdeal_category(base_url, max_pages=MAX_PAGES)

Scraping page 1 | start=0
Scraping page 2 | start=20
Scraping page 3 | start=40
Scraping page 4 | start=60
Scraping page 5 | start=80
Scraping page 6 | start=100
Scraping page 7 | start=120
Scraping page 8 | start=140
Scraping page 9 | start=160
Scraping page 10 | start=180
Scraping page 11 | start=200
Scraping page 12 | start=220
Scraping page 13 | start=240
Scraping page 14 | start=260
Scraping page 15 | start=280
Scraping page 16 | start=300
Scraping page 17 | start=320
Scraping page 18 | start=340
Scraping page 19 | start=360
Scraping page 20 | start=380
Scraping page 21 | start=400
Scraping page 22 | start=420
Scraping page 23 | start=440
Scraping page 24 | start=460
Scraping page 25 | start=480
Scraping page 26 | start=500
Scraping page 27 | start=520
Scraping page 28 | start=540
Scraping page 29 | start=560
Scraping page 30 | start=580
Scraping page 31 | start=600
Scraping page 32 | start=620
Scraping page 33 | start=640
Scraping page 34 | start=660
Scraping page 35 | start=680


In [22]:
# Report how many records the multi-page scrape collected.
print("Total products scraped:", len(products))

Total products scraped: 1600


In [23]:
# Pin the column schema explicitly so the frame (and the CSV written from it)
# keeps these six columns even when the scrape collected no records.
df = pd.DataFrame(
    products,
    columns=["Name", "Price", "MRP", "Discount", "Rating", "Reviews"],
)
df

Unnamed: 0,Name,Price,MRP,Discount,Rating,Reviews
0,Campus SNIPER LIGHT GREY Men's Sports Running ...,Rs. 807,Rs. 1949,59% Off,4.3,163.0
1,ASIAN TITAAN-06 Off White Men's Sports Running...,Rs. 717,Rs. 1999,64% Off,4.3,712.0
2,Campus ZURIK PRO Blue Men's Sports Running Shoes,Rs. 827,Rs. 1999,59% Off,4.3,653.0
3,hotstyle Gray Men's Sports Running Shoes,Rs. 512,Rs. 2249,77% Off,4.0,826.0
4,ASIAN NAVIGATOR-02 White Men's Sports Running ...,Rs. 841,Rs. 2999,72% Off,4.3,217.0
...,...,...,...,...,...,...
1595,PENNEN Blue Men's Sports Running Shoes,Rs. 490,Rs. 999,51% Off,5.0,1.0
1596,Impakto Beige Men's Sports Running Shoes,Rs. 949,Rs. 3609,74% Off,3.3,3.0
1597,Campus STREME Navy Men's Sports Running Shoes,Rs. 857,Rs. 1899,55% Off,4.4,90.0
1598,PENNEN Gray Men's Sports Running Shoes,Rs. 352,Rs. 999,65% Off,,


In [26]:
# Write the scraped records to a UTF-8 CSV file, leaving out the row index.
df.to_csv(
    path_or_buf="snapdeal_shoes.csv",
    encoding="utf-8",
    index=False,
)

In [27]:
# Confirm how many rows were written to the CSV file.
print("Saved", len(df), "records to CSV")

Saved 1600 records to CSV
