Import required libraries

In [16]:
pip install requests beautifulsoup4

Note: you may need to restart the kernel to use updated packages.


In [17]:
import pandas as pd
import requests
import time
from bs4 import BeautifulSoup
import re #helps to call the url

Amazon URL

In [18]:
URL = "https://www.amazon.in/s?k=laptop&crid=3H4ZV8KKLMX73&sprefix=lapto%2Caps%2C348&ref=nb_sb_noss_2"

Make a header to mimic a browser visit (We make headers to tell that we are not any hackers or scammers)

In [19]:
headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/144.0.0.0 Safari/537.36"}

Send request to URL

Beautifulsoup to parse the HTML Content (It helps to convert the scarp data to html form)

In [20]:
data = []
print("Data list cleared")

for page in range(1, 7):  
    params = {"k": "laptop", "page": page}
    
    # Use verify=False to bypass SSL verification (for testing purposes)
    response = requests.get(URL, params=params, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")
    
    products = soup.find_all("div", {"data-component-type": "s-search-result"})

    
    
    for product in products:
        # Extract the product title
        title_tag = product.find("h2")
        if not title_tag:
            continue
        title_text = title_tag.get_text(strip=True)

        
        # Extract the product price
        price_tag = product.find("span", {"class": "a-price-whole"})
        price_text = price_tag.get_text(strip=True) if price_tag else "N/A"

        #Extract the processor
        processor = re.search(r"(i[3579]|Ryzen\s?\d)", title_text)
        processor = processor.group() if processor else None
        
        # Extract the product brand
        match = re.match(r'^\W*([A-Za-z]+)', title_text)
        brand = match.group(1).upper() if match else "UNKNOWN"

        # Extract RAM (improved version)
        ram_match = re.search(r'(\d+)\s*GB\s*(RAM|DDR\d+|LPDDR\d+)', title_text, re.IGNORECASE)
        ram = ram_match.group(1) + "GB" if ram_match else "N/A"
        
        # Extract the product rating 
        rating_tag = product.find("span", class_="a-icon-alt")
        rating = "N/A"
        if rating_tag:
            rating_match = re.search(r'(\d+\.?\d*)', rating_tag.get_text())
            rating = rating_match.group(1) if rating_match else "N/A"

        # Extract the Product SSD - Simplified
        ssd_match = re.search(r'(\d+)\s*(GB|TB)\s*(?:SSD|NVMe)', title_text, re.IGNORECASE) or re.search(r'(\d+)\s*(GB|TB)', title_text)
        ssd = (ssd_match.group(1) + ssd_match.group(2).upper()) if ssd_match else "N/A"
      

        # Extract the Product Windows Version - Improved version
        windows_version = "N/A"
        
        # Try: "Windows 11", "Windows 10", "Win11", "Win10", etc.
        windows_match = re.search(r'(?:Windows\s*|Win\s*)(\d+)', title_text, re.IGNORECASE)
        if windows_match:
            windows_version = "Windows " + windows_match.group(1)
        
        # Extract the Product Color
        color_match = re.search(r'\b(Black|Silver|Gray|Grey|White|Blue|Red|Gold|Green|Brown|Pink|Purple|Yellow|Orange|Champagne|Midnight|Space|Cosmic|Stardust|Graphite|Ash|Onyx|Platinum|Metallic)\b', title_text, re.IGNORECASE)
        color = color_match.group(1) if color_match else "N/A"

        # Extract Discount Percentage
        discount_tag = product.find("span", string=re.compile(r'%'))
        discount = discount_tag.get_text(strip=True) if discount_tag else "N/A"

        screen_match = re.search(r'(\d{1,2}\.?\d?)\s*(?=[^\d]*cm|[^\d]*["\'])',title_text,re.IGNORECASE)
        screen_size = screen_match.group(1) if screen_match else "N/A"



        # Store the extracted data in a dictionary and append to the list
        data.append({
            "Title": title_text,
            "Price": price_text,
            "Processor":processor,
            "Brand": brand,
            'RAM': ram,
            "Rating": rating,
            "Storage": ssd,
            "Windows": windows_version,
            "Color": color,
            "Discount": discount,
            "Screen Size": screen_size
        })

      
    print(f"Page {page} scraped")
    time.sleep(1)


Data list cleared
Page 1 scraped
Page 2 scraped
Page 3 scraped
Page 4 scraped
Page 5 scraped
Page 6 scraped


In [21]:
for product in data:
    print("Title:", product["Title"])
    print("Price:", product["Price"])
    print("Brand:", product["Brand"])
    print("Processor:",product["Processor"])
    print("RAM:", product.get("RAM"))
    print("Rating:", product.get("Rating"))
    print("Storage:", product.get("Storage"))
    print("Windows:", product.get("Windows"))
    print("Color:", product.get("Color"))
    print("Discount:",product.get("Discount"))
    print("Screen Size:",product.get("screen_size"))
    print("-" * 50)

Title: Dell 15, Intel Core i5 13th Gen-1334U, 16GB DDR4, 512GB SSD, FHD, 15.6"/39.62cm, Windows 11, MSO 2024, Grey, 1.66kg, [Vostro 3530], 120Hz 250 nits, 15 Month McAfee, Thin & Light, Laptop
Price: 56,990
Brand: DELL
Processor: i5
RAM: 16GB
Rating: 3.7
Storage: 512GB
Windows: Windows 11
Color: Grey
Discount: (21% off)
Screen Size: None
--------------------------------------------------
Title: ASUS Vivobook S14,Smartchoice,AMD Ryzen AI 7 350,16GB RAM,1TB SSD,OLED,14",Windows 11, Office24,M365 Basic (1Yr)*, Matte Gray,1.4Kg, M3407KA-SF049WS,50 Tops,Metallic Design,Next-Gen AI Laptop,Copilot+
Price: 83,990
Brand: ASUS
Processor: None
RAM: 16GB
Rating: 3.8
Storage: 1TB
Windows: Windows 11
Color: Gray
Discount: (23% off)
Screen Size: None
--------------------------------------------------
Title: EBook 11.6" HD Laptop | Best Student & Office Work Laptop | Celeron N4020 | 4GB DDR4 | 128GB eMMC + M.2 SSD Expandable Slot | Win 11 Home |31Wh Battery | UHD Graphics 600 | Black
Price: 10,990
Bra

In [22]:
data

[{'Title': 'Dell 15, Intel Core i5 13th Gen-1334U, 16GB DDR4, 512GB SSD, FHD, 15.6"/39.62cm, Windows 11, MSO 2024, Grey, 1.66kg, [Vostro 3530], 120Hz 250 nits, 15 Month McAfee, Thin & Light, Laptop',
  'Price': '56,990',
  'Processor': 'i5',
  'Brand': 'DELL',
  'RAM': '16GB',
  'Rating': '3.7',
  'Storage': '512GB',
  'Windows': 'Windows 11',
  'Color': 'Grey',
  'Discount': '(21% off)',
  'Screen Size': '15.6'},
 {'Title': 'ASUS Vivobook S14,Smartchoice,AMD Ryzen AI 7 350,16GB RAM,1TB SSD,OLED,14",Windows 11, Office24,M365 Basic (1Yr)*, Matte Gray,1.4Kg, M3407KA-SF049WS,50 Tops,Metallic Design,Next-Gen AI Laptop,Copilot+',
  'Price': '83,990',
  'Processor': None,
  'Brand': 'ASUS',
  'RAM': '16GB',
  'Rating': '3.8',
  'Storage': '1TB',
  'Windows': 'Windows 11',
  'Color': 'Gray',
  'Discount': '(23% off)',
  'Screen Size': '14'},
 {'Title': 'EBook 11.6" HD Laptop | Best Student & Office Work Laptop | Celeron N4020 | 4GB DDR4 | 128GB eMMC + M.2 SSD Expandable Slot | Win 11 Home |31

In [23]:
df=pd.DataFrame(data)
df

Unnamed: 0,Title,Price,Processor,Brand,RAM,Rating,Storage,Windows,Color,Discount,Screen Size
0,"Dell 15, Intel Core i5 13th Gen-1334U, 16GB DD...",56990,i5,DELL,16GB,3.7,512GB,Windows 11,Grey,(21% off),15.6
1,"ASUS Vivobook S14,Smartchoice,AMD Ryzen AI 7 3...",83990,,ASUS,16GB,3.8,1TB,Windows 11,Gray,(23% off),14
2,"EBook 11.6"" HD Laptop | Best Student & Office ...",10990,,EBOOK,4GB,5.0,4GB,Windows 11,Black,(56% off),11.6
3,JioBook 11 with Lifetime Office | Android 4G L...,10990,,JIOBOOK,4GB,2.9,4GB,,Blue,(56% off),
4,"HP 15, 13th Gen Intel Core i3-1315U (12GB DDR4...",41990,i3,HP,12GB,4.1,512GB,Windows 11,Silver,(20% off),15.6
...,...,...,...,...,...,...,...,...,...,...,...
127,Lenovo IdeaPad Flex 5 AMD Ryzen 5 5625U (16GB ...,,Ryzen 5,LENOVO,16GB,4.2,512GB,Windows 11,Grey,,35.5
128,"Lenovo ThinkBook 16, AMD Ryzen 7 7735HS, 16GB ...",57990,Ryzen 7,LENOVO,16GB,3.7,512GB,Windows 11,,(38% off),16
129,"ASUS Vivobook 15,13th Gen,Intel Core i7-13620H...",67990,i7,ASUS,16GB,4.0,1TB,Windows 11,Blue,(23% off),15.6
130,"Primebook 2 Max 2025 (New Launch) | 8GB RAM, 2...",22990,,PRIMEBOOK,8GB,4.3,8GB,,Gray,(34% off),


In [24]:
df['Product Name'] = df['Title'].str.split(r'[|,]').str[0].str.strip()
col = df.pop('Product Name')
df.insert(0, 'Product Name', col)
df.drop("Title",axis=1,inplace=True)

In [25]:
df.to_csv("amazon_laptops_raw.csv")

In [26]:
df

Unnamed: 0,Product Name,Price,Processor,Brand,RAM,Rating,Storage,Windows,Color,Discount,Screen Size
0,Dell 15,56990,i5,DELL,16GB,3.7,512GB,Windows 11,Grey,(21% off),15.6
1,ASUS Vivobook S14,83990,,ASUS,16GB,3.8,1TB,Windows 11,Gray,(23% off),14
2,"EBook 11.6"" HD Laptop",10990,,EBOOK,4GB,5.0,4GB,Windows 11,Black,(56% off),11.6
3,JioBook 11 with Lifetime Office,10990,,JIOBOOK,4GB,2.9,4GB,,Blue,(56% off),
4,HP 15,41990,i3,HP,12GB,4.1,512GB,Windows 11,Silver,(20% off),15.6
...,...,...,...,...,...,...,...,...,...,...,...
127,Lenovo IdeaPad Flex 5 AMD Ryzen 5 5625U (16GB ...,,Ryzen 5,LENOVO,16GB,4.2,512GB,Windows 11,Grey,,35.5
128,Lenovo ThinkBook 16,57990,Ryzen 7,LENOVO,16GB,3.7,512GB,Windows 11,,(38% off),16
129,ASUS Vivobook 15,67990,i7,ASUS,16GB,4.0,1TB,Windows 11,Blue,(23% off),15.6
130,Primebook 2 Max 2025 (New Launch),22990,,PRIMEBOOK,8GB,4.3,8GB,,Gray,(34% off),
