<a href="https://colab.research.google.com/github/sudarshan-360/Machine-Learning/blob/main/Web_scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install requests beautifulsoup4 lxml pandas

import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from datetime import datetime
import time

# Browser-like User-Agent passed as the `headers=` argument of the scraper's
# requests.get calls.
headers = {"User-Agent": "Mozilla/5.0"}
def get_product_links(base_url, max_pages=3, timeout=15):
    """Collect unique product-page URLs from a paginated category listing.

    Pages are requested as ``base_url`` plus a ``p=<page>`` query parameter,
    for pages 1 through ``max_pages`` inclusive, pausing one second after
    each request.

    Args:
        base_url: URL of the category listing page.
        max_pages: Number of listing pages to scan, starting from page 1.
        timeout: Seconds to wait for each HTTP response before giving up
            on that page.

    Returns:
        A list of absolute ``https`` product URLs with duplicates removed,
        in the order they were first encountered. Pages that fail to load
        are reported and skipped, so the list may be partial.
    """
    # Use the right separator if base_url already carries a query string.
    separator = "&" if "?" in base_url else "?"
    links = []
    for page in range(1, max_pages + 1):
        url = f"{base_url}{separator}p={page}"
        print(f"🔗 Scanning: {url}")
        try:
            # Without a timeout a stalled connection would block forever.
            r = requests.get(url, headers=headers, timeout=timeout)
            r.raise_for_status()
        except requests.RequestException as e:
            # One bad page must not discard the links gathered so far.
            print(f"Error scanning {url}: {e}")
            time.sleep(1)
            continue

        soup = BeautifulSoup(r.content, "lxml")
        for a in soup.select("a.product-item-link"):
            href = a.get("href")
            if href and href.startswith("https"):
                links.append(href)
        time.sleep(1)

    # dict.fromkeys de-duplicates while keeping first-seen order, so repeated
    # runs produce the same ordering (a plain set does not guarantee that).
    return list(dict.fromkeys(links))

def extract_saree_info(url, timeout=15):
    """Scrape one product page and return its attributes as a flat dict.

    Args:
        url: Absolute URL of the product page.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        A dict with the keys ``Name``, ``Price``, ``Base Color``,
        ``Fabric Type``, ``Border Type``, ``Blouse Color``, ``Occasion``,
        ``Pallu``, ``Image URL`` and ``Product URL``. Values that cannot be
        located are ``"Not Found"`` (page elements) or ``"Not Mentioned"``
        (keywords searched for in the description). Returns ``None`` if the
        page could not be fetched or parsed; the error is printed.
    """
    try:
        r = requests.get(url, headers=headers, timeout=timeout)
        # Treat 4xx/5xx as failures instead of scraping the error page into
        # a row made up entirely of "Not Found" values.
        r.raise_for_status()
        soup = BeautifulSoup(r.content, "lxml")

        name = soup.select_one("h1.page-title")
        name = name.text.strip() if name else "Not Found"

        price = soup.select_one("span.price")
        price = price.text.strip() if price else "Not Found"

        # Build the attribute-table lookup row by row so that a single
        # malformed row (missing <th> or <td>) is skipped rather than
        # aborting the whole product.
        specs = {}
        for tr in soup.select("table.data.table.additional-attributes tbody tr"):
            th = tr.find("th")
            td = tr.find("td")
            if th is not None and td is not None:
                specs[th.text.strip()] = td.text.strip()

        fabric = specs.get("Material", "Not Found")
        color = specs.get("Color", "Not Found")
        border = specs.get("Border") or specs.get("Border Type") or "Not Found"
        blouse = specs.get("Blouse Color", "Not Found")

        desc = soup.select_one(".product.attribute.description")
        description_text = desc.text.strip() if desc else ""

        # First occasion keyword appearing anywhere in the description.
        occasion_match = re.search(r"(wedding|casual|festive|formal|party)", description_text, re.IGNORECASE)
        occasion = occasion_match.group(0).capitalize() if occasion_match else "Not Mentioned"

        # Text from the first "pallu" up to and including the next period.
        pallu_match = re.search(r"(pallu.*?)\.", description_text, re.IGNORECASE)
        pallu = pallu_match.group(0).strip() if pallu_match else "Not Mentioned"

        img = soup.select_one("img.fotorama__img")
        img_url = img.get("src") if img else None
        if not img_url:
            # The gallery <img> is often absent from the raw HTML (it appears
            # to be inserted by client-side script), so fall back to the Open
            # Graph image meta tag when the page provides one.
            og_image = soup.select_one('meta[property="og:image"]')
            img_url = og_image.get("content") if og_image else None
        img_url = img_url or "Not Found"

        return {
            "Name": name,
            "Price": price,
            "Base Color": color,
            "Fabric Type": fabric,
            "Border Type": border,
            "Blouse Color": blouse,
            "Occasion": occasion,
            "Pallu": pallu,
            "Image URL": img_url,
            "Product URL": url
        }

    except Exception as e:
        # Deliberately broad: scraping is best-effort, and one bad page
        # should be reported and skipped rather than stop the whole run.
        print(f"Error scraping {url}: {e}")
        return None


# Category listing to crawl; swap in a different category URL if desired.
category_url = "https://www.nalli.com/woman/saree/kanchipuram"
product_links = get_product_links(category_url, max_pages=100)

print(f"\n🔍 Found {len(product_links)} product pages.\nScraping now...")

# Scrape every product page, keeping only the ones that returned a record
# (extract_saree_info yields None for pages it could not process).
sarees = [
    record
    for record in map(extract_saree_info, product_links)
    if record is not None
]

# Write the records to a CSV whose name carries today's date.
filename = f"nalli_saree_dataset_{datetime.now():%Y-%m-%d}.csv"
df = pd.DataFrame(sarees)
df.to_csv(filename, index=False)

print("✅ DONE! Saved as", filename)
df.head()


🔗 Scanning: https://www.nalli.com/woman/saree/kanchipuram?p=1
🔗 Scanning: https://www.nalli.com/woman/saree/kanchipuram?p=2
🔗 Scanning: https://www.nalli.com/woman/saree/kanchipuram?p=3
🔗 Scanning: https://www.nalli.com/woman/saree/kanchipuram?p=4
🔗 Scanning: https://www.nalli.com/woman/saree/kanchipuram?p=5
🔗 Scanning: https://www.nalli.com/woman/saree/kanchipuram?p=6
🔗 Scanning: https://www.nalli.com/woman/saree/kanchipuram?p=7
🔗 Scanning: https://www.nalli.com/woman/saree/kanchipuram?p=8
🔗 Scanning: https://www.nalli.com/woman/saree/kanchipuram?p=9
🔗 Scanning: https://www.nalli.com/woman/saree/kanchipuram?p=10
🔗 Scanning: https://www.nalli.com/woman/saree/kanchipuram?p=11
🔗 Scanning: https://www.nalli.com/woman/saree/kanchipuram?p=12
🔗 Scanning: https://www.nalli.com/woman/saree/kanchipuram?p=13
🔗 Scanning: https://www.nalli.com/woman/saree/kanchipuram?p=14
🔗 Scanning: https://www.nalli.com/woman/saree/kanchipuram?p=15
🔗 Scanning: https://www.nalli.com/woman/saree/kanchipuram?p=16
🔗

Unnamed: 0,Name,Price,Base Color,Fabric Type,Border Type,Blouse Color,Occasion,Pallu,Image URL,Product URL
0,Pure Silk Pure Zari Orange Kanchipuram Saree,"₹46,145.00",Orange,Silk,Zari Border,Purple,Not Mentioned,Not Mentioned,Not Found,https://www.nalli.com/orange-kanchipuram-silk-...
1,Pure Silk Pure Zari Purple Dual Tone Kanchipur...,"₹39,270.00",Purple,Silk,Fancy Border,Purple,Not Mentioned,Not Mentioned,Not Found,https://www.nalli.com/purple-dual-tone-kanchip...
2,Brown Dual Tone Kanchipuram Silk Saree,"₹8,055.00",Brown,Silk,Zari Border,Red,Not Mentioned,Not Mentioned,Not Found,https://www.nalli.com/brown-dual-tone-kanchipu...
3,Pure Silk Pure Zari Orange Dual Tone Kanchipur...,"₹47,068.00",Orange,Silk,Zari Border,Sea Green,Not Mentioned,Not Mentioned,Not Found,https://www.nalli.com/orange-dual-tone-kanchip...
4,Maroon Kanchipuram Silk Saree,"₹17,145.00",Maroon,Silk,Ganga Jamuna Border,Maroon,Not Mentioned,Not Mentioned,Not Found,https://www.nalli.com/maroon-kanchipuram-silk-...


In [None]:
# Report the size of the scraped dataset: df.shape is (row count, column count).
print("Number of rows (excluding header):", df.shape[0])
print("Number of columns:", df.shape[1])

Number of rows (excluding header): 3326
Number of columns: 10


In [None]:
from google.colab import files

# Download the CSV produced by the scraping cell. Reuse its `filename`
# variable instead of a hard-coded date, so this works on whatever day the
# notebook is run.
files.download(filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>