<a href="https://colab.research.google.com/github/pravinjaju1005/pravinjaju1005/blob/main/Nykaa_v4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
from bs4 import BeautifulSoup
import json
import re
from urllib.parse import urlparse, parse_qs

# HTTP headers attached to every request this notebook sends.
headers = {
    'User-Agent': 'Mozilla/5.0',
}

def extract_sku_from_url(url):
    """Return the product identifier encoded in a product URL.

    Lookup order: the ``skuId`` query parameter, then the ``productId`` query
    parameter, then the digits that follow ``/p/`` anywhere in the URL.
    An empty string is returned when none of these is present.
    """
    query_params = parse_qs(urlparse(url).query)
    for key in ('skuId', 'productId'):
        if key in query_params:
            return query_params[key][0]

    path_id = re.search(r'/p/(\d+)', url)
    if path_id is None:
        return ''
    return path_id.group(1)

def _clean_price(text):
    """Return the first number found in a price label, without thousands separators.

    Works on the digits only, so it does not depend on how the currency symbol
    or an 'MRP:' prefix is encoded. Returns '' when the label holds no number.
    """
    match = re.search(r'\d[\d,]*(?:\.\d+)?', text or '')
    return match.group(0).replace(',', '') if match else ''


def _percent_off(mrp, sale_price):
    """Return the whole-number percentage by which sale_price undercuts mrp, as a string.

    Returns '0' when either value is not numeric or when there is no reduction.
    """
    try:
        mrp_value, sale_value = float(mrp), float(sale_price)
    except (TypeError, ValueError):
        return '0'
    if mrp_value <= 0 or sale_value >= mrp_value:
        return '0'
    return str(round((mrp_value - sale_value) / mrp_value * 100))


def _load_json_ld(soup):
    """Return ``(product, breadcrumb)`` dicts taken from the page's JSON-LD scripts.

    Scripts are read in document order. Inside one script the last matching
    object wins; across scripts a later one only fills in what is still missing.
    Unparseable scripts are skipped. Either result may be an empty dict.
    """
    product, breadcrumb = {}, {}
    for script in soup.find_all('script', type='application/ld+json'):
        try:
            raw = json.loads(script.text.strip())
        except json.JSONDecodeError:
            # Best effort: a malformed script must not abort the whole product.
            continue

        found_product, found_breadcrumb = {}, {}
        for item in (raw if isinstance(raw, list) else [raw]):
            if not isinstance(item, dict):
                continue
            if item.get('@type') == 'Product':
                found_product = item
            elif item.get('@type') == 'BreadcrumbList':
                found_breadcrumb = item

        product = product or found_product
        breadcrumb = breadcrumb or found_breadcrumb
        if product and breadcrumb:
            break
    return product, breadcrumb


def _crumb_name(entry):
    """Return the display name of one breadcrumb entry, or '' if it has none."""
    if not isinstance(entry, dict):
        return ''
    name = entry.get('name')
    if not name and isinstance(entry.get('item'), dict):
        name = entry['item'].get('name')
    return name if isinstance(name, str) else ''


def _parse_prices(soup):
    """Return ``(mrp, sale_price, discount)`` strings read from the price block.

    ``discount`` is always a whole-number percentage: the digits of the
    discount label when the page shows one, otherwise derived from the prices.
    All three values are '' when the page has no price block or no spans in it.
    """
    price_block = soup.select_one("div.css-1d0jf8e")
    if price_block is None:
        return '', '', ''

    spans = price_block.find_all("span")
    if not spans:
        return '', '', ''

    mrp = sale_price = discount_label = ''
    if len(spans) >= 3:
        # The MRP figure may sit in a span nested inside the first span.
        nested_mrp = spans[0].find('span')
        mrp_source = nested_mrp if nested_mrp is not None else spans[0]
        mrp = _clean_price(mrp_source.get_text(strip=True))
        sale_price = _clean_price(spans[2].get_text(strip=True))
        # The discount label is a fourth span that is not always present;
        # reading it unconditionally raised "list index out of range".
        if len(spans) > 3:
            discount_label = spans[3].get_text(strip=True)
    elif len(spans) == 2:
        mrp = _clean_price(spans[0].get_text(strip=True))
        sale_price = _clean_price(spans[1].get_text(strip=True))
    else:
        sale_price = _clean_price(spans[0].get_text(strip=True))

    # A single listed price is both the MRP and the selling price. This also
    # covers the layout where the first span holds only a label and no number.
    mrp = mrp or sale_price
    sale_price = sale_price or mrp

    discount = re.sub(r'[^\d]', '', discount_label) or _percent_off(mrp, sale_price)
    return mrp, sale_price, discount


def _extract_note(description, note_type):
    """Return the text following '<note_type> Note(s):' up to the next full stop.

    ``note_type`` may be a regex alternation such as 'Heart|Middle'. It is
    wrapped in a non-capturing group so the alternation cannot split the rest
    of the pattern; without that, a bare 'Heart' matched with group 1 unset.
    """
    match = re.search(rf'(?:{note_type}) Notes?:?\s*([^.]+)', description, re.IGNORECASE)
    return match.group(1).strip() if match else ''


def _first_group(pattern, text, flags=0):
    """Return the stripped first capture group of the first match, or ''."""
    match = re.search(pattern, text, flags)
    return match.group(1).strip() if match else ''


def _image_urls(image_data):
    """Normalise the JSON-LD 'image' value to a list of URL strings."""
    if isinstance(image_data, str):
        return [image_data]
    if not isinstance(image_data, list):
        return []
    urls = []
    for entry in image_data:
        if isinstance(entry, str):
            urls.append(entry)
        elif isinstance(entry, dict) and isinstance(entry.get('url'), str):
            urls.append(entry['url'])
    return urls


def scrape_nykaa_product_detail(url, seed_url, timeout=30):
    """Fetch one product page and return its attributes as a flat dict.

    Args:
        url: Product page URL.
        seed_url: Listing URL the product was found on; stored under both
            'keyword' and 'seed_url'.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A dict with one entry per output column, or None when the page could
        not be fetched or parsed (the error is printed).

    Notes:
        * Values come from the page's JSON-LD, from elements selected by CSS
          class names, and from regexes over the visible text.
        * The 'discounnt' key is misspelled; it is kept unchanged so existing
          consumers of the resulting columns keep working.
    """
    try:
        res = requests.get(url, headers=headers, timeout=timeout)
        # Do not parse error pages as if they were products.
        res.raise_for_status()
        soup = BeautifulSoup(res.text, 'html.parser')

        data, breadcrumb = _load_json_ld(soup)

        # --- Brand ---
        brand_data = data.get('brand')
        if isinstance(brand_data, dict):
            brand = brand_data.get('name') or ''
        elif isinstance(brand_data, str):
            brand = brand_data
        else:
            brand = ''

        title = data.get('name', '')
        sku = extract_sku_from_url(url)

        # --- Breadcrumb: last entry -> subcategory, second to last -> product_type ---
        subcategory = product_type = ''
        crumbs = breadcrumb.get('itemListElement', [])
        if isinstance(crumbs, list) and len(crumbs) > 1:
            subcategory = _crumb_name(crumbs[-1])
            if len(crumbs) > 2:
                product_type = _crumb_name(crumbs[-2])

        # --- MRP, sale price, discount ---
        mrp, sale_price, discount = _parse_prices(soup)

        # --- Average rating and rating/review counts ---
        rating_elem = soup.select_one('div.css-m6n3ou')
        rating = ''
        if rating_elem is not None:
            rating = rating_elem.get_text(strip=True).replace('/5', '').strip()

        ratings_count = reviews_count = ''
        for div in soup.select("div.css-1eip5u4 .css-1hvvm95"):
            text = div.get_text(strip=True)
            if "ratings" in text:
                ratings_count = re.sub(r"[^\d]", "", text)
            elif "reviews" in text:
                reviews_count = re.sub(r"[^\d]", "", text)

        # --- Stock status ---
        offers = data.get('offers', {})
        if isinstance(offers, list):
            offers = next((offer for offer in offers if isinstance(offer, dict)), {})
        availability = offers.get('availability', '') if isinstance(offers, dict) else ''
        stock_status = 'In Stock' if 'InStock' in str(availability or '') else 'Out of Stock'

        # --- Description: page block first, JSON-LD as fallback ---
        desc_block = soup.select_one('.css-1p8o1c5')
        if desc_block is not None:
            description = desc_block.get_text(" ", strip=True)
        else:
            description = str(data.get('description') or '')

        # --- Fragrance notes ---
        top_notes = _extract_note(description, "Top")
        heart_notes = _extract_note(description, "Heart|Middle")
        base_notes = _extract_note(description, "Base")

        # --- Attribute tags taken from the page's visible text ---
        full_text = soup.get_text(" ", strip=True)

        ingredients = _first_group(
            r'Ingredients\s*[:\-]?\s*(.+?)(Formulation|Gender|Fragrance|About|$)',
            full_text, re.IGNORECASE)

        # NOTE(review): these patterns match the first occurrence of the word
        # anywhere on the page and capture up to the next '|'. The recorded run
        # shows 'fragrance_v1_filter' filled with breadcrumb text, so they
        # probably need anchoring to the attribute section - confirm against
        # the page markup.
        formulation = _first_group(r'Formulation\s*[:\-]?\s*([^\|]+)', full_text)
        gender = _first_group(r'Gender\s*[:\-]?\s*([^\|]+)', full_text)
        fragrance_type = _first_group(r'Fragrance\s*[:\-]?\s*([^\|]+)', full_text)
        occasion = _first_group(r'Occasion\s*[:\-]?\s*([^\|]+)', full_text)
        preference = _first_group(r'Preference\s*[:\-]?\s*([^\|]+)', full_text)

        # --- Country of origin & manufacturer (label paragraph + value paragraph) ---
        manufacturer = ''
        country = ''
        for tag in soup.find_all("p", class_="mrp-n-expiry"):
            label = tag.get_text(strip=True).lower()
            next_val = tag.find_next_sibling("p", class_="content-info")
            if not next_val:
                continue

            value = next_val.get_text(strip=True)
            if "manufacturer" in label:
                manufacturer = value
            elif "country of origin" in label:
                country = value

        # Fallback: look for the country inside the description block.
        if not country and desc_block is not None:
            if 'country of origin' in desc_block.get_text(strip=True).lower():
                country = _first_group(r'Country of Origin\s*:\s*([A-Za-z\s\(\)]+)',
                                       desc_block.get_text(), re.IGNORECASE)

        # --- Images ---
        image_urls = _image_urls(data.get('image'))

        # --- Sizes offered in the size dropdown, e.g. "40ml|100ml" ---
        size = ''
        size_dropdown = soup.select_one('select[title="SIZE"]')
        if size_dropdown:
            size_options = [opt.text.strip() for opt in size_dropdown.find_all('option')
                            if 'Select' not in opt.text]
            size = '|'.join(size_options)

        return {
            'source': 'Nykaa',
            'keyword': seed_url,
            'sku': sku,
            'parent_sku': '',
            'brand': brand,
            'title': title,
            'category': 'Fragrance',
            'subcategory': subcategory,
            'product_type': product_type,
            'size': size,
            'color': '',
            'offers': '',
            'avg rating': rating,
            'no ratings': ratings_count,
            'no reviews': reviews_count,
            'no Q&A': '',
            'mrp': mrp,
            'sale_price': sale_price,
            'discounnt': discount,
            'manufacturer_name': manufacturer,
            'seller_name': 'Nykaa E retail limited',
            'seller_country': country if country else 'India',
            'stock': stock_status,
            'description': description,
            'ingredients': ingredients,
            'formulation_filter': formulation,
            'fragrance_v1_filter': fragrance_type,
            'gender_filter': gender,
            'occasion_filter': occasion,
            'preference_filter': preference,
            'url': url,
            'seed_url': seed_url,
            'top_notes': top_notes,
            'heart_notes': heart_notes,
            'base_notes': base_notes,
            'images': '|'.join(image_urls)
        }

    except Exception as e:
        # Best effort: report the failure and let the caller skip this product.
        print(f"‚ùå Error scraping {url}: {e}")
        return None


In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
from urllib.parse import urljoin
from tqdm.notebook import tqdm  # For nice progress bar in Jupyter

# Category page whose paginated listing is crawled; also passed to the
# product scraper as the seed URL.
BASE_LISTING_URL = "https://www.nykaa.com/fragrance/c/53"

# Listing URL for one page; fill in `page` with str.format.
LISTING_TEMPLATE = "".join((
    BASE_LISTING_URL,
    "?page_no={page}&sort=popularity&search_redirection=True&eq=desktop",
))

# HTTP headers attached to every request this notebook sends.
headers = {'User-Agent': 'Mozilla/5.0'}

# ✅ 1. Product-detail extraction: `scrape_nykaa_product_detail()` is defined in the
#    previous cell; run that cell before this one.

# ✅ 2. Collect all product URLs from the listing pages
def get_product_urls(max_pages=100, timeout=30, delay=0.3):
    """Collect product page URLs from the paginated listing.

    Pages are requested in order starting at 1. Crawling stops at the first
    page that cannot be fetched, at the first page without product links, or
    after ``max_pages`` pages; URLs gathered up to that point are returned.

    Args:
        max_pages: Highest page number to request.
        timeout: Seconds to wait for each HTTP response.
        delay: Seconds to pause after each page that yielded product links.

    Returns:
        A list of absolute product URLs with the query string removed and no
        duplicates, in the order they were first seen, so repeated runs over
        the same listing give the same order.
    """
    seen = {}  # dict used as an insertion-ordered set
    for page in range(1, max_pages + 1):
        url = LISTING_TEMPLATE.format(page=page)
        try:
            res = requests.get(url, headers=headers, timeout=timeout)
            fetched = res.status_code == 200
        except requests.RequestException:
            # A network error ends the crawl but keeps what was collected so far.
            fetched = False
        if not fetched:
            print(f"‚ùå Failed to fetch page {page}")
            break

        soup = BeautifulSoup(res.text, 'html.parser')
        page_urls = [
            urljoin("https://www.nykaa.com", anchor.get('href').split('?')[0])
            for anchor in soup.select("a.css-qlopj4")  # product links
            if anchor.get('href')
        ]

        if not page_urls:
            print(f"‚úÖ No products found on page {page}, stopping.")
            break

        for product_url in page_urls:
            seen.setdefault(product_url, None)

        if delay > 0:
            time.sleep(delay)  # be nice to the server

    return list(seen)


def run_scraper(sample_size=None):
    """Scrape the listed products and return them as a DataFrame.

    Args:
        sample_size: When given, only the first ``sample_size`` product URLs
            are scraped (useful for a quick trial run). ``None`` scrapes all.

    Returns:
        A DataFrame with one row per successfully scraped product. Rows that
        repeat an already seen non-empty 'sku' are dropped (first one wins);
        rows without a SKU are all kept. The original row index is preserved.
        The frame is empty, with no columns, when nothing could be scraped.
    """
    product_urls = get_product_urls()
    print(f"üîó Found {len(product_urls)} product URLs")

    if sample_size is not None:
        product_urls = product_urls[:sample_size]
        print(f"Sampling {len(product_urls)} URLs for test")

    results = []
    for url in tqdm(product_urls, desc="üì¶ Scraping products"):
        try:
            product_data = scrape_nykaa_product_detail(url, BASE_LISTING_URL)
        except Exception as e:
            print(f"‚ùå Error scraping {url}: {e}")
            continue
        if product_data:
            results.append(product_data)

    df = pd.DataFrame(results)
    # With no rows there is no 'sku' column to deduplicate on.
    if df.empty or "sku" not in df.columns:
        return df

    # Only a non-empty SKU identifies a product; blank SKUs must not be
    # collapsed into a single row.
    duplicate_sku = df["sku"].ne("") & df.duplicated(subset=["sku"])
    return df.loc[~duplicate_sku].copy()


In [None]:
# Run the full scrape (listing pages first, then every product page found) and
# preview the first rows of the result.
df_sample = run_scraper()
df_sample.head()


üîó Found 1946 product URLs


üì¶ Scraping products:   0%|          | 0/1946 [00:00<?, ?it/s]

‚ùå Error scraping https://www.nykaa.com/yardley-london-navy-body-spray-for-men/p/13221609: list index out of range
‚ùå Error scraping https://www.nykaa.com/clinique-happy-heart-perfume-spray-config/p/306093: 'NoneType' object has no attribute 'strip'
‚ùå Error scraping https://www.nykaa.com/engage-mate-bodylicious-deo-spray-220ml-each-buy-1-get-1-free/p/470479: list index out of range
‚ùå Error scraping https://www.nykaa.com/bella-vita-organic-date-woman-eau-de-parfum-with-heart-shape-pendant-gold-chain/p/19145830: 'NoneType' object has no attribute 'strip'
‚ùå Error scraping https://www.nykaa.com/revlon-charlie-blue-body-deo-spray-charlie-blue-natural-perfume-spray-gift-set/p/18038819: list index out of range
‚ùå Error scraping https://www.nykaa.com/revlon-charlie-red-body-deo-spray-charlie-red-natural-perfume-spray-gift-set/p/18038817: list index out of range
‚ùå Error scraping https://www.nykaa.com/layer-r-wottagirl-cupid-heart-body-splash/p/7829849: 'NoneType' object has no attrib

Unnamed: 0,source,keyword,sku,parent_sku,brand,title,category,subcategory,product_type,size,...,fragrance_v1_filter,gender_filter,occasion_filter,preference_filter,url,seed_url,top_notes,heart_notes,base_notes,images
0,Nykaa,https://www.nykaa.com/fragrance/c/53,836727,,Gucci,Gucci Bloom Ambrosia Di Fiori Eau De Parfum,Fragrance,Perfumes (EDT & EDP),Womens Fragrance,10ml|100ml,...,‚ùØ Womens Fragrance ‚ùØ Perfumes (EDT & EDP) Hear...,,,,https://www.nykaa.com/gucci-bloom-ambrosia-di-...,https://www.nykaa.com/fragrance/c/53,,,,https://images-static.nykaa.com/media/catalog/...
1,Nykaa,https://www.nykaa.com/fragrance/c/53,13221605,,Yardley London,Yardley London Gentleman Urbane Deodorant Roll...,Fragrance,Deodorants/Roll-ons,Male Grooming,,...,,,,,https://www.nykaa.com/yardley-london-urbane-ro...,https://www.nykaa.com/fragrance/c/53,,,,https://images-static.nykaa.com/media/catalog/...
2,Nykaa,https://www.nykaa.com/fragrance/c/53,17493667,,Moi,Moi by Nykaa Joie De Vivre Citrus Fresh Perfum...,Fragrance,Perfumes (EDT & EDP),Womens Fragrance,,...,‚ùØ Womens Fragrance ‚ùØ Perfumes (EDT & EDP) Hear...,,,,https://www.nykaa.com/moi-by-nykaa-joie-de-viv...,https://www.nykaa.com/fragrance/c/53,,,,https://images-static.nykaa.com/media/catalog/...
3,Nykaa,https://www.nykaa.com/fragrance/c/53,12893265,,Dolce&Gabbana,Q By Dolce&Gabbana Eau De Parfum,Fragrance,Perfumes (EDT & EDP),Womens Fragrance,30ml|100ml,...,‚ùØ Womens Fragrance ‚ùØ Perfumes (EDT & EDP) Hear...,,,,https://www.nykaa.com/dolce-gabbana-q-by-dolce...,https://www.nykaa.com/fragrance/c/53,,,,https://images-static.nykaa.com/media/catalog/...
4,Nykaa,https://www.nykaa.com/fragrance/c/53,25493,,Jaguar,Jaguar Classic Motion Eau De Toilette For Him,Fragrance,Perfumes (EDT & EDP),Womens Fragrance,,...,‚ùØ Womens Fragrance ‚ùØ Perfumes (EDT & EDP) Hear...,,,,https://www.nykaa.com/jaguar-classic-motion-ed...,https://www.nykaa.com/fragrance/c/53,,,,https://images-static.nykaa.com/media/catalog/...


In [None]:
# Spot-check the MRP recorded for one SKU (row mask and column in a single .loc lookup).
df_sample.loc[df_sample.sku == '5904122', 'mrp']

1547    1200
Name: mrp, dtype: object

In [None]:
# Show the columns at positions 10-29 for every row (positional slice, end index exclusive).
df_sample.iloc[:,10:30]

Unnamed: 0,color,offers,avg rating,no ratings,no reviews,no Q&A,mrp,sale_price,discounnt,manufacturer_name,seller_name,seller_country,stock,description,ingredients,formulation_filter,fragrance_v1_filter,gender_filter,occasion_filter,preference_filter
0,,,4.1,134,28,,,2000,0,,Nykaa E retail limited,India,In Stock,Gucci Bloom Ambrosia Di Fiori Eau De Parfum,,,‚ùØ Womens Fragrance ‚ùØ Perfumes (EDT & EDP) Hear...,,,
1,,,4.7,59,3,,,249,0,,Nykaa E retail limited,India,In Stock,Yardley London Gentleman Urbane Deodorant Roll...,,,,,,
2,,,4.4,6709,1131,,1800,1399,22,,Nykaa E retail limited,India,In Stock,Moi by Nykaa Joie De Vivre Citrus Fresh Perfum...,,,‚ùØ Womens Fragrance ‚ùØ Perfumes (EDT & EDP) Hear...,,,
3,,,4.7,7,1,,,5450,0,,Nykaa E retail limited,India,In Stock,Q By Dolce&Gabbana Eau De Parfum,,,‚ùØ Womens Fragrance ‚ùØ Perfumes (EDT & EDP) Hear...,,,
4,,,4.2,104,15,,4100,2665,35,,Nykaa E retail limited,India,In Stock,Jaguar Classic Motion Eau De Toilette For Him,,,‚ùØ Womens Fragrance ‚ùØ Perfumes (EDT & EDP) Hear...,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1930,,,4.1,46,14,,799,549,31,,Nykaa E retail limited,India,In Stock,Bella Vita Organic Gourmet Collection Vanilla ...,,,‚ùØ Mens Fragrance ‚ùØ Perfumes (EDT & EDP) Heart ...,,,
1931,,,,,,,,27000,0,,Nykaa E retail limited,India,In Stock,Memo Paris Moroccon Leather Eau De Parfum,,,‚ùØ Mens Fragrance ‚ùØ Perfumes (EDT & EDP) Heart ...,,,
1932,,,4.2,10,,,,4595,0,,Nykaa E retail limited,India,In Stock,Forest Essentials Intense Perfume Selection Bo...,,,‚ùØ Womens Fragrance ‚ùØ Perfumes (EDT & EDP) Hear...,,,
1933,,,4.1,1198,78,,899,449,50,,Nykaa E retail limited,India,In Stock,Bella Vita Ceo Woman Eau De Perfume,,,‚ùØ Mens Fragrance ‚ùØ Perfumes (EDT & EDP) Heart ...,,,


In [None]:
# df_sample.to_csv("nykaa_fragrances_13_07_21_13.csv", index=False)