In [47]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

class ScrappingAmazon:

    def __init__(self):
        self.HEADERS = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36 OPR/115.0.0.0',
            'Accept-Language': 'en-US, en;q=0.5'
        }
        self.URL = "https://www.amazon.in/s?k=refrigerator&crid=284TJSM5X0E0A&sprefix=refrigerator%2Caps%2C253&ref=nb_sb_noss_2"
        self.soup = None  # Ensure soup is always initialized
        self.links_list = []  # Ensure links_list is always initialized

    def requestAndConvert(self):
        """Fetches the main Amazon search page and parses it."""
        try:
            webpage = requests.get(self.URL, headers=self.HEADERS)
            if webpage.status_code == 200:
                self.soup = BeautifulSoup(webpage.content, "html.parser")
                return self.soup
            else:
                print(f"Error: Page Not Found (Status Code: {webpage.status_code})")
                self.soup = None
                return None
        except requests.exceptions.RequestException as e:
            print(f"Request Error: {e}")
            self.soup = None
            return None

    def findElements(self):
        """Extracts product links from the search page."""
        if self.soup is None:
            print("Error: No page content to parse. Ensure `requestAndConvert()` succeeded.")
            return None

        self.links = self.soup.find_all("a", attrs={'class': 'a-link-normal s-line-clamp-2 s-link-style a-text-normal'})

        if not self.links:
            print("Error: No product links found.")
            return None

        self.links_list = [link.get("href") for link in self.links]
        return self.links_list

    def reachPages(self):
        """Visits each product page and extracts details like title, price, and brand."""
        if not self.links_list:
            print("Error: No product links available. Ensure `findElements()` was executed successfully.")
            return None

        self.d = {"Title": [], "Price": [], "Brand": []}

        for link in self.links_list:
            try:
                item_webpage = requests.get("https://www.amazon.in" + link, headers=self.HEADERS)
                if item_webpage.status_code == 200:
                    self.Item_soup = BeautifulSoup(item_webpage.content, "html.parser")
                    self.d['Title'].append(self.get_title())
                    self.d['Price'].append(self.get_price())
                    self.d['Brand'].append(self.get_brand())
                else:
                    print(f"Skipping link due to error (Status Code: {item_webpage.status_code})")
            except requests.exceptions.RequestException as e:
                print(f"Error fetching product page: {e}")

        # **Convert dictionary to DataFrame**
        amazon_df = pd.DataFrame.from_dict(self.d)

        # **Handle missing values: Remove rows with empty titles**
        amazon_df['Title'].replace('', np.nan, inplace=True)
        amazon_df.dropna(subset=['Title'], inplace=True)

        # **Save data to CSV**
        amazon_df.to_csv("amazon_data.csv", header=True, index=False)
        print("\n✅ Data successfully saved to 'amazon_data.csv'")

        return amazon_df

    def get_title(self):
        """Extracts the product title from the page."""
        try:
            title = self.Item_soup.find("span", attrs={"id": 'productTitle'})
            return title.text.strip() if title else "N/A"
        except AttributeError:
            return "N/A"

    def get_price(self):
        """Extracts the product price from the page."""
        try:
            price = self.Item_soup.find("span", attrs={"class": 'a-price-whole'})
            return price.text.strip() if price else "N/A"
        except AttributeError:
            return "N/A"

    def get_brand(self):
        """Extracts the product brand from the page."""
        try:
            brand = self.Item_soup.find("span", attrs={"class": "a-size-base po-break-word"})
            return brand.text.strip() if brand else "N/A"
        except AttributeError:
            return "N/A"

# **Execution**
obj = ScrappingAmazon()

# Step 1: Fetch main page content
if obj.requestAndConvert():
    # Step 2: Extract product links
    if obj.findElements():
        # Step 3: Fetch product pages, extract details, and save to CSV
        amazon_df = obj.reachPages()
        print("\nExtracted Product Data:\n", amazon_df) 
        



✅ Data successfully saved to 'amazon_data.csv'

Extracted Product Data:
                                                 Title    Price  \
0   LG 272 L 3 Star Frost-Free Smart Inverter Comp...  29,990.   
1   Samsung 236 L, 3 Star, Convertible, Digital In...  26,990.   
2   Whirlpool 184 L 2 Star Direct-Cool Single Door...  11,990.   
3   Whirlpool 235 L 2 Star Frost Free Double Door ...  21,990.   
4   Haier 165 L, 1 Star, Direct-Cool Single Door R...  11,190.   
5   Whirlpool 235 L Frost Free Triple-Door Refrige...  25,600.   
6   Whirlpool 192 L 3 Star Vitamagic PRO Frost Fre...  15,990.   
7   Samsung 236 L, 3 Star, Convertible, Digital In...  26,990.   
8   Samsung 183 L, 4 Star, Digital Inverter, Direc...  16,490.   
9   Samsung 183 L, 5 Star, Digital Inverter, Direc...  17,690.   
10  Godrej 202 L 5 Star Advanced Inverter, Jumbo V...  18,490.   
11  Godrej 180 L 2 Star Advanced Capillary Technol...  12,390.   
12  Haier 185 L, 2 Star, Direct-Cool Single Door R...  11,990.   
13