In [4]:
# Importing required libraries
import pandas as pd  # Import pandas for data manipulation and analysis
import requests  # Import requests for sending HTTP requests
from bs4 import BeautifulSoup  # Import BeautifulSoup from bs4 for parsing HTML and XML documents
import csv  # Import csv for handling CSV files
import time  # Import time to add delay between requests to avoid rate limiting
import random  # Import random to generate random sleep times between requests

# Browser-like request headers; sent with every request so the site is less
# likely to reject the scraper as a non-browser client
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/61.0.3163.100 Safari/537.36'
    ),
}

# Column accumulators, created once before the page loop so that values from
# every scraped page end up in the same lists:
#   Product_name - product names
#   Prices       - product prices
#   Description  - product descriptions
#   Reviews      - product reviews
Product_name, Prices, Description, Reviews = [], [], [], []

# (connect, read) timeout in seconds so a stalled connection cannot hang the scrape forever
_REQUEST_TIMEOUT = (5, 30)
# How many times a single page is requested before giving up on it
_MAX_ATTEMPTS = 5


def _text_or_none(tag, strip=False):
    """Return the text of `tag` (optionally stripped), or None when `tag` is None."""
    if tag is None:
        return None
    return tag.text.strip() if strip else tag.text


def _product_card(name_tag, container):
    """Return the outermost ancestor of `name_tag` below `container` that holds no other product name.

    Looking up price/description/review inside this element (instead of in the
    whole listing) keeps the values of one product together, even when a
    product is missing one of the fields.
    """
    card = name_tag
    for ancestor in name_tag.parents:
        # Identity check on purpose: stop at the listing container itself
        if ancestor is container:
            break
        # An ancestor holding a second product name spans several products
        if len(ancestor.find_all('div', class_='KzDlHZ', limit=2)) > 1:
            break
        card = ancestor
    return card


# Loop through pages 1 to 149 to scrape data from multiple result pages
for i in range(1, 150):
    # Construct the URL with the page number i
    url = f"https://www.flipkart.com/search?q=mobiles+under+50000&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=on&as=off&as-pos=1&as-type=HISTORY&page={i}"

    # Fetch the page, retrying on rate limiting (429) and on network errors
    page = None
    for attempt in range(_MAX_ATTEMPTS):
        try:
            page = requests.get(url, headers=headers, timeout=_REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            # Network failure or timeout: treat like a rate limit and retry,
            # so one bad request does not throw away everything scraped so far
            print(f"Request for page {i} failed: {exc}")
            page = None
            time.sleep(10 + attempt * 5)
            continue

        print(f"Scraping page {i}: {page.status_code}")  # Report the result of the request

        if page.status_code != 429:
            # Either success (200) or an unexpected status; both end the retry loop
            break

        print("Rate limit hit. Waiting before retrying...")
        time.sleep(10 + attempt * 5)  # Linear backoff: 10s, 15s, 20s, ...

    # Decide whether the page is usable; any failure stops the whole scrape
    # (the data collected so far is still saved after the loop)
    if page is None:
        print(f"Could not fetch page {i}. Exiting the loop.")
        break
    if page.status_code == 429:
        print("Rate limit not overcome. Exiting the loop.")
        break
    if page.status_code != 200:
        # Do not parse an error page as if it were a product listing
        print(f"Unexpected status code {page.status_code}. Stopping the loop.")
        break

    # Parse the HTML content of the page with BeautifulSoup
    soup = BeautifulSoup(page.content, 'html.parser')

    # Find the main container that holds the product listings
    box = soup.find('div', class_='DOjaWF gdgoEp')

    # No container means no products on this page: stop scraping further pages
    if box is None:
        print(f"No products found on page {i}. Stopping the loop.")
        break

    # Extract one row per product. Each field is looked up inside the product's
    # own card, and a missing field becomes None, so all four lists stay the
    # same length and row k of every list describes the same product.
    for name in box.find_all('div', class_='KzDlHZ'):
        card = _product_card(name, box)
        Product_name.append(name.text)
        Prices.append(_text_or_none(card.find('div', class_='Nx9bqj _4b5DiR'), strip=True))
        Description.append(_text_or_none(card.find('ul', class_='G4BRas')))
        Reviews.append(_text_or_none(card.find('div', class_='XQDdHH')))

    # Random pause between pages to reduce the chance of being rate limited
    time.sleep(random.uniform(2, 5))  # Sleep for a random duration between 2 and 5 seconds

# ---- Post-processing: report, align, tabulate and save the scraped columns ----

# Show how many values each column collected, so length mismatches are visible
print(f"Length of Product_name: {len(Product_name)}")
print(f"Length of Prices: {len(Prices)}")
print(f"Length of Description: {len(Description)}")
print(f"Length of Reviews: {len(Reviews)}")

# A DataFrame needs equally long columns: cut every list down to the shortest one
min_length = min(map(len, (Product_name, Prices, Description, Reviews)))
Product_name, Prices, Description, Reviews = (
    column[:min_length]
    for column in (Product_name, Prices, Description, Reviews)
)

# Assemble the table; each list becomes one column and values are stored as scraped
df = pd.DataFrame(
    data={
        'Product Name': Product_name,
        'Prices': Prices,
        'Description': Description,
        'Reviews': Reviews,
    }
)

# Write the table to disk without the index column; 'utf-8-sig' is UTF-8 with a BOM
csv_path = "E:/#DATA_ANALYST_PORTFOLIO_PROJECTS/03_PYTHON/Web_Scrapping/Flipkart_Multiple_Page_Scrapper/Flipkart_mobiles_under_50000.csv"
df.to_csv(csv_path, index=False, encoding='utf-8-sig')

# Tell the user that scraping and saving have finished
print("Data scraping completed and saved to CSV.")


Scraping page 1: 200
Scraping page 2: 200
Scraping page 3: 200
Scraping page 4: 200
Scraping page 5: 200
Scraping page 6: 429
Rate limit hit. Waiting before retrying...
Scraping page 6: 429
Rate limit hit. Waiting before retrying...
Scraping page 6: 429
Rate limit hit. Waiting before retrying...
Scraping page 6: 200
Scraping page 7: 429
Rate limit hit. Waiting before retrying...
Scraping page 7: 200
Scraping page 8: 429
Rate limit hit. Waiting before retrying...
Scraping page 8: 429
Rate limit hit. Waiting before retrying...
Scraping page 8: 200
Scraping page 9: 429
Rate limit hit. Waiting before retrying...
Scraping page 9: 200
Scraping page 10: 429
Rate limit hit. Waiting before retrying...
Scraping page 10: 429
Rate limit hit. Waiting before retrying...
Scraping page 10: 200
Scraping page 11: 429
Rate limit hit. Waiting before retrying...
Scraping page 11: 429
Rate limit hit. Waiting before retrying...
Scraping page 11: 200
Scraping page 12: 429
Rate limit hit. Waiting before retryi