In [None]:
import time
from urllib.parse import quote_plus

import requests
from bs4 import BeautifulSoup

def scrape_amazon_with_headers(search_query):
    """
    Scrape an Amazon search-results page with requests and BeautifulSoup.

    A browser-like set of headers is sent to try and bypass anti-scraping
    measures.

    Args:
        search_query: Search text. It is URL-encoded before being placed in
            the query string, so spaces and reserved characters are safe.

    Returns:
        A list of dicts with the keys "title", "price" and "rating"; a field
        that cannot be located in a result is set to "N/A". An empty list is
        returned when the request fails, the status code is not 200, or no
        result containers are found in the page.
    """
    # Encode the query so characters such as spaces, "&" or "#" cannot break
    # or alter the URL.
    url = f"https://www.amazon.com/s?k={quote_plus(str(search_query))}"

    # A more complete set of headers to mimic a real browser
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        # "br" is deliberately not advertised: requests can only decode
        # Brotli when an optional package is installed, and an undecoded
        # body would silently parse to zero results.
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "en-US,en;q=0.5",
        "Connection": "keep-alive",
        "Referer": "https://www.google.com/",  # Pretend we came from a search engine
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }

    print(f"Attempting to scrape URL with enhanced headers: {url}")

    try:
        response = requests.get(url, headers=headers, timeout=10)
    except requests.exceptions.RequestException as e:
        print(f"Error during the request: {e}")
        return []

    # Check for non-200 status codes
    if response.status_code != 200:
        print(f"Request failed with status code: {response.status_code}")
        print(f"Reason: {response.reason}")
        # The HTML content might contain an error message or a CAPTCHA page
        print("Response content (first 500 chars):")
        print(response.text[:500])
        return []

    soup = BeautifulSoup(response.content, "html.parser")

    # The selector might need to be updated. It's a frequent point of failure.
    results = soup.find_all("div", {"data-component-type": "s-search-result"})

    if not results:
        print("No search results found. This might be due to a blocked request or a change in Amazon's HTML structure.")
        return []

    products = []
    for item in results:
        product_info = {}

        # Prefer the dedicated title span; fall back to the span inside the
        # <h2>, guarding against results that have no <h2> at all.
        title_tag = item.find("span", class_="a-text-normal")
        if title_tag is None:
            heading = item.find("h2")
            title_tag = heading.find("span") if heading is not None else None
        product_info["title"] = title_tag.get_text(strip=True) if title_tag is not None else "N/A"

        # The price is rendered as two spans (whole part and fraction);
        # both must be present to report a price.
        price_whole = item.find("span", class_="a-price-whole")
        price_fraction = item.find("span", class_="a-price-fraction")

        if price_whole and price_fraction:
            product_info["price"] = f"${price_whole.get_text(strip=True)}{price_fraction.get_text(strip=True)}"
        else:
            product_info["price"] = "N/A"

        rating_tag = item.find("span", class_="a-icon-alt")
        product_info["rating"] = rating_tag.get_text(strip=True) if rating_tag else "N/A"

        products.append(product_info)

    return products

if __name__ == "__main__":
    # Demo driver: run one search and show the first five hits.
    search_term = "laptop"
    print(f"Scraping Amazon for: '{search_term}'...")

    # Pause before the request so we do not hammer the site.
    time.sleep(5)

    laptops = scrape_amazon_with_headers(search_term)

    if not laptops:
        # An empty list covers blocked requests, HTTP errors and parse misses.
        print("\nFailed to retrieve products. The request was likely blocked.")
    else:
        print("\n--- Scraped Products ---")
        separator = "-" * 20
        for position, entry in enumerate(laptops[:5], start=1):
            print(f"Product {position}:")
            print(f"  Title: {entry.get('title')}")
            print(f"  Price: {entry.get('price')}")
            print(f"  Rating: {entry.get('rating')}")
            print(separator)


Scraping Amazon for: 'laptop'...
Attempting to scrape URL with enhanced headers: https://www.amazon.com/s?k=laptop

--- Scraped Products ---
Product 1:
  Title: HP 15.6 inch Laptop, HD Touchscreen Display, AMD Ryzen 3 7320U, 8 GB RAM, 128 GB SSD, AMD Radeon Graphics, Windows 11 Home in S Mode, Natural Silver, 15- fc0099nr
  Price: $299.99
  Rating: 4.4 out of 5 stars
--------------------
Product 2:
  Title: HP 14 Laptop, Intel Celeron N4020, 4 GB RAM, 64 GB Storage, 14-inch Micro-edge HD Display, Windows 11 Home, Thin & Portable, 4K Graphics, One Year of Microsoft 365 (14-dq0040nr, Snowflake White)
  Price: $173.27
  Rating: 4.1 out of 5 stars
--------------------
Product 3:
  Title: HP 2025 14 inch HD Laptop, Office Pro Lifetime License, Intel Processor N150, 16 GB RAM, 384 GB Storage(128GB UFS+256GB MSD), Copilot AI, Wi-Fi 6, Lightweight, Windows 11 Pro, w/Accessories
  Price: $399.99
  Rating: 3.9 out of 5 stars
--------------------
Product 4:
  Title: Apple 2025 MacBook Air 13-inch

In [None]:
!sudo apt-get install tesseract-ocr

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.


In [None]:
pip install pytesseract pillow

Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.13


In [None]:
from PIL import Image
import pytesseract

# Path to your image file
image_path = "/content/images.jpg"

# Optional: If Tesseract is not in your PATH, specify the executable location:
# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'  # Windows example

# Open the image with Pillow inside a context manager so the underlying file
# handle is closed as soon as OCR has finished, then extract its text.
with Image.open(image_path) as img:
    text = pytesseract.image_to_string(img)

print("Extracted Text:")
print(text)


Extracted Text:
“© MAKE TEXT
STAND OUT FROM
, BACKGROUNDS ”* <

      



OBSERVATIONS AND LEARNINGS FROM THE CODE ABOVE

The web scraping code successfully extracted the title, price, and rating of laptop listings from an Amazon search-results page (the first five are printed above), demonstrating the use of requests with enhanced headers and BeautifulSoup for parsing HTML. It also includes error handling for non-200 status codes and for the case where no results match the expected HTML structure, which can happen when the request is blocked or the website's structure changes.
The OCR code using pytesseract and Pillow extracted the text from an image file, although the output also contains a few stray characters, which shows that recognition is not perfect. This involved installing the Tesseract OCR engine (tesseract-ocr) and the Python libraries (pytesseract, pillow), and then using pytesseract.image_to_string() to perform the text extraction.
Both examples show how to use external libraries to perform specific tasks (web scraping and OCR) within a Python environment.



CONCLUSION

These examples highlight the capabilities of Python libraries like requests, BeautifulSoup, pytesseract, and Pillow for automating data extraction from websites and recognizing text within images. These techniques can be applied to a wide range of tasks, such as gathering data for analysis, automating data entry, or processing scanned documents. While web scraping can be challenging due to anti-scraping measures and website structure changes, and OCR accuracy can vary depending on image quality, these libraries provide powerful tools for tackling such problems programmatically.