In [1]:
!pip install requests
!pip install beautifulsoup4
!pip install pandas
!pip install fake-useragent
!pip install lxml

Collecting fake-useragent
  Obtaining dependency information for fake-useragent from https://files.pythonhosted.org/packages/e4/99/60d8cf1b26938c2e0a57e232f7f15641dfcd6f8deda454d73e4145910ff6/fake_useragent-1.5.1-py3-none-any.whl.metadata
  Downloading fake_useragent-1.5.1-py3-none-any.whl.metadata (15 kB)
Downloading fake_useragent-1.5.1-py3-none-any.whl (17 kB)
Installing collected packages: fake-useragent
Successfully installed fake-useragent-1.5.1


# Working directory

In [None]:
# Working directory: point the notebook at the folder that receives the output files.
import os

print("Current Directory:", os.getcwd())

# Folder the output files should be written to.
# NOTE(review): this is a machine-specific absolute path; adjust it when
# running the notebook on another computer.
new_directory = r"D:\My papers\Application\API_Youtube"  # Use the 'r' prefix to handle backslashes in Windows paths

# Change the current working directory only when the target exists, so the
# notebook keeps running (in the current folder) instead of raising
# FileNotFoundError on machines that do not have this path.
if os.path.isdir(new_directory):
    os.chdir(new_directory)
else:
    print(f"Directory not found, staying in the current one: {new_directory}")

# Verify which directory is active now
current_directory = os.getcwd()
print("Current Directory:", current_directory)

In [5]:
import json
import random
import time
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import requests
import scipy.io as sio
from bs4 import BeautifulSoup
from fake_useragent import UserAgent

def setup_scraper():
    """Build the HTTP headers sent with each Amazon request.

    The User-Agent value is taken from ``UserAgent().random``; every other
    header is a fixed string.

    Returns:
        dict: Header names mapped to their values.
    """
    random_agent = UserAgent().random
    return {
        'User-Agent': random_agent,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Connection': 'keep-alive',
        'Referer': 'https://www.amazon.com',
    }

def search_amazon(keyword, headers, timeout=15):
    """Fetch the Amazon search-results page for a keyword.

    Args:
        keyword: Search phrase. It is URL-encoded with ``quote_plus`` so that
            spaces become ``+`` and reserved characters such as ``+``, ``&``
            or ``#`` cannot corrupt the query string.
        headers: HTTP headers passed to ``requests.get``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook forever.

    Returns:
        The response body as text when the status code is 200; ``None`` for
        any other status code or when the request raises an exception.
    """
    base_url = f"https://www.amazon.com/s?k={quote_plus(keyword)}"

    try:
        print(f"Attempting to fetch URL: {base_url}")
        response = requests.get(base_url, headers=headers, timeout=timeout)
        print(f"Response status code: {response.status_code}")
        if response.status_code == 200:
            return response.text
        print(f"Failed to fetch data: Status code {response.status_code}")
        return None
    except Exception as e:
        # Best-effort fetch: report the problem and let the caller skip this keyword.
        print(f"Error occurred during request: {e}")
        return None

def parse_product_listing(html_content):
    """Extract product records from an Amazon search-results HTML page.

    Args:
        html_content: HTML text of the results page. A falsy value (``None``
            or an empty string) yields an empty list.

    Returns:
        list[dict]: One dict per ``div`` whose ``data-component-type`` is
        ``s-search-result``, with the keys ``title``, ``url``, ``price``,
        ``rating``, ``reviews``, ``asin`` and ``prime``. Fields whose element
        is not found are set to ``'N/A'`` (``prime`` becomes ``'No'``). An
        item that raises while being parsed is reported and skipped.

    NOTE(review): ``price`` reads only the ``a-price-whole`` span, and
    ``reviews`` takes the first ``span`` with class ``a-size-base`` inside the
    item — confirm against the live page that these are the intended elements.
    """
    if not html_content:
        print("No HTML content to parse")
        return []

    soup = BeautifulSoup(html_content, 'html.parser')

    print("Parsing HTML content...")

    items = soup.find_all('div', {'data-component-type': 's-search-result'})
    print(f"Found {len(items)} product items")

    def text_or_na(node):
        # Stripped text of a found element, or the 'N/A' placeholder.
        return node.text.strip() if node else 'N/A'

    products = []
    for item in items:
        try:
            # Title and product URL both come from the same anchor element
            link = item.find('a', {'class': 'a-link-normal s-underline-text s-underline-link-text s-link-style a-text-normal'})
            if link:
                title = link.text.strip()
                url = 'https://www.amazon.com' + link.get('href', '')
            else:
                title = 'N/A'
                url = 'N/A'

            product = {
                'title': title,
                'url': url,
                'price': text_or_na(item.find('span', {'class': 'a-price-whole'})),
                'rating': text_or_na(item.find('span', {'class': 'a-icon-alt'})),
                'reviews': text_or_na(item.find('span', {'class': 'a-size-base'})),
                # ASIN (Amazon Standard Identification Number)
                'asin': item.get('data-asin', 'N/A'),
                'prime': 'Yes' if item.find('i', {'class': 'a-icon-prime'}) else 'No',
            }

            products.append(product)
            print(f"Successfully parsed product: {product['title'][:50]}... | URL: {product['url'][:70]}...")

        except Exception as e:
            print(f"Error parsing product: {e}")
            continue

    return products

def save_data(products, base_filename='amazon_products'):
    """Write the collected products to CSV, JSON, MAT and a plain-text URL list.

    Args:
        products: List of product dicts. Each must contain ``title``,
            ``price``, ``rating``, ``reviews``, ``url``, ``asin`` and
            ``prime``; ``keyword`` is optional and defaults to ``''`` in the
            MAT file.
        base_filename: Path prefix for the output files; ``.csv``, ``.json``,
            ``.mat`` and ``_urls.txt`` are appended to it.

    Returns:
        bool: ``True`` when all four files were written, ``False`` when
        ``products`` is empty or any step raised an exception.
    """
    if not products:
        print("No products to save")
        return False

    # (MAT variable name, product dict key) for the fields every product must have
    mat_fields = (
        ('titles', 'title'),
        ('prices', 'price'),
        ('ratings', 'rating'),
        ('reviews', 'reviews'),
        ('urls', 'url'),
        ('asins', 'asin'),
        ('prime_status', 'prime'),
    )

    try:
        # CSV (utf-8-sig for Excel compatibility)
        csv_filename = f'{base_filename}.csv'
        pd.DataFrame(products).to_csv(csv_filename, index=False, encoding='utf-8-sig')
        print(f"Successfully saved CSV: {csv_filename}")

        # JSON
        json_filename = f'{base_filename}.json'
        with open(json_filename, 'w', encoding='utf-8') as json_file:
            json.dump(products, json_file, ensure_ascii=False, indent=4)
        print(f"Successfully saved JSON: {json_filename}")

        # MAT file: one array per field, built column by column
        mat_filename = f'{base_filename}.mat'
        mat_data = {
            variable: np.array([entry[key] for entry in products])
            for variable, key in mat_fields
        }
        mat_data['keywords'] = np.array([entry.get('keyword', '') for entry in products])
        sio.savemat(mat_filename, mat_data)
        print(f"Successfully saved MAT file: {mat_filename}")

        # Plain-text list of (truncated title, URL) pairs for easy access
        urls_filename = f'{base_filename}_urls.txt'
        with open(urls_filename, 'w', encoding='utf-8') as urls_file:
            urls_file.writelines(
                f"{entry['title'][:100]}...\n{entry['url']}\n\n" for entry in products
            )
        print(f"Successfully saved URLs: {urls_filename}")

        return True
    except Exception as e:
        print(f"Error saving data: {e}")
        return False

def main(keywords=None):
    """Scrape Amazon search results for each keyword and save them to disk.

    For every keyword the results page is fetched with ``search_amazon``,
    parsed with ``parse_product_listing``, and each product is tagged with the
    keyword that produced it. All products are then written by ``save_data``.

    Args:
        keywords: Iterable of search phrases. When ``None`` the built-in
            default list is used.

    Returns:
        None. Progress and errors are reported with ``print``; any exception
        is caught and reported rather than propagated.
    """
    try:
        if keywords is None:
            keywords = ['laptop', 'ipad', 'iphone', 'AI', 'DSLR']  # Add more keywords as needed
        keywords = list(keywords)  # allow any iterable and make len() available

        # Initialize empty list for all products
        all_products = []

        # Setup scraper
        headers = setup_scraper()
        print("Scraper setup completed")
        print(f"Starting scrape for keywords: {keywords}")

        last_index = len(keywords) - 1
        for index, keyword in enumerate(keywords):
            print(f"\nScraping data for: {keyword}")

            # Get HTML content
            html_content = search_amazon(keyword, headers)

            if html_content:
                # Parse products and remember which keyword produced them
                products = parse_product_listing(html_content)
                for product in products:
                    product['keyword'] = keyword
                all_products.extend(products)

                print(f"Found {len(products)} products for keyword: {keyword}")
            else:
                print(f"No HTML content retrieved for keyword: {keyword}")

            # Pause between consecutive requests whether or not the last one
            # succeeded (a failed request is exactly when backing off matters),
            # but do not wait after the final keyword.
            if index < last_index:
                delay = random.uniform(2, 5)
                print(f"Waiting {delay:.2f} seconds before next request...")
                time.sleep(delay)

        # Save results in multiple formats
        if all_products:
            save_data(all_products)
            print(f"Total products collected: {len(all_products)}")
        else:
            print("No products were collected")

    except Exception as e:
        print(f"An error occurred in main: {e}")


# Run the scraper when the cell/script is executed directly.
if __name__ == "__main__":
    main()

Scraper setup completed
Starting scrape for keywords: ['laptop', 'ipad', 'iphone', 'AI', 'DSLR']

Scraping data for: laptop
Attempting to fetch URL: https://www.amazon.com/s?k=laptop
Response status code: 200
Parsing HTML content...
Found 22 product items
Successfully parsed product: Laptop Computer Win 11, 15.6 inch 1920x1080 FHD IP... | URL: https://www.amazon.com/sspa/click?ie=UTF8&spc=MTo4MjkzNzMxODU2MDIzOTA5...
Successfully parsed product: 15.6 FHD Student Laptop Computer, 16GB RAM 1TB SSD... | URL: https://www.amazon.com/sspa/click?ie=UTF8&spc=MTo4MjkzNzMxODU2MDIzOTA5...
Successfully parsed product: Acer Aspire 3 A315-24P-R7VH Slim Laptop | 15.6" Fu... | URL: https://www.amazon.com/A315-24P-R7VH-Display-Quad-Core-Processor-Graph...
Successfully parsed product: HP Newest 14" Ultral Light Laptop for Students and... | URL: https://www.amazon.com/HP-Students-Business-Quad-Core-Storage/dp/B0B2D...
Successfully parsed product: 15.6 Inch Laptops, Windows 11 Laptop Computer with... | URL