In [20]:
import subprocess
import time
import psutil

def ensure_tor_is_running():
    tor_process = None
    for proc in psutil.process_iter(['name']):
        if proc.info['name'] == 'tor':
            tor_process = proc
            break
    
    if tor_process is None:
        print("Tor is not running. Starting Tor...")
        subprocess.Popen(["/opt/homebrew/opt/tor/bin/tor"])
        time.sleep(10)  # Wait for Tor to start up
        print("Tor should now be running.")
    else:
        print("Tor is already running.")

In [22]:
if __name__ == "__main__":
    ensure_tor_is_running()


Tor is already running.


In [5]:
import requests
from bs4 import BeautifulSoup
import time
import random
import logging
import csv
from urllib.parse import quote
import socks
import socket
from fake_useragent import UserAgent
import cloudscraper
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

class TorSession(requests.Session):
    def __init__(self, *args, **kwargs):
        super(TorSession, self).__init__(*args, **kwargs)
        self.proxies = {
            'http': 'socks5h://localhost:9050',
            'https': 'socks5h://localhost:9050'
        }

def get_tor_session():
    session = TorSession()
    retry = Retry(total=5, backoff_factor=0.2, status_forcelist=[500, 502, 503, 504])
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session

def check_tor_connection():
    session = get_tor_session()
    try:
        response = session.get('https://check.torproject.org/', timeout=30)
        if 'Congratulations. This browser is configured to use Tor.' in response.text:
            logging.info("Successfully connected through Tor.")
            return True
        else:
            logging.warning("Connected to the internet, but not through Tor.")
            return False
    except Exception as e:
        logging.error(f"Error checking Tor connection: {str(e)}")
        return False

class AmazonScraper:
    def __init__(self):
        self.ua = UserAgent()
        self.session = self.create_scraper()

    def create_scraper(self):
        scraper = cloudscraper.create_scraper(
            browser={
                'browser': 'chrome',
                'platform': 'windows',
                'desktop': True
            },
            delay=10,
            interpreter='nodejs'
        )
        scraper.proxies = {
            'http': 'socks5h://localhost:9050',
            'https': 'socks5h://localhost:9050'
        }
        return scraper

    def get_headers(self):
        return {
            'User-Agent': self.ua.random,
            'Accept-Language': 'en-US,en;q=0.9',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Referer': 'https://www.amazon.co.uk/',
            'DNT': '1',
            'Upgrade-Insecure-Requests': '1',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-User': '?1',
        }

    def throttle_request(self):
        time.sleep(random.uniform(10, 30))  

    def make_request(self, url, max_retries=5):
        for attempt in range(max_retries):
            try:
                self.session.cookies.clear()  
                headers = self.get_headers()
                response = self.session.get(url, headers=headers, timeout=60)
                response.raise_for_status()
                return response
            except requests.exceptions.RequestException as e:
                logging.warning(f"Attempt {attempt + 1} failed: {str(e)}")
                if attempt == max_retries - 1:
                    logging.error(f"Max retries reached. Unable to fetch {url}")
                    return None
                self.session = self.create_scraper()  # Create a new session for a new Tor circuit
                time.sleep((2 ** attempt) + random.random() * 30)  # Exponential backoff with added randomness

    def scrape_category(self, category, max_products=100):
        all_products = []
        page = 1
        encoded_category = quote(category)
        base_url = f"https://www.amazon.co.uk/s?k={encoded_category}&ref=nb_sb_noss_1"
        
        while len(all_products) < max_products:
            url = f"{base_url}&page={page}"
            self.throttle_request()
            
            response = self.make_request(url)
            if not response:
                break

            soup = BeautifulSoup(response.content, 'html.parser')
            product_cards = soup.select('div[data-component-type="s-search-result"]')
            
            if not product_cards:
                logging.warning(f"No product cards found on page {page} for {category}. This might be a captcha page.")
                break
            
            for card in product_cards:
                if len(all_products) >= max_products:
                    break
                
                product_data = self.extract_product_data(card, category)
                all_products.append(product_data)
                logging.info(f"Scraped product {len(all_products)} for {category}: {product_data['title']}")
            
            # Check if there's a next page
            next_page = soup.select_one('a.s-pagination-next')
            if not next_page:
                break
            
            page += 1
            self.session = self.create_scraper()  # This creates a new session for a new Tor circuit between pages 

        logging.info(f"Scraped {len(all_products)} products from the category {category}.")
        return all_products

    def extract_product_data(self, card, category):
        product_data = {'category': category}
        
        # Extract title
        title_elem = card.select_one('h2 a.a-link-normal')
        product_data['title'] = title_elem.text.strip() if title_elem else None
        
        # Extract price
        price_elem = card.select_one('span.a-price-whole')
        product_data['price'] = price_elem.text.strip() if price_elem else None
        
        # Extract rating
        rating_elem = card.select_one('span.a-icon-alt')
        product_data['rating'] = rating_elem.text.split()[0] if rating_elem else None
        
        # Extract review count
        review_count_elem = card.select_one('span.a-size-base.s-underline-text')
        product_data['review_count'] = review_count_elem.text.strip() if review_count_elem else None
        
        return product_data

def scrape_all_categories(categories):
    scraper = AmazonScraper()
    all_data = []

    for category in categories:
        if isinstance(category, list):
            for subcategory in category:
                products = scraper.scrape_category(f"{category[0]} {subcategory}", max_products=100)
                all_data.extend(products)
        else:
            products = scraper.scrape_category(category, max_products=100)
            all_data.extend(products)
        
        time.sleep(random.uniform(180, 300))  

    return all_data

def save_to_csv(data, filename):
    if not data:
        logging.warning("No data to save. CSV file will not be created.")
        return
    
    keys = data[0].keys()
    try:
        with open(filename, 'w', newline='', encoding='utf-8') as output_file:
            dict_writer = csv.DictWriter(output_file, keys)
            dict_writer.writeheader()
            dict_writer.writerows(data)
        logging.info(f"Data successfully saved to {filename}")
    except IOError as e:
        logging.error(f"IOError occurred while saving data: {str(e)}")
    except Exception as e:
        logging.error(f"Unexpected error occurred while saving data: {str(e)}")

if __name__ == "__main__":
    if not check_tor_connection():
        logging.error("Failed to connect through Tor. Exiting.")
        exit(1)

    categories = [
        "All Departments", "Apps & Games"
    ]

    try:
        all_products = scrape_all_categories(categories)
        save_to_csv(all_products, 'amazon_products_TOR.csv')
        logging.info("Scraping completed. Data saved to amazon_products_TOR.csv")
    except KeyboardInterrupt:
        logging.info("Scraping interrupted by user.")
    except Exception as e:
        logging.error(f"An unexpected error occurred: {str(e)}")
    finally:
        logging.info("Scraping process finished.")

2024-10-02 21:28:14,663 - INFO - Successfully connected through Tor.
2024-10-02 21:29:14,595 - INFO - Scraping interrupted by user.
2024-10-02 21:29:14,597 - INFO - Scraping process finished.


In [32]:
import requests
from bs4 import BeautifulSoup
import time
import random
import logging
import csv
from urllib.parse import quote
from fake_useragent import UserAgent

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def get_tor_session():
    session = requests.session()
    session.proxies = {
        'http': 'socks5h://localhost:9050',
        'https': 'socks5h://localhost:9050'
    }
    return session

def check_tor_connection():
    session = get_tor_session()
    try:
        response = session.get('https://check.torproject.org/', timeout=30)
        if 'Congratulations. This browser is configured to use Tor.' in response.text:
            logging.info("Successfully connected through Tor.")
            return True
        else:
            logging.warning("Connected to the internet, but not through Tor.")
            return False
    except Exception as e:
        logging.error(f"Error checking Tor connection: {str(e)}")
        return False

class AmazonScraper:
    def __init__(self):
        self.ua = UserAgent()
        self.session = get_tor_session()

    def get_headers(self):
        return {
            'User-Agent': self.ua.random,
            'Accept-Language': 'en-US,en;q=0.9',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Referer': 'https://www.amazon.co.uk/',
            'DNT': '1',
            'Upgrade-Insecure-Requests': '1',
            'Cache-Control': 'max-age=0',
        }

    def throttle_request(self):
        time.sleep(random.uniform(10, 20))

    def make_request(self, url, max_retries=5):
        for attempt in range(max_retries):
            try:
                self.session.cookies.clear()
                headers = self.get_headers()
                response = self.session.get(url, headers=headers, timeout=30)
                response.raise_for_status()
                return response
            except requests.exceptions.RequestException as e:
                logging.warning(f"Attempt {attempt + 1} failed: {str(e)}")
                if attempt == max_retries - 1:
                    logging.error(f"Max retries reached. Unable to fetch {url}")
                    return None
                self.session = get_tor_session()  # Get a new Tor session
                time.sleep((2 ** attempt) + random.random() * 10)

    def scrape_category(self, category, max_products=25):
        all_products = []
        page = 1
        encoded_category = quote(category)
        base_url = f"https://www.amazon.co.uk/s?k={encoded_category}&ref=nb_sb_noss_1"
        
        while len(all_products) < max_products:
            url = f"{base_url}&page={page}"
            self.throttle_request()
            
            response = self.make_request(url)
            if not response:
                break

            soup = BeautifulSoup(response.content, 'html.parser')
            product_cards = soup.select('div[data-component-type="s-search-result"]')
            
            if not product_cards:
                logging.warning(f"No product cards found on page {page} for {category}. This might be a captcha page.")
                break
            
            for card in product_cards:
                if len(all_products) >= max_products:
                    break
                
                product_data = self.extract_product_data(card, category)
                all_products.append(product_data)
                logging.info(f"Scraped product {len(all_products)} for {category}: {product_data['title']}")
            
            next_page = soup.select_one('a.s-pagination-next')
            if not next_page:
                break
            
            page += 1
            self.session = get_tor_session()  # Get a new Tor session between pages
            time.sleep(random.uniform(30, 60))

        logging.info(f"Scraped {len(all_products)} products from the category {category}.")
        return all_products

    def extract_product_data(self, card, category):
        product_data = {'category': category}
        
        title_elem = card.select_one('h2 a.a-link-normal')
        product_data['title'] = title_elem.text.strip() if title_elem else None
        
        price_elem = card.select_one('span.a-price-whole')
        product_data['price'] = price_elem.text.strip() if price_elem else None
        
        rating_elem = card.select_one('span.a-icon-alt')
        product_data['rating'] = rating_elem.text.split()[0] if rating_elem else None
        
        review_count_elem = card.select_one('span.a-size-base.s-underline-text')
        product_data['review_count'] = review_count_elem.text.strip() if review_count_elem else None
        
        return product_data

def scrape_all_categories(categories):
    scraper = AmazonScraper()
    all_data = []

    for category in categories:
        if isinstance(category, list):
            for subcategory in category:
                products = scraper.scrape_category(f"{category[0]} {subcategory}", max_products=25)
                all_data.extend(products)
        else:
            products = scraper.scrape_category(category, max_products=25)
            all_data.extend(products)
        
        time.sleep(random.uniform(60, 120))

    return all_data

def save_to_csv(data, filename):
    if not data:
        logging.warning("No data to save. CSV file will not be created.")
        return
    
    keys = data[0].keys()
    try:
        with open(filename, 'w', newline='', encoding='utf-8') as output_file:
            dict_writer = csv.DictWriter(output_file, keys)
            dict_writer.writeheader()
            dict_writer.writerows(data)
        logging.info(f"Data successfully saved to {filename}")
    except IOError as e:
        logging.error(f"IOError occurred while saving data: {str(e)}")
    except Exception as e:
        logging.error(f"Unexpected error occurred while saving data: {str(e)}")

if __name__ == "__main__":
    if not check_tor_connection():
        logging.error("Failed to connect through Tor. Exiting.")
        exit(1)

    categories = [
        "All Departments", "Apps & Games"
    ]

    try:
        all_products = scrape_all_categories(categories)
        save_to_csv(all_products, 'amazon_products_TOR.csv')
        logging.info("Scraping completed. Data saved to amazon_products_TOR.csv")
    except KeyboardInterrupt:
        logging.info("Scraping interrupted by user.")
    except Exception as e:
        logging.error(f"An unexpected error occurred: {str(e)}")
    finally:
        logging.info("Scraping process finished.")

2024-10-02 22:28:20,387 - INFO - Successfully connected through Tor.
2024-10-02 22:28:45,465 - INFO - Scraped product 1 for All Departments: None
2024-10-02 22:28:45,466 - INFO - Scraped product 2 for All Departments: None
2024-10-02 22:28:45,466 - INFO - Scraped product 3 for All Departments: None
2024-10-02 22:28:45,467 - INFO - Scraped product 4 for All Departments: None
2024-10-02 22:28:45,467 - INFO - Scraped product 5 for All Departments: None
2024-10-02 22:28:45,468 - INFO - Scraped product 6 for All Departments: None
2024-10-02 22:28:45,469 - INFO - Scraped product 7 for All Departments: None
2024-10-02 22:28:45,469 - INFO - Scraped product 8 for All Departments: None
2024-10-02 22:28:45,470 - INFO - Scraped product 9 for All Departments: None
2024-10-02 22:28:45,471 - INFO - Scraped product 10 for All Departments: None
2024-10-02 22:28:45,472 - INFO - Scraped product 11 for All Departments: None
2024-10-02 22:28:45,472 - INFO - Scraped product 12 for All Departments: None
2024

In [59]:
import requests
from bs4 import BeautifulSoup
import time
import random
import logging
import csv
from urllib.parse import quote
from fake_useragent import UserAgent
import re

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def get_tor_session():
    session = requests.session()
    session.proxies = {
        'http': 'socks5h://localhost:9050',
        'https': 'socks5h://localhost:9050'
    }
    return session

def check_tor_connection():
    session = get_tor_session()
    try:
        response = session.get('https://api.ipify.org?format=json', timeout=30)
        ip = response.json()['ip']
        logging.info(f"Connected through IP: {ip}")
        return True
    except Exception as e:
        logging.error(f"Error checking Tor connection: {str(e)}")
        return False

class AmazonScraper:
    def __init__(self):
        self.ua = UserAgent()
        self.session = get_tor_session()
        self.pages_accessed = 0
        self.total_page_size = 0

    def get_headers(self):
        return {
            'User-Agent': self.ua.random,
            'Accept-Language': 'en-US,en;q=0.9',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Referer': 'https://www.amazon.co.uk/',
            'DNT': '1',
            'Upgrade-Insecure-Requests': '1',
            'Cache-Control': 'max-age=0',
        }

    def throttle_request(self):
        time.sleep(random.uniform(10, 20))

    def make_request(self, url, max_retries=10):
        for attempt in range(max_retries):
            try:
                self.session.cookies.clear()
                headers = self.get_headers()
                response = self.session.get(url, headers=headers, timeout=30)
                response.raise_for_status()
                self.pages_accessed += 1
                self.total_page_size += len(response.content)
                logging.info(f"Accessed page through IP: {self.session.get('https://api.ipify.org?format=json').json()['ip']}")
                return response
            except requests.exceptions.RequestException as e:
                logging.warning(f"Attempt {attempt + 1} failed: {str(e)}")
                if attempt == max_retries - 1:
                    logging.error(f"Max retries reached. Unable to fetch {url}")
                    return None
                self.session = get_tor_session()  # Get a new Tor session
                time.sleep((2 ** attempt) + random.random() * 10)

    def scrape_category(self, category, max_products=50, max_pages=2):
        all_products = []
        page = 1
        encoded_category = quote(category)
        base_url = f"https://www.amazon.co.uk/s?k={encoded_category}&ref=nb_sb_noss_1"
        
        while len(all_products) < max_products and page <= max_pages:
            url = f"{base_url}&page={page}"
            self.throttle_request()
            
            response = self.make_request(url)
            if not response:
                break

            soup = BeautifulSoup(response.content, 'html.parser')
            product_cards = soup.select('div[data-component-type="s-search-result"]')
            
            if not product_cards:
                logging.warning(f"No product cards found on page {page} for {category}. This might be a captcha page.")
                break
            
            for card in product_cards:
                if len(all_products) >= max_products:
                    break
                
                product_data = self.extract_product_data(card, category)
                if product_data:
                    all_products.append(product_data)
                    logging.info(f"Scraped product {len(all_products)} for {category}: {product_data['title']}")
            
            page += 1
            if page <= max_pages:
                self.session = get_tor_session()  # Get a new Tor session between pages
                time.sleep(random.uniform(30, 60))

        logging.info(f"Scraped {len(all_products)} products from the category {category}.")
        return all_products

    def extract_product_data(self, card, category):
        try:
            asin = card.get('data-asin')
            title_elem = card.select_one('h2 a.a-link-normal')
            title = title_elem.text.strip() if title_elem else None
            url = f"https://www.amazon.co.uk{title_elem['href']}" if title_elem else None
            
            price_elem = card.select_one('span.a-price-whole')
            price = float(price_elem.text.replace(',', '').strip()) if price_elem else None
            
            rating_elem = card.select_one('span.a-icon-alt')
            rating = float(rating_elem.text.split()[0]) if rating_elem else None
            
            review_count_elem = card.select_one('span.a-size-base.s-underline-text')
            review_count = int(review_count_elem.text.replace(',', '').strip()) if review_count_elem else None
            
            image_elem = card.select_one('img.s-image')
            image = image_elem['src'] if image_elem else None

            # Extract Best Sellers Rank
            bsr_elem = card.select_one('th:-soup-contains("Best Sellers Rank")')
            bsr = []
            if bsr_elem:
                bsr_text = bsr_elem.find_next('td').text.strip()
                ranks = bsr_text.split('\n')
                for rank in ranks:
                    rank = rank.strip()
                    if rank:
                        match = re.search(r'#([\d,]+)\s+in\s+(.+?)\s*(\(.*\))?$', rank)
                        if match:
                            rank_num, category_name = match.group(1), match.group(2)
                            bsr.append({
                                'rank': int(rank_num.replace(',', '')),
                                'category': category_name.strip()
                            })

            return {
                'asin': asin,
                'title': title,
                'price': price,
                'rating': rating,
                'review_count': review_count,
                'category': category,
                'url': url,
                'image': image,
                'best_sellers_rank': bsr,
                'scraped_at': int(time.time() * 1000)  # Current timestamp in milliseconds
            }
        except Exception as e:
            logging.error(f"Error extracting product data: {str(e)}")
            return None

def scrape_all_categories(categories):
    scraper = AmazonScraper()
    all_data = []

    for category in categories:
        if isinstance(category, list):
            for subcategory in category:
                products = scraper.scrape_category(f"{category[0]} {subcategory}", max_products=50, max_pages=2)
                all_data.extend(products)
        else:
            products = scraper.scrape_category(category, max_products=50, max_pages=2)
            all_data.extend(products)
        
        time.sleep(random.uniform(60, 120))

    logging.info(f"Total pages accessed: {scraper.pages_accessed}")
    logging.info(f"Average page size: {scraper.total_page_size / scraper.pages_accessed / 1024:.2f} KB")

    return all_data

def save_to_csv(data, filename):
    if not data:
        logging.warning("No data to save. CSV file will not be created.")
        return
    
    keys = data[0].keys()
    try:
        with open(filename, 'w', newline='', encoding='utf-8') as output_file:
            dict_writer = csv.DictWriter(output_file, keys)
            dict_writer.writeheader()
            dict_writer.writerows(data)
        logging.info(f"Data successfully saved to {filename}")
    except IOError as e:
        logging.error(f"IOError occurred while saving data: {str(e)}")
    except Exception as e:
        logging.error(f"Unexpected error occurred while saving data: {str(e)}")

if __name__ == "__main__":
    if not check_tor_connection():
        logging.error("Failed to connect through Tor. Exiting.")
        exit(1)

    categories = [
        "Apps & Games", "Home & Kitchen"
    ]

    try:
        all_products = scrape_all_categories(categories)
        save_to_csv(all_products, 'amazon_products_TOR.csv')
        logging.info("Scraping completed. Data saved to amazon_products_TOR.csv")
    except KeyboardInterrupt:
        logging.info("Scraping interrupted by user.")
    except Exception as e:
        logging.error(f"An unexpected error occurred: {str(e)}")
    finally:
        logging.info("Scraping process finished.")

2024-10-04 23:35:05,909 - INFO - Connected through IP: 192.42.116.195
2024-10-04 23:36:24,463 - INFO - Accessed page through IP: 192.42.116.195
2024-10-04 23:36:24,540 - INFO - Scraped product 1 for Apps & Games: None
2024-10-04 23:36:24,541 - INFO - Scraped product 2 for Apps & Games: None
2024-10-04 23:36:24,542 - INFO - Scraped product 3 for Apps & Games: None
2024-10-04 23:36:24,543 - INFO - Scraped product 4 for Apps & Games: None
2024-10-04 23:36:24,543 - INFO - Scraped product 5 for Apps & Games: None
2024-10-04 23:36:24,544 - INFO - Scraped product 6 for Apps & Games: None
2024-10-04 23:36:24,545 - INFO - Scraped product 7 for Apps & Games: None
2024-10-04 23:36:24,545 - INFO - Scraped product 8 for Apps & Games: None
2024-10-04 23:36:24,546 - INFO - Scraped product 9 for Apps & Games: None
2024-10-04 23:36:24,546 - INFO - Scraped product 10 for Apps & Games: None
2024-10-04 23:36:24,547 - INFO - Scraped product 11 for Apps & Games: None
2024-10-04 23:36:24,548 - INFO - Scraped