Web scraper for vehicle fitment information (which parts fit which make, year, model and engine) from the RockAuto online catalog:

In [2]:
import requests
def fetch_proxies(
    url="https://api.proxyscrape.com/v2/?request=displayproxies&protocol=http&timeout=10000&country=all&ssl=all&anonymity=all",
    timeout=15,
):
    """Download a plain-text proxy list and turn it into ``requests``-style proxy dicts.

    Args:
        url: Endpoint whose response body lists one proxy address per line.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled server.

    Returns:
        A list of ``{"http": ..., "https": ...}`` dicts, one per non-blank
        line of the response. An empty list is returned when the request fails.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching proxies: {e}")
        return []

    proxies = []
    for line in response.text.splitlines():
        proxy = line.strip()
        if not proxy:
            # A blank line would otherwise produce a useless "http://" entry.
            continue
        proxies.append({"http": f"http://{proxy}", "https": f"http://{proxy}"})
    return proxies
    
# Proxy list fetched once when this cell runs; an empty list means the download failed.
# NOTE(review): nothing in the scraper cell visible below reads PROXIES.
PROXIES = fetch_proxies()


[{'http': 'http://46.47.197.210:3128', 'https': 'http://46.47.197.210:3128'}, {'http': 'http://65.109.220.163:80', 'https': 'http://65.109.220.163:80'}, {'http': 'http://45.92.177.60:8080', 'https': 'http://45.92.177.60:8080'}, {'http': 'http://152.26.229.66:9443', 'https': 'http://152.26.229.66:9443'}, {'http': 'http://185.17.146.18:8080', 'https': 'http://185.17.146.18:8080'}, {'http': 'http://51.79.170.92:80', 'https': 'http://51.79.170.92:80'}, {'http': 'http://43.134.229.98:3128', 'https': 'http://43.134.229.98:3128'}, {'http': 'http://89.35.237.187:8888', 'https': 'http://89.35.237.187:8888'}, {'http': 'http://103.49.202.250:80', 'https': 'http://103.49.202.250:80'}, {'http': 'http://49.7.11.187:80', 'https': 'http://49.7.11.187:80'}, {'http': 'http://154.203.132.49:8090', 'https': 'http://154.203.132.49:8090'}, {'http': 'http://47.91.104.88:3128', 'https': 'http://47.91.104.88:3128'}, {'http': 'http://72.10.160.173:5675', 'https': 'http://72.10.160.173:5675'}, {'http': 'http://6

In [21]:
import re
from urllib.parse import urlparse, urljoin
from datetime import datetime
import random
import pandas as pd
from tqdm import tqdm
import os
import aiohttp
import asyncio
import json
import logging
import time
from fake_useragent import UserAgent

# Logging: INFO and above go to both scraping.log and the console stream.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.FileHandler("scraping.log"), logging.StreamHandler()],
)

# Throttling knobs.
MAX_CONCURRENT_REQUESTS = 5  # permits in the semaphore that guards get_soup()
MAX_GLOBAL_RETRIES = 10      # attempts per (make, year) before the run stops
semaphore = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)

# Makes whose catalog pages are scraped.
TARGET_MAKES = ['toyota']

# Supplies a random User-Agent string for each get_soup() call.
ua = UserAgent()

class CheckpointSystem:
    """Persist per-make scraping progress in a small JSON file.

    The file holds one JSON object mapping a make name to the year most
    recently passed to :meth:`save_checkpoint` for that make.
    """

    def __init__(self, checkpoint_file):
        """Remember the file location and load any progress already stored there.

        Args:
            checkpoint_file: Path of the JSON checkpoint file.
        """
        self.checkpoint_file = checkpoint_file
        self.progress = self.load_checkpoint()

    def load_checkpoint(self):
        """Return the stored progress mapping, or ``{}`` when nothing usable exists.

        A missing file, a file that is not valid JSON (for example one left
        half-written by an interrupted run) or a JSON value that is not an
        object all yield an empty mapping; the latter two are logged as
        warnings instead of raising.
        """
        if not os.path.exists(self.checkpoint_file):
            return {}
        try:
            with open(self.checkpoint_file, 'r') as f:
                data = json.load(f)
        except ValueError as e:  # json.JSONDecodeError is a ValueError
            logging.warning("Ignoring unreadable checkpoint %s: %s", self.checkpoint_file, e)
            return {}
        if not isinstance(data, dict):
            logging.warning("Ignoring checkpoint %s: expected a JSON object", self.checkpoint_file)
            return {}
        return data

    def save_checkpoint(self, make, year):
        """Record ``year`` as the latest processed year for ``make`` and persist it.

        The mapping is written to a sibling temporary file and then moved over
        the real file with ``os.replace``, so an interruption during the write
        cannot leave a truncated checkpoint behind.
        """
        self.progress[make] = year
        directory = os.path.dirname(self.checkpoint_file)
        if directory:
            # First run: the data directory may not exist yet.
            os.makedirs(directory, exist_ok=True)
        tmp_path = f"{self.checkpoint_file}.tmp"
        with open(tmp_path, 'w') as f:
            json.dump(self.progress, f)
        os.replace(tmp_path, self.checkpoint_file)

    def get_last_processed_year(self, make):
        """Return the stored year for ``make``, or ``None`` if the make is unknown."""
        return self.progress.get(make, None)

# Shared checkpoint store used by gather_and_scrape_urls() to resume a run.
checkpoint = CheckpointSystem(
    '/Users/skylerwilson/Desktop/PartsWise/Data/fitment_data/scraping_checkpoint.json'
)

class RateLimiter:
    """Sliding-window limiter: at most ``rate_limit`` grants per ``time_period`` seconds."""

    def __init__(self, rate_limit, time_period=60):
        """Configure the window.

        Args:
            rate_limit: Maximum number of grants inside one window (>= 1).
            time_period: Window length in seconds.

        Raises:
            ValueError: If ``rate_limit`` is smaller than 1, since no caller
                could ever be granted a slot.
        """
        if rate_limit < 1:
            raise ValueError("rate_limit must be at least 1")
        self.rate_limit = rate_limit
        self.time_period = time_period
        # time.monotonic() timestamps of the grants still inside the window.
        self.request_times = []

    async def wait(self):
        """Suspend the caller until a slot is free, then claim it.

        After every sleep the window is pruned and checked again, so several
        coroutines that wake at the same moment cannot all slip through on a
        single freed slot. No ``await`` sits between the check and the append,
        so claiming a slot is not interleaved with other coroutines on the
        same event loop.
        """
        while True:
            # Monotonic clock: unaffected by wall-clock adjustments.
            now = time.monotonic()
            cutoff = now - self.time_period
            self.request_times = [t for t in self.request_times if t > cutoff]

            if len(self.request_times) < self.rate_limit:
                self.request_times.append(now)
                return

            # Sleep until the oldest grant leaves the window, then re-check.
            sleep_time = self.request_times[0] + self.time_period - now
            await asyncio.sleep(max(sleep_time, 0))

# Limiter awaited by get_soup() before each request: 10 grants per 60-second window.
rate_limiter = RateLimiter(10, time_period=60)

async def get_soup(session, url, retries=5, base_delay=5, max_delay=30):
    """Fetch ``url`` and return its body parsed with BeautifulSoup's ``html.parser``.

    Args:
        session: The aiohttp client session used to issue the GET request.
        url: Page to download.
        retries: Total number of attempts before giving up.
        base_delay: Base, in seconds, of the doubling back-off between attempts.
        max_delay: Upper bound, in seconds, for a single back-off sleep.

    Returns:
        The parsed page, or ``None`` when every attempt failed.

    Notes:
        The module-level ``semaphore`` is held for the whole retry loop,
        back-off sleeps included, and every attempt first waits on the
        module-level ``rate_limiter``.

        NOTE(review): ``BeautifulSoup`` is not imported by any cell visible
        in this notebook; on a fresh kernel this function raises NameError
        unless ``from bs4 import BeautifulSoup`` is added to the import cell.
    """
    headers = {
        'User-Agent': ua.random,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Referer': 'https://www.rockauto.com/',
        'DNT': '1',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
    }
    # aiohttp expects a ClientTimeout object; a bare number is the legacy form.
    request_timeout = aiohttp.ClientTimeout(total=30)

    async with semaphore:
        for attempt in range(retries):
            try:
                await rate_limiter.wait()

                print(f"Fetching URL: {url} with User-Agent: {headers['User-Agent']}")
                async with session.get(url, headers=headers, timeout=request_timeout) as response:
                    response.raise_for_status()
                    content = await response.text()
                    print("Successfully fetched the page.")
                    # Pause 2-5 s after each successful request.
                    await asyncio.sleep(random.uniform(2, 5))
                    return BeautifulSoup(content, 'html.parser')
            except Exception as e:
                print(f"Error fetching {url}: {e}")
                if attempt < retries - 1:
                    # Doubling back-off plus up to 5 s of jitter, capped at max_delay.
                    wait_time = min(base_delay * (2 ** attempt) + random.uniform(0, 5), max_delay)
                    print(f"Retrying in {wait_time:.2f} seconds...")
                    await asyncio.sleep(wait_time)
                else:
                    print(f"Failed to fetch {url} after {retries} attempts: {e}")

    return None

def get_make_urls(base_url="https://www.rockauto.com/en/catalog/", makes=None):
    """Build the catalog URL for each vehicle make.

    Args:
        base_url: Prefix to which the make slug is appended.
        makes: Iterable of make names. When omitted, the module-level
            ``TARGET_MAKES`` list is used.

    Returns:
        One URL per make, in input order. Each name is lower-cased and its
        spaces are replaced by ``+``.
    """
    if makes is None:
        makes = TARGET_MAKES
    return [f"{base_url}{make.lower().replace(' ', '+')}" for make in makes]

def get_year_urls(make_url):
    """Return ``"<make_url>,<year>"`` for every year from the current one down to 2000."""
    newest_year = datetime.now().year
    year_urls = []
    # Newest first: current year, ..., 2001, 2000.
    for year in reversed(range(2000, newest_year + 1)):
        year_urls.append(f"{make_url},{year}")
    return year_urls

async def get_model_urls(session, year_url):
    """Collect absolute model-page URLs linked from a make/year catalog page.

    A ``navlabellink`` anchor is kept when it has an href whose path splits
    into exactly three comma-separated segments and whose third segment does
    not occur as a substring of ``year_url``'s path.

    Returns:
        A list of URLs resolved against ``year_url``; empty when the page
        could not be fetched.
    """
    soup = await get_soup(session, year_url)
    if not soup:
        return []

    year_path = urlparse(year_url).path
    model_urls = []
    for anchor in soup.find_all("a", class_="navlabellink"):
        href = anchor.get('href')
        if not href:
            continue
        segments = urlparse(href).path.split(',')
        if len(segments) != 3:
            continue
        if segments[2] in year_path:
            continue
        model_urls.append(urljoin(year_url, href))
    return model_urls

async def get_engine_urls(session, model_url):
    """Collect engine entries linked from a model catalog page.

    For each ``td.nlabel`` cell, its first ``navlabellink`` anchor is kept
    when the href path has more than three comma-separated segments and the
    fourth segment does not occur as a substring of ``model_url``'s path.
    Anchors without an ``href`` attribute are skipped rather than raising
    ``KeyError``.

    Returns:
        A list of ``{'name': <link text>, 'url': <absolute URL>}`` dicts;
        empty when the page could not be fetched.
    """
    soup = await get_soup(session, model_url)
    if not soup:
        return []

    base_path = urlparse(model_url).path
    engines = []
    for td in soup.find_all('td', class_='nlabel'):
        link = td.find('a', class_='navlabellink')
        href = link.get('href') if link else None
        if not href:
            continue
        segments = urlparse(href).path.split(',')
        if len(segments) > 3 and segments[3] not in base_path:
            engines.append({'name': link.text.strip(), 'url': urljoin(model_url, href)})
    return engines

async def get_category_urls(session, engine_url):
    """Collect part-category entries linked from an engine catalog page.

    Args:
        session: The aiohttp client session passed through to ``get_soup``.
        engine_url: Dict with at least a ``'url'`` key, as produced by
            ``get_engine_urls``.

    A ``navlabellink`` anchor is kept when its href path has exactly six
    comma-separated segments and the sixth does not occur as a substring of
    the engine page's path. Anchors without an ``href`` attribute are skipped
    rather than raising ``KeyError``.

    Returns:
        A list of ``{'name': <link text>, 'url': <absolute URL>}`` dicts;
        empty when the page could not be fetched.
    """
    page_url = engine_url['url']
    soup = await get_soup(session, page_url)
    if not soup:
        return []

    base_path = urlparse(page_url).path
    categories = []
    for link in soup.find_all('a', class_='navlabellink'):
        href = link.get('href')
        if not href:
            continue
        segments = urlparse(href).path.split(',')
        if len(segments) == 6 and segments[5] not in base_path:
            categories.append({'name': link.text.strip(), 'url': urljoin(page_url, href)})
    return categories

async def get_subcategory_urls(session, category_url):
    """Collect subcategory entries linked from a category catalog page.

    Args:
        session: The aiohttp client session passed through to ``get_soup``.
        category_url: Dict with at least a ``'url'`` key, as produced by
            ``get_category_urls``.

    For each ``td.nlabel`` cell, its first ``navlabellink`` anchor is kept
    when the href path has exactly eight comma-separated segments and the
    eighth does not occur as a substring of the category page's path.
    Anchors without an ``href`` attribute are skipped rather than raising
    ``KeyError``.

    Returns:
        A list of ``{'name': <cell text>, 'url': <absolute URL>}`` dicts.
        The name is taken from the whole table cell, not only the anchor.
        The list is empty when the page could not be fetched.
    """
    page_url = category_url['url']
    soup = await get_soup(session, page_url)
    if not soup:
        return []

    base_path = urlparse(page_url).path
    subcategories = []
    for td in soup.find_all('td', class_='nlabel'):
        link = td.find('a', class_='navlabellink')
        href = link.get('href') if link else None
        if not href:
            continue
        segments = urlparse(href).path.split(',')
        if len(segments) == 8 and segments[7] not in base_path:
            subcategories.append({'name': td.text.strip(), 'url': urljoin(page_url, href)})
    return subcategories

def extract_info_from_url(url):
    """Derive vehicle and category fields from a comma-separated catalog URL path.

    The URL path is split on commas. Segment 0 (its last ``/`` component) is
    the make, segments 1-3 are year, model and engine, and segments 5 and 6
    are category and subcategory. Segment 4 is not used. Category and
    subcategory have ``+`` replaced by a space and ``&`` by ``and``; the other
    fields are returned exactly as they appear in the path.

    Returns:
        A dict holding only the fields for which the path has a segment.
    """
    segments = urlparse(url).path.split(',')

    def _readable(slug):
        # "body+&+lamp+assembly" -> "body and lamp assembly"
        return slug.replace('+', ' ').replace('&', 'and')

    # str.split always yields at least one element, so the make is always present.
    info = {'make': segments[0].split('/')[-1]}

    for key, position in (('year', 1), ('model', 2), ('engine', 3)):
        if position < len(segments):
            info[key] = segments[position]

    for key, position in (('category', 5), ('subcategory', 6)):
        if position < len(segments):
            info[key] = _readable(segments[position])

    return info

async def scrape_fitment_data(session, url_dict):
    """Scrape the part listings on one subcategory page.

    Args:
        session: The aiohttp client session passed through to ``get_soup``.
        url_dict: Dict with at least a ``'url'`` key.

    Returns:
        One dict per ``<tbody>`` whose id matches ``listingcontainer[<digits>]``.
        Each dict starts as a copy of the fields derived from the URL and
        gains ``'part_number'`` and/or ``'description'`` when the matching
        span is present. A listing with neither span is still included.
        The list is empty when the page could not be fetched.
    """
    page_url = url_dict['url']
    print(f"Scraping URL: {page_url}")
    url_info = extract_info_from_url(page_url)

    soup = await get_soup(session, page_url)
    if not soup:
        return []

    listing_id = re.compile(r"listingcontainer\[\d+\]")
    parts = []
    for listing in soup.find_all("tbody", id=listing_id):
        row = dict(url_info)
        number_span = listing.find("span", class_="listing-final-partnumber as-link-if-js")
        note_span = listing.find("span", class_="listing-footnote-text")

        if number_span:
            row['part_number'] = number_span.text.strip()
        if note_span:
            row['description'] = note_span.text.strip()

        parts.append(row)

    return parts

async def scrape_make_year(session, make_url, year):
    """Walk models, engines, categories and subcategories for one make/year.

    Pages are fetched one after another. Any exception raised while walking
    is reported and re-raised so the caller's retry loop can handle it; the
    rows collected up to that point are discarded with it.

    Returns:
        A DataFrame with one row per scraped part (possibly empty).
    """
    make_label = make_url.split('/')[-1].capitalize()
    year_url = f"{make_url},{year}"
    rows = []

    print(f"Scraping data for {make_label} - {year}")

    try:
        model_urls = await get_model_urls(session, year_url)
        print(f"Found {len(model_urls)} models for {year}")

        for model_url in tqdm(model_urls, desc=f"Models ({year})", leave=False):
            engines = await get_engine_urls(session, model_url)
            print(f"Found {len(engines)} engines for {model_url.split(',')[-1]}")

            for engine in engines:
                categories = await get_category_urls(session, engine)
                print(f"Found {len(categories)} categories for engine {engine['name']}")

                for category in categories:
                    subcategories = await get_subcategory_urls(session, category)
                    print(f"Found {len(subcategories)} subcategories for category {category['name']}")

                    for subcategory in subcategories:
                        parts = await scrape_fitment_data(session, subcategory)
                        if not parts:
                            continue
                        rows.extend(parts)
                        print(f"Added {len(parts)} parts for subcategory {subcategory['name']}")

    except Exception as e:
        print(f"Error scraping {make_label} - {year}: {e}")
        raise  # let the caller's retry mechanism decide what to do

    df = pd.DataFrame(rows)
    print(f"Scraped {len(df)} parts for {make_label} - {year}")
    return df

async def gather_and_scrape_urls():
    """Scrape every target make year by year, resuming from the checkpoint.

    For each make the years run from the year below the checkpointed one (or
    the current year when there is no checkpoint) down to 2000. Each year is
    attempted up to ``MAX_GLOBAL_RETRIES`` times. A year that yields parts is
    written to its own CSV file and recorded in the checkpoint; a year that
    yields none is neither saved nor checkpointed. When a year exhausts its
    retries, the whole run stops and the data gathered so far is returned.

    Returns:
        One DataFrame concatenating every per-year frame collected during
        this call, or an empty DataFrame when nothing was collected.
    """
    all_fitment_data = []
    make_urls = get_make_urls()

    async with aiohttp.ClientSession() as session:
        for make_url in make_urls:
            make_name = urlparse(make_url).path.split('/')[-1]
            current_year = datetime.now().year
            last_processed_year = checkpoint.get_last_processed_year(make_name)
            start_year = last_processed_year - 1 if last_processed_year else current_year
            # Empty when the checkpoint already sits at 2000 (make fully processed).
            years = range(start_year, 1999, -1)

            print(f"Starting scraping for {make_name.capitalize()} from year {start_year}")

            make_start_time = time.time()
            total_parts_for_make = 0

            for year in tqdm(years, desc=f"Years for {make_name.capitalize()}"):
                retry_count = 0
                while retry_count < MAX_GLOBAL_RETRIES:
                    try:
                        df = await scrape_make_year(session, make_url, year)
                        if not df.empty:
                            all_fitment_data.append(df)
                            total_parts_for_make += len(df)
                            intermediate_file = f"/Users/skylerwilson/Desktop/PartsWise/Data/fitment_data/rockauto_fitment_data_{make_name}_{year}.csv"
                            df.to_csv(intermediate_file, index=False)
                            print(f"Saved intermediate results for {make_name} {year} ({len(df)} parts)")
                            checkpoint.save_checkpoint(make_name, year)

                        await asyncio.sleep(random.uniform(30, 60))  # Cooldown between years
                        break  # If successful, break the retry loop
                    except Exception as e:
                        retry_count += 1
                        print(f"Error processing {make_name} {year} (Attempt {retry_count}/{MAX_GLOBAL_RETRIES}): {e}")
                        if retry_count >= MAX_GLOBAL_RETRIES:
                            print(f"Max retries reached for {make_name} {year}. Stopping the scraping process.")
                            return pd.concat(all_fitment_data, ignore_index=True) if all_fitment_data else pd.DataFrame()
                        # Linear back-off: 60 s, 120 s, 180 s, ...
                        await asyncio.sleep(60 * retry_count)

            make_end_time = time.time()
            make_duration = make_end_time - make_start_time
            # Guard the average: with no years left to process, len(years) is 0.
            average_parts = total_parts_for_make / len(years) if len(years) else 0.0
            print(f"Completed scraping for {make_name.capitalize()}:")
            print(f"  Total parts scraped: {total_parts_for_make}")
            print(f"  Time taken: {make_duration:.2f} seconds")
            print(f"  Average parts per year: {average_parts:.2f}")

    return pd.concat(all_fitment_data, ignore_index=True) if all_fitment_data else pd.DataFrame()

async def main():
    """Run the scrape and merge its result into the cumulative CSV file.

    Newly scraped rows are appended to whatever the output file already
    holds. When nothing new was scraped the output file is left untouched:
    writing an empty frame would produce a CSV with no header, which
    ``pd.read_csv`` is expected to reject with ``EmptyDataError`` on the next
    run. An existing file that is already in that state is treated as
    holding no data.
    """
    output_file = "/Users/skylerwilson/Desktop/PartsWise/Data/fitment_data/rockauto_fitment_data_complete.csv"
    print("Starting the scraping process")
    start_time = time.time()
    scraped_df = await gather_and_scrape_urls()
    end_time = time.time()

    print(f"Scraping completed in {end_time - start_time:.2f} seconds")
    print(f"Total parts scraped: {len(scraped_df)}")

    if scraped_df.empty:
        print(f"No new parts scraped; {output_file} left unchanged")
    else:
        existing_df = None
        if os.path.exists(output_file):
            try:
                existing_df = pd.read_csv(output_file)
            except pd.errors.EmptyDataError:
                # Header-less file left by an earlier run; nothing to merge.
                existing_df = None

        if existing_df is not None:
            final_df = pd.concat([existing_df, scraped_df], ignore_index=True)
            print(f"Merged new data with existing data. Total parts: {len(final_df)}")
        else:
            final_df = scraped_df

        final_df.to_csv(output_file, index=False)
        print(f"Saved complete dataset to {output_file}")

    print("Scraping process completed. Check the log file for detailed information.")

# Entry point: run the whole scrape on a new event loop.
# NOTE(review): asyncio.run() raises RuntimeError when the calling thread already
# has a running event loop, which is the usual situation inside a Jupyter kernel;
# in a notebook cell `await main()` is the common alternative — confirm how this
# cell is meant to be executed.
if __name__ == "__main__":
    asyncio.run(main())


Starting the scraping process
Starting scraping for Toyota from year 2024


Years for Toyota:   0%|          | 0/25 [00:00<?, ?it/s]

Scraping data for Toyota - 2024
Fetching URL: https://www.rockauto.com/en/catalog/toyota,2024 with User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0
Successfully fetched the page.
Found 28 models for 2024




Fetching URL: https://www.rockauto.com/en/catalog/toyota,2024,4runner with User-Agent: Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) EdgiOS/117.0.2045.48 Version/17.0 Mobile/15E148 Safari/604.1
Successfully fetched the page.
Found 1 engines for 4runner
Fetching URL: https://www.rockauto.com/en/catalog/toyota,2024,4runner,4.0l+v6,3455446 with User-Agent: Mozilla/5.0 (iPhone; CPU iPhone OS 17_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) EdgiOS/121.0.2277.107 Version/17.0 Mobile/15E148 Safari/604.1
Successfully fetched the page.
Found 17 categories for engine 4.0L V6
Fetching URL: https://www.rockauto.com/en/catalog/toyota,2024,4runner,4.0l+v6,3455446,body+&+lamp+assembly with User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0 GLS/100.10.9979.100
Successfully fetched the page.
Found 6 subcategories for category Body & Lamp Assembly
Scraping 

In [None]:
# Candidate makes for a wider scrape, in the order they were listed.
# NOTE(review): the scraper cell visible above reads TARGET_MAKES, not this name.
target_makes = [
    'toyota', 'ford', 'chevrolet', 'honda',
    'nissan', 'jeep', 'subaru', 'gmc',
    'hyundai', 'ram', 'kia', 'volkswagen',
    'mercedes-benz', 'bmw', 'lexus', 'audi',
    'mazda', 'buick', 'chrysler', 'dodge',
    'cadillac', 'volvo', 'lincoln', 'acura',
    'tesla', 'infiniti', 'mitsubishi', 'porsche',
    'land rover', 'jaguar', 'genesis', 'mini',
    'fiat', 'maserati', 'alfa romeo', 'bentley',
    'rolls-royce', 'aston martin', 'ferrari', 'lamborghini',
]

['toyota', 'ford', 'chevrolet', 'honda', 'nissan', 'jeep', 'subaru', 'gmc', 'hyundai', 'ram', 'kia', 'volkswagen', 'mercedes-benz', 'bmw', 'lexus', 'audi', 'mazda', 'buick', 'chrysler', 'dodge', 'cadillac', 'volvo', 'lincoln', 'acura', 'tesla', 'infiniti', 'mitsubishi', 'porsche', 'land rover', 'jaguar', 'genesis', 'mini', 'fiat', 'maserati', 'alfa romeo', 'bentley', 'rolls-royce', 'aston martin', 'ferrari', 'lamborghini']
