Web scraper for fitment information:

In [2]:
import requests
def fetch_proxies(timeout=10):
    """Download a list of HTTP proxies from the ProxyScrape API.

    Args:
        timeout: Seconds to wait for the API before giving up.

    Returns:
        A list of ``requests``-style proxy mappings
        (``{"http": "http://host:port", "https": "http://host:port"}``), one
        per non-blank line of the response. An empty list is returned when
        the request fails; the error is printed, not raised.
    """
    api_url = "https://api.proxyscrape.com/v2/?request=displayproxies&protocol=http&timeout=10000&country=all&ssl=all&anonymity=all"
    try:
        # An explicit timeout keeps a stalled API from hanging the cell forever.
        response = requests.get(api_url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching proxies: {e}")
        return []

    proxies = []
    for line in response.text.splitlines():
        proxy = line.strip()
        if not proxy:
            # Blank lines would otherwise become the bogus entry "http://".
            continue
        proxies.append({"http": f"http://{proxy}", "https": f"http://{proxy}"})
    return proxies
    
# Proxy list is fetched once, at the moment this cell runs.
# NOTE(review): PROXIES is not referenced by any later cell visible in this
# notebook -- confirm whether it is still needed.
PROXIES  = fetch_proxies()


[{'http': 'http://46.47.197.210:3128', 'https': 'http://46.47.197.210:3128'}, {'http': 'http://65.109.220.163:80', 'https': 'http://65.109.220.163:80'}, {'http': 'http://45.92.177.60:8080', 'https': 'http://45.92.177.60:8080'}, {'http': 'http://152.26.229.66:9443', 'https': 'http://152.26.229.66:9443'}, {'http': 'http://185.17.146.18:8080', 'https': 'http://185.17.146.18:8080'}, {'http': 'http://51.79.170.92:80', 'https': 'http://51.79.170.92:80'}, {'http': 'http://43.134.229.98:3128', 'https': 'http://43.134.229.98:3128'}, {'http': 'http://89.35.237.187:8888', 'https': 'http://89.35.237.187:8888'}, {'http': 'http://103.49.202.250:80', 'https': 'http://103.49.202.250:80'}, {'http': 'http://49.7.11.187:80', 'https': 'http://49.7.11.187:80'}, {'http': 'http://154.203.132.49:8090', 'https': 'http://154.203.132.49:8090'}, {'http': 'http://47.91.104.88:3128', 'https': 'http://47.91.104.88:3128'}, {'http': 'http://72.10.160.173:5675', 'https': 'http://72.10.160.173:5675'}, {'http': 'http://6

In [6]:
import re
from urllib.parse import urlparse, urljoin
from datetime import datetime
import requests
from bs4 import BeautifulSoup
import random
import time
import pandas as pd

# Makes to crawl; get_make_urls() builds one catalog URL per entry.
TARGET_MAKES = ['toyota']

# Pool of browser User-Agent strings that get_soup() draws from at random.
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0'
]

# Fetch a URL and parse it, drawing a fresh random User-Agent for every attempt
def get_soup(url, retries=10, delay=15, max_delay=60):
    """Fetch ``url`` with retries and return it as a BeautifulSoup document.

    Args:
        url: Page to download.
        retries: Maximum number of attempts.
        delay: Base back-off in seconds; doubled after each failed attempt.
        max_delay: Upper bound in seconds for a single back-off sleep.

    Returns:
        The parsed page (``html.parser``), or ``None`` when every attempt
        raised a ``requests.RequestException`` (including HTTP error statuses).

    Note:
        After every attempt, successful or not, the ``finally`` clause sleeps
        an additional random 10-30 seconds on top of any back-off sleep.
    """
    for attempt in range(retries):
        # Choose the User-Agent inside the loop: picking it once before the
        # loop meant every retry re-sent the header that had just failed.
        headers = {
            'User-Agent': random.choice(USER_AGENTS)
        }
        print(f"Fetching URL: {url} with User-Agent: {headers['User-Agent']}")
        try:
            response = requests.get(url, headers=headers, timeout=20)
            response.raise_for_status()
            print("Successfully fetched the page.")
            return BeautifulSoup(response.text, 'html.parser')
        except requests.RequestException as e:
            print(f"Error fetching {url}: {e}")
            if attempt < retries - 1:
                # Exponential back-off, capped at max_delay.
                wait_time = min(delay * (2 ** attempt), max_delay)
                print(f"Retrying in {wait_time} seconds...")
                time.sleep(wait_time)
            else:
                print("Max retries reached. Skipping this URL.")
        finally:
            # Conservative delay to prevent rate limiting
            time.sleep(random.uniform(10, 30))

    return None

def get_make_urls(base_url="https://www.rockauto.com/en/catalog/"):
    """Return one catalog URL per entry of TARGET_MAKES.

    Each make is lower-cased and its spaces are replaced by '+' before being
    appended to ``base_url``.
    """
    urls = []
    for make in TARGET_MAKES:
        slug = make.lower().replace(' ', '+')
        urls.append(f"{base_url}{slug}")
    return urls

def get_year_urls(make_url):
    """Return ``"<make_url>,<year>"`` for every year from the current one down to 2000."""
    oldest_year = 2000
    year = datetime.now().year
    urls = []
    # Walk backwards so the newest model year comes first.
    while year >= oldest_year:
        urls.append(f"{make_url},{year}")
        year -= 1
    return urls

def get_model_urls(year_url):
    """Collect model page URLs linked from a make/year page.

    A ``navlabellink`` anchor is kept when its href path splits into exactly
    three comma-separated segments and the third segment does not occur in the
    path of ``year_url``. Returns absolute URLs, or ``[]`` if the page could
    not be fetched.
    """
    page = get_soup(year_url)
    if not page:
        return []

    year_path = urlparse(year_url).path

    def _is_model_link(href):
        # True when the href has the three-segment shape and a new third segment.
        segments = urlparse(href).path.split(',')
        return len(segments) == 3 and segments[2] not in year_path

    hrefs = (anchor.get('href') for anchor in page.find_all("a", class_="navlabellink"))
    found = [urljoin(year_url, href) for href in hrefs if href and _is_model_link(href)]

    print(f"Found {len(found)} model URLs for make: {year_url}")
    return found

def get_engine_urls(model_url):
    """Collect engine links from a model page.

    Looks at the ``navlabellink`` anchor inside each ``td.nlabel`` cell and
    keeps it when its href path has more than three comma-separated segments
    and the fourth segment does not occur in the path of ``model_url``.

    Returns:
        A list of ``{'name': <link text>, 'url': <absolute URL>}`` dicts, or
        ``[]`` if the page could not be fetched.
    """
    page = get_soup(model_url)
    if not page:
        print("Failed to retrieve or parse the page.")
        return []

    model_path = urlparse(model_url).path
    engines = []

    for cell in page.find_all('td', class_='nlabel'):
        anchor = cell.find('a', class_='navlabellink')
        if not anchor:
            continue

        href = anchor.get('href', '')
        segments = urlparse(href).path.split(',')
        if len(segments) <= 3 or segments[3] in model_path:
            continue

        engines.append({'name': anchor.text.strip(), 'url': urljoin(model_url, href)})

    return engines

def get_category_urls(engine_url):
    """Collect part-category links from an engine page.

    Args:
        engine_url: Dict with a ``'url'`` key, as produced by get_engine_urls().

    Returns:
        ``{'name', 'url'}`` dicts for every ``navlabellink`` anchor whose href
        path has exactly six comma-separated segments and whose sixth segment
        does not occur in the engine page's path; ``[]`` on fetch failure.
    """
    page_url = engine_url['url']
    page = get_soup(page_url)
    if not page:
        print("Failed to retrieve or parse the page.")
        return []

    engine_path = urlparse(page_url).path

    def _category_anchors():
        # Yield (anchor, href) pairs that match the six-segment category shape.
        for anchor in page.find_all('a', class_='navlabellink'):
            href = anchor.get('href', '')
            segments = urlparse(href).path.split(',')
            if len(segments) == 6 and segments[5] not in engine_path:
                yield anchor, href

    return [
        {'name': anchor.text.strip(), 'url': urljoin(page_url, href)}
        for anchor, href in _category_anchors()
    ]

def get_subcategory_urls(category_url):
    """Collect part-type (subcategory) links from a category page.

    Args:
        category_url: Dict with a ``'url'`` key, as produced by get_category_urls().

    Returns:
        ``{'name', 'url'}`` dicts, where ``name`` is the stripped text of the
        whole ``td.nlabel`` cell, for every cell whose ``navlabellink`` anchor
        has an href path of exactly eight comma-separated segments whose
        eighth segment does not occur in the category page's path; ``[]`` on
        fetch failure.
    """
    page_url = category_url['url']
    page = get_soup(page_url)
    if not page:
        print("Failed to retrieve or parse the page.")
        return []

    category_path = urlparse(page_url).path
    found = []

    for cell in page.find_all('td', class_='nlabel'):
        anchor = cell.find('a', class_='navlabellink')
        if not anchor:
            continue

        href = anchor.get('href', '')
        segments = urlparse(href).path.split(',')
        if len(segments) != 8 or segments[7] in category_path:
            continue

        # The label comes from the table cell, not just the anchor.
        found.append({'name': cell.text.strip(), 'url': urljoin(page_url, href)})

    return found


def extract_info_from_url(url):
    """Derive fitment fields from the comma-separated path of a catalog URL.

    The path is split on ',' and read positionally: make (last '/'-separated
    component of the first segment), year, model, engine, then the category
    at index 5 and subcategory at index 6. Index 4 is skipped. In category and
    subcategory, '+' becomes a space and '&' becomes 'and'.

    Args:
        url: Catalog URL such as
            ``https://www.rockauto.com/en/catalog/toyota,2013,4runner``.

    Returns:
        A dict containing ``'make'`` plus whichever of ``'year'``, ``'model'``,
        ``'engine'``, ``'category'``, ``'subcategory'`` the URL is long enough
        to provide.
    """
    parts = urlparse(url).path.split(',')

    # Take the last path component rather than a fixed index, so a catalog
    # prefix of a different depth no longer raises IndexError.
    info = {'make': parts[0].rstrip('/').split('/')[-1]}

    if len(parts) > 1:
        info['year'] = parts[1]
    if len(parts) > 2:
        info['model'] = parts[2]
    if len(parts) > 3:
        info['engine'] = parts[3]
    if len(parts) > 5:
        info['category'] = parts[5].replace('+', ' ').replace('&', 'and')
    if len(parts) > 6:
        info['subcategory'] = parts[6].replace('+', ' ').replace('&', 'and')
    return info

def scrape_fitment_data(url_dict):
    """Scrape the part listings of one subcategory page.

    Args:
        url_dict: Dict with a ``'url'`` key pointing at the listing page.

    Returns:
        One dict per ``<tbody>`` whose id matches ``listingcontainer[<n>]``.
        Each dict starts from the fields extract_info_from_url() derives from
        the URL and gains ``'part_number'`` and/or ``'description'`` when the
        corresponding span is present. ``[]`` if the page could not be fetched.
    """
    page_url = url_dict['url']
    print(f"Scraping URL: {page_url}")

    # Fields shared by every part on this page.
    shared_fields = extract_info_from_url(page_url)

    page = get_soup(page_url)
    if not page:
        return []

    listing_id = re.compile(r"listingcontainer\[\d+\]")
    rows = []

    for listing in page.find_all("tbody", id=listing_id):
        number_tag = listing.find("span", class_="listing-final-partnumber as-link-if-js")
        footnote_tag = listing.find("span", class_="listing-footnote-text")

        row = dict(shared_fields)
        if number_tag:
            row['part_number'] = number_tag.text.strip()
        if footnote_tag:
            row['description'] = footnote_tag.text.strip()

        # A listing is recorded even when neither span was found.
        rows.append(row)

    return rows

def gather_and_scrape_urls():
    """Crawl make -> year -> model -> engine -> category -> subcategory and scrape every leaf page.

    Returns:
        A single DataFrame concatenating the parts of every subcategory page
        that yielded at least one row, or an empty DataFrame if none did.
    """
    def _subcategory_pages():
        # Lazily walk the catalog tree in the same depth-first order as a nested loop.
        for make_url in get_make_urls():
            for year_url in get_year_urls(make_url):
                for model_url in get_model_urls(year_url):
                    for engine in get_engine_urls(model_url):
                        for category in get_category_urls(engine):
                            yield from get_subcategory_urls(category)

    frames = []
    for leaf in _subcategory_pages():
        rows = scrape_fitment_data(leaf)
        if rows:
            frames.append(pd.DataFrame(rows))

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

# Run the full crawl, then write everything to one CSV.
# NOTE(review): nothing is written until the crawl finishes, so an interrupted
# run (see the KeyboardInterrupt in this cell's output) saves no data. The
# output path is also an absolute, machine-specific location.
scraped_df = gather_and_scrape_urls()
scraped_df.to_csv("/Users/skylerwilson/Desktop/PartsWise/Data/fitment_data/rockauto_fitment_data.csv", index=False)


Fetching URL: https://www.rockauto.com/en/catalog/toyota,2024 with User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0
Successfully fetched the page.
Found 28 model URLs for make: https://www.rockauto.com/en/catalog/toyota,2024
Fetching URL: https://www.rockauto.com/en/catalog/toyota,2024,4runner with User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0
Successfully fetched the page.
Fetching URL: https://www.rockauto.com/en/catalog/toyota,2024,4runner,4.0l+v6,3455446 with User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0
Successfully fetched the page.
Fetching URL: https://www.rockauto.com/en/catalog/toyota,2024,4runner,4.0l+v6,3455446,body+&+lamp+assembly with User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15
Successfully fetched the page.
Scraping URL: https://www.rockauto.com/en/catalo

KeyboardInterrupt: 

In [None]:
# Expanded User-Agent pool with desktop and mobile browser strings.
# NOTE(review): this cell has no execution count (In [None]), and the async
# scraper cell that follows takes its User-Agent from fake_useragent
# (ua.random), so this list appears to be unused there -- confirm.
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.128 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:87.0) Gecko/20100101 Firefox/87.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7; rv:89.0) Gecko/20100101 Firefox/89.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6; rv:88.0) Gecko/20100101 Firefox/88.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:87.0) Gecko/20100101 Firefox/87.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.2 Safari/605.1.15',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Edg/91.0.864.64',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36 Edg/90.0.818.66',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.128 Safari/537.36 Edg/89.0.774.77',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 OPR/77.0.4054.172',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36 OPR/76.0.4017.177',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 OPR/77.0.4054.172',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36 OPR/76.0.4017.177',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1',
    'Mozilla/5.0 (iPad; CPU OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1',
    'Mozilla/5.0 (Linux; Android 11; Pixel 4a) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.120 Mobile Safari/537.36',
    'Mozilla/5.0 (Linux; Android 10; Pixel 4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Mobile Safari/537.36',
    'Mozilla/5.0 (Android 11; Mobile; rv:89.0) Gecko/89.0 Firefox/89.0',
    'Mozilla/5.0 (Android 10; Mobile; rv:88.0) Gecko/88.0 Firefox/88.0',
    'Mozilla/5.0 (Linux; Android 11; Pixel 4a) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.120 Mobile Safari/537.36 EdgA/46.3.4.5155',
    'Mozilla/5.0 (Linux; Android 10; Pixel 4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Mobile Safari/537.36 EdgA/46.3.3.5145',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0',
    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:88.0) Gecko/20100101 Firefox/88.0'
]

In [10]:
import re
from urllib.parse import urlparse, urljoin
from datetime import datetime
from bs4 import BeautifulSoup
import random
import pandas as pd
from tqdm import tqdm
import os
import aiohttp
import asyncio
import nest_asyncio
import subprocess
import logging
import time
from fake_useragent import UserAgent

# NOTE(review): presumably applied so the asyncio.run(main()) call at the end
# of this cell works while the notebook's own event loop is running -- confirm.
nest_asyncio.apply()
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Bounds concurrent fetches: get_soup() holds this semaphore for a URL's whole
# retry cycle. The same constant is also the year-batch size in
# gather_and_scrape_urls().
MAX_CONCURRENT_REQUESTS = 3
semaphore = asyncio.Semaphore(MAX_CONCURRENT_REQUESTS)

# Makes to crawl; get_make_urls() builds one catalog URL per entry.
TARGET_MAKES = ['toyota']

# fake_useragent source; get_soup() reads ua.random for its request headers.
ua = UserAgent()

# Country -> cities that rotate_nordvpn() chooses from at random.
NORDVPN_LOCATIONS = {
    'United States': [
        'Atlanta', 'Buffalo', 'Charlotte', 'Chicago', 'Dallas', 'Denver', 'Detroit', 'Kansas City', 'Los Angeles',
        'Manassas', 'Miami', 'New York', 'Phoenix', 'Saint Louis', 'Salt Lake City',
        'San Francisco', 'Seattle'
    ],
    'Canada': ['Montreal', 'Toronto', 'Vancouver'],
    # A missing comma previously fused two entries into 'ManchesterGlasgow'.
    'United Kingdom': ['London', 'Manchester', 'Glasgow', 'Edinburgh'],
    'Germany': ['Berlin', 'Frankfurt', 'Hamburg'],
    'Australia': ['Sydney', 'Perth', 'Melbourne', 'Brisbane', 'Adelaide'],
    'Netherlands': ['Amsterdam'],
    'Israel': ['Tel Aviv'],
    'France': ['Paris', 'Marseille'],
    'Spain': ['Madrid'],
    'Estonia': ['Tallinn'],
    'Ireland': ['Dublin'],
    'Iceland': ['Reykjavik'],
    'Finland': ['Helsinki'],
    'Norway': ['Oslo'],
    'New Zealand': ['Auckland'],
    'Denmark': ['Copenhagen'],
    'Sweden': ['Stockholm'],
    'Italy': ['Milan', 'Rome', 'Palermo'],
    'Japan': ['Osaka', 'Tokyo']
}

class RateLimiter:
    """Sliding-window limiter for coroutines.

    ``wait()`` returns only once fewer than ``rate_limit`` calls have been
    admitted during the last ``time_period`` seconds, then records the caller.

    Args:
        rate_limit: Maximum number of admissions per window (at least 1).
        time_period: Window length in seconds.

    Raises:
        ValueError: If ``rate_limit`` is smaller than 1.
    """

    def __init__(self, rate_limit, time_period=60):
        if rate_limit < 1:
            # With a limit of 0 the window can never have room and the oldest
            # timestamp lookup would fail on an empty list.
            raise ValueError("rate_limit must be at least 1")
        self.rate_limit = rate_limit
        self.time_period = time_period
        # Admission timestamps, oldest first.
        self.request_times = []

    async def wait(self):
        """Suspend the caller until the window has room, then record the call."""
        while True:
            now = time.time()
            # Drop timestamps that have aged out of the window.
            self.request_times = [t for t in self.request_times if now - t < self.time_period]

            if len(self.request_times) < self.rate_limit:
                break

            # Sleep until the oldest entry expires, then re-check. Other
            # waiters may have been admitted while this coroutine slept, so a
            # single sleep is not enough to guarantee a free slot.
            await asyncio.sleep(self.time_period - (now - self.request_times[0]))

        # No await between the check above and this append, so the admission
        # cannot interleave with another coroutine.
        self.request_times.append(time.time())

# Shared limiter awaited by get_soup() before each request; configured for
# 10 requests per 60 seconds.
rate_limiter = RateLimiter(rate_limit=10, time_period=60)  

def rotate_nordvpn():
    """Reconnect NordVPN to a random country/city from NORDVPN_LOCATIONS.

    Runs ``nordvpn disconnect`` followed by ``nordvpn connect``, sleeping 5 s
    after the disconnect and 10 s after the connect. The call blocks for the
    duration of both commands and both sleeps.

    Failures are logged and never raised, including the case where the
    ``nordvpn`` executable is not installed.
    """
    country = random.choice(list(NORDVPN_LOCATIONS.keys()))
    city = random.choice(NORDVPN_LOCATIONS[country])

    # Pass country and city as separate arguments instead of one
    # space-joined string.
    # NOTE(review): the NordVPN Linux CLI lists multi-word names with
    # underscores (e.g. United_States, New_York) -- confirm against the
    # installed client.
    connect_cmd = ['nordvpn', 'connect', country.replace(' ', '_'), city.replace(' ', '_')]

    try:
        subprocess.run(['nordvpn', 'disconnect'], check=True)
        time.sleep(5)  # Wait a bit after disconnecting
        subprocess.run(connect_cmd, check=True)
        logging.info(f"Connected to NordVPN server in {city}, {country}")
        time.sleep(10)  # Wait a bit after connecting to ensure the connection is established
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers FileNotFoundError when the nordvpn binary is missing,
        # which previously escaped and aborted the caller.
        logging.error(f"Failed to rotate NordVPN: {e}")

async def get_soup(session, url, retries=3, base_delay=5, max_delay=30):
    """Fetch ``url`` through ``session`` with retries and parse it.

    Concurrency is bounded by the module-level ``semaphore``, which is held
    for the whole retry cycle, and every attempt first awaits
    ``rate_limiter``. A failed attempt backs off exponentially with jitter
    (capped at ``max_delay``) and rotates the VPN before retrying.

    Args:
        session: An ``aiohttp.ClientSession``.
        url: Page to download.
        retries: Maximum number of attempts.
        base_delay: Base back-off in seconds, doubled per failed attempt.
        max_delay: Upper bound in seconds for a single back-off sleep.

    Returns:
        The page parsed with ``html.parser``, or ``None`` if every attempt failed.
    """
    request_timeout = aiohttp.ClientTimeout(total=30)

    async with semaphore:
        for attempt in range(retries):
            # Draw a fresh User-Agent for every attempt.
            headers = {
                'User-Agent': ua.random,
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Accept-Language': 'en-US,en;q=0.5',
                'Referer': 'https://www.rockauto.com/',
                'DNT': '1',
                'Connection': 'keep-alive',
                'Upgrade-Insecure-Requests': '1',
            }
            try:
                await rate_limiter.wait()

                async with session.get(url, headers=headers, timeout=request_timeout) as response:
                    response.raise_for_status()
                    content = await response.text()
                soup = BeautifulSoup(content, 'html.parser')
            except Exception as e:
                # repr() keeps the exception type visible: a timeout has an
                # empty str(), which made these log lines end in ": ."
                if attempt < retries - 1:
                    wait_time = min(base_delay * (2 ** attempt) + random.uniform(0, 5), max_delay)
                    logging.warning(f"Error fetching {url}: {e!r}. Retrying in {wait_time:.2f} seconds...")
                    await asyncio.sleep(wait_time)
                    # rotate_nordvpn() blocks (subprocess + time.sleep); run it
                    # in a worker thread so the event loop, and the other
                    # in-flight requests with it, is not frozen meanwhile.
                    await asyncio.get_running_loop().run_in_executor(None, rotate_nordvpn)
                else:
                    logging.error(f"Failed to fetch {url} after {retries} attempts: {e!r}")
            else:
                # Politeness delay after a success, taken once the response
                # context has been closed.
                await asyncio.sleep(random.uniform(2, 5))
                return soup

    return None

def get_make_urls(base_url="https://www.rockauto.com/en/catalog/", makes=None):
    """Build one catalog URL per make.

    Args:
        base_url: Catalog prefix the make slug is appended to.
        makes: Iterable of make names; defaults to the module-level
            ``TARGET_MAKES`` when omitted.

    Returns:
        A list of URLs in which each make is lower-cased and its spaces are
        replaced by '+'.
    """
    if makes is None:
        makes = TARGET_MAKES
    return [f"{base_url}{make.lower().replace(' ', '+')}" for make in makes]

def get_year_urls(make_url, start_year=2000, end_year=None):
    """Return ``"<make_url>,<year>"`` for each year, newest first.

    Args:
        make_url: Catalog URL of a make.
        start_year: Oldest year to include (inclusive).
        end_year: Newest year to include (inclusive); defaults to the current
            calendar year.

    Returns:
        URLs from ``end_year`` down to ``start_year``; empty when
        ``end_year < start_year``.
    """
    if end_year is None:
        end_year = datetime.now().year
    return [f"{make_url},{year}" for year in range(end_year, start_year - 1, -1)]

async def get_model_urls(session, year_url):
    """Collect model page URLs linked from a make/year page.

    Keeps every ``navlabellink`` anchor that has an href whose path splits
    into exactly three comma-separated segments, the third of which does not
    occur in the path of ``year_url``. Returns absolute URLs, or ``[]`` when
    the page could not be fetched.
    """
    page = await get_soup(session, year_url)
    if not page:
        return []

    year_path = urlparse(year_url).path
    found = []

    for anchor in page.find_all("a", class_="navlabellink"):
        href = anchor.get('href')
        if not href:
            continue
        segments = urlparse(href).path.split(',')
        if len(segments) == 3 and segments[2] not in year_path:
            found.append(urljoin(year_url, href))

    return found

async def get_engine_urls(session, model_url):
    """Collect engine links from a model page.

    Looks at the ``navlabellink`` anchor inside each ``td.nlabel`` cell and
    keeps it when its href path has more than three comma-separated segments
    and the fourth segment does not occur in the path of ``model_url``.

    Returns:
        A list of ``{'name': <link text>, 'url': <absolute URL>}`` dicts, or
        ``[]`` when the page could not be fetched.
    """
    soup = await get_soup(session, model_url)
    if not soup:
        return []

    base_path = urlparse(model_url).path
    engines = []

    for td in soup.find_all('td', class_='nlabel'):
        link = td.find('a', class_='navlabellink')
        if link is None:
            continue
        # .get() avoids a KeyError on an anchor without an href; the empty
        # default then fails the segment-count test below.
        href = link.get('href', '')
        segments = urlparse(href).path.split(',')
        if len(segments) > 3 and segments[3] not in base_path:
            engines.append({'name': link.text.strip(), 'url': urljoin(model_url, href)})

    return engines

async def get_category_urls(session, engine_url):
    """Collect part-category links from an engine page.

    Args:
        session: HTTP session forwarded to get_soup().
        engine_url: Dict with a ``'url'`` key, as produced by get_engine_urls().

    Returns:
        ``{'name', 'url'}`` dicts for every ``navlabellink`` anchor whose href
        path has exactly six comma-separated segments and whose sixth segment
        does not occur in the engine page's path; ``[]`` on fetch failure.
    """
    soup = await get_soup(session, engine_url['url'])
    if not soup:
        return []

    base_path = urlparse(engine_url['url']).path
    categories = []

    for link in soup.find_all('a', class_='navlabellink'):
        # .get() avoids a KeyError on an anchor without an href; the empty
        # default then fails the segment-count test below.
        href = link.get('href', '')
        segments = urlparse(href).path.split(',')
        if len(segments) == 6 and segments[5] not in base_path:
            categories.append({'name': link.text.strip(), 'url': urljoin(engine_url['url'], href)})

    return categories

async def get_subcategory_urls(session, category_url):
    """Collect part-type (subcategory) links from a category page.

    Args:
        session: HTTP session forwarded to get_soup().
        category_url: Dict with a ``'url'`` key, as produced by get_category_urls().

    Returns:
        ``{'name', 'url'}`` dicts, where ``name`` is the stripped text of the
        whole ``td.nlabel`` cell, for every cell whose ``navlabellink`` anchor
        has an href path of exactly eight comma-separated segments whose
        eighth segment does not occur in the category page's path; ``[]`` on
        fetch failure.
    """
    soup = await get_soup(session, category_url['url'])
    if not soup:
        return []

    base_path = urlparse(category_url['url']).path
    subcategories = []

    for td in soup.find_all('td', class_='nlabel'):
        link = td.find('a', class_='navlabellink')
        if link is None:
            continue
        # .get() avoids a KeyError on an anchor without an href; the empty
        # default then fails the segment-count test below.
        href = link.get('href', '')
        segments = urlparse(href).path.split(',')
        if len(segments) == 8 and segments[7] not in base_path:
            subcategories.append({'name': td.text.strip(), 'url': urljoin(category_url['url'], href)})

    return subcategories

def extract_info_from_url(url):
    """Derive fitment fields from the comma-separated path of a catalog URL.

    Positions: make (text after the last '/' of the first segment), year,
    model, engine, then category at index 5 and subcategory at index 6 (index
    4 is not used). Category and subcategory have '+' turned into spaces and
    '&' into 'and'. Only fields the URL is long enough to provide are returned.
    """
    segments = urlparse(url).path.split(',')
    count = len(segments)

    def _readable(text):
        # URL-style label -> plain words.
        return text.replace('+', ' ').replace('&', 'and')

    info = {'make': segments[0].split('/')[-1]}
    if count > 1:
        info['year'] = segments[1]
    if count > 2:
        info['model'] = segments[2]
    if count > 3:
        info['engine'] = segments[3]
    if count > 5:
        info['category'] = _readable(segments[5])
    if count > 6:
        info['subcategory'] = _readable(segments[6])
    return info

async def scrape_fitment_data(session, url_dict):
    """Scrape the part listings of one subcategory page.

    Args:
        session: HTTP session forwarded to get_soup().
        url_dict: Dict with a ``'url'`` key pointing at the listing page.

    Returns:
        One dict per ``<tbody>`` whose id matches ``listingcontainer[<n>]``.
        Each dict starts from the fields extract_info_from_url() derives from
        the URL and gains ``'part_number'`` and/or ``'description'`` when the
        corresponding span is present. ``[]`` if the page could not be fetched.
    """
    page_url = url_dict['url']
    shared_fields = extract_info_from_url(page_url)

    page = await get_soup(session, page_url)
    if not page:
        return []

    listing_id = re.compile(r"listingcontainer\[\d+\]")
    rows = []

    for listing in page.find_all("tbody", id=listing_id):
        row = dict(shared_fields)

        number_tag = listing.find("span", class_="listing-final-partnumber as-link-if-js")
        footnote_tag = listing.find("span", class_="listing-footnote-text")
        if number_tag:
            row['part_number'] = number_tag.text.strip()
        if footnote_tag:
            row['description'] = footnote_tag.text.strip()

        # A listing is recorded even when neither span was found.
        rows.append(row)

    return rows

async def scrape_make_year(session, make_url, year):
    """Scrape every part listing for one make and model year.

    Walks model -> engine -> category -> subcategory pages one request at a
    time and returns all scraped part rows as a single DataFrame (empty when
    nothing was found).
    """
    year_url = f"{make_url},{year}"

    async def _subcategory_pages():
        # Depth-first walk of the catalog below this year, awaiting each level in turn.
        for model_url in await get_model_urls(session, year_url):
            for engine in await get_engine_urls(session, model_url):
                for category in await get_category_urls(session, engine):
                    for leaf in await get_subcategory_urls(session, category):
                        yield leaf

    rows = []
    async for leaf in _subcategory_pages():
        parts = await scrape_fitment_data(session, leaf)
        if parts:
            rows.extend(parts)

    return pd.DataFrame(rows)

async def gather_and_scrape_urls():
    """Scrape every target make, year by year, in concurrent batches.

    For each make, the years from the current year down to 2000 are processed
    in batches of ``MAX_CONCURRENT_REQUESTS``. Each non-empty per-year result
    is written to its own intermediate CSV as soon as its batch completes.
    After every full-size batch the VPN is rotated and the crawl pauses for a
    random 30-60 seconds; a trailing partial batch gets no cooldown.

    Returns:
        All non-empty per-year frames concatenated, or an empty DataFrame.
    """
    output_dir = "/Users/skylerwilson/Desktop/PartsWise/Data/fitment_data"
    all_fitment_data = []

    def _save_batch(make_name, frames):
        # Persist each non-empty per-year frame and keep it for the final concat.
        for df in frames:
            if df.empty:
                continue
            all_fitment_data.append(df)
            year = df['year'].iloc[0]
            intermediate_file = f"{output_dir}/rockauto_fitment_data_{make_name}_{year}.csv"
            df.to_csv(intermediate_file, index=False)
            logging.info(f"Saved intermediate results for {make_name} {year}")

    async with aiohttp.ClientSession() as session:
        for make_url in get_make_urls():
            make_name = urlparse(make_url).path.split('/')[-1]
            years = list(range(datetime.now().year, 1999, -1))

            for start in range(0, len(years), MAX_CONCURRENT_REQUESTS):
                batch = years[start:start + MAX_CONCURRENT_REQUESTS]
                results = await asyncio.gather(
                    *(scrape_make_year(session, make_url, year) for year in batch)
                )
                _save_batch(make_name, results)

                if len(batch) == MAX_CONCURRENT_REQUESTS:
                    # rotate_nordvpn() blocks (subprocess + time.sleep), so it
                    # is run in a worker thread rather than on the event loop.
                    await asyncio.get_running_loop().run_in_executor(None, rotate_nordvpn)
                    await asyncio.sleep(random.uniform(30, 60))  # Cooldown between batches

    return pd.concat(all_fitment_data, ignore_index=True) if all_fitment_data else pd.DataFrame()

async def main():
    """Run the crawl, log its duration, and append the result to the complete CSV.

    If the output file already exists, its rows are read back and the newly
    scraped rows are concatenated after them before the file is rewritten.
    """
    output_file = "/Users/skylerwilson/Desktop/PartsWise/Data/fitment_data/rockauto_fitment_data_complete.csv"

    started_at = time.time()
    scraped_df = await gather_and_scrape_urls()
    finished_at = time.time()

    logging.info(f"Scraping completed in {finished_at - started_at:.2f} seconds")

    final_df = (
        pd.concat([pd.read_csv(output_file), scraped_df], ignore_index=True)
        if os.path.exists(output_file)
        else scraped_df
    )

    final_df.to_csv(output_file, index=False)
    logging.info(f"Saved complete dataset to {output_file}")

# Entry point: run the whole scrape to completion.
# NOTE(review): asyncio.run() is called here from a notebook cell; this
# presumably relies on the nest_asyncio.apply() call earlier in this cell to
# work inside an already-running event loop -- confirm.
if __name__ == "__main__":
    asyncio.run(main())






Scraping toyota:   0%|          | 0/25 [00:00<?, ?it/s]

Error fetching https://www.rockauto.com/en/catalog/toyota,2013,4runner,4.0l+v6,1504997,body+&+lamp+assembly,air+deflector,14179: . Retrying in 5.30 seconds...
Error fetching https://www.rockauto.com/en/catalog/toyota,2016,4runner,4.0l+v6,3353820,body+&+lamp+assembly,antenna+cable,48490: . Retrying in 8.38 seconds...
Error fetching https://www.rockauto.com/en/catalog/toyota,2003,4runner,4.0l+v6,1418239,body+&+lamp+assembly,antenna,1332: . Retrying in 7.09 seconds...
Error fetching https://www.rockauto.com/en/catalog/toyota,2020,4runner,4.0l+v6,3445369,body+&+lamp+assembly,antenna+cable,48490: . Retrying in 8.40 seconds...
Error fetching https://www.rockauto.com/en/catalog/toyota,2018,4runner,4.0l+v6,3441448,body+&+lamp+assembly,antenna+cable,48490: . Retrying in 6.25 seconds...
Error fetching https://www.rockauto.com/en/catalog/toyota,2013,4runner,4.0l+v6,1504997,body+&+lamp+assembly,air+deflector,14179: . Retrying in 10.69 seconds...
Error fetching https://www.rockauto.com/en/catalog/t

Scraping toyota:   0%|          | 0/25 [07:20<?, ?it/s]


KeyboardInterrupt: 

Failed to fetch https://www.rockauto.com/en/catalog/toyota,2013,4runner,4.0l+v6,1504997,body+&+lamp+assembly,air+deflector,14179 after 3 attempts: 
Failed to fetch https://www.rockauto.com/en/catalog/toyota,2003,4runner,4.0l+v6,1418239,body+&+lamp+assembly,antenna,1332 after 3 attempts: Connector is closed.
Failed to fetch https://www.rockauto.com/en/catalog/toyota,2018,4runner,4.0l+v6,3441448,body+&+lamp+assembly,antenna+cable,48490 after 3 attempts: Connector is closed.
Failed to fetch https://www.rockauto.com/en/catalog/toyota,2020,4runner,4.0l+v6,3445369,body+&+lamp+assembly,antenna+cable,48490 after 3 attempts: Connector is closed.
Failed to fetch https://www.rockauto.com/en/catalog/toyota,2016,4runner,4.0l+v6,3353820,body+&+lamp+assembly,antenna+cable,48490 after 3 attempts: Connector is closed.


In [None]:
# Candidate list of makes for a wider crawl.
# NOTE(review): the scraper functions in this notebook read TARGET_MAKES, not
# this lowercase name, so assigning it has no effect on them as written --
# confirm the intent.
target_makes = ['toyota', 'ford', 'chevrolet', 'honda', 'nissan', 'jeep', 'subaru', 'gmc', 
 'hyundai', 'ram', 'kia', 'volkswagen', 'mercedes-benz', 'bmw', 'lexus', 
 'audi', 'mazda', 'buick', 'chrysler', 'dodge', 'cadillac', 'volvo', 'lincoln', 
 'acura', 'tesla', 'infiniti', 'mitsubishi', 'porsche', 'land rover', 'jaguar', 
 'genesis', 'mini', 'fiat', 'maserati', 'alfa romeo', 'bentley', 'rolls-royce', 
 'aston martin', 'ferrari', 'lamborghini']

['toyota', 'ford', 'chevrolet', 'honda', 'nissan', 'jeep', 'subaru', 'gmc', 'hyundai', 'ram', 'kia', 'volkswagen', 'mercedes-benz', 'bmw', 'lexus', 'audi', 'mazda', 'buick', 'chrysler', 'dodge', 'cadillac', 'volvo', 'lincoln', 'acura', 'tesla', 'infiniti', 'mitsubishi', 'porsche', 'land rover', 'jaguar', 'genesis', 'mini', 'fiat', 'maserati', 'alfa romeo', 'bentley', 'rolls-royce', 'aston martin', 'ferrari', 'lamborghini']
