In [1]:
import csv
import logging
import time
from bs4 import BeautifulSoup
import cloudscraper

Define our params before jumping in


In [2]:
# Root of the storefront; site-relative paths used by the scraping helpers are appended to it.
base_url = 'https://locprecision.com'

# Emit DEBUG-level records so each HTTP request made while scraping appears in the cell output.
logging.basicConfig(
    level=logging.DEBUG,
)

### get HTML

This function uses cloudscraper to grab the HTML for a URL and returns the text content


In [3]:
def get_html_content(url):
    """Fetch a page from the store through cloudscraper and return its HTML text.

    Args:
        url: Either a site-relative path such as ``/collections/rocket-kits``
            (``base_url`` is prepended) or an absolute ``http(s)://`` URL
            (used unchanged).

    Returns:
        The response body as text.
    """
    # extract_products() stores absolute detail URLs in the product index, so
    # only prepend the site root for relative paths; unconditional
    # concatenation would produce "https://hosthttps://host/...".
    if url.startswith(('http://', 'https://')):
        full_url = url
    else:
        full_url = base_url + url
    scraper = cloudscraper.create_scraper(delay=10, browser='chrome')
    return scraper.get(full_url).text

## Extract product details from the page


In [4]:
def extract_products(html_content):
    """Parse a collection page and return one record per product tile.

    Args:
        html_content: HTML text of a collection listing page.

    Returns:
        A list of dicts with the keys ``handle``, ``id``, ``detail_url`` and
        ``image_url``. ``image_url`` is ``None`` when the tile has no image
        tag or the tag has no ``data-src`` attribute. Tiles without a detail
        link are skipped.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    products = []

    for product in soup.find_all('div', class_='grid-product'):
        product_handle = product.get('data-product-handle')
        product_id = product.get('data-product-id')

        # A tile without a detail link cannot be scraped further; skip it
        # instead of crashing on a missing tag or attribute.
        link_tag = product.find('a', class_='grid-product__link')
        detail_url = link_tag.get('href') if link_tag is not None else None
        if detail_url is None:
            logging.warning(
                'Skipping product %r: no detail link found', product_handle)
            continue
        full_detail_url = f'{base_url}{detail_url}'

        # The image tag may be absent, or present without a data-src template.
        image_tag = product.find('img', class_='grid__image-contain')
        image_template = image_tag.get('data-src') if image_tag is not None else None
        if image_template is not None:
            image_url = image_template.replace('{width}', '540')
        else:
            image_url = None

        products.append({
            'handle': product_handle,
            'id': product_id,
            'detail_url': full_detail_url,
            'image_url': image_url
        })

    return products

### Find each page

Move through each page in turn by following its "Next" link, extracting the products from each page


In [6]:
def get_all_products(url):
    """Collect the products from a collection page and every page after it.

    Starting at ``url``, each page is fetched and parsed, then the "Next"
    pagination link is followed until a page has no such link.

    Args:
        url: Site-relative path of the first collection page.

    Returns:
        A list of product dicts (as produced by ``extract_products``) in
        page order.
    """
    products = []
    visited = set()
    next_url = url

    # A loop keeps the call stack flat no matter how many pages exist, and
    # the visited set ends the crawl if pagination ever links back to a page
    # that was already fetched.
    while next_url is not None and next_url not in visited:
        visited.add(next_url)
        time.sleep(1)  # Pause before every request

        html_content = get_html_content(next_url)

        # Extract product details from the current page
        products += extract_products(html_content)

        # Find the link to the next page, if any
        soup = BeautifulSoup(html_content, 'html.parser')
        next_span = soup.find('span', class_='next')
        next_page = next_span.find('a', title='Next') if next_span is not None else None
        next_url = next_page.get('href') if next_page is not None else None

    return products

### Write a product list to CSV


In [7]:
def write_to_csv(products, filename='products.csv'):
    """Save product records to a UTF-8 CSV file with a header row.

    Args:
        products: Iterable of dicts keyed by ``handle``, ``id``,
            ``detail_url`` and ``image_url``.
        filename: Destination path; the file is opened in write mode, so
            existing content is replaced.
    """
    columns = ['handle', 'id', 'detail_url', 'image_url']
    with open(filename, mode='w', encoding='utf-8', newline='') as csv_file:
        csv_writer = csv.DictWriter(csv_file, fieldnames=columns)
        csv_writer.writeheader()
        csv_writer.writerows(products)

In [8]:
import json
from bs4 import Comment


def extract_json_ld(html_content):
    """Return the first parseable JSON-LD payload found inside an HTML comment.

    Every HTML comment that mentions ``application/ld+json`` is re-parsed as
    markup and searched for a ``<script type="application/ld+json">`` tag.

    Args:
        html_content: HTML text of a page.

    Returns:
        The decoded JSON value of the first usable script tag, or ``None``
        when no comment contains parseable JSON-LD.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    comments = soup.find_all(string=lambda text: isinstance(text, Comment))
    for comment in comments:
        if 'application/ld+json' not in comment:
            continue
        comment_soup = BeautifulSoup(comment, 'html.parser')
        json_ld_tag = comment_soup.find(
            'script', type='application/ld+json')
        if json_ld_tag is None:
            continue
        # .string is None for an empty tag (or one with several children);
        # json.loads would raise TypeError on it.
        raw_json = json_ld_tag.string
        if not raw_json:
            continue
        try:
            return json.loads(raw_json)
        except json.JSONDecodeError as exc:
            # A malformed payload should not abort the scrape; try any
            # remaining comments instead.
            logging.warning('Ignoring malformed JSON-LD block: %s', exc)
    return None

In [27]:
import json
import re


def _first_offer(offers):
    """Return the first offer dict from a JSON-LD ``offers`` value, or ``{}``."""
    if isinstance(offers, dict):
        return offers
    if isinstance(offers, list) and offers and isinstance(offers[0], dict):
        return offers[0]
    return {}


def _image_url(image):
    """Return the URL from a JSON-LD ``image`` value (dict, string or list), or ``None``."""
    if isinstance(image, list):
        image = image[0] if image else None
    if isinstance(image, dict):
        return image.get('url')
    if isinstance(image, str):
        return image
    return None


def extract_product_details(url):
    """Fetch one product page and collect its details into a flat dict.

    Data comes from two places: the JSON-LD block returned by
    ``extract_json_ld`` (name, image, SKU, price, currency, availability,
    URL) and the ``<p>`` tags of the product description.

    Args:
        url: Address of the product page, passed to ``get_html_content``.

    Returns:
        A dict pre-populated with the known keys (value ``None`` when not
        found, ``'Links'`` as a list of ``{'text', 'href'}`` dicts). Every
        ``Label: value`` line found in the description is stored under its
        label, so the dict may also contain keys that are not in the initial
        set.
    """
    html_content = get_html_content(url)
    soup = BeautifulSoup(html_content, 'html.parser')

    # Initialize a dictionary to hold the extracted details
    details = {
        'Complexity': None,
        'Diameter': None,
        'Height': None,
        'Weight': None,
        'Motor Mount': None,
        'Parachute Size': None,
        'Shock Cord Type': None,
        'Shock Cord Mount': None,
        'Fin Thickness': None,
        'Ring Thickness': None,
        'Instructions': None,
        'Decal': None,
        'Name': None,
        'Image URL': None,
        'SKU': None,
        'Price': None,
        'Currency': None,
        'Stock Status': None,
        'Product URL': None,
        'Description': None,
        'Links': []
    }

    # Extract JSON-LD data. Only a dict can be read by key; "offers" may be a
    # single object or a list and "image" a dict, string or list, so both go
    # through helpers instead of being indexed directly.
    json_ld_data = extract_json_ld(html_content)
    if isinstance(json_ld_data, dict):
        offer = _first_offer(json_ld_data.get('offers'))
        details.update({
            'Name': json_ld_data.get('name'),
            'Image URL': _image_url(json_ld_data.get('image')),
            'SKU': json_ld_data.get('sku'),
            'Price': offer.get('price'),
            'Currency': offer.get('priceCurrency'),
            'Stock Status': offer.get('availability'),
            'Product URL': json_ld_data.get('url')
        })

    # Extract other details from the description
    description_div = soup.find(
        'div', class_='product-single__description rte')
    if description_div is not None:
        # Up to three words, a colon, then the value, e.g. "Motor Mount: 38mm"
        pattern = re.compile(r'^(\w+(?:\s\w+){0,2}):\s*(.+)')
        for p_tag in description_div.find_all('p'):
            if not p_tag.find():
                # Paragraph without child tags: free-text description.
                # A later such paragraph overwrites an earlier one.
                details['Description'] = p_tag.get_text().strip()
            elif not pattern.search(p_tag.get_text()):
                # Paragraph with markup but no leading label: collect links
                for a in p_tag.find_all('a'):
                    # .get avoids a KeyError on anchors without an href
                    link_info = {'text': a.get_text(), 'href': a.get('href')}
                    details['Links'].append(link_info)
            else:
                # Labelled paragraph: child tags (e.g. <br>) separate lines
                lines = p_tag.get_text(separator='\n').split('\n')
                for line in lines:
                    match = pattern.search(line)
                    if match:
                        name, value = match.groups()
                        details[name] = value

    return details

### Generate the product index dataset


In [None]:

# Walk the rocket-kits collection, page after page, and gather every product tile
all_products = get_all_products(url='/collections/rocket-kits')

# Persist the gathered index so the detail scrape can read it back from disk
write_to_csv(products=all_products, filename='product_index.csv')

In [32]:
# Columns known in advance, in their established order. Labels discovered
# while scraping are appended after these.
fieldnames = [
    'Name', 'Image URL', 'Complexity', 'Diameter', 'Height', 'Weight',
    'Motor Mount', 'Parachute Size', 'Shock Cord Type', 'Shock Cord Mount',
    'Fin Thickness', 'Ring Thickness', 'Instructions', 'Decal','Launch Pad',
    'Price', 'Product URL', 'Currency', 'SKU', 'Stock Status', 'Description','Rail Buttons',
    'Links', 'Vinyl Decals', 'Tec features', 'Decals', 'Fire Blanket', 'Vinyl Decal','Parachute', 'Fin Array','Rocksim', 'Parachutes','Additional Decals'
]

# Scrape every product first. extract_product_details() stores each
# "Label: value" line under its label, so the full set of columns is only
# known once all pages have been read; writing rows as they arrive would
# raise ValueError on the first label missing from `fieldnames`.
all_details = []
# The index is written as UTF-8 by write_to_csv, so read it back as UTF-8
# instead of relying on the platform default encoding.
with open('product_index.csv', newline='', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        detail_url = row['detail_url']
        details = extract_product_details(detail_url)
        all_details.append(details)
        for key in details:
            if key not in fieldnames:
                fieldnames.append(key)

# Write everything in one pass; DictWriter leaves a cell empty when a row
# has no value for a column.
with open('product_details.csv', 'w', newline='', encoding='utf-8') as outputfile:
    writer = csv.DictWriter(outputfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(all_details)

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): locprecision.com:443


DEBUG:urllib3.connectionpool:https://locprecision.com:443 "GET /collections/rocket-kits/products/loc-iv HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): locprecision.com:443
DEBUG:urllib3.connectionpool:https://locprecision.com:443 "GET /collections/rocket-kits/products/loc-goblin HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): locprecision.com:443
DEBUG:urllib3.connectionpool:https://locprecision.com:443 "GET /collections/rocket-kits/products/yiris4 HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): locprecision.com:443
DEBUG:urllib3.connectionpool:https://locprecision.com:443 "GET /collections/rocket-kits/products/ezi-65 HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): locprecision.com:443
DEBUG:urllib3.connectionpool:https://locprecision.com:443 "GET /collections/rocket-kits/products/iris HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new H