In [None]:
%pip install beautifulsoup4
%pip install cloudscraper

In [None]:
import csv
import logging
import time
import pandas as pd
from bs4 import BeautifulSoup
import cloudscraper
from urllib.request import Request, urlopen
from urllib.error import HTTPError
from urllib.parse import urlparse

Define our params before jumping in


In [None]:
# Root of the Estes help center; the scraping functions prepend this to the
# site-relative paths they are given.
base_url = 'https://help.estesrockets.com'

# Configure the root logger to emit records at DEBUG level and above.
logging.basicConfig(level=logging.DEBUG)

## Main Kit links

This function gets the kit links for a page and looks for a _next page_ link, navigating to each page recursively.


In [None]:
def get_links(url):
    """Collect the kit links from a help-center section, following pagination.

    Starting at ``base_url + url``, each listing page is scanned for
    ``<a class="article-list-link">`` anchors, then the
    ``<a class="pagination-next-link">`` anchor (if any) is followed to the
    next page.

    Args:
        url: Site-relative path of the first listing page; it is appended to
            the module-level ``base_url``.

    Returns:
        A list of ``(href, link_text)`` tuples in the order the links appear,
        first page first.
    """
    # A single scraper is shared by every page of the crawl. The scraper is a
    # session-style object, so reusing it keeps cookies between requests
    # instead of starting from scratch on each page.
    scraper = cloudscraper.create_scraper(delay=10, browser='chrome')

    extracted_links = []
    visited = set()
    next_url = url

    # Iterate rather than recurse: no recursion-depth limit on long sections,
    # and the `visited` set stops the crawl if a "next" link ever points back
    # to a page that was already fetched.
    while next_url and next_url not in visited:
        visited.add(next_url)
        time.sleep(1)  # Delay before every request, including the first

        page_html = scraper.get(base_url + next_url).text
        soup = BeautifulSoup(page_html, "html.parser")

        for link in soup.find_all('a', class_='article-list-link'):
            href = link.get('href')
            if href:  # Skip anchors without a target instead of raising KeyError
                extracted_links.append((href, link.get_text()))

        next_page = soup.find('a', class_='pagination-next-link')
        next_url = next_page.get('href') if next_page is not None else None

    return extracted_links

## Kit Details


In [None]:
def get_kit_info(url):
    """Scrape one kit detail page and return its fields as a dict.

    Every lookup is None-safe: a page that lacks the article body, an image,
    a purchase link, the attributes table or the attachments section yields
    ``None`` (or an empty ``features`` dict) for that field instead of raising.

    Args:
        url: Site-relative path of the kit page; it is appended to the
            module-level ``base_url``.

    Returns:
        A dict with the keys:
            ``description``  - stripped text of the last non-empty direct
                               ``<p>`` child of the article body that contains
                               no nested tags, or ``None``.
            ``image_src``    - ``src`` of the first ``<img>`` in the article
                               body, or ``None``.
            ``features``     - ``{name: value}`` built from the ``th``/``td``
                               pairs of the product-attributes table.
            ``instructions`` - ``href`` of the first link inside the
                               ``article-attachments`` div, or ``None``.
            ``product_url``  - ``href`` of the link in the paragraph containing
                               the text "Purchase Link", or ``None``.
    """
    time.sleep(1)  # Adding a delay

    scraper = cloudscraper.create_scraper(delay=10, browser='chrome')
    page_html = scraper.get(base_url + url).text
    soup = BeautifulSoup(page_html, 'html.parser')

    image_src, description, product_url = None, None, None

    # Look the article body up once and tolerate its absence.
    article_body = soup.find('div', class_='article-body')
    if article_body is not None:
        image = article_body.find('img')
        if image is not None:
            image_src = image.get('src')

        # Only direct <p> children are inspected (recursive=False).
        for p_tag in article_body.find_all('p', recursive=False):
            if 'Purchase Link' in p_tag.text:
                anchor = p_tag.find('a')
                product_url = anchor.get('href') if anchor is not None else None
            elif not p_tag.find():
                # Paragraph with no nested tags: treat it as description text.
                # Empty paragraphs are ignored so they cannot overwrite a
                # description that was already found.
                text = p_tag.get_text().strip()
                if text:
                    description = text

    # Extract key features from the product attributes table
    features = {}
    table = soup.find(
        'table', class_='woocommerce-product-attributes shop_attributes')
    if table is not None:
        for row in table.find_all('tr'):
            header_cell = row.find('th')
            value_cell = row.find('td')
            if header_cell is None or value_cell is None:
                continue  # Row is not a name/value pair
            feature_name = header_cell.get_text().strip()
            if feature_name:
                features[feature_name] = value_cell.get_text().strip()

    # Extract the instructions link (first anchor in the attachments section)
    instructions_pdf = None
    attachments = soup.find('div', class_='article-attachments')
    if attachments is not None:
        attachment_link = attachments.find('a')
        if attachment_link is not None:
            instructions_pdf = attachment_link.get('href')

    return {
        "description": description,
        "image_src": image_src,
        "features": features,
        "instructions": instructions_pdf,
        "product_url": product_url
    }

## Dataset writer

Creates a csv file and writes the links data using the fields as column headings.


In [None]:
import re


def write_dataset(filename, links, fields):
    """Write scraped kit links to a CSV file, splitting model number from name.

    Args:
        filename: Path of the CSV file to create (overwritten if it exists).
        links: Iterable of ``(href, text)`` pairs.
        fields: Column names for the header. Each row is written with the keys
            ``URL``, ``Model`` and ``Name``.

    A link text that starts with 1-4 digits is split into that number (the
    model) and the remainder (the name); any other text gets the model
    ``'Unknown'`` and is used unchanged as the name.
    """
    model_pattern = re.compile(r"(\d{1,4})\s*-?\s*(.*)")

    def split_title(title):
        # Return the (model, name) pair for one link text.
        found = model_pattern.match(title)
        if found is None:
            return 'Unknown', title
        number, remainder = found.groups()
        # Drop any leading en dash / spaces left in front of the name.
        return number, remainder.lstrip("– ").strip()

    with open(filename, 'w', newline='', encoding='utf-8') as out:
        dataset = csv.DictWriter(out, fieldnames=fields)
        dataset.writeheader()  # Header row comes from `fields`
        for link_url, link_text in links:
            model, name = split_title(link_text)
            dataset.writerow({'URL': link_url, 'Model': model, 'Name': name})

## Kit details

### Fieldnames

Define the fields to use for our CSV columns


In [None]:
# Column order of the kit-detail CSV. process_csv passes this list to its
# csv.DictWriter, so every key written for a kit must appear here.
fieldnames = [
    # Page-level fields
    "url", "description", "image_src",
    # Product attributes (feature names taken from the attributes table)
    "Recommended Engines", "Projected Max Altitude", "Recovery System",
    "Length", "Diameter", "Estimated Weight", "Estimated Assembly Time",
    "Fin Materials", "Decal Type", "Launch System", "Launch Rod Size",
    # Attachment link
    "instructions",
    # Further attribute names; several look like spelling variants of the
    # ones above (presumably as they appear on different pages - verify).
    "Construction", "Wingspan", "Age Recommendation", "Launch Rod System",
    "Recovery", "Fin Material", "Estimated Assembly Weight",
    # Purchase link
    "product_url",
]

### process_csv

Reads URLs from an input CSV (1st column) and writes kit details to an output CSV

Features are sparsely populated


In [None]:
def process_csv(file_path, output_file_path):
    """Scrape kit details for every URL in a CSV and write them to another CSV.

    The first row of the input is treated as a header and skipped. For each
    remaining row, the first column is passed to ``get_kit_info`` and the
    result is written as one output row.

    The output columns come from the module-level ``fieldnames`` list, which is
    read when this function is called, so it must hold the kit-detail columns
    at that point. A scraped feature whose name is not in that list makes
    ``csv.DictWriter`` raise ``ValueError``.

    Args:
        file_path: Input CSV whose first column holds site-relative kit URLs.
        output_file_path: CSV file to create (overwritten if it exists).
    """
    with open(file_path, newline='', encoding='utf-8') as csvfile, \
            open(output_file_path, 'w', newline='', encoding='utf-8') as outfile:
        reader = csv.reader(csvfile)
        writer = csv.DictWriter(outfile, fieldnames=fieldnames)
        writer.writeheader()

        # Skip the header row; the default avoids StopIteration on an empty file
        next(reader, None)

        for row in reader:
            # csv.reader yields [] for blank lines; skip those and empty URLs
            if not row or not row[0].strip():
                continue

            url = row[0]
            kit_info = get_kit_info(url)

            # Features are spread in first so the fixed keys below win if a
            # feature happens to share one of their names.
            row_data = {
                'url': url,
                **kit_info['features'],
                'description': kit_info['description'],
                'image_src': kit_info['image_src'],
                'instructions': kit_info['instructions'],
                'product_url': kit_info['product_url'],
            }
            writer.writerow(row_data)

## Estes Production Kit Data

Retrieves the names, model numbers, and detail-page URLs of the kits currently in production.


In [None]:
# Scrape the "Currently Manufactured Rockets" section and save the link list.
#
# The link columns are kept in their own name rather than assigned to
# `fieldnames`: process_csv builds its DictWriter from the module-level
# `fieldnames` (the kit-detail columns), so rebinding it here makes the
# following process_csv call fail with ValueError.
link_fields = ['URL', 'Model', 'Name']
all_links = get_links(
    '/hc/en-us/sections/8356411218829-Currently-Manufactured-Rockets')
write_dataset('estes_kits.csv', all_links, link_fields)

### production kit details

Get the details for all of the production kits.


In [None]:
# Read the kit URLs from estes_kits.csv (first column) and write one detail
# row per kit; process_csv calls get_kit_info once for each input row.
process_csv('estes_kits.csv', 'Estes_kits_detail.csv')

## Estes Out of Production Kit Data

Gets the list of out-of-production (OOP) kits.


In [None]:
# Scrape the "Out of Production Kits" section and save the link list.
#
# The link columns are kept in their own name rather than assigned to
# `fieldnames`: process_csv builds its DictWriter from the module-level
# `fieldnames` (the kit-detail columns), so rebinding it here makes any later
# process_csv call fail with ValueError.
link_fields = ['URL', 'Model', 'Name']
all_links = get_links(
    '/hc/en-us/sections/8356482425613-Out-of-Production-Kits')
write_dataset('estes_kits_oop.csv', all_links, link_fields)

### Out of production kit details

The out-of-production kit pages contain nothing except a link to the instructions PDF; the detail scrape below is commented out.


In [None]:
# process_csv('estes_kits_oop.csv', 'Estes_kits_detail_oop.csv')

## Classroom/Bulk Kits


In [None]:
# Scrape the "Classroom/Bulk Kits" section and save the link list.
#
# The link columns are kept in their own name rather than assigned to
# `fieldnames`: process_csv builds its DictWriter from the module-level
# `fieldnames` (the kit-detail columns), so rebinding it here makes the
# following process_csv call fail with ValueError.
link_fields = ['URL', 'Model', 'Name']
all_links = get_links(
    '/hc/en-us/sections/9025204863373-Classroom-Bulk-Kits')
write_dataset('estes_kits_bulk.csv', all_links, link_fields)

### Bulk kit details


In [None]:
# Read the kit URLs from estes_kits_bulk.csv (first column) and write one
# detail row per kit; process_csv calls get_kit_info once for each input row.
process_csv('estes_kits_bulk.csv', 'Estes_kits_detail_bulk.csv')