In [1]:
import json
import os
from urllib.parse import urljoin, urlparse

import pandas as pd
import requests
from lxml import etree

In [2]:
# URLs that send_request has already returned content for during this session.
PROCESSED_URLS = []


def send_request(url, save_to_file=True, timeout=30):
    """Return the HTML for ``url``, preferring a cached copy on disk.

    The page is cached under ``kite_packaging_data/<last-path-segment>.html``
    (``root.html`` when the URL has no path). If that file exists it is read
    and no HTTP request is made.

    Args:
        url: Page URL to fetch.
        save_to_file: When True, write a freshly downloaded page to the cache.
        timeout: Seconds passed to ``requests.get`` so a stalled server cannot
            hang the notebook indefinitely.

    Returns:
        The page HTML as a string, or ``None`` if ``url`` was already
        processed earlier in this session.

    Raises:
        ValueError: If the server answers with a status code other than 200.
        requests.RequestException: Network errors and timeouts propagate.
    """
    if url in PROCESSED_URLS:
        print(f"URL {url} has already been processed.")
        return None
    print(f"Fetching data from {url}")

    # Derive the cache name from the URL *path* so that URLs with and without
    # a trailing slash map to the same file.
    # NOTE: pages whose URLs end in the same segment share one cache file.
    url_path = urlparse(url).path.strip("/")
    stem = url_path.split("/")[-1] if url_path else "root"
    output_dir = "kite_packaging_data"
    os.makedirs(output_dir, exist_ok=True)
    filename = os.path.join(output_dir, stem + ".html")

    try:
        with open(filename, "r", encoding="utf-8") as file:
            raw_html = file.read()
    except FileNotFoundError:
        print(f"No cached data found for {filename}, making a request...")
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            raise ValueError(
                f"Failed to fetch data from {url}. Status code: {response.status_code}"
            )
        raw_html = response.text

        if save_to_file:
            # Caching is best-effort: report a failed write, but still return
            # the page that was downloaded.
            try:
                with open(filename, "w", encoding="utf-8") as file:
                    file.write(raw_html)
                print(f"Data saved to {filename}")
            except OSError as exc:
                print(f"Could not save data to {filename}: {exc}")
    else:
        print(f"Using cached data from {filename}")

    # Record the URL only once content was obtained, so a failed fetch can be
    # retried instead of being reported as "already processed".
    PROCESSED_URLS.append(url)
    return raw_html


def fetch_categories(raw_html, root_url, xpath):
    """Extract links from ``raw_html`` and return them as absolute URLs.

    Args:
        raw_html: HTML document text to parse.
        root_url: Base URL that the extracted hrefs are resolved against.
        xpath: XPath expression selecting the href values to collect.

    Returns:
        A list with one absolute URL per XPath match, in document order.
    """
    tree = etree.fromstring(raw_html, etree.HTMLParser())
    # urljoin resolves root-relative hrefs ("/scp/boxes/") exactly like plain
    # concatenation did, and additionally copes with absolute hrefs and hrefs
    # lacking a leading slash, which concatenation turned into broken URLs.
    return [urljoin(root_url, href) for href in tree.xpath(xpath)]

In [3]:
def _large_image_locations(groups):
    """Return ``location`` of every item whose ``typeAnnotation`` is ``"L"``."""
    return [
        item.get("location")
        for group in groups or []
        for item in group.get("items") or []
        if item.get("typeAnnotation") == "L"
    ]


def _price_breaks(breaks):
    """Map each quantity break to its price, keyed as ``"<min>-<max> Packs"``."""
    return {
        f"{item.get('min')}-{item.get('max')} Packs": item.get("price")
        for item in breaks or []
    }


def parse_product_variants(tree):
    """Build one DataFrame row per product variant found on a product page.

    Page-level fields (breadcrumb categories, carousel image URLs, bullet text
    and the "about" text) are read from the HTML and repeated on every row.
    Variant-level fields come from the JSON object embedded in the
    ``<script id="ang-data-source">`` tag; only the first entry of its
    ``productTables`` list is used.

    Args:
        tree: Parsed lxml HTML tree of the product page.

    Returns:
        pandas.DataFrame with one row per product. Each specification becomes
        its own column, named after the specification's description.

    Raises:
        ValueError: If the script tag is missing, its JSON cannot be decoded
            (``json.JSONDecodeError`` is a ``ValueError``), or it contains no
            product tables.
    """
    parser = etree.HTMLParser()

    # breadcrumb trail, e.g. "A > B > C"
    categories = " > ".join(tree.xpath('//ul[@class="breadcrumb-menu"]/li/*/text()'))

    # carousel images: prefer the large source, fall back to data-src
    image_urls = [
        img.get("data-large-src") or img.get("data-src")
        for img in tree.xpath('//*[@id="P-Carousel"]//img')
    ]

    custom_bullet_text = "\n".join(
        text
        for text in tree.xpath('//div[@class="custom-bullet"]//text()')
        if text.strip()
    )

    about_section = "\n".join(
        text.replace("\xa0", "")
        for text in tree.xpath(
            '//div[@id="details-tab"]/div[@class="catpagemaintext"]//text()'
        )
        if text.strip()
    )

    base_data = {
        "categories": categories,
        "image_urls": image_urls,
        "custom_bullet_text": custom_bullet_text,
        "about_section": about_section,
    }

    script_tag = tree.xpath('//script[@id="ang-data-source"]')
    if not script_tag:
        raise ValueError("No script tag with id 'ang-data-source' found in the HTML.")
    # Keep only the outermost {...} of the script body; an empty script tag
    # has text None, which is treated as an empty string.
    script_text = script_tag[0].text or ""
    payload = "{" + script_text.split("{", 1)[-1].rsplit("}", 1)[0] + "}"
    product_data = json.loads(payload)

    # Raise ValueError (which the crawl loop handles) rather than letting a
    # missing or empty table list surface as TypeError/IndexError.
    product_tables = product_data.get("productTables") or []
    if not product_tables:
        raise ValueError("No product tables found in the 'ang-data-source' data.")
    products = product_tables[0].get("products") or []

    master_list = []
    for prod in products:
        # notes arrive as an HTML fragment; reduce them to their text lines
        notes = ""
        notes_html = prod.get("notes", "")
        if notes_html and notes_html.strip():
            notes_tree = etree.fromstring(notes_html, parser)
            if notes_tree is not None:
                notes = "\n".join(
                    text.strip()
                    for text in notes_tree.xpath("//text()")
                    if text.strip()
                )

        # ``or {}`` / ``or []`` below cover keys that are present but null
        internal_dimensions = " x ".join(
            map(str, (prod.get("internalDimensions") or {}).values())
        )

        # specifications become columns; the literal string "None" marks an
        # unset value and is dropped
        specs_dict = {
            item.get("description"): item.get("value")
            for item in prod.get("specifications") or []
            if item.get("value") != "None"
        }

        master_list.append(
            {
                **base_data,
                **specs_dict,
                "notes": notes,
                "modal_images": _large_image_locations(prod.get("modalImages", [])),
                "internal_dimensions": internal_dimensions,
                "videos": prod.get("videos", []),
                "images": _large_image_locations(prod.get("images", [])),
                "description": prod.get("description", ""),
                "basket_description": prod.get("basketDescription", ""),
                "unit_per_pallet": prod.get("palletSize"),
                "unit_per_pack": prod.get("packSize"),
                "price_per_pack": _price_breaks(prod.get("packBreaks")),
                # NOTE(review): pallet break keys are also labelled "Packs",
                # as in the original code — confirm whether that is intended.
                "price_per_pallet": _price_breaks(prod.get("palletBreaks")),
            }
        )
    return pd.DataFrame(master_list)

In [4]:
# Read the top-level category links from the landing page, then visit each
# category page and gather the sub-category links it lists.
root_url = "https://www.kitepackaging.co.uk"
raw_html = send_request(root_url)
categories = fetch_categories(
    raw_html,
    root_url,
    xpath="//div[@class='list-group']/a/@href",
)

master_sub_cat = []
for cat_url in categories:
    print(f"Processing category: {cat_url}")
    raw_html = send_request(cat_url)
    if not raw_html:
        # send_request returned nothing for this URL; there is no page to parse
        continue
    sub_cat_urls = fetch_categories(
        raw_html,
        root_url,
        xpath="//a[@class='list-group-item dropme-left ']/@href",
    )
    master_sub_cat += sub_cat_urls

Fetching data from https://www.kitepackaging.co.uk
Using cached data from kite_packaging_data/root.html
Processing category: https://www.kitepackaging.co.uk/scp/boxes/cardboard-boxes/
Fetching data from https://www.kitepackaging.co.uk/scp/boxes/cardboard-boxes/
Using cached data from kite_packaging_data/cardboard-boxes.html
Processing category: https://www.kitepackaging.co.uk/mcp/pallet-wrap/
Fetching data from https://www.kitepackaging.co.uk/mcp/pallet-wrap/
Using cached data from kite_packaging_data/pallet-wrap.html
Processing category: https://www.kitepackaging.co.uk/mcp/tapes/
Fetching data from https://www.kitepackaging.co.uk/mcp/tapes/
Using cached data from kite_packaging_data/tapes.html
Processing category: https://www.kitepackaging.co.uk/mcp/bubble-wrap/
Fetching data from https://www.kitepackaging.co.uk/mcp/bubble-wrap/
Using cached data from kite_packaging_data/bubble-wrap.html
Processing category: https://www.kitepackaging.co.uk/scp/protection-and-wrapping/voidfill-and-loos

In [None]:
# Walk the sub-category queue. A page that lists further tiles has those links
# appended to the queue (so the list grows while it is being walked); any
# other page is parsed as a product page.
parser = etree.HTMLParser()
# Per-page frames are collected here and concatenated once after the loop,
# instead of re-copying the growing result on every iteration.
prod_frames = []
i = 0
while i < len(master_sub_cat):
    url = master_sub_cat[i]
    print(f"Processing sub category {i + 1}/{len(master_sub_cat)}: {url}")
    i += 1  # advance up front so every `continue` below moves to the next URL

    if "custom" in url:
        print(f"Skipping custom: {url}")
        continue

    sub_cat_raw_html = send_request(url)
    if not sub_cat_raw_html:
        print(f"Failed to fetch data for {url}, skipping...")
        continue

    # create etree object to use xpath
    tree = etree.fromstring(sub_cat_raw_html, parser)
    sub_sub_urls = tree.xpath(
        '//div[@class="product-group-container"]/div[@class="tile-container"]/a/@href'
    )

    if sub_sub_urls:
        # queue each sub-sub category for a later iteration
        master_sub_cat.extend([root_url + href for href in sub_sub_urls])
        continue

    try:
        prod_df = parse_product_variants(tree)
    except ValueError as e:
        print(f"Error processing {url}: {e}")
        continue
    prod_df["url"] = url
    prod_frames.append(prod_df)

# pd.concat raises on an empty list, so fall back to an empty frame
master_prod_df = (
    pd.concat(prod_frames, ignore_index=True) if prod_frames else pd.DataFrame()
)

Processing sub category 1/218: https://www.kitepackaging.co.uk/scp/boxes/standard-cardboard-boxes/
Fetching data from https://www.kitepackaging.co.uk/scp/boxes/standard-cardboard-boxes/
Using cached data from kite_packaging_data/standard-cardboard-boxes.html
Processing sub category 2/218: https://www.kitepackaging.co.uk/scp/heavy-duty-cardboard-boxes/double-wall-boxes/
Fetching data from https://www.kitepackaging.co.uk/scp/heavy-duty-cardboard-boxes/double-wall-boxes/
Using cached data from kite_packaging_data/double-wall-boxes.html
Processing sub category 3/218: https://www.kitepackaging.co.uk/scp/heavy-duty-cardboard-boxes/enviro-box/
Fetching data from https://www.kitepackaging.co.uk/scp/heavy-duty-cardboard-boxes/enviro-box/
Using cached data from kite_packaging_data/enviro-box.html
Processing sub category 4/218: https://www.kitepackaging.co.uk/scp/heavy-duty-cardboard-boxes/extra-heavy-duty-boxes/
Fetching data from https://www.kitepackaging.co.uk/scp/heavy-duty-cardboard-boxes/ex

In [6]:
# Number of URLs recorded in PROCESSED_URLS by send_request so far.
len(PROCESSED_URLS)

239

In [7]:
# (rows, columns) of the combined product table built by the crawl loop.
master_prod_df.shape

(1244, 129)

In [9]:
# Write the combined product table to an Excel workbook, omitting the index.
master_prod_df.to_excel("kite_packaging_data.xlsx", index=False)

In [None]:
# NOTE(review): `specs_dict` is only assigned inside parse_product_variants;
# no cell shown here defines it at notebook scope, so this cell presumably
# raises NameError on a fresh kernel. Looks like leftover debugging —
# consider removing.
specs_dict

{'Box type': 'Standard',
 'Closure': 'Standard Box (seal with tape)',
 'Colour': 'Brown',
 'Material': 'Cardboard',
 'PackagingRegsMaterial': 'Paper',
 'Strength': 'Double Wall'}