In [2]:
import glob
# Directory that holds the product XML files
source_dir = "/workspace/datasets/product_data/products"
# Every *.xml file directly inside source_dir
files = glob.glob(f"{source_dir}/*.xml")
print("Number of files; {}".format(len(files)))
# Show the first few paths as the cell output
files[0:5]

Number of files; 256


['/workspace/datasets/product_data/products/products_0001_2570_to_430420.xml',
 '/workspace/datasets/product_data/products/products_0002_430439_to_518210.xml',
 '/workspace/datasets/product_data/products/products_0003_518229_to_606384.xml',
 '/workspace/datasets/product_data/products/products_0004_606428_to_722720.xml',
 '/workspace/datasets/product_data/products/products_0005_722800_to_846222.xml']

In [265]:
# Each pair below is (XPath expression evaluated on a <product> element, document field name).
# The pairs are flattened so that mappings_old keeps its alternating
# [xpath, field, xpath, field, ...] layout.
mappings_old = [
    entry
    for xpath_and_field in (
        ("productId/text()", "productId"),
        ("sku/text()", "sku"),
        ("name/text()", "name"),
        ("type/text()", "type"),
        ("startDate/text()", "startDate"),
        ("active/text()", "active"),
        ("regularPrice/text()", "regularPrice"),
        ("salePrice/text()", "salePrice"),
        ("artistName/text()", "artistName"),
        ("onSale/text()", "onSale"),
        ("digital/text()", "digital"),
        # The "*" step collects the text of every child element
        ("frequentlyPurchasedWith/*/text()", "frequentlyPurchasedWith"),
        ("accessories/*/text()", "accessories"),
        ("relatedProducts/*/text()", "relatedProducts"),
        ("crossSell/text()", "crossSell"),
        ("salesRankShortTerm/text()", "salesRankShortTerm"),
        ("salesRankMediumTerm/text()", "salesRankMediumTerm"),
        ("salesRankLongTerm/text()", "salesRankLongTerm"),
        ("bestSellingRank/text()", "bestSellingRank"),
        ("url/text()", "url"),
        # Category path: names and ids of every level, the id of the last
        # category element, and the number of levels
        ("categoryPath/*/name/text()", "categoryPath"),
        ("categoryPath/*/id/text()", "categoryPathIds"),
        ("categoryPath/category[last()]/id/text()", "categoryLeaf"),
        ("count(categoryPath/*/name)", "categoryPathCount"),
        ("customerReviewCount/text()", "customerReviewCount"),
        ("customerReviewAverage/text()", "customerReviewAverage"),
        ("inStoreAvailability/text()", "inStoreAvailability"),
        ("onlineAvailability/text()", "onlineAvailability"),
        ("releaseDate/text()", "releaseDate"),
        ("shippingCost/text()", "shippingCost"),
        ("shortDescription/text()", "shortDescription"),
        ("shortDescriptionHtml/text()", "shortDescriptionHtml"),
        ("class/text()", "class"),
        ("classId/text()", "classId"),
        ("subclass/text()", "subclass"),
        ("subclassId/text()", "subclassId"),
        ("department/text()", "department"),
        ("departmentId/text()", "departmentId"),
        ("bestBuyItemId/text()", "bestBuyItemId"),
        ("description/text()", "description"),
        ("manufacturer/text()", "manufacturer"),
        ("modelNumber/text()", "modelNumber"),
        ("image/text()", "image"),
        ("condition/text()", "condition"),
        ("inStorePickup/text()", "inStorePickup"),
        ("homeDelivery/text()", "homeDelivery"),
        ("quantityLimit/text()", "quantityLimit"),
        ("color/text()", "color"),
        ("depth/text()", "depth"),
        ("height/text()", "height"),
        ("weight/text()", "weight"),
        ("shippingWeight/text()", "shippingWeight"),
        ("width/text()", "width"),
        ("longDescription/text()", "longDescription"),
        ("longDescriptionHtml/text()", "longDescriptionHtml"),
        # The "*" step collects the text of every child element
        ("features/*/text()", "features"),
    )
    for entry in xpath_and_field
]
# Re-pair the flat list as (field name, xpath) tuples, preserving order
mappings = [
    (mappings_old[position + 1], mappings_old[position])
    for position in range(0, len(mappings_old) - 1, 2)
]
# Lookup table: field name -> xpath
mappings_dict = {field: xpath for field, xpath in mappings}
mappings[0:5]

[('productId', 'productId/text()'),
 ('sku', 'sku/text()'),
 ('name', 'name/text()'),
 ('type', 'type/text()'),
 ('startDate', 'startDate/text()')]

In [29]:
# Parse the first product file and collect the <product> elements directly under its root
children = etree.parse(files[0]).getroot().findall("./product")
# Keep the first product element on hand
child = children[0]

In [272]:
def generate_documents(file: str, index_name: str, mappings_dict: dict, batch_size:int = 5) -> list:
    """Yield the product documents of one XML file, grouped into batches.

    Args:
        file (str): Path to the XML file holding the products.
        index_name (str): Index name stored under the ``_index`` key of every document.
        mappings_dict (dict): Maps each document field name to the XPath expression
            that is evaluated against a ``product`` element to obtain the field's value.
        batch_size (int, optional): Number of documents per yielded batch. Defaults to 5.

    Yields:
        list: A batch of document dicts. The final batch may hold fewer than
            ``batch_size`` documents.
    """
    pending = []
    # Only <product> elements directly under the root are considered
    for product in etree.parse(file).getroot().findall("./product"):
        # Products without a productId element are skipped
        if not product.xpath('productId'):
            continue

        document = {}
        for field, expression in mappings_dict.items():
            document[field] = product.xpath(expression)
        document["_index"] = index_name
        pending.append(document)

        # Keep accumulating until the batch is exactly full
        if len(pending) != batch_size:
            continue
        yield pending
        pending = []

    # Flush whatever is left over as a final, shorter batch
    if len(pending) > 0:
        yield pending

In [273]:
def index_file(client, file: str, index_name: str, mappings_dict: dict, batch_size: int = 5) -> int:
    """Index every product document found in one XML file.

    Args:
        client: Client object handed unchanged to ``bulk`` for each batch.
        file (str): Path to the XML product file to read.
        index_name (str): Index name forwarded to ``generate_documents``, which
            stores it under each document's ``_index`` key.
        mappings_dict (dict): Field name -> XPath expression, forwarded to
            ``generate_documents``.
        batch_size (int, optional): Maximum number of documents per ``bulk`` call.
            Defaults to 5.

    Returns:
        int: Number of documents submitted to ``bulk`` for this file.
    """
    docs_indexed = 0
    for docs in generate_documents(file, index_name, mappings_dict, batch_size=batch_size):
        bulk(client, docs)
        # Count what was submitted so callers can total documents across files
        docs_indexed += len(docs)
    return docs_indexed

In [None]:
from time import perf_counter
import concurrent.futures

def index_files(source_dir: str, index_name: str, max_workers: int,
                client=None, field_xpaths: dict = None, batch_size: int = 5) -> int:
    """Index every ``*.xml`` file in a directory using a pool of worker processes.

    Args:
        source_dir (str): Directory that is searched for ``*.xml`` files.
        index_name (str): Index name passed to ``index_file`` for every file.
        max_workers (int): Size of the process pool.
        client: Client object passed to ``index_file``. Required.
        field_xpaths (dict, optional): Field name -> XPath expression passed to
            ``index_file``. Defaults to the module-level ``mappings_dict``.
        batch_size (int, optional): Maximum documents per bulk request. Defaults to 5.

    Returns:
        int: Sum of the per-file counts returned by ``index_file``.

    Raises:
        ValueError: If ``client`` is not provided.
    """
    if client is None:
        raise ValueError("index_files requires a client to pass to index_file")
    if field_xpaths is None:
        field_xpaths = mappings_dict

    files = glob.glob(source_dir + "/*.xml")
    docs_indexed = 0
    start = perf_counter()
    # NOTE(review): submit() pickles its arguments to reach the worker process;
    # confirm the client supports that, otherwise build the client inside the worker.
    with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(index_file, client, file, index_name, field_xpaths, batch_size)
            for file in files
        ]
        for future in concurrent.futures.as_completed(futures):
            # result() re-raises any worker exception; a None result counts as zero
            docs_indexed += future.result() or 0

    finish = perf_counter()
    # NOTE(review): logger is not defined in the visible cells; confirm it is set up earlier.
    logger.info(f'Done. Total docs: {docs_indexed} in {(finish - start)/60} minutes')
    return docs_indexed