# Index Product Data

Let us start by extracting product data from the XML files. 

In [4]:
import glob
# Directory holding the raw product XML dumps.
SOURCE_DIR = "/workspace/datasets/product_data/products"
# glob.glob() makes no ordering guarantee, so sort the listing: FILES[0] is used
# by later cells and should refer to the same file on every run.
FILES = sorted(glob.glob(f"{SOURCE_DIR}/*.xml"))
print(f"Number of XML files: {len(FILES)}")
FILES[:5]

Number of XML files: 256


['/workspace/datasets/product_data/products/products_0001_2570_to_430420.xml',
 '/workspace/datasets/product_data/products/products_0002_430439_to_518210.xml',
 '/workspace/datasets/product_data/products/products_0003_518229_to_606384.xml',
 '/workspace/datasets/product_data/products/products_0004_606428_to_722720.xml',
 '/workspace/datasets/product_data/products/products_0005_722800_to_846222.xml']

Let us read `mappings.yaml` to get the xpath selectors associated with each field to extract.

In [10]:
import yaml
# Parse the YAML file that maps each field name to the XPath selector used to extract it.
with open("/workspace/datasets/mappings.yaml", mode="r") as mappings_file:
    mappings = yaml.safe_load(mappings_file)
mappings.keys()

dict_keys(['accessories', 'active', 'artistName', 'bestBuyItemId', 'bestSellingRank', 'categoryLeaf', 'categoryPath', 'categoryPathCount', 'categoryPathIds', 'class', 'classId', 'color', 'condition', 'crossSell', 'customerReviewAverage', 'customerReviewCount', 'department', 'departmentId', 'depth', 'description', 'digital', 'features', 'frequentlyPurchasedWith', 'height', 'homeDelivery', 'image', 'inStoreAvailability', 'inStorePickup', 'longDescription', 'longDescriptionHtml', 'manufacturer', 'modelNumber', 'name', 'onSale', 'onlineAvailability', 'productId', 'quantityLimit', 'regularPrice', 'relatedProducts', 'releaseDate', 'salePrice', 'salesRankLongTerm', 'salesRankMediumTerm', 'salesRankShortTerm', 'shippingCost', 'shippingWeight', 'shortDescription', 'shortDescriptionHtml', 'sku', 'startDate', 'subclass', 'subclassId', 'type', 'url', 'weight', 'width'])

## Extract Records

Let us write a function to extract product records from an XML file and return a pandas DataFrame.

In [6]:
import pandas as pd
from lxml import etree
def extract_records(file: str, mappings: dict) -> pd.DataFrame:
    """Build a DataFrame of product records from one XML file.

    Every ``product`` element directly under the document root becomes one
    row, provided it has a non-empty ``productId`` text node. Each column
    holds the raw result of evaluating that field's XPath selector against
    the product element.

    Args:
        file (str): Path to the XML file containing product details.
        mappings (dict): Field name -> XPath selector to evaluate per product.

    Returns:
        pd.DataFrame: One row per retained product, one column per mapping key.
    """
    root = etree.parse(file).getroot()
    rows = []
    for product in root.findall("./product"):
        # Products without a productId are skipped entirely.
        if not product.xpath("productId/text()"):
            continue
        rows.append(
            {field: product.xpath(selector) for field, selector in mappings.items()}
        )
    return pd.DataFrame(rows)

# Try the extractor on the first XML file and preview the first rows of the result.
records = extract_records(FILES[0], mappings)
records.head()

Unnamed: 0,productId,sku,name,type,startDate,active,regularPrice,salePrice,artistName,onSale,...,quantityLimit,color,depth,height,weight,shippingWeight,width,longDescription,longDescriptionHtml,features
0,[1051806942127],[2570],[Fuji - 110 Color Print Disc Film (24 exposures)],[HardGood],[2000-03-01],[false],[1.89],[1.89],[],[false],...,[50],[],[],[],[],[0.1],[],"[110 disc color print, 200-speed film. 24 expo...","[110 disc color print, 200-speed film. 24 expo...",[]
1,[1051806941102],[12854],[Manufacturer - Test sku - Color],[HardGood],[2000-03-01],[false],[3.99],[3.99],[],[false],...,[50],[Color],[Product Depth],[Product Height],[Product Weight],[0.4],[Product Width],[Product Description],[Product Description],[Feature bullet]
2,[1051806941178],[12881],[Kodak - Gold 400 Film (24 exposures)],[HardGood],[2000-03-01],[false],[4.79],[4.79],[],[false],...,[50],[],[],[],[],[0.08],[],[Kodak Gold 400-speed film is perfect for a va...,[Kodak Gold 400-speed film is perfect for a va...,[Excellent latent-image keeping characteristic...
3,[1051826152940],[34590],[Hoover - Vacuum Bag A],[HardGood],[2003-05-04],[false],[3.99],[3.99],[],[false],...,[20],[],"[1-1/10""]","[11-3/5""]",[],[0.3],"[8-1/5""]",[Vacuum Bag A],[Vacuum Bag A],[]
4,[1051384074145],[43900],[Duracell - AAA Batteries (4-Pack)],[HardGood],[2000-03-01],[true],[5.99],[5.99],[],[false],...,[5],[],[],[],[],[0.19],[],[Power a variety of electronic devices with th...,[Power a variety of electronic devices with th...,[Compatible with select electronic devices\nFo...


## Save Records

Let us now write a function to batch the records and save each batch as a parquet file. We batch the records so that we can index the batches in parallel, while keeping the batch size manageable.

In [7]:
from pathlib import Path
def save_records(records: pd.DataFrame, file: str, batch_size: int = 2000) -> int:
    """Save product records in batches to parquet files.

    Batch ``i`` (zero-based) holds rows ``i * batch_size`` up to, but not
    including, ``(i + 1) * batch_size`` and is written to
    ``{file}-{i}.parquet``. An empty ``records`` frame writes no files.

    Args:
        records (pd.DataFrame): Records to save.
        file (str): Path prefix of the batch files. It is only interpolated
            into the output file name, so a ``pathlib.Path`` works as well.
        batch_size (int, optional): The maximum number of records in a batch.
            Defaults to 2000.

    Returns:
        int: The number of records saved.

    Raises:
        ValueError: If ``batch_size`` is less than 1.
    """
    # A negative step would make the range below empty, so nothing would be
    # written while the function still reported every record as saved.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
    for idx, start in enumerate(range(0, len(records), batch_size)):
        batch = records.iloc[start : start + batch_size]
        batch.to_parquet(f"{file}-{idx}.parquet")
    return len(records)

# Write the records extracted above to /tmp in batches of 2000, using the XML
# file's stem as the file-name prefix, then list the parquet files found in /tmp.
save_records(records, f"/tmp/{Path(FILES[0]).stem}", batch_size = 2000)
glob.glob("/tmp/*.parquet")

['/tmp/products_0001_2570_to_430420-2.pickle',
 '/tmp/products_0001_2570_to_430420-1.pickle',
 '/tmp/products_0001_2570_to_430420-0.pickle']

## Extract and Save Records

Let us compose the two functions we wrote earlier to extract and save product records.

In [8]:
def extract_and_save_records(file: str, output_dir: str, mappings: dict, batch_size:int = 2000) -> int:
    """Extract the product records of one XML file and save them as parquet batches.

    The batch files are written inside ``output_dir`` and are prefixed with
    the XML file's stem (its name without the extension).

    Args:
        file (str): XML file to extract records from.
        output_dir (str): The directory to save the batch files in.
        mappings (dict): A dictionary of mappings to extract.
        batch_size (int, optional): The maximum number of records in a batch. Defaults to 2000.

    Returns:
        int: The number of records extracted and saved.
    """
    extracted = extract_records(file, mappings)
    # <output_dir>/<xml stem> is the prefix; save_records appends the batch suffix.
    destination = Path(output_dir).joinpath(Path(file).stem)
    save_records(extracted, destination, batch_size)
    return len(extracted)

# Create a scratch directory, run the combined extract-and-save step on the
# first XML file, and list everything that ended up in that directory.
Path("/tmp/test-1").mkdir(parents=True, exist_ok=True)
extract_and_save_records(FILES[0], "/tmp/test-1", mappings, 2000)
glob.glob("/tmp/test-1/*")

['/tmp/test-1/products_0001_2570_to_430420-2.pickle',
 '/tmp/test-1/products_0001_2570_to_430420-1.pickle',
 '/tmp/test-1/products_0001_2570_to_430420-0.pickle']

## Extract and Save All Records

We can now write a function that loops through all the XML files and calls the `extract_and_save_records` function on each of them. We can use `ProcessPoolExecutor()` from `concurrent.futures` to parallelize the ingestion pipeline and speed it up significantly.

In [None]:
from concurrent.futures import ProcessPoolExecutor
from itertools import repeat
from functools import partial
from tqdm import tqdm
def extract_and_save_records_all(source_dir: str, output_dir: str, mappings: dict, batch_size:int = 2000, max_workers: int = 8):
    """Extract product records from all XML files in a directory and save them in batches.

    Each ``*.xml`` file directly inside ``source_dir`` is handed to
    ``extract_and_save_records`` in a pool of worker processes. A progress bar
    tracks completed files and a summary line is printed at the end.

    Args:
        source_dir (str): Directory containing the XML files to process.
        output_dir (str): Directory to write the batch files to. It is created,
            including missing parents, if it does not exist.
        mappings (dict): A dictionary of mappings to extract.
        batch_size (int, optional): The maximum number of records in a batch. Defaults to 2000.
        max_workers (int, optional): Number of worker processes in the pool. Defaults to 8.
    """
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    files = glob.glob(source_dir + "/*.xml")
    # Bind everything except the file path so pool.map can pass one file per call.
    extract_and_save_records_from_file = partial(
        extract_and_save_records,
        mappings = mappings,
        output_dir = output_dir,
        batch_size = batch_size
    )
    with ProcessPoolExecutor(max_workers=max_workers) as pool:
        # pool.map yields per-file record counts; list() drains it so tqdm advances per file.
        records = list(tqdm(pool.map(extract_and_save_records_from_file, files), total=len(files)))
    print(f"Extracted {sum(records)} records from {len(files)} files")

# Run the full pipeline over every XML file in SOURCE_DIR, writing the batches to OUTPUT_DIR.
OUTPUT_DIR = "/workspace/datasets/products"
extract_and_save_records_all(SOURCE_DIR, OUTPUT_DIR, mappings, batch_size = 2000)

In [23]:
# Collect the saved parquet batches in sorted (deterministic) order.
output_files = sorted(glob.glob(f"{OUTPUT_DIR}/*.parquet"))
# Generator expression: a file is only read when `batches` is iterated, and it can be consumed once.
batches = (pd.read_parquet(file) for file in output_files)