In [1]:
import glob
from typing import Iterator
# Directory holding the product XML dumps.
source_dir = "/workspace/datasets/product_data/products"
# glob.glob() makes no ordering guarantee (the order depends on the file
# system), so sort the paths: later cells pick files by position
# (e.g. files[0], files[10]) and should get the same file on every run.
files = sorted(glob.glob(source_dir + "/*.xml"))
print(f"Number of files; {len(files)}")
files[:5]

Number of files; 256


['/workspace/datasets/product_data/products/products_0001_2570_to_430420.xml',
 '/workspace/datasets/product_data/products/products_0002_430439_to_518210.xml',
 '/workspace/datasets/product_data/products/products_0003_518229_to_606384.xml',
 '/workspace/datasets/product_data/products/products_0004_606428_to_722720.xml',
 '/workspace/datasets/product_data/products/products_0005_722800_to_846222.xml']

In [2]:
# Flat list of alternating entries: an XPath expression (evaluated relative to
# a <product> node) immediately followed by the field name its result is
# stored under.
mappings_old = [
    # -- identity, pricing and flags --
    "productId/text()", "productId",
    "sku/text()", "sku",
    "name/text()", "name",
    "type/text()", "type",
    "startDate/text()", "startDate",
    "active/text()", "active",
    "regularPrice/text()", "regularPrice",
    "salePrice/text()", "salePrice",
    "artistName/text()", "artistName",
    "onSale/text()", "onSale",
    "digital/text()", "digital",
    # -- related items: the "*" step collects the text of every child element --
    "frequentlyPurchasedWith/*/text()", "frequentlyPurchasedWith",
    "accessories/*/text()", "accessories",
    "relatedProducts/*/text()", "relatedProducts",
    "crossSell/text()", "crossSell",
    # -- rankings --
    "salesRankShortTerm/text()", "salesRankShortTerm",
    "salesRankMediumTerm/text()", "salesRankMediumTerm",
    "salesRankLongTerm/text()", "salesRankLongTerm",
    "bestSellingRank/text()", "bestSellingRank",
    "url/text()", "url",
    # -- category path: names and ids of every level, the last level's id,
    #    and the number of levels --
    "categoryPath/*/name/text()", "categoryPath",
    "categoryPath/*/id/text()", "categoryPathIds",
    "categoryPath/category[last()]/id/text()", "categoryLeaf",
    "count(categoryPath/*/name)", "categoryPathCount",
    # -- reviews, availability and descriptions --
    "customerReviewCount/text()", "customerReviewCount",
    "customerReviewAverage/text()", "customerReviewAverage",
    "inStoreAvailability/text()", "inStoreAvailability",
    "onlineAvailability/text()", "onlineAvailability",
    "releaseDate/text()", "releaseDate",
    "shippingCost/text()", "shippingCost",
    "shortDescription/text()", "shortDescription",
    "shortDescriptionHtml/text()", "shortDescriptionHtml",
    # -- classification --
    "class/text()", "class",
    "classId/text()", "classId",
    "subclass/text()", "subclass",
    "subclassId/text()", "subclassId",
    "department/text()", "department",
    "departmentId/text()", "departmentId",
    "bestBuyItemId/text()", "bestBuyItemId",
    "description/text()", "description",
    "manufacturer/text()", "manufacturer",
    "modelNumber/text()", "modelNumber",
    "image/text()", "image",
    "condition/text()", "condition",
    "inStorePickup/text()", "inStorePickup",
    "homeDelivery/text()", "homeDelivery",
    "quantityLimit/text()", "quantityLimit",
    # -- physical attributes --
    "color/text()", "color",
    "depth/text()", "depth",
    "height/text()", "height",
    "weight/text()", "weight",
    "shippingWeight/text()", "shippingWeight",
    "width/text()", "width",
    "longDescription/text()", "longDescription",
    "longDescriptionHtml/text()", "longDescriptionHtml",
    # "*" again: one entry per child element of <features>
    "features/*/text()", "features",
]
# Walk the flat list two items at a time and flip each (xpath, field) pair
# into (field, xpath).
mappings = [
    (field, xpath)
    for xpath, field in zip(*[iter(mappings_old)] * 2)
]
# Same pairs as a field -> xpath lookup (insertion order preserved).
mappings_dict = {field: xpath for field, xpath in mappings}
mappings[:5]

[('productId', 'productId/text()'),
 ('sku', 'sku/text()'),
 ('name', 'name/text()'),
 ('type', 'type/text()'),
 ('startDate', 'startDate/text()')]

In [3]:
import pandas as pd
from lxml import etree
from pathlib import Path
import pickle
def extract_records(file: str, mappings_dict: dict, batch_size:int = 2000, output_dir: str = "/workspace/datasets/products") -> int:
    """Extract product records from an XML file and pickle them in batches.

    Every ``<product>`` child of the document root that has a non-empty
    ``productId/text()`` result becomes one row; each column holds the raw
    result of evaluating the corresponding XPath expression on that node.
    Rows are written to ``<output_dir>/<file stem>-<batch number>.pickle``.

    Args:
        file (str): Path to the XML file containing product details.
        mappings_dict (dict): Maps output column name to the XPath expression
            evaluated against each product node.
        batch_size (int, optional): Maximum number of rows per pickle file.
            Defaults to 2000.
        output_dir (str, optional): Directory the pickle files are written
            to; it is created if it does not exist. Defaults to
            "/workspace/datasets/products".

    Returns:
        int: The number of records extracted from the file.

    Raises:
        ValueError: If batch_size is not a positive integer.
    """
    # A negative step would make range() empty and silently write nothing.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    nodes = etree.parse(file).getroot().findall("./product")
    records = pd.DataFrame([
        {k: node.xpath(v) for k, v in mappings_dict.items()}
        for node in nodes
        if len(node.xpath("productId/text()")) > 0
    ])

    target_dir = Path(output_dir)
    # exist_ok makes this safe when several worker processes start at once.
    target_dir.mkdir(parents=True, exist_ok=True)
    stem = Path(file).stem
    for idx, start in enumerate(range(0, len(records), batch_size)):
        batch = records.iloc[start:start + batch_size]
        with open(target_dir / f"{stem}-{idx}.pickle", 'wb') as handle:
            pickle.dump(batch, handle)
    return len(records)

def combine_records(records: list) -> pd.DataFrame:
    """Concatenate a list of DataFrames into a single DataFrame.

    Args:
        records (list): DataFrames to stack row-wise, in order.

    Returns:
        pd.DataFrame: The concatenation of all frames (original row labels
            are kept), or an empty DataFrame when ``records`` is empty.
    """
    # pd.concat raises ValueError on an empty list; return an empty frame instead.
    if not records:
        return pd.DataFrame()
    return pd.concat(records)

In [4]:
from concurrent.futures import ProcessPoolExecutor
# Fan the extraction out over 8 worker processes, one task per XML file.
# Each task gets the same mappings_dict; `records` collects what
# extract_records returns for every file, in the order of `files`.
with ProcessPoolExecutor(max_workers=8) as pool:
    records = list(
        pool.map(
            extract_records,
            files,
            (mappings_dict for _ in files),
        )
    )

In [5]:
# Add up the per-file values returned by extract_records (record counts).
sum(records)

1275077

In [6]:
# Load train.csv into the `queries` DataFrame.
queries = pd.read_csv("/workspace/datasets/train.csv")

In [7]:
# Print the column names, dtypes and memory usage of `queries`.
queries.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1865269 entries, 0 to 1865268
Data columns (total 6 columns):
 #   Column      Dtype 
---  ------      ----- 
 0   user        object
 1   sku         int64 
 2   category    object
 3   query       object
 4   click_time  object
 5   query_time  object
dtypes: int64(1), object(5)
memory usage: 85.4+ MB


In [4]:
import pandas as pd
from lxml import etree
def extract_documents(nodes: list, mappings_dict: dict) -> pd.DataFrame:
    """Build a DataFrame of product details from already-parsed product nodes.

    Args:
        nodes (list): Product nodes; each must provide an ``xpath`` method.
        mappings_dict (dict): Maps column name to the XPath expression that
            is evaluated on every node.

    Returns:
        pd.DataFrame: One row per node whose ``productId/text()`` result is
            non-empty, with one column per key of ``mappings_dict``.
    """
    rows = []
    for node in nodes:
        # Nodes without any productId text are left out.
        if len(node.xpath("productId/text()")) == 0:
            continue
        row = {}
        for column, expression in mappings_dict.items():
            row[column] = node.xpath(expression)
        rows.append(row)
    return pd.DataFrame(rows)


In [13]:
def get_nodes_batched(file: str, batch_size:int = 2000):
    """Parse an XML file and return its product nodes split into batches.

    Args:
        file (str): Path to the XML file to parse.
        batch_size (int, optional): Maximum number of nodes per batch.
            Defaults to 2000.

    Returns:
        list: Consecutive slices of the ``./product`` children of the
            document root, each holding at most ``batch_size`` nodes.
    """
    products = etree.parse(file).getroot().findall("./product")
    batched = []
    for start in range(0, len(products), batch_size):
        batched.append(products[start:start + batch_size])
    return batched

In [14]:
def extract_documents_file(file: str):
    """Extract the documents of one XML file, batch by batch.

    The file's product nodes are split into batches of 2000 with
    get_nodes_batched, and each batch is passed to extract_documents
    together with the module-level ``mappings_dict``.

    Args:
        file (str): Path to the XML file.

    Returns:
        list: One extract_documents result per batch, in batch order.
    """
    frames = []
    for nodes in get_nodes_batched(file, batch_size = 2000):
        frames.append(extract_documents(nodes, mappings_dict))
    return frames

In [39]:
# Back-of-envelope arithmetic; presumably 256 files * 90 s per file spread over
# 16 workers, converted to minutes — TODO confirm what the constants stand for.
# NOTE(review): this expression evaluates to 24.0.
256*(90/16)/60

3.0

In [41]:
# Flatten `results` (an iterable of iterables of pandas objects) and
# concatenate everything into a single frame.
# NOTE(review): `results` is not assigned in any cell visible here — presumably
# it held the per-file output of extract_documents_file; confirm before re-running.
df = pd.concat([item for items in results for item in items])

In [64]:
from lxml import etree
import pandas as pd
def extract_documents(file: str, mappings_dict: dict) -> pd.DataFrame:
    """Parse a product XML file into a DataFrame of product details.

    Args:
        file (str): Path to the XML file containing product details.
        mappings_dict (dict): Maps column name to the XPath expression that
            is evaluated on every ``./product`` node of the document root.

    Returns:
        pd.DataFrame: One row per product whose ``productId/text()`` result
            is non-empty, with one column per key of ``mappings_dict``.
    """
    products = etree.parse(file).getroot().findall("./product")

    def to_row(product):
        # Evaluate every mapped XPath expression against this product node.
        return {
            column: product.xpath(expression)
            for column, expression in mappings_dict.items()
        }

    rows = [
        to_row(product)
        for product in products
        if len(product.xpath("productId/text()")) > 0
    ]
    return pd.DataFrame(rows)
    


def index_batch(batch, index_name:str):
    """Send one batch of rows to the bulk helper, tagged with an index name.

    Each row of ``batch`` becomes a dict with an added ``_index`` key set to
    ``index_name``; the list of dicts is passed to ``bulk`` along with the
    module-level ``client``.
    NOTE(review): neither ``bulk`` nor ``client`` is defined in the cells
    visible here — presumably the OpenSearch bulk helper and client; confirm.

    Args:
        batch: DataFrame whose rows are the documents to index.
        index_name (str): Value written to the ``_index`` key of every row.

    Returns:
        int: The number of rows handed to ``bulk``.
    """
    tagged = batch.assign(_index = index_name)
    actions = tagged.to_dict(orient = "records")
    bulk(client, actions)
    return len(actions)

def index_batch_queries(batch):
    """Index ``batch`` into the 'bbuy_queries' index; returns index_batch's result."""
    target_index = 'bbuy_queries'
    return index_batch(batch, index_name = target_index)

def index_batch_products(batch):
    """Index ``batch`` into the 'bbuy_products' index; returns index_batch's result."""
    target_index = 'bbuy_products'
    return index_batch(batch, index_name = target_index)

def index_batches(batches: list, index_name: str = "bbuy_products"):
    """Index several batches and return the total number of records indexed.

    Args:
        batches (list): Batches, each accepted by index_batch.
        index_name (str, optional): Index every batch is written to. It is
            forwarded to index_batch, which requires it. Defaults to
            "bbuy_products".

    Returns:
        int: Sum of the per-batch counts returned by index_batch (0 for an
            empty list).
    """
    return sum(index_batch(batch, index_name) for batch in batches)

def index_products_in_file(file:str, index_name:str = "bbuy_products", batch_size:int = 2000):
    """Index all products contained in an XML file.

    The file is converted to a DataFrame with extract_documents, using the
    module-level ``mappings_dict``, and indexed in slices of at most
    ``batch_size`` rows via index_batch.

    Args:
        file (str): Path to the XML file containing the product details
        index_name (str, optional): The name of the index to store details in. Defaults to "bbuy_products".
        batch_size (int, optional): The maximum number of records to index at once. Defaults to 2000.

    Returns:
        int: The number of documents extracted from the file.

    Raises:
        ValueError: If batch_size is not a positive integer.
    """
    # A negative step would make range() empty and silently index nothing.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # extract_documents takes the XPath mappings, not an index name. The
    # module-level mappings_dict is used so this function can still be mapped
    # over a list of files with a single argument.
    documents = extract_documents(file, mappings_dict)
    # Slice inline and honour the caller's batch_size and index_name.
    for start in range(0, len(documents), batch_size):
        index_batch(documents.iloc[start:start + batch_size], index_name)
    return len(documents)

def index_products_in_dir(source_dir: str) -> int:
    """Index the products of every ``*.xml`` file found directly in a directory.

    Each matching file is handed to index_products_in_file through
    execute_in_parallel, and the per-file results are summed.

    Args:
        source_dir (str): Path to the directory containing the files

    Returns:
        int: The total number of records indexed
    """
    xml_files = glob.glob(source_dir + "/*.xml")
    per_file_counts = execute_in_parallel(index_products_in_file, xml_files)
    return sum(per_file_counts)

In [None]:
def index_records(csv_file, batch_size:int = 2000):
    """Index the rows of a CSV file in chunks.

    The CSV is read lazily in chunks of ``batch_size`` rows; every chunk is
    passed to index_batch_queries through execute_in_parallel.

    Args:
        csv_file: Path (or other input accepted by ``pd.read_csv``) of the CSV.
        batch_size (int, optional): Number of rows per chunk. Defaults to 2000.

    Returns:
        The sum of the per-chunk results of index_batch_queries.
    """
    chunk_reader = pd.read_csv(csv_file, chunksize = batch_size)
    per_chunk_counts = execute_in_parallel(index_batch_queries, chunk_reader)
    return sum(per_chunk_counts)


In [None]:
import pandas as pd
import multiprocessing
def execute_in_parallel(func, chunks):
    """Apply ``func`` to every item of ``chunks`` using a process pool.

    Args:
        func: Callable applied to each item; it must be picklable because it
            is sent to worker processes.
        chunks: Iterable of work items.

    Returns:
        list: The results of ``func``, in the same order as ``chunks``.
    """
    # The context manager shuts the workers down even when func raises in a
    # worker; map() has already collected every result by the time the block
    # exits, so nothing is lost.
    with multiprocessing.Pool() as pool:
        return pool.map(func, chunks)

In [None]:
def index_queries(*args, **kwargs):
    """Placeholder for an unfinished query-indexing entry point.

    The cell contained only a bare ``def index_queries`` header, which is a
    syntax error. Until the intended behaviour is written, fail loudly rather
    than break the whole notebook.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError("index_queries has not been implemented yet")

In [59]:




# Work items: the list of product XML file paths (not DataFrame chunks).
chunks = files

# Define a function to be applied to each chunk
# (identity placeholder: returns its input unchanged).
# NOTE(review): process_chunk is never called in this cell.
def process_chunk(chunk):
    return chunk

# Apply the function to each chunk in parallel
# NOTE(review): neither `pool` nor `add_index_and_id` is defined in the cells
# visible here — this cell presumably relied on leftover kernel state; confirm.
result = pool.map(add_index_and_id, chunks)

# Concatenate the result into a single DataFrame
result_df = pd.concat(result)

# Close the multiprocessing pool
pool.close()


In [54]:

# Prototype on a single file: parse it, evaluate every mapped XPath
# expression on each product that has productId text, then tag all rows
# with the target index name.
file = files[10]
children = etree.parse(file).getroot().findall("./product")
docs = pd.DataFrame([
    {field: child.xpath(expression) for field, expression in mappings_dict.items()}
    for child in children
    if child.xpath("productId/text()")  # an empty result list is falsy
])
docs_bulk = docs.assign(_index = "bbuy_products")
docs_bulk.head()

Unnamed: 0,productId,sku,name,type,startDate,active,regularPrice,salePrice,artistName,onSale,...,color,depth,height,weight,shippingWeight,width,longDescription,longDescriptionHtml,features,_index
0,[291938],[1304779],[Bettishbreez - CD],[Music],[1992-06-01],[false],[14.99],[14.99],[Wizzard],[false],...,[],[],[],[],[],[],[],[],[],bbuy_products
1,[2143652],[1304783],[The Original Mono Recordings [LP] - VINYL],[Music],[2010-05-11],[true],[219.99],[219.99],[Bob Dylan],[false],...,[],[],[],[],[],[],[],[],[],bbuy_products
2,[87870],[1304788],[Bluntly Speaking - CD],[Music],[1992-11-03],[false],[15.99],[15.99],[Kilo],[false],...,[],[],[],[],[],[],[],[],[],bbuy_products
3,[2145865],[1304792],"[Bootleg Series, Vol. 9: The Witmark... [Box] ...",[Music],[2010-03-23],[true],[17.99],[17.99],[Bob Dylan],[false],...,[],[],[],[],[],[],[],[],[],bbuy_products
4,[2145866],[1304801],"[Bootleg Series, Vol. 9: The Witmark... [LP] -...",[Music],[2010-03-23],[true],[119.99],[119.99],[Bob Dylan],[false],...,[],[],[],[],[],[],[],[],[],bbuy_products


In [33]:
# Cut `docs` into consecutive slices of at most batch_size rows.
batch_size = 2000
batches = [
    docs[offset:offset + batch_size]
    for offset in range(0, len(docs), batch_size)
]

Unnamed: 0,productId,sku,name,type,startDate,active,regularPrice,salePrice,artistName,onSale,...,quantityLimit,color,depth,height,weight,shippingWeight,width,longDescription,longDescriptionHtml,features
4000,[82131],[1366480],[Taxi - CASSETTE],[Music],[1993-01-13],[false],[9.99],[9.99],[Bryan Ferry],[false],...,[3],[],[],[],[],[],[],[],[],[]
4001,[82878],[1366499],[Dragonfly Summer - CD],[Music],[1992-09-15],[true],[11.99],[11.99],[Michael Franks],[false],...,[3],[],[],[],[],[],[],[],[],[]
4002,[82878],[1366505],[Dragonfly Summer - CASSETTE],[Music],[1993-01-13],[false],[9.99],[9.99],[Michael Franks],[false],...,[3],[],[],[],[],[],[],[],[],[]
4003,[91816],[1366541],[Dropped - CD],[Music],[1992-10-06],[false],[17.99],[17.99],[Mindfunk],[false],...,[3],[],[],[],[],[],[],[],[],[]
4004,[91816],[1366550],[Dropped - CASSETTE],[Music],[1993-02-03],[false],[11.99],[11.99],[Mindfunk],[false],...,[3],[],[],[],[],[],[],[],[],[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,[94548],[1387519],[Dragline - CD],[Music],[1992-10-06],[false],[14.99],[14.99],[Paw],[false],...,[3],[],[],[],[],[],[],[],[],[]
4996,[238813],[1387537],[Dragline - CASSETTE],[Music],[1993-02-03],[false],[9.99],[9.99],[Paw],[false],...,[3],[],[],[],[],[],[],[],[],[]
4997,[179191],[1387555],[Music - CD],[Music],[1993-06-03],[false],[15.99],[15.99],[Paul Horn],[false],...,[3],[],[],[],[],[],[],[],[],[]
4998,[238778],[1387564],[Uncommon Goal - CD],[Music],[1993-02-03],[false],[14.99],[14.99],[Allgood],[false],...,[3],[],[],[],[],[],[],[],[],[]


In [29]:
# Parse the first file in `files` and bind its first <product> node to `child`.
children = etree.parse(files[0]).getroot().findall("./product")
child = children[0]

In [272]:
def generate_documents(file: str, index_name: str, mappings_dict: dict, batch_size:int = 5) -> Iterator[list]:
    """Yield batches of documents built from a product file.

    This is a generator. Every ``./product`` node of the document root that
    has productId text becomes a dict with one entry per key of
    ``mappings_dict`` (the raw XPath result) plus an ``_index`` entry.

    Args:
        file (str): The path to the XML file containing the product details.
        index_name (str): The name of the index to which the documents are to be written to.
        mappings_dict (dict): Maps document field name to the XPath expression
            evaluated on each product node.
        batch_size (int, optional): The max number of documents per yielded batch. Defaults to 5.

    Yields:
        list: A list of at most ``batch_size`` document dicts. The final batch
            may be shorter; an empty batch is never yielded.
    """
    # Parse tree to fetch all products
    children = etree.parse(file).getroot().findall("./product")
    batch = []
    for child in children:
        # Test the text of productId, as the other extractors in this notebook
        # do, so a product with an empty <productId/> element is skipped too.
        if len(child.xpath("productId/text()")) == 0:
            continue
        doc = {k: child.xpath(v) for k, v in mappings_dict.items()}
        doc["_index"] = index_name
        batch.append(doc)
        if len(batch) == batch_size:
            yield batch
            batch = []

    # Yield the last batch even if it is less than batch size
    if batch:
        yield batch

In [273]:
def index_file(client, file: str, index_name: str, mappings_dict: dict, batch_size: int = 5) -> int:
    """Index every product of one XML file in batches.

    Args:
        client: Client object passed as the first argument to ``bulk``.
        file (str): Path to the XML file containing the product details.
        index_name (str): Name of the index the documents are written to.
        mappings_dict (dict): Maps document field name to XPath expression;
            forwarded to generate_documents.
        batch_size (int, optional): Max number of documents per bulk call.
            Defaults to 5.

    Returns:
        int: The number of documents submitted to ``bulk`` for this file.
    """
    docs_indexed = 0
    for docs in generate_documents(file, index_name, mappings_dict, batch_size = batch_size):
        bulk(client, docs)
        # Count what was submitted so callers can total documents across files.
        docs_indexed += len(docs)
    return docs_indexed

In [None]:
from time import perf_counter
import concurrent.futures

def index_files(source_dir: str, index_name: str, max_workers: int):
    """Index every ``*.xml`` product file of a directory using worker processes.

    One index_file task is submitted per file; the per-file document counts
    are summed as the tasks complete and the elapsed time is logged.

    NOTE(review): index_file needs a client and the XPath mappings, which this
    function does not receive; the module-level ``client`` and
    ``mappings_dict`` are used. ``client`` is not defined in the cells visible
    here, and it must be picklable to be sent to worker processes — confirm.

    Args:
        source_dir (str): Directory containing the XML files.
        index_name (str): Name of the index the documents are written to.
        max_workers (int): Number of worker processes.

    Returns:
        int: Total number of documents reported by the index_file tasks.
    """
    # Local import: the notebook has no logging import or `logger` object.
    import logging

    log = logging.getLogger(__name__)
    # Sorted for a reproducible submission order.
    files = sorted(glob.glob(source_dir + "/*.xml"))
    docs_indexed = 0
    start = perf_counter()
    with concurrent.futures.ProcessPoolExecutor(max_workers = max_workers) as executor:
        futures = [
            executor.submit(index_file, client, file, index_name, mappings_dict)
            for file in files
        ]
        for future in concurrent.futures.as_completed(futures):
            # Tolerate an index_file that returns None instead of a count.
            docs_indexed += future.result() or 0

    finish = perf_counter()
    log.info(f'Done. Total docs: {docs_indexed} in {(finish - start)/60} minutes')
    return docs_indexed