In [21]:
import os
import json
import pandas as pd
import copy 

In [2]:
# Read the products table stored as Parquet under the relative "output" folder.
# NOTE(review): `products` is not referenced by any other cell visible here -
# confirm it is still needed.
products = pd.read_parquet('output/products.parquet')

In [3]:
# Locate the full product extraction inside the "output" folder of the
# current working directory and parse it into memory in one step.
file_path = os.path.join(os.getcwd(), "output", "full_product_extraction.json")
with open(file_path, mode="r", encoding="utf-8") as json_file:
    raw_data = json.load(json_file)

In [5]:
# Display the record at index 8 of the raw extraction to inspect its structure.
raw_data[8]

{'packages': [{'name': 'BRIMNES',
   'typeName': 'Librería',
   'itemNo': '90301225',
   'articleNumber': {'label': 'Número de artículo', 'value': '903.012.25'},
   'measurements': [[{'label': 'Ancho',
      'type': 'width',
      'text': '36 cm',
      'value': 36},
     {'label': 'Alto', 'type': 'height', 'text': '11 cm', 'value': 11},
     {'label': 'Largo', 'type': 'length', 'text': '86 cm', 'value': 86},
     {'label': 'Peso', 'type': 'weight', 'text': '16.10 kg', 'value': 16.1}],
    [{'label': 'Ancho', 'type': 'width', 'text': '36 cm', 'value': 36},
     {'label': 'Alto', 'type': 'height', 'text': '6 cm', 'value': 6},
     {'label': 'Largo', 'type': 'length', 'text': '193 cm', 'value': 193},
     {'label': 'Peso', 'type': 'weight', 'text': '25.60 kg', 'value': 25.6}]],
   'quantity': {'label': 'Paquete(s)', 'value': 1},
   'multiPackDisclaimerText': 'Este producto tiene varios bultos'}],
 'totalNoOfPackagesText': 'Este producto consta de 2 paquetes.',
 'maxMeasurments': {'weight

In [14]:
# Take the first package of the sample record (index 8 of the raw extraction).
# The record is indexed explicitly so that this cell runs on a fresh kernel
# instead of depending on an `entry` variable that no earlier cell defines.
package = raw_data[8]["packages"][0]

In [16]:
# Show the package's measurements: a list of lists of measurement dicts
# (presumably one inner list per bulk of the package - confirm against the data).
package["measurements"]

[[{'label': 'Ancho', 'type': 'width', 'text': '36 cm', 'value': 36},
  {'label': 'Alto', 'type': 'height', 'text': '11 cm', 'value': 11},
  {'label': 'Largo', 'type': 'length', 'text': '86 cm', 'value': 86},
  {'label': 'Peso', 'type': 'weight', 'text': '16.10 kg', 'value': 16.1}],
 [{'label': 'Ancho', 'type': 'width', 'text': '36 cm', 'value': 36},
  {'label': 'Alto', 'type': 'height', 'text': '6 cm', 'value': 6},
  {'label': 'Largo', 'type': 'length', 'text': '193 cm', 'value': 193},
  {'label': 'Peso', 'type': 'weight', 'text': '25.60 kg', 'value': 25.6}]]

In [28]:
# Restructure the raw extraction into one record per product.
#
# Every product gets a "packages" list with one item per physical bulk:
#   * When an entry lists several packages, the first one is the assembled
#     product itself (it will likely have no measurements), so only the
#     remaining component packages are expanded.
#   * When a package has more than one bulk in its "measurements", the package
#     is replicated once per bulk and the copies receive consecutive
#     "subitemNo" values (1, 2, ...), restarting at 1 for every package.

_DIMENSION_TYPES = ("width", "height", "length")


def _parse_bulk_measurements(bulk):
    """Normalise the raw measurement list of a single bulk.

    Parameters
    ----------
    bulk : list of dict
        Raw measurements. Each item provides "type", "value" and "text",
        e.g. ``{"type": "width", "text": "36 cm", "value": 36}``.
        Items whose "type" is not width/height/length/weight are ignored.

    Returns
    -------
    dict
        ``{"dimensions": {...}, "weight": {...}, "volume": {...}}``.
        "volume" holds the product width * height * length and the unit
        ``"<dimension unit>3"``. If any of the three dimensions is missing,
        both volume fields are ``None`` instead of raising ``KeyError``.
    """
    dimensions = {}
    weight = {}
    for measurement in bulk:
        kind = measurement["type"]
        if kind in _DIMENSION_TYPES:
            dimensions[kind] = float(measurement["value"])
            # The unit is the last space-separated token of the text ("36 cm" -> "cm").
            unit = measurement["text"].split(" ")[-1]
            if kind == "width":
                # The width's unit always wins, as in the original logic.
                dimensions["unit"] = unit
            else:
                # Height/length only provide the unit when none is known yet.
                # setdefault avoids the KeyError the previous lookup raised
                # when width was absent or listed after height/length.
                dimensions.setdefault("unit", unit)
        elif kind == "weight":
            weight["value"] = float(measurement["value"])
            weight["unit"] = measurement["text"].split(" ")[-1]

    volume = {}
    if all(side in dimensions for side in _DIMENSION_TYPES):
        volume["value"] = dimensions["width"] * dimensions["height"] * dimensions["length"]
        volume["unit"] = f'{dimensions["unit"]}3'
    else:
        # Incomplete dimensions: keep the keys so the schema stays uniform.
        volume["value"] = None
        volume["unit"] = None

    return {"dimensions": dimensions, "weight": weight, "volume": volume}


def _expand_package(package):
    """Return one normalised package dict per bulk of ``package``.

    Parameters
    ----------
    package : dict
        Raw package with "name", "typeName", "itemNo" and, optionally,
        "articleNumber" (a dict with a "value" key) and "measurements"
        (a list with one measurement list per bulk).

    Returns
    -------
    list of dict or None
        ``None`` when the package has no "measurements" (missing or ``None``);
        otherwise one dict per bulk, numbered through "subitemNo" from 1.

    Raises
    ------
    KeyError
        If "name", "typeName" or "itemNo" is missing from ``package``.
    """
    name = package["name"]
    type_name = package["typeName"]
    item_no = package["itemNo"]
    # `or {}` also covers an explicit None stored under "articleNumber".
    article_number = (package.get("articleNumber") or {}).get("value")

    bulks = package.get("measurements")
    if bulks is None:
        return None

    expanded = []
    for subitem_number, bulk in enumerate(bulks, start=1):
        expanded.append({
            "name": name,
            "typeName": type_name,
            "itemNo": item_no,
            "articleNumber": article_number,
            "measurements": _parse_bulk_measurements(bulk),
            "quantity": 1,
            "subitemNo": subitem_number,
        })
    return expanded


products_list = []
product_id = 0
for entry in raw_data:
    # Skip placeholders (None) and anything that is not a product record.
    if not isinstance(entry, dict):
        continue

    # NOTE: the id is consumed even when the product is dropped further down,
    # so the ids in the output may have gaps.
    product_id += 1
    main_product = {
        "id": product_id,
        "ensembled_measurements": entry["measurement_ensembled_text"],
        "name": entry.get("name", None),
        "category": entry.get("category", None),
        "price": entry["price"],
        "currency": entry["currency"].strip(),
        "url": entry.get("url", None),
        "packages": []  # Filled below with one item per bulk
    }

    raw_packages = entry["packages"]
    if len(raw_packages) > 1:
        # The product is made up of other products: the first package is the
        # main product, the rest are its components. Components without
        # measurements are skipped, the product itself is kept.
        for package in raw_packages[1:]:
            expanded = _expand_package(package)
            if expanded is not None:
                main_product["packages"].extend(expanded)
    else:
        # Single package: it is the product itself. A product whose only
        # package has no measurements (or that has no package at all) is
        # dropped from the output.
        expanded = _expand_package(raw_packages[0]) if raw_packages else None
        if expanded is None:
            continue
        main_product["packages"].extend(expanded)

    products_list.append(main_product)

In [29]:
# Serialize the restructured products and write them, in a single call, to a
# JSON file inside the "output" folder of the current working directory.
file_path = os.path.join(os.getcwd(), "output", "full_product_extraction_updated_schema.json")
with open(file_path, mode="w", encoding="utf-8") as json_file:
    json_file.write(json.dumps(products_list, indent=4))