In [70]:
import json
import os

import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, lit, array_distinct, split, regexp_replace, collect_list, struct, create_map
from pyspark.sql.types import StructType, StructField, LongType, StringType, FloatType, MapType, IntegerType

In [55]:
# Locate the extraction result inside the "output" folder of the current working directory
file_path = os.path.join(os.getcwd(), "output", "full_product_extraction.json")
# Parse the whole JSON document into memory; later cells iterate over `raw_data`
with open(file_path, 'r', encoding="utf-8") as f:
    raw_data = json.load(f)

In [74]:
# Accumulators for the flattened product and package records
products_list = []
packages_list = []
product_id = 0
package_id = 0


def _to_float_or_none(value):
    """Convert a measurement value to ``float``; ``None`` is passed through unchanged."""
    return None if value is None else float(value)


def _build_package_details(package_id, item_number, package):
    """Build one package record from a list of measurement dictionaries.

    Args:
        package_id: Identifier stored under ``"id"``.
        item_number: Item number stored under ``"products_item_number"``.
        package: Iterable of measurement dicts. Each must have a ``"type"`` key;
            entries of type width/height/length/weight must also have ``"value"``
            and ``"text"``. Entries of any other type are ignored.

    Returns:
        A dict with the keys id, products_item_number, width, length, height,
        weight, distance_units and weight_units. Fields for which no matching
        measurement was found stay ``None``.

    Raises:
        KeyError: If a required key is missing from a measurement.
        ValueError: If a measurement value cannot be converted to ``float``.
    """
    details = {
        "id": package_id,
        "products_item_number": item_number,
        "width": None,
        "length": None,
        "height": None,
        "weight": None,
        "distance_units": None,
        "weight_units": None,
    }
    for measurement in package:
        measurement_type = measurement["type"]
        if measurement_type in ("width", "height", "length"):
            details[measurement_type] = _to_float_or_none(measurement["value"])
            # The unit is the last space-separated token of the text. A width
            # measurement always sets it; height/length only fill it in when
            # no unit has been recorded yet.
            if measurement_type == "width" or details["distance_units"] is None:
                details["distance_units"] = measurement["text"].split(" ")[-1]
        elif measurement_type == "weight":
            details["weight"] = _to_float_or_none(measurement["value"])
            details["weight_units"] = measurement["text"].split(" ")[-1]
    return details


# Extract product and package information
for entry in raw_data:
    # Skip anything that is not a product dictionary (e.g. null entries)
    if not isinstance(entry, dict):
        continue

    packages = entry["packages"]

    product_id += 1
    main_product = {
        "id": product_id,
        "item_number": packages[0]["itemNo"],
        "ensembled_measurements": entry["measurement_ensembled_text"],
        "name": entry["name"],
        "category": entry["category"],
        "price": entry["price"],
        "currency": entry["currency"].strip(),
        "url": entry.get("url", None),
        "product_components": {}  # Filled below when the product has components
    }

    if len(packages) > 1:
        # More than one package entry: the furniture is made up of other
        # furniture. The first element is the main product itself (its
        # measurements are not read here); the remaining elements are the
        # components, each of which becomes its own product row.
        measured_products = packages[1:]
        for component in measured_products:
            product_id += 1
            products_list.append({
                "id": product_id,
                "item_number": component["itemNo"],
                "ensembled_measurements": None,
                "name": component["name"],
                "category": component["typeName"],
                "price": None,
                "currency": None,
                "url": None,
                "product_components": None
            })
            main_product["product_components"][component["itemNo"]] = 1
    else:
        # Single package entry: the main product carries the measurements
        measured_products = packages[:1]

    # The main product row is appended after any component rows
    products_list.append(main_product)

    # One package record per measurement list of every measured product.
    # Values are converted to float in both cases so the column type is uniform.
    for product in measured_products:
        measurements = product.get("measurements")
        if measurements is None:
            continue
        for package in measurements:
            package_id += 1
            packages_list.append(
                _build_package_details(package_id, product["itemNo"], package)
            )
    

In [77]:
# Obtain the Spark session for this notebook (reuses an active one if present)
spark = (
    SparkSession.builder
    .appName("Process JSON Data")
    .getOrCreate()
)

In [78]:
# Schema for the product rows collected in `products_list`.
# NOTE(review): with schema verification enabled, FloatType presumably only
# accepts Python float values - confirm that `price` is never an int or str.
products_schema = StructType([
    StructField("id", LongType(), nullable=False),
    StructField("item_number", StringType(), nullable=False),
    StructField("ensembled_measurements", StringType(), nullable=True),
    StructField("name", StringType(), nullable=False),
    StructField("category", StringType(), nullable=False),
    StructField("price", FloatType(), nullable=True),
    StructField("currency", StringType(), nullable=True),
    StructField("url", StringType(), nullable=True),
    StructField("product_components", MapType(StringType(), IntegerType()), nullable=True)
])

# Schema for the package rows collected in `packages_list`.
# Every measurement and unit field starts out as None in the extraction loop
# and is only filled when a matching measurement exists, so these columns must
# be nullable; only the id and the owning item number are always present.
packages_schema = StructType([
    StructField("id", LongType(), nullable=False),
    StructField("products_item_number", StringType(), nullable=False),
    StructField("width", FloatType(), nullable=True),
    StructField("length", FloatType(), nullable=True),
    StructField("height", FloatType(), nullable=True),
    StructField("weight", FloatType(), nullable=True),
    StructField("distance_units", StringType(), nullable=True),
    StructField("weight_units", StringType(), nullable=True)
])


# Create the products DataFrame from the processed list
products_df = spark.createDataFrame(products_list, schema=products_schema)
# TODO: the packages DataFrame is still disabled. Before enabling it, make sure
# every width/length/height/weight value in `packages_list` is a float or None.
#packages_df = spark.createDataFrame(packages_list, schema=packages_schema)

# Show the resulting DataFrames
products_df.show()
#packages_df.show()

In [84]:
# Persist both record lists as Parquet files in the relative "output" directory.
# No explicit schema is passed; pandas derives the columns from the dict keys.
# (`pd` is imported in the import cell at the top of the notebook.)
pd.DataFrame(products_list).to_parquet(os.path.join("output", "products.parquet"))
pd.DataFrame(packages_list).to_parquet(os.path.join("output", "packages.parquet"))