In [39]:
# import libraries
import pandas as pd
import numpy as np

In [40]:
# Load the raw product records from the parquet export, reading it with the pyarrow engine.
df = pd.read_parquet(
    'product_deduplication.parquet',
    engine='pyarrow',
)

In [41]:
def array_to_str(arr):
    """Return a plain-string form of ``arr``.

    A NumPy array becomes its elements, each passed through ``str`` and
    joined with ", ". Any other value is returned as ``str(arr)``.
    The notebook applies this to the 'product_name' column so the column
    can be used as a grouping key.
    """
    if not isinstance(arr, np.ndarray):
        return str(arr)
    return ", ".join(str(element) for element in arr)

def set_key(dictionary, key, value, duplication=False):
    """Store ``value`` under ``key`` in ``dictionary``, accumulating repeats.

    The dictionary is modified in place; nothing is returned.

    - First value for a key: stored as-is.
    - Later values: the entry is promoted to a list (or the existing list is
      extended) so that every value seen for the key is kept.

    Args:
        dictionary: Mapping to update in place.
        key: Key to insert or extend.
        value: Value to record under ``key``.
        duplication: When truthy, repeated values are kept. When falsy, a
            value already stored for the key (compared with ``in``, i.e.
            identity or ``==``) is not stored a second time.
    """
    if key not in dictionary:
        dictionary[key] = value
        return

    existing = dictionary[key]
    if isinstance(existing, list):
        if duplication or value not in existing:
            existing.append(value)
    elif duplication or value not in (existing,):
        # Promote the single stored value to a list only when the new value
        # really has to be kept; without this check a repeated scalar would
        # turn into [v, v] even though duplicates were not requested.
        dictionary[key] = [existing, value]

def merge_dicts(data, accept_duplicates=False):
    """Merge the dictionaries held in a pandas Series into one dictionary.

    Each non-null element of ``data`` may be a dictionary, or a NumPy array
    or list of dictionaries. Every dictionary found is merged key by key
    through ``set_key``, so a key seen with several values ends up holding
    a list of them.

    Args:
        data: pandas Series to merge (null entries are dropped first).
        accept_duplicates: Forwarded to ``set_key``; when true, repeated
            values for a key are kept instead of being collapsed.

    Returns:
        dict: The merged dictionary; empty when no dictionaries were found.
    """
    final_dict = {}
    for item in data.dropna():
        if isinstance(item, dict):
            records = [item]
        elif isinstance(item, (np.ndarray, list)):
            # Walk every entry of the container: taking only the first one
            # would silently drop any further dictionaries in the same cell.
            records = item
        else:
            continue
        for record in records:
            if isinstance(record, dict):  # Ignore anything that is not a dictionary
                for key, value in record.items():
                    set_key(final_dict, key, value, accept_duplicates)
    return final_dict

def merge_lists(data):
    """Flatten a pandas Series into one list of unique values.

    Null entries are dropped. NumPy arrays (converted with ``tolist``) and
    lists contribute their elements; any other value is added as a single
    element. Duplicates are removed and the first occurrence of each value
    keeps its position, so the result is the same on every run.

    Args:
        data: pandas Series whose elements are scalars, lists or NumPy arrays.

    Returns:
        list: Unique values in first-seen order.
    """
    flattened = []
    for item in data.dropna():
        if isinstance(item, np.ndarray):
            flattened.extend(item.tolist())
        elif isinstance(item, list):
            flattened.extend(item)
        else:
            flattened.append(item)

    try:
        # dict keys are unique and keep insertion order.
        return list(dict.fromkeys(flattened))
    except TypeError:
        # Some element is unhashable (e.g. a dict or nested list):
        # fall back to an equality-based scan.
        unique = []
        for element in flattened:
            if element not in unique:
                unique.append(element)
        return unique

In [42]:
# Convert 'product_name' to a plain string so it can serve as the grouping key.
# NOTE(review): array_to_str falls back to str(), so a missing name would be turned
# into its text form (e.g. 'None') and grouped like a real product — confirm the
# column contains no nulls.
df['product_name'] = df['product_name'].apply(array_to_str)

# Collapse all rows that share a 'product_name' into a single row.
# List-like columns are flattened and de-duplicated with merge_lists; dictionary
# columns are merged key by key with merge_dicts ('price' keeps repeated values).
# Each column is listed exactly once.
df_merged = df.groupby("product_name").agg({
    "applicability": merge_lists,
    "unspsc": merge_lists,
    "root_domain": merge_lists,
    "page_url": merge_lists,
    "product_summary": merge_lists,
    "product_title": merge_lists,
    "product_identifier": merge_lists,
    "brand": merge_lists,
    "intended_industries": merge_lists,
    "eco_friendly": merge_lists,
    "ethical_and_sustainability_practices": merge_lists,
    "production_capacity": merge_dicts,
    "price": lambda x: merge_dicts(x, True),
    "materials": merge_lists,
    "ingredients": merge_lists,
    "manufacturing_countries": merge_lists,
    "manufacturing_year": merge_lists,
    "manufacturing_type": merge_lists,
    "customization": merge_lists,
    "packaging_type": merge_lists,
    "form": merge_lists,
    "size": merge_dicts,
    "color": merge_dicts,
    "purity": merge_dicts,
    "energy_efficiency": merge_dicts,
    "pressure_rating": merge_dicts,
    "power_rating": merge_dicts,
    "quality_standards_and_certifications": merge_lists,
    "miscellaneous_features": merge_lists,
    "description": merge_lists,
}).reset_index()

In [43]:
# Write the merged table to CSV, leaving out the positional index column.
df_merged.to_csv(
    'final_export.csv',
    index=False,
)

In [25]:
# Spot-check: price entries of one product in the un-merged frame.
df.loc[df["product_name"].eq('3D Massage Chairs'), 'price']

412     [{'amount': 9999.0, 'currency': 'USD', 'type':...
6985    [{'amount': 3999.0, 'currency': 'USD', 'type':...
8831    [{'amount': 3999.0, 'currency': 'USD', 'type':...
Name: price, dtype: object

In [37]:
# Spot-check: price entry of the same product in the merged frame.
df_merged.loc[df_merged["product_name"].eq('3D Massage Chairs'), 'price']

84    {'amount': [9999.0, 3999.0], 'currency': ['USD...
Name: price, dtype: object