In [1]:
import ast
import json

import pandas as pd

# Load dataset
# NOTE: the path is a hard-coded absolute Windows location; it must be
# adjusted before running this on another machine.
df = pd.read_csv(r"E:\chrome\veridion_product_deduplication_full.csv")

# Helper functions
def merge_text_fields(series):
    """Join the distinct, non-blank text values of a column into one string.

    Nulls are dropped, each remaining value is stringified and stripped,
    blank results are discarded, and the unique survivors are sorted and
    joined with ' || '. If anything raises along the way, None is returned.
    """
    try:
        distinct = set()
        for value in series.dropna():
            text = str(value).strip()
            if text:
                distinct.add(text)
        return ' || '.join(sorted(distinct))
    except Exception:
        return None

def _parse_list_entry(entry):
    """Return the value encoded by *entry*, or None if it cannot be parsed."""
    if isinstance(entry, list):
        # Already a list object; nothing to deserialize.
        return entry
    try:
        # Python-literal syntax first, so every entry that parsed before
        # still parses to exactly the same value.
        return ast.literal_eval(entry)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        pass
    try:
        # Fall back to JSON, which covers null / true / false payloads that
        # literal_eval rejects.
        return json.loads(entry)
    except (ValueError, TypeError, RecursionError):
        return None


def merge_json_lists(series):
    """Merge list-of-dict or list-of-values from serialized list strings.

    Each non-null entry is parsed as a Python literal and, failing that, as
    JSON, so payloads containing ``null``/``true``/``false`` are merged
    instead of being silently dropped. Entries that are already list objects
    are used as-is. Anything that does not yield a list is ignored.

    Args:
        series: pandas Series of serialized lists (or list objects).

    Returns:
        A list of the merged items with duplicates removed (items are
        compared by their ``repr``), in order of first appearance.
    """
    merged_items = []
    for entry in series.dropna():
        parsed = _parse_list_entry(entry)
        if isinstance(parsed, list):
            merged_items.extend(parsed)
    # Keying on repr() lets unhashable items such as dicts be de-duplicated.
    return list({repr(item): item for item in merged_items}.values())

def merge_categorical(series):
    """Pick the mode of the non-null values, or None when there are none.

    When several values tie, the first entry of pandas' mode result is used.
    """
    present = series.dropna()
    if present.empty:
        return None
    return present.mode().iloc[0]

def merge_numeric(series):
    """Return the most frequent numeric value, or None when there is none.

    Missing values are ignored by ``Series.mode``. When several values tie,
    the first entry of the mode result is returned. Input whose mode cannot
    be computed (for example an object without a ``mode`` method) yields
    None.
    """
    try:
        modes = series.mode()
    except Exception:
        # Best-effort: unusable input gives None. Unlike a bare ``except``,
        # this lets KeyboardInterrupt / SystemExit propagate.
        return None
    # An empty or all-null series has no mode.
    return modes.iloc[0] if not modes.empty else None

# Deduplication process
# Collapse all rows that share a product_name into one consolidated record.
# Rows whose product_name is null are not part of any group and are left out.
all_consolidated_entries = []
all_product_names = df['product_name'].dropna().unique()

# Output column -> helper that merges that column's values within a group.
# The order here is the column order of the output file (after product_name).
column_mergers = {
    'product_title': merge_text_fields,
    'product_summary': merge_text_fields,
    'description': merge_text_fields,
    'brand': merge_text_fields,
    'unspsc': merge_categorical,
    'root_domain': merge_text_fields,
    'page_url': merge_text_fields,
    'product_identifier': merge_text_fields,
    'intended_industries': merge_json_lists,
    'applicability': merge_json_lists,
    'eco_friendly': merge_text_fields,
    'ethical_and_sustainability_practices': merge_text_fields,
    'production_capacity': merge_text_fields,
    'price': merge_text_fields,
    'materials': merge_json_lists,
    'ingredients': merge_json_lists,
    'manufacturing_countries': merge_json_lists,
    'manufacturing_year': merge_numeric,
    'manufacturing_type': merge_text_fields,
    'customization': merge_text_fields,
    'packaging_type': merge_text_fields,
    'form': merge_json_lists,
    'size': merge_json_lists,
    'color': merge_json_lists,
    'purity': merge_json_lists,
    'energy_efficiency': merge_text_fields,
    'pressure_rating': merge_json_lists,
    'power_rating': merge_json_lists,
    'quality_standards_and_certifications': merge_json_lists,
    'miscellaneous_features': merge_json_lists,
}

# Group the rows by product name once, so each group is fetched by lookup
# instead of re-scanning the whole DataFrame with a boolean mask for every
# distinct name.
rows_by_name = df.groupby('product_name', sort=False)

for pname in all_product_names:
    group = rows_by_name.get_group(pname)

    merged_entry = {'product_name': pname}
    for column, merger in column_mergers.items():
        merged_entry[column] = merger(group[column])

    all_consolidated_entries.append(merged_entry)

# Create and save the deduplicated dataset
deduplicated_df = pd.DataFrame(all_consolidated_entries)
deduplicated_df.to_csv("veridion_deduplicated_full_output.csv", index=False)
print("Deduplicated dataset saved as 'veridion_deduplicated_full_output.csv'")


Deduplicated dataset saved as 'veridion_deduplicated_full_output.csv'
