In [None]:
from jsonschema import validate, ValidationError
import yaml

def validate_yaml_schema(data, schema_path="dataset_schema.json"):
    """Validate already-parsed data against a schema file stored on disk.

    The schema file is parsed with ``yaml.safe_load`` and then handed to
    ``jsonschema.validate`` together with ``data``. A success or failure
    message is printed either way.

    Args:
        data: The parsed document to check.
        schema_path: Path to the schema file.

    Raises:
        ValidationError: Re-raised (after being printed) when ``data`` does
            not conform to the schema.
    """
    # Decode explicitly as UTF-8 so the parsed schema does not depend on the
    # platform's locale encoding.
    with open(schema_path, 'r', encoding="utf-8") as schema_file:
        schema = yaml.safe_load(schema_file)
    try:
        validate(instance=data, schema=schema)
        print("YAML file is valid.")
    except ValidationError as e:
        # Show the failure in the cell output, then let it propagate so the
        # notebook run stops here.
        print(f"Validation error: {e}")
        raise

# Parse the custom-properties file and validate it against the schema.
# UTF-8 is requested explicitly so decoding does not vary with the
# platform's locale encoding.
with open("custom_properties.yaml", 'r', encoding="utf-8") as file:
    dataset_data = yaml.safe_load(file)

validate_yaml_schema(dataset_data, schema_path="validation_schema.yaml")


YAML file is valid.


In [4]:
import os
import yaml
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.emitter.rest_emitter import DatahubRestEmitter
from datahub.metadata.schema_classes import DatasetPropertiesClass, ChangeTypeClass

# Build the REST emitter from connection settings held in environment variables
# (either value is None when its variable is unset).
DATAHUB_SERVER_URL = os.environ.get("DATAHUB_SERVER_URL")
DATAHUB_TOKEN = os.environ.get("DATAHUB_TOKEN")
emitter = DatahubRestEmitter(
    gms_server=DATAHUB_SERVER_URL,
    token=DATAHUB_TOKEN,
)

# Load datasets and custom properties from the YAML file
def load_datasets(file_path):
    """Parse the YAML file at ``file_path`` and return the resulting object.

    Args:
        file_path: Path to the YAML file to read.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file's content.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding.
    with open(file_path, 'r', encoding="utf-8") as file:
        return yaml.safe_load(file)

dataset_data = load_datasets("custom_properties.yaml")

# Walk every entry under "datasets" and emit one proposal per entry.
# NOTE(review): an UPSERT of this aspect presumably replaces the whole stored
# aspect, not just its custom properties — confirm against the DataHub docs.
for entry in dataset_data['datasets']:
    urn = entry['urn']
    custom_properties = entry['customProperties']

    print(f"Processing dataset: {urn}")
    print(f"Custom properties: {custom_properties}")

    # Wrap the custom properties in a dataset-properties aspect
    properties_aspect = DatasetPropertiesClass(customProperties=custom_properties)

    # Build the change proposal targeting this dataset URN
    proposal = MetadataChangeProposalWrapper(
        entityType="dataset",
        entityUrn=urn,
        aspect=properties_aspect,
        changeType=ChangeTypeClass.UPSERT,
    )

    # Send the proposal through the REST emitter
    print(f"Emitting MCP for {urn}...")
    emitter.emit_mcp(proposal)

print("All custom properties emitted successfully!")


Processing dataset: urn:li:dataset:(urn:li:dataPlatform:postgres,nrs_demo.public.babies_first_names_23_full_lists_girls,PROD)
Custom properties: {'PublicationDate': '2024-01-01', 'TimePeriod': '2025', 'Supplier': 'NRS'}
Emitting MCP for urn:li:dataset:(urn:li:dataPlatform:postgres,nrs_demo.public.babies_first_names_23_full_lists_girls,PROD)...
Processing dataset: urn:li:dataset:(urn:li:dataPlatform:postgres,nrs_demo.public.babies_first_names_23_full_lists_boys,PROD)
Custom properties: {'PublicationDate': '2024-01-01', 'TimePeriod': '2023', 'Supplier': 'NRS'}
Emitting MCP for urn:li:dataset:(urn:li:dataPlatform:postgres,nrs_demo.public.babies_first_names_23_full_lists_boys,PROD)...
All custom properties emitted successfully!
