# In [None]:
import os
import yaml
from dotenv import load_dotenv
import datahub.emitter.mce_builder as builder
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.emitter.rest_emitter import DatahubRestEmitter
from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
    DatasetLineageType,
    FineGrainedLineage,
    FineGrainedLineageDownstreamType,
    FineGrainedLineageUpstreamType,
    Upstream,
    UpstreamLineage,
)

# Read settings via python-dotenv, then look up the DataHub server URL
# (os.environ.get yields None when the variable is not set)
load_dotenv()
DATAHUB_SERVER_URL = os.environ.get("DATAHUB_SERVER_URL")

# REST emitter pointed at the GMS endpoint configured above
emitter = DatahubRestEmitter(gms_server=DATAHUB_SERVER_URL)

# Helper functions for URNs
def dataset_urn(platform, name, env):
    """Build the URN string identifying a dataset.

    Args:
        platform: Platform identifier, placed after ``urn:li:dataPlatform:``.
        name: Dataset name as it should appear in the URN.
        env: Environment segment of the URN (for example ``"PROD"``).

    Returns:
        A string of the form
        ``urn:li:dataset:(urn:li:dataPlatform:<platform>,<name>,<env>)``.
    """
    platform_urn = f"urn:li:dataPlatform:{platform}"
    return "urn:li:dataset:({},{},{})".format(platform_urn, name, env)

def field_urn(dataset_urn, field):
    """Return the schema-field URN for ``field`` inside a dataset.

    Delegates to ``builder.make_schema_field_urn``.

    Args:
        dataset_urn: URN string of the dataset that owns the field. Inside
            this function the name shadows the module-level ``dataset_urn``
            helper; it is kept because it is part of the signature.
        field: Name (path) of the field within that dataset.

    Returns:
        The value produced by ``builder.make_schema_field_urn`` for the pair.
    """
    parent_urn = dataset_urn
    schema_field_urn = builder.make_schema_field_urn(parent_urn, field)
    return schema_field_urn

# Load lineage data from the YAML file
def load_lineage(file_path):
    """Parse a lineage definition YAML file and return its contents.

    Args:
        file_path: Path to the YAML file to read.

    Returns:
        The document as parsed by ``yaml.safe_load`` (``None`` when the file
        contains no document).

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the content is not valid YAML.
    """
    # Decode explicitly as UTF-8: without an encoding argument open() falls
    # back to the locale's preferred encoding, which can misread non-ASCII
    # dataset or field names on platforms whose default is not UTF-8.
    with open(file_path, 'r', encoding='utf-8') as file:
        return yaml.safe_load(file)

lineage_data = load_lineage("lineage.yaml")

# Aggregate sources by target: each target dataset URN maps to the upstream
# datasets and the column-level mappings collected for it, so that a single
# combined lineage aspect can be built per target afterwards.
target_lineages = {}
# (target URN, source URN) pairs already recorded as dataset-level upstreams
seen_upstreams = set()
for lineage in lineage_data['lineages']:
    target_urn = dataset_urn(lineage['target']['platform'], lineage['target']['dataset'], "PROD")
    source_urn = dataset_urn(lineage['source']['platform'], lineage['source']['dataset'], "PROD")

    collected = target_lineages.setdefault(
        target_urn,
        {'upstreams': [], 'fine_grained_lineages': []},
    )

    # Record each source dataset once per target, even when the YAML lists
    # the same source/target pair in more than one entry.
    if (target_urn, source_urn) not in seen_upstreams:
        seen_upstreams.add((target_urn, source_urn))
        collected['upstreams'].append(
            Upstream(dataset=source_urn, type=DatasetLineageType.TRANSFORMED)
        )

    # An entry may describe dataset-level lineage only: treat a missing or
    # null 'field_mappings' as "no column mappings" instead of failing.
    for mapping in lineage.get('field_mappings') or []:
        source_field_urn = field_urn(source_urn, mapping['source_field'])
        target_field_urn = field_urn(target_urn, mapping['target_field'])

        collected['fine_grained_lineages'].append(
            FineGrainedLineage(
                upstreamType=FineGrainedLineageUpstreamType.FIELD_SET,
                upstreams=[source_field_urn],
                downstreamType=FineGrainedLineageDownstreamType.FIELD,
                downstreams=[target_field_urn],
            )
        )

# Send one UpstreamLineage aspect per target dataset, carrying every upstream
# dataset and column-level mapping gathered for that target.
# NOTE(review): writing this aspect presumably replaces any lineage already
# stored for the target — confirm before running against shared metadata.
for target_urn, data in target_lineages.items():
    print(f"Processing lineage for target {target_urn}")

    lineage_mcp = MetadataChangeProposalWrapper(
        entityUrn=target_urn,
        aspect=UpstreamLineage(
            upstreams=data['upstreams'],
            fineGrainedLineages=data['fine_grained_lineages'],
        ),
    )

    print("Emitting lineage MCP...")
    emitter.emit_mcp(lineage_mcp)

print("Lineage processing complete!")