In [1]:
# In a PySpark kernel the `spark` session should already be set; build (or
# reuse) one explicitly here, with the Spark web UI disabled.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .config("spark.ui.enabled", False)
    .getOrCreate()
)

In [2]:
# Initialise the test environment: import the lineage magics class and fetch
# the IPython shell object that the magics are registered on further down.
from dapla.magics import DaplaLineageMagics
from IPython import get_ipython
ipython = get_ipython()

# Provide a mock template from attached json file
import json
def lineage_template_mock(spark_df, input_df_map):
    """Return the mock lineage template stored in ``lineage-template.json``.

    The function is handed to ``DaplaLineageMagics`` as its template provider.
    Both arguments are ignored; they are presumably present only to match the
    call signature that ``DaplaLineageMagics`` uses (TODO confirm).

    Args:
        spark_df: Unused.
        input_df_map: Unused.

    Returns:
        The parsed JSON content of ``lineage-template.json``, which is read
        relative to the current working directory.

    Raises:
        FileNotFoundError: If ``lineage-template.json`` does not exist.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Read the file as UTF-8 explicitly so that non-ASCII text in the template
    # is decoded the same way regardless of the platform's locale encoding.
    with open('lineage-template.json', 'r', encoding='utf-8') as f:
        return json.load(f)

# Register the dapla lineage magics manually: build the magics object around
# the IPython shell and the mock template function, then register it so the
# %lineage_* line magics used in the following cells are available.
magics = DaplaLineageMagics(ipython, lineage_template_mock)
ipython.register_magics(magics)

In [3]:
# Import only the schema types used below rather than a wildcard import, so
# it is clear where each name comes from and the namespace is not polluted.
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

# Create two test dataframes.

# person: string columns 'personidentifikator' and 'kontonummer'.
person_type = StructType([
    StructField('personidentifikator', StringType()),
    StructField('kontonummer', StringType()),
])
person_data = [
    ('1234', '12345555'),
    ('1235', '12347777'),
]
person = spark.createDataFrame(person_data, person_type)

# konto: string column 'kontonummer' and integer column 'innskudd'.
konto_type = StructType([
    StructField('kontonummer', StringType()),
    StructField('innskudd', IntegerType()),
])
konto_data = [
    ('12345555', 25000),
    ('12347777', 120000),
]
konto = spark.createDataFrame(konto_data, konto_type)

# Create a third dataframe by inner-joining the other two on the shared
# 'kontonummer' column.
innskudd = person.join(konto, on='kontonummer', how='inner')

In [12]:
# Register the two source datasets as lineage inputs, passing each dataset
# path together with the dataframe's schema serialised as JSON.
# This will be done automatically by spark.read.path.
# NOTE(review): the recorded output of this cell shows "{}" and "/" instead of
# the paths and schemas -- the arguments may not reach the magic as intended;
# confirm after a fresh Restart & Run All.

%lineage_input /skatt/person {person.schema.json()}
%lineage_input /skatt/konto {konto.schema.json()}

{}
/
{}
/


In [None]:
# Register the joined dataframe as a lineage output, passing its path and its
# schema serialised as JSON.
# This will be done automatically by Spark (the original note says
# spark.read.path; presumably the write counterpart is meant -- TODO confirm).
# NOTE(review): this cell has no execution count, i.e. it was not run when the
# notebook was saved.
%lineage_output /skatt/innskudd {innskudd.schema.json()}

In [9]:
# Print the lineage input and output datasets registered so far.
# NOTE(review): the recorded output lists only /skatt/person and an empty
# output entry; the cells were executed out of order and the %lineage_output
# cell was not run, so re-run the notebook top to bottom to refresh it.
%lineage_tree

Input datasets:
 |-- /skatt/person
Output datasets:
 |-- 


In [8]:
# Show the schema of the joined dataframe (the join column comes first,
# followed by the remaining person and konto columns).
innskudd.printSchema()

root
 |-- kontonummer: string (nullable = true)
 |-- personidentifikator: string (nullable = true)
 |-- innskudd: integer (nullable = true)

