In [None]:
# TODO Remove this - only people running notebooks in an IDE without the JAVA_HOME env need this.
#
# Fallback only: `setdefault` leaves an already exported JAVA_HOME untouched, so the
# developer-specific path below is used solely when the variable is missing entirely.
import os
os.environ.setdefault("JAVA_HOME", "/home/snazy/devel/openjdk/images/graalvm/jdk-11")

In [None]:
# FOR Google Colaboratory and Binder (mybinder.org):

!pip install -i https://test.pypi.org/simple/ nessiedemo

In [None]:
# Testing the Python package manually/locally:
#
# Run
#     python3 -m build
# from within the pydemolib/ directory, then run this cell to install the wheel that
# build produced into the interpreter running this kernel.

import glob
import os
import subprocess
import sys

# TODO replace this block with a plain package install, once nessiedemo is stable and released on pypi or at least pypi-test
setup_path = "{}/../pydemolib".format(os.getcwd())
wheel_pattern = "{}/dist/nessiedemo-*.whl".format(setup_path)
wheels = glob.glob(wheel_pattern)
if not wheels:
    # Fail with an actionable message instead of a bare IndexError from `[0]`.
    raise FileNotFoundError(
        "no wheel found matching {} - run 'python3 -m build' in {} first".format(wheel_pattern, setup_path)
    )
# glob returns matches in arbitrary order; pick the most recently built wheel so the
# choice is deterministic when several versions are lying around in dist/.
pkg_file = max(wheels, key=os.path.getmtime)

# Capture pip's output as text: without the pipes `result.stdout` / `result.stderr`
# are always None and the failure message below would carry no diagnostics.
result = subprocess.run(
    [sys.executable, "-m", "pip", "install", "--force-reinstall", pkg_file],
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
    universal_newlines=True,
)
if result.returncode != 0:
    raise Exception("pip install failed: exit-code={}, stdout={}, stderr={}".format(result.returncode, result.stdout, result.stderr))

In [None]:
# Prepare the demo environment: this installs the required Python dependencies, downloads
# the sample datasets and downloads + starts the Nessie-Quarkus-Runner.
from nessiedemo.demo import setup_demo

demo_config = "nessie-0.5-iceberg-0.11.yml"
demo_datasets = ["region-nation"]
demo = setup_demo(demo_config, demo_datasets)

# The Spark helper is imported only now, because NessieDemo.prepare() via .start() implicitly
# installs the required dependencies. The call below downloads Spark and hands back the
# SparkSession, the SparkContext, the JVM-gateway and the demo's Spark helper object.
from nessiedemo.spark import spark_for_demo

spark_handles = spark_for_demo(demo)
spark, sc, jvm, demo_spark = spark_handles


In [None]:
# Shell out to the `nessie` command-line tool with the arguments `branch dev`.
# The next cell reads and writes through the ref "dev".
# NOTE(review): presumably this creates the branch `dev` - confirm against the nessie CLI help.
!nessie branch dev

In [None]:
import os

# Spark session bound to the ref "dev"; every read/write below goes through it.
spark_dev = demo_spark.session_for_ref("dev")

dataset = demo.fetch_dataset("region-nation")

# Catalog handle obtained through the JVM gateway, configured for the same ref and a
# warehouse directory underneath the current working directory.
warehouse_uri = 'file://' + os.getcwd() + '/spark_warehouse'
catalog_options = {
    'ref': 'dev',
    'url': 'http://localhost:19120/api/v1',
    "warehouse": warehouse_uri,
}
catalog = jvm.CatalogUtil.loadCatalog(
    "org.apache.iceberg.nessie.NessieCatalog",
    "nessie",
    catalog_options,
    sc._jsc.hadoopConfiguration(),
)

# Short-hands for the JVM-side type classes used by both schemas.
iceberg_types = jvm.Types
nested_field = iceberg_types.NestedField

# --- region table -----------------------------------------------------------------
region_name = jvm.TableIdentifier.parse("testing.region")
region_columns = [
    (1, "R_REGIONKEY", iceberg_types.LongType),
    (2, "R_NAME", iceberg_types.StringType),
    (3, "R_COMMENT", iceberg_types.StringType),
]
region_schema = jvm.Schema([
    nested_field.optional(field_id, column, type_cls.get())
    for field_id, column, type_cls in region_columns
])
region_spec = jvm.PartitionSpec.unpartitioned()

region_table = catalog.createTable(region_name, region_schema, region_spec)

# Load the sample parquet file and write it into the table that was just created.
region_df = spark_dev.read.load(dataset["region.parquet"])
region_writer = region_df.write.format("iceberg").mode("overwrite")
region_writer.save("nessie.testing.region")

# --- nation table -----------------------------------------------------------------
nation_name = jvm.TableIdentifier.parse("testing.nation")
nation_columns = [
    (1, "N_NATIONKEY", iceberg_types.LongType),
    (2, "N_NAME", iceberg_types.StringType),
    (3, "N_REGIONKEY", iceberg_types.LongType),
    (4, "N_COMMENT", iceberg_types.StringType),
]
nation_schema = jvm.Schema([
    nested_field.optional(field_id, column, type_cls.get())
    for field_id, column, type_cls in nation_columns
])

# Unlike the region table, this one gets a partition spec: truncate("N_NAME", 2).
nation_spec_builder = jvm.PartitionSpec.builderFor(nation_schema)
nation_spec = nation_spec_builder.truncate("N_NAME", 2).build()
nation_table = catalog.createTable(nation_name, nation_schema, nation_spec)

nation_df = spark_dev.read.load(dataset["nation.parquet"])
nation_writer = nation_df.write.format("iceberg").mode("overwrite")
nation_writer.save("nessie.testing.nation")