In [8]:
import io, os, sys, types
from IPython import get_ipython
from nbformat import read
from IPython.core.interactiveshell import InteractiveShell

In [9]:
def find_notebook(fullname, path=None):
    """Locate the ``.ipynb`` file backing a dotted module name.

    Only the last component of ``fullname`` is used as the file stem, so
    ``"foo.bar"`` is looked up as ``bar.ipynb`` inside each entry of ``path``.
    If that file does not exist, underscores in the stem are treated as
    spaces, which lets ``import Foo_Bar`` resolve to ``Foo Bar.ipynb``.

    Args:
        fullname: Fully qualified module name, e.g. ``"pkg.sub.notebook"``.
        path: Optional iterable of directories to search. When it is empty
            or ``None`` the lookup is relative to the current working
            directory.

    Returns:
        The path of the first matching notebook, or ``None`` when no
        searched directory contains one.
    """
    name = fullname.rsplit('.', 1)[-1]
    if not path:
        path = ['']

    # Candidate file names, in priority order. The underscore -> space
    # substitution is applied to the stem only: applying it to the joined
    # path would also rewrite underscores in directory names and make
    # notebooks inside such directories impossible to find.
    candidates = [name + ".ipynb"]
    spaced = name.replace("_", " ") + ".ipynb"
    if spaced != candidates[0]:
        candidates.append(spaced)

    for d in path:
        for filename in candidates:
            nb_path = os.path.join(d, filename)
            if os.path.isfile(nb_path):
                return nb_path
    return None

In [10]:
class NotebookLoader(object):
    """Module loader that runs a Jupyter notebook's code cells as a module."""

    def __init__(self, path=None):
        """Create a loader.

        Args:
            path: Optional list of directories passed to ``find_notebook``
                when resolving a module name to a notebook file.
        """
        self.shell = InteractiveShell.instance()
        self.path = path

    def load_module(self, fullname):
        """Import the notebook matching ``fullname`` and return the module.

        Every code cell is passed through the shell's input transformer and
        then executed in the new module's namespace, in notebook order.

        Args:
            fullname: Fully qualified module name being imported.

        Returns:
            The populated module object, also registered in ``sys.modules``.

        Raises:
            ImportError: If no notebook file can be found for ``fullname``.
            Exception: Anything raised by a notebook cell propagates; in that
                case the ``sys.modules`` entry is rolled back first.
        """
        path = find_notebook(fullname, self.path)
        if not path:
            # Without this guard io.open(None) fails with an unrelated
            # TypeError instead of a proper import failure.
            raise ImportError("no Jupyter notebook found for module %r" % fullname)

        print("importing Jupyter notebook from %s" % path)

        # load the notebook object
        with io.open(path, 'r', encoding='utf-8') as f:
            nb = read(f, 4)

        # create the module and add it to sys.modules, remembering whatever
        # was registered before so a failed import can be undone
        had_previous = fullname in sys.modules
        previous = sys.modules.get(fullname)
        mod = types.ModuleType(fullname)
        mod.__file__ = path
        mod.__loader__ = self
        mod.__dict__['get_ipython'] = get_ipython
        sys.modules[fullname] = mod

        # extra work to ensure that magics that would affect the user_ns
        # actually affect the notebook module's ns
        save_user_ns = self.shell.user_ns
        self.shell.user_ns = mod.__dict__

        try:
            for cell in nb.cells:
                if cell.cell_type == 'code':
                    # transform the input to executable Python
                    code = self.shell.input_transformer_manager.transform_cell(cell.source)
                    # run the code in the module
                    exec(code, mod.__dict__)
        except BaseException:
            # Do not leave a half-initialised module behind: a later
            # ``import`` would silently return it instead of retrying.
            if had_previous:
                sys.modules[fullname] = previous
            else:
                sys.modules.pop(fullname, None)
            raise
        finally:
            self.shell.user_ns = save_user_ns
        return mod

In [11]:
class NotebookFinder(object):
    """Meta-path finder that locates Jupyter notebooks as importable modules.

    Loaders are cached per search path in ``self.loaders`` so that repeated
    imports from the same directories reuse one ``NotebookLoader``.
    """

    def __init__(self):
        # Maps a search-path key (or None for "no path") to its loader.
        self.loaders = {}

    def _loader_for(self, path):
        """Return the cached ``NotebookLoader`` for ``path``, creating it if needed."""
        # lists aren't hashable, so join the entries into one string; an
        # empty or missing path shares the None key
        key = os.path.sep.join(path) if path else None
        if key not in self.loaders:
            self.loaders[key] = NotebookLoader(path)
        return self.loaders[key]

    def find_module(self, fullname, path=None):
        """Legacy finder hook: return a loader for ``fullname`` or ``None``.

        Kept for interpreters and callers that still use the pre-``find_spec``
        protocol.
        """
        nb_path = find_notebook(fullname, path)
        if not nb_path:
            return None
        return self._loader_for(path)

    def find_spec(self, fullname, path=None, target=None):
        """Finder hook used by the modern import system.

        Python 3.12 and later no longer fall back to ``find_module`` for
        meta-path finders, so without this method notebooks cannot be
        imported there.

        Args:
            fullname: Fully qualified name of the module being imported.
            path: Parent package's search path, or ``None`` for a top-level
                import.
            target: Unused; accepted for protocol compatibility.

        Returns:
            A module spec whose loader is the cached ``NotebookLoader`` and
            whose origin is the notebook file, or ``None`` if no notebook
            matches ``fullname``.
        """
        nb_path = find_notebook(fullname, path)
        if not nb_path:
            return None
        # Local import keeps this block self-contained.
        from importlib.util import spec_from_loader
        return spec_from_loader(fullname, self._loader_for(path), origin=nb_path)

In [12]:
# Register the notebook finder with the import system. Finders left over from
# an earlier run of this cell are dropped first (matched by class name, since
# re-running the class cell creates a new class object), so re-executing the
# cell never stacks duplicate finders on sys.meta_path.
sys.meta_path[:] = [
    finder for finder in sys.meta_path
    if type(finder).__name__ != "NotebookFinder"
]
sys.meta_path.append(NotebookFinder())

In [13]:
# List the notebooks below the current directory. Matching on the file name
# avoids the unanchored regex `.ipynb`, where `.` matches any character and
# the pattern can hit anywhere in the path.
!find ./ -name "*.ipynb"

.//Snowpark/SMA_Output/DataFrame_Operations.ipynb
.//validations.ipynb
.//Spark/src/DataFrame_Operations.ipynb


### Load Spark and Snowpark functions from Jupyter Notebooks

In [15]:
from Spark.src.DataFrame_Operations import filter_products as spark_filter_products, first_sport_item as spark_first_sport_item, df as spark_df
from Snowpark.SMA_Output.DataFrame_Operations import filter_products as snowpark_filter_products, first_sport_item as snowpark_first_sport_item, df as sf_df

ImportError: cannot import name 'first_sport_item' from 'Spark.src.DataFrame_Operations' (/Users/kjimenezmorales/Downloads/CheckpointsDemo/Spark/src/DataFrame_Operations.ipynb)

### Set up a SnowparkJobContext instance

In [None]:
from snowflake.snowpark import Session
from pyspark.sql import SparkSession
from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext

# Snowpark session configured with the "checkpoint_demo" connection name.
snowpark_session = (
    Session.builder
    .config("connection_name", "checkpoint_demo")
    .getOrCreate()
)

# Spark session obtained through the builder's getOrCreate().
spark_session = SparkSession.builder.getOrCreate()

# Job context tying both sessions to the job name "demo_job".
# NOTE(review): the trailing positional True is presumably a logging/flag
# option; confirm against SnowparkJobContext's signature.
job_context = SnowparkJobContext(
    snowpark_session,
    spark_session,
    "demo_job",
    True,
)

### Step 3: check_pandera_df_schema_file

In [None]:
from snowflake.snowpark_checkpoints import check_pandera_df_schema_file

# DataFrame produced by the Snowpark notebook's filter_products().
snowpark_df = snowpark_filter_products()

# Schema file location, resolved against the current working directory.
checkpoint_path = os.path.join(
    os.getcwd(),
    "Spark/src/snowpark-filtered_data-schema.json",
)

# Check the DataFrame against the schema file.
check_pandera_df_schema_file(
    df=snowpark_df,
    job_context=job_context,
    file_path=checkpoint_path,
)

### Step 4: check_with_spark

In [None]:
from snowflake.snowpark_checkpoints import check_with_spark
from snowflake.snowpark_checkpoints.spark_migration import SamplingStrategy


def test_spark_first_sport_item():
    """Run the Spark implementation of first_sport_item on the Spark DataFrame."""
    return spark_first_sport_item(spark_df)


@check_with_spark(
    job_context=job_context,
    spark_function=test_spark_first_sport_item,
    sample=100,
    sampling_strategy=SamplingStrategy.RANDOM_SAMPLE,
)
def test_snowpark_first_sport_item():
    """Run the Snowpark implementation of first_sport_item on the Snowpark DataFrame.

    Wrapped by check_with_spark, which is given the Spark counterpart above
    as ``spark_function``.
    """
    return snowpark_first_sport_item(sf_df)


test_snowpark_first_sport_item()