# Running a notebook in the cluster

## Setup

### GCP authentication

In [None]:
# Activate a service account in gcloud, using the key file referenced by
# GOOGLE_APPLICATION_CREDENTIALS.
# NOTE(review): presumably this is resolved by the shell as an environment
# variable — confirm it is set in the notebook server's environment.
!gcloud auth activate-service-account --key-file=${GOOGLE_APPLICATION_CREDENTIALS}

### Install and import required packages

In [None]:
%%capture pip_install_out

!pip install pip --upgrade
!pip install -r ../mlpipeline_utils/requirements.txt --upgrade

In [None]:
# Enable the autoreload extension in mode 2, so that edits to imported modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import itertools
import logging
import os
import sys
# Extend the module search path before the local package import below.
# NOTE(review): the directory added is '../mlpipeline_utils' while the import
# is `mlpipeline_utils.kfp_components` — presumably the package lives one level
# inside that directory; confirm.
sys.path.append('../mlpipeline_utils')
import uuid

import kfp
import kfp.gcp

# NOTE(review): star import; presumably the source of `run_notebook_comp` used
# by the pipeline definition further down — confirm, and consider importing
# the needed names explicitly.
from mlpipeline_utils.kfp_components import *


# Set the root logger's level to INFO.
logging.getLogger().setLevel(logging.INFO)

In [None]:
PROJECT_ID = !gcloud config get-value project
PROJECT_ID = PROJECT_ID[0]
DEV_BUCKET = f'gs://{PROJECT_ID}-dev'
!gsutil mb -b on {DEV_BUCKET}
BASE_GCS_PATH = os.path.join(DEV_BUCKET, 'notebooks')

## Running the notebook in the cluster

### Define a boilerplate Kubeflow pipeline to run the notebook in the cluster

In [None]:
# Single-step pipeline: one `run_notebook_comp` task that receives the input
# notebook path, the output notebook path and a papermill option string.
# Documented with comments rather than a docstring so the function's __doc__
# stays exactly as it was.
@kfp.dsl.pipeline()
def run_notebook_pipeline(
    notebook_path: str,
    output_notebook_path: str,
    papermill_options: str='',
):
    # Base image for the step, taken from the current project's registry.
    papermill_image = f'gcr.io/{PROJECT_ID}/kubeflow-jupyter'

    notebook_task = run_notebook_comp(
        notebook_path=notebook_path,
        output_notebook_path=output_notebook_path,
        papermill_base_image=papermill_image,
        papermill_options=papermill_options,
    )

    # Attach the 'user-gcp-sa' secret to the step
    # (presumably the GCP service-account credentials — confirm).
    notebook_task = notebook_task.apply(kfp.gcp.use_gcp_secret('user-gcp-sa'))

    # CPU request for the step; the GPU limit is left disabled.
    notebook_task = notebook_task.set_cpu_request('2')
    # notebook_task = notebook_task.set_gpu_limit(1, 'nvidia')

### Copy the local notebook to GCS and run the notebook in the cluster

In [None]:
# Local notebook file that is copied to GCS and run in the cluster; its name is
# also used as the suffix of the downloaded result file.
notebook_name = 'sample_notebook.ipynb'

In [None]:
# notebook parameters
parameters = {
    'RUN_LOCAL': True
}

# boilerplate code to trigger the notebook run
run_id = str(uuid.uuid4()).replace('-', '')

# input notebook
notebook_gcs_path = os.path.join(BASE_GCS_PATH, 'temp', run_id, 'input.ipynb')
!gsutil cp {notebook_name} {notebook_gcs_path}

# output notebook
output_notebook_gcs_path = os.path.join(
    BASE_GCS_PATH, 'temp', run_id, 'output.ipynb'
)

# create the pipeline run
kfp.Client().create_run_from_pipeline_func(
    run_notebook_pipeline,
    arguments={
        'notebook_path': notebook_gcs_path,
        'output_notebook_path': output_notebook_gcs_path,
        'papermill_options': ' '.join(
            itertools.chain.from_iterable(
                ['-p', str(k), str(v)] for k, v in parameters.items()
            )
        )
    },
    run_name=f'run_notebook_{run_id}'
)

### Retrieve the resulting notebook once it's done

In [None]:
# Copy the output notebook from GCS into the working directory as
# `<run_id>_<notebook_name>`.
# NOTE(review): presumably the object only exists once the pipeline run has
# finished; this cell does not wait for the run — confirm before running it.
!gsutil cp {output_notebook_gcs_path} {run_id}_{notebook_name}