# Running a notebook in the cluster using papermill 

## Setup

### GCP authentication

In [1]:
!gcloud auth activate-service-account --key-file=${GOOGLE_APPLICATION_CREDENTIALS}

Activated service account credentials for: [kubeflow-user@kubeflow-demo-256908.iam.gserviceaccount.com]


### Install and import required packages

In [2]:
%%capture pip_install_out

# The cell output is captured into `pip_install_out` instead of being shown.
# Upgrade pip itself first, then install/upgrade the packages listed in the
# mlpipeline_utils requirements file.
!pip install pip --upgrade
!pip install -r mlpipeline_utils/requirements.txt --upgrade

In [3]:
import itertools
import logging
import os
import sys
sys.path.append('../mlpipeline_utils')
import uuid

import kfp
import kfp.gcp

from mlpipeline_utils.kfp_components import *


# Raise the root logger to INFO so informational messages are emitted.
root_logger = logging.getLogger()
root_logger.setLevel(logging.INFO)

In [4]:
PROJECT_ID = !gcloud config get-value project
PROJECT_ID = PROJECT_ID[0]
DEV_BUCKET = f'gs://{PROJECT_ID}-dev'
!gsutil mb -b on {DEV_BUCKET}
BASE_GCS_PATH = os.path.join(DEV_BUCKET, 'notebooks')

Creating gs://kubeflow-demo-256908-dev/...
ServiceException: 409 Bucket kubeflow-demo-256908-dev already exists.


## Papermill

### Select the base image to be used for papermill

In [5]:
# Container image for the papermill step, taken from this project's registry.
PAPERMILL_BASE_IMAGE = 'gcr.io/{}/kfp-base'.format(PROJECT_ID)

### Configure input/output notebooks for papermill

In [6]:
# GCS locations of the notebook to execute and of the executed copy.
# Joined with an explicit '/' because these are GCS URIs, not local paths;
# os.path.join would use the local OS separator.
PAPERMILL_DEMO_GCS_PATH = f'{BASE_GCS_PATH}/papermill_demo.ipynb'
PAPERMILL_DEMO_OUTPUT_GCS_PATH = f'{BASE_GCS_PATH}/papermill_demo.output.ipynb'

### Define a boilerplate Kubeflow pipeline that runs the notebook using papermill

In [7]:
@kfp.dsl.pipeline()
def papermill_pipeline(
    papermill_options: str = '',
):
    """Single-step pipeline that runs the demo notebook through papermill.

    The step is built by ``papermill_comp`` from the notebook locations and
    base image defined in this notebook's module-level constants.

    Args:
        papermill_options: Option string passed through to ``papermill_comp``
            (this notebook builds it as ``-p name value`` pairs).
    """
    # Parenthesised chain instead of trailing backslashes, so toggling the
    # optional GPU line below can never leave a dangling line continuation.
    (
        papermill_comp(
            notebook_path=PAPERMILL_DEMO_GCS_PATH,
            output_notebook_path=PAPERMILL_DEMO_OUTPUT_GCS_PATH,
            papermill_base_image=PAPERMILL_BASE_IMAGE,
            papermill_options=papermill_options,
        )
        # Apply the 'user-gcp-sa' GCP secret to the step.
        .apply(kfp.gcp.use_gcp_secret('user-gcp-sa'))
        .set_cpu_request('2')
        # Uncomment to request one NVIDIA GPU for the step:
        # .set_gpu_limit(1, 'nvidia')
    )

### Copy the local notebook to GCS and run the papermill pipeline

In [8]:
!gsutil cp papermill_demo.ipynb {PAPERMILL_DEMO_GCS_PATH}
run_id = str(uuid.uuid4()).replace('-', '')
parameters = {
    'x': 10,
    'y': 200
}
kfp.Client().create_run_from_pipeline_func(
    papermill_pipeline,
    arguments={
        'papermill_options': ' '.join(
            itertools.chain.from_iterable(
                ['-p', str(k), str(v)] for k, v in parameters.items()
            )
        )
    },
    run_name=f'papermill_demo_{run_id}'
)

Copying file://papermill_demo.ipynb [Content-Type=application/octet-stream]...
/ [1 files][ 17.0 KiB/ 17.0 KiB]                                                
Operation completed over 1 objects/17.0 KiB.                                     


<kfp._client.Client.create_run_from_pipeline_package.<locals>.RunPipelineResult at 0x7f04df79a898>

### Retrieve the output notebook once the pipeline run has finished

In [9]:
# Download the notebook stored at PAPERMILL_DEMO_OUTPUT_GCS_PATH to the
# working directory.
# NOTE(review): nothing in this cell waits for the pipeline run, so execute it
# only after the run has finished — otherwise the object may not exist yet.
!gsutil cp {PAPERMILL_DEMO_OUTPUT_GCS_PATH} papermill_demo.output.ipynb

Copying gs://kubeflow-demo-256908-dev/notebooks/papermill_demo.output.ipynb...
/ [1 files][169.1 KiB/169.1 KiB]                                                
Operation completed over 1 objects/169.1 KiB.                                    
