# Clean notebooks

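Outputs are removed from notebooks so that cleaner notebooks are available for `git diff`. Which cells and outputs are removed is controlled by cell tags (see the configuration cell below).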

In [1]:
# Cell tag that selects the cells whose matching outputs get tagged for removal.
# Format is "<condition>:<mime type>"; add this exact string as a [cell tag]
# in the target notebooks.
TAG_CONDITION = "remove_when_contains:image"

# Tag strings handed to nbconvert's TagRemovePreprocessor, keyed by the kind
# of removal each one requests.
tag_dict = dict(
    remove_all_outputs="nbconvert_instruction:remove_all_outputs",
    remove_single_output="nbconvert_instruction:remove_single_output",
    remove_cell="nbconvert_instruction:remove_full_cell",
)

# Same value the driver cells pass to clean_nb as `rel_backup_dir`
# (interpreted relative to the notebook being cleaned).
BACKUP_DIR = "../.notebook-cleaning/"

In [None]:
import os
import re
import shutil

import nbformat as nbf
from nbconvert.exporters import NotebookExporter
from nbconvert.preprocessors import TagRemovePreprocessor
from traitlets.config import Config


def clean_nb(
    fn,                    # notebook
    tags = tag_dict,       # dict with instructions
    rel_backup_dir = '.',  # location where to write temp files and backups relative! to location of fn
    VERBOSE = True,        # Turn verbosity on/off
    DRYRUN = False,        # Do a dry run for debugging
):
    '''
    Remove tagged cells/outputs from a notebook and overwrite it in place.

    Steps: back up `fn`, copy it to a working ("smudge") file, tag outputs in
    that copy with `tag_images_for_removal`, export the copy through nbconvert's
    `NotebookExporter` with `TagRemovePreprocessor` enabled, write the result to
    a "clean" file and finally copy the clean file over `fn`.

    Input:
        fn - path of the notebook to clean
        tags - dict with the keys `remove_all_outputs`, `remove_single_output`
            and `remove_cell`; each value is the tag string that triggers the
            corresponding removal
        rel_backup_dir - directory for the backup and temporary files,
            relative to the directory that contains `fn`
        VERBOSE - print file names, the nbconvert config and every step
        DRYRUN - only report what would happen; no file is created, copied
            or exported

    Output:
        None. When DRYRUN is False, `fn` is overwritten and the files
        `*.bup.ipynb`, `*.smudge.ipynb` and `*.clean.ipynb` are left in the
        backup directory.

    Raises:
        AssertionError - `fn` does not exist
    '''

    assert os.path.exists(fn), f'Notebook <{fn}> does not exist.'

    # Create filenames.
    # os.path.join (not string concatenation) so that a bare file name such as
    # 'nb.ipynb' resolves to the current directory instead of to '/.'.
    backup_dir = os.path.relpath(os.path.join(os.path.dirname(fn), rel_backup_dir))
    tmp_name = re.sub(r'\.ipynb$', '', os.path.join(backup_dir, os.path.basename(fn)))  # temp file
    nb_in = tmp_name + '.smudge.ipynb'
    nb_out = tmp_name + '.clean.ipynb'
    nb_bup = tmp_name + '.bup.ipynb'

    if VERBOSE:
        print('-- File names')
        print('input:\t\t', fn)
        print('backup:\t\t',nb_bup)
        print('to be cleaned:\t', nb_in)
        print('cleaned:\t',nb_out)

    # Back-up original
    if VERBOSE:
        print(f'backup:\n\t{fn} ->\n\t{nb_bup}')
    if not DRYRUN:
        # The backup directory may not exist yet on a fresh checkout.
        os.makedirs(backup_dir, exist_ok=True)
        # copy2 keeps mode and timestamps and copes with spaces in paths.
        shutil.copy2(fn, nb_bup)

    # Setup config
    c = Config()

    # Configure tag removal: cells/outputs carrying one of these tags are
    # dropped by TagRemovePreprocessor. The tag strings come from `tags`.
    c.TagRemovePreprocessor.remove_all_outputs_tags = (tags['remove_all_outputs'],)
    c.TagRemovePreprocessor.remove_single_output_tags = (tags['remove_single_output'],)
    c.TagRemovePreprocessor.remove_cell_tags = (tags['remove_cell'],)
    c.TagRemovePreprocessor.enabled = True

    if VERBOSE:
        print('-- Config')
        display(c)

    # Prepare input
    if VERBOSE:
        print(f'smudge:\n\t{fn} ->\n\t{nb_in}')
    if not DRYRUN:
        shutil.copy2(fn, nb_in)

    # Tag images
    if VERBOSE:
        print(f'tag outputs with images for removal:\n\t{nb_in} ->\n\t{nb_in}')
    if not DRYRUN:
        tag_images_for_removal(nb_in, nb_in)

    # Process with NotebookExporter
    if VERBOSE:
        print(f'clean (process):\n\t{nb_in} ->\n\t{nb_out}')
    if not DRYRUN:
        # The export reads nb_in, which only exists outside a dry run, so it
        # has to live inside the guard as well.
        body, _resources = NotebookExporter(config=c).from_filename(nb_in)
        # Notebooks are UTF-8 JSON; do not depend on the platform default.
        with open(nb_out, "w", encoding="utf-8") as f:
            f.write(body)

    # Overwrite original
    if VERBOSE:
        print(f'save:\n\t{nb_out} ->\n\t{fn}')
    if not DRYRUN:
        shutil.copy2(nb_out, fn)

In [None]:
def tag_mime_output(nb, mime_tag, output_tag):
    
    '''
    Adds a tag to every output whose "data" contains a given MIME type.
    Useful for removing figures from notebooks.
    
    Input:
        nb - notebook (dict-like, with a "cells" list)
        mime_tag - cell tag of the cells that need processing.
            
            The tag can be formatted as [condition]:[mime_type] (e.g. `foo:image/jpeg`)
            or only with the [mime_type] (e.g. `image/jpeg`). When the tag holds
            several `:` the text after the last one is the mime type.
            
            condition - arbitrary name. Only cells that carry the full `mime_tag`
                string as a cell tag are processed. If omitted (or '*') all cells
                are processed.
            
            mime_type - outputs are tagged (for removal) if their data has a key
                containing this text. This can be a partial MIME name (e.g. `image`);
                a substring match is used.
                
        output_tag - tag added to the output metadata when the MIME type matches.
            It is added at most once per output, so repeated calls are harmless.
            
    Output:
        None. The input object is updated in place.
            
    '''

    # handle input
    if ':' in mime_tag:
        # rsplit with maxsplit=1: a condition may itself contain ':' without
        # breaking the unpacking; MIME types never contain one.
        condition, mime_type = mime_tag.rsplit(':', 1)
    else:
        # no `:` separator
        condition, mime_type = '*', mime_tag

    process_all_cells = (condition == '*')

    # loop over cells in notebook
    for cell in nb.get('cells', []):
        cell_tags = cell.get('metadata', {}).get('tags', [])
        if not process_all_cells and mime_tag not in cell_tags:
            # next cell if the cell tag does not match
            continue

        # loop over all outputs
        for output in cell.get('outputs', []):
            # only outputs carrying data of the requested MIME type
            if not any(mime_type in key for key in output.get('data', {})):
                continue

            # tolerate outputs that lack a metadata entry
            if 'metadata' not in output:
                output['metadata'] = {}
            metadata = output['metadata']

            # add to existing tags, without creating duplicates
            tags = list(metadata.get('tags', []))
            if output_tag not in tags:
                tags.append(output_tag)
                metadata['tags'] = tags

                

def tag_images_for_removal(in_file, out_file):
    '''
    Read a notebook, tag outputs for removal and write the notebook back.

    Outputs are selected by `tag_mime_output` using the module-level
    TAG_CONDITION and receive the tag stored under
    tag_dict['remove_single_output'].

    Input:
        in_file - notebook to read
        out_file - where the tagged notebook is written (may equal in_file)
    '''
    import nbformat as nbf

    # load without converting the notebook format version
    notebook = nbf.read(in_file, as_version=nbf.NO_CONVERT)

    # tag the matching outputs in place
    tag_mime_output(
        notebook,
        TAG_CONDITION,
        tag_dict['remove_single_output'],
    )

    # store, again without converting the format version
    nbf.write(notebook, out_file, version=nbf.NO_CONVERT)
    


In [None]:
# Clean every project notebook, listed per pipeline stage
# (scrape -> combine/preprocess -> modelling). Paths are relative to `path`.

path = "."
for short_fn in [
    '00-scrape/scrape-drz-auction-results.ipynb',
    '00-scrape/add-rdw-info-to-drz.ipynb',
    '00-scrape/download-images.ipynb',
    '00-scrape/explore-auction-results.ipynb',
    '10-combine-and-preprocess/aggregate-all-auctions.ipynb',
    '10-combine-and-preprocess/preproc-cars.ipynb',
    '10-combine-and-preprocess/eda-after-merge.ipynb',
    '20-modelling/predict-price.ipynb',
    '20-modelling/classify-pictures.ipynb',
]:
    fn = os.path.join(path, short_fn)
    print(fn)
    # BACKUP_DIR comes from the configuration cell; it is resolved relative to
    # the folder of each notebook.
    clean_nb(fn, rel_backup_dir = BACKUP_DIR, VERBOSE=True)

In [None]:
# Deliberate stop: keeps "Run All" from executing the sandbox cells below.
raise RuntimeError('Below is sandboxing')

In [None]:
# Debug aid: show whichever notebook path `fn` currently holds in the kernel.
fn

In [None]:
# Sandbox: clean a single notebook by hand, with the same backup location
# (BACKUP_DIR from the configuration cell) as the batch loop.
fn = '20-modelling/classify-pictures.ipynb'
clean_nb(fn, rel_backup_dir = BACKUP_DIR, VERBOSE=True)