# Clean notebooks

Outputs are removed from notebooks to make a cleaner notebook available for git diff.

In [1]:
# Tag that points to cells where all outputs are removed
# Set this string as [cell tag] in target notebooks

TAG_CONDITION = "remove_when_contains:image"

tag_dict = {
    'remove_all_outputs': "nbconvert_instruction:remove_all_outputs",
    'remove_single_output': "nbconvert_instruction:remove_single_output", 
    'remove_cell': "nbconvert_instruction:remove_full_cell", 
}

In [2]:
from traitlets.config import Config
import nbformat as nbf
from nbconvert.exporters import NotebookExporter
from nbconvert.preprocessors import TagRemovePreprocessor
import os



def clean_nb(fn, tags = tag_dict, VERBOSE = True):
    nb_in = fn.replace('.ipynb', '.smudge.ipynb')
    nb_out = nb_in.replace('smudge.ipynb', 'clean.ipynb')
    nb_bup = fn.replace('.ipynb', '.bup.ipynb')
    
    if VERBOSE:
        print('-- File names')
        print('input:\t\t', fn)
        print('backup:\t\t',nb_bup)
        print('to be cleaned:\t', nb_in)
        print('cleaned:\t',nb_out)
        
    # Back-up original
    if VERBOSE:
        print(f'backup:\n\t{fn} ->\n\t{nb_bup}')
    !cp -p {fn} {nb_bup}
    
    # Setup config
    c = Config()

    # Configure tag removal - be sure to tag your cells to remove  using the
    # words remove_cell to remove cells. You can also modify the code to use
    # a different tag word
    c.TagRemovePreprocessor.remove_all_outputs_tags = (tags['remove_all_outputs'],)
    c.TagRemovePreprocessor.remove_single_output_tags = (tags['remove_single_output'],)
    c.TagRemovePreprocessor.remove_cell_tags = (tags['remove_cell'],)
    c.TagRemovePreprocessor.enabled = True
    #c.TemplateExporter.exclude_markdown = True 
    #c.TemplateExporter.exclude_code_cell = True 
    #c.TemplateExporter.exclude_raw = True 
    #c.TemplateExporter.exclude_unknown = True 
    #c.TemplateExporter.exclude_foo = True 

    if VERBOSE:
        print('-- Config')
        display(c)
    
    # Prepare input
    if VERBOSE:
        print(f'smudge:\n\t{fn} ->\n\t{nb_in}')
    !cp -p {fn} {nb_in}

    # Tag images
    if VERBOSE:
        print(f'tag outputs with images for removal:\n\t{nb_in} ->\n\t{nb_in}')
    tag_images_for_removal(nb_in, nb_in)

    # Process with NotebookExporter
    if VERBOSE:
        print(f'clean (process):\n\t{nb_in} ->\n\t{nb_out}')
    out = NotebookExporter(config=c).from_filename(nb_in)
    with open(nb_out,  "w") as f:
        f.write(out[0])
    
    # Overwrite original
    if VERBOSE:
        print(f'save:\n\t{nb_out} ->\n\t{fn}')
    !cp -p {nb_out} {fn}

In [3]:
def tag_mime_output(nb, mime_tag, output_tag):
    
    '''
    Adds a tag to a specific output given its "data" is of certain MIME type.
    Useful for removing figures from notebooks.
    
    Input:
        nb - notebook
        mime_tag - cell tag of cell that needs processing. 
            
            The tag can be formatted as [condition]:[mime_type] (e.g. `foo:image/jpeg`)
            or only with the [mime_type] (e.g. `image/jpeg`)
            
            condition - arbitrary name. All cells with this tag are processed. 
                If ommitted (or '*') all cells are processed. All outputs with matching mime types are tagged.
            
            mime_type - outputs are tagged (for removal) with if data is of this mime type. 
                This can also be a partial MIME name (e.g. `image`, str.find() is used).
                
        output_tag - tag added to metadata when condition is matched
            
    Output:
        The input object is updated
            
    '''

    # handle input
    if mime_tag.find(':') < 0: # no `:` separator
        condition, mime_type = '*', mime_tag
    else:
        condition, mime_type = mime_tag.split(':')
        
    # loop over cells in notebook
    for c in nb.get('cells',[]):
        if (condition != '*') and (mime_tag not in c.get('metadata',{}).get('tags',[])):
            # next cell if the cell tag does not match
            continue
        # loop over all outputs
        for o in c.get('outputs',[]):
            # do for every data output that is a of MIME type ...

            #if any([t in o.get('data', {}) for t in ['image/png', 'image/jpg', 'image/jpeg']]):
            if any([k.find(mime_type)>=0 for k in o.get('data', {})]):
                # add to existing tags
                tags = o['metadata'].get('tags', [])
                tags.append(output_tag)
                o['metadata']['tags'] = tags

                

def tag_images_for_removal(in_file, out_file):
    import nbformat as nbf
    
    # read nb
    nb = nbf.read(in_file, nbf.NO_CONVERT)
    # process
    tag_mime_output(nb, TAG_CONDITION, tag_dict['remove_single_output'])
    # save
    nbf.write(nb, out_file, nbf.NO_CONVERT)
    


In [4]:
# Notebooks of phase 1: collect data from auction

path = "."
for short_fn in [
    'scrape-drz-auction-results.ipynb',
    'add-rdw-info-to-drz.ipynb',
    'download-images.ipynb',
    'explore-auction-results.ipynb',
    'aggregate-all-auctions.ipynb',
    'preproc-cars.ipynb',
    'eda-after-merge.ipynb',
    'predict-price.ipynb',
    'classify-pictures.ipynb',
]:
    fn = os.path.join(path, short_fn)
    print(fn)
    clean_nb(fn, VERBOSE=True)

./scrape-drz-auction-results.ipynb
-- File names
input:		 ./scrape-drz-auction-results.ipynb
backup:		 ./scrape-drz-auction-results.bup.ipynb
to be cleaned:	 ./scrape-drz-auction-results.smudge.ipynb
cleaned:	 ./scrape-drz-auction-results.clean.ipynb
backup:
	./scrape-drz-auction-results.ipynb ->
	./scrape-drz-auction-results.bup.ipynb
-- Config


{'TagRemovePreprocessor': {'remove_all_outputs_tags': ('nbconvert_instruction:remove_all_outputs',),
  'remove_single_output_tags': ('nbconvert_instruction:remove_single_output',),
  'remove_cell_tags': ('nbconvert_instruction:remove_full_cell',),
  'enabled': True}}

smudge:
	./scrape-drz-auction-results.ipynb ->
	./scrape-drz-auction-results.smudge.ipynb
tag outputs with images for removal:
	./scrape-drz-auction-results.smudge.ipynb ->
	./scrape-drz-auction-results.smudge.ipynb
clean (process):
	./scrape-drz-auction-results.smudge.ipynb ->
	./scrape-drz-auction-results.clean.ipynb
save:
	./scrape-drz-auction-results.clean.ipynb ->
	./scrape-drz-auction-results.ipynb
./add-rdw-info-to-drz.ipynb
-- File names
input:		 ./add-rdw-info-to-drz.ipynb
backup:		 ./add-rdw-info-to-drz.bup.ipynb
to be cleaned:	 ./add-rdw-info-to-drz.smudge.ipynb
cleaned:	 ./add-rdw-info-to-drz.clean.ipynb
backup:
	./add-rdw-info-to-drz.ipynb ->
	./add-rdw-info-to-drz.bup.ipynb
-- Config


{'TagRemovePreprocessor': {'remove_all_outputs_tags': ('nbconvert_instruction:remove_all_outputs',),
  'remove_single_output_tags': ('nbconvert_instruction:remove_single_output',),
  'remove_cell_tags': ('nbconvert_instruction:remove_full_cell',),
  'enabled': True}}

smudge:
	./add-rdw-info-to-drz.ipynb ->
	./add-rdw-info-to-drz.smudge.ipynb
tag outputs with images for removal:
	./add-rdw-info-to-drz.smudge.ipynb ->
	./add-rdw-info-to-drz.smudge.ipynb
clean (process):
	./add-rdw-info-to-drz.smudge.ipynb ->
	./add-rdw-info-to-drz.clean.ipynb
save:
	./add-rdw-info-to-drz.clean.ipynb ->
	./add-rdw-info-to-drz.ipynb
./download-images.ipynb
-- File names
input:		 ./download-images.ipynb
backup:		 ./download-images.bup.ipynb
to be cleaned:	 ./download-images.smudge.ipynb
cleaned:	 ./download-images.clean.ipynb
backup:
	./download-images.ipynb ->
	./download-images.bup.ipynb
-- Config


{'TagRemovePreprocessor': {'remove_all_outputs_tags': ('nbconvert_instruction:remove_all_outputs',),
  'remove_single_output_tags': ('nbconvert_instruction:remove_single_output',),
  'remove_cell_tags': ('nbconvert_instruction:remove_full_cell',),
  'enabled': True}}

smudge:
	./download-images.ipynb ->
	./download-images.smudge.ipynb
tag outputs with images for removal:
	./download-images.smudge.ipynb ->
	./download-images.smudge.ipynb
clean (process):
	./download-images.smudge.ipynb ->
	./download-images.clean.ipynb
save:
	./download-images.clean.ipynb ->
	./download-images.ipynb
./explore-auction-results.ipynb
-- File names
input:		 ./explore-auction-results.ipynb
backup:		 ./explore-auction-results.bup.ipynb
to be cleaned:	 ./explore-auction-results.smudge.ipynb
cleaned:	 ./explore-auction-results.clean.ipynb
backup:
	./explore-auction-results.ipynb ->
	./explore-auction-results.bup.ipynb
-- Config


{'TagRemovePreprocessor': {'remove_all_outputs_tags': ('nbconvert_instruction:remove_all_outputs',),
  'remove_single_output_tags': ('nbconvert_instruction:remove_single_output',),
  'remove_cell_tags': ('nbconvert_instruction:remove_full_cell',),
  'enabled': True}}

smudge:
	./explore-auction-results.ipynb ->
	./explore-auction-results.smudge.ipynb
tag outputs with images for removal:
	./explore-auction-results.smudge.ipynb ->
	./explore-auction-results.smudge.ipynb
clean (process):
	./explore-auction-results.smudge.ipynb ->
	./explore-auction-results.clean.ipynb
save:
	./explore-auction-results.clean.ipynb ->
	./explore-auction-results.ipynb
./aggregate-all-auctions.ipynb
-- File names
input:		 ./aggregate-all-auctions.ipynb
backup:		 ./aggregate-all-auctions.bup.ipynb
to be cleaned:	 ./aggregate-all-auctions.smudge.ipynb
cleaned:	 ./aggregate-all-auctions.clean.ipynb
backup:
	./aggregate-all-auctions.ipynb ->
	./aggregate-all-auctions.bup.ipynb
-- Config


{'TagRemovePreprocessor': {'remove_all_outputs_tags': ('nbconvert_instruction:remove_all_outputs',),
  'remove_single_output_tags': ('nbconvert_instruction:remove_single_output',),
  'remove_cell_tags': ('nbconvert_instruction:remove_full_cell',),
  'enabled': True}}

smudge:
	./aggregate-all-auctions.ipynb ->
	./aggregate-all-auctions.smudge.ipynb
tag outputs with images for removal:
	./aggregate-all-auctions.smudge.ipynb ->
	./aggregate-all-auctions.smudge.ipynb
clean (process):
	./aggregate-all-auctions.smudge.ipynb ->
	./aggregate-all-auctions.clean.ipynb
save:
	./aggregate-all-auctions.clean.ipynb ->
	./aggregate-all-auctions.ipynb
./preproc-cars.ipynb
-- File names
input:		 ./preproc-cars.ipynb
backup:		 ./preproc-cars.bup.ipynb
to be cleaned:	 ./preproc-cars.smudge.ipynb
cleaned:	 ./preproc-cars.clean.ipynb
backup:
	./preproc-cars.ipynb ->
	./preproc-cars.bup.ipynb
-- Config


{'TagRemovePreprocessor': {'remove_all_outputs_tags': ('nbconvert_instruction:remove_all_outputs',),
  'remove_single_output_tags': ('nbconvert_instruction:remove_single_output',),
  'remove_cell_tags': ('nbconvert_instruction:remove_full_cell',),
  'enabled': True}}

smudge:
	./preproc-cars.ipynb ->
	./preproc-cars.smudge.ipynb
tag outputs with images for removal:
	./preproc-cars.smudge.ipynb ->
	./preproc-cars.smudge.ipynb
clean (process):
	./preproc-cars.smudge.ipynb ->
	./preproc-cars.clean.ipynb
save:
	./preproc-cars.clean.ipynb ->
	./preproc-cars.ipynb
./eda-after-merge.ipynb
-- File names
input:		 ./eda-after-merge.ipynb
backup:		 ./eda-after-merge.bup.ipynb
to be cleaned:	 ./eda-after-merge.smudge.ipynb
cleaned:	 ./eda-after-merge.clean.ipynb
backup:
	./eda-after-merge.ipynb ->
	./eda-after-merge.bup.ipynb
-- Config


{'TagRemovePreprocessor': {'remove_all_outputs_tags': ('nbconvert_instruction:remove_all_outputs',),
  'remove_single_output_tags': ('nbconvert_instruction:remove_single_output',),
  'remove_cell_tags': ('nbconvert_instruction:remove_full_cell',),
  'enabled': True}}

smudge:
	./eda-after-merge.ipynb ->
	./eda-after-merge.smudge.ipynb
tag outputs with images for removal:
	./eda-after-merge.smudge.ipynb ->
	./eda-after-merge.smudge.ipynb
clean (process):
	./eda-after-merge.smudge.ipynb ->
	./eda-after-merge.clean.ipynb
save:
	./eda-after-merge.clean.ipynb ->
	./eda-after-merge.ipynb
./predict-price.ipynb
-- File names
input:		 ./predict-price.ipynb
backup:		 ./predict-price.bup.ipynb
to be cleaned:	 ./predict-price.smudge.ipynb
cleaned:	 ./predict-price.clean.ipynb
backup:
	./predict-price.ipynb ->
	./predict-price.bup.ipynb
-- Config


{'TagRemovePreprocessor': {'remove_all_outputs_tags': ('nbconvert_instruction:remove_all_outputs',),
  'remove_single_output_tags': ('nbconvert_instruction:remove_single_output',),
  'remove_cell_tags': ('nbconvert_instruction:remove_full_cell',),
  'enabled': True}}

smudge:
	./predict-price.ipynb ->
	./predict-price.smudge.ipynb
tag outputs with images for removal:
	./predict-price.smudge.ipynb ->
	./predict-price.smudge.ipynb
clean (process):
	./predict-price.smudge.ipynb ->
	./predict-price.clean.ipynb
save:
	./predict-price.clean.ipynb ->
	./predict-price.ipynb
./classify-pictures.ipynb
-- File names
input:		 ./classify-pictures.ipynb
backup:		 ./classify-pictures.bup.ipynb
to be cleaned:	 ./classify-pictures.smudge.ipynb
cleaned:	 ./classify-pictures.clean.ipynb
backup:
	./classify-pictures.ipynb ->
	./classify-pictures.bup.ipynb
-- Config


{'TagRemovePreprocessor': {'remove_all_outputs_tags': ('nbconvert_instruction:remove_all_outputs',),
  'remove_single_output_tags': ('nbconvert_instruction:remove_single_output',),
  'remove_cell_tags': ('nbconvert_instruction:remove_full_cell',),
  'enabled': True}}

smudge:
	./classify-pictures.ipynb ->
	./classify-pictures.smudge.ipynb
tag outputs with images for removal:
	./classify-pictures.smudge.ipynb ->
	./classify-pictures.smudge.ipynb
clean (process):
	./classify-pictures.smudge.ipynb ->
	./classify-pictures.clean.ipynb
save:
	./classify-pictures.clean.ipynb ->
	./classify-pictures.ipynb


In [5]:
raise, 'Below is sandboxing'

SyntaxError: invalid syntax (<ipython-input-5-10c575266340>, line 1)

In [None]:
fn

In [None]:
clean_nb(fn, VERBOSE=True)