# Parsing Jupyter Notebooks

In [1]:
import nbformat
import io

# Reading Notebooks

In [2]:
def read_nb(nb):
    """Read a notebook file from disk without converting its format version.

    Parameters
    ----------
    nb : str
        Path to the ``.ipynb`` file to read.

    Returns
    -------
    nbformat.notebooknode.NotebookNode
        The parsed notebook, in whatever nbformat version the file was
        stored in (``nbformat.NO_CONVERT``).
    """
    # Read with an explicit encoding instead of the locale default, matching
    # the 'utf8' encoding used wherever this notebook writes .ipynb files.
    with io.open(nb, 'r', encoding='utf8') as f:
        # Use a separate name so the path parameter is not rebound to the
        # notebook object.
        notebook = nbformat.read(f, nbformat.NO_CONVERT)
    return notebook

In [3]:
source_nb = 'atoms/visualization/choropleth_classification.ipynb'

inb = read_nb(source_nb)

In [4]:
type(inb)

nbformat.notebooknode.NotebookNode

In [5]:
inb.keys()

dict_keys(['nbformat', 'cells', 'nbformat_minor', 'metadata'])

In [6]:
inb.metadata

{'anaconda-cloud': {},
 'kernelspec': {'display_name': 'Python [Root]',
  'language': 'python',
  'name': 'Python [Root]'},
 'language_info': {'codemirror_mode': {'name': 'ipython', 'version': 3},
  'file_extension': '.py',
  'mimetype': 'text/x-python',
  'name': 'python',
  'nbconvert_exporter': 'python',
  'pygments_lexer': 'ipython3',
  'version': '3.5.2'}}

In [7]:
cells = inb['cells']

In [8]:
type(cells)

list

In [9]:
len(cells)

99

In [10]:
type(cells)

list

## Let's replace the cells in the in-memory notebook to create a new one

In [11]:
# extract only every other cell (the odd-indexed ones)
new_cells = [ c for i,c in enumerate(cells) if i%2]

In [12]:
len(new_cells)

49

In [13]:
inb['cells'] = new_cells

In [14]:
with io.open('smaller.ipynb', 'w', encoding='utf8') as f:
    nbformat.write(inb, f, nbformat.NO_CONVERT)

In [15]:
snb = read_nb('smaller.ipynb')

In [16]:
len(snb.cells)

49

## Notebook Cells

In [17]:
c0 = snb.cells[0]

In [18]:
type(c0)

nbformat.notebooknode.NotebookNode

In [19]:
c0.keys()

dict_keys(['source', 'cell_type', 'metadata'])

In [20]:
c0['cell_type']

'markdown'

In [21]:
c0['source']

'## Introduction\n\n* what is classification\n* role in choropleth mapping\n* explore classification using PySAL'

In [22]:
c0['metadata']

{'collapsed': True}

## Types of cells in this notebook

In [23]:
from collections import defaultdict
def get_structure(cells):
    """Index cells by their type.

    Parameters
    ----------
    cells : iterable
        Cell objects exposing a ``cell_type`` attribute.

    Returns
    -------
    collections.defaultdict
        Maps each cell type to the list of positions (in iteration order)
        at which cells of that type occur.
    """
    positions_by_type = defaultdict(list)
    for position, node in enumerate(cells):
        bucket = positions_by_type[node.cell_type]
        bucket.append(position)
    return positions_by_type
            

In [24]:
cell_types = get_structure(snb.cells)

In [25]:
cell_types.keys()

dict_keys(['code', 'markdown'])

In [26]:
# Report how many cells of each type the notebook has. The values of
# cell_types are lists of cell indices, so the loop variable is named `idxs`;
# using `cells` here would rebind the notebook-level `cells` list.
for ct, idxs in cell_types.items():
    print('Cell Type: %s\t %d cells'% (ct, len(idxs)))

Cell Type: code	 38 cells
Cell Type: markdown	 11 cells


In [27]:
code_cell_idx = cell_types['code'][0]
code_cell_idx

2

In [28]:
snb.cells[code_cell_idx]

{'cell_type': 'code',
 'execution_count': 3,
 'metadata': {'collapsed': False},
 'outputs': [{'data': {'text/plain': '{\'description\': \'Mexican states regional income 1940-2000\',\n \'explanation\': [\'Data used in   Rey, S.J. and M.L.  Sastre Gutierrez. (2010) "Interregional inequality\',\n  \'dynamics in Mexico." Spatial Economic Analysis, 5: 277-298\',\n  \'* mexico.csv: attribute data\',\n  \'* mexico.gal: spatial weights in GAL format\',\n  \'Polygon data, n=32, k=13\'],\n \'name\': \'mexico\'}'},
   'execution_count': 3,
   'metadata': {},
   'output_type': 'execute_result'}],
 'source': "ps.examples.explain('mexico')"}

In [29]:
mkd_cell_idx = cell_types['markdown'][0]
mkd_cell_idx

0

In [30]:
snb.cells[mkd_cell_idx]

{'cell_type': 'markdown',
 'metadata': {'collapsed': True},
 'source': '## Introduction\n\n* what is classification\n* role in choropleth mapping\n* explore classification using PySAL'}

## Turning Output Cells OFF

In [31]:
def remove_outputs(nb):
    """Empty the ``outputs`` list of every code cell in ``nb``, in place.

    Cells whose ``cell_type`` is not ``'code'`` are left untouched.
    Returns None.
    """
    code_cells = (entry for entry in nb.cells if entry.cell_type == 'code')
    for entry in code_cells:
        entry.outputs = []

def clear_notebook(old_ipynb, new_ipynb):
    """Write a copy of a notebook with all code-cell outputs removed.

    Parameters
    ----------
    old_ipynb : str
        Path of the notebook to read.
    new_ipynb : str
        Path the output-free notebook is written to.

    The notebook is read and written with ``nbformat.NO_CONVERT``, so its
    format version is not changed.
    """
    # Read with the same explicit encoding that is used for writing below,
    # rather than depending on the locale default.
    with io.open(old_ipynb, 'r', encoding='utf8') as f:
        nb = nbformat.read(f, nbformat.NO_CONVERT)

    remove_outputs(nb)

    with io.open(new_ipynb, 'w', encoding='utf8') as f:
        nbformat.write(nb, f, nbformat.NO_CONVERT)

# Path of the notebook to strip (same value as assigned earlier in this notebook).
source_nb = 'atoms/visualization/choropleth_classification.ipynb'

# Write the output-free copy to 'nout.ipynb' (a relative path).
new_nb = 'nout.ipynb'
clear_notebook(source_nb, new_nb)

## Notebook Class for Querying

In [32]:
source_nb = 'atoms/visualization/choropleth_classification.ipynb'
nb = read_nb(source_nb)

In [33]:
import re
# Patterns for markdown header markers, anchored at the start of the string.
# rh1..rh4 each match one exact level: '#' repeated 1-4 times followed by a
# space. NOTE(review): rh1..rh4 are not referenced anywhere else in the
# visible part of this notebook.
rh1 = re.compile('^# ')
rh2 = re.compile('^## ')
rh3 = re.compile('^### ')
rh4 = re.compile('^#### ')
# rh matches a leading run of one or more '#', i.e. a header of any level;
# NoteBook.get_header_cells uses it to pick out header cells.
rh = re.compile('^#+')

class NoteBook(object):
    """Query helper around a notebook loaded from disk.

    Attributes
    ----------
    nb
        The notebook object returned by ``read_nb``.
    structure
        Mapping of cell type -> list of cell indices, as built by
        ``get_structure``.
    """

    def __init__(self, ipynb):
        """Load the notebook at path ``ipynb`` and index its cells by type."""
        self.nb = read_nb(ipynb)
        self.structure = get_structure(self.nb.cells)

    def get_cells_by_type(self, cell_type=None):
        """Return the cells of one type, or every cell.

        Parameters
        ----------
        cell_type : str, optional
            Cell type to select; compared case-insensitively (it is
            lower-cased before lookup). If falsy, all cells are returned.

        Returns
        -------
        list
            The matching cells in notebook order; an empty list when the
            notebook has no cell of that type.
        """
        if cell_type:
            cell_type = cell_type.lower()
            # Look up with .get() rather than []: `structure` is a
            # defaultdict, and indexing it with an absent type would insert
            # an empty entry as a side effect of a read-only query.
            return [self.nb.cells[i] for i in self.structure.get(cell_type, [])]
        else:
            return self.nb.cells

    def get_cells_by_id(self, ids=()):
        """Return the cells at the given indices, in the order requested.

        ``ids`` may be any iterable of integer cell indices (a list, a
        ``range``, ...). The default is an immutable empty tuple, which
        yields an empty list.
        """
        return [self.nb.cells[i] for i in ids]

    def get_header_cells(self):
        """Return ``(index, cell)`` pairs for markdown cells that start with '#'.

        A cell counts as a header when the module-level pattern ``rh``
        matches the start of its source. Returns an empty list when the
        notebook has no markdown cells.
        """
        hs = []
        if 'markdown' in self.structure:
            idxs = self.structure['markdown']
            pairs = zip(idxs, self.get_cells_by_type('markdown'))
            hs = [(idx, cell) for idx, cell in pairs if rh.match(cell['source'])]
        return hs
        
        
    

In [34]:
nb = NoteBook(source_nb)

In [35]:
cid = nb.get_cells_by_id()

In [36]:
cid

[]

In [37]:
cid = nb.get_cells_by_id([7, 10, 2])

In [38]:
cid

[{'cell_type': 'code',
  'execution_count': 5,
  'metadata': {'collapsed': True},
  'outputs': [],
  'source': "y = f.by_col_array('pcgdp2000')"},
 {'cell_type': 'markdown',
  'metadata': {},
  'source': '#### Sample Mean\n\n$\\bar{y} = \\sum_{i=1}^n y_i$'},
 {'cell_type': 'code',
  'execution_count': 1,
  'metadata': {'collapsed': True},
  'outputs': [],
  'source': 'import pysal as ps'}]

In [39]:
nb.get_header_cells()

[(0,
  {'cell_type': 'markdown',
   'metadata': {},
   'source': '# Classification for Choropleth Mapping\n'}),
 (1,
  {'cell_type': 'markdown',
   'metadata': {'collapsed': True},
   'source': '## Introduction\n\n* what is classification\n* role in choropleth mapping\n* explore classification using PySAL'}),
 (3,
  {'cell_type': 'markdown',
   'metadata': {},
   'source': '## Data Set: Mexico State Gross Domestic Product'}),
 (9,
  {'cell_type': 'markdown',
   'metadata': {},
   'source': '### Numerical summaries'}),
 (10,
  {'cell_type': 'markdown',
   'metadata': {},
   'source': '#### Sample Mean\n\n$\\bar{y} = \\sum_{i=1}^n y_i$'}),
 (12,
  {'cell_type': 'markdown',
   'metadata': {},
   'source': '#### Sample Standard Deviation\n\n$\\hat{\\sigma} = \\sqrt{\\frac{\\sum_{i=1}^n (y_i-\\bar{y})^2}{n-1}}$'}),
 (14, {'cell_type': 'markdown', 'metadata': {}, 'source': '#### Median'}),
 (22,
  {'cell_type': 'markdown',
   'metadata': {},
   'source': '### Univariate Distribution Visualiz

In [40]:
# Group header cells by level: hdict[level] -> list of cell indices.
hdict = defaultdict(list)
for idx, cell in nb.get_header_cells():
    source = cell['source']
    # The level is the length of the leading run of '#'. Counting every '#'
    # in the source would over-count when the cell body also contains '#'.
    level = len(source) - len(source.lstrip('#'))
    hdict[level].append(idx)
    

In [41]:
hdict

defaultdict(list,
            {1: [0],
             2: [1, 3, 27, 64],
             3: [9, 22, 28, 35, 37, 39, 41, 43, 45, 48, 50, 52],
             4: [10, 12, 14]})

In [58]:
# find the start and end cells for each H? block
#
# start_end collects one [start, stop] pair per header cell: `start` is the
# header's own cell index and `stop` is the index of the next header at the
# same or a shallower level (or the total number of cells if there is none),
# so range(start, stop) covers the header and everything nested under it.
keys = sorted(hdict, reverse=True)  # deepest header level first
start_end = []
last_stop = len(nb.nb.cells)
for current in keys:
    # Only headers with the same or fewer '#' can close a block at this level.
    closers = [idx for key in keys if key <= current for idx in hdict[key]]
    for element in hdict[current]:
        # Nearest closing header after this one. min() does not rely on the
        # hdict lists being sorted.
        stop = min((idx for idx in closers if idx > element), default=last_stop)
        start_end.append([element, stop])
        

In [59]:
start_end # for each H? cell report the start and end cells

[[10, 12],
 [12, 14],
 [14, 22],
 [9, 22],
 [22, 27],
 [28, 35],
 [35, 37],
 [37, 39],
 [39, 41],
 [41, 43],
 [43, 45],
 [45, 48],
 [48, 50],
 [50, 52],
 [52, 64],
 [1, 3],
 [3, 27],
 [27, 64],
 [64, 99],
 [0, 99]]

In [51]:
hdict

defaultdict(list,
            {1: [0],
             2: [1, 3, 27, 64],
             3: [9, 22, 28, 35, 37, 39, 41, 43, 45, 48, 50, 52],
             4: [10, 12, 14]})

In [60]:
len(start_end)

20

In [61]:
len(nb.get_header_cells())

20

In [68]:
# second h2 section (its header is cell 3) together with all of its children
h2_start = 3
se2 = [span for span in start_end if span[0] == h2_start][0]
block = nb.get_cells_by_id(range(*se2))
for member in block:
    print(member['source'])

## Data Set: Mexico State Gross Domestic Product
ps.examples.available()
ps.examples.explain('mexico')
f = ps.open(ps.examples.get_path('mexico.csv'))
f.header
y = f.by_col_array('pcgdp2000')
y
### Numerical summaries
#### Sample Mean

$\bar{y} = \sum_{i=1}^n y_i$
y_mean = y.mean()
y_mean
#### Sample Standard Deviation

$\hat{\sigma} = \sqrt{\frac{\sum_{i=1}^n (y_i-\bar{y})^2}{n-1}}$
y_std = y.std()
y_std
#### Median
import numpy as np
y_median = np.median(y)
y_median
(y < y_mean).sum()
(y > y_mean).sum()
y_sorted = np.sort(y, axis=0)
y_sorted
y_sorted[15]
y_sorted[16]
(y_sorted[15]+y_sorted[16])/2.
### Univariate Distribution Visualization
%pylab inline
import seaborn as sns
sns.distplot(y)
sns.distplot(y, kde=False, rug=True)
sns.distplot(y, bins=5, kde=False, rug=True)
sns.distplot(y, hist=False,  rug=True)


In [69]:
# first h3 section inside the second h2 section (its header is cell 9),
# together with all of its children
h3_start = 9
se3 = [span for span in start_end if span[0] == h3_start][0]
block = nb.get_cells_by_id(range(*se3))
for member in block:
    print(member['source'])

### Numerical summaries
#### Sample Mean

$\bar{y} = \sum_{i=1}^n y_i$
y_mean = y.mean()
y_mean
#### Sample Standard Deviation

$\hat{\sigma} = \sqrt{\frac{\sum_{i=1}^n (y_i-\bar{y})^2}{n-1}}$
y_std = y.std()
y_std
#### Median
import numpy as np
y_median = np.median(y)
y_median
(y < y_mean).sum()
(y > y_mean).sum()
y_sorted = np.sort(y, axis=0)
y_sorted
y_sorted[15]
y_sorted[16]
(y_sorted[15]+y_sorted[16])/2.
