In [None]:
#!/usr/bin/env python
# coding: utf-8

In [None]:
import json
import os
import sys
import argparse
from git import Repo
from collections import OrderedDict
import time
import pandas as pd
import matplotlib.pyplot as plt
import functools

In [None]:
def find_all_output_marking(notebook, output_mark, ignore_case = False):
    '''
    Collect the text outputs of code cells that mention output_mark.

    notebook    : notebook loaded as a dict (must have a "cells" list)
    output_mark : substring searched in every line of each text output
    ignore_case : when true, the comparison is case-insensitive

    returns : l_res[cell_id][text_idx] = text
      - cell_id is the cell metadata "id", or "#<cell index>" when the cell
        has no id (same convention as filter_output_cells)
      - text_idx counts only the outputs of the cell that carry a "text" field
      - cells without any matching output are left out
    '''
    needle = output_mark.lower() if ignore_case else output_mark
    l_res = OrderedDict()
    for cell_idx, cell in enumerate(notebook["cells"]):
        if cell["cell_type"] != "code":
            continue
        # Not every notebook stores an id in the cell metadata: fall back on
        # the cell position instead of raising KeyError.
        cell_id = cell.get("metadata", {}).get("id", "#" + str(cell_idx))
        res = OrderedDict()
        idx = -1
        for output in cell.get("outputs", []):
            if "text" not in output:
                continue
            idx += 1
            text = output["text"]
            if any(needle in (line.lower() if ignore_case else line) for line in text):
                res[idx] = text
        if res:
            l_res[cell_id] = res
    return l_res

In [None]:
def filter_output_cells(notebook, filter_fun):
    '''
    Select the text outputs of the notebook code cells accepted by filter_fun.

    filter_fun is called once per text output with the output "text" value
    and must return a truthy value to keep it.

    returns : selected[cell_id][text_idx] = output_text
      - cell_id is the cell metadata "id", or "#<cell index>" without one
      - text_idx is the rank of the output among the text outputs of the cell
    '''
    selected = OrderedDict()
    for position, cell in enumerate(notebook["cells"]):
        if cell["cell_type"] != "code":
            continue
        metadata = cell["metadata"]
        cell_id = metadata["id"] if "id" in metadata else "#" + str(position)
        # Only outputs carrying a "text" field take part in the numbering.
        texts = (output["text"] for output in cell["outputs"] if "text" in output)
        kept = OrderedDict()
        for text_rank, text in enumerate(texts):
            if filter_fun(text):
                kept[text_rank] = text
        if len(kept):
            selected[cell_id] = kept
    return selected

In [None]:
def test_find_all_output_marking():
    '''
    Smoke test: print every text output of dog_app.ipynb that mentions
    "Training Loss", prefixed by its cell id and text index.
    '''
    # Use a context manager so the notebook file is closed after loading.
    with open("../P2_dog_classification/dog_app.ipynb") as notebook_file:
        notebook = json.load(notebook_file)
    dres = find_all_output_marking(notebook, "Training Loss")
    for k,v in dres.items():
        for i, lines in v.items():
            print(k, i)
            for line in lines :
                print("  " , line, end ="")
test_find_all_output_marking()

In [None]:
def find_output_marking_(notebook, output_mark):
    '''
    Return the single result found for output_mark in notebook.

    Asserts that exactly one result was found, then returns the entry stored
    under key/index 0.

    NOTE(review): find_output_marking is not defined in the visible part of
    this file (only find_all_output_marking is) -- confirm it exists
    elsewhere, otherwise this function raises NameError when called.
    '''
    l_res = find_output_marking(notebook, output_mark)
    assert len(l_res) == 1
    return l_res[0]

In [None]:
import re
def tokenize_output(output_text):
    '''
    Split every line of a notebook text output into tokens.

    output_text : list of lines, or a single string (the notebook format
                  allows both for a "text" field); a string is first split
                  into lines
    separators  : whitespace  ','  ':'
                  (the character class also contains '|', so a pipe acts as
                  a separator too -- kept as is to preserve existing results)

    returns : one list of non-empty tokens per line, in line order
    '''
    if isinstance(output_text, str):
        # Iterating a plain string would yield characters, not lines.
        output_text = output_text.splitlines(True)
    # Raw string: '\s' in a normal string literal is an invalid escape sequence.
    separators = re.compile(r'[\s|:|,]+')
    # re.split leaves empty strings at the line edges: drop them.
    return [[token for token in separators.split(line) if token]
            for line in output_text]

In [None]:
from collections import OrderedDict
def parse_nn_performances(split_lines):
    '''
    parse performances info from a notebook output result cell

    split_lines : list of token lists (one per output line)

    assumes each line is a train OR valid OR test result
    result line must follow the template =
    [phase] Epoch <epoch> loss/accuracy <metric>
    fields may be separated by : , or whitespace
    A line containing the token 'VALID' (resp. 'TEST') belongs to that phase,
    any other line is counted as 'TRAIN'.

    returns : res[phase][metric][epoch] = value
    A phase key is created as soon as one line of that phase is seen, even
    when the line carries no result. Lines that cannot be parsed are
    reported with print and skipped.
    '''
    d_type = {'loss': float, 'accuracy': float, 'Epoch': int}
    res = OrderedDict()
    for line in split_lines:
        if 'VALID' in line:
            perf_type = 'VALID'
        elif 'TEST' in line:
            perf_type = 'TEST'
        else:
            perf_type = 'TRAIN'
        res.setdefault(perf_type, OrderedDict())
        if 'Epoch' not in line:
            print("not a result line")
            continue
        # list.index raises ValueError for a missing token (it never returns
        # -1), so the presence test has to come first.
        d_index = {name: (line.index(name) if name in line else -1)
                   for name in d_type}
        if d_index['loss'] == -1 and d_index['accuracy'] == -1:
            print("error : not a performance line", line)
            continue
        d_val = {}
        try:
            for name, index in d_index.items():
                if index > -1:
                    # the value is the token right after the field name
                    d_val[name] = d_type[name](line[index + 1])
        except (ValueError, IndexError):
            # not a number, or field name at the very end of the line
            print("error conversion", name, index, line)
            continue
        epoch = d_val.pop("Epoch")
        for name, perf in d_val.items():
            res[perf_type].setdefault(name, OrderedDict())[epoch] = perf
    return res
                      

In [None]:
import matplotlib.pyplot as plt
def plot_performance(results, metric):
    '''
    plot the train and valid performance for the given metric (loss / accuracy)

    results : dict such that results[phase][metric][epoch] = value, with the
              phases "TRAIN" and "VALID" present
    metric  : metric name, also used as the figure title

    A new figure is opened; nothing is returned.
    '''
    plt.figure()
    # Read the `results` argument (the previous code read a global named
    # `res`, ignoring the parameter).
    for phase in ("TRAIN", "VALID"):
        per_epoch = results[phase][metric]
        plt.plot(list(per_epoch.keys()), list(per_epoch.values()), label=phase)
    plt.legend()
    plt.title(metric)

In [None]:
import functools
def apply_file_str(func):
    '''
    Decorator: let a function that expects an open stream as first argument
    also accept a file name.

    When the first positional argument is a str, it is opened (text mode)
    and the stream is passed to func in its place; the file is closed when
    func returns. Any other first argument is passed through unchanged.
    Remaining positional and keyword arguments are forwarded as given.
    '''
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        if args and isinstance(args[0], str):
            with open(args[0]) as f:
                # **kwargs, not *kwargs: the latter would pass the keyword
                # names as extra positional arguments.
                return func(f, *args[1:], **kwargs)
        return func(*args, **kwargs)
    return wrapper

In [None]:
@apply_file_str
def extract_perf(notebook_f, l_markers = ["Begin Training", "TEST" ] ):
    '''
    parse the notebook for markers in l_markers, indicating a result cell
    split each line of the result cell
    then parse each line to extract results

    notebook_f : open notebook stream (or a file name, thanks to the decorator)
    l_markers  : substrings identifying the text outputs that hold results

    returns : res[phase][metric][epoch] = value, merged over all the
    matching outputs with dict.update (a phase found again replaces the
    one stored earlier)
    '''
    notebook = json.load(notebook_f)
    res = OrderedDict()
    for output_mark in l_markers:
        # NOTE(review): this used to call find_output_marking, which is not
        # defined in this file; find_all_output_marking is the available
        # search helper and returns {cell_id: {text_idx: text}}.
        matches = find_all_output_marking(notebook, output_mark)
        for cell_outputs in matches.values():
            for output_text in cell_outputs.values():
                # tokenize output cell
                split_lines = tokenize_output(output_text)
                # extract results from cell
                dict_result = parse_nn_performances(split_lines)
                res.update(dict_result)
    return res

In [None]:
def extract_notebook_train_valid(notebook_name, l_markers = ["Begin Training", "TEST" ]  ):
    '''
    Extract the performance results of a notebook given as a file name or
    as an open stream.

    Thin wrapper over extract_perf, whose decorator already opens the file
    when a name is given. The previous body called extract_perf_, a name
    that is not defined in this file.
    '''
    return extract_perf(notebook_name, l_markers)

In [None]:
def test_notebook_parsing():
    '''
    Smoke test: parse one local notebook and plot its loss and accuracy.
    '''
    notebook_path = "/home/severine/MOOCS/UDACITY/DEEP_LEARNING/TP/P2_dog_classification/Transfer_Learning_Solution_copy.ipynb"
    results = extract_perf(notebook_path)
    for metric in ("loss", "accuracy"):
        plot_performance(results, metric)

In [None]:
def extract_metric_phase(l_results, phase, metric):
    '''
    Build the table of one metric (loss / accuracy) for one phase
    (train / valid / test) over a list of commit results.

    returns (metric_frame, commit_frame)
      metric_frame : columns are the commit sha (first 5 characters),
                     indexes are epochs
      commit_frame : one row per commit (indexed by the same short sha)
                     with the columns "message" and "date"
    Commits without the requested phase/metric are skipped. Each commit is
    prepended, so the last commit of l_results comes first.
    '''
    metric_frame = pd.DataFrame()
    commit_frame = pd.DataFrame()
    for entry in l_results:
        short_sha = entry["sha"][:5]
        try:
            per_epoch = entry["res"][phase][metric]
        except KeyError:
            # nothing recorded for this phase/metric in this commit
            continue
        column = pd.DataFrame.from_dict(per_epoch, orient = 'index', columns =[short_sha] )
        metric_frame = pd.concat([column, metric_frame], axis = 1)

        commit_row = pd.DataFrame( [[entry["msg"], entry["date"]] ],
                                   columns = ["message", "date"],
                                   index =[short_sha])
        commit_frame = pd.concat([commit_row, commit_frame], axis = 0)

    return metric_frame, commit_frame

In [None]:
def all_commits_results(directory, notebook, parse_function = extract_perf): 
    '''
    extract from all git versions of the notebook in directory the results

    directory      : path of the git repository
    notebook       : file name of the notebook, looked up among the entries
                     directly under the root tree of each commit
    parse_function : called with the blob data stream, returns the results

    returns : list with one OrderedDict per commit holding the notebook,
    with the keys "sha", "date", "msg" and "res"
    '''
    repo = Repo(directory)
    # The unused `repo.head.reference` lookup was dropped: it raises
    # TypeError when HEAD is detached.
    l_results = []
    # iterate on the previous commits
    for commit in repo.iter_commits():
        sha = commit.hexsha
        msg = commit.message
        strdate = time.strftime("%d/%m/%Y %H:%M", time.gmtime(commit.authored_date))
        # files in the commit
        for tr in commit.tree:
            # load the notebook
            if tr.name == notebook:
                print(sha[:7], strdate, msg )
                results = parse_function(tr.data_stream)
                res_dict = OrderedDict( [("sha", sha), ("date", strdate), ("msg", msg), ("res", results) ] )
                l_results.append(res_dict)
    return l_results


In [None]:
def get_commit(directory, ** kwargs):
    '''
    get commit either with sha, or relative to a branch (HEAD by default)

    directory : path of the git repository
    keyword arguments :
      sha    : commit reference, used as is (branch and num are then ignored)
      branch : branch name to start from (default : HEAD)
      num    : number of commits to step back from the branch tip

    returns (branch_name, commit) ; branch_name is None when the commit is
    selected by sha, or when HEAD is used and is detached
    '''
    repo = Repo(directory)
    branch_name = None
    if "sha" in kwargs:
        # find commit by sha
        commit_ref = kwargs["sha"]
    else:
        # find n last commit of branch
        if "branch" in kwargs:
            commit_ref = branch_name = kwargs["branch"]
        else:
            commit_ref = "HEAD"
            try:
                branch_name = repo.active_branch.name
            except TypeError:
                # active_branch raises TypeError on a detached HEAD; the
                # commit can still be resolved, only the name is unknown.
                branch_name = None
        if "num" in kwargs:
            commit_ref += "~" + str(kwargs["num"])

    return branch_name, repo.commit(commit_ref)


In [None]:
# Manual checks of get_commit against a local clone: relative to HEAD,
# relative to the master branch, and by explicit sha.
# NOTE(review): the repository path is absolute and machine specific.
print( get_commit("/home/severine/TEMP/P2_dog_classification", num = 5))
print( get_commit("/home/severine/TEMP/P2_dog_classification"))
print( get_commit("/home/severine/TEMP/P2_dog_classification", branch = "master", num = 5))
print( get_commit("/home/severine/TEMP/P2_dog_classification", branch = "master"))
print( get_commit("/home/severine/TEMP/P2_dog_classification", sha = "5dfbe8f65decc20e1d869b4cf265f9d6e33b1ae4"))

In [None]:
def get_tree_element(commit, file_name):
    '''
    Return the first entry of commit.tree whose name is file_name,
    or None when no entry matches.
    '''
    matching = (entry for entry in commit.tree if entry.name == file_name)
    return next(matching, None)

In [None]:
from gitdb.exc import (
    BadObject,
    BadName,
)


def get_notebook_results(directory, notebook_name, find_function, **kwargs):
    '''
    Load one git version of a notebook and select its text outputs.

    directory     : path of the git repository
    notebook_name : file name of the notebook in the commit root tree
    find_function : filter given to filter_output_cells
    kwargs        : commit selection, forwarded to get_commit (sha / branch / num)

    returns (l_result_cells, commit_info)
      - (None, None) when the commit does not exist
      - (empty OrderedDict, commit_info) when the commit exists but does
        not contain the notebook
      - otherwise the selected outputs and the commit "sha", "msg", "date"
    '''
    try:
        branch_name, commit = get_commit(directory, **kwargs)
    except BadName as excpt:
        print("ERROR : commit does not exist")
        print(excpt.args)
        return None, None

    strdate = time.strftime("%d/%m/%Y %H:%M", time.gmtime(commit.authored_date))
    commit_info = OrderedDict( [ ("sha", commit.hexsha), ("msg", commit.message), ("date", strdate)])
    # get notebook for the current commit
    notebook_blob = get_tree_element(commit, file_name=notebook_name)
    if notebook_blob is None:
        # get_tree_element returns None when the file is absent from the
        # commit: report "no result" instead of failing on None.data_stream.
        return OrderedDict(), commit_info
    notebook = json.load(notebook_blob.data_stream)
    l_result_cells = filter_output_cells(notebook, find_function)
    return l_result_cells, commit_info

In [None]:
def find_any(text, l_marks, ignore_case = False):
    '''
    Tell whether at least one line of text contains at least one of the
    marks of l_marks.

    text        : iterable of lines
    l_marks     : substrings to look for
    ignore_case : when true, lines and marks are compared in lower case
    '''
    for mark in l_marks:
        if ignore_case:
            lowered = mark.lower()
            if any(lowered in line.lower() for line in text):
                return True
        elif any(mark in line for line in text):
            return True
    return False

In [None]:
from parse import parse
def parse_train_out_1(line):
    '''
    Parse a training log line holding the epoch, the train/valid losses,
    the train/valid accuracies and the elapsed time.

    returns (target_nb, res)
      target_nb : 6, the number of fields of the template
      res       : None when the line does not match, otherwise a list of
                  four records (train loss, valid loss, train accuracy,
                  valid accuracy), each with the keys epoch, phase, metric, val
    '''
    target_nb = 6
    resparse = parse("Epoch: {epoch:d} 	Training Loss: {train_loss:f} 	Validation Loss: {valid_loss:f}	train correct :{train_accuracy:f}	valid correct :{valid_accuracy:f}	time {extim}",
                     line)
    if not resparse :
        return target_nb, None
    fields = resparse.named
    # (phase, metric, name of the parsed field), in output order
    layout = (("train", "loss", "train_loss"),
              ("valid", "loss", "valid_loss"),
              ("train", "accuracy", "train_accuracy"),
              ("valid", "accuracy", "valid_accuracy"))
    res = [OrderedDict( [("epoch", fields["epoch"]),
                         ("phase", phase),
                         ("metric", metric),
                         ("val", fields[field_name]),
                         ])
           for phase, metric, field_name in layout]
    return target_nb, res
def parse_train_out_2(line):
    '''
    Parse a training log line holding the epoch, the train/valid losses,
    the valid accuracy and the elapsed time (no train accuracy).

    returns (target_nb, res)
      target_nb : 5, the number of fields of the template
      res       : None when the line does not match, otherwise a list of
                  three records (train loss, valid loss, valid accuracy),
                  each with the keys epoch, phase, metric, val
    '''
    target_nb = 5
    resparse = parse(  "Epoch: {epoch:d} 	Training Loss: {train_loss:f} 	Validation Loss: {valid_loss:f}	valid correct :{valid_accuracy:f}	time {extim}",
                     line)
    if not resparse :
        return target_nb, None
    fields = resparse.named
    # (phase, metric, name of the parsed field), in output order
    layout = (("train", "loss", "train_loss"),
              ("valid", "loss", "valid_loss"),
              ("valid", "accuracy", "valid_accuracy"))
    res = [OrderedDict( [("epoch", fields["epoch"]),
                         ("phase", phase),
                         ("metric", metric),
                         ("val", fields[field_name]),
                         ])
           for phase, metric, field_name in layout]
    return target_nb, res

def parse_testloss_1(line):
    '''
    Parse a "Test Loss: <float>" line.

    returns (target_nb, res)
      target_nb : 1, the number of fields of the template
      res       : None when the line does not match, otherwise a list with
                  one record holding the keys phase, metric, val
    '''
    target_nb = 1
    resparse = parse("Test Loss: {test_loss:f}", line)
    if not resparse :
        return target_nb, None
    record = OrderedDict( [("phase", "test"),
                           ("metric", "loss"),
                           ("val", resparse.named["test_loss"])
                           ])
    return target_nb, [record]

def parse_testacc_1(line):
    '''
    Parse a "Test Accuracy: <value>% <ratio>" line.

    returns (target_nb, res)
      target_nb : 2, the number of fields of the template
      res       : None when the line does not match, otherwise a list with
                  one record holding the keys phase, metric, val

    The accuracy field of the template is untyped, so it is parsed as text;
    it is converted to float here so that "val" is numeric like in the other
    parsers (the raw text is kept if it is not a number). The value is the
    figure written before the '%' sign, it is not rescaled.
    '''
    target_nb =2 
    resparse = parse("Test Accuracy: {test_accuracy}%{ratio}", line)
    if not resparse :
        return target_nb, None
    accuracy = resparse.named["test_accuracy"]
    try :
        accuracy = float(accuracy)
    except (TypeError, ValueError) :
        pass
    res = [OrderedDict( [("phase", "test"),
                         ("metric", "accuracy"),
                         ("val", accuracy)
                         ])]
    return target_nb, res
    
# Quick manual check of the parsers on one sample line of each kind.
out1 = parse_train_out_1("Epoch: 6 	Training Loss: 4.887487 	Validation Loss: 3.792227	train correct :0.011	valid correct :0.011	time 00:03:40")
print(out1)
# last expression of the cell: its value is displayed by the notebook
parse_testacc_1("Test Accuracy: 12% (105/836)")

In [None]:
class NoteBookForCommit:
    '''
    One git version of a notebook, with the output cells selected in it.

    Attributes (None / empty until the matching method has been called) :
      sha, msg, strdate : commit identification, set by get_commit
      notebook          : notebook content as a dict, set by get_commit
      res_cells         : selected outputs, set by filter_output_cells
      res_cell_keys     : ids of the selected cells, set by filter_output_cells
      res_cells_source  : source of the selected cells, set by find_res_source
    '''
    def __init__(self):
        self.sha = None
        self.msg = None
        self.strdate = None
        self.notebook = None
        self.res_cells = OrderedDict()
        self.res_cell_keys = []
        self.res_cells_source = OrderedDict()

    def get_commit(self, directory, notebook_name, **kwargs):
        '''
        Load notebook_name from the commit selected by kwargs (see the
        module-level get_commit). On an unknown commit, or when the commit
        does not contain the notebook, an error is printed and
        self.notebook is None.
        '''
        try :
            branch_name, commit = get_commit(directory, **kwargs)
        except BadName as excpt:
            print("ERROR : commit does not exist")
            print(excpt.args)
            self.sha = None
            self.msg = None
            self.strdate= None
            self.notebook = None
        else :
            self.sha = commit.hexsha
            self.msg = commit.message
            dat = commit.authored_date
            self.strdate = time.strftime("%d/%m/%Y %H:%M", time.gmtime(dat))
            # get notebook for the current commit
            notebook_blob = get_tree_element(commit, file_name=notebook_name)
            if notebook_blob is None :
                # get_tree_element returns None when the file is not in the commit
                print("ERROR : notebook not found in commit")
                self.notebook = None
            else :
                self.notebook = json.load(notebook_blob.data_stream)

    def filter_output_cells(self, find_function):
        '''Select the text outputs accepted by find_function and record their cell ids.'''
        self.res_cells =filter_output_cells(self.notebook, find_function)
        self.res_cell_keys = list(self.res_cells.keys())

    def cell_source(self, cell_idx):
        '''
        Return the source of the code cell identified by cell_idx (metadata
        "id", or "#<cell index>" for a cell without id); None if not found.
        '''
        # Search this object's notebook (the previous code read a global
        # variable named `notebook`, i.e. possibly another commit).
        for cidx,cell in enumerate(self.notebook["cells"]):
            if cell["cell_type"] != "code":
                continue
            if "id" in cell["metadata"] :
                cell_id = cell["metadata"]["id"]
            else :
                cell_id = "#"+str(cidx)
            if cell_id == cell_idx :
                return cell["source"]
        return None

    def find_res_source(self):
        '''Store the source of every selected cell in res_cells_source.'''
        self.res_cells_source = OrderedDict()
        for cidx in self.res_cell_keys:
            self.res_cells_source[cidx] = self.cell_source(cidx)


In [None]:
import functools

# Local clone whose history is scanned.
# NOTE(review): absolute, machine-specific path.
dogdir = "/home/severine/TEMP/P2_dog_classification"
# Marks used to select the result outputs (case-insensitive).
# Other sets tried before:
#   ["train", "valid", "test", "epoch", "loss", "accuracy"]
#   ["Test"]
l_marks=["loss", "accuracy"]
filter_fun = functools.partial(find_any, l_marks = l_marks, ignore_case = True)

# Step back from HEAD one commit at a time until get_notebook_results
# reports that the commit does not exist (commit_info is None).
num = 0
while True :
    res_text, commit_info = get_notebook_results(dogdir, "dog_app.ipynb", find_function=filter_fun, num = num)
    if commit_info is None :
        break
    print(num, len(res_text), list(res_text.keys()))
    num += 1
    

In [None]:
# List, for each of the last 53 commits, the ids of the cells holding results.
# lk keeps the ids of the last commit processed.
for num in range(0,53):
    res_text, commit_info = get_notebook_results(dogdir, "dog_app.ipynb", find_function=filter_fun, num = num)
    if commit_info is None :
        # get_notebook_results returns (None, None) past the end of the
        # history: stop instead of failing on None.keys().
        break
    lk=list(res_text.keys())
    print(num, lk)


In [None]:
# Per result cell id: network kind followed by the line parsers to try, in order.
parse_info = {
    'ZFEGK0be3VJN': ["scratch", parse_train_out_1, parse_train_out_2],
    'LEzAt0RE3VJn': ["scratch", parse_testloss_1, parse_testacc_1],
    'Wlf9QUM63VKW': ["transfer", parse_train_out_1, parse_train_out_2],
}



In [None]:
# Print, for commit 761c8e7, the source and the text outputs of every code
# cell whose id is in lk (lk comes from the cell listing the result cells).
# This also leaves the loaded notebook in the global variable `notebook`.
branch_name, commit = get_commit(dogdir, sha = "761c8e7")
notebook = json.load( get_tree_element(commit, "dog_app.ipynb").data_stream )
for cell_idx,cell in enumerate(notebook["cells"]):
    if cell["cell_type"] == "code":
        if "id" in cell["metadata"] and cell["metadata"]["id"] in lk :
            print("found", cell["metadata"]["id"] )
            # cell source
            for line in cell["source"]:
                print("\t", line.rstrip())
            print("---------------------")
            # text outputs of the cell, one separator after each
            for output in cell["outputs"]:
                if "text" in output:
                    for line in output["text"] :
                        print("\t", line.rstrip())
                    print("-----------------")
            print("====================")
            


In [None]:
# Display the notebook dict loaded above.
# NOTE(review): this dumps the whole notebook in the output area.
notebook

In [None]:
# Parse the result cells of the last 54 versions of dog_app.ipynb.
# Every parsed value becomes one record of l_all_res (commit info + network
# kind + parsed fields); what cannot be handled is written to
# parse_error.log as tab-separated lines starting with the short sha and
# the cell id.
lsha = ["5d9b427"]
l_res_per_commit = []
with open("parse_error.log", "w") as ferr:
    l_all_res= []
    for num in range(54) :
    #for sha in lsha :
        notebook_commit = NoteBookForCommit()
        notebook_commit.get_commit(dogdir,"dog_app.ipynb", num=num)
        #notebook_commit.get_commit(dogdir,"dog_app.ipynb", sha=sha)
        if notebook_commit.notebook is None :
            # get_commit leaves notebook to None when the commit does not
            # exist: the history is shorter than the requested range.
            break
        notebook_commit.filter_output_cells(filter_fun)
        shortsha = notebook_commit.sha[:7]
        print(num, shortsha)
        d_commit_info = OrderedDict()
        d_commit_info["sha"] = shortsha
        d_commit_info["msg"] = notebook_commit.msg
        d_commit_info["date"] = notebook_commit.strdate

        notebook_commit.find_res_source()
        for k in notebook_commit.res_cell_keys :
            if k not in parse_info :
                # no parser registered for this cell: log it and go on
                ferr.write("{}\t{}\tno parser registered\n".format(shortsha,k ))
                continue
            nn_kind = parse_info[k][0]
            # cell_source returns None when the cell is not found
            source = "".join(notebook_commit.res_cells_source[k] or [])
            # consistency check between the cell source and the expected network
            if "scratch" in source :
                if "scratch" != nn_kind:
                    print("PB scratch!!",nn_kind )
                    ferr.write("{}\t{}\tscratch\n".format(shortsha,k ))
            if "transfer" in source :
                if "transfer" != nn_kind:
                    print("PB transfer!!",nn_kind )
                    ferr.write("{}\t{}\ttransfer\n".format(shortsha,k ))

            results = notebook_commit.res_cells[k]
            if len(results) == 0:
                continue
            if 0 not in results.keys():
                # the first text output is not the result one: show what
                # was found (loop names chosen not to shadow k)
                for text_idx, text in results.items() :
                    print(text_idx, text)
                # NOTE(review): this abandons the remaining cells of the
                # commit -- confirm `continue` was not intended.
                break
            for line in results[0]:
                if line.isspace() or len(line) ==0 :
                    continue
                # Keep the first parser that recognises the line. The
                # target_nb returned by the parsers counts template fields
                # while resline holds records, so comparing the two (as
                # done before) rejected valid lines.
                resline = None
                for func in parse_info[k][1:]:
                    target_nb, resline = func(line)
                    if resline is not None :
                        break

                if resline is None :
                    ferr.write("{}\t{}\t{}\n".format(shortsha,k,line.rstrip() ))
                    continue
                for unit_res in resline :
                    cur_res = d_commit_info.copy()
                    cur_res["NN"] = nn_kind
                    cur_res.update(unit_res)
                    l_all_res.append(cur_res)


            #print("".join(results[0]))


In [None]:
# Gather all the parsed records in a DataFrame: one row per record, the row
# label being the position of the record in l_all_res.
a = dict(zip(range(len(l_all_res)), l_all_res))
print(len(l_all_res))
b = pd.DataFrame.from_dict(a, orient = "index")
# last expression of the cell: the DataFrame is displayed by the notebook
b

In [None]:
%cat parse_error.log

In [None]:
if __name__ == "__main__":
    # Results of every git version of the notebook, then the table of the
    # validation loss (epochs as index, one column per commit).
    all_results = all_commits_results("../P2_dog_classification/", "Transfer_Learning_Solution_copy.ipynb")
    valid_loss, commit_info = extract_metric_phase(all_results, "VALID", "loss")
    print(valid_loss)

In [None]:
if __name__ == "__main__":
    # Plot the validation loss table computed in the previous cell.
    valid_loss.plot(title = "valid loss", figsize = (10,6))

In [None]:
if __name__ == "__main__":
    # Print the message and date of the commits shown in the table.
    # NOTE(review): max_colwidth is presumably set to 0 to avoid truncating
    # the commit messages -- confirm against the installed pandas version.
    pd.set_option('display.max_colwidth', 0)
    print(commit_info)

In [None]:
if __name__ == "__main__":
    # Table and plot of the validation accuracy, one column per commit.
    # No plt.figure() beforehand: DataFrame.plot opens its own figure (sized
    # by figsize) when no axes is given, so the extra call only left an
    # empty figure behind. Same form as the validation loss plot.
    valid_accuracy, commit_info = extract_metric_phase(all_results, "VALID", "accuracy")
    valid_accuracy.plot( title = "valid accuracy", figsize = (10,6))

**Example of execution**

python notebook_parsing.py  -d ../P2_dog_classification/ -nb Transfer_Learning_Solution_copy.ipynb  -o dogs_transfer_learning.json

In [None]:
if __name__ == "__main__":
    # Command-line entry point: extract the results of every git version of
    # a notebook and save them as JSON.
    # NOTE(review): __name__ is also "__main__" inside a notebook kernel, where
    # the required options are not on the command line -- confirm this cell is
    # only meant for the exported script.
    parser = argparse.ArgumentParser(
        description = "Extract the training results of all the git versions of a notebook")
    parser.add_argument("-d", dest = "directory", required = True,
                        help = "path of the git repository holding the notebook")
    parser.add_argument("-nb", dest = "notebook", required = True,
                        help = "file name of the notebook in the repository")
    parser.add_argument("-o", dest = "output", required = True,
                        help = "path of the JSON file to write")
    l_args = parser.parse_args()

    print(l_args.notebook)
    print(l_args.output)
    l_results = all_commits_results(l_args.directory, l_args.notebook)
    with open(l_args.output, "w") as fs :
        json.dump(l_results, fs, indent = 2)
    
