# Torch Ember Core
> Analyzing How Model Improves

In [1]:
from torchvision.models.alexnet import AlexNet
import torch

In [2]:
model = AlexNet()

Sample data

In [3]:
samp = torch.rand(2,3,224,224)

In [4]:
model(samp).shape

torch.Size([2, 1000])

In [5]:
# Run one backward pass so that every parameter has a populated .grad;
# torchEmber.record_weight below reads p.grad for parameters that require grad.
model(samp).mean().backward()

In [6]:
from uuid import uuid4

In [7]:
# export
import os
from pathlib import Path
import json
import pandas as pd
from datetime import datetime
import torch

class tracker(object):
    """
    Small file-backed store kept under ``<home>/.{libname}``.

    Directories created on construction (if missing):
      * ``<home>/.{libname}/data`` -- one ``{name}.json`` file per metadata entry
      * ``<home>/.{libname}/log``  -- log files, one JSON object per line

    libname: library name, used for the hidden directory name
    fname: file name of this tracker's log inside the log directory
    """
    def __init__(self, libname, fname):
        self.libname = libname
        self.fname = fname
        # Path.home() honours $HOME when it is set and still resolves a home
        # directory when it is not (os.environ['HOME'] would raise KeyError)
        self.home = Path.home()
        self.dir = self.home/f".{libname}"
        self.dir.mkdir(exist_ok = True)
        self.data = self.dir/"data"
        self.data.mkdir(exist_ok = True)
        self.log = self.dir/"log"
        self.log.mkdir(exist_ok = True)
        self.log_path = self.log/self.fname

    def __repr__(self):
        return f"<{self.libname}:{self.fname}>"

    def mkdir(self, path):
        """Create directory ``path`` if it does not exist yet (parents must exist)."""
        Path(path).mkdir(exist_ok=True)

    def __setitem__(self, fname, dict_):
        """
        Save ``dict_`` as indented JSON to ``data/{fname}.json``,
        overwriting any previous content.
        """
        with open(self.data/f"{fname}.json", "w") as f:
            f.write(json.dumps(dict_, indent = 2))

    def __getitem__(self, fname):
        """Load and return the JSON content of ``data/{fname}.json``."""
        with open(self.data/f"{fname}.json", "r") as f:
            return json.loads(f.read())

    def logging(self, line):
        """
        Append ``line`` plus a newline to the log file.
        Returns the path of the log file.
        """
        with open(self.log_path, "a") as f:
            f.write(line+"\n")
        return self.log_path

    def __add__(self, dict_):
        """
        add a dictionary to log
        The dictionary is serialized to one JSON line; returns self so that
        additions can be chained.
        """
        self.logging(json.dumps(dict_))
        return self

    def lines(self):
        """
        Read the log file and return its records as a list of parsed JSON objects.
        Only newline-terminated lines are parsed: whatever follows the final
        newline (an empty string for a file written by ``logging``) is dropped.
        """
        with open(self.log_path) as f:
            rows = f.read().split("\n")[:-1]
        return [json.loads(row) for row in rows]

    @property
    def ts(self):
        """Current local time formatted as ``YYYY-mm-dd HH:MM:SS``."""
        return datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    @property
    def df(self):
        """The whole log as a pandas DataFrame, one row per logged record."""
        return pd.DataFrame(self.lines())
    
class emberTracker(tracker):
    """
    Tracker for the "torchember" library.

    On top of the append-only log of the parent class it keeps a second file,
    ``{fname}_latest``, which is rewritten on every ``refresh`` with the lines
    logged since the previous ``refresh``.

    fname: file name of the log inside the log directory
    """
    def __init__(self, fname):
        super().__init__("torchember",fname)
        self.latest = self.log/f"{fname}_latest"
        # lines logged since the last refresh(), kept in memory until then
        self.latest_lines = ""

    def logging(self, line):
        """
        Append ``line`` to the log file (via the parent class) and buffer it
        for the next ``refresh``. Returns the path of the log file.
        """
        log_path = super().logging(line)
        self.latest_lines+=(line+"\n")
        return log_path

    def refresh(self):
        """
        latest always contains the records of the latest batch:
        overwrite the "latest" file with the buffered lines, then clear the buffer.
        Returns the path of the "latest" file.
        """
        with open(self.latest, "w") as f:
            f.write(self.latest_lines)
        self.latest_lines = ""
        return self.latest

    def latest_line_list(self):
        """
        Parse the "latest" file into a list of JSON objects.
        Only newline-terminated lines are parsed.
        """
        with open(self.latest) as f:
            rows = f.read().split("\n")[:-1]
        return [json.loads(row) for row in rows]

    @property
    def latest_df(self):
        """The "latest" file as a pandas DataFrame, one row per record."""
        return pd.DataFrame(self.latest_line_list())
        

### Test on the data tracker

In [8]:
etrack = emberTracker("testmodel")

This is how data is recorded from within other code:

In [9]:
# tracker.__add__ serializes the dict to one JSON line and appends it to the log file,
# so the bare `etrack+{...}` expression is used here purely for its side effect.
for i in range(10):
    etrack+{"col1":i,"col2":i*2,"col3":"hahahha"}

Preview log file

In [10]:
!cat {etrack.log_path}

{"col1": 0, "col2": 0, "col3": "hahahha"}
{"col1": 1, "col2": 2, "col3": "hahahha"}
{"col1": 2, "col2": 4, "col3": "hahahha"}
{"col1": 3, "col2": 6, "col3": "hahahha"}
{"col1": 4, "col2": 8, "col3": "hahahha"}
{"col1": 5, "col2": 10, "col3": "hahahha"}
{"col1": 6, "col2": 12, "col3": "hahahha"}
{"col1": 7, "col2": 14, "col3": "hahahha"}
{"col1": 8, "col2": 16, "col3": "hahahha"}
{"col1": 9, "col2": 18, "col3": "hahahha"}


Read log as dataframe

In [11]:
etrack.df

Unnamed: 0,col1,col2,col3
0,0,0,hahahha
1,1,2,hahahha
2,2,4,hahahha
3,3,6,hahahha
4,4,8,hahahha
5,5,10,hahahha
6,6,12,hahahha
7,7,14,hahahha
8,8,16,hahahha
9,9,18,hahahha


In [12]:
!rm {etrack.log_path}

In [13]:
etrack["about_this_model"]= {
    "property1":1,
    "property2":False,
    "property3":
        {"size":100,
         "speed":200}
    }

Read meta data

In [14]:
etrack["about_this_model"]["property2"]

False

In [16]:
# export
from types import MethodType

class moduleTrack(object):
    """
    Bookkeeping record for one traced module.

    The record stores itself on the module as ``module.module_tracker`` and keeps:
      * module      -- the tracked module
      * base_module -- True when ``module.modules()`` yields only the module itself
      * root_module -- flag passed in by the caller
      * name        -- the given name, or the module's class name when none is given
      * id          -- ``id(module)``
      * children    -- list filled in by the caller with child records
    """
    def __init__(self,module, name=None, root_module = False):
        self.module = module
        module.module_tracker = self

        sub_modules = list(module.modules())
        self.base_module = len(sub_modules) == 1
        self.root_module = root_module

        self.name = name or module.__class__.__name__
        self.id = id(module)
        self.children = []

    def __repr__(self):
        """
        ``<name @ hex-id>``, followed by the shapes of ``input_dt`` / ``output_dt``
        when those attributes have been set on the record.
        """
        pieces = [f"<{self.name} @ {hex(self.id)}>"]
        if hasattr(self,"input_dt"):
            in_shapes = [key+" "+str(list(val.shape)) for key,val in self.input_dt.items()]
            pieces.append("\n\t[Inputs]" + ",".join(in_shapes))
        if hasattr(self,"output_dt"):
            out_shapes = [str(list(val.shape)) for val in self.output_dt]
            pieces.append("\n\t[Outputs]" + ",".join(out_shapes))
        return "".join(pieces)

def get_stats(tensor):
    """
    The default statistic method, it will capture
    shape of the tensor
    mean, std, max, min of the tensor
    this will return a dictionary

    tensor: the tensor to summarize
    Returns {"shape": list, "mean": float, "std": float, "max": number, "min": number}
    """
    # mean()/std() are only defined for floating point or complex dtypes, so
    # integer / bool tensors (e.g. token ids fed to an embedding layer) are
    # cast to float for those two statistics; max/min keep the original dtype.
    if tensor.is_floating_point() or tensor.is_complex():
        values = tensor
    else:
        values = tensor.float()
    return {"shape":list(tensor.shape),
            "mean":values.mean().item(),
            "std":values.std().item(),
            "max":tensor.max().item(),
            "min":tensor.min().item()}

class torchEmber(object):
    """
    Hook every sub-module of ``model`` so that each forward pass logs
    statistics of its inputs, outputs, weights and weight gradients.

    On construction the model is armed (each module's ``forward`` is replaced
    by a recording wrapper), ``get_stats`` is installed as the statistic
    function for inputs, outputs and weights, and two metadata entries
    (``base_*`` and ``structure_*``) are written through the tracker ``self.t``.

    model: the module to trace
    """
    def __init__(self, model):
        self.modules = dict()
        self.model = model

        self.model_name = self.model.__class__.__name__

        fname = f"{self.model_name}_{self.ts_str}"

        self.t = emberTracker(fname)
        self.current_mt = None
        self.mt_log = []

        self.arm()

        self.how_record_in(get_stats)
        self.how_record_out(get_stats)
        self.how_record_weight(get_stats)
        # $USER may be unset (USERNAME is the usual Windows equivalent);
        # fall back instead of raising KeyError
        user = os.environ.get("USER") or os.environ.get("USERNAME") or "unknown"
        self.t[f"base_{fname}"]={"start":self.t.ts,
                                 "user":user}
        self.t[f"structure_{fname}"] = self.mod_tree()

    def parse_module(self,model, name, root_module = False):
        """
        Create a moduleTrack for ``model``, wrap its forward, then recurse into
        its named children. Returns the moduleTrack of ``model``.
        """
        name = f"{name}({model.__class__.__name__})"
        mt = moduleTrack(model, name, root_module)
        self.modules[name]= mt
        model.forward = self.module_register(name,model)

        for cname,children in model.named_children():
            children_mt = self.parse_module(children,f"{name}.{cname}" )
            children_mt.parent = mt
            mt.children.append(children_mt)
        return mt

    def mod_tree(self):
        """
        Return the tree of module
        """
        return self.mod_tree_parse(self.model.module_tracker)

    def mod_tree_parse(self,mt):
        """
        Turn a moduleTrack (and, recursively, its children) into a nested dict
        with keys "name", "short" and, when there are children, "children".
        """
        rt = {"name":mt.name, "short":mt.name.split(".")[-1]}
        if len(mt.children)>0:
            rt.update({"children":list(self.mod_tree_parse(i) for i in mt.children)})
        return rt

    @property
    def ts_str(self):
        """Current local time formatted as ``YYYYmmdd_HHMMSS``."""
        return datetime.now().strftime("%Y%m%d_%H%M%S")

    def arm(self):
        """
        arming the tracing function to self.model
        """
        self.parse_module(self.model,"model", root_module = True)

    def disarm(self):
        """remove the tracing function"""
        # The wrapper lives on the module, not on the moduleTrack record,
        # so the tracked module itself has to be handed to recover()
        for mt in self.modules.values():
            self.recover(mt.module)

    def rearm(self):
        """Remove the tracing wrappers, then install fresh ones."""
        self.disarm()
        self.arm()

    def reg_check(self,m):
        """
        register check
        Returns False when ``m.forward`` is already an armed wrapper, True otherwise.
        """
        return not getattr(m.forward, "armed", False)

    def recover(self, m):
        """
        Put back the forward that was in place before ``m`` was armed.
        ``m`` is the traced module; objects without an armed ``forward`` are left alone.
        """
        forward = getattr(m, "forward", None)
        if getattr(forward, "armed", False) and hasattr(forward, "former"):
            m.forward = forward.former

    def _make_recorder(self, attr, f):
        """Bind, as ``self.<attr>``, a recorder that logs ``f(tensor)`` merged with extra_data."""
        def recorder(this, tensor, extra_data):
            """
            extra_data: dict
            """
            dict_ = f(tensor)
            dict_.update(extra_data)
            this.t+dict_  # tracker.__add__ appends the dict to the log
            return dict_
        recorder.__name__ = attr
        setattr(self, attr, MethodType(recorder, self))
        return getattr(self, attr)

    def how_record_in(self,f):
        """
        Set ``f`` (tensor -> dict) as the statistic function for input tensors.
        Usable as a decorator; returns the bound ``record_input_core``.
        """
        return self._make_recorder("record_input_core", f)

    def how_record_out(self,f):
        """
        Set ``f`` (tensor -> dict) as the statistic function for output tensors.
        Usable as a decorator; returns the bound ``record_output_core``.
        """
        return self._make_recorder("record_output_core", f)

    def how_record_weight(self,f):
        """
        Set ``f`` (tensor -> dict) as the statistic function for weights and
        weight gradients. Usable as a decorator; returns the bound ``record_weight_core``.
        """
        return self._make_recorder("record_weight_core", f)

    def record_input(self,mt):
        """
        Record the input tensors of the moduleTrack
        """
        for k,tensor in mt.input_dt.items():
            self.record_input_core(tensor,
                          extra_data = {"module":mt.name,"ts":self.t.ts,"ttype":"input","tname":k})

    def record_output(self,mt):
        """
        Record the output tensors of the moduleTrack
        """
        for i,tensor in enumerate(mt.output_dt):
            self.record_output_core(tensor,
                          extra_data = {"module":mt.name,"ts":self.t.ts,"ttype":"output","tname":f"output_{i}"})

    def record_weight(self,mt):
        """
        Record the weights of the moduleTrack
        Only base modules (no sub-modules) are recorded. The gradient is
        recorded only when it exists: ``p.grad`` is None until a backward pass
        has been run.
        """
        if not mt.base_module:
            return
        for i,p in enumerate(mt.module.parameters()):
            self.record_weight_core(p.data,
                          extra_data = {"module":mt.name,"ts":self.t.ts,
                                        "ttype":"weight","tname":f"weight_{i}"})
            if p.requires_grad and p.grad is not None:
                self.record_weight_core(p.grad,
                          extra_data = {"module":mt.name,"ts":self.t.ts,
                                        "ttype":"weight_grad","tname":f"grad_{i}"})

    def module_register(self,name,m):
        """
        Build the recording wrapper around ``m.forward``.

        name: key of the module's moduleTrack in ``self.modules``
        m: the module to wrap
        Returns the wrapper, or the current ``m.forward`` when it is already armed.
        The wrapper carries ``armed = True`` and ``former`` (the wrapped forward).
        """
        if self.reg_check(m) == False: return m.forward
        f = m.forward
        mt = self.modules[name]
        # Only the declared positional parameters (minus self) are usable as
        # input names; co_varnames also lists *args/**kwargs and local variables
        code = getattr(f, "__code__", None)
        mt.vars = code.co_varnames[1:code.co_argcount] if code is not None else ()

        def new_forward(*args,**kwargs):
            names = list(mt.vars[:len(args)])
            # positional inputs without a declared name (e.g. forward(self, *xs))
            names += [f"input_{i}" for i in range(len(names), len(args))]
            mt.input_dt = dict(zip(names,args))
            mt.input_dt.update(kwargs)

            self.record_input(mt)
            self.current_mt = mt
            if mt.root_module: self.mt_log=[]
            self.mt_log.append(f"enter {mt.name}")

            # ------execution of the function------
            outputs = f(*args,**kwargs)
            self.record_weight(mt)
            # ------execution of the function------

            self.mt_log.append(f"exit {mt.name}")

            # several outputs are recorded one by one, a single output as a 1-item list
            if isinstance(outputs, (list,tuple)):
                mt.output_dt = list(outputs)
            else:
                mt.output_dt = [outputs,]
            self.record_output(mt)

            if mt.root_module:
                self.t.refresh() # start a new "latest" file

            return outputs

        setattr(new_forward,"armed",True)
        setattr(new_forward,"former",f)
        return new_forward

In [17]:
# Constructing torchEmber already arms the model (its __init__ calls arm());
# rearm() then runs disarm() followed by arm() again.
te = torchEmber(model)
te.rearm()

In [18]:
for i in range(3):
    model(samp)

### Check snowballing tensor stats

In [19]:
te.t.df

Unnamed: 0,shape,mean,std,max,min,module,ts,ttype,tname
0,"[2, 3, 224, 224]",4.999903e-01,0.288622,0.999999,5.364418e-07,model(AlexNet),2020-02-11 20:57:09,input,x
1,"[2, 3, 224, 224]",4.999903e-01,0.288622,0.999999,5.364418e-07,model(AlexNet).features(Sequential),2020-02-11 20:57:09,input,input
2,"[2, 3, 224, 224]",4.999903e-01,0.288622,0.999999,5.364418e-07,model(AlexNet).features(Sequential).0(Conv2d),2020-02-11 20:57:09,input,input
3,"[64, 3, 11, 11]",-9.850168e-05,0.030135,0.052477,-5.247975e-02,model(AlexNet).features(Sequential).0(Conv2d),2020-02-11 20:57:09,weight,weight_0
4,"[64, 3, 11, 11]",9.594788e-08,0.000009,0.000042,-3.187902e-05,model(AlexNet).features(Sequential).0(Conv2d),2020-02-11 20:57:09,weight_grad,grad_0
...,...,...,...,...,...,...,...,...,...
235,[1000],2.050806e-04,0.009095,0.015572,-1.561586e-02,model(AlexNet).classifier(Sequential).6(Linear),2020-02-11 20:57:11,weight,weight_1
236,[1000],1.000000e-03,0.000000,0.001000,1.000000e-03,model(AlexNet).classifier(Sequential).6(Linear),2020-02-11 20:57:11,weight_grad,grad_1
237,"[2, 1000]",3.239268e-04,0.010641,0.028620,-2.969069e-02,model(AlexNet).classifier(Sequential).6(Linear),2020-02-11 20:57:11,output,output_0
238,"[2, 1000]",3.239268e-04,0.010641,0.028620,-2.969069e-02,model(AlexNet).classifier(Sequential),2020-02-11 20:57:11,output,output_0


In [20]:
te.mod_tree()

{'name': 'model(AlexNet)',
 'short': 'model(AlexNet)',
 'children': [{'name': 'model(AlexNet).features(Sequential)',
   'short': 'features(Sequential)',
   'children': [{'name': 'model(AlexNet).features(Sequential).0(Conv2d)',
     'short': '0(Conv2d)'},
    {'name': 'model(AlexNet).features(Sequential).1(ReLU)',
     'short': '1(ReLU)'},
    {'name': 'model(AlexNet).features(Sequential).2(MaxPool2d)',
     'short': '2(MaxPool2d)'},
    {'name': 'model(AlexNet).features(Sequential).3(Conv2d)',
     'short': '3(Conv2d)'},
    {'name': 'model(AlexNet).features(Sequential).4(ReLU)',
     'short': '4(ReLU)'},
    {'name': 'model(AlexNet).features(Sequential).5(MaxPool2d)',
     'short': '5(MaxPool2d)'},
    {'name': 'model(AlexNet).features(Sequential).6(Conv2d)',
     'short': '6(Conv2d)'},
    {'name': 'model(AlexNet).features(Sequential).7(ReLU)',
     'short': '7(ReLU)'},
    {'name': 'model(AlexNet).features(Sequential).8(Conv2d)',
     'short': '8(Conv2d)'},
    {'name': 'model(AlexN

In [21]:
te.mt_log

['enter model(AlexNet)',
 'enter model(AlexNet).features(Sequential)',
 'enter model(AlexNet).features(Sequential).0(Conv2d)',
 'exit model(AlexNet).features(Sequential).0(Conv2d)',
 'enter model(AlexNet).features(Sequential).1(ReLU)',
 'exit model(AlexNet).features(Sequential).1(ReLU)',
 'enter model(AlexNet).features(Sequential).2(MaxPool2d)',
 'exit model(AlexNet).features(Sequential).2(MaxPool2d)',
 'enter model(AlexNet).features(Sequential).3(Conv2d)',
 'exit model(AlexNet).features(Sequential).3(Conv2d)',
 'enter model(AlexNet).features(Sequential).4(ReLU)',
 'exit model(AlexNet).features(Sequential).4(ReLU)',
 'enter model(AlexNet).features(Sequential).5(MaxPool2d)',
 'exit model(AlexNet).features(Sequential).5(MaxPool2d)',
 'enter model(AlexNet).features(Sequential).6(Conv2d)',
 'exit model(AlexNet).features(Sequential).6(Conv2d)',
 'enter model(AlexNet).features(Sequential).7(ReLU)',
 'exit model(AlexNet).features(Sequential).7(ReLU)',
 'enter model(AlexNet).features(Sequentia

### Check latest tensor stats

In [19]:
te.t.latest_df

Unnamed: 0,shape,mean,std,max,min,module,ts,ttype,tname
0,"[2, 3, 224, 224]",4.998509e-01,0.288974,0.999995,0.000002,model(AlexNet),2020-02-09 21:03:03,input,x
1,"[2, 3, 224, 224]",4.998509e-01,0.288974,0.999995,0.000002,model(AlexNet).features(Sequential),2020-02-09 21:03:03,input,input
2,"[2, 3, 224, 224]",4.998509e-01,0.288974,0.999995,0.000002,model(AlexNet).features(Sequential).0(Conv2d),2020-02-09 21:03:03,input,input
3,"[64, 3, 11, 11]",-1.185962e-04,0.030251,0.052481,-0.052477,model(AlexNet).features(Sequential).0(Conv2d),2020-02-09 21:03:03,weight,weight_0
4,"[64, 3, 11, 11]",-8.594389e-07,0.000008,0.000034,-0.000031,model(AlexNet).features(Sequential).0(Conv2d),2020-02-09 21:03:03,weight_grad,grad_0
...,...,...,...,...,...,...,...,...,...
75,[1000],1.477706e-04,0.009015,0.015580,-0.015614,model(AlexNet).classifier(Sequential).6(Linear),2020-02-09 21:03:03,weight,weight_1
76,[1000],1.000000e-03,0.000000,0.001000,0.001000,model(AlexNet).classifier(Sequential).6(Linear),2020-02-09 21:03:03,weight_grad,grad_1
77,"[2, 1000]",1.578474e-04,0.010619,0.029553,-0.031823,model(AlexNet).classifier(Sequential).6(Linear),2020-02-09 21:03:03,output,output_0
78,"[2, 1000]",1.578474e-04,0.010619,0.029553,-0.031823,model(AlexNet).classifier(Sequential),2020-02-09 21:03:03,output,output_0


### Redefine what you want to record

With the default statistic function, you keep track of the shape, mean, std, max and min of a tensor.

The aforementioned tensor can be any of the following:
* module input tensors
* module output tensors
* module weight
* gradient of module weight

If you have more interesting metrics to follow, you can redefine the statistic tracking function.

#### Redefine the weight tensor / weight gradient tensor statistic function

In [20]:
@te.how_record_weight
def weight_stats(tensor):
    """Element count of the tensor plus the max of every slice obtained by iterating over it."""
    per_row_max = [row.max().item() for row in tensor]
    return {"num": tensor.numel(), "row_max": per_row_max}

#### Redefine the input or output statistic function

In [21]:
@te.how_record_in
def input_stats(tensor):
    """Element count of the tensor plus the min of every slice obtained by iterating over it."""
    per_row_min = [row.min().item() for row in tensor]
    return {"num": tensor.numel(), "row_min": per_row_min}

@te.how_record_out
def output_stats(tensor):
    """Summarize an output tensor: how many elements it has and the min of each slice it iterates to."""
    stats = {"num": tensor.numel()}
    stats["row_min"] = [piece.min().item() for piece in tensor]
    return stats

Let's run one more forward pass

In [22]:
model(samp)

tensor([[-0.0146, -0.0005,  0.0091,  ...,  0.0054,  0.0127,  0.0059],
        [-0.0200,  0.0012,  0.0156,  ...,  0.0070,  0.0152,  0.0065]],
       grad_fn=<AddmmBackward>)

The latest stats changed

In [23]:
te.t.latest_df

Unnamed: 0,num,row_min,module,ts,ttype,tname,row_max
0,301056,"[6.616115570068359e-06, 2.0265579223632812e-06]",model(AlexNet),2020-02-09 21:03:03,input,x,
1,301056,"[6.616115570068359e-06, 2.0265579223632812e-06]",model(AlexNet).features(Sequential),2020-02-09 21:03:03,input,input,
2,301056,"[6.616115570068359e-06, 2.0265579223632812e-06]",model(AlexNet).features(Sequential).0(Conv2d),2020-02-09 21:03:03,input,input,
3,23232,,model(AlexNet).features(Sequential).0(Conv2d),2020-02-09 21:03:03,weight,weight_0,"[0.051700785756111145, 0.05247066915035248, 0...."
4,23232,,model(AlexNet).features(Sequential).0(Conv2d),2020-02-09 21:03:03,weight_grad,grad_0,"[-1.7328102330793627e-07, 1.4489784234683611e-..."
...,...,...,...,...,...,...,...
75,1000,,model(AlexNet).classifier(Sequential).6(Linear),2020-02-09 21:03:04,weight,weight_1,"[-0.01220434345304966, -0.002170322462916374, ..."
76,1000,,model(AlexNet).classifier(Sequential).6(Linear),2020-02-09 21:03:04,weight_grad,grad_1,"[0.0010000000474974513, 0.0010000000474974513,..."
77,2000,"[-0.026451345533132553, -0.03089486062526703]",model(AlexNet).classifier(Sequential).6(Linear),2020-02-09 21:03:04,output,output_0,
78,2000,"[-0.026451345533132553, -0.03089486062526703]",model(AlexNet).classifier(Sequential),2020-02-09 21:03:04,output,output_0,


In [160]:
w = list(model.features.parameters())[0]

In [161]:
x1 = torch.rand(5,6)
x2 = torch.rand(5,6)
x3 = x1*6+x2

In [162]:
x2.numel()

30