In [1]:
# Import some things
# A __future__ import must be the first statement in the cell, otherwise it is a SyntaxError
from __future__ import print_function
import os,sys,cv2

import gc
import json
import time
import warnings

import ipywidgets
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline

# Bring in Xilinx ML Suite Compiler, Quantizer, PyXDNN
#from xfdnn.tools.compile.bin.xfdnn_compiler_tensorflow import TFFrontend as xfdnnCompiler
from xfdnn.tools.compile.bin.xfdnn_compiler_caffe import CaffeFrontend as xfdnnCompiler
from xfdnn.tools.quantize.quantize import CaffeFrontend as xfdnnQuantizer
#from xfdnn.tools.quantize.quantize_tf import tf_Quantizer as xfdnnQuantizer
import xfdnn.rt.xdnn as xdnn
import xfdnn.rt.xdnn_io as xdnn_io

warnings.simplefilter("ignore", UserWarning)

# Report where the notebook is running and which runtime-related environment
# variables are set. os.environ[...] raises KeyError for an unset variable, so
# this cell only completes when all five variables below are defined.
print("Current working directory: %s" % os.getcwd())
print("Running on host: %s" % os.uname()[1])
print("Running w/ LD_LIBRARY_PATH: %s" %  os.environ["LD_LIBRARY_PATH"])
print("Running w/ XILINX_OPENCL: %s" %  os.environ["XILINX_OPENCL"])
print("Running w/ XCLBIN_PATH: %s" %  os.environ["XCLBIN_PATH"])
print("Running w/ PYTHONPATH: %s" %  os.environ["PYTHONPATH"])
print("Running w/ SDACCEL_INI_PATH: %s" %  os.environ["SDACCEL_INI_PATH"])

# IPython shell capture: stores the output lines of `whoami` in `id`.
# NOTE(review): this shadows the built-in id() for the rest of the notebook.
id = !whoami

# Make sure there is no error in this cell
# The xfDNN runtime depends upon the above environment variables


['/home/centos/ml-suite/notebooks/pickle', '/home/centos/ml-suite/notebooks/parallel', '/home/centos/ml-suite/notebooks/bin', '/home/centos/ml-suite/notebooks/version', '/home/centos/ml-suite/notebooks/weights', '/home/centos/ml-suite/notebooks/optimizations', '/home/centos/ml-suite/notebooks/network', '/home/centos/ml-suite/notebooks/memory', '/home/centos/ml-suite/notebooks/graph', '/home/centos/ml-suite/notebooks/codegeneration', '/home/centos/ml-suite/xfdnn/tools/compile/bin/../weights', '/home/centos/ml-suite/xfdnn/tools/compile/bin/../version', '/home/centos/ml-suite/xfdnn/tools/compile/bin/../tests', '/home/centos/ml-suite/xfdnn/tools/compile/bin/../pickle', '/home/centos/ml-suite/xfdnn/tools/compile/bin/../parallel', '/home/centos/ml-suite/xfdnn/tools/compile/bin/../optimizations', '/home/centos/ml-suite/xfdnn/tools/compile/bin/../network', '/home/centos/ml-suite/xfdnn/tools/compile/bin/../memory', '/home/centos/ml-suite/xfdnn/tools/compile/bin/../graph', '/home/centos/ml-suite

NameError: name 'null' is not defined

In [2]:
def initializeFpgaModel(sProtoBufPath):
    """Compile and quantize the Caffe model, then open a handle to the FPGA.

    Three stages run in order: the xfDNN compiler, the xfDNN quantizer and
    xdnn.createHandle. Compiler and quantizer failures are reported with
    print() and do not abort the function (best-effort, as before); a failed
    handle creation is likewise only reported.

    Args:
        sProtoBufPath: Path to a frozen TensorFlow graph (.pb). It is only used
            to look up the input/output node names that are passed to the
            quantizer; the network that gets compiled is the Caffe
            prototxt/caffemodel configured below.

    Returns:
        (config, handles): the configuration dict built here and the handles
        object returned by xdnn.createHandle.
    """
    config = {} # Config dict
    config["platform"] = 'aws'

    sInputNode,sOutputNode = getModelInputOutputNode(sProtoBufPath)

    # Compiler arguments
    # NOTE(review): "model" says GoogLeNet while the prototxt/caffemodel below
    # are inception_v3; AllocateMemoryToHost sizes the FPGA output buffer from
    # "model" -- confirm this pairing is intended.
    config["model"] = "GoogLeNet"
    config["prototxt"]     ="/home/centos/models/caffe/inception_v3/fp32/inception_v3_deploy.prototxt"
    config["caffemodel"]   ="/home/centos/models/caffe/inception_v3/fp32/inception_v3.caffemodel"
    config["outmodel"]     = "work/opt_inception_model"

    config["netcfg"] = "work/fpga_caffe.cmds" # Compiler will generate FPGA instructions
    config["memory"] = 5 # Available on-chip SRAM
    config["dsp"] = 28 # Width of Systolic Array

    # The compiler extracts the floating point weights from the .caffemodel and
    # writes a weights dir into the work dir with the suffix '_data'.
    # It is recorded up front so the key exists even when compile() raises.
    config["datadir"] = "work/" + os.path.basename(config["caffemodel"]) + "_data"

    # Compiler instance
    compiler = xfdnnCompiler(
        networkfile=config["prototxt"],      # Network description: input file
        anew=config["outmodel"],             # String for intermediate model NOT YET SUPPORTED
        generatefile=config["netcfg"],       # Script filename: output file
        memory=config["memory"],             # Available on chip SRAM within xclbin
        dsp=config["dsp"],                   # Rows in DSP systolic array within xclbin # keep defaults
        weights=config["caffemodel"]         # Instruct Compiler to generate a weights directory for runtime
    )

    # Invoke compiler
    try:
        compiler.compile()

        netcfgJson = config["netcfg"] + ".json"
        if not (os.path.exists(config["datadir"]) and os.path.exists(netcfgJson)):
            # Raise a real exception (a bare `raise` has nothing to re-raise
            # here); it is reported by the handler below.
            raise RuntimeError("Compiler failed to generate the JSON or data directory: %s" % config["datadir"])
        print("Compiler successfully generated JSON and the data directory: %s" % config["datadir"])

        print("**********\nCompilation Successful!\n")

        with open(netcfgJson) as netcfgFile:
            data = json.load(netcfgFile)
        print("Network Operations Count: %d"%data['ops'])
        print("DDR Transfers (bytes): %d"%data['moveops'])

    except Exception as e:
        print("Failed to complete compilation:",e)

    # Quantizing
    config["img_mean"] = [104.007, 116.669, 122.679] # Mean of the training set
    config["output_json"] = "work/quantization_params_caffe.json"
    config["quantizecfg"] =  config["output_json"] # Quantizer will generate quantization params
    config["calibration_directory"] = "../xfdnn/tools/quantize/calibration_directory" # Directory of images for quantizer
    config["calibration_size"] = 15 # Number of calibration images quantizer will use
    config["bitwidths"] = [16,16,16] # Supported quantization precision
    config["img_raw_scale"] = 255.0 # Raw scale of input pixels, i.e. 0 <-> 255
    config["img_input_scale"] = 1.0 # Input multiplier, Images are scaled by this factor after mean subtraction
    config["transpose"] = [2,0,1] # (H,W,C)->(C,H,W) transpose argument to quantizer
    config["channel_swap"] = [2,1,0] # (R,G,B)->(B,G,R) Channel Swap argument to quantizer

    # Quantizer instance
    quantizer = xfdnnQuantizer(
        deploy_model=config["outmodel"]+".prototxt",          # Model filename: input file
        weights=config["outmodel"]+".caffemodel",             # Floating Point weights
        output_json=config["output_json"],                    # Quantization JSON output filename
        bitwidths=config["bitwidths"],                        # Fixed Point precision: 8,8,8 or 16,16,16
        transpose=config["transpose"],                        # Transpose argument to caffe transformer
        channel_swap=config["channel_swap"],                  # Channel swap argument to caffe transformer
        raw_scale=config["img_raw_scale"],                    # Raw scale argument to caffe transformer
        mean_value=config["img_mean"],                        # Image mean per channel to caffe transformer
        input_scale=config["img_input_scale"],                # Input scale argument to caffe transformer
        calibration_size=config["calibration_size"],          # Number of calibration images to use
        calibration_directory=config["calibration_directory"] # Directory containing calibration images
    )

    # Invoke quantizer
    try:
        quantizer.quantize(inputName = sInputNode, outputName = sOutputNode)

        with open(config["quantizecfg"]) as quantizeFile:
            data = json.load(quantizeFile)
        print("**********\nSuccessfully produced quantization JSON file for %d layers.\n"%len(data['network']))
    except Exception as e:
        print("Failed to quantize:",e)

    # Create a handle with which to communicate to the FPGA
    # The actual handle is managed by xdnn
    config["xclbin"] = "../overlaybins/" + config["platform"] + "/overlay_3.xclbin" # Chosen Hardware Overlay
    ## NOTE: If you change the xclbin, we likely need to change some arguments provided to the compiler
    ## Specifically, the DSP array width, and the memory arguments

    ret, handles = xdnn.createHandle(config['xclbin'])

    # A truthy return code is treated as failure.
    # If this step fails, most likely the FPGA is locked by another user, or there is some setup problem with the hardware
    if ret:
        print("ERROR: Unable to create handle to FPGA")
    else:
        print("INFO: Successfully created handle to FPGA")

    return config,handles

In [3]:
def getModelInputOutputNode(sProtobufPath):
    """Return the names of the first and last nodes of a frozen TensorFlow graph.

    The serialized GraphDef at sProtobufPath is parsed, imported into the
    session's graph (as before), and the names of the first and last entries
    of graph_def.node are returned.

    Args:
        sProtobufPath: Path to a serialized GraphDef (.pb) file.

    Returns:
        (first_node_name, last_node_name) in graph_def.node order.
        NOTE(review): this assumes the first/last nodes are the model's
        input/output -- confirm for the graph in use.

    Raises:
        ValueError: if the graph contains no nodes.
    """
    # Imported here rather than at the top of the notebook, as in the original.
    import tensorflow as tf
    from tensorflow.python.platform import gfile

    with tf.Session() as sess:
        # Read and parse while the file is still open.
        with gfile.FastGFile(sProtobufPath,'rb') as f:
            graph_def = tf.GraphDef()
            graph_def.ParseFromString(f.read())

        # as_default() only takes effect when used as a context manager.
        with sess.graph.as_default():
            tf.import_graph_def(graph_def, name='')

        graph_nodes = list(graph_def.node)
        if not graph_nodes:
            raise ValueError("Graph at %s contains no nodes" % sProtobufPath)

        return graph_nodes[0].name, graph_nodes[-1].name


In [4]:
# # Choose an image to run and display it for reference
# config["images"] = ["../examples/classification/dog.jpg","../examples/classification/dog.jpg"] # Image of interest (Must provide as a list)

# img = cv2.imread(config["images"][0])
# img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
# plt.imshow(img)
# plt.title(config["images"])
# plt.show()

In [5]:
# Set the runtime options, create the FPGA operator and load the FC weights/bias
def TransferWeightsFPGA(iBatchSize,config,handles):
    """Fill in the runtime settings and build the objects needed to run a batch.

    config is updated in place with the scalers, processing element, batch
    size and input shape, then handed to xdnn.XDNNFPGAOp and
    xdnn_io.loadFCWeightsBias.
    NOTE(review): the original comment says this step transfers the weights to
    FPGA DDR; presumably XDNNFPGAOp does that -- confirm against the xdnn docs.

    Args:
        iBatchSize: Number of images per batch, stored as config["batch_sz"].
        config: Configuration dict (mutated and also returned).
        handles: FPGA handles passed through to xdnn.XDNNFPGAOp.

    Returns:
        (fpgaRT, fcWeight, fcBias, config)
    """
    runtimeSettings = {
        "scaleA": 10000,            # Global scaler for weights (Must be defined)
        "scaleB": 30,               # Global scaler for bias (Must be defined)
        "PE": 0,                    # Processing Element 0 - xclbins differ in how many they have
        "batch_sz": iBatchSize,     # Images per batch
        "in_shape": (3,224,224),    # Input shape used by the runtime
    }
    config.update(runtimeSettings)

    fpgaRT = xdnn.XDNNFPGAOp(handles, config)
    fcWeight, fcBias = xdnn_io.loadFCWeightsBias(config)
    return fpgaRT, fcWeight, fcBias, config

### Step 7. Allocate space in host memory for inputs, load images from disk, and prepare images. 

In [6]:
# Allocate space in host memory for the FPGA output and the FC-layer output
def AllocateMemoryToHost(config):
    """Allocate the host-side output buffers for one batch.

    Sets config["fpgaoutsz"] from config["model"] for the known models and
    config["outsz"] to 1000, then allocates two uninitialised float32 arrays.
    For any other model name an existing config["fpgaoutsz"] is used as-is.

    Args:
        config: Configuration dict; must contain "model" and "batch_sz".
            It is mutated and also returned.

    Returns:
        (fpgaOutput, fcOutput, config) where fpgaOutput has shape
        (batch_sz, fpgaoutsz) and fcOutput has shape (batch_sz, outsz).

    Raises:
        ValueError: if the model is not known and config has no "fpgaoutsz".
    """
    # Number of elements in the activation of the last layer run on the FPGA
    fpgaOutSizeByModel = {"GoogLeNet": 1024, "ResNet50": 2048}

    model = config["model"]
    if model in fpgaOutSizeByModel:
        config["fpgaoutsz"] = fpgaOutSizeByModel[model]
    elif "fpgaoutsz" not in config:
        # Previously this surfaced later as a bare KeyError: 'fpgaoutsz'.
        raise ValueError(
            'Unknown model %r: expected one of %s, or set config["fpgaoutsz"] explicitly'
            % (model, sorted(fpgaOutSizeByModel))
        )

    config["outsz"] = 1000 # Number of elements output by FC layers (1000 used for imagenet)

    # np.empty leaves the contents uninitialised.
    fpgaOutput = np.empty((config['batch_sz'], config['fpgaoutsz']), dtype=np.float32, order='C') # Space for fpga output
    fcOutput = np.empty((config['batch_sz'], config['outsz']), dtype=np.float32, order='C') # Space for output of inner product

    return fpgaOutput, fcOutput,config

In [7]:
def generateRandomBatch(iBatchSize,config):
    """Return a batch of uniformly random float32 "images".

    Args:
        iBatchSize: Number of images in the batch (size of the first axis).
        config: Configuration dict. If it contains "in_shape", that per-image
            shape is used; otherwise (or if config is not a dict) the shape
            defaults to (3,224,224), as before.

    Returns:
        numpy float32 array of shape (iBatchSize,) + in_shape, filled by
        np.random.rand.
    """
    defaultShape = (3,224,224)
    inShape = config.get("in_shape", defaultShape) if isinstance(config, dict) else defaultShape
    return np.random.rand(iBatchSize, *inShape).astype(np.float32)

### Step 12. Output the classification prediction scores.

In [8]:
# # Print the classification given the labels synset_words.txt (Imagenet classes)
# config["labels"] = "../examples/classification/synset_words.txt"
# labels = xdnn_io.get_labels(config['labels'])
# xdnn_io.printClassification(softmaxOut, pl, labels)

# #Print Original Image for Reference 
# img = cv2.imread(config["images"][0])
# img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
# plt.imshow(img)
# plt.title(config["images"])
# plt.show()

In [9]:
def runOnFPGA(iBatchSize,config,handle,batchArray):
    """Run one batch through the FPGA, the FC layer and softmax, and time it.

    The timer starts after TransferWeightsFPGA and AllocateMemoryToHost have
    returned, so the reported time covers only fpgaRT.execute, xdnn.computeFC
    and xdnn.computeSoftmax.

    Args:
        iBatchSize: Batch size passed to TransferWeightsFPGA.
        config: Configuration dict; updated by the helper functions.
        handle: FPGA handles as returned by xdnn.createHandle.
        batchArray: Input batch handed to fpgaRT.execute.

    Returns:
        (softmaxOut, elapsedSeconds)
    """
    # Load weights to FPGA
    fpgaRT,fcWeight,fcBias,config=TransferWeightsFPGA(iBatchSize,config,handle)

    # Allocate memory on the host for the outputs
    fpgaOutput, fcOutput,config=AllocateMemoryToHost(config)

    # Write FPGA Instructions to FPGA and Execute the network!
    start = time.time()
    # Use the batch passed in by the caller rather than a notebook global.
    fpgaRT.execute(batchArray, fpgaOutput)

    # Compute the inner product
    xdnn.computeFC(fcWeight, fcBias, fpgaOutput, config['batch_sz'], config['outsz'], config['fpgaoutsz'], fcOutput)

    # Compute the softmax to convert the output to a vector of probabilities
    softmaxOut = xdnn.computeSoftmax(fcOutput)

    # Return the output together with the elapsed execution time
    return softmaxOut, time.time()-start

In [None]:
#Provide the Model checkpoint path
sProtoBufPath="/home/centos/models/tensorflow/bvlc_googlenet_without_lrn/fp32/bvlc_googlenet_without_lrn_test.pb"

#Instantiate the FPGA configuration
config,handle=initializeFpgaModel(sProtoBufPath)

# One record per batch size; turned into a DataFrame in the next cell
Inference_Data =[]
try:
    # Sweep batch sizes 1, 2, 4, ..., 64
    for i in range(0,7):

        iBatchSize = 2**i

        #Generate one batch of iBatchSize random images
        batch_array=generateRandomBatch(1*iBatchSize,config)

        print("starting prediction for batchsize : {} over {} images".format(iBatchSize,len(batch_array)))

        # `duration` covers the whole runOnFPGA call (including its setup);
        # `actualTime` is the time runOnFPGA measured for execution only.
        start = time.time()

        #Run prediction on FPGA
        out,actualTime = runOnFPGA(iBatchSize,config,handle,batch_array)

        end = time.time()

        duration = end-start

        Inference_Data.append({"duration":duration, "duration_actual_run":actualTime
                                             ,"imgsPerSec": len(batch_array)/duration,"batchSize":iBatchSize,
                                              "imgsPerSecAc": len(batch_array)/actualTime})
        # Drop the large arrays before the next, bigger batch
        del batch_array,out
        gc.collect()
finally:
    #Close the fpga handle, even if one of the runs above failed
    xdnn.closeHandle()



Namespace(anew='work/opt_inception_model', approximate=False, banditpre=None, barrier=False, bridges=None, bytesperpixels=2, concatstrategy=None, conv_1x1_s2=False, cpulayermustgo=False, darius=None, ddr=256, dedicateddsp=None, deephifilename=None, dsp=28, forceweights=None, fromtensorflow=False, generatefile='work/fpga_caffe.cmds', godreplication=None, lasttensorbyname=None, loadpickle=None, manasadebugmode=False, memory=5, networkfile='/home/centos/models/caffe/inception_v3/fp32/inception_v3_deploy.prototxt', nodynamicscaling=False, noreplication=False, parallelism=False, parallelismstrategy="['bottom', 'tops']", parallelread=None, phase='TEST', pipelineconvmaxpool=False, pngfile=None, poolingaround=False, rankdir='BT', savepickle=None, schedulefile=None, strategy='all', verbose=False, versionjson=None, weights='/home/centos/models/caffe/inception_v3/fp32/inception_v3.caffemodel')
Network: /home/centos/models/caffe/inception_v3/fp32/inception_v3_deploy.prototxt
GenerateCode: work/fpg

In [None]:
# Show the collected inference timings as a table.
# pd.DataFrame(...) turns the per-batch-size records into columns; note that
# this rebinds Inference_Data from the list built above to a DataFrame.
Inference_Data = pd.DataFrame(Inference_Data)
# Bare last expression so the notebook renders the DataFrame.
Inference_Data

In [None]:
import matplotlib.path as mpath

# Throughput (images per second) for each benchmarked batch size.
f, ax = plt.subplots(figsize=(15, 10))
ax.set_title('FPGA inference throughput vs. batch size')
ax.set_xlabel("Batch Size")
ax.set_ylabel('Images Processed per second')
# One tick per benchmarked batch size, labelled with that batch size
ax.set_xticks(Inference_Data['batchSize'])
ax.set_xticklabels(Inference_Data['batchSize'])
# Throughput from the time measured inside runOnFPGA (execute + FC + softmax)
ax.plot(Inference_Data['batchSize'],Inference_Data['imgsPerSecAc'],marker='x', markersize=10,
        label='Execution only (imgsPerSecAc)')
# Throughput from the whole runOnFPGA call, including weight transfer and host allocation
ax.plot(Inference_Data['batchSize'],Inference_Data['imgsPerSec'],marker='x', markersize=10,
        label='Including setup (imgsPerSec)')
ax.legend()
plt.show()