# Random Projection + Linear Regression Pipeline

In [1]:
import numpy as np
from sklearn import datasets
from sklearn import pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from pynq_sklearn.linear_model import PynqLinearRegression
from pynq_sklearn.random_projection import PynqBinaryRandomProjection
import timeit

# Synthetic multi-output regression problem (5000 samples, 128 features, 10 targets).
X, y = datasets.make_regression(
    n_samples=5000,
    n_features=128,
    n_targets=10,
    bias=10.0,
    noise=4.0,
    random_state=0,
)
# Hold out 1000 samples as the test split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=1000, random_state=42
)
# Fixed-point copy of X_test: scale by 2**FRAC_WIDTH, then cast to int32.
FRAC_WIDTH = 20
X_test_hw = (X_test * 2**FRAC_WIDTH).astype(np.int32)

### 1. Software Pipeline

In [2]:
# Both stages are created with hw_accel=False, so this pipeline runs in software.
rp_sw = PynqBinaryRandomProjection(hw_accel=False)
lr_sw = PynqLinearRegression(fit_intercept=True, hw_accel=False)

# Stage 1: dimensionality reduction, stage 2: linear regression.
sw_pipe = pipeline.Pipeline(
    steps=[
        ("dim_red", rp_sw),
        ("reg", lr_sw),
    ]
)
# Pipeline.fit returns the pipeline itself, so fitting and predicting chain.
ypred_sw = sw_pipe.fit(X_train, y_train).predict(X_test)

In [3]:
number = 200  # how many times timeit invokes the prediction


def swresp():
    """Run one software-pipeline prediction over X_test.

    The prediction is discarded on purpose: only the elapsed time matters
    here, and binding it to a local named ``ypred_sw`` would merely shadow
    the module-level reference result of the same name.
    """
    sw_pipe.predict(X_test)


print("Running the benchmark")
sw_time = timeit.timeit(swresp, number=number)  # total seconds for all runs
print("Time taken by sw_pipe", number, "times", sw_time)

Running the benchmark
Time taken by sw_pipe 200 times 7.99265790499976


### 2. Hardware Pipeline

<img src="imgs/pipe_problem.jpg">
<img src="imgs/pipe_slide.jpg">

### 1.) HW Only Pipeline

###### i.)  fit() is done in software

In [4]:
# Both stages start with hw_accel=False, so fit() below runs in software.
rp = PynqBinaryRandomProjection(hw_accel=False)
lr = PynqLinearRegression(fit_intercept=True, hw_accel=False)

hw_pipe = pipeline.Pipeline(
    steps=[
        ("dim_red", rp),
        ("lreg", lr),
    ]
)
# Left as the cell's last expression so the fitted pipeline is displayed.
hw_pipe.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('dim_red', PynqBinaryRandomProjection(hw_accel=False)), ('lreg', PynqLinearRegression(hw_accel=False))])

###### ii.) Explicitly set_params so that hw_accel=True

In [5]:
# Turn hw_accel on for both pipeline stages (nested <step>__<param> syntax).
hw_pipe.set_params(
    dim_red__hw_accel=True,
    lreg__hw_accel=True,
)

Pipeline(memory=None,
     steps=[('dim_red', PynqBinaryRandomProjection(hw_accel=True)), ('lreg', PynqLinearRegression(hw_accel=True))])

###### iii.) Copy X_test_hw to physically contiguous memory for better performance. If scatter-gather DMA is used (i.e. multi_sg.bit), this step is optional.

In [6]:
# Re-allocate the fixed-point test data in contiguous memory (int32 copy).
X_test_hw = rp.copy_array(
    X_test_hw,
    dtype=np.int32,
)

###### iv.) Offload transform() and predict() to HW for both PynqBinaryRandomProjection and PynqLinearRegression

In [7]:
# Predict from the fixed-point input.
ypred = hw_pipe.predict(X_test_hw)
# Undo the fixed-point scaling: multiply by 2**-FRAC_WIDTH.
ypred_hw = ypred * (2.0 ** -FRAC_WIDTH)
# Element-wise difference against the software reference predictions.
ypred_sw - ypred_hw

ContiguousArray([[-1.84161023e-04,  2.91271868e-04, -1.86592293e-05, ...,
                   1.10403365e-04,  4.23305522e-05, -1.53918539e-04],
                 [-2.73144498e-04, -2.16451519e-04, -4.21972693e-04, ...,
                  -2.96949725e-04,  1.14355569e-04, -3.77841649e-04],
                 [ 6.09778628e-04,  2.37982448e-04,  2.12570337e-04, ...,
                   3.98578975e-04,  3.48527769e-04,  2.10138772e-04],
                 ...,
                 [ 4.52665271e-04,  7.61498667e-04,  5.02601490e-04, ...,
                   5.76496977e-04,  3.82906743e-04,  3.01686836e-04],
                 [ 4.72639795e-05,  1.32685996e-05,  1.37225285e-04, ...,
                   7.35078807e-04,  4.61792391e-04,  4.89451484e-04],
                 [ 9.41509175e-04,  4.26235360e-04,  3.47638286e-04, ...,
                   3.36902809e-04,  3.86004431e-04,  4.99078159e-04]])

###### v.) Measure the pipeline performance 

In [8]:
number = 200  # how many times timeit invokes the prediction


def hwresp():
    """Run one hw_pipe prediction over X_test_hw.

    The prediction is discarded on purpose: only the elapsed time matters
    here, and binding it to a local named ``ypred`` would merely shadow the
    module-level variable of the same name.
    """
    hw_pipe.predict(X_test_hw)


print("Running the benchmark")
hw_time = timeit.timeit(hwresp, number=number)  # total seconds for all runs
print("Time taken by hw_pipe", number, "times", hw_time)
# Speedup is the ratio of the two total times; sw_time comes from the
# software benchmark cell, which must have been run first.
print("HW Speedup = %.2fx" % (sw_time / hw_time))

Running the benchmark
Time taken by hw_pipe 200 times 0.5885646269998688
HW Speedup = 13.58x


### 2.) HW/SW Pipeline
Only works with the "multi_sg.bit" bitstream. This bitstream/library uses scatter-gather DMA to transfer the input data from PS to PL, so we don't have to explicitly copy the numpy array into physically contiguous memory.

###### i.) Explicitly set_params so that hw_accel=True only for lreg accelerator

In [10]:
# Stage 1 (dim_red) back to software; only stage 2 (lreg) keeps hw_accel on.
hw_pipe.set_params(
    dim_red__hw_accel=False,
    lreg__hw_accel=True,
)

Pipeline(memory=None,
     steps=[('dim_red', PynqBinaryRandomProjection(hw_accel=False)), ('lreg', PynqLinearRegression(hw_accel=True))])

###### ii.) Calling predict() offloads only PynqLinearRegression to HW. Because PynqBinaryRandomProjection is stage 1 and is computed in SW, the pipeline input is floating point and non-contiguous. The output of stage 1 is converted to fixed point before stage 2. This significantly reduces performance, but the pipeline still works.

In [12]:
# dim_red now runs in software, so the pipeline is fed the floating-point
# X_test rather than a fixed-point copy.
# NOTE(review): this rebinds X_test_hw to float data; rebuild the fixed-point
# X_test_hw before going back to the HW-only pipeline.
X_test_hw = X_test
ypred = hw_pipe.predict(X_test_hw)
# Undo the fixed-point scaling: multiply by 2**-FRAC_WIDTH.
ypred_hw = ypred * (2.0 ** -FRAC_WIDTH)
# Element-wise difference against the software reference predictions.
ypred_sw - ypred_hw

ContiguousArray([[-5.82392614e-06,  5.16957336e-06, -9.12248610e-06, ...,
                  -1.45279705e-05,  2.27623091e-06, -1.56357627e-05],
                 [ 1.96335167e-05,  2.67354312e-05,  2.72079101e-05, ...,
                   4.54193547e-05,  4.56910182e-05,  2.93772837e-05],
                 [ 3.47130152e-05,  1.19616351e-05,  2.94648687e-05, ...,
                   2.28312941e-05,  1.76027816e-05,  1.94039087e-05],
                 ...,
                 [ 1.39750859e-05,  3.76598613e-05,  2.09959602e-05, ...,
                   1.66901534e-05,  1.28811085e-05,  2.41676096e-05],
                 [ 2.62831445e-05,  4.68553073e-06,  3.70894816e-05, ...,
                   8.84876206e-05,  4.50367152e-05,  7.46031562e-05],
                 [ 2.69355056e-05,  1.80627521e-05,  6.22288098e-06, ...,
                  -4.51259649e-06,  2.07471683e-05,  3.06491188e-07]])