forked from IntelPython/scikit-learn_bench
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request IntelPython#12 from IntelPython/tc/feature/streaming
feature/streaming
- Loading branch information
Showing
21 changed files
with
702 additions
and
51 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,22 @@ | ||
# Test driver: run the unit-test suite, then the examples, and exit with a
# combined status (0 only if both phases succeeded).

import os
import sys
import unittest

# On Windows the libfabric DLLs shipped in the conda package are not on the
# default DLL search path; extend PATH so they can be found at import time.
if sys.platform in ['win32', 'cygwin']:
    os.environ['PATH'] = ';'.join([os.environ['PATH'],
                                   os.path.join(os.environ['CONDA_PREFIX'],
                                                'Library', 'bin', 'libfabric')])

here = os.path.abspath(os.path.dirname(__file__))
ex_dir = os.path.join(here, "examples")

from examples.run_examples import run_all
from tests.test_examples import Test

# Run the unit tests.  Use the result object returned by run():
# TextTestRunner._makeResult() creates a *fresh*, empty TestResult, so
# calling it after the run would always report success.
suite = unittest.defaultTestLoader.discover('tests')
runner = unittest.TextTestRunner()
result = runner.run(suite)
ret1 = 0 if result.wasSuccessful() else 1

# Run the examples from their own directory (they use relative data paths).
os.chdir(ex_dir)
ret2 = run_all(True)

# Non-zero exit if either the tests or the examples failed.
sys.exit(ret1 + ret2)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
.. _streaming: | ||
|
||
############## | ||
Streaming Data | ||
############## | ||
For large quantities of data it might be impossible to provide all input data at | ||
once. This might be because the data resides in multiple files and merging it is | ||
too costly (or not feasible in other ways). In other cases the data is simply too | ||
large to be loaded completely into memory. Or, the data might come in as an | ||
actual stream. daal4py's streaming mode allows you to process such data. | ||
|
||
Besides supporting certain use cases, streaming also allows interleaving I/O | ||
operations with computation. | ||
|
||
daal4py's streaming mode is as easy as follows: | ||
|
||
1. When constructing the algorithm configure it with ``streaming=True``:: | ||
|
||
algo = daal4py.svd(streaming=True) | ||
2. Repeat calling ``compute(input-data)`` with chunks of your input (arrays or | ||
files):: | ||
|
||
for f in input_files: | ||
algo.compute(f) | ||
3. When all input has been provided, call ``finalize()`` to obtain the result:: | ||
|
||
result = algo.finalize() | ||
|
||
The streaming algorithms also accept arrays as input, e.g. the data can come | ||
from a stream rather than from multiple files. Here is an example which | ||
simulates a data stream using a generator which reads a file in chunks: | ||
`SVD reading stream of data <https://github.com/IntelPython/daal4py/blob/master/examples/stream.py>`_ | ||
|
||
Supported Algorithms and Examples | ||
--------------------------------- | ||
The following algorithms support streaming: | ||
|
||
- SVD (svd) | ||
|
||
- `SVD <https://github.com/IntelPython/daal4py/blob/master/examples/svd_streaming.py>`_ | ||
|
||
- Linear Regression Training (linear_regression_training) | ||
|
||
- `Linear Regression <https://github.com/IntelPython/daal4py/blob/master/examples/linear_regression_stream.py>`_ | ||
|
||
- Ridge Regression Training (ridge_regression_training) | ||
|
||
- `Ridge Regression <https://github.com/IntelPython/daal4py/blob/master/examples/ridge_regression_stream.py>`_ | ||
|
||
- Multinomial Naive Bayes Training (multinomial_naive_bayes_training) | ||
|
||
- `Naive Bayes <https://github.com/IntelPython/daal4py/blob/master/examples/naive_bayes_stream.py>`_ | ||
|
||
- Moments of Low Order | ||
|
||
- `Low Order Moments <https://github.com/IntelPython/daal4py/blob/master/examples/low_order_moms_dense_streaming.py>`_ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,82 @@ | ||
#******************************************************************************* | ||
# Copyright 2014-2018 Intel Corporation | ||
# All Rights Reserved. | ||
# | ||
# This software is licensed under the Apache License, Version 2.0 (the | ||
# "License"), the following terms apply: | ||
# | ||
# You may not use this file except in compliance with the License. You may | ||
# obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT | ||
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
#******************************************************************************* | ||
|
||
# daal4py Linear Regression example for streaming on shared memory systems | ||
|
||
import daal4py as d4p | ||
import numpy as np | ||
|
||
# let's try to use pandas' fast csv reader; fall back to numpy otherwise
try:
    import pandas

    def read_csv(f, c, s=0, n=None):
        """Read columns *c* of CSV file *f*, skipping *s* rows, up to *n* rows.

        Returns a 2-D float64 ndarray (one column per entry in *c*).
        """
        return pandas.read_csv(f, usecols=c, delimiter=',', header=None,
                               skiprows=s, nrows=n, dtype=np.float64).values
except ImportError:
    # fall back to numpy genfromtxt; only a missing pandas should trigger
    # this — a bare except would also hide real errors (or Ctrl-C)
    def read_csv(f, c, s=0, n=np.iinfo(np.int64).max):
        a = np.genfromtxt(f, usecols=c, delimiter=',', skip_header=s, max_rows=n)
        # a single scalar value comes back 0-d; promote so shape[0] is valid
        a = np.atleast_1d(a)
        if a.shape[0] == 0:
            # signals end-of-data to streaming callers, which catch and stop
            raise Exception("done")
        if a.ndim == 1:
            return a[:, np.newaxis]
        return a
|
||
def main():
    """Train linear regression in streaming mode, then predict on test data.

    Feeds the training CSV to daal4py chunk by chunk and finalizes the model
    when the input is exhausted.

    Returns:
        (train_result, predict_result, ptdata) — training result, prediction
        result, and the ground-truth responses read from the test file.
    """
    infile = "./data/batch/linear_regression_train.csv"
    testfile = "./data/batch/linear_regression_test.csv"

    # Configure a Linear regression training object for streaming
    train_algo = d4p.linear_regression_training(interceptFlag=True, streaming=True)

    chunk_size = 250
    lines_read = 0
    # number of dependent variables actually read; tracked explicitly instead
    # of relying on the loop variable leaking out of the while-loop (which
    # would raise NameError if the very first read failed)
    n_deps = 0
    # read and feed chunk by chunk
    while True:
        # Read data in chunks
        # Let's have 10 independent, and 2 dependent variables (for each observation)
        try:
            indep_data = read_csv(infile, range(10), lines_read, chunk_size)
            dep_data = read_csv(infile, range(10, 12), lines_read, chunk_size)
        except Exception:
            # read_csv raises when no rows remain — end of the input stream
            break
        # Now feed chunk
        train_algo.compute(indep_data, dep_data)
        lines_read += indep_data.shape[0]
        n_deps = dep_data.shape[1]

    # All chunks are done, now finalize the computation
    train_result = train_algo.finalize()

    # Now let's do some prediction
    predict_algo = d4p.linear_regression_prediction()
    # read test data (with same #features)
    pdata = read_csv(testfile, range(10))
    ptdata = read_csv(testfile, range(10, 12))
    # now predict using the model from the training above
    predict_result = predict_algo.compute(pdata, train_result.model)

    # The prediction result provides one prediction per test row and response
    assert predict_result.prediction.shape == (pdata.shape[0], n_deps)

    return (train_result, predict_result, ptdata)
|
||
|
||
# Script entry point: train, predict, and print a sample of the results.
if __name__ == "__main__":
    train_result, predict_result, ptdata = main()
    beta = train_result.model.Beta
    print("\nLinear Regression coefficients:\n", beta)
    print("\nLinear Regression prediction results: (first 10 rows):\n", predict_result.prediction[:10])
    print("\nGround truth (first 10 rows):\n", ptdata[:10])
    print('All looks good!')
Oops, something went wrong.