From 2b07e30f01f5207e30df9f117866339a7095b957 Mon Sep 17 00:00:00 2001 From: mxz96102 Date: Mon, 22 Apr 2019 13:53:47 +0800 Subject: [PATCH 1/3] update sqlflow doc --- docs/assets/js/search-data.json | 34 +++---- docs/feed.xml | 2 +- .../thrift.git/lib/go/thrift/index.html | 13 --- .../thrift/lib/go/thrift/index.html | 13 --- .../service-rpc/gen-go/tcliservice/index.html | 13 --- .../thrift.git/lib/go/thrift/index.html | 13 --- docs/sitemap.xml | 22 +---- docs/sqlflow/doc/quickstart.md | 93 +++++++++++++++++++ 8 files changed, 114 insertions(+), 89 deletions(-) delete mode 100644 docs/gohive/git.apache.org/thrift.git/lib/go/thrift/index.html delete mode 100644 docs/gohive/git.apache.org/thrift/lib/go/thrift/index.html delete mode 100644 docs/gohive/service-rpc/gen-go/tcliservice/index.html delete mode 100644 docs/gohive/vendor/git.apache.org/thrift.git/lib/go/thrift/index.html create mode 100644 docs/sqlflow/doc/quickstart.md diff --git a/docs/assets/js/search-data.json b/docs/assets/js/search-data.json index 1a732b7..fea32c6 100644 --- a/docs/assets/js/search-data.json +++ b/docs/assets/js/search-data.json @@ -29,53 +29,53 @@ }, "4": { "id": "4", - "title": "Quick start", - "content": "Quick start SQLFlow is currently under active development. For those who are interested in trying it out, we have provided several demos. Play around with it. Any bug report and issue are welcomed. :) Setup Install Docker. Set up a MySQL server following example/datasets/README.md. Pull the latest SQLFlow Docker image: docker pull sqlflow/sqlflow:latest. Demo 1: Jupyter Notebook Start a Docker container that runs sqlflowserver and Jupyter Notebook. If you are using Docker for Linux, please change host.docker.internal:3306 to localhost:3306. docker run --rm -it -p 8888:8888 sqlflow/sqlflow:latest bash -c "sqlflowserver --db_user root --db_password root --db_address host.docker.internal:3306 & SQLFLOW_SERVER=localhost:50051 jupyter notebook --ip=0.0.0.0 --port=8888 --allow-root" If you are using Docker for Mac, please be aware the option --db_address host.docker.internal:3306 where host.docker.internal translates to the host ip address as recommended here. If you are running MySQL on remote, please be aware that MySQL only allows connections from localhost by default. Fix can be found here. Open a Web browser and direct to localhost:8888 and input the token. Then you can create notebooks. In a cell, you should be able to type %%sqlflow select 1 Explore more examples at example.ipynb Demo 2: Command Line Prompt Start a Docker container that runs SQLFlow command line prompt. If you are using Docker for Linux, please change host.docker.internal:3306 to localhost:3306. docker run -it --rm --net=host sqlflow/sqlflow:latest demo --db_user root --db_password root --db_address host.docker.internal:3306 You should be able to see the following prompt. sqlflow> Example Select data sqlflow> select * from iris.train limit 2; -- +--+-+--+-+-+ | SEPAL LENGTH | SEPAL WIDTH | PETAL LENGTH | PETAL WIDTH | CLASS | +--+-+--+-+-+ | 6.4 | 2.8 | 5.6 | 2.2 | 2 | | 5 | 2.3 | 3.3 | 1 | 1 | +--+-+--+-+-+ Train a Tensorflow DNNClassifier sqlflow> SELECT * FROM iris.train TRAIN DNNClassifier WITH n_classes = 3, hidden_units = [10, 20] COLUMN sepal_length, sepal_width, petal_length, petal_width LABEL class INTO sqlflow_models.my_dnn_model; -- ... 
Training set accuracy: 0.96721 Done training Prediction using a trained model sqlflow> SELECT * FROM iris.test predict iris.predict.class USING sqlflow_models.my_dnn_model; Checkout prediction result sqlflow> select * from iris.predict limit 10;", - "url": "/sqlflow/doc/quickstart.html", - "relUrl": "/sqlflow/doc/quickstart.html" - }, - "5": { - "id": "5", "title": "Build TensorFlow from Source Code using Docker", "content": "Build TensorFlow from Source Code using Docker To contribute to TensorFlow, we need to build TensorFlow from source code. The official guide is great. However, it interleaves the native building process and that using Docker and makes it confusing because packages needed by the former are not by the latter. Also, we found some useful tricks to start Docker containers in practices that are not in the official guide. Hence this document. Build the Pip Package in TensorFlow Development Container On either Mac or Linux, or any other OS, we don’t have to install and configure the building tools; instead, we can use a Docker image where all tools have been installed and properly configured. Get the Docker image containing all the building tools: docker pull tensorflow/tensorflow:latest-devel Then, let’s get the source code. On any OS, please install git using the native package manager. For example, on Ubuntu, please sudo apt-get install git or, on Mac, brew install git Then, use the git just installed, let’s clone tensorflow source code: git clone --recursive https://github.com/tensorflow/tensorflow cd tensorflow By default, we will be on the master branch. Feel free to do you change in your feature branches, or switch to a release branch, for example: git checkout v1.11.0 git checkout -b v1.11.0 Then, let us start a Docker container running the tensorflow/tensorflow:latest-devel image: docker run --rm -it -w /tensorflow -v $PWD:/tensorflow -v $HOME/.cache:/root/.cache -e "HOST_PERMS=$(id -u):$(id -g)" tensorflow/tensorflow:latest-devel /bin/bash -w /tensorflow brings us to the /tensorflow directory in the container once after we start it. -v $PWD:/tensorflow maps the current directory, which is the just cloned TensorFlow source directory on the host, to /tensorflow in the container. -v $HOME/.cache:/root/.cache maps the Bazel temporary directory on the host into the container, so the intermediate files generated by Bazel running in the container are actually saved on the host. This allows us to interrupt the container during the build and restart it later to resume the building. e "HOST_PERMS=$(id -u):$(id -g)" passes the user identification on the host into the container as an environment variable. We can reset the mode of files generated in the container to this user identity. From now on, we will be working in the container. Let us first configure the building: ./configure Usually, I would simply choose all the default options by hitting enter all the way down. Build the pip package builder: bazel build --config=opt //tensorflow/tools/pip_package:build_pip_package Build the pip package and save it into /tensorflow: ./bazel-bin/tensorflow/tools/pip_package/build_pip_package /tensorflow Change the generated wheel file into the usual mode as on the host: chown $HOST_PERMS /mnt/tensorflow-*.whl Install the Pip Pacakge in TensorFlow Development Container Let’s now try the new pip package. 
First, we need to uninstall the current tensorflow pip package, then we install the newly built one: pip uninstall tensorflow pip install /tensorflow/tensorflow-*.whl Now, we can verify if the new package works. First, we need to switch to a directory out from /tensorflow, so we don’t import from the source directory: cd /tmp # other directories also work then, we can try import the newly installed tensorflow package and verify it: python >>> import tensorflow >>> print(tensorflow.__version__) Now, let us quit from Python and from the Docker container. We should see the tensorflow-*.whl file on the host in the current directory. Install the Pip Package in a Clean Python Package After we quit from the development container, we should see the wheel file in the TensorFlow source directory on the host. Now, we can start a Python container and install the Pip package in it. Start the Python container docker run --rm -it -v $PWD:/tensorflow python:2.7 bash Install the pip pacakge in the container pip install /tensorflow/tensorflow*.whl Try TensorFlow by starting Python in the container python >>> import tensorflow as tf >>> print(tf.__version__)", "url": "/sqlflow/doc/build-tensorflow.html", "relUrl": "/sqlflow/doc/build-tensorflow.html" }, - "6": { - "id": "6", + "5": { + "id": "5", "title": "Canonical Development Environment", "content": "Canonical Development Environment Referring to this example, we create a canonical development environment for Go and Python programmers using Docker. Editing on Host When we use this Docker image for daily development work, the source code relies on the host computer instead of the container. The source code includes this repo and all its dependencies, for example, the Go package google.golang.org/grpc. Code-on-the-host allows us to run our favorite editors (Emacs, VIM, Eclipse, and more) on the host. Please free to rely on editors add-ons to analyze the source code for auto-completion. Building in Container We build a Docker image that contains development tools: The Python interpreter The Go compiler The protobuf compiler The protobuf to Go compiler extension The protobuf to Python compiler extension Because this repo contains Go code, please make sure that you have the directory structure required by Go. On my laptop computer, I have export GOPATH=$HOME/go You could have your $GOPATH pointing to any directory you like. Given $GOPATH$ set, we could git clone the source code of our project by running: go get github.com/sql-machine-learning/sqlflow Change the directory to our project root, and we can use go get to retrieve and update Go dependencies. cd $GOPATH/src/github.com/sql-machine-learning/sqlflow go get -u -t ./... Note -t instructs get to also download the packages required to build the tests for the specified packages. As all Git users would do, we run git pull from time to time to sync up with others’ work. If somebody added new dependencies, we might need to run go -u ./... after git pull to update dependencies. To build this project, we need the protobuf compiler, Go compiler, Python interpreter, gRPC extension to the protobuf compiler. To ease the installation and configuration of these tools, we provided a Dockerfile to install them into a Docker image. To build the Docker image: docker build -t sqlflow:dev -f Dockerfile.dev . Development Build and Test We build and test the project inside the docker container. 
To run the container, we need to map the $GOPATH directory on the host into the /go directory in the container, because the Dockerfile configures /go as the $GOPATH in the container: docker run --rm -it -v $GOPATH:/go -w /go/src/github.com/sql-machine-learning/sqlflow sqlflow:dev bash Inside the Docker container, start a MySQL server in the background service mysql start& run all the tests as go generate ./... go install ./... go test -v ./... where go generate invokes the protoc command to translate server/sqlflow.proto into server/sqlflow.pb.go and go test -v builds and run unit tests. Release The above build process currently generates two binary files in $GOPATH/bin on the host. To package them into a Docker image, please run docker build -t sqlflow -f ./Dockerfile $GOPATH/bin To publish the released Docker image to our official DockerHub docker tag sqlflow sqlflow/sqlflow:latest docker push sqlflow/sqlflow:latest Demo: Command line Prompt The demo requires a MySQL server instance with populated data. If we don’t, we could follow example/datasets/README.md to start one on the host. After setting up MySQL, run the following inside the Docker container go run cmd/demo/demo.go --db_user root --db_password root --db_address host.docker.internal:3306 You should be able to see the following prompt sqlflow>", "url": "/sqlflow/doc/build.html", "relUrl": "/sqlflow/doc/build.html" }, - "7": { - "id": "7", + "6": { + "id": "6", "title": "Closing the producer goroutine from the consumer", "content": "Closing the producer goroutine from the consumer The producer-and-consumer pattern is well used in Go concurrent programming. When the consumer stops, we want to gracefully stop the producer as well. Problem When a gRPC server receives a streaming request, it usually calls a function that returns a channel, reads the result from that channel and send the result to the client one by one. Take the following code for instance: upon receiving a request, the main goroutine Service calls launchJob. launchJob starts a separate goroutine as an anonymous function call and returns a channel. In the anonymous function, items will be sent to channel. And Service on the otherside of the channel will reads from it. func Service(req *Request, stream *StreamResponse) error { result := launchJob(req.Content) for r := range result { if e := stream.Send(result); e != nil { // should we signal the running goroutine so it will stop sending? return e } } } func launchJob(content string) chan Item { c := make(chan Item) go func() { defer close(c) acquireScarceResources() defer releaseScarceResources() ... // if stream.Send(result) returns an error and the Service returns, this will be blocked c <- Item{} ... }() return c } There is a major problem in this implementation. As pointed out by the comment, if the Send in Service returns an error, the Service function will return, leaving the anonymous function being blocked on c <- Item{} forever. This problem is important because the leaking goroutine usually owns scarce system resources such as network connection and memory. Solution: pipeline explicit cancellation Inspired by this blog post section Explicit cancellation, we can signal the cancellation via closing on a separate channel. And we can follow the terminology as io.Pipe. 
package sql import ( "errors" ) var ErrClosedPipe = errors.New("pipe: write on closed pipe") // pipe follows the design at https://blog.golang.org/pipelines // - wrCh: chan for piping data // - done: chan for signaling Close from Reader to Writer type pipe struct { wrCh chan interface{} done chan struct{} } // PipeReader reads real data type PipeReader struct { p *pipe } // PipeWriter writes real data type PipeWriter struct { p *pipe } // Pipe creates a synchronous in-memory pipe. // // It is safe to call Read and Write in parallel with each other or with Close. // Parallel calls to Read and parallel calls to Write are also safe: // the individual calls will be gated sequentially. func Pipe() (*PipeReader, *PipeWriter) { p := &pipe{ wrCh: make(chan interface{}), done: make(chan struct{})} return &PipeReader{p}, &PipeWriter{p} } // Close closes the reader; subsequent writes to the func (r *PipeReader) Close() { close(r.p.done) } // ReadAll returns the data chan. The caller should // use it as `for r := range pr.ReadAll()` func (r *PipeReader) ReadAll() chan interface{} { return r.p.wrCh } // Close closes the writer; subsequent ReadAll from the // read half of the pipe will return a closed channel. func (w *PipeWriter) Close() { close(w.p.wrCh) } // Write writes the item to the underlying data stream. // It returns ErrClosedPipe when the data stream is closed. func (w *PipeWriter) Write(item interface{}) error { select { case w.p.wrCh <- item: return nil case <-w.p.done: return ErrClosedPipe } } And the consumer and producer be can implemented as func Service(req *Request, stream *StreamResponse) error { pr := launchJob(req.Content) defer pr.Close() for r := range pr.ReadAll() { if e := stream.Send(r); e != nil { return e } } } func launchJob(content string) PipeReader { pr, pw := Pipe() go func() { defer pw.Close() if err := pw.Write(Item{}); err != nil { return } } return pr } Further Reading Google Form: Channel send timeout Go by Example: Timeouts Google I/O 2013 - Advanced Go Concurrency Patterns Go Concurrency Patterns Talk Go Concurrency Patterns: Pipelines and cancellation", "url": "/sqlflow/doc/close_producer_from_consumer.html", "relUrl": "/sqlflow/doc/close_producer_from_consumer.html" }, - "8": { - "id": "8", + "7": { + "id": "7", "title": "Compatibility with Various SQL Engines", "content": "Compatibility with Various SQL Engines SQLFlow interacts with SQL engines like MySQL and Hive, while different SQL engines use variants of SQL syntax, it is important for SQLFlow to have an abstraction layer that hides such differences. SQLFlow calls Go’s standard database API. The submitter programs generated by SQLFlow call Python’s database API. Both APIs abstract the interface to various SQL engines; however, they are insufficient for SQLFlow to work. In this document, we examine all interactions between SQLFlow and the SQL engine so to identify what SQLFlow authors have to abstract in addition to calling Go’s and Python’s database APIs. Data Operations in Go Data Retrieval The basic idea of SQLFlow is to extend the SELECT statement of SQL to have the TRAIN and PREDICT clauses. For more discussion, please refer to the syntax design. SQLFlow translates such “extended SQL statements” into submitter programs, which forward the part from SELECT to TRAIN or PREDICT, which we call the “standard part”, to the SQL engine. SQLFlow also accepts the SELECT statement without TRAIN or PREDICt clauses and would forward such “standard statements” to the engine. 
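To make the forwarding idea above concrete, here is a minimal Go sketch, not SQLFlow's actual implementation (its real parser is covered in the Extended SQL Parser Design doc): it simply cuts an extended statement at the TRAIN or PREDICT keyword and returns the standard part to forward to the SQL engine. The `splitExtendedStmt` helper and the sample statement are illustrative assumptions only.

```go
package main

import (
	"fmt"
	"strings"
)

// splitExtendedStmt is a toy illustration: everything before the TRAIN or
// PREDICT keyword is treated as the "standard part" to forward to the SQL
// engine, and the rest is the ML extension handled by SQLFlow itself.
func splitExtendedStmt(stmt string) (standard, extension string) {
	upper := strings.ToUpper(stmt)
	for _, kw := range []string{" TRAIN ", " PREDICT "} {
		if i := strings.Index(upper, kw); i >= 0 {
			return strings.TrimSpace(stmt[:i]), strings.TrimSpace(stmt[i:])
		}
	}
	// No extension found: a standard statement, forwarded unchanged.
	return strings.TrimSpace(stmt), ""
}

func main() {
	std, ext := splitExtendedStmt(
		"SELECT * FROM iris.train TRAIN DNNClassifier WITH n_classes = 3 LABEL class INTO sqlflow_models.my_dnn_model;")
	fmt.Println("standard part:", std)
	fmt.Println("extension:", ext)
}
```

Running this prints `SELECT * FROM iris.train` as the standard part, which is exactly the piece the submitter would hand to the engine unmodified.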
It is noticeable that the “standard part” or “standard statements” are not standardized. For example, various engines use different syntax for joining. MySQL: SELECT pet.name, comment FROM pet, event WHERE pet.name =event.name; with keyword WHERE . Hive: SELECT pet.name, comment FROM pet JOIN event ON (pet.name =event.name) with keyword JOIN and ON. ODPS and SQLite use either INNER JOIN or OUTER JOIN. Fortunately, as SQLFlow forwards the above parts to the engine, it doesn’t have to care much about the differences above. Metadata Retrieval To verify the semantics of users’ inputs, SQLFlow needs to retrieve the schema of tables. For example, the input might be SELECT name, age, income FROM employee TRAIN DNNRegressor WITH hidden_layers=[10,50,10] COLUMN name, agee LABEL income; In the above example, the user misspelled the field name age in the COLUMN clause as “agee”. SQLFlow must be able to find that out. To do that, SQLFlow needs to query the field names from the SQL engine. However, different engines use various syntax. For example: MySQL: DESCRIBE/DESC employee; Hive: DESCRIBE FORMATTED employee; ODPS: DESC employee; SQLite: PRAGMA table_info([employee]); The returned data format varies too. Our solution to avoid such differences is not-to-use-them; instead, SQLFlow retrieves the table schema by running a query like SELECT * FROM employee LIMIT 1; and inferring field types using the mechanism called DatabaseTypeName provided by SQL engines drivers beneath the Go’s standard database API. Prepare Prediction Table A SQLFlow prediction job writes its prediction results into a table. It prepares the prediction table by Dropping previous prediction table DROP TABLE IF EXISTS my_table; Creating table with schema CREATE TABLE my_table (name1, type1, name2 type2); Most SQL engines, including MySQL, Hive, ODPS, SQLite, support both statements. Translate Database Column Type to TensorFlow Feature Column Type After retrieving database column type name through DatabaseTypeName, we can derive TensorFlow’s feature column type via a mapping such as {"FLOAT", "DOUBLE"} -> tf.numeric_column. Save Model SQLFlow saves trained ML model by dumping the serialized the model directory into a table. It first creates a table by CREATE TABLE IF NOT EXISTS %s (id INT AUTO_INCREMENT, block BLOB, PRIMARY KEY (id)) and insert blobs by INSERT INTO %s (block) VALUES(?). Note that Hive and ODPS doesn’t have BLOB type, we need to use BINARY (docs at ODPS, Hive) instead. Also, note that Hive and ODPS doesn’t support AUTO_INCREMENT, we need to implemented auto increment logic in sqlfs. Load Model SQLFlow loads trained ML model by reading rows in a table and deserializing the blob to a model directory. It reads rows by running SELECT block FROM %s ORDER BY id, which is supported by most databases. Data Operations in Python Connect to SQL Engines Thanks to the Python database API, connecting to different databases follows a similar API. conn = mysql.connector.connect(user='scott', password='password', host='127.0.0.1', database='employees') conn = sqlite3.connect('path/to/your/sqlite/file') conn = pyhive.connect('localhost') cursor = conn.cursor() cursor.execute('select * from my_table;') Insert Prediction Result into Prediction Table Python database API provides execute_many(sql, value) to insert multiple values at once. So one can prepare the following insertion statement. Please be aware that MySQL and SQLite use INSERT INTO to insert rows while Hive and ODPS use INSERT INTO TABLE. 
-- MySQL, SQLite INSERT INTO table_name VALUES (value1, value2, value3, ...); -- Hive, ODPS INSERT INTO TABLE table_name VALUES (value1, value2, value3, ...);", "url": "/sqlflow/doc/database_abstraction_layer.html", "relUrl": "/sqlflow/doc/database_abstraction_layer.html" }, - "9": { - "id": "9", + "8": { + "id": "8", "title": "Run MySQL Server and Client in Docker Containers", "content": "Run MySQL Server and Client in Docker Containers The document explains how to setup MySQL in our development environment. Run MySQL Server in a Docker Container docker run --rm -v /tmp/test1:/var/lib/mysql --name mysql01 -e MYSQL_ROOT_PASSWORD=root -e MYSQL_ROOT_HOST='%' -p 3306:3306 -d mysql/mysql-server:8.0 the -v option ensures that the database is saved on the host. The default directory where MySQL saves the database is /var/lib/mysql. This directory can be configured in /etc/mysql/my.cnf, as explained in this post. By overlaying the directory /tmp/test1 on the host to /var/lib/mysql, we “cheat” MySQL to save databases on the host. So, we can kill the container and restart it, and the database is still there. Please be aware that the directory on the host must be empty the first time we run the above command; otherwise, MySQL would fail to initialize. I figured out this problem after several failures using docker logs. the -e option sets the root password of MySQL to “root”. Feel free to set it to any password you like. the second -e options sets MYSQL_ROOT_HOST to a wildcard so to allow clients connecting to the server via TCP/IP as the user “root”. This trick works with MySQL 5.7 and 8.0, but not the most recent under-development version. the --name option names the container to mysql01, which can be used to refer to this container. the -p option maps the port 3306, on which the MySQL server listens, to the same port on the host, so that clients could connect to the server via TCP/IP. Run MySQL Client in the Server Container docker exec -it mysql01 mysql -uroot -p This command executes the command mysql, which is the command line tool of MySQL, in the container named mysql01. The command line flags of mysql include -u, which specifies the username of MySQL, and -p, which makes MySQL prompts for the password. For this example, we should type the password “root”, which was set in the previous command. Please wait for a few seconds after the starting of the MySQL server container before we execute the client; otherwise, the startup of the client might fail. Once we get into the MySQL client, we can type SQL commands, e.g., show databases; create database yi; Run Client in a Different Container on the Same Host docker run --rm -it -v /tmp/test1:/var/lib/mysql mysql/mysql-server:8.0 mysql -uroot -p The -v option maps the database directory on the host to the client container. This mapping is necessary because, by default, the client talks to the server via Unix socket /var/lib/mysql/mysql.sock, which is /tmp/test1/mysql.sock on the host. Run Client in a Container on a Remote Host docker run --rm -it mysql/mysql-server:8.0 mysql -h 192.168.1.3 -P 3306 -uroot -p the -h option tells the client where the server is running on. In this example, the given IP is the one of the host where I ran the MySQL server container. Please be aware that the above command works only if the server allows remote connections. Run Python Client in a Container To write a Python client, we need to install the Python package mysql-connector-python. 
FROM python:2.7 RUN pip install mysql-connector-python Please be aware that some documents says that we need to install mysql-connector. I tried; but the mysql.connector.connect call failed with the error mysql.connector.errors.NotSupportedError: Authentication plugin 'caching_sha2_password' is not supported. Build the Docker image: docker build -t sqlflow . Run the image: docker run --rm -it sqlflow bash and we can start Python and run the following Python code snippet >>> import mysql.connector >>> db = mysql.connector.connect(user='root', passwd='root', host='192.168.1.3') >>> print(db) <mysql.connector.connection_cext.CMySQLConnection object at 0x7fbab9f3fed0> Run a Go Client In order to connect to a database, you need to import the database’s driver first. export GOPATH=$HOME/go go get -u github.com/go-sql-driver/mysql go run the following file package main import ( "database/sql" "github.com/go-sql-driver/mysql" "log" ) func main() { testConfig := &mysql.Config{ User: "root", Passwd: "root", Net: "tcp", Addr: "localhost:3306", } db, e := sql.Open("mysql", testConfig.FormatDSN()) if e != nil { log.Fatal(e) } defer db.Close() }", "url": "/sqlflow/doc/mysql-setup.html", "relUrl": "/sqlflow/doc/mysql-setup.html" }, - "10": { - "id": "10", + "9": { + "id": "9", "title": "Piping Responses", "content": "Piping Responses Streaming Responses As described in the overall design, a SQLFlow job could be a standard or an extended SQL statemnt, where an extended SQL statement will be translated into a Python program. Therefore, each job might generate up to the following data streams: standard output, where each element is a line of text, standard error, where each element is a line of text, data rows, where the first element consists of fields name/types, and each of the rest is a row of data, status, where the element could be pending, failed, and succeeded. To create good user experience, we need to pipe these responses from SQLFlow jobs to Jupyter Notebook in real-time. Stages in the Pipe The pipe that streams outputs from SQLFlow jobs to the Jupyter Notebook consists of the following stages: Web browser ↑ | HTTP ↓ Jupyter Notebook server ↑ | ZeroMQ streams: Shell, IOPub, stdin, Controls, Heartbeat ↓ iPython kernel ↑ | IPython magic command framework ↓ SQLFlow magic command for Jupyter ↑ | gRPC ↓ SQLFlow server ↑ | Go channels ↓ SQLFlow job manager (Go functions) ↑ | IPC with Go's standard library ↓ SQLFlow jobs In the above figure, from the SQLFlow magic command to the bottom layer are our work. Streaming We have two alternative ideas: multiple streams and a multiplexing stream. We decided to use a multiplexing stream because we had a unsuccessful trial with the multiple streams idea: we make the job writes to various Go channels and forward each Go channel to a streaming gRPC call, as the following: Multiple streams The above figure shows that there are multiple streams between the Jupyter Notebook server and Jupyter kernels. According to the document, there are five: Shell, IOPub, stdin, Control, and Heartbeat. These streams are ZeroMQ streams. We don’t use ZeroMQ, but we can take the idea of having multiple parallel streams in the pipe. service SQLFlow { rpc File(string sql) returns (int id) {} rpc ReadStdout(int id) returns (stream string) {} rpc ReadStderr(int id) returns (stream string) {} rpc ReadData(int id) returns (stream Row) {} rpc ReadStatus(int id) returns (stream int) {} } However, we realized that if the user doesn’t call any one of the SQLFlow.Read... 
call, there would be no forwarding from the Go channel to Jupyter, thus the job would block forever at writing. A Multiplexing Stream Another idea is multiplexing all streams into one. For example, we can have only one ZeroMQ stream, where each element is a polymorphic type – could be a text string or a data row. service SQLFlow { rpc Run(string sql) returns (stream Response) {} } // Only one of the following fields should be set. message Response { oneof record { repeated string head = 1; // Column names. repeated google.protobuf.Any row = 2; // Cells in a row. string log = 3; // A line from stderr or stdout. } }", "url": "/sqlflow/doc/pipe.html", "relUrl": "/sqlflow/doc/pipe.html" }, + "10": { + "id": "10", + "title": "Quick start", + "content": "Quick start SQLFlow is currently under active development. For those who are interested in trying it out, we have provided several demos. Play around with it. Any bug report and issue are welcomed. :) Setup Install Docker. Set up a MySQL server following example/datasets/README.md. Pull the latest SQLFlow Docker image: docker pull sqlflow/sqlflow:latest. Demo 1: Jupyter Notebook Start a Docker container that runs sqlflowserver and Jupyter Notebook. If you are using Docker for Linux, please change host.docker.internal:3306 to localhost:3306. docker run --rm -it -p 8888:8888 sqlflow/sqlflow:latest bash -c "sqlflowserver --db_user root --db_password root --db_address host.docker.internal:3306 & SQLFLOW_SERVER=localhost:50051 jupyter notebook --ip=0.0.0.0 --port=8888 --allow-root" If you are using Docker for Mac, please be aware the option --db_address host.docker.internal:3306 where host.docker.internal translates to the host ip address as recommended here. If you are running MySQL on remote, please be aware that MySQL only allows connections from localhost by default. Fix can be found here. Open a Web browser and direct to localhost:8888 and input the token. Then you can create notebooks. In a cell, you should be able to type %%sqlflow select 1 Explore more examples at example.ipynb Demo 2: Command Line Prompt Start a Docker container that runs SQLFlow command line prompt. If you are using Docker for Linux, please change host.docker.internal:3306 to localhost:3306. docker run -it --rm --net=host sqlflow/sqlflow:latest demo --db_user root --db_password root --db_address host.docker.internal:3306 You should be able to see the following prompt. sqlflow> Example Select data sqlflow> select * from iris.train limit 2; -- +--+-+--+-+-+ | SEPAL LENGTH | SEPAL WIDTH | PETAL LENGTH | PETAL WIDTH | CLASS | +--+-+--+-+-+ | 6.4 | 2.8 | 5.6 | 2.2 | 2 | | 5 | 2.3 | 3.3 | 1 | 1 | +--+-+--+-+-+ Train a Tensorflow DNNClassifier sqlflow> SELECT * FROM iris.train TRAIN DNNClassifier WITH n_classes = 3, hidden_units = [10, 20] COLUMN sepal_length, sepal_width, petal_length, petal_width LABEL class INTO sqlflow_models.my_dnn_model; -- ... 
Training set accuracy: 0.96721 Done training Prediction using a trained model sqlflow> SELECT * FROM iris.test predict iris.predict.class USING sqlflow_models.my_dnn_model; Checkout prediction result sqlflow> select * from iris.predict limit 10;", + "url": "/sqlflow/doc/quickstart.html", + "relUrl": "/sqlflow/doc/quickstart.html" + }, "11": { "id": "11", "title": "Extended SQL Parser Design", diff --git a/docs/feed.xml b/docs/feed.xml index 842fe89..7bb1e90 100644 --- a/docs/feed.xml +++ b/docs/feed.xml @@ -1 +1 @@ -Jekyll2019-04-20T11:40:20+08:00/feed.xmlSQLFLOW \ No newline at end of file +Jekyll2019-04-22T13:40:11+08:00/feed.xmlSQLFLOW \ No newline at end of file diff --git a/docs/gohive/git.apache.org/thrift.git/lib/go/thrift/index.html b/docs/gohive/git.apache.org/thrift.git/lib/go/thrift/index.html deleted file mode 100644 index 9cd02cf..0000000 --- a/docs/gohive/git.apache.org/thrift.git/lib/go/thrift/index.html +++ /dev/null @@ -1,13 +0,0 @@ - - - - - - - - -

Redirecting to https://github.com/sql-machine-learning/gohive

- - diff --git a/docs/gohive/git.apache.org/thrift/lib/go/thrift/index.html b/docs/gohive/git.apache.org/thrift/lib/go/thrift/index.html deleted file mode 100644 index 9cd02cf..0000000 --- a/docs/gohive/git.apache.org/thrift/lib/go/thrift/index.html +++ /dev/null @@ -1,13 +0,0 @@ - - - - - - - - -

Redirecting to https://github.com/sql-machine-learning/gohive

- - diff --git a/docs/gohive/service-rpc/gen-go/tcliservice/index.html b/docs/gohive/service-rpc/gen-go/tcliservice/index.html deleted file mode 100644 index 9cd02cf..0000000 --- a/docs/gohive/service-rpc/gen-go/tcliservice/index.html +++ /dev/null @@ -1,13 +0,0 @@ - - - - - - - - -

Redirecting to https://github.com/sql-machine-learning/gohive

- - diff --git a/docs/gohive/vendor/git.apache.org/thrift.git/lib/go/thrift/index.html b/docs/gohive/vendor/git.apache.org/thrift.git/lib/go/thrift/index.html deleted file mode 100644 index 9cd02cf..0000000 --- a/docs/gohive/vendor/git.apache.org/thrift.git/lib/go/thrift/index.html +++ /dev/null @@ -1,13 +0,0 @@ - - - - - - - - -

Redirecting to https://github.com/sql-machine-learning/gohive

- - diff --git a/docs/sitemap.xml b/docs/sitemap.xml index 39003c7..704a204 100644 --- a/docs/sitemap.xml +++ b/docs/sitemap.xml @@ -13,9 +13,6 @@ / -/sqlflow/doc/quickstart.html - - /sqlflow/doc/build-tensorflow.html @@ -34,6 +31,9 @@ /sqlflow/doc/pipe.html +/sqlflow/doc/quickstart.html + + /sqlflow/doc/sql_parser.html @@ -73,14 +73,6 @@ /sqlflow/sqlfs/ -/gohive/git.apache.org/thrift.git/lib/go/thrift/ -2019-04-20T11:20:34+08:00 - - -/gohive/git.apache.org/thrift/lib/go/thrift/ -2019-04-20T11:20:34+08:00 - - /gohive/hiveserver2/ 2019-04-20T11:20:34+08:00 @@ -89,18 +81,10 @@ 2019-04-20T11:08:15+08:00 -/gohive/service-rpc/gen-go/tcliservice/ -2019-04-20T11:08:15+08:00 - - /gohive/thrift/ 2019-04-20T11:20:34+08:00 -/gohive/vendor/git.apache.org/thrift.git/lib/go/thrift/ -2019-04-20T11:08:15+08:00 - - /pysqlflow/ 2019-04-20T11:40:14+08:00 diff --git a/docs/sqlflow/doc/quickstart.md b/docs/sqlflow/doc/quickstart.md new file mode 100644 index 0000000..75a00a6 --- /dev/null +++ b/docs/sqlflow/doc/quickstart.md @@ -0,0 +1,93 @@ +# Quick start + +SQLFlow is currently under active development. For those who are interested in trying +it out, we have provided several demos. Play around with it. Any bug report and +issue are welcomed. :) + +## Setup + +1. Install [Docker](https://docs.docker.com/install/). +1. Set up a MySQL server following [example/datasets/README.md](/example/datasets/README.md). +1. Pull the latest SQLFlow Docker image: `docker pull sqlflow/sqlflow:latest`. + +## Demo 1: Jupyter Notebook + +1. Start a Docker container that runs sqlflowserver and Jupyter Notebook. If you are + using Docker for Linux, please change `host.docker.internal:3306` to `localhost:3306`. + + ``` + docker run --rm -it -p 8888:8888 sqlflow/sqlflow:latest \ + bash -c "sqlflowserver --db_user root --db_password root --db_address host.docker.internal:3306 & + SQLFLOW_SERVER=localhost:50051 jupyter notebook --ip=0.0.0.0 --port=8888 --allow-root" + ``` + + If you are using Docker for Mac, please be aware the option `--db_address host.docker.internal:3306` where + `host.docker.internal` translates to the host ip address as recommended [here](https://docs.docker.com/docker-for-mac/networking/). + + If you are running MySQL on remote, please be aware that MySQL only allows connections from localhost + by default. Fix can be found [here](https://stackoverflow.com/questions/14779104/how-to-allow-remote-connection-to-mysql). + +1. Open a Web browser and direct to `localhost:8888` and input the token. Then you +can create notebooks. In a cell, you should be able to type + + ``` + %%sqlflow + select 1 + ``` + +1. Explore more examples at [example.ipynb](/example/jupyter/example.ipynb) + +## Demo 2: Command Line Prompt + +Start a Docker container that runs SQLFlow command line prompt. If you are using +Docker for Linux, please change `host.docker.internal:3306` to `localhost:3306`. + +``` +docker run -it --rm --net=host sqlflow/sqlflow:latest demo \ +--db_user root --db_password root --db_address host.docker.internal:3306 +``` + +You should be able to see the following prompt. 
+ +``` +sqlflow> +``` + +### Example + +- Select data +```sql +sqlflow> select * from iris.train limit 2; +----------------------------- ++--------------+-------------+--------------+-------------+-------+ +| SEPAL LENGTH | SEPAL WIDTH | PETAL LENGTH | PETAL WIDTH | CLASS | ++--------------+-------------+--------------+-------------+-------+ +| 6.4 | 2.8 | 5.6 | 2.2 | 2 | +| 5 | 2.3 | 3.3 | 1 | 1 | ++--------------+-------------+--------------+-------------+-------+ +``` +- Train a Tensorflow [DNNClassifier](https://www.tensorflow.org/api_docs/python/tf/estimator/DNNClassifier) +```sql +sqlflow> SELECT * +FROM iris.train +TRAIN DNNClassifier +WITH n_classes = 3, hidden_units = [10, 20] +COLUMN sepal_length, sepal_width, petal_length, petal_width +LABEL class +INTO sqlflow_models.my_dnn_model; +----------------------------- +... +Training set accuracy: 0.96721 +Done training +``` +- Prediction using a trained model +```sql +sqlflow> SELECT * +FROM iris.test +predict iris.predict.class +USING sqlflow_models.my_dnn_model; +``` +- Checkout prediction result +```sql +sqlflow> select * from iris.predict limit 10; +``` \ No newline at end of file From a0d03d9bdd0f9dfd56c64e2cfbf693c934e19776 Mon Sep 17 00:00:00 2001 From: mxz96102 Date: Mon, 22 Apr 2019 14:00:47 +0800 Subject: [PATCH 2/3] update sqlflow doc --- docs/feed.xml | 2 +- docs/pysqlflow/index.html | 20 ++++++++++---------- docs/sitemap.xml | 6 +++--- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/docs/feed.xml b/docs/feed.xml index 7bb1e90..57f47c7 100644 --- a/docs/feed.xml +++ b/docs/feed.xml @@ -1 +1 @@ -Jekyll2019-04-22T13:40:11+08:00/feed.xmlSQLFLOW \ No newline at end of file +Jekyll2019-04-22T14:00:11+08:00/feed.xmlSQLFLOW \ No newline at end of file diff --git a/docs/pysqlflow/index.html b/docs/pysqlflow/index.html index 0eaa06b..8e286be 100644 --- a/docs/pysqlflow/index.html +++ b/docs/pysqlflow/index.html @@ -1,12 +1,12 @@ - - - - - - -

Redirecting to https://github.com/sql-machine-learning/pysqlflow

- + + + + + + +

Redirecting to https://github.com/sql-machine-learning/pysqlflow

+ \ No newline at end of file diff --git a/docs/sitemap.xml b/docs/sitemap.xml index 704a204..742cee4 100644 --- a/docs/sitemap.xml +++ b/docs/sitemap.xml @@ -74,7 +74,7 @@ /gohive/hiveserver2/ -2019-04-20T11:20:34+08:00 +2019-04-22T13:58:28+08:00 /gohive/ @@ -82,10 +82,10 @@ /gohive/thrift/ -2019-04-20T11:20:34+08:00 +2019-04-22T13:58:28+08:00 /pysqlflow/ -2019-04-20T11:40:14+08:00 +2019-04-22T13:58:28+08:00 From 55210f422c357168dd908d2695a3db6d023f16bc Mon Sep 17 00:00:00 2001 From: mxz96102 Date: Mon, 22 Apr 2019 14:03:08 +0800 Subject: [PATCH 3/3] update sqlflow doc --- docs/assets/js/search-data.json | 96 +-- docs/doc_index/contribute.html | 18 +- docs/doc_index/design.html | 18 +- docs/feed.xml | 2 +- docs/pages/about.html | 14 +- docs/sitemap.xml | 6 + .../calcite-parser/CalciteParserServer.java | 139 ++++ .../calcite-parser/CalciteParserTest.java | 59 ++ docs/sqlflow/calcite-parser/Dockerfile | 74 +++ docs/sqlflow/calcite-parser/README.md | 38 ++ .../calcite-parser/build_and_test.bash | 35 + docs/sqlflow/calcite-parser/index.html | 396 +++++++++++ docs/sqlflow/cmd/demo/index.html | 14 +- docs/sqlflow/doc/alps_submitter.html | 616 ++++++++++++++++++ docs/sqlflow/doc/alps_submitter.md | 237 +++++++ docs/sqlflow/doc/build-tensorflow.html | 14 +- docs/sqlflow/doc/build.html | 14 +- .../doc/close_producer_from_consumer.html | 14 +- .../doc/database_abstraction_layer.html | 14 +- docs/sqlflow/doc/mysql-setup.html | 14 +- docs/sqlflow/doc/pipe.html | 14 +- docs/sqlflow/doc/quickstart.html | 24 +- docs/sqlflow/doc/quickstart.md | 8 +- docs/sqlflow/doc/sql_parser.html | 14 +- docs/sqlflow/doc/submitter.html | 20 +- docs/sqlflow/doc/submitter.md | 8 +- docs/sqlflow/doc/syntax.html | 14 +- docs/sqlflow/doc/walkthrough.html | 14 +- docs/sqlflow/example/creditcard/index.html | 14 +- docs/sqlflow/example/datasets/index.html | 14 +- .../example/fraud_detection/index.html | 14 +- docs/sqlflow/index.html | 14 +- docs/sqlflow/server/index.html | 14 +- docs/sqlflow/sql/index.html | 14 +- docs/sqlflow/sql/python/index.html | 14 +- docs/sqlflow/sqlfs/index.html | 14 +- 36 files changed, 1979 insertions(+), 81 deletions(-) create mode 100644 docs/sqlflow/calcite-parser/CalciteParserServer.java create mode 100644 docs/sqlflow/calcite-parser/CalciteParserTest.java create mode 100644 docs/sqlflow/calcite-parser/Dockerfile create mode 100644 docs/sqlflow/calcite-parser/README.md create mode 100644 docs/sqlflow/calcite-parser/build_and_test.bash create mode 100644 docs/sqlflow/calcite-parser/index.html create mode 100644 docs/sqlflow/doc/alps_submitter.html create mode 100644 docs/sqlflow/doc/alps_submitter.md diff --git a/docs/assets/js/search-data.json b/docs/assets/js/search-data.json index fea32c6..c74f472 100644 --- a/docs/assets/js/search-data.json +++ b/docs/assets/js/search-data.json @@ -29,139 +29,153 @@ }, "4": { "id": "4", + "title": "Proof of Concept: ALPS Submitter", + "content": "Proof of Concept: ALPS Submitter ALPS (Ant Learning and Prediction Suite) provides a common algorithm-driven framework in Ant Financial, focusing on providing users with an efficient and easy-to-use machine learning programming framework and a financial learning machine learning algorithm solution. This module is used to submit ALPS machine learning training tasks in SQLFlow. Precondition For machine learning models, we only consider TensorFlow premade estimator. To simplify the design, we only execute training without evaluation in the estimator. 
If a table cell is encoded, we assume the user always provides enough decoding information such as dense/sparse, shape via expression such as DENSE, SPARSE Data Pipeline Standard Select -> Train Input Table -> Decoding -> Input Fn and TrainSpec The standard select query is executed in SQL Engine like ODPS, SparkSQL, we take the result table as the input of training. If a table cell is encoded, we assume the user always provides enough decoding information such as dense/sparse, shape, delimiter via expression such as DENSE, SPARSE. The decode expression must exist in COLUMN block. Dense Format It is the dense type of encoded data if we have multiple numeric features in a single table cell. For example, we have numeric features such as price, count and frequency, splice into a string using a comma delimiter. In this situation, the DENSE expression should be used to declare the decoding information such as the shape, delimiter. DENSE(table_column, shape, dtype, delimiter) Args: table_column: column name shape(int): shape of dense feature delimiter(string): delimiter of encoded feature Sparse Format It is the sparse type of encoded data if we not only have multiple features in a single table cell but also mapping the feature to sparse format. For example, we have features such as city, gender and interest which has multiple values for each feature. The values of city are beijing, hangzhou, chengdu. The values of gender are male and female. The values of interest are book and movie. Each of these values has been mapped to an integer and associate with some group. `beijing` -> group 1, value 1 `hangzhou` -> group 1, value 2 `chengdu` -> group 1, value 3 `male` -> group 1, value 4 `female` -> group 1, value 5 `book` -> group 2, value 1 `movie` -> group 2, value 2 If we use colon as the group/value delimiter and use comma as the feature delimiter, “3:1, 4:1, 2:2” means “chengdu, male, movie”. SPARSE(table_column, shape, dtype, delimiter, group_delimiter) Args: table_column: column name shape(list): list of embedding shape for each group delimiter(string): delimiter of feature group_delimiter(string): delimiter of value and group Decoding The actual decoding action is not happened in this submitter but in ALPS inside. What we should do here is just generate the configuration file of ALPS. Let’s take an example for training a classifier model for credit card fraud case. Table Data column/row c1 c2 c3 r1 10,2,4 3:1,4:1,2:2 0 r2 500,20,10 2:1,5:1,2:1 1 The column c1 is dense encoded and c2 is sparse encoded, c3 is label column. SQL select c1, c2, c3 as class from kaggle_credit_fraud_training_data TRAIN DNNClassifier WITH ... COLUMN DENSE(c1, shape=3, dtype=float, delimiter=',') SPARSE(c2, shape=[10, 10], dtype=float, delimiter=',', group_delimiter=':') LABEL class ALPS Configuration # graph.conf ... schema = [c1, c2, c3] x = [ {feature_name: c1, type: dense, shape:[3], dtype: float, separator:","}, {feature_name: c2, type: sparse, shape:[10,10], dtype: float, separator:",", group_separator:":"} ] y = {feature_name: c3, type: dense, shape:[1], dtype: int} ... Feature Pipeline Feature Expr -> Semantic Analyze -> Feature Columns Code Generation -> Estimator Feature Expressions In SQLFlow, we use Feature Expressions to represent the feature engineering process and convert it into the code snippet using TensorFlow Feature Column API. 
Feature Expressions NUMERIC(key, dtype, shape) BUCKETIZED(source_column, boundaries) CATEGORICAL_IDENTITY(key, num_buckets, default_value) CATEGORICAL_HASH(key, hash_bucket_size, dtype) CATEGORICAL_VOCABULARY_LIST(key, vocabulary_list, dtype, default_value, num_oov_buckets) CATEGORICAL_VOCABULARY_FILE(key, vocabulary_file, vocabulary_size, num_oov_buckets, default_value, dtype) CROSS(keys, hash_bucket_size, hash_key) The feature expressions must exist in COLUMN block. Here is an example which do BUCKETIZED on c2 then CROSS with c1. select c1, c2, c3 as class from kaggle_credit_fraud_training_data TRAIN DNNClassifier WITH ... COLUMN CROSS([NUMERIC(c1), BUCKETIZED(NUMERIC(c2), [0, 10, 100])]) LABEL class Semantic Analyze Feature Expressions except for Tensorflow Feature Column API should raise an error. /* Not supported */ select * from kaggle_credit_fraud_training_data TRAIN DNNClassifier WITH ... COLUMN NUMERIC(f1 * 10) Feature Columns Code Generation We transform feature columns expression to a code snippet and wrap it as a CustomFCBuilder which extends from alps.feature.FeatureColumnsBuilder. Review the above example, the generated code snippet is this: from alps.feature import FeatureColumnsBuilder class CustomFCBuilder(FeatureColumnsBuilder): def build_feature_columns(): fc1 = tf.feature_column.numeric_column('c1') fc2 = tf.feature_column.numeric_column('c2') fc3 = tf.feature_column.bucketized_column(fc2, boundaries = [0, 10, 100]) fc4 = tf.feature_column.crossed_column([fc2, fc3]) return [fc4] ALPS framework will execute this code snippet and pass the result to the constructor method of the estimator. Parameters We use WITH block to set the parameters of training. If the name is prefixed with estimator, it is the parameter of the constructor method of the Estimator. If the name is prefixed with train_spec, it is the parameter of the constructor method of the TrainSpec. If the name is prefixed with input_fn, it is the parameter of the input_fn. Let’s create a DNNClassifier example, the minimum parameters of the constructor method are hidden_units and feature_columns. select c1, c2, c3 as class from kaggle_credit_fraud_training_data TRAIN DNNClassifier WITH estimator.hidden_units = [10, 20], train_spec.max_steps = 2000, input_fn.batch_size = 512 COLUMN CROSS([NUMERIC(c1), BUCKETIZED(NUMERIC(c2), [0, 10, 100])]) LABEL class ... For now, we will pass the result of snippet code as feature_columns parameters and it will raise an error if the estimator expects it as a different name until AS syntax is supported in SQLFlow. select c1, c2, c3, c4, c5 as class from kaggle_credit_fraud_training_data TRAIN DNNLinearCombinedClassifier WITH linear_feature_columns = [fc1, fc2] dnn_feature_columns = [fc3] ... COLUMN NUMERIC(f1) as fc1, BUCKETIZED(fc1, [0, 10, 100]) as fc2, CROSS([fc1, fc2, f3]) as fc3 LABEL class ...", + "url": "/sqlflow/doc/alps_submitter.html", + "relUrl": "/sqlflow/doc/alps_submitter.html" + }, + "5": { + "id": "5", "title": "Build TensorFlow from Source Code using Docker", "content": "Build TensorFlow from Source Code using Docker To contribute to TensorFlow, we need to build TensorFlow from source code. The official guide is great. However, it interleaves the native building process and that using Docker and makes it confusing because packages needed by the former are not by the latter. Also, we found some useful tricks to start Docker containers in practices that are not in the official guide. Hence this document. 
Build the Pip Package in TensorFlow Development Container On either Mac or Linux, or any other OS, we don’t have to install and configure the building tools; instead, we can use a Docker image where all tools have been installed and properly configured. Get the Docker image containing all the building tools: docker pull tensorflow/tensorflow:latest-devel Then, let’s get the source code. On any OS, please install git using the native package manager. For example, on Ubuntu, please sudo apt-get install git or, on Mac, brew install git Then, use the git just installed, let’s clone tensorflow source code: git clone --recursive https://github.com/tensorflow/tensorflow cd tensorflow By default, we will be on the master branch. Feel free to do you change in your feature branches, or switch to a release branch, for example: git checkout v1.11.0 git checkout -b v1.11.0 Then, let us start a Docker container running the tensorflow/tensorflow:latest-devel image: docker run --rm -it -w /tensorflow -v $PWD:/tensorflow -v $HOME/.cache:/root/.cache -e "HOST_PERMS=$(id -u):$(id -g)" tensorflow/tensorflow:latest-devel /bin/bash -w /tensorflow brings us to the /tensorflow directory in the container once after we start it. -v $PWD:/tensorflow maps the current directory, which is the just cloned TensorFlow source directory on the host, to /tensorflow in the container. -v $HOME/.cache:/root/.cache maps the Bazel temporary directory on the host into the container, so the intermediate files generated by Bazel running in the container are actually saved on the host. This allows us to interrupt the container during the build and restart it later to resume the building. e "HOST_PERMS=$(id -u):$(id -g)" passes the user identification on the host into the container as an environment variable. We can reset the mode of files generated in the container to this user identity. From now on, we will be working in the container. Let us first configure the building: ./configure Usually, I would simply choose all the default options by hitting enter all the way down. Build the pip package builder: bazel build --config=opt //tensorflow/tools/pip_package:build_pip_package Build the pip package and save it into /tensorflow: ./bazel-bin/tensorflow/tools/pip_package/build_pip_package /tensorflow Change the generated wheel file into the usual mode as on the host: chown $HOST_PERMS /mnt/tensorflow-*.whl Install the Pip Pacakge in TensorFlow Development Container Let’s now try the new pip package. First, we need to uninstall the current tensorflow pip package, then we install the newly built one: pip uninstall tensorflow pip install /tensorflow/tensorflow-*.whl Now, we can verify if the new package works. First, we need to switch to a directory out from /tensorflow, so we don’t import from the source directory: cd /tmp # other directories also work then, we can try import the newly installed tensorflow package and verify it: python >>> import tensorflow >>> print(tensorflow.__version__) Now, let us quit from Python and from the Docker container. We should see the tensorflow-*.whl file on the host in the current directory. Install the Pip Package in a Clean Python Package After we quit from the development container, we should see the wheel file in the TensorFlow source directory on the host. Now, we can start a Python container and install the Pip package in it. 
Start the Python container docker run --rm -it -v $PWD:/tensorflow python:2.7 bash Install the pip pacakge in the container pip install /tensorflow/tensorflow*.whl Try TensorFlow by starting Python in the container python >>> import tensorflow as tf >>> print(tf.__version__)", "url": "/sqlflow/doc/build-tensorflow.html", "relUrl": "/sqlflow/doc/build-tensorflow.html" }, - "5": { - "id": "5", + "6": { + "id": "6", "title": "Canonical Development Environment", "content": "Canonical Development Environment Referring to this example, we create a canonical development environment for Go and Python programmers using Docker. Editing on Host When we use this Docker image for daily development work, the source code relies on the host computer instead of the container. The source code includes this repo and all its dependencies, for example, the Go package google.golang.org/grpc. Code-on-the-host allows us to run our favorite editors (Emacs, VIM, Eclipse, and more) on the host. Please free to rely on editors add-ons to analyze the source code for auto-completion. Building in Container We build a Docker image that contains development tools: The Python interpreter The Go compiler The protobuf compiler The protobuf to Go compiler extension The protobuf to Python compiler extension Because this repo contains Go code, please make sure that you have the directory structure required by Go. On my laptop computer, I have export GOPATH=$HOME/go You could have your $GOPATH pointing to any directory you like. Given $GOPATH$ set, we could git clone the source code of our project by running: go get github.com/sql-machine-learning/sqlflow Change the directory to our project root, and we can use go get to retrieve and update Go dependencies. cd $GOPATH/src/github.com/sql-machine-learning/sqlflow go get -u -t ./... Note -t instructs get to also download the packages required to build the tests for the specified packages. As all Git users would do, we run git pull from time to time to sync up with others’ work. If somebody added new dependencies, we might need to run go -u ./... after git pull to update dependencies. To build this project, we need the protobuf compiler, Go compiler, Python interpreter, gRPC extension to the protobuf compiler. To ease the installation and configuration of these tools, we provided a Dockerfile to install them into a Docker image. To build the Docker image: docker build -t sqlflow:dev -f Dockerfile.dev . Development Build and Test We build and test the project inside the docker container. To run the container, we need to map the $GOPATH directory on the host into the /go directory in the container, because the Dockerfile configures /go as the $GOPATH in the container: docker run --rm -it -v $GOPATH:/go -w /go/src/github.com/sql-machine-learning/sqlflow sqlflow:dev bash Inside the Docker container, start a MySQL server in the background service mysql start& run all the tests as go generate ./... go install ./... go test -v ./... where go generate invokes the protoc command to translate server/sqlflow.proto into server/sqlflow.pb.go and go test -v builds and run unit tests. Release The above build process currently generates two binary files in $GOPATH/bin on the host. 
To package them into a Docker image, please run docker build -t sqlflow -f ./Dockerfile $GOPATH/bin To publish the released Docker image to our official DockerHub docker tag sqlflow sqlflow/sqlflow:latest docker push sqlflow/sqlflow:latest Demo: Command line Prompt The demo requires a MySQL server instance with populated data. If we don’t, we could follow example/datasets/README.md to start one on the host. After setting up MySQL, run the following inside the Docker container go run cmd/demo/demo.go --db_user root --db_password root --db_address host.docker.internal:3306 You should be able to see the following prompt sqlflow>", "url": "/sqlflow/doc/build.html", "relUrl": "/sqlflow/doc/build.html" }, - "6": { - "id": "6", + "7": { + "id": "7", "title": "Closing the producer goroutine from the consumer", "content": "Closing the producer goroutine from the consumer The producer-and-consumer pattern is well used in Go concurrent programming. When the consumer stops, we want to gracefully stop the producer as well. Problem When a gRPC server receives a streaming request, it usually calls a function that returns a channel, reads the result from that channel and send the result to the client one by one. Take the following code for instance: upon receiving a request, the main goroutine Service calls launchJob. launchJob starts a separate goroutine as an anonymous function call and returns a channel. In the anonymous function, items will be sent to channel. And Service on the otherside of the channel will reads from it. func Service(req *Request, stream *StreamResponse) error { result := launchJob(req.Content) for r := range result { if e := stream.Send(result); e != nil { // should we signal the running goroutine so it will stop sending? return e } } } func launchJob(content string) chan Item { c := make(chan Item) go func() { defer close(c) acquireScarceResources() defer releaseScarceResources() ... // if stream.Send(result) returns an error and the Service returns, this will be blocked c <- Item{} ... }() return c } There is a major problem in this implementation. As pointed out by the comment, if the Send in Service returns an error, the Service function will return, leaving the anonymous function being blocked on c <- Item{} forever. This problem is important because the leaking goroutine usually owns scarce system resources such as network connection and memory. Solution: pipeline explicit cancellation Inspired by this blog post section Explicit cancellation, we can signal the cancellation via closing on a separate channel. And we can follow the terminology as io.Pipe. package sql import ( "errors" ) var ErrClosedPipe = errors.New("pipe: write on closed pipe") // pipe follows the design at https://blog.golang.org/pipelines // - wrCh: chan for piping data // - done: chan for signaling Close from Reader to Writer type pipe struct { wrCh chan interface{} done chan struct{} } // PipeReader reads real data type PipeReader struct { p *pipe } // PipeWriter writes real data type PipeWriter struct { p *pipe } // Pipe creates a synchronous in-memory pipe. // // It is safe to call Read and Write in parallel with each other or with Close. // Parallel calls to Read and parallel calls to Write are also safe: // the individual calls will be gated sequentially. 
func Pipe() (*PipeReader, *PipeWriter) { p := &pipe{ wrCh: make(chan interface{}), done: make(chan struct{})} return &PipeReader{p}, &PipeWriter{p} } // Close closes the reader; subsequent writes to the func (r *PipeReader) Close() { close(r.p.done) } // ReadAll returns the data chan. The caller should // use it as `for r := range pr.ReadAll()` func (r *PipeReader) ReadAll() chan interface{} { return r.p.wrCh } // Close closes the writer; subsequent ReadAll from the // read half of the pipe will return a closed channel. func (w *PipeWriter) Close() { close(w.p.wrCh) } // Write writes the item to the underlying data stream. // It returns ErrClosedPipe when the data stream is closed. func (w *PipeWriter) Write(item interface{}) error { select { case w.p.wrCh <- item: return nil case <-w.p.done: return ErrClosedPipe } } And the consumer and producer be can implemented as func Service(req *Request, stream *StreamResponse) error { pr := launchJob(req.Content) defer pr.Close() for r := range pr.ReadAll() { if e := stream.Send(r); e != nil { return e } } } func launchJob(content string) PipeReader { pr, pw := Pipe() go func() { defer pw.Close() if err := pw.Write(Item{}); err != nil { return } } return pr } Further Reading Google Form: Channel send timeout Go by Example: Timeouts Google I/O 2013 - Advanced Go Concurrency Patterns Go Concurrency Patterns Talk Go Concurrency Patterns: Pipelines and cancellation", "url": "/sqlflow/doc/close_producer_from_consumer.html", "relUrl": "/sqlflow/doc/close_producer_from_consumer.html" }, - "7": { - "id": "7", + "8": { + "id": "8", "title": "Compatibility with Various SQL Engines", "content": "Compatibility with Various SQL Engines SQLFlow interacts with SQL engines like MySQL and Hive, while different SQL engines use variants of SQL syntax, it is important for SQLFlow to have an abstraction layer that hides such differences. SQLFlow calls Go’s standard database API. The submitter programs generated by SQLFlow call Python’s database API. Both APIs abstract the interface to various SQL engines; however, they are insufficient for SQLFlow to work. In this document, we examine all interactions between SQLFlow and the SQL engine so to identify what SQLFlow authors have to abstract in addition to calling Go’s and Python’s database APIs. Data Operations in Go Data Retrieval The basic idea of SQLFlow is to extend the SELECT statement of SQL to have the TRAIN and PREDICT clauses. For more discussion, please refer to the syntax design. SQLFlow translates such “extended SQL statements” into submitter programs, which forward the part from SELECT to TRAIN or PREDICT, which we call the “standard part”, to the SQL engine. SQLFlow also accepts the SELECT statement without TRAIN or PREDICt clauses and would forward such “standard statements” to the engine. It is noticeable that the “standard part” or “standard statements” are not standardized. For example, various engines use different syntax for joining. MySQL: SELECT pet.name, comment FROM pet, event WHERE pet.name =event.name; with keyword WHERE . Hive: SELECT pet.name, comment FROM pet JOIN event ON (pet.name =event.name) with keyword JOIN and ON. ODPS and SQLite use either INNER JOIN or OUTER JOIN. Fortunately, as SQLFlow forwards the above parts to the engine, it doesn’t have to care much about the differences above. Metadata Retrieval To verify the semantics of users’ inputs, SQLFlow needs to retrieve the schema of tables. 
For example, the input might be SELECT name, age, income FROM employee TRAIN DNNRegressor WITH hidden_layers=[10,50,10] COLUMN name, agee LABEL income; In the above example, the user misspelled the field name age in the COLUMN clause as “agee”. SQLFlow must be able to find that out. To do that, SQLFlow needs to query the field names from the SQL engine. However, different engines use various syntax. For example: MySQL: DESCRIBE/DESC employee; Hive: DESCRIBE FORMATTED employee; ODPS: DESC employee; SQLite: PRAGMA table_info([employee]); The returned data format varies too. Our solution to avoid such differences is not-to-use-them; instead, SQLFlow retrieves the table schema by running a query like SELECT * FROM employee LIMIT 1; and inferring field types using the mechanism called DatabaseTypeName provided by SQL engines drivers beneath the Go’s standard database API. Prepare Prediction Table A SQLFlow prediction job writes its prediction results into a table. It prepares the prediction table by Dropping previous prediction table DROP TABLE IF EXISTS my_table; Creating table with schema CREATE TABLE my_table (name1, type1, name2 type2); Most SQL engines, including MySQL, Hive, ODPS, SQLite, support both statements. Translate Database Column Type to TensorFlow Feature Column Type After retrieving database column type name through DatabaseTypeName, we can derive TensorFlow’s feature column type via a mapping such as {"FLOAT", "DOUBLE"} -> tf.numeric_column. Save Model SQLFlow saves trained ML model by dumping the serialized the model directory into a table. It first creates a table by CREATE TABLE IF NOT EXISTS %s (id INT AUTO_INCREMENT, block BLOB, PRIMARY KEY (id)) and insert blobs by INSERT INTO %s (block) VALUES(?). Note that Hive and ODPS doesn’t have BLOB type, we need to use BINARY (docs at ODPS, Hive) instead. Also, note that Hive and ODPS doesn’t support AUTO_INCREMENT, we need to implemented auto increment logic in sqlfs. Load Model SQLFlow loads trained ML model by reading rows in a table and deserializing the blob to a model directory. It reads rows by running SELECT block FROM %s ORDER BY id, which is supported by most databases. Data Operations in Python Connect to SQL Engines Thanks to the Python database API, connecting to different databases follows a similar API. conn = mysql.connector.connect(user='scott', password='password', host='127.0.0.1', database='employees') conn = sqlite3.connect('path/to/your/sqlite/file') conn = pyhive.connect('localhost') cursor = conn.cursor() cursor.execute('select * from my_table;') Insert Prediction Result into Prediction Table Python database API provides execute_many(sql, value) to insert multiple values at once. So one can prepare the following insertion statement. Please be aware that MySQL and SQLite use INSERT INTO to insert rows while Hive and ODPS use INSERT INTO TABLE. -- MySQL, SQLite INSERT INTO table_name VALUES (value1, value2, value3, ...); -- Hive, ODPS INSERT INTO TABLE table_name VALUES (value1, value2, value3, ...);", "url": "/sqlflow/doc/database_abstraction_layer.html", "relUrl": "/sqlflow/doc/database_abstraction_layer.html" }, - "8": { - "id": "8", + "9": { + "id": "9", "title": "Run MySQL Server and Client in Docker Containers", "content": "Run MySQL Server and Client in Docker Containers The document explains how to setup MySQL in our development environment. 
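As a quick sanity check against this development MySQL server, the following Go sketch applies the `SELECT ... LIMIT 1` schema-retrieval trick described in the previous section and prints the `DatabaseTypeName` that the driver reports for each column. The DSN and the `iris.train` demo table are placeholders; adjust them to your setup.

```go
package main

import (
	"database/sql"
	"fmt"
	"log"

	_ "github.com/go-sql-driver/mysql" // register the MySQL driver
)

func main() {
	// Placeholder DSN; matches the root/root demo server used in these docs.
	db, err := sql.Open("mysql", "root:root@tcp(127.0.0.1:3306)/iris")
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	// Fetch a single row; we only care about the column metadata, not the data.
	rows, err := db.Query("SELECT * FROM train LIMIT 1")
	if err != nil {
		log.Fatal(err)
	}
	defer rows.Close()

	cols, err := rows.ColumnTypes()
	if err != nil {
		log.Fatal(err)
	}
	for _, c := range cols {
		// Prints e.g. "sepal_length FLOAT", which can then be mapped to a
		// TensorFlow feature column type.
		fmt.Println(c.Name(), c.DatabaseTypeName())
	}
}
```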
Run MySQL Server in a Docker Container docker run --rm -v /tmp/test1:/var/lib/mysql --name mysql01 -e MYSQL_ROOT_PASSWORD=root -e MYSQL_ROOT_HOST='%' -p 3306:3306 -d mysql/mysql-server:8.0 the -v option ensures that the database is saved on the host. The default directory where MySQL saves the database is /var/lib/mysql. This directory can be configured in /etc/mysql/my.cnf, as explained in this post. By overlaying the directory /tmp/test1 on the host to /var/lib/mysql, we “cheat” MySQL to save databases on the host. So, we can kill the container and restart it, and the database is still there. Please be aware that the directory on the host must be empty the first time we run the above command; otherwise, MySQL would fail to initialize. I figured out this problem after several failures using docker logs. the -e option sets the root password of MySQL to “root”. Feel free to set it to any password you like. the second -e options sets MYSQL_ROOT_HOST to a wildcard so to allow clients connecting to the server via TCP/IP as the user “root”. This trick works with MySQL 5.7 and 8.0, but not the most recent under-development version. the --name option names the container to mysql01, which can be used to refer to this container. the -p option maps the port 3306, on which the MySQL server listens, to the same port on the host, so that clients could connect to the server via TCP/IP. Run MySQL Client in the Server Container docker exec -it mysql01 mysql -uroot -p This command executes the command mysql, which is the command line tool of MySQL, in the container named mysql01. The command line flags of mysql include -u, which specifies the username of MySQL, and -p, which makes MySQL prompts for the password. For this example, we should type the password “root”, which was set in the previous command. Please wait for a few seconds after the starting of the MySQL server container before we execute the client; otherwise, the startup of the client might fail. Once we get into the MySQL client, we can type SQL commands, e.g., show databases; create database yi; Run Client in a Different Container on the Same Host docker run --rm -it -v /tmp/test1:/var/lib/mysql mysql/mysql-server:8.0 mysql -uroot -p The -v option maps the database directory on the host to the client container. This mapping is necessary because, by default, the client talks to the server via Unix socket /var/lib/mysql/mysql.sock, which is /tmp/test1/mysql.sock on the host. Run Client in a Container on a Remote Host docker run --rm -it mysql/mysql-server:8.0 mysql -h 192.168.1.3 -P 3306 -uroot -p the -h option tells the client where the server is running on. In this example, the given IP is the one of the host where I ran the MySQL server container. Please be aware that the above command works only if the server allows remote connections. Run Python Client in a Container To write a Python client, we need to install the Python package mysql-connector-python. FROM python:2.7 RUN pip install mysql-connector-python Please be aware that some documents says that we need to install mysql-connector. I tried; but the mysql.connector.connect call failed with the error mysql.connector.errors.NotSupportedError: Authentication plugin 'caching_sha2_password' is not supported. Build the Docker image: docker build -t sqlflow . 
Run the image: docker run --rm -it sqlflow bash and we can start Python and run the following Python code snippet >>> import mysql.connector >>> db = mysql.connector.connect(user='root', passwd='root', host='192.168.1.3') >>> print(db) <mysql.connector.connection_cext.CMySQLConnection object at 0x7fbab9f3fed0> Run a Go Client In order to connect to a database, you need to import the database’s driver first. export GOPATH=$HOME/go go get -u github.com/go-sql-driver/mysql go run the following file package main import ( "database/sql" "github.com/go-sql-driver/mysql" "log" ) func main() { testConfig := &mysql.Config{ User: "root", Passwd: "root", Net: "tcp", Addr: "localhost:3306", } db, e := sql.Open("mysql", testConfig.FormatDSN()) if e != nil { log.Fatal(e) } defer db.Close() }", "url": "/sqlflow/doc/mysql-setup.html", "relUrl": "/sqlflow/doc/mysql-setup.html" }, - "9": { - "id": "9", + "10": { + "id": "10", "title": "Piping Responses", "content": "Piping Responses Streaming Responses As described in the overall design, a SQLFlow job could be a standard or an extended SQL statemnt, where an extended SQL statement will be translated into a Python program. Therefore, each job might generate up to the following data streams: standard output, where each element is a line of text, standard error, where each element is a line of text, data rows, where the first element consists of fields name/types, and each of the rest is a row of data, status, where the element could be pending, failed, and succeeded. To create good user experience, we need to pipe these responses from SQLFlow jobs to Jupyter Notebook in real-time. Stages in the Pipe The pipe that streams outputs from SQLFlow jobs to the Jupyter Notebook consists of the following stages: Web browser ↑ | HTTP ↓ Jupyter Notebook server ↑ | ZeroMQ streams: Shell, IOPub, stdin, Controls, Heartbeat ↓ iPython kernel ↑ | IPython magic command framework ↓ SQLFlow magic command for Jupyter ↑ | gRPC ↓ SQLFlow server ↑ | Go channels ↓ SQLFlow job manager (Go functions) ↑ | IPC with Go's standard library ↓ SQLFlow jobs In the above figure, from the SQLFlow magic command to the bottom layer are our work. Streaming We have two alternative ideas: multiple streams and a multiplexing stream. We decided to use a multiplexing stream because we had a unsuccessful trial with the multiple streams idea: we make the job writes to various Go channels and forward each Go channel to a streaming gRPC call, as the following: Multiple streams The above figure shows that there are multiple streams between the Jupyter Notebook server and Jupyter kernels. According to the document, there are five: Shell, IOPub, stdin, Control, and Heartbeat. These streams are ZeroMQ streams. We don’t use ZeroMQ, but we can take the idea of having multiple parallel streams in the pipe. service SQLFlow { rpc File(string sql) returns (int id) {} rpc ReadStdout(int id) returns (stream string) {} rpc ReadStderr(int id) returns (stream string) {} rpc ReadData(int id) returns (stream Row) {} rpc ReadStatus(int id) returns (stream int) {} } However, we realized that if the user doesn’t call any one of the SQLFlow.Read... call, there would be no forwarding from the Go channel to Jupyter, thus the job would block forever at writing. A Multiplexing Stream Another idea is multiplexing all streams into one. For example, we can have only one ZeroMQ stream, where each element is a polymorphic type – could be a text string or a data row. 
service SQLFlow { rpc Run(string sql) returns (stream Response) {} } // Only one of the following fields should be set. message Response { oneof record { repeated string head = 1; // Column names. repeated google.protobuf.Any row = 2; // Cells in a row. string log = 3; // A line from stderr or stdout. } }", "url": "/sqlflow/doc/pipe.html", "relUrl": "/sqlflow/doc/pipe.html" }, - "10": { - "id": "10", - "title": "Quick start", - "content": "Quick start SQLFlow is currently under active development. For those who are interested in trying it out, we have provided several demos. Play around with it. Any bug report and issue are welcomed. :) Setup Install Docker. Set up a MySQL server following example/datasets/README.md. Pull the latest SQLFlow Docker image: docker pull sqlflow/sqlflow:latest. Demo 1: Jupyter Notebook Start a Docker container that runs sqlflowserver and Jupyter Notebook. If you are using Docker for Linux, please change host.docker.internal:3306 to localhost:3306. docker run --rm -it -p 8888:8888 sqlflow/sqlflow:latest bash -c "sqlflowserver --db_user root --db_password root --db_address host.docker.internal:3306 & SQLFLOW_SERVER=localhost:50051 jupyter notebook --ip=0.0.0.0 --port=8888 --allow-root" If you are using Docker for Mac, please be aware the option --db_address host.docker.internal:3306 where host.docker.internal translates to the host ip address as recommended here. If you are running MySQL on remote, please be aware that MySQL only allows connections from localhost by default. Fix can be found here. Open a Web browser and direct to localhost:8888 and input the token. Then you can create notebooks. In a cell, you should be able to type %%sqlflow select 1 Explore more examples at example.ipynb Demo 2: Command Line Prompt Start a Docker container that runs SQLFlow command line prompt. If you are using Docker for Linux, please change host.docker.internal:3306 to localhost:3306. docker run -it --rm --net=host sqlflow/sqlflow:latest demo --db_user root --db_password root --db_address host.docker.internal:3306 You should be able to see the following prompt. sqlflow> Example Select data sqlflow> select * from iris.train limit 2; -- +--+-+--+-+-+ | SEPAL LENGTH | SEPAL WIDTH | PETAL LENGTH | PETAL WIDTH | CLASS | +--+-+--+-+-+ | 6.4 | 2.8 | 5.6 | 2.2 | 2 | | 5 | 2.3 | 3.3 | 1 | 1 | +--+-+--+-+-+ Train a Tensorflow DNNClassifier sqlflow> SELECT * FROM iris.train TRAIN DNNClassifier WITH n_classes = 3, hidden_units = [10, 20] COLUMN sepal_length, sepal_width, petal_length, petal_width LABEL class INTO sqlflow_models.my_dnn_model; -- ... Training set accuracy: 0.96721 Done training Prediction using a trained model sqlflow> SELECT * FROM iris.test predict iris.predict.class USING sqlflow_models.my_dnn_model; Checkout prediction result sqlflow> select * from iris.predict limit 10;", + "11": { + "id": "11", + "title": "Quick Start", + "content": "Quick Start SQLFlow is currently under active development. For those who are interested in trying it out, we have provided several demos. Play around with it. Any bug report and issue are welcomed. :) Environment Setup Install Docker Community Edition. Set up a containerized MySQL server (8.0) following example/datasets/README.md. Note that there is no need to install MySQL locally. Pull the latest SQLFlow Docker image: docker pull sqlflow/sqlflow:latest. Demo 1: Jupyter Notebook Start a Docker container that runs sqlflowserver and Jupyter Notebook. 
If you are using Docker for Linux, please change host.docker.internal:3306 to localhost:3306. docker run --rm -it -p 8888:8888 sqlflow/sqlflow:latest bash -c "sqlflowserver --db_user root --db_password root --db_address host.docker.internal:3306 & SQLFLOW_SERVER=localhost:50051 jupyter notebook --ip=0.0.0.0 --port=8888 --allow-root" If you are using Docker for Mac, please be aware the option --db_address host.docker.internal:3306 where host.docker.internal translates to the host ip address as recommended here. If you are running MySQL on remote, please be aware that MySQL only allows connections from localhost by default. Fix can be found here. Open a Web browser and direct to localhost:8888 and input the token. Then you can create notebooks. In a cell, you should be able to type %%sqlflow select 1 Explore more examples at example.ipynb Demo 2: Command Line Prompt Start a Docker container that runs SQLFlow command line prompt. If you are using Docker for Linux, please change host.docker.internal:3306 to localhost:3306. docker run -it --rm --net=host sqlflow/sqlflow:latest demo --db_user root --db_password root --db_address host.docker.internal:3306 You should be able to see the following prompt. sqlflow> Example Select data sqlflow> select * from iris.train limit 2; -- +--+-+--+-+-+ | SEPAL LENGTH | SEPAL WIDTH | PETAL LENGTH | PETAL WIDTH | CLASS | +--+-+--+-+-+ | 6.4 | 2.8 | 5.6 | 2.2 | 2 | | 5 | 2.3 | 3.3 | 1 | 1 | +--+-+--+-+-+ Train a Tensorflow DNNClassifier sqlflow> SELECT * FROM iris.train TRAIN DNNClassifier WITH n_classes = 3, hidden_units = [10, 20] COLUMN sepal_length, sepal_width, petal_length, petal_width LABEL class INTO sqlflow_models.my_dnn_model; -- ... Training set accuracy: 0.96721 Done training Prediction using a trained model sqlflow> SELECT * FROM iris.test predict iris.predict.class USING sqlflow_models.my_dnn_model; Checkout prediction result sqlflow> select * from iris.predict limit 10;", "url": "/sqlflow/doc/quickstart.html", "relUrl": "/sqlflow/doc/quickstart.html" }, - "11": { - "id": "11", + "12": { + "id": "12", "title": "Extended SQL Parser Design", "content": "Extended SQL Parser Design This documentation explains the technical decision made in building a SQL parser in Go. It is used to parsed the extended SELECT syntax of SQL that integrates TensorFlow Estimators. Related Work Lexer and Parser Generator In 2001, when I was in graduate school, I defined an extended SQL syntax for querying distributed relational databases, as part of the course project of Distributed RDBMS by Prof. Li-zhu Zhou. I wrote the parser using bison (a modern version of yacc) and flex (a modern version of lex). yacc and lex generate C code; bison and flex generate C++ code. However, this time, I’d use Go. I surveyed goyacc, a standard Go tool. The usage is very similar to that of yacc and bison. However, the Go toolchain doesn’t provide a tool like lex/flex. Google revealed golex, which is out of maintenance. The Medium post recommends Ragel, which is a C++ program and could generate Go lexer; however, it lacks documents. Handwritten Lexer and Parser Some documents, including this one recommends handwriting lexers. However, it doesn’t explain how to write the parser. GoAcademy always provides high-quality tech blog posts. This one is from the author of InfluxDB. 
However, I stopped at where it explains wrapping a SQL statement as a string by an io.Reader, because it is obvious that we should keep the string as a string so that that token strings could refer to the same memory storage of the SQL statement. Following a link in the above GoAcademy post, I found Rob Pike’s excellent talk on how to write a lexer in Go in 2011. Many works after that change Rob’s implementation somehow but always lead to longer and less comprehensible codebases. The Choice Therefore, I wrote the lexer and parser both following Rob Pike’s idea. After few days work, I realized that: I should borrow the idea from Rob to represent SQL statements as strings, but not io.Reader as other work do, but no need to use channels and goroutines at all, and it is technically intractable to write a SQL lexer/parser manually. So, I switched to write a lexer manually, and to generate the parser using goyacc. During my work, I referred to this example and the official yacc manual for details about operator association and precedence.", "url": "/sqlflow/doc/sql_parser.html", "relUrl": "/sqlflow/doc/sql_parser.html" }, - "12": { - "id": "12", + "13": { + "id": "13", "title": "Submitter", - "content": "Submitter A submitter is a pluggable module in SQLFlow that is used to submit an ML job to a third party computation service. Workflow When a user types in an extended SQL statement, SQLFlow first parses and semantically verifies the statement. Then SQLFlow either runs the ML job locally or submits the ML job to a third party computation service. In the latter case, SQLFlow produces a job description (TrainDescription or PredictDescription) and hands it over to the submitter. For a training SQL, SQLFlow produces TrainDescription; for prediction SQL, SQLFlow produces PredDescription. The concrete definition of the description looks like the following {% raw %} type ColumnType struct { Name string // e.g. sepal_length DatabaseTypeName string // e.g. FLOAT } // SELECT * // FROM iris.train // TRAIN DNNClassifier // WITH // n_classes = 3, // hidden_units = [10, 20] // COLUMN sepal_length, sepal_width, petal_length, petal_width // LABEL class // INTO sqlflow_models.my_dnn_model; type TrainDescription struct { StandardSelect string // e.g. SELECT * FROM iris.train Estimator string // e.g. DNNClassifier Attrs map[string]string // e.g. {{"n_classes", "3"}, {"hidden_units", "[10, 20]"}} X []ColumnType // e.g. {{"sepal_length", "FLOAT"}, ...} Y ColumnType // e.g. {"class", "INT"} ModelName string // e.g. my_dnn_model } // SELECT * // FROM iris.test // PREDICT iris.predict.class // USING sqlflow_models.my_dnn_model; type PredDescription struct { StandardSelect string // e.g. SELECT * FROM iris.test TableName string // e.g. iris.predict ModelName string // e.g. my_dnn_model } {% endraw %} Submitter Interface The submitter interface should provide two functions Train and Predict. The detailed definition can be the following type Submitter interface { // Train executes a ML training job and streams job's response through writer. 
// A typical Train function should include // - Loading the training data // - Initializing the model // - model.train // - Saving the trained model to a persistent storage Train(desc TrainDescription, writer PipeWriter) error // Predict executes a ML predicting job and streams job's response through writer // A typical Predict function should include // - Loading the model from a persistent storage // - Loading the prediction data // - model.predict // - Writing the prediction result to a table Predict(desc PredictDescription, writer PipeWriter) error } Register a submitter A new submitter can be added as import ( ".../my_submitter" ".../sqlflow/sql" ) func main() { // ... sql.Register(my_submitter.NewSubmitter()) // ... for { sql := recv() sql.Run(sql) } } where sql.Register will put my_submitter instance to package level registry. During sql.Run, it will check whether there is a submitter registered. If there is, sql.Run will run either submitter.Train or submitter.Predict.", + "content": "Submitter A submitter is a pluggable module in SQLFlow that is used to submit an ML job to a third party computation service. Workflow When a user types in an extended SQL statement, SQLFlow first parses and semantically verifies the statement. Then SQLFlow either runs the ML job locally or submits the ML job to a third party computation service. In the latter case, SQLFlow produces a job description (TrainDescription or PredictDescription) and hands it over to the submitter. For a training SQL, SQLFlow produces TrainDescription; for prediction SQL, SQLFlow produces PredDescription. The concrete definition of the description looks like the following {% raw %} type ColumnType struct { Name string // e.g. sepal_length DatabaseTypeName string // e.g. FLOAT } // SELECT * // FROM iris.train // TRAIN DNNClassifier // WITH // n_classes = 3, // hidden_units = [10, 20] // COLUMN sepal_length, sepal_width, petal_length, petal_width // LABEL class // INTO sqlflow_models.my_dnn_model; type TrainDescription struct { StandardSelect string // e.g. SELECT * FROM iris.train Estimator string // e.g. DNNClassifier Attrs map[string]string // e.g. "n_classes": "3", "hidden_units": "[10, 20]" X []ColumnType // e.g. "sepal_length": "FLOAT", ... Y ColumnType // e.g. "class": "INT" ModelName string // e.g. my_dnn_model } // SELECT * // FROM iris.test // PREDICT iris.predict.class // USING sqlflow_models.my_dnn_model; type PredDescription struct { StandardSelect string // e.g. SELECT * FROM iris.test TableName string // e.g. iris.predict ModelName string // e.g. my_dnn_model } {% endraw %} Submitter Interface The submitter interface should provide two functions Train and Predict. The detailed definition can be the following type Submitter interface { // Train executes a ML training job and streams job's response through writer. // A typical Train function should include // - Loading the training data // - Initializing the model // - model.train // - Saving the trained model to a persistent storage Train(desc TrainDescription, writer PipeWriter) error // Predict executes a ML predicting job and streams job's response through writer // A typical Predict function should include // - Loading the model from a persistent storage // - Loading the prediction data // - model.predict // - Writing the prediction result to a table Predict(desc PredictDescription, writer PipeWriter) error } Register a submitter A new submitter can be added as import ( ".../my_submitter" ".../sqlflow/sql" ) func main() { // ... 
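// Register the submitter once at startup; sql.Run then dispatches every
// extended statement to its Train or Predict method.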
sql.Register(my_submitter.NewSubmitter()) // ... for { sql := recv() sql.Run(sql) } } where sql.Register will put my_submitter instance to package level registry. During sql.Run, it will check whether there is a submitter registered. If there is, sql.Run will run either submitter.Train or submitter.Predict.", "url": "/sqlflow/doc/submitter.html", "relUrl": "/sqlflow/doc/submitter.html" }, - "13": { - "id": "13", + "14": { + "id": "14", "title": "SQLFlow: Design Doc", "content": "SQLFlow: Design Doc What is SQLFlow SQLFlow is a bridge that connects a SQL engine, for example, MySQL, Hive, SparkSQL, Oracle, or SQL Server, and TensorFlow and other machine learning toolkits. SQLFlow extends the SQL syntax to enable model training and inference. Related Work We could write simple machine learning prediction (or scoring) algorithms in SQL using operators like DOT_PRODUCT. However, this requires copy-n-pasting model parameters from the training program into SQL statements. Some proprietary SQL engines provide extensions to support machine learning. Microsoft SQL Server Microsoft SQL Server has the machine learning service that runs machine learning programs in R or Python as an external script: CREATE PROCEDURE generate_linear_model AS BEGIN EXEC sp_execute_external_script @language = N'R' , @script = N'lrmodel <- rxLinMod(formula = distance ~ speed, data = CarsData); trained_model <- data.frame(payload = as.raw(serialize(lrmodel, connection=NULL)));' , @input_data_1 = N'SELECT [speed], [distance] FROM CarSpeed' , @input_data_1_name = N'CarsData' , @output_data_1_name = N'trained_model' WITH RESULT SETS ((model varbinary(max))); END; A challenge to the users is that they need to know not only SQL but also R or Python, and they must be capable of writing machine learning programs in R or Python. Teradata SQL for DL Teradata also provides a RESTful service, which is callable from the extended SQL SELECT syntax. SELECT * FROM deep_learning_scorer( ON (SELECT * FROM cc_data LIMIT 100) URL('http://localhost:8000/api/v1/request') ModelName('cc') ModelVersion('1') RequestType('predict') columns('v1', 'v2', ..., 'amount') ) The above syntax couples the deployment of the service (the URL in the above SQL statement) with the algorithm. Google BigQuery Google BigQuery enables machine learning in SQL by introducing the CREATE MODEL statement. CREATE MODEL dataset.model_name OPTIONS(model_type='linear_reg', input_label_cols=['input_label']) AS SELECT * FROM input_table; Currently, BigQuery only supports two simple models: linear regression and logistic regression. Design Goal None of the above meets our requirement. First of all, we want to build an open source software. Also, we want it to be extensible: We want it extensible to many SQL engines, instead of targeting any one of them. Therefore, we don’t want to build our syntax extension on top of user-defined functions (UDFs); otherwise, we’d have to implement them for each SQL engine. We want the system extensible to support sophisticated machine learning models and toolkits, including TensorFlow for deep learning and xgboost for trees. Another challenge is that we want SQLFlow to be flexible enough to configure and run cutting-edge algorithms, including specifying feature crosses. At the same time, we want SQLFlow easy to learn – at least, no Python or R code embedded in the SQL statements, and integrate hyperparameter estimation. We understand that a key to address the above challenges is the syntax of the SQL extension. 
To craft a highly-effective and easy-to-learn syntax, we need user feedback and fast iteration. Therefore, we’d start from a prototype that supports only MySQL and TensorFlow. We plan to support more SQL engines and machine learning toolkits later. Design Decisions As the beginning of the iteration, we propose an extension to the SQL SELECT statement. We are not going a new statement way like that BigQuery provides CREATE MODEL, because we want to maintain a loose couple between our system and the underlying SQL engine, and we cannot create the new data type for the SQL engine, like CREATE MODEL requires. We highly appreciate the work of TensorFlow Estimator, a high-level API for deep learning. The basic idea behind Estimator is to implement each deep learning model, and related training/testing/evaluating algorithms as a Python class derived from tf.estimator.Estimator. As we want to keep our SQL syntax simple, we would make the system extensible by calling estimators contributed by machine learning experts and written in Python. The SQL syntax must allow users to set Estimator attributes (parameters of the Python class’ constructor, and those of train, evaluate, or predict). Users can choose to use default values. We have a plan to integrate our hyperparameter estimation research into the system to optimize the default values. Though estimators derived from tf.estimator.Estimator run algorithms as TensorFlow graphs; SQLFlow doesn’t restrict that the underlying machine learning toolkit has to be TensorFlow. Indeed, as long as an estimator provides methods of train, evaluate, and predict, SQLFlow doesn’t care if it calls TensorFlow or xgboost. Precisely, what SQLFlow expect is an interface like the following: class AnEstimatorClass: __init__(self, **kwargs) train(self, **kwargs) evaluate(self, **kwargs) predict(self, **kwargs) We also want to reuse the feature columns API from Estimator, which allows users to columns of tables in a SQL engine to features to the model. Extended SQL Syntax Again, just as the beginning of the iteration, we propose the syntax for training as SELECT * FROM kaggle_credit_fraud_training_data LIMIT 1000 TRAIN DNNClassifier /* a pre-defined TensorFlow estimator, tf.estimator.DNNClassifier */ WITH layers=[100, 200], /* a parameter of the Estimator class constructor */ train.batch_size = 8 /* a parameter of the Estimator.train method */ COLUMN *, /* all columns as raw features */ cross(v1, v9, v28) /* plus a derived (crossed) column */ LABEL class INTO sqlflow_models.my_model_table; /* saves trained model parameters and features into a table */ We see the redundancy of * in two clauses: SELECT and COLUMN. The following alternative can avoid the redundancy, but cannot specify the label. SELECT * /* raw features or the label? */ corss(v1, v9, v28) /* derived featuers */ FROM kaggle_credit_fraud_training_data Please be aware that we save the trained models into tables, instead of a variable maintained by the underlying SQL engine. To invent a new variable type to hold trained models, we’d make our system tightly integrated with the SQL engine, and harms the extensibility to other engines. The result table should include the following information: The estimator name, e.g., DNNClassifier in this case. Estimator attributes, e.g., layer and train.batch_size. The feature mapping, e.g., * and cross(v1, v9, v28). 
Similarly, to infer the class (fraud or regular), we could SELECT * FROM kaggle_credit_fraud_development_data PREDICT kaggle_credit_fraud_development_data.class USING sqlflow_models.my_model_table; System Architecture A Conceptual Overview In the prototype, we use the following architecture: SQL statement -> our SQL parser --standard SQL-> MySQL -extended SQL-> code generator -> execution engine In the prototype, the code generator generates a Python program that trains or predicts. In either case, it retrieves the data from MySQL via MySQL Connector Python API, optionally, retrieves the model from MySQL, trains the model or predicts using the trained model by calling the user specified TensorFlow estimator, and writes the trained model or prediction results into a table. Working with Jupyter Notebook and Kubernetes The following figures shows the system components and their runtime environment. The left part shows how to run the system on a PC/laptop, the right part shows how to run it on a Kubernetes cluster.", "url": "/sqlflow/doc/syntax.html", "relUrl": "/sqlflow/doc/syntax.html" }, - "14": { - "id": "14", + "15": { + "id": "15", "title": "SQLFlow: Code Walkthrough", "content": "SQLFlow: Code Walkthrough User Experience SQLFlow allows users to write SQL programs with extended syntax in Jupyter Notebook or a command-line tool. The following SQL statements train a TensorFlow model named DNNClassifier, which is a Python class derived from tf.estimator.Estimator: SELECT * FROM a_table TRAIN DNNClassifier WITH learning_rate=0.01 INTO sqlflow_models.my_model; And the following statement uses the trained model for prediction. SELECT * FROM b_table PREDICT b_table.predicted_label USING sqlflow_models.my_model; Please be aware that the part in the above statements before the extended keyword TRAIN and PREDICT is a standard SQL statement. This feature simplifies the implementation of the SQLFlow system. System Implementation If a SQL statement is of the standard syntax, SQLFlow throws it to the SQL engine and redirects the output to the user; otherwise, SQLFlow translates the statement of extended syntax into a Python program. Currently, it generates a program that throws the standard-syntax part of SELECT to MySQL, reads the results in the train-loop of a TensorFlow program. We will talk about how to extend SQLFlow to connect more SQL engines like Oracle, Hive, and SparkSQL, and generates more types of machine learning programs that calls distributed TensorFlow, PyTorch, and xgboost later. Before that, let us explain the system components. SQLFlow as a gRPC Server SQLFlow is a gRPC server, which can connect with multiple clients. A typical client is pysqlflow, the SQLFlow plugin for Jupyter Notebook server. Another once is a text-based client /cmd/sqlflowserver/main.go. Jupyter Notebook (SQL statements)--> SQLFlow gRPC server (SQLFlow magic command) <--(a stream of messages)-- The protobuf definition of the gRPC service is at /server/proto/sqlflow.proto. The return of the method SQLFlow.Run is a stream of Reponses, where each represents either a table header, a row, or a log message. The header and rows are usually from a standard SQL statement, for example, SELECT or DESCRIBE, and the log messages are usually from the running of a generated Python program. 
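To make the streaming contract concrete, here is a minimal sketch of a Go client that calls SQLFlow.Run and drains the response stream. The import path, the generated pb package, the Request message with its Sql field, and the client constructor name are assumptions for illustration only; the authoritative definitions live in /server/proto/sqlflow.proto.

```go
package main

import (
	"context"
	"io"
	"log"

	"google.golang.org/grpc"

	// Assumed import path of the generated gRPC stubs.
	pb "github.com/sql-machine-learning/sqlflow/server/proto"
)

func main() {
	conn, err := grpc.Dial("localhost:50051", grpc.WithInsecure())
	if err != nil {
		log.Fatal(err)
	}
	defer conn.Close()

	client := pb.NewSQLFlowClient(conn) // generated constructor, name assumed
	stream, err := client.Run(context.Background(), &pb.Request{Sql: "SELECT 1;"})
	if err != nil {
		log.Fatal(err)
	}
	// Each streamed Response is either a table header, a data row, or a log line.
	for {
		resp, err := stream.Recv()
		if err == io.EOF {
			break
		}
		if err != nil {
			log.Fatal(err)
		}
		log.Println(resp) // a real client would switch on the oneof field here
	}
}
```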
SQLFlow in the gRPC Server Once the SQLFlow server receives a batch of SQL statements via a gRPC call, it runs the following steps for each statement: the parser to generate parsing result, the verifier to verify the semantics given the parsing result, the code generator to generate a Python program, or the submitter, from the parsing result, the executor that runs the submitter locally. Step 3. and 4. are only for a SQL statement of extended syntax; otherwise, SQLFlow server proxies the standard-syntax statement to the SQL engine. The executor calls Go’s standard package that captures the stdout and stderr from the submitter process and passing the result back to the gRPC client. Therefore, it is the responsibility of the submitter to print log messages to its stderr and stdout. Minimal Viable Product In the minimal viable product (MVP) of SQLFlow, the code generator generates a Python program consists of two parts: throw the standard SELECT part in the extended-syntax statement to MySQL via ODBC, and a loop that reads outputs from the run of the SELECT statement and trains the model (or, using a trained model to predict). The training part calls TensorFlow to update the parameters of the model specified in the TRAIN clause. Extensibility By writing more code generators, we could extend SQLFlow to support more SQL engines, e.g., Hive and Oracle, and use machine learning toolkits, e.g., PyTorch and xgboost, in addition to TensorFlow, on various computing platforms. You are welcome to add more code generators such as codegen_distributed_tf.go to generate a submitter program similar to the MVP but runs a distributed TensorFlow training job. codegen_kubernetes_tf.go to launch a distributed TensorFlow job on a Kubernetes cluster, other than running locally, in the same container as where SQLFlow gRPC server resides. codegen_gcloud_pytorch.go to launch a submitter that calls PyTorch instead of TensorFlow for training on the Google Cloud. Job Management The current design of the gRPC interface assumes that the connection between the client, e.g., the Jupyter Notebook, and the SQLFlow server keeps alive during the running of the training program. This assumption is reasonable because even if the user closes her/his Web browser and disconnect to the Jupyter Notebook server, the connection between Jupyter to SQLFlow server might keep alive. However, this might not be robust enough if the Jupyter Notebook server runs on a user’s laptop and gets killed. In such a case, the gRPC server cannot stream the messages back to the client and would cause the failure of the submitter. A solution is to change the gRPC interface of SQLFlow server to have a method that files a job and returns immediately, and another method to get a batch of recent messages given a job ID. We will make a design for that soon.", "url": "/sqlflow/doc/walkthrough.html", "relUrl": "/sqlflow/doc/walkthrough.html" }, - "15": { - "id": "15", + "16": { + "id": "16", "title": "SQLFlow", "content": "SQLFlow SQLFlow enriches SQL systems, e.g., MySQL, Hive, SparkSQL, with the capability of machine learning, using TensorFlow. We slightly extend the syntax of the SELECT statement of SQL to support model training and prediction. How to Use Quick start Extended SQL syntax How to Contribute Build from source code. The walkthrough of the source code The choice of parser generator Questions and Feedback Your feedback is our motivation to move on. Please let us know your questions, concerns, and issues by filing Github Issues. 
License Apache License 2.0", "url": "/sqlflow/", "relUrl": "/sqlflow/" }, - "16": { - "id": "16", + "17": { + "id": "17", + "title": "CalciteParser gRPC Server for SQLFlow", + "content": "CalciteParser gRPC Server for SQLFlow How to Build The following build process doesn’t require us to install Java SDK, Maven, protobuf-compiler, and any dependencies. Instead, it installs all such staff into a Docker image and use the Docker image as the build toolchain. Building in Docker containers standardizes development environments of all contributors, keeps our host computer clean, and works on macOS, Linux, BSD, Windows, and all platforms that have Docker. Build the development Docker image: docker build -t calcite:dev . Or, if it takes too long time for you to build the image, please feel free to use mine: docker pull cxwangyi/calcite:dev docker tag cxwangyi/calcite:dev calcite:dev Generate Java source code from protobuf messages: docker run --rm -it -v $PWD:/work -w /work calcite:dev protoc --java_out=. CalciteParser.proto docker run --rm -it -v $PWD:/work -w /work calcite:dev protoc --grpc-java_out=. CalciteParser.proto Build and generate .class files: docker run --rm -it -v $PWD:/work -w /work calcite:dev javac *.java All, actually, we can do all above in a single command: docker run --rm -it -v $PWD:/work -w /work calcite:dev bash ./build_and_test.bash", + "url": "/sqlflow/calcite-parser/", + "relUrl": "/sqlflow/calcite-parser/" + }, + "18": { + "id": "18", "title": "SQLFlow Demo", "content": "SQLFlow Demo Before you start, you need to set up test MySQL server as described here Start SQLFlow as go get -d ./... && go run main.go -logdir="/path/to/logs/" -loglevel="info" Regular SQL Statements sqlflow> SHOW DATABASES; +--+ | Database | +--+ | churn | | information_schema | | iris | | mysql | | performance_schema | | sqlflow_models | | sqlfs | | sys | +--+ sqlflow> SELECT * FROM iris.train LIMIT 1; +--+-+--+-+-+ | sepal_length | sepal_width | petal_length | petal_width | class | +--+-+--+-+-+ | 6.4 | 2.8 | 5.6 | 2.2 | 2 | +--+-+--+-+-+ Train sqlflow> SELECT * FROM iris.train TRAIN DNNClassifier WITH n_classes = 3, hidden_units = [10, 20] COLUMN sepal_length, sepal_width, petal_length, petal_width LABEL class INTO sqlflow_models.my_dnn_model; -- 2018/12/16 15:03:54 tensorflowCmd: run in Docker container Job success Predict sqlflow> SELECT * FROM iris.test PREDICT iris.predict.class USING sqlflow_models.my_dnn_model; -- 2018/12/16 15:05:58 tensorflowCmd: run in Docker container Job success The prediction is stored at iris.predict, you can look them up at sqlflow> SELECT * FROM iris.predict LIMIT 10; -- +--+-+--+-+-+ | sepal_length | sepal_width | petal_length | petal_width | class | +--+-+--+-+-+ | 6.4 | 2.8 | 5.6 | 2.2 | 2 | | 5 | 2.3 | 3.3 | 1 | 1 | | 4.9 | 2.5 | 4.5 | 1.7 | 2 | | 4.9 | 3.1 | 1.5 | 0.1 | 0 | | 5.7 | 3.8 | 1.7 | 0.3 | 0 | | 4.4 | 3.2 | 1.3 | 0.2 | 0 | | 5.4 | 3.4 | 1.5 | 0.4 | 0 | | 6.9 | 3.1 | 5.1 | 2.3 | 2 | | 6.7 | 3.1 | 4.4 | 1.4 | 1 | | 5.1 | 3.7 | 1.5 | 0.4 | 0 | +--+-+--+-+-+", "url": "/sqlflow/cmd/demo/", "relUrl": "/sqlflow/cmd/demo/" }, - "17": { - "id": "17", + "19": { + "id": "19", "title": "The Credit Card Fraud Detection Example", "content": "The Credit Card Fraud Detection Example The downsample/creditcard.csv file in this directory comes from Kaggle. 
Down-sample To make it an example that can be checked into a git repo, I randomly picked up 2000 out from the 284,808 lines from the original dataset using the following command: gshuf -n 5000 /tmp/creditcard.csv > ./downsample/creditcard.csv where /tmp/creditcard.csv is the one downloaded from Kaggle. If you want to down-sample by yourself on macOS, you might need to install gshuf. brew install coreutils Import into MySQL To develop/test/debug the TensorFlow program automatically generated by SQLFlow, which is supposed to be able to read data from MySQL, we’d import the downsample/creditcard.csv into MySQL. Run MySQL Server For more about running MySQL in Docker containers, please refer to Run MySQL Server and Client in Docker Containers. The following command starts the MySQL server: docker run --rm -v $HOME/work/creditcarddb:/var/lib/mysql --name mysql01 -e MYSQL_ROOT_PASSWORD=root -e MYSQL_ROOT_HOST='%' -p 3306:3306 -d mysql/mysql-server:8.0 mysqld --secure-file-priv="" Please be aware that the above command doesn’t use the default entrypoint of the Docker image; instead, it explicitly start mysqld in the container so that it could provide the command line option --secure-file-priv="", which is required by mysqlimport. Run the following command to start a MySQL client: docker run --rm -it -v $HOME/work/creditcarddb:/var/lib/mysql mysql/mysql-server:8.0 mysql -uroot -p Remember to type the password root when prompted. Then input the following SQL program in the client to create the table creditcard.creditcard: CREATE DATABASE creditcard; USE creditcard; CREATE TABLE IF NOT EXISTS creditcard ( time INT, v1 float, v2 float, v3 float, v4 float, v5 float, v6 float, v7 float, v8 float, v9 float, v10 float, v11 float, v12 float, v13 float, v14 float, v15 float, v16 float, v17 float, v18 float, v19 float, v20 float, v21 float, v22 float, v23 float, v24 float, v25 float, v26 float, v27 float, v28 float, amount float, class varchar(255)); Before we could import the CSV file, we must follow the security policy of MySQL, which is wired and hard to understand, and copy the CSV file into the database directory: cp downsample/creditcard.csv ~/work/creditcarddb/creditcard/ The following command runs mysqlimport in the mysql/mysql Docker image to import the data: docker run --rm -it -v $HOME/work/creditcarddb:/var/lib/mysql -v $HOME/work/creditcard:/var/lib/mysql-files mysql:8.0 mysqlimport --ignore-lines=1 --fields-terminated-by=, --socket /var/lib/mysql/mysql.sock -u root -p creditcard creditcard.csv Please be aware that mysqlimport doesn’t exist in the mysql/mysql-server image. 
A message likes the following indicates the success of importing: creditcard.creditcard: Records: 1999 Deleted: 0 Skipped: 0 Warnings: 0 We can verify the imported result by running a SELECT statement like the following docker run --rm -it -v $HOME/work/creditcarddb:/var/lib/mysql mysql/mysql-server:8.0 mysql -uroot -p -e 'use creditcard; select time, v1, v28, amount, class from creditcard limit 10;' which should print something like the following +--+--++--+-+ | time | v1 | v28 | amount | class | +--+--++--+-+ | 123590 | -0.341154 | 0.130741 | 69 | "0" | | 135797 | -0.118853 | 0.11599 | 11.79 | "0" | | 161249 | -1.04129 | -0.362025 | 7.38 | "0" | | 72471 | 0.603151 | 0.0551408 | 352.04 | "0" | | 120182 | 2.10678 | -0.0566284 | 0.89 | "0" | | 160476 | 2.37223 | -0.0724277 | 6 | "0" | | 135023 | 1.87987 | 0.0214597 | 20.05 | "0" | | 117329 | -0.375213 | -0.130152 | 159 | "0" | | 148637 | 1.83178 | -0.0378951 | 38.37 | "0" | | 48293 | 1.26094 | 0.0275234 | 12 | "0" | +--+--++--+-+", "url": "/sqlflow/example/creditcard/", "relUrl": "/sqlflow/example/creditcard/" }, - "18": { - "id": "18", + "20": { + "id": "20", "title": "The MySQL Server Container for Testing", "content": "The MySQL Server Container for Testing This directory contains a Dockerfile that builds a Docker image derived the MySQL Server 8.0 image, and includes SQL programs that popularize the following datasets: Churn from Kaggle Irises classfication from TensorFlow We can run a Docker container of it for unit testing. Build cd example/datasets docker build -t sqlflow:data . Run docker run --rm -d --name sqlflowdata -p 3306:3306 -e MYSQL_ROOT_PASSWORD=root -e MYSQL_ROOT_HOST=% sqlflow:data Popularize Datasets We need to manually popularize the databases and tables: docker exec -it sqlflowdata bash To popularize the Churn dataset into churn: cat /popularize_churn.sql | mysql -uroot -proot To popularize the Irises dataset into iris: cat /popularize_iris.sql | mysql -uroot -proot To prepare database for storing machine learning models: echo "CREATE DATABASE IF NOT EXISTS sqlflow_models;" | mysql -uroot -proot Query In the container, run echo "select count(*) from churn.test;" | mysql -uroot -proot should print the number of rows as the following count(*) 10 Trouble shooting: It usually takes about 15 seconds to bring up the MySQL Server. 
If you try to connect it before that, you may see the following error ERROR 1045 (28000): Access denied for user 'root'@'localhost' (using password: YES)", "url": "/sqlflow/example/datasets/", "relUrl": "/sqlflow/example/datasets/" }, - "19": { - "id": "19", + "21": { + "id": "21", "title": "Credit Card Fraud Detection", "content": "Credit Card Fraud Detection Data collected from here To run: python premade_estimator.py, which gives Adding numeric_columns: Time Adding numeric_columns: V1 Adding numeric_columns: V2 Adding numeric_columns: V3 Adding numeric_columns: V4 Adding numeric_columns: V5 Adding numeric_columns: V6 Adding numeric_columns: V7 Adding numeric_columns: V8 Adding numeric_columns: V9 Adding numeric_columns: V10 Adding numeric_columns: V11 Adding numeric_columns: V12 Adding numeric_columns: V13 Adding numeric_columns: V14 Adding numeric_columns: V15 Adding numeric_columns: V16 Adding numeric_columns: V17 Adding numeric_columns: V18 Adding numeric_columns: V19 Adding numeric_columns: V20 Adding numeric_columns: V21 Adding numeric_columns: V22 Adding numeric_columns: V23 Adding numeric_columns: V24 Adding numeric_columns: V25 Adding numeric_columns: V26 Adding numeric_columns: V27 Adding numeric_columns: V28 Adding numeric_columns: Amount INFO:tensorflow:Using default config. WARNING:tensorflow:Using temporary folder as model directory: /tmp/tmpzj6rzutf INFO:tensorflow:Using config: {'_num_worker_replicas': 1, '_global_id_in_cluster': 0, '_save_checkpoints_steps': None, '_tf_random_seed': None, '_evaluation_master': '', '_protocol': None, '_session_config': allow_soft_placement: true graph_options { rewrite_options { meta_optimizer_iterations: ONE } } , '_model_dir': '/tmp/tmpzj6rzutf', '_device_fn': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f40f9b40438>, '_save_summary_steps': 100, '_eval_distribute': None, '_keep_checkpoint_max': 5, '_master': '', '_train_distribute': None, '_log_step_count_steps': 100, '_num_ps_replicas': 0, '_experimental_distribute': None, '_save_checkpoints_secs': 600, '_service': None, '_task_type': 'worker', '_task_id': 0, '_keep_checkpoint_every_n_hours': 10000, '_is_chief': True} INFO:tensorflow:Calling model_fn. INFO:tensorflow:Done calling model_fn. INFO:tensorflow:Create CheckpointSaverHook. INFO:tensorflow:Graph was finalized. INFO:tensorflow:Running local_init_op. INFO:tensorflow:Done running local_init_op. INFO:tensorflow:Saving checkpoints for 0 into /tmp/tmpzj6rzutf/model.ckpt. INFO:tensorflow:loss = 892.3532, step = 1 INFO:tensorflow:global_step/sec: 295.764 INFO:tensorflow:loss = 0.00025717804, step = 101 (0.338 sec) INFO:tensorflow:global_step/sec: 446.454 INFO:tensorflow:loss = 0.00024299514, step = 201 (0.224 sec) INFO:tensorflow:global_step/sec: 444.479 INFO:tensorflow:loss = 0.00020988214, step = 301 (0.225 sec) INFO:tensorflow:global_step/sec: 421.979 INFO:tensorflow:loss = 0.00023327809, step = 401 (0.237 sec) INFO:tensorflow:global_step/sec: 446.172 INFO:tensorflow:loss = 0.00020405005, step = 501 (0.224 sec) INFO:tensorflow:global_step/sec: 431.785 INFO:tensorflow:loss = 0.00017921966, step = 601 (0.232 sec) INFO:tensorflow:global_step/sec: 442.912 INFO:tensorflow:loss = 0.00018488085, step = 701 (0.226 sec) INFO:tensorflow:global_step/sec: 444.628 INFO:tensorflow:loss = 0.00017795907, step = 801 (0.225 sec) INFO:tensorflow:global_step/sec: 428.385 INFO:tensorflow:loss = 0.00016896412, step = 901 (0.233 sec) INFO:tensorflow:Saving checkpoints for 1000 into /tmp/tmpzj6rzutf/model.ckpt. 
INFO:tensorflow:Loss for final step: 0.00016340206. INFO:tensorflow:Calling model_fn. WARNING:tensorflow:Trapezoidal rule is known to produce incorrect PR-AUCs; please switch to "careful_interpolation" instead. WARNING:tensorflow:Trapezoidal rule is known to produce incorrect PR-AUCs; please switch to "careful_interpolation" instead. INFO:tensorflow:Done calling model_fn. INFO:tensorflow:Starting evaluation at 2018-10-16-01:02:49 INFO:tensorflow:Graph was finalized. INFO:tensorflow:Restoring parameters from /tmp/tmpzj6rzutf/model.ckpt-1000 INFO:tensorflow:Running local_init_op. INFO:tensorflow:Done running local_init_op. INFO:tensorflow:Finished evaluation at 2018-10-16-01:02:49 INFO:tensorflow:Saving dict for global step 1000: accuracy = 1.0, accuracy_baseline = 1.0, auc = 0.9999999, auc_precision_recall = 0.0, average_loss = 0.0001784083, global_step = 1000, label/mean = 0.0, loss = 0.0016056747, precision = 0.0, prediction/mean = 0.00017826515, recall = 0.0 INFO:tensorflow:Saving 'checkpoint_path' summary for global step 1000: /tmp/tmpzj6rzutf/model.ckpt-1000 Test set accuracy: 1.00000 INFO:tensorflow:Calling model_fn. INFO:tensorflow:Done calling model_fn. INFO:tensorflow:Graph was finalized. INFO:tensorflow:Restoring parameters from /tmp/tmpzj6rzutf/model.ckpt-1000 INFO:tensorflow:Running local_init_op. INFO:tensorflow:Done running local_init_op. Prediction is "Not Fraud" (100.0%), expected "Not Fraud" Prediction is "Not Fraud" (100.0%), expected "Not Fraud" Prediction is "Not Fraud" (100.0%), expected "Not Fraud"", "url": "/sqlflow/example/fraud_detection/", "relUrl": "/sqlflow/example/fraud_detection/" }, - "20": { - "id": "20", + "22": { + "id": "22", "title": "sqlflowserver", "content": "sqlflowserver The gRPC proxy server of SQL engines Prerequisite Golang Docker GRPC Running go generate && cd server && go run main.go", "url": "/sqlflow/server/", "relUrl": "/sqlflow/server/" }, - "21": { - "id": "21", + "23": { + "id": "23", "title": "SQLFlow", "content": "SQLFlow Build, Run, Test Before running the unit tests, we need to build and run a Docker container that hosts a MySQL database following this guide. To build the parser using goyacc and run all unit tests, use the following command: go get -d ./... && goyacc -p sql -o parser.go sql.y && go test -v", "url": "/sqlflow/sql/", "relUrl": "/sqlflow/sql/" }, - "22": { - "id": "22", + "24": { + "id": "24", "title": "Python Code Template", "content": "Python Code Template To run a test, say, test_sql_data.py, we need to Start a container that runs MySQL server with populated data following this guide. Run the tests in a SQLFlow container that has TensorFlow and mysql_connector installed: docker run --rm -it --network="host" -v $PWD:/work -w /work sqlflow/sqlflow python test_sql_data.py where --network="host" allows processes running in the container to access the host’s network, where the MySQL server container exposes its port.", "url": "/sqlflow/sql/python/", "relUrl": "/sqlflow/sql/python/" }, - "23": { - "id": "23", + "25": { + "id": "25", "title": "sqlfs", "content": "sqlfs The package sqlfs provides an io.ReadCloser and an io.WriteCloser that treats a SQL database a filesystem, where each table in the database is like a file. The schema of the table is very simple – it has only one column of BLOB type. All the rows consist the storage. sqlfs provides the following features. Create a file To create a table named “hello” in a database “mydb” for writing, we can call Create. 
f, e := sqlfs.Create(db, "mydb.hello") f.Write([]byte("hello world! n")) f.Close() where db comes from a call to database/sql.Open. Append to a file f, e := sqlfs.Append(db, "mydb.hello") f.Write([]byte("hello world! n") f.Close() Read from a file f, e := sqlfs.Open(db, "mydb.hello") buf := make([]byte, 1024) f.Read(buf) f.Close() Remove a file DropTable(db, "mydb.hello") Check if a file exists HasTable(db, "mydb.hello") Other I/O operations Feel free to use standard packages io, ioutil, etc with sqlfs. For example, we can call io.Copy to copy everything from the standard input to a table. f, e := sqlfs.Create(db, "mydb.hello") io.Copy(f, os.Stdin) f.Close()", "url": "/sqlflow/sqlfs/", diff --git a/docs/doc_index/contribute.html b/docs/doc_index/contribute.html index dd00227..e47ecd1 100644 --- a/docs/doc_index/contribute.html +++ b/docs/doc_index/contribute.html @@ -48,6 +48,8 @@ + + @@ -97,13 +99,15 @@ + + @@ -135,6 +139,8 @@ + +