Commit 98fd6a9

tonyyang-svail authored and wangkuiyi committed
Rename TRAIN/PREDICT to TO TRAIN/TO PREDICT (#1128)
* Rename TRAIN/PREDICT to TO TRAIN/TO PREDICT
* merge develop
* fix tests
1 parent 67b7d7b · commit 98fd6a9

65 files changed: +364 -315 lines
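
At a glance, the rename only changes the clause keywords of SQLFlow's extended statements. A minimal before/after sketch based on the iris example in the README diff below (the trailing INTO clause falls outside the changed hunk and is assumed here):

```sql
-- Old syntax, before this commit:
SELECT * FROM iris.train
TRAIN DNNClassifier
WITH model.n_classes = 3, model.hidden_units = [10, 20]
COLUMN sepal_length, sepal_width, petal_length, petal_width
LABEL class
INTO sqlflow_models.my_dnn_model;  -- INTO target assumed from the predict example's USING clause

-- New syntax, after this commit: TRAIN becomes TO TRAIN
SELECT * FROM iris.train
TO TRAIN DNNClassifier
WITH model.n_classes = 3, model.hidden_units = [10, 20]
COLUMN sepal_length, sepal_width, petal_length, petal_width
LABEL class
INTO sqlflow_models.my_dnn_model;

-- Prediction statements change the same way: PREDICT becomes TO PREDICT
SELECT * FROM iris.test
TO PREDICT iris.predict.class
USING sqlflow_models.my_dnn_model;
```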

README.md

Lines changed: 2 additions & 2 deletions

@@ -31,7 +31,7 @@ Here are examples for training a Tensorflow [DNNClassifer](https://www.tensorflo
 ```sql
 sqlflow> SELECT *
 FROM iris.train
-TRAIN DNNClassifier
+TO TRAIN DNNClassifier
 WITH model.n_classes = 3, model.hidden_units = [10, 20]
 COLUMN sepal_length, sepal_width, petal_length, petal_width
 LABEL class
@@ -45,7 +45,7 @@ Done training
 ```sql
 sqlflow> SELECT *
 FROM iris.test
-PREDICT iris.predict.class
+TO PREDICT iris.predict.class
 USING sqlflow_models.my_dnn_model;

 ...

cmd/sqlflowserver/main_test.go

Lines changed: 24 additions & 24 deletions

@@ -337,7 +337,7 @@ func CaseTrainTextClassificationIR(t *testing.T) {
 a := assert.New(t)
 trainSQL := `SELECT news_title, class_id
 FROM text_cn.train_processed
-TRAIN DNNClassifier
+TO TRAIN DNNClassifier
 WITH model.n_classes = 17, model.hidden_units = [10, 20]
 COLUMN EMBEDDING(CATEGORY_ID(SPARSE(news_title,16000,COMMA), 16000),128,mean)
 LABEL class_id
@@ -352,7 +352,7 @@ func CaseTrainTextClassificationFeatureDerivation(t *testing.T) {
 a := assert.New(t)
 trainSQL := `SELECT news_title, class_id
 FROM text_cn.train_processed
-TRAIN DNNClassifier
+TO TRAIN DNNClassifier
 WITH model.n_classes = 17, model.hidden_units = [10, 20]
 COLUMN EMBEDDING(SPARSE(news_title,16000,COMMA),128,mean)
 LABEL class_id
@@ -576,7 +576,7 @@ func CaseTrainSQL(t *testing.T) {
 trainSQL := fmt.Sprintf(`
 SELECT *
 FROM %s.%s
-TRAIN DNNClassifier
+TO TRAIN DNNClassifier
 WITH
 model.n_classes = 3,
 model.hidden_units = [10, 20],
@@ -592,7 +592,7 @@ func CaseTrainSQL(t *testing.T) {

 predSQL := fmt.Sprintf(`SELECT *
 FROM %s.%s
-PREDICT %s.%s.class
+TO PREDICT %s.%s.class
 USING sqlflow_models.my_dnn_model;`, caseDB, caseTestTable, caseDB, casePredictTable)
 _, _, err = connectAndRunSQL(predSQL)
 if err != nil {
@@ -624,7 +624,7 @@ func CaseTrainFeatureDerevation(t *testing.T) {
 a := assert.New(t)
 trainSQL := fmt.Sprintf(`SELECT *
 FROM %s.%s
-TRAIN DNNClassifier
+TO TRAIN DNNClassifier
 WITH model.n_classes = 3, model.hidden_units = [10, 20]
 LABEL class
 INTO sqlflow_models.my_dnn_model;`, caseDB, caseTrainTable)
@@ -633,7 +633,7 @@ INTO sqlflow_models.my_dnn_model;`, caseDB, caseTrainTable)

 // TODO(typhoonzero): also support string column type for training and prediction (column c6)
 trainVaryColumnTypes := `SELECT c1, c2, c3, c4, c5, class from feature_derivation_case.train
-TRAIN DNNClassifier
+TO TRAIN DNNClassifier
 WITH model.n_classes=3, model.hidden_units=[10,10]
 COLUMN EMBEDDING(c3, 128, sum), EMBEDDING(SPARSE(c5, 10000, COMMA), 128, sum)
 LABEL class
@@ -646,7 +646,7 @@ func CaseTrainCustomModel(t *testing.T) {
 a := assert.New(t)
 trainSQL := `SELECT *
 FROM iris.train
-TRAIN sqlflow_models.DNNClassifier
+TO TRAIN sqlflow_models.DNNClassifier
 WITH model.n_classes = 3, model.hidden_units = [10, 20]
 COLUMN sepal_length, sepal_width, petal_length, petal_width
 LABEL class
@@ -658,7 +658,7 @@ INTO sqlflow_models.my_dnn_model_custom;`

 predSQL := `SELECT *
 FROM iris.test
-PREDICT iris.predict.class
+TO PREDICT iris.predict.class
 USING sqlflow_models.my_dnn_model_custom;`
 _, _, err = connectAndRunSQL(predSQL)
 if err != nil {
@@ -684,7 +684,7 @@ func CaseTrainTextClassification(t *testing.T) {
 a := assert.New(t)
 trainSQL := `SELECT news_title, class_id
 FROM text_cn.train_processed
-TRAIN DNNClassifier
+TO TRAIN DNNClassifier
 WITH model.n_classes = 17, model.hidden_units = [10, 20]
 COLUMN EMBEDDING(CATEGORY_ID(news_title,16000,COMMA),128,mean)
 LABEL class_id
@@ -699,7 +699,7 @@ func CaseTrainTextClassificationCustomLSTM(t *testing.T) {
 a := assert.New(t)
 trainSQL := `SELECT news_title, class_id
 FROM text_cn.train_processed
-TRAIN sqlflow_models.StackedBiLSTMClassifier
+TO TRAIN sqlflow_models.StackedBiLSTMClassifier
 WITH model.n_classes = 17, model.stack_units = [16], train.epoch = 1, train.batch_size = 32
 COLUMN EMBEDDING(SEQ_CATEGORY_ID(news_title,1600,COMMA),128,mean)
 LABEL class_id
@@ -714,7 +714,7 @@ func CaseTrainSQLWithHyperParams(t *testing.T) {
 a := assert.New(t)
 trainSQL := `SELECT *
 FROM iris.train
-TRAIN DNNClassifier
+TO TRAIN DNNClassifier
 WITH model.n_classes = 3, model.hidden_units = [10, 20], train.batch_size = 10, train.epoch = 2
 COLUMN sepal_length, sepal_width, petal_length, petal_width
 LABEL class
@@ -729,7 +729,7 @@ func CaseTrainDeepWideModel(t *testing.T) {
 a := assert.New(t)
 trainSQL := `SELECT *
 FROM iris.train
-TRAIN DNNLinearCombinedClassifier
+TO TRAIN DNNLinearCombinedClassifier
 WITH model.n_classes = 3, model.dnn_hidden_units = [10, 20], train.batch_size = 10, train.epoch = 2
 COLUMN sepal_length, sepal_width FOR linear_feature_columns
 COLUMN petal_length, petal_width FOR dnn_feature_columns
@@ -746,7 +746,7 @@ func CaseTrainCustomModelWithHyperParams(t *testing.T) {
 a := assert.New(t)
 trainSQL := `SELECT *
 FROM iris.train
-TRAIN sqlflow_models.DNNClassifier
+TO TRAIN sqlflow_models.DNNClassifier
 WITH model.n_classes = 3, model.hidden_units = [10, 20], train.batch_size = 10, train.epoch=2
 COLUMN sepal_length, sepal_width, petal_length, petal_width
 LABEL class
@@ -761,7 +761,7 @@ func CaseSparseFeature(t *testing.T) {
 a := assert.New(t)
 trainSQL := `SELECT news_title, class_id
 FROM text_cn.train
-TRAIN DNNClassifier
+TO TRAIN DNNClassifier
 WITH model.n_classes = 3, model.hidden_units = [10, 20]
 COLUMN EMBEDDING(CATEGORY_ID(news_title,16000,COMMA),128,mean)
 LABEL class_id
@@ -777,7 +777,7 @@ func CaseTrainElasticDL(t *testing.T) {
 a := assert.New(t)
 trainSQL := fmt.Sprintf(`SELECT sepal_length, sepal_width, petal_length, petal_width, class
 FROM %s.%s
-TRAIN ElasticDLDNNClassifier
+TO TRAIN ElasticDLDNNClassifier
 WITH
 model.optimizer = "optimizer",
 model.loss = "loss",
@@ -830,7 +830,7 @@ func CaseTrainALPS(t *testing.T) {
 SELECT deep_id, user_space_stat, user_behavior_stat, space_stat, l
 FROM %s.sparse_column_test
 LIMIT 100
-TRAIN DNNClassifier
+TO TRAIN DNNClassifier
 WITH
 model.n_classes = 2,
 model.hidden_units = [10, 20],
@@ -863,8 +863,8 @@ func CaseTrainALPSRemoteModel(t *testing.T) {
 trainSQL := fmt.Sprintf(`SELECT deep_id, user_space_stat, user_behavior_stat, space_stat, l
 FROM %s.sparse_column_test
 LIMIT 100
-TRAIN models.estimator.dnn_classifier.DNNClassifier
-WITH
+TO TRAIN models.estimator.dnn_classifier.DNNClassifier
+WITH
 model.n_classes = 2, model.hidden_units = [10, 20], train.batch_size = 10, engine.ps_num=0, engine.worker_num=0, engine.type=local,
 gitlab.project = "Alps/sqlflow-models",
 gitlab.source_root = python,
@@ -891,7 +891,7 @@ func CaseTrainALPSFeatureMap(t *testing.T) {
 trainSQL := fmt.Sprintf(`SELECT dense, deep, item, test_sparse_with_fm.label
 FROM %s.test_sparse_with_fm
 LIMIT 32
-TRAIN alipay.SoftmaxClassifier
+TO TRAIN alipay.SoftmaxClassifier
 WITH train.max_steps = 32, eval.steps=32, train.batch_size=8, engine.ps_num=0, engine.worker_num=0, engine.type = local
 COLUMN DENSE(dense, none, comma),
 DENSE(item, 1, comma, int)
@@ -931,7 +931,7 @@ func CaseTrainRegression(t *testing.T) {
 a := assert.New(t)
 trainSQL := fmt.Sprintf(`SELECT *
 FROM housing.train
-TRAIN LinearRegressor
+TO TRAIN LinearRegressor
 WITH model.label_dimension=1
 COLUMN f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13
 LABEL target
@@ -943,7 +943,7 @@ INTO sqlflow_models.my_regression_model;`)

 predSQL := fmt.Sprintf(`SELECT *
 FROM housing.test
-PREDICT housing.predict.target
+TO PREDICT housing.predict.target
 USING sqlflow_models.my_regression_model;`)
 _, _, err = connectAndRunSQL(predSQL)
 if err != nil {
@@ -977,7 +977,7 @@ func CaseTrainXGBoostRegression(t *testing.T) {
 trainSQL := fmt.Sprintf(`
 SELECT *
 FROM housing.train
-TRAIN xgboost.gbtree
+TO TRAIN xgboost.gbtree
 WITH
 objective="reg:squarederror",
 train.num_boost_round = 30
@@ -998,7 +998,7 @@ func CaseTrainAndAnalyzeXGBoostModel(t *testing.T) {
 trainStmt := `
 SELECT *
 FROM housing.train
-TRAIN xgboost.gbtree
+TO TRAIN xgboost.gbtree
 WITH
 objective="reg:squarederror",
 train.num_boost_round = 30
@@ -1040,7 +1040,7 @@ func CasePredictXGBoostRegression(t *testing.T) {
 a := assert.New(t)
 predSQL := fmt.Sprintf(`SELECT *
 FROM housing.test
-PREDICT housing.xgb_predict.target
+TO PREDICT housing.xgb_predict.target
 USING sqlflow_models.my_xgb_regression_model;`)
 _, _, err := connectAndRunSQL(predSQL)
 if err != nil {

doc/design/alps_submitter.md

Lines changed: 6 additions & 6 deletions

@@ -89,7 +89,7 @@ The column `c1` is dense encoded and `c2` is sparse encoded, `c3` is label colum
 select
 c1, c2, c3 as class
 from kaggle_credit_fraud_training_data
-TRAIN DNNClassifier
+TO TRAIN DNNClassifier
 WITH
 ...
 COLUMN
@@ -148,7 +148,7 @@ Here is an example which do `BUCKETIZED` on `c2` then `CROSS` with `c1`.
 select
 c1, c2, c3 as class
 from kaggle_credit_fraud_training_data
-TRAIN DNNClassifier
+TO TRAIN DNNClassifier
 WITH
 ...
 COLUMN
@@ -162,7 +162,7 @@ Feature Expressions except for Tensorflow Feature Column API should raise an err
 ```sql
 /* Not supported */
 select * from kaggle_credit_fraud_training_data
-TRAIN DNNClassifier
+TO TRAIN DNNClassifier
 WITH
 ...
 COLUMN
@@ -206,7 +206,7 @@ Let's create a DNNClassifier example, the minimum parameters of the constructor
 select
 c1, c2, c3 as class
 from kaggle_credit_fraud_training_data
-TRAIN DNNClassifier
+TO TRAIN DNNClassifier
 WITH
 estimator.hidden_units = [10, 20],
 train_spec.max_steps = 2000,
@@ -223,7 +223,7 @@ For now, we will pass the result of snippet code as `feature_columns` parameters
 select
 c1, c2, c3, c4, c5 as class
 from kaggle_credit_fraud_training_data
-TRAIN DNNLinearCombinedClassifier
+TO TRAIN DNNLinearCombinedClassifier
 WITH
 linear_feature_columns = [fc1, fc2]
 dnn_feature_columns = [fc3]
@@ -234,4 +234,4 @@ COLUMN
 CROSS([fc1, fc2, f3]) as fc3
 LABEL class
 ...
-```
+```

doc/design/analyzer.md

Lines changed: 2 additions & 2 deletions

@@ -8,13 +8,13 @@ This design doc introduces how to support the `Analyze SQL` in SQLFlow with SHAP

 ## User Interface

-Users usually use a **TRAIN SQL** to train a model and then analyze the model using an **ANALYZE SQL**, the simple pipeline like:
+Users usually use a **TO TRAIN SQL** to train a model and then analyze the model using an **ANALYZE SQL**, the simple pipeline like:

 Train SQL:

 ``` sql
 SELECT * FROM train_table
-TRAIN xgboost.Estimator
+TO TRAIN xgboost.Estimator
 WITH
 train.objective = "reg:linear"
 COLUMN x
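
The hunk above stops before the ANALYZE statement of that pipeline. For context, a rough sketch of what such a follow-up statement looks like in this design (the model name and explainer here are assumptions, not part of this diff):

```sql
-- Hypothetical Analyze SQL paired with the Train SQL above; the exact
-- clause attributes come from the analyzer design and are not shown in this hunk.
SELECT * FROM train_table
ANALYZE my_xgb_model
USING TreeExplainer;
```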

doc/design/ant_xgboost.md

Lines changed: 3 additions & 3 deletions

@@ -33,7 +33,7 @@ Comparing to python API provided by `xgboost`, it is easier to build a python co
 ### User Experience

 In terms of sqlflow users, xgboost is an alternative `Estimator` like `TensorFlow Estimators`.
-Working with xgboost is quite similar to working with TensorFlow Estimators; just change `TRAIN DNNClassifier` into `TRAIN XGBoostEstimator`.
+Working with xgboost is quite similar to working with TensorFlow Estimators; just change `TO TRAIN DNNClassifier` into `TO TRAIN XGBoostEstimator`.

 In addition, xgboost specific parameters can be configured in the same way as TensorFlow parameters.

@@ -44,7 +44,7 @@ Below is a demo about training/predicting via xgboost :
 select
 c1, c2, c3, c4, c5 as class
 from kaggle_credit_fraud_training_data
-TRAIN XGBoostEstimator
+TO TRAIN XGBoostEstimator
 WITH
 booster = "gbtree"
 objective = "logistic:binary"
@@ -62,7 +62,7 @@ INTO sqlflow_models.xgboost_model_table;
 select
 c1, c2, c3, c4
 from kaggle_credit_fraud_development_data
-PREDICT kaggle_credit_fraud_development_data.class
+TO PREDICT kaggle_credit_fraud_development_data.class
 USING sqlflow_models.xgboost_model_table;
 ```


doc/design/clustermodel.md

Lines changed: 6 additions & 6 deletions

@@ -22,13 +22,13 @@ The figure below demonstrates the overall workflow for cluster model training, w

 In this scenario, we focus on the extraction of data patterns in unsupervised learning.

-So, the user can use `TRAIN` keyword to training a model. The user can also specify the training hyper-parameters with the keyword `WITH` and determine whether to use pre-trained model by `USING`. The training and predicting syntax looks like:
+So, the user can use `TO TRAIN` keyword to training a model. The user can also specify the training hyper-parameters with the keyword `WITH` and determine whether to use pre-trained model by `USING`. The training and predicting syntax looks like:

-TRAIN SQL:
+TO TRAIN SQL:

 ``` sql
 SELECT * FROM input_table
-TRAIN clusterModel
+TO TRAIN clusterModel
 WITH
 model.encode_units = [100, 7]
 model.n_clusters = 5
@@ -38,12 +38,12 @@ USING existed_pretrain_model
 INTO my_cluster_model;
 ```

-PREDICT SQL:
+TO PREDICT SQL:

 ``` sql
 SELECT *
 FROM input_table
-PREDICT output_table.group_id
+TO PREDICT output_table.group_id
 USING my_cluster_model;
 ```

@@ -108,7 +108,7 @@ Therefore, there are four cases in total:

 - In the first stage of the clustering model on SQLFlow, we plan to achieve the `first case`. We will achieve the other cases in the later.

-- Users can use the trained cluster model in ` PREDICT SQL` to predict the group of input_table to get output_table.
+- Users can use the trained cluster model in ` TO PREDICT SQL` to predict the group of input_table to get output_table.

 - Finally, the user can perform a combined aggregation operation on the output_table based on the SQL statement to obtain a result_table, which can be saved to the local dataframe and then analyzed according to his own needs.
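
The aggregation step in the last bullet is plain SQL over the prediction output; a minimal sketch, reusing the output_table and group_id names from the TO PREDICT example above (the aggregation itself is illustrative):

```sql
-- Count how many rows fall into each predicted cluster.
SELECT group_id, COUNT(*) AS group_size
FROM output_table
GROUP BY group_id
ORDER BY group_size DESC;
```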

doc/design/database_abstraction_layer.md

Lines changed: 2 additions & 2 deletions

@@ -8,7 +8,7 @@ SQLFlow calls Go's [standard database API](https://golang.org/pkg/database/sql/)

 ### Data Retrieval

-The basic idea of SQLFlow is to extend the SELECT statement of SQL to have the TRAIN and PREDICT clauses. For more discussion, please refer to the [syntax design](syntax.md). SQLFlow translates such "extended SQL statements" into submitter programs, which forward the part from SELECT to TRAIN or PREDICT, which we call the "standard part", to the SQL engine. SQLFlow also accepts the SELECT statement without TRAIN or PREDICT clauses and would forward such "standard statements" to the engine. It is noticeable that the "standard part" or "standard statements" are not standardized. For example, various engines use different syntax for `FULL OUTER JOIN`.
+The basic idea of SQLFlow is to extend the SELECT statement of SQL to have the TRAIN and PREDICT clauses. For more discussion, please refer to the [syntax design](syntax.md). SQLFlow translates such "extended SQL statements" into submitter programs, which forward the part from SELECT to TO TRAIN or TO PREDICT, which we call the "standard part", to the SQL engine. SQLFlow also accepts the SELECT statement without TO TRAIN or TO PREDICT clauses and would forward such "standard statements" to the engine. It is noticeable that the "standard part" or "standard statements" are not standardized. For example, various engines use different syntax for `FULL OUTER JOIN`.

 - Hive supports `FULL OUTER JOIN` directly.
 - MySQL doesn't have `FULL OUTER JOIN`. However, a user can emulates `FULL OUTER JOIN` using `LEFT JOIN`, `UNION` and `RIGHT JOIN`.
@@ -24,7 +24,7 @@ SELECT
 name,
 age,
 income
-FROM employee TRAIN DNNRegressor
+FROM employee TO TRAIN DNNRegressor
 WITH hidden_layers=[10,50,10]
 COLUMN name, agee LABEL income;
 ```
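
To make the "standard part" versus extended-clause split described above concrete, here is the same statement from the hunk above with the split marked in comments (the comments are annotations for illustration, not part of the diff):

```sql
-- Standard part: forwarded verbatim to the underlying SQL engine.
SELECT
  name,
  age,
  income
FROM employee
-- Extended part: parsed by SQLFlow itself, beginning at TO TRAIN.
TO TRAIN DNNRegressor
WITH hidden_layers=[10,50,10]
COLUMN name, agee LABEL income;
```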
