Fixes #8, #9, refactoring

sdpython · Oct 13, 2018 · ea856ef · ea856ef
1 parent cd68d6e
commit ea856ef
Show file tree

Hide file tree

Showing 32 changed files with 993 additions and 301 deletions.
diff --git a/.gitignore b/.gitignore
@@ -281,3 +281,6 @@ src/csharpyml/binaries/*.lib
 src/csharpyml/binaries/*.xml
 src/csharpyml/binaries/*.json
 *.err
+_doc/sphinxdoc/source/components
+cscode/packages
+_doc/examples/model.zip
diff --git a/_doc/examples/README.txt b/_doc/examples/README.txt
@@ -3,9 +3,3 @@
 Gallery of examples
 ===================
 
-
-First section
--------------
-
-One example of a gallery.
-
diff --git a/_doc/examples/plot_cspipeline.py b/_doc/examples/plot_cspipeline.py
@@ -0,0 +1,60 @@
+"""
+Trains a FastTree (boosted trees) classifier on Iris dataset
+============================================================
+
+The following example shows how to create and train
+a pipeline using :ref:`l-fasttree-(boosted-trees)-classification`.
+"""
+import sys
+import os
+import unittest
+import numpy
+from sklearn import datasets
+from sklearn.model_selection import train_test_split
+import pandas
+from csharpyml.binaries import CSPipeline
+
+##############################
+# Let's first retrieve the data.
+
+X, y = datasets.load_iris(return_X_y=True)
+X_train, X_test, y_train, y_test = train_test_split(
+    X.astype(numpy.float32), y.astype(numpy.float32))
+df_train = pandas.DataFrame(data=X_train, columns=["FA", "FB", "FC", "FD"])
+df_train["Label"] = y_train
+
+df_test = pandas.DataFrame(data=X_test, columns=["FA", "FB", "FC", "FD"])
+df_test["Label"] = y_test
+
+##############################
+# Let's create a pipeline.
+pipe = CSPipeline(["concat{col=Feat:FA,FB,FC,FD}"],
+                  "oova{p=ft}", verbose=2)
+
+#############################
+# Let's train it.
+pipe.fit(df_train, feature="Feat", label="Label")
+
+###############################################
+# Let's show the output.
+
+print(pipe.StdOut)
+
+#################################
+# Let's predict.
+
+pred = pipe.predict(df_test)
+print(pred.head())
+
+###########################
+# Let's save the model.
+
+outfile = "model.zip"
+pipe.save(outfile)
+
+#############################
+# Let's load it.
+
+pipe2 = CSPipeline.load(outfile)
+pred2 = pipe2.predict(df_test)
+print(pred2.head())
diff --git a/_doc/notebooks/csharp_for_ml_in_notebook.ipynb b/_doc/notebooks/csharp_for_ml_in_notebook.ipynb
diff --git a/_doc/sphinxdoc/source/conf.py b/_doc/sphinxdoc/source/conf.py
@@ -14,6 +14,8 @@
 
 blog_root = "http://www.xavierdupre.fr/app/csharpyml/helpsphinx/"
 
+extensions.extend(['csharpyml.sphinxext.sphinx_mlext'])
+
 html_context = {
     'css_files': get_default_stylesheet() + ['_static/my-styles.css', '_static/gallery.css'],
 }
@@ -38,10 +40,20 @@ def custom_latex_processing(latex):
 epkg_dictionary.update({
     'C#': 'https://en.wikipedia.org/wiki/C_Sharp_(programming_language)',
     'C# DataFrame': 'https://github.com/sdpython/machinelearningext/blob/master/machinelearningext/DataManipulation/DataFrame.cs',
+    'C# IDataView': 'https://github.com/dotnet/machinelearning/blob/master/src/Microsoft.ML.Core/Data/IDataView.cs',
+    'C# LogWriter': 'https://github.com/xadupre/machinelearningext/blob/master/machinelearningext/PipelineHelper/DelegateEnvironment.cs',
     'C# Pipeline': 'https://github.com/sdpython/machinelearningext/blob/master/machinelearningext/ScikitAPI/ScikitPipeline.cs',
+    'C# ScikitPipeline': 'https://github.com/xadupre/machinelearningext/blob/master/machinelearningext/ScikitAPI/ScikitPipeline.cs',
     'csv': 'https://en.wikipedia.org/wiki/Comma-separated_values',
     'DataFrame': 'https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html',
     'DataKind': 'https://github.com/dotnet/machinelearning/blob/master/src/Microsoft.ML.Core/Data/DataKind.cs#L13',
+    'DBSCAN': 'https://en.wikipedia.org/wiki/DBSCAN',
     'ML.net': 'https://github.com/dotnet/machinelearning',
+    'OPTICS': 'https://en.wikipedia.org/wiki/OPTICS_algorithm',
     'Windows': 'https://www.microsoft.com/',
 })
+
+
+from recommonmark.parser import CommonMarkParser
+source_parsers = {'.md': CommonMarkParser}
+source_suffix = ['.rst', '.md']
diff --git a/_doc/sphinxdoc/source/index.rst b/_doc/sphinxdoc/source/index.rst
@@ -57,9 +57,16 @@ Documentation
     :maxdepth: 1
 
     api/index
+    mlnetdocs/index
+    components/index
     i_ex
     i_faq
     i_nb
+    all_notebooks
+    blog/blogindex
+    indexmenu
+    HISTORY
+    license
 
 It can easily compile and wrap a :epkg:`C#` function
 into :epkg:`Python`:
@@ -81,15 +88,6 @@ The list of available trainers can be obtained with:
 This function also exists as a magic command
 :ref:`%%maml <cmagic-maml>`.
 
-Galleries
----------
-
-.. toctree::
-    :maxdepth: 2
-
-    all_notebooks
-    blog/blogindex
-
 Installation
 ------------
 
@@ -103,21 +101,6 @@ Follow the instructions described in
 Follow the instructions described in
 `config.yml <https://github.com/sdpython/csharpyml/blob/master/.circleci/config.yml>`_.
 
-Navigation
-----------
-
-.. toctree::
-    :maxdepth: 1
-
-    indexmenu
-    HISTORY
-    license
-
-.. toctree::
-    :hidden:
-
-    blog/index_blog
-
 +----------------------+---------------------+---------------------+--------------------+------------------------+------------------------------------------------+
 | :ref:`l-modules`     |  :ref:`l-functions` | :ref:`l-classes`    | :ref:`l-methods`   | :ref:`l-staticmethods` | :ref:`l-properties`                            |
 +----------------------+---------------------+---------------------+--------------------+------------------------+------------------------------------------------+

diff --git a/_doc/sphinxdoc/source/indexmenu.rst b/_doc/sphinxdoc/source/indexmenu.rst
@@ -23,3 +23,8 @@ Through documentation
         filechanges
         README
         all_indexes
+
+.. toctree::
+    :hidden:
+
+    blog/index_blog
diff --git a/_doc/sphinxdoc/source/nbcov-2018-10-13.png b/_doc/sphinxdoc/source/nbcov-2018-10-13.png
diff --git a/_doc/sphinxdoc/source/nbcov.png b/_doc/sphinxdoc/source/nbcov.png
diff --git a/_unittests/ut_binaries/test_cspipeline.py b/_unittests/ut_binaries/test_cspipeline.py
@@ -7,6 +7,8 @@
 import sys
 import os
 import unittest
+from io import StringIO
+from contextlib import redirect_stdout, redirect_stderr
 import numpy
 from sklearn import datasets
 from sklearn.model_selection import train_test_split
@@ -38,34 +40,45 @@ def test_src(self):
         self.assertFalse(datasets is None)
 
     def test_predictor(self):
-        X, y = datasets.load_iris(return_X_y=True)
-        X_train, X_test, y_train, y_test = train_test_split(
-            X.astype(numpy.float32), y.astype(numpy.float32))
-        df_train = pandas.DataFrame(data=X_train, columns=[
-                                    "FA", "FB", "FC", "FD"])
-        df_train["Label"] = y_train
-        pipe = CSPipeline(["concat{col=Feat:FA,FB,FC,FD}"],
-                          "ova{p=lr}", stdout="C#", verbose=0)
-        pipe.fit(df_train, feature="Feat", label="Label")
-        df_test = pandas.DataFrame(data=X_test, columns=[
-                                   "FA", "FB", "FC", "FD"])
-        self.assertIsInstance(df_test, pandas.DataFrame)
-        df_test["Label"] = y_test
-        pred = pipe.predict(df_test)
-        head = pred.head()
-        exp = ['FA', 'FB', 'FC', 'FD', 'Label', 'Feat.0', 'Feat.1', 'Feat.2',
-               'Feat.3', 'PredictedLabel', 'Score.0', 'Score.1', 'Score.2']
-        self.assertEqual(list(head.columns), exp)
-        self.assertEqual(pred.shape, (38, 13))
-        acc = (pred.Label + 1 - pred.PredictedLabel).abs().sum()
-        self.assertLesser(acc, 10)
-        # Save
-        temp = get_temp_folder(__file__, "temp_predictor")
-        outfile = os.path.join(temp, "iris.zip")
-        pipe.save(outfile)
-        pipe2 = CSPipeline.load(outfile)
-        pred2 = pipe2.predict(df_test)
-        self.assertEqual(pred, pred2)
+        fout = StringIO()
+        ferr = StringIO()
+        with redirect_stdout(fout):
+            with redirect_stderr(ferr):
+                X, y = datasets.load_iris(return_X_y=True)
+                X_train, X_test, y_train, y_test = train_test_split(
+                    X.astype(numpy.float32), y.astype(numpy.float32))
+                df_train = pandas.DataFrame(data=X_train, columns=[
+                                            "FA", "FB", "FC", "FD"])
+                df_train["Label"] = y_train
+                pipe = CSPipeline(["concat{col=Feat:FA,FB,FC,FD}"],
+                                  "oova{p=ap}", verbose=2)
+                pipe.fit(df_train, feature="Feat", label="Label")
+                stdout = pipe.StdOut
+                self.assertIn('Training learner 1', stdout)
+                self.assertEqual('', pipe.StdErr)
+                df_test = pandas.DataFrame(data=X_test, columns=[
+                                           "FA", "FB", "FC", "FD"])
+                self.assertIsInstance(df_test, pandas.DataFrame)
+                df_test["Label"] = y_test
+                pred = pipe.predict(df_test)
+                head = pred.head()
+                exp = ['FA', 'FB', 'FC', 'FD', 'Label', 'Feat.0', 'Feat.1', 'Feat.2',
+                       'Feat.3', 'PredictedLabel', 'Score.0', 'Score.1', 'Score.2']
+                self.assertEqual(list(head.columns), exp)
+                self.assertEqual(pred.shape, (38, 13))
+                acc = (pred.Label + 1 - pred.PredictedLabel).abs().sum()
+                self.assertLesser(acc, 10)
+                # Save
+                temp = get_temp_folder(__file__, "temp_predictor")
+                outfile = os.path.join(temp, "iris.zip")
+                pipe.save(outfile)
+                pipe2 = CSPipeline.load(outfile)
+                pred2 = pipe2.predict(df_test)
+                self.assertEqual(pred, pred2)
+        out = fout.getvalue()
+        err = ferr.getvalue()
+        self.assertEqual(out, '')
+        self.assertEqual(err, '')
 
     def test_transform(self):
         X, y = datasets.load_iris(return_X_y=True)
@@ -75,7 +88,7 @@ def test_transform(self):
                                     "FA", "FB", "FC", "FD"])
         df_train["Label"] = y_train
         pipe = CSPipeline(
-            ["concat{col=Feat:FA,FB,FC,FD}", "poly{col=Feat}"], stdout="python", verbose=0)
+            ["concat{col=Feat:FA,FB,FC,FD}", "poly{col=Feat}"], verbose=0)
         pipe.fit(df_train)
         df_test = pandas.DataFrame(data=X_test, columns=[
                                    "FA", "FB", "FC", "FD"])
@@ -100,7 +113,7 @@ def test_transform_array(self):
         X_train, X_test, __, _ = train_test_split(
             X.astype(numpy.float32), y.astype(numpy.float32))
         pipe = CSPipeline(
-            ["concat{col=Feat:X0,X1,X2,X3}", "poly{col=Feat}"], stdout="python", verbose=0)
+            ["concat{col=Feat:X0,X1,X2,X3}", "poly{col=Feat}"], verbose=0)
         pipe.fit(X_train)
         pred = pipe.transform(X_test)
         head = pred.head()

diff --git a/_unittests/ut_binaries/test_maml.py b/_unittests/ut_binaries/test_maml.py
@@ -60,7 +60,7 @@ def test_maml(self):
 
         out, _ = maml(script)
         self.assertExists(model)
-        self.assertIn("LBFGS Optimizer", out)
+        self.assertIn("'Normalize' finished", out)
 
 
 if __name__ == "__main__":

diff --git a/_unittests/ut_binaries/test_maml_nb.py b/_unittests/ut_binaries/test_maml_nb.py
@@ -55,7 +55,7 @@ def test_maml_nb(self):
         data=__DATA__
         loader=text{col=Label:U4[0-2]:0 col=Slength:R4:1 col=Swidth:R4:2 col=Plength:R4:3 col=Pwidth:R4:4 sep=, header=+}
         xf=Concat{col=Features:Slength,Swidth}
-        tr=ova{p=lr}
+        tr=oova{p=lr}
         out=__MODEL__
         """.strip("\n ").replace('__MODEL__', model).replace('__DATA__', dest)
 

diff --git a/build.cmd b/build.cmd
@@ -9,38 +9,43 @@ if not exist %ppythonpath% set ppythonpath="c:\Python36_x64"
 set PATH=%PATH%;%ppythonpath%
 set PYTHONPATH=%~dp0..\pyquickhelper\src
 
+set DOTNET_CLI_TELEMETRY_OPTOUT=1
+set DOTNET_SKIP_FIRST_TIME_EXPERIENCE=1
+set DOTNET_MULTILEVEL_LOOKUP=0
+
+set LOCALMLEXT=%~dp0..\machinelearningext
+if exist %LOCALMLEXT% goto copybinaries:
+
 cd cscode\machinelearning
-if "%1" == "ml" goto buildrelease:
-if exist bin\x64.Release goto mldeb:
-@echo [build.cmd] build machinelearning release
-:buildrelease:
-cmd /C build.cmd -release
+if "%1" == "ml" goto buildml:
+if exist bin goto mlend:
+@echo [build.cmd] build machinelearning debug and release
+:buildml:
+cmd /C build.cmd
 if %errorlevel% neq 0 exit /b %errorlevel%
-:mldeb:
-if "%1" == "ml" goto builddebug:
-if exist bin\x64.Debug goto mlrel:
-:builddebug:
-@echo [build.cmd] build machinelearning debug
-cmd /C build.cmd -debug
+cmd /C build.cmd -release
 if %errorlevel% neq 0 exit /b %errorlevel%
-:mlrel:
+:mlend:
 cd ..\..
 
+:copybinaries:
 if "%1" == "ml" goto copydebug:
-if exist cscode\machinelearning\bin\x64.Debug goto copymlrel:
+if exist cscode\machinelearning\bin\AnyCPU.Debug\Microsoft.ML.Api goto copymlrel:
 :copydebug:
 @echo [build.cmd] copy debug binaries for machinelearning
 python -u setup.py copybinml debug
 if %errorlevel% neq 0 exit /b %errorlevel%
 :copymlrel:
 if "%1" == "ml" goto copyrelease:
-if exist cscode\machinelearning\bin\x64.Release goto copybin:
+if exist cscode\machinelearning\bin\AnyCPU.Release\Microsoft.ML.Api goto copybin:
 :copyrelease:
 @echo [build.cmd] copy release binaries for machinelearning
 python -u setup.py copybinml release
 if %errorlevel% neq 0 exit /b %errorlevel%
 
 :copybin:
+if exist %LOCALMLEXT% goto copybinariesext:
+
 @echo [build.cmd] build machinelearningext
 cd cscode\machinelearningext\machinelearningext
 dotnet build -c Release machinelearningext.sln
@@ -49,6 +54,8 @@ dotnet build -c debug machinelearningext.sln
 if %errorlevel% neq 0 exit /b %errorlevel%
 cd ..\..\..
 
+:copybinariesext:
+
 @echo [build.cmd] copy binaries for machinelearningext
 python -u setup.py copybinmlext debug
 if %errorlevel% neq 0 exit /b %errorlevel%