From 740f1c8cb8860fa545d341aea9ddbe4c58070e02 Mon Sep 17 00:00:00 2001 From: Xavier Dupre Date: Sat, 22 Jul 2023 19:41:40 +0200 Subject: [PATCH 01/16] refactoring --- .circleci/config.yml | 4 +- .github/workflows/black.yml | 11 + .github/workflows/check-urls.yml | 47 +++ .github/workflows/codeql.yml | 61 ++++ .github/workflows/documentation.yml | 88 +++++ .github/workflows/rstcheck.yml | 27 ++ .github/workflows/wheels-any.yml | 29 ++ .gitignore | 300 ++---------------- .landscape.yml | 15 - .local.jenkins.lin.yml | 1 + .local.jenkins.win.yml | 26 -- .travis.yml | 15 - CHANGELOGS.rst | 35 ++ HISTORY.rst | 37 --- MANIFEST.in | 2 - README.rst | 7 +- _unittests/ut_df/test_connex_split.py | 26 +- _unittests/ut_df/test_connex_split_big.py | 22 +- .../ut_documentation/test_run_notebooks.py | 8 +- _unittests/ut_module/test_check.py | 26 -- _unittests/ut_module/test_code_style.py | 37 --- .../ut_module/test_convert_notebooks.py | 38 --- _unittests/ut_module/test_readme.py | 35 -- appveyor.yml | 2 +- azure-pipelines.yml | 180 +++++++++-- build_script.bat | 13 - pandas_streaming/__init__.py | 50 +-- pandas_streaming/df/connex_split.py | 7 +- pandas_streaming/df/dataframe_io_helpers.py | 12 +- pyproject.toml | 31 ++ requirements-dev.txt | 20 ++ requirements.txt | 21 +- setup.cfg | 5 + setup.py | 114 ++++--- 34 files changed, 610 insertions(+), 742 deletions(-) create mode 100644 .github/workflows/black.yml create mode 100644 .github/workflows/check-urls.yml create mode 100644 .github/workflows/codeql.yml create mode 100644 .github/workflows/documentation.yml create mode 100644 .github/workflows/rstcheck.yml create mode 100644 .github/workflows/wheels-any.yml delete mode 100644 .landscape.yml delete mode 100644 .local.jenkins.win.yml delete mode 100644 .travis.yml create mode 100644 CHANGELOGS.rst delete mode 100644 HISTORY.rst delete mode 100644 _unittests/ut_module/test_check.py delete mode 100644 _unittests/ut_module/test_code_style.py delete mode 100644 _unittests/ut_module/test_convert_notebooks.py delete mode 100644 _unittests/ut_module/test_readme.py delete mode 100644 build_script.bat create mode 100644 pyproject.toml create mode 100644 requirements-dev.txt create mode 100644 setup.cfg diff --git a/.circleci/config.yml b/.circleci/config.yml index e764c48..712677b 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -11,7 +11,7 @@ jobs: - restore_cache: keys: - - v3-dependencies-{{ checksum "requirements.txt" }} + - v3-dependencies-{{ checksum "requirements-dev.txt" }} - v3-dependencies- - run: @@ -44,7 +44,7 @@ jobs: - save_cache: paths: - ./venv - key: v3-dependencies-{{ checksum "requirements.txt" }} + key: v3-dependencies-{{ checksum "requirements-dev.txt" }} - run: name: compile and build diff --git a/.github/workflows/black.yml b/.github/workflows/black.yml new file mode 100644 index 0000000..fe99e3c --- /dev/null +++ b/.github/workflows/black.yml @@ -0,0 +1,11 @@ +name: Black Format Checker +on: [push, pull_request] +jobs: + black-format-check: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - uses: psf/black@stable + with: + options: "--diff --check" + src: "." 
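Note on the new Black workflow above: it only verifies formatting (the job fails when files would be reformatted, but it rewrites nothing). A minimal sketch of the equivalent local check, assuming black is installed in the active environment:

    pip install black
    black --diff --check .

These are the same options ("--diff --check" on src ".") passed to psf/black@stable, so a clean local run should correspond to a green check in CI.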
diff --git a/.github/workflows/check-urls.yml b/.github/workflows/check-urls.yml new file mode 100644 index 0000000..f235903 --- /dev/null +++ b/.github/workflows/check-urls.yml @@ -0,0 +1,47 @@ +name: Check URLs + +on: + pull_request: + branches: [main] + schedule: + # ┌───────────── minute (0 - 59) + # │ ┌───────────── hour (0 - 23) + # │ │ ┌───────────── day of the month (1 - 31) + # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) + # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) + # │ │ │ │ │ + # │ │ │ │ │ + # │ │ │ │ │ + # * * * * * + - cron: '30 1 * * 0' + +jobs: + build: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + + - name: urls-checker-code + uses: urlstechie/urlchecker-action@master + with: + subfolder: pandas_streaming + file_types: .md,.py,.rst,.ipynb + print_all: false + timeout: 2 + retry_count# : 2 + # exclude_urls: https://dumps.wikimedia.org/other/pageviews/%Y/%Y-%m/pageviews-%Y%m%d-%H0000.gz,https://dumps.wikimedia.org/frwiki/latest/latest-all-titles-in-ns0.gz + # exclude_patterns: https://dumps.wikimedia.org/ + # force_pass : true + + - name: urls-checker-docs + uses: urlstechie/urlchecker-action@master + with: + subfolder: _doc + file_types: .md,.py,.rst,.ipynb + print_all: false + timeout: 2 + retry_count# : 2 + # exclude_urls: https://hal.archives-ouvertes.fr/hal-00990252/document + # exclude_patterns: https://www.data.gouv.fr/fr/datasets/r/e3d83ab3-dc52-4c99-abaf-8a38050cc68c,https://dev.azure.com/ + # force_pass : true diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml new file mode 100644 index 0000000..bea1259 --- /dev/null +++ b/.github/workflows/codeql.yml @@ -0,0 +1,61 @@ +name: "Code Scanning - Action" + +on: + push: + branches: [main] + pull_request: + branches: [main] + schedule: + # ┌───────────── minute (0 - 59) + # │ ┌───────────── hour (0 - 23) + # │ │ ┌───────────── day of the month (1 - 31) + # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) + # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) + # │ │ │ │ │ + # │ │ │ │ │ + # │ │ │ │ │ + # * * * * * + - cron: '30 1 * * 0' + +jobs: + CodeQL-Build: + # CodeQL runs on ubuntu-latest, windows-latest, and macos-latest + runs-on: ubuntu-latest + + permissions: + # required for all workflows + security-events: write + + # only required for workflows in private repositories + actions: read + contents: read + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + + # Initializes the CodeQL tools for scanning. + - name: Initialize CodeQL + uses: github/codeql-action/init@v2 + # Override language selection by uncommenting this and choosing your languages + # with: + # languages: go, javascript, csharp, python, cpp, java, ruby + + # Autobuild attempts to build any compiled languages (C/C++, C#, Go, or Java). + # If this step fails, then you should remove it and run the build manually (see below). + - name: Autobuild + uses: github/codeql-action/autobuild@v2 + + # ℹ️ Command-line programs to run using the OS shell. 
+ # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun + + # ✏️ If the Autobuild fails above, remove it and uncomment the following + # three lines and modify them (or add more) to build your code if your + # project uses a compiled language + + #- run: | + # make bootstrap + # make release + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v2 diff --git a/.github/workflows/documentation.yml b/.github/workflows/documentation.yml new file mode 100644 index 0000000..a7a5be1 --- /dev/null +++ b/.github/workflows/documentation.yml @@ -0,0 +1,88 @@ +name: Documentation and Code Coverage + +on: + push: + pull_request: + types: + - closed + branches: + - main + +jobs: + run: + name: Build documentation on ${{ matrix.os }} + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [ubuntu-latest] + + steps: + - uses: actions/checkout@v3 + + - uses: actions/setup-python@v4 + with: + python-version: '3.11' + + - uses: tlylt/install-graphviz@v1 + + - name: Install pandoc + run: sudo apt-get install -y pandoc + + - name: Install requirements + run: python -m pip install -r requirements.txt + + - name: Install requirements dev + run: python -m pip install -r requirements-dev.txt + + - name: Cache pip + uses: actions/cache@v2 + with: + path: ~/.cache/pip + key: ${{ runner.os }}-pip-${{ hashFiles('requirements-dev.txt') }} + restore-keys: | + ${{ runner.os }}-pip- + ${{ runner.os }}- + + - name: Generate coverage report + run: | + pip install pytest + pip install pytest-cov + export PYTHONPATH=. + pytest --cov=./pandas_streaming/ --cov-report=xml --durations=10 --ignore-glob=**LONG*.py --ignore-glob=**notebook*.py + export PYTHONPATH= + + - name: Upload coverage reports to Codecov + uses: codecov/codecov-action@v3 + env: + CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} + + - name: Install + run: python setup.py install + + - name: Copy license, changelogs + run: | + cp LICENSE* ./_doc + cp CHANGELOGS* ./_doc + + - name: Documentation + run: python -m sphinx ./_doc ./dist/html -n -w doc.txt + + - name: Summary + run: cat doc.txt + + - name: Check for errors and warnings + run: | + if [[ $(grep ERROR doc.txt) ]]; then + echo "Documentation produces errors." + grep ERROR doc.txt + exit 1 + fi + if [[ $(grep WARNING doc.txt) ]]; then + echo "Documentation produces warnings." 
+ grep WARNING doc.txt + exit 1 + fi + + - uses: actions/upload-artifact@v3 + with: + path: ./dist/html/** diff --git a/.github/workflows/rstcheck.yml b/.github/workflows/rstcheck.yml new file mode 100644 index 0000000..44e2a48 --- /dev/null +++ b/.github/workflows/rstcheck.yml @@ -0,0 +1,27 @@ +name: RST Check + +on: [push, pull_request] + +jobs: + build_wheels: + name: rstcheck ${{ matrix.os }} + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [ubuntu-latest] + + steps: + - uses: actions/checkout@v3 + + - uses: actions/setup-python@v4 + with: + python-version: '3.11' + + - name: Install requirements + run: python -m pip install -r requirements.txt + + - name: Install rstcheck + run: python -m pip install sphinx tomli rstcheck[toml,sphinx] + + - name: rstcheck + run: rstcheck -r _doc pandas_streaming diff --git a/.github/workflows/wheels-any.yml b/.github/workflows/wheels-any.yml new file mode 100644 index 0000000..2547b0b --- /dev/null +++ b/.github/workflows/wheels-any.yml @@ -0,0 +1,29 @@ +name: Build Any Wheel + +on: + push: + branches: + - main + - 'releases/**' + +jobs: + build_wheels: + name: Build wheels on ${{ matrix.os }} + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [ubuntu-latest] + + steps: + - uses: actions/checkout@v3 + + - uses: actions/setup-python@v4 + with: + python-version: '3.11' + + - name: build wheel + run: python -m pip wheel . + + - uses: actions/upload-artifact@v3 + with: + path: ./pandas_streaming*.whl diff --git a/.gitignore b/.gitignore index 6bd1306..fbe3a19 100644 --- a/.gitignore +++ b/.gitignore @@ -1,279 +1,25 @@ -################# -## Eclipse -################# - -*.pydevproject -.project -.metadata -bin/ -tmp/ -_virtualenv/ -*.tmp -*.bak -*.swp -*~.nib -local.properties -.classpath -.settings/ -.loadpath -*.pyproj - -# External tool builders -.externalToolBuilders/ - -# Locally stored "Eclipse launch configurations" -*.launch - -# CDT-specific -.cproject - -# PDT-specific -.buildpath - - -################# -## Visual Studio -################# - -## Ignore Visual Studio temporary files, build results, and -## files generated by popular Visual Studio add-ons. 
- -# User-specific files -*.suo -*.user -*.sln.docstates - -# Build results - -[Dd]ebug/ -[Rr]elease/ -x64/ -build/ -[Bb]in/ -[Oo]bj/ - -# MSTest test Results -[Tt]est[Rr]esult*/ -[Bb]uild[Ll]og.* - -*_i.c -*_p.c -*.ilk -*.meta -*.obj -*.pch -*.pdb -*.pgc -*.pgd -*.rsp -*.sbr -*.tlb -*.tli -*.tlh -*.tmp -*.tmp_proj -*.log -*.vspscc -*.vssscc -.builds -*.pidb -*.log -*.scc +*.pyc *.pyd - -# Visual C++ cache files -ipch/ -*.aps -*.ncb -*.opensdf -*.sdf -*.cachefile - -# Visual Studio profiler -*.psess -*.vsp -*.vspx - -# Guidance Automation Toolkit -*.gpState - -# ReSharper is a .NET coding add-in -_ReSharper*/ -*.[Rr]e[Ss]harper - -# TeamCity is a build add-in -_TeamCity* - -# DotCover is a Code Coverage Tool -*.dotCover - -# NCrunch -*.ncrunch* -.*crunch*.local.xml - -# Installshield output folder -[Ee]xpress/ - -# DocProject is a documentation generator add-in -DocProject/buildhelp/ -DocProject/Help/*.HxT -DocProject/Help/*.HxC -DocProject/Help/*.hhc -DocProject/Help/*.hhk -DocProject/Help/*.hhp -DocProject/Help/Html2 -DocProject/Help/html - -# Click-Once directory -publish/ - -# Publish Web Output -*.Publish.xml -*.pubxml - -# NuGet Packages Directory -## TODO: If you have NuGet Package Restore enabled, uncomment the next line -#packages/ - -# Windows Azure Build Output -csx -*.build.csdef - -# Windows Store app package directory -AppPackages/ - -# Others -sql/ -*.Cache -ClientBin/ -[Ss]tyle[Cc]op.* -~$* -*~ -*.dbmdl -*.[Pp]ublish.xml -*.pfx -*.publishsettings - -# RIA/Silverlight projects -Generated_Code/ - -# Backup & report files from converting an old project file to a newer -# Visual Studio version. Backup files are not needed, because we have git ;-) -_UpgradeReport_Files/ -Backup*/ -UpgradeLog*.XML -UpgradeLog*.htm - -# SQL Server files -App_Data/*.mdf -App_Data/*.ldf - -############# -## Windows detritus -############# - -# Windows image file caches -Thumbs.db -ehthumbs.db - -# Folder config file -Desktop.ini - -# Recycle Bin used on file shares -$RECYCLE.BIN/ - -# Mac crap -.DS_Store - - -############# -## Python -############# - -*.py[co] - -# Packages -*.egg -*.egg-info -dist/ -build/ -eggs/ -parts/ -var/ -sdist/ -develop-eggs/ -__pycache__/ -.installed.cfg - -# Installer logs -pip-log.txt - -# Unit test / coverage reports +*.dylib +*.so +*.whl +coverage.html/* +_cache/* .coverage -.tox - -#Translations -*.mo - -#Mr Developer -.mr.developer.cfg - -# py* packages -temp_* -out_* -*/sphinxdoc/source/index_* -*/sphinxdoc/source/readme.* -*/sphinxdoc/source/LICENSE.txt -*/sphinxdoc/source/filechanges.* -version.txt -_doc/sphinxdoc/source/python_template/*box.html -_doc/sphinxdoc/source/python_template/*toc.html -_doc/sphinxdoc/source/jyquickhelper/ -_doc/sphinxdoc/source/coverage/* -*/sphinxdoc/source/all*.rst -_doc/sphinxdoc/source/notebooks/* -*/sphinxdoc/source/gynotebooks/* -_doc/sphinxdoc/source/gyexamples/* -_doc/sphinxdoc/source/examples/* -_doc/sphinxdoc/source/gallery/* -_doc/sphinxdoc/source/gallerynb/* -build_help.bat -_doc/sphinxdoc/source/blog/*.rst -_doc/sphinxdoc/source/blog/rss.xml -_doc/sphinxdoc/source/_templates/*toc.html -_doc/sphinxdoc/source/_templates/*box.html -_doc/sphinxdoc/source/blog/feed-icon*.png -_doc/sphinxdoc/source/_static/reveal.js/* -_doc/notebooks/.ipynb_checkpoints/* -dist_module27/* -auto_*.bat -auto_*.sh -auto_*.py -auto_*.xml -auto_*.db3 -_doc/sphinxdoc/source/_static/require.js -_doc/sphinxdoc/require.js -ex.* -m.temp -_doc/notebooks/*/.ipynb_checkpoints -_doc/notebooks/nlp/frwiki-latest-all-titles-in-ns0 -_doc/notebooks/nlp/sample*.txt 
-_doc/notebooks/nlp/completion.prof -_doc/notebooks/nlp/profile.png -_doc/notebooks/nlp/completion.dot -_doc/notebooks/nlp/completion.png -_doc/notebooks/nlp/completion.pstat -_unittests/run_unittests.py.out -*.err -_doc/sphinxdoc/source/_static/style_notebook_snippet.css -dist -_doc/sphinxdoc/source/pandas_streaming -_doc/sphinxdoc/source/nbcov.png -_doc/notebooks/example.test.txt -_doc/notebooks/example.txt -_doc/notebooks/example.train.txt -_unittests/ut_df/buggy_hash.csv -_doc/sphinxdoc/dfs.zip -_doc/sphinxdoc/dfsa.zip -_doc/sphinxdoc/source/nbcov-*.png -.eggs +dist/* +build/* +.eggs/* +.hypothesis/* +*egg-info/* +prof +_doc/auto_examples/* +_doc/examples/_cache/* +_doc/examples/plot_*.png +_doc/examples/plot_*.xlsx +_doc/examples/*.html +_doc/_static/require.js +_doc/_static/viz.js +_unittests/ut__main/*.png +_unittests/ut__main/_cache/* +_unittests/ut__main/*.html +_unittests/.hypothesis/* diff --git a/.landscape.yml b/.landscape.yml deleted file mode 100644 index 3b83d70..0000000 --- a/.landscape.yml +++ /dev/null @@ -1,15 +0,0 @@ -doc-warnings: yes -test-warnings: no -strictness: veryhigh -max-line-length: 120 -autodetect: yes -requirements: - - requirement.txt -ignore-paths: - - _unittests - - _doc - - dist - - build -ignore-patterns: - - .*Parser\.py$ - - .*Lexer\.py$ diff --git a/.local.jenkins.lin.yml b/.local.jenkins.lin.yml index c3e0dee..9ab574d 100644 --- a/.local.jenkins.lin.yml +++ b/.local.jenkins.lin.yml @@ -11,6 +11,7 @@ install: - $PYINT -m pip install --upgrade pip - $PYINT -m pip install --upgrade --no-cache-dir --no-deps --index http://localhost:8067/simple/ jyquickhelper pyquickhelper pandas_streaming --extra-index-url=https://pypi.python.org/simple/ - $PYINT -m pip install -r requirements.txt + - $PYINT -m pip install -r requirements-dev.txt - $PYINT --version - $PYINT -m pip freeze diff --git a/.local.jenkins.win.yml b/.local.jenkins.win.yml deleted file mode 100644 index 7f8c60a..0000000 --- a/.local.jenkins.win.yml +++ /dev/null @@ -1,26 +0,0 @@ - -language: python - -python: - - { PATH: "{{replace(Python39, '\\', '\\\\')}}", VERSION: 3.9, DIST: std } - -virtualenv: - - path: {{ospathjoin(root_path, pickname("%NAME_JENKINS%", project_name + "_%VERSION%_%DIST%_%NAME%"), "_venv")}} - -install: - - pip install --upgrade pip - - pip install --no-cache-dir --no-deps --index http://localhost:8067/simple/ jyquickhelper --extra-index-url=https://pypi.python.org/simple/ - - pip install --no-cache-dir --no-deps --index http://localhost:8067/simple/ pyquickhelper --extra-index-url=https://pypi.python.org/simple/ - - pip install -r requirements.txt - - pip freeze - - pip freeze > pip_freeze.txt -before_script: - - python -u setup.py build_ext --inplace -script: - - { CMD: "python -u setup.py unittests", NAME: "UT" } -after_script: - - python setup.py bdist_wheel - - if [ ${DIST} != "conda" and ${NAME} == "UT" ] then copy dist\*.whl {{root_path}}\..\..\local_pypi\local_pypi_server fi -documentation: - - if [ ${NAME} == "UT" ] then python -u setup.py build_sphinx fi - - if [ ${NAME} == "UT" ] then xcopy /E /C /I /Y _doc\sphinxdoc\build\html dist\html fi diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index df69072..0000000 --- a/.travis.yml +++ /dev/null @@ -1,15 +0,0 @@ -dist: focal -sudo: true -language: python -matrix: - include: - - python: 3.10 - name: "Py310-skl022+" - env: sklearnc=">=0.22" -install: - - pip install -r requirements.txt - - python -c "import sklearn;print(sklearn.__version__)" -before_script: - - python setup.py build_ext --inplace 
-script: - - python setup.py unittests diff --git a/CHANGELOGS.rst b/CHANGELOGS.rst new file mode 100644 index 0000000..985bb0a --- /dev/null +++ b/CHANGELOGS.rst @@ -0,0 +1,35 @@ + +=========== +Change Logs +=========== + +current - 2021-10-26 - 0.00Mb +============================= + +* :pr:`27`: Fixes json parser when input is a stream (2021-10-26) +* :pr:`26`: Fixes bug while reading json (iterator failed to be created twice) (2021-10-26) +* :pr:`25`: Fixes documentation (2021-10-18) +* :pr:`24`: Implements a first version of sort_values. (2021-10-18) +* :pr:`23`: First version of operator __setitem__ (2021-10-16) +* :pr:`22`: Fixes nan values after pandas update, add documentation example to the unit tests (2021-07-11) +* :pr:`21`: Fixes grouping by nan values after update pandas to 1.3.0 (2021-07-10) +* :pr:`17`: Implements method describe (2021-04-08) + +0.2.175 - 2020-08-06 - 0.03Mb +============================= + +* :pr:`16`: Unit tests failing with pandas 1.1.0. (2020-08-06) +* :pr:`15`: implements parameter lines, flatten for read_json (2018-11-21) +* :pr:`14`: implements fillna (2018-10-29) +* :pr:`13`: implement concat for axis=0,1 (2018-10-26) +* :pr:`12`: add groupby_streaming (2018-10-26) +* :pr:`11`: add method add_column (2018-10-26) +* :pr:`10`: plan B to bypass a bug in pandas about read_csv when iterator=True --> closed, pandas has a weird behaviour when names is too small compare to the number of columns (2018-10-26) +* :pr:`9`: head is very slow (2018-10-26) +* :pr:`8`: fix pandas_streaming for pandas 0.23.1 (2018-07-31) +* :pr:`7`: implement read_json (2018-05-17) +* :pr:`6`: add pandas_groupby_nan from pyensae (2018-05-17) +* :pr:`5`: add random_state parameter to splitting functions (2018-02-04) +* :pr:`2`: add method sample, resevoir sampling (2017-11-05) +* :pr:`3`: method train_test_split for out-of-memory datasets (2017-10-21) +* :pr:`1`: Excited for your project (2017-10-10) diff --git a/HISTORY.rst b/HISTORY.rst deleted file mode 100644 index 7a41d02..0000000 --- a/HISTORY.rst +++ /dev/null @@ -1,37 +0,0 @@ - -.. _l-HISTORY: - -======= -History -======= - -current - 2021-10-26 - 0.00Mb -============================= - -* #27: Fixes json parser when input is a stream (2021-10-26) -* #26: Fixes bug while reading json (iterator failed to be created twice) (2021-10-26) -* #25: Fixes documentation (2021-10-18) -* #24: Implements a first version of sort_values. (2021-10-18) -* #23: First version of operator __setitem__ (2021-10-16) -* #22: Fixes nan values after pandas update, add documentation example to the unit tests (2021-07-11) -* #21: Fixes grouping by nan values after update pandas to 1.3.0 (2021-07-10) -* #17: Implements method describe (2021-04-08) - -0.2.175 - 2020-08-06 - 0.03Mb -============================= - -* #16: Unit tests failing with pandas 1.1.0. 
(2020-08-06) -* #15: implements parameter lines, flatten for read_json (2018-11-21) -* #14: implements fillna (2018-10-29) -* #13: implement concat for axis=0,1 (2018-10-26) -* #12: add groupby_streaming (2018-10-26) -* #11: add method add_column (2018-10-26) -* #10: plan B to bypass a bug in pandas about read_csv when iterator=True --> closed, pandas has a weird behaviour when names is too small compare to the number of columns (2018-10-26) -* #9: head is very slow (2018-10-26) -* #8: fix pandas_streaming for pandas 0.23.1 (2018-07-31) -* #7: implement read_json (2018-05-17) -* #6: add pandas_groupby_nan from pyensae (2018-05-17) -* #5: add random_state parameter to splitting functions (2018-02-04) -* #2: add method sample, resevoir sampling (2017-11-05) -* #3: method train_test_split for out-of-memory datasets (2017-10-21) -* #1: Excited for your project (2017-10-10) diff --git a/MANIFEST.in b/MANIFEST.in index a782640..66ddca8 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,7 +1,5 @@ prune _doc prune _unittests -prune bin -prune .circleci exclude *.bat exclude *.yml exclude *.git* diff --git a/README.rst b/README.rst index 12bcec5..1096a34 100644 --- a/README.rst +++ b/README.rst @@ -1,12 +1,9 @@ +pandas-streaming: streaming API over pandas +=========================================== .. image:: https://github.com/sdpython/pandas_streaming/blob/master/_doc/sphinxdoc/source/_static/project_ico.png?raw=true :target: https://github.com/sdpython/pandas_streaming/ -.. _l-README: - -pandas_streaming: streaming API over pandas -=========================================== - .. image:: https://travis-ci.com/sdpython/pandas_streaming.svg?branch=master :target: https://app.travis-ci.com/github/sdpython/pandas_streaming :alt: Build status diff --git a/_unittests/ut_df/test_connex_split.py b/_unittests/ut_df/test_connex_split.py index f0ab09c..33bd03f 100644 --- a/_unittests/ut_df/test_connex_split.py +++ b/_unittests/ut_df/test_connex_split.py @@ -4,7 +4,6 @@ """ import unittest import pandas -from pyquickhelper.loghelper import fLOG from pyquickhelper.pycode import ExtTestCase from pandas_streaming.df import dataframe_shuffle, train_test_split_weights, train_test_connex_split @@ -86,11 +85,6 @@ def test_split_weights(self): self.assertGreater(0.4, delta) def test_split_connex(self): - fLOG( - __file__, - self._testMethodName, - OutputPrint=__name__ == "__main__") - df = pandas.DataFrame([dict(user="UA", prod="PA", card="C1"), dict(user="UA", prod="PB", card="C1"), dict(user="UB", prod="PC", card="C2"), @@ -102,7 +96,7 @@ def test_split_connex(self): train, test = train_test_connex_split( # pylint: disable=W0632 df, test_size=0.5, groups=['user', 'prod', 'card'], - fail_imbalanced=0.4, fLOG=fLOG) + fail_imbalanced=0.4) self.assertEqual(train.shape[0] + test.shape[0], df.shape[0]) for col in ['user', 'prod', 'card']: @@ -115,15 +109,10 @@ def test_split_connex(self): df['connex'] = 'ole' train, test = train_test_connex_split( # pylint: disable=W0632 df, test_size=0.5, groups=['user', 'prod', 'card'], - fail_imbalanced=0.4, fLOG=fLOG) + fail_imbalanced=0.4) self.assertEqual(train.shape[0] + test.shape[0], df.shape[0]) def test_split_connex2(self): - fLOG( - __file__, - self._testMethodName, - OutputPrint=__name__ == "__main__") - df = pandas.DataFrame([dict(user="UA", prod="PAA", card="C1"), dict(user="UA", prod="PB", card="C1"), dict(user="UB", prod="PC", card="C2"), @@ -134,11 +123,11 @@ def test_split_connex2(self): ]) train_test_connex_split(df, test_size=0.5, groups=['user', 'prod', 'card'], - 
fail_imbalanced=0.5, fLOG=fLOG, return_cnx=True) + fail_imbalanced=0.5, return_cnx=True) train, test, stats = train_test_connex_split(df, test_size=0.5, groups=[ 'user', 'prod', 'card'], - fail_imbalanced=0.5, fLOG=fLOG, + fail_imbalanced=0.5, return_cnx=True, random_state=0) self.assertEqual(train.shape[0] + test.shape[0], df.shape[0]) @@ -153,11 +142,6 @@ def test_split_connex2(self): 'Non empty intersection {0} & {1}\n{2}\n{3}\n{4}'.format(s1, s2, train, test, "\n".join(rows))) def test_split_connex_missing(self): - fLOG( - __file__, - self._testMethodName, - OutputPrint=__name__ == "__main__") - df = pandas.DataFrame([dict(user="UA", prod="PAA", card="C1"), dict(user="UA", prod="PB", card="C1"), dict(user="UB", prod="PC", card="C2"), @@ -170,7 +154,7 @@ def test_split_connex_missing(self): train, test, stats = train_test_connex_split(df, test_size=0.5, groups=[ 'user', 'prod', 'card'], - fail_imbalanced=0.4, fLOG=fLOG, + fail_imbalanced=0.4, return_cnx=True, random_state=0) self.assertEqual(train.shape[0] + test.shape[0], df.shape[0]) diff --git a/_unittests/ut_df/test_connex_split_big.py b/_unittests/ut_df/test_connex_split_big.py index b21c9b4..22292c5 100644 --- a/_unittests/ut_df/test_connex_split_big.py +++ b/_unittests/ut_df/test_connex_split_big.py @@ -6,7 +6,6 @@ import unittest from collections import Counter import pandas -from pyquickhelper.loghelper import fLOG from pyquickhelper.pycode import ExtTestCase from pandas_streaming.df import train_test_connex_split @@ -14,15 +13,10 @@ class TestConnexSplitBig(ExtTestCase): def test_connex_big(self): - fLOG( - __file__, - self._testMethodName, - OutputPrint=__name__ == "__main__") - data = os.path.join(os.path.dirname(__file__), "data") name = os.path.join(data, "buggy_hash.csv") df = pandas.read_csv(name, sep="\t", encoding="utf-8") - train, test, stats = train_test_connex_split(df, fLOG=fLOG, + train, test, stats = train_test_connex_split(df, groups=[ "cart_id", "mail", "product_id"], fail_imbalanced=0.9, return_cnx=True) @@ -36,15 +30,10 @@ def test_connex_big(self): self.assertEqual(maxi, 14181) def test_connex_big_approx(self): - fLOG( - __file__, - self._testMethodName, - OutputPrint=__name__ == "__main__") - data = os.path.join(os.path.dirname(__file__), "data") name = os.path.join(data, "buggy_hash.csv") df = pandas.read_csv(name, sep="\t", encoding="utf-8") - train, test, stats = train_test_connex_split(df, fLOG=fLOG, + train, test, stats = train_test_connex_split(df, groups=[ "cart_id", "mail", "product_id"], stop_if_bigger=0.05, return_cnx=True, @@ -59,15 +48,10 @@ def test_connex_big_approx(self): self.assertLesser(maxi, 14181) def test_connex_big_approx_must(self): - fLOG( - __file__, - self._testMethodName, - OutputPrint=__name__ == "__main__") - data = os.path.join(os.path.dirname(__file__), "data") name = os.path.join(data, "buggy_hash.csv") df = pandas.read_csv(name, sep="\t", encoding="utf-8") - train, test, stats = train_test_connex_split(df, fLOG=fLOG, + train, test, stats = train_test_connex_split(df, groups=[ "cart_id", "mail", "product_id"], stop_if_bigger=0.05, return_cnx=True, diff --git a/_unittests/ut_documentation/test_run_notebooks.py b/_unittests/ut_documentation/test_run_notebooks.py index 486cb45..6f84e1c 100644 --- a/_unittests/ut_documentation/test_run_notebooks.py +++ b/_unittests/ut_documentation/test_run_notebooks.py @@ -4,7 +4,6 @@ """ import os import unittest -from pyquickhelper.loghelper import fLOG from pyquickhelper.pycode import ExtTestCase from pyquickhelper.ipythonhelper import 
test_notebook_execution_coverage import pandas_streaming @@ -17,16 +16,11 @@ def setUp(self): self.assertTrue(jyquickhelper is not None) def test_notebook_artificiel(self): - fLOG( - __file__, - self._testMethodName, - OutputPrint=__name__ == "__main__") - self.assertTrue(pandas_streaming is not None) folder = os.path.join(os.path.dirname(__file__), "..", "..", "_doc", "notebooks") test_notebook_execution_coverage( - __file__, "first_steps", folder, 'pandas_streaming', copy_files=[], fLOG=fLOG) + __file__, "first_steps", folder, 'pandas_streaming', copy_files=[]) if __name__ == "__main__": diff --git a/_unittests/ut_module/test_check.py b/_unittests/ut_module/test_check.py deleted file mode 100644 index 8b08b34..0000000 --- a/_unittests/ut_module/test_check.py +++ /dev/null @@ -1,26 +0,0 @@ -""" -@brief test log(time=0s) -""" -import io -import unittest -from contextlib import redirect_stdout -from pyquickhelper.pycode import ExtTestCase -from pandas_streaming import check, _setup_hook - - -class TestCheck(ExtTestCase): - """Test style.""" - - def test_check(self): - self.assertTrue(check()) - - def test_setup_hook(self): - f = io.StringIO() - with redirect_stdout(f): - _setup_hook(True) - out = f.getvalue() - self.assertIn('Success:', out) - - -if __name__ == "__main__": - unittest.main() diff --git a/_unittests/ut_module/test_code_style.py b/_unittests/ut_module/test_code_style.py deleted file mode 100644 index 3c87deb..0000000 --- a/_unittests/ut_module/test_code_style.py +++ /dev/null @@ -1,37 +0,0 @@ -""" -@brief test log(time=0s) -""" -import os -import unittest -from pyquickhelper.loghelper import fLOG -from pyquickhelper.pycode import check_pep8, ExtTestCase - - -class TestCodeStyle(ExtTestCase): - """Test style.""" - - def test_style_src(self): - thi = os.path.abspath(os.path.dirname(__file__)) - src_ = os.path.normpath(os.path.join( - thi, "..", "..", "pandas_streaming")) - check_pep8(src_, fLOG=fLOG, - pylint_ignore=('C0103', 'C1801', 'R1705', 'W0108', 'W0613', - 'W0212', 'W0703', 'W0107', 'C0302', 'C0209', - 'C3001', 'R1735'), - skip=["Too many nested blocks", - "Module 'numpy.random' has no 'RandomState' member", - "dataframe_split.py:60: [E731]", - ]) - - def test_style_test(self): - thi = os.path.abspath(os.path.dirname(__file__)) - test = os.path.normpath(os.path.join(thi, "..", )) - check_pep8(test, fLOG=fLOG, neg_pattern="temp_.*", - pylint_ignore=('C0103', 'C1801', 'R1705', 'W0108', 'W0613', - 'C0111', 'W0107', 'C0302', 'R1732', 'C0209', - 'C3001', 'R1735'), - skip=[]) - - -if __name__ == "__main__": - unittest.main() diff --git a/_unittests/ut_module/test_convert_notebooks.py b/_unittests/ut_module/test_convert_notebooks.py deleted file mode 100644 index 12fe82a..0000000 --- a/_unittests/ut_module/test_convert_notebooks.py +++ /dev/null @@ -1,38 +0,0 @@ -""" -@brief test log(time=0s) -""" -import os -import unittest -from pyquickhelper.loghelper import fLOG -from pyquickhelper.filehelper import explore_folder_iterfile -from pyquickhelper.pycode import ExtTestCase -from pyquickhelper.ipythonhelper import upgrade_notebook, remove_execution_number - - -class TestConvertNotebooks(ExtTestCase): - - def test_convert_notebooks(self): - fLOG( - __file__, - self._testMethodName, - OutputPrint=__name__ == "__main__") - - fold = os.path.abspath(os.path.dirname(__file__)) - fold2 = os.path.normpath( - os.path.join(fold, "..", "..", "_doc", "notebooks")) - for nbf in explore_folder_iterfile(fold2, pattern=".*[.]ipynb"): - t = upgrade_notebook(nbf) - if t: - fLOG("modified", nbf) - # 
remove numbers - remove_execution_number(nbf, nbf) - - fold2 = os.path.normpath(os.path.join(fold, "..", "..", "_unittests")) - for nbf in explore_folder_iterfile(fold2, pattern=".*[.]ipynb"): - t = upgrade_notebook(nbf) - if t: - fLOG("modified", nbf) - - -if __name__ == "__main__": - unittest.main() diff --git a/_unittests/ut_module/test_readme.py b/_unittests/ut_module/test_readme.py deleted file mode 100644 index 95c9fb7..0000000 --- a/_unittests/ut_module/test_readme.py +++ /dev/null @@ -1,35 +0,0 @@ -""" -@brief test tree node (time=50s) -""" -import os -import unittest -from pyquickhelper.loghelper import fLOG -from pyquickhelper.pycode import get_temp_folder, ExtTestCase, check_readme_syntax - - -class TestReadme(ExtTestCase): - - def test_venv_docutils08_readme(self): - fLOG( - __file__, - self._testMethodName, - OutputPrint=__name__ == "__main__") - - fold = os.path.dirname(os.path.abspath(__file__)) - readme = os.path.join(fold, "..", "..", "README.rst") - self.assertTrue(os.path.exists(readme)) - with open(readme, "r", encoding="utf8") as f: - content = f.read() - - self.assertTrue(len(content) > 0) - temp = get_temp_folder(__file__, "temp_readme") - - if __name__ != "__main__": - # does not work from a virtual environment - return - - check_readme_syntax(readme, folder=temp, fLOG=fLOG) - - -if __name__ == "__main__": - unittest.main() diff --git a/appveyor.yml b/appveyor.yml index e8294be..cb1fa0e 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -10,7 +10,7 @@ init: install: - "%PYTHON%\\python -m pip install --upgrade pip" - - "%PYTHON%\\Scripts\\pip install -r requirements.txt" + - "%PYTHON%\\Scripts\\pip install -r requirements-dev.txt" build: off before_test: diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 1d95231..040a297 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -1,11 +1,55 @@ jobs: -- job: 'TestLinux' +- job: 'TestLinuxWheelPip' + pool: + vmImage: 'ubuntu-latest' + strategy: + matrix: + Python311-Linux: + python.version: '3.11' + maxParallel: 3 + + steps: + - task: UsePythonVersion@0 + inputs: + versionSpec: '$(python.version)' + architecture: 'x64' + - script: sudo apt-get update + displayName: 'AptGet Update' + - script: sudo apt-get install -y graphviz + displayName: 'Install Graphviz' + - script: python -m pip install --upgrade pip setuptools wheel + displayName: 'Install tools' + - script: pip install -r requirements.txt + displayName: 'Install Requirements' + - script: pip install -r requirements-dev.txt + displayName: 'Install Requirements dev' + - script: | + ruff . + displayName: 'Ruff' + - script: | + black --diff . + displayName: 'Black' + - script: | + python -m pip wheel . --wheel-dir dist -v -v -v + displayName: 'build wheel' + - script: | + python -m pip install . 
-v -v -v + displayName: 'install wheel' + - script: | + python -m pytest + displayName: 'Runs Unit Tests' + - task: PublishPipelineArtifact@0 + inputs: + artifactName: 'wheel-linux-wheel-$(python.version)' + targetPath: 'dist' + +- job: 'TestLinuxNightly' pool: vmImage: 'ubuntu-latest' strategy: matrix: - Python310-Linux: - python.version: '3.10' + Python311-Linux: + python.version: '3.11' maxParallel: 3 steps: @@ -17,10 +61,6 @@ jobs: displayName: 'AptGet Update' - script: sudo apt-get install -y pandoc displayName: 'Install Pandoc' - - script: sudo apt-get install -y texlive texlive-latex-extra texlive-xetex dvipng - displayName: 'Install Latex' - - script: sudo apt-get install -y p7zip-full - displayName: 'Install 7z, rar' - script: sudo apt-get install -y inkscape displayName: 'Install Inkscape' - script: sudo apt-get install -y graphviz @@ -29,30 +69,114 @@ jobs: displayName: 'Install tools' - script: pip install -r requirements.txt displayName: 'Install Requirements' + - script: pip install -r requirements-dev.txt + displayName: 'Install Requirements dev' + - script: pip uninstall -y scikit-learn + displayName: 'Uninstall scikit-learn' + - script: pip install --pre --extra-index https://pypi.anaconda.org/scipy-wheels-nightly/simple scikit-learn + displayName: 'Install scikit-learn nightly' + - script: | + ruff . + displayName: 'Ruff' + - script: | + rstcheck -r ./_doc ./pandas_streaming + displayName: 'rstcheck' + - script: | + black --diff . + displayName: 'Black' - script: | - python -u setup.py build_ext --inplace + python -m pytest displayName: 'Runs Unit Tests' + +- job: 'TestLinux' + pool: + vmImage: 'ubuntu-latest' + strategy: + matrix: + Python311-Linux: + python.version: '3.11' + maxParallel: 3 + + steps: + - task: UsePythonVersion@0 + inputs: + versionSpec: '$(python.version)' + architecture: 'x64' + - script: sudo apt-get update + displayName: 'AptGet Update' + - script: sudo apt-get install -y pandoc + displayName: 'Install Pandoc' + - script: sudo apt-get install -y inkscape + displayName: 'Install Inkscape' + - script: sudo apt-get install -y graphviz + displayName: 'Install Graphviz' + - script: python -m pip install --upgrade pip setuptools wheel + displayName: 'Install tools' + - script: pip install -r requirements.txt + displayName: 'Install Requirements' + - script: pip install -r requirements-dev.txt + displayName: 'Install Requirements dev' + - script: | + ruff . + displayName: 'Ruff' + - script: | + rstcheck -r ./_doc ./pandas_streaming + displayName: 'rstcheck' + - script: | + black --diff . 
+ displayName: 'Black' - script: | - python -u setup.py unittests + python -m pytest --cov displayName: 'Runs Unit Tests' - script: | python -u setup.py bdist_wheel displayName: 'Build Package' -# - script: | -# python -u setup.py build_sphinx -# displayName: 'Builds Documentation' + #- script: | + # python -m sphinx _doc dist/html + # displayName: 'Builds Documentation' - task: PublishPipelineArtifact@0 inputs: artifactName: 'wheel-linux-$(python.version)' targetPath: 'dist' +- job: 'TestWindows' + pool: + vmImage: 'windows-latest' + strategy: + matrix: + Python311-Windows: + python.version: '3.11' + maxParallel: 3 + + steps: + - task: UsePythonVersion@0 + inputs: + versionSpec: '$(python.version)' + architecture: 'x64' + - script: python -m pip install --upgrade pip setuptools wheel + displayName: 'Install tools' + - script: pip install -r requirements.txt + displayName: 'Install Requirements' + - script: pip install -r requirements-dev.txt + displayName: 'Install Requirements dev' + - script: | + python -m pytest + displayName: 'Runs Unit Tests' + - script: | + python -u setup.py bdist_wheel + displayName: 'Build Package' + - task: PublishPipelineArtifact@0 + inputs: + artifactName: 'wheel-windows-$(python.version)' + targetPath: 'dist' + - job: 'TestMac' pool: vmImage: 'macOS-latest' strategy: matrix: - Python310-Mac: - python.version: '3.10' + Python311-Mac: + python.version: '3.11' maxParallel: 3 steps: @@ -62,36 +186,22 @@ jobs: architecture: 'x64' - script: gcc --version displayName: 'gcc version' - - script: | - brew update - displayName: 'brew update' + #- script: brew upgrade + # displayName: 'brew upgrade' + #- script: brew update + # displayName: 'brew update' - script: export displayName: 'export' - script: gcc --version displayName: 'gcc version' - - script: brew install llvm - displayName: 'install llvm' - - script: brew install p7zip - displayName: 'Install p7zip' - - script: brew install pandoc - displayName: 'Install Pandoc' - - script: brew install graphviz - continueOnError: true - displayName: 'Install Graphviz' - - script: brew install --cask mactex - displayName: 'Install latex' - script: python -m pip install --upgrade pip setuptools wheel displayName: 'Install tools' - - script: brew install pybind11 - displayName: 'Install pybind11' - script: pip install -r requirements.txt displayName: 'Install Requirements' + - script: pip install -r requirements-dev.txt + displayName: 'Install Requirements dev' - script: | - export MACOSX_DEPLOYMENT_TARGET=10.13 - python setup.py build_ext --inplace - displayName: 'Build package' - - script: | - python -u setup.py unittests + python -m pytest displayName: 'Runs Unit Tests' - script: | python -u setup.py bdist_wheel diff --git a/build_script.bat b/build_script.bat deleted file mode 100644 index 415ae38..0000000 --- a/build_script.bat +++ /dev/null @@ -1,13 +0,0 @@ -@echo off -if "%1"=="" goto default_value_python: -set pythonexe="%1" -%pythonexe% setup.py write_version -goto custom_python: - -:default_value_python: -set pythonexe="c:\Python395_x64\python.exe" -if not exist %pythonexe% set pythonexe="c:\Python391_x64\python.exe" -:custom_python: -@echo [python] %pythonexe% -%pythonexe% -u setup.py build_script -if %errorlevel% neq 0 exit /b %errorlevel% \ No newline at end of file diff --git a/pandas_streaming/__init__.py b/pandas_streaming/__init__.py index 8450d74..a4a6c0c 100644 --- a/pandas_streaming/__init__.py +++ b/pandas_streaming/__init__.py @@ -1,53 +1,5 @@ -# -*- coding: utf-8 -*- -""" -@file -@brief Module 
*pandas_streaming*. -Processes large datasets with :epkg:`pandas` by -reimplementing streeaming versions of -:epkg:`pandas` functionalites. -""" - -__version__ = "0.3.218" +__version__ = "0.4.218" __author__ = "Xavier Dupré" __github__ = "https://github.com/sdpython/pandas_streaming" __url__ = "http://www.xavierdupre.fr/app/pandas_streaming/helpsphinx/index.html" __license__ = "MIT License" -__blog__ = """ - - - - blog - - - - - -""" - - -def check(log=False): - """ - Checks the library is working. - It raises an exception. - If you want to disable the logs: - - :param log: if True, display information, otherwise none - :return: 0 or exception - """ - return True - - -def _setup_hook(use_print=False): - """ - if this function is added to the module, - the help automation and unit tests call it first before - anything goes on as an initialization step. - """ - # we can check many things, needed module - # any others things before unit tests are started - if use_print: - print("Success: _setup_hook") diff --git a/pandas_streaming/df/connex_split.py b/pandas_streaming/df/connex_split.py index e78891a..ec01b02 100644 --- a/pandas_streaming/df/connex_split.py +++ b/pandas_streaming/df/connex_split.py @@ -112,7 +112,7 @@ def train_test_connex_split(df, groups, test_size=0.25, train_size=None, stratify=None, hash_size=9, unique_rows=False, shuffle=True, fail_imbalanced=0.05, keep_balance=None, stop_if_bigger=None, return_cnx=False, - must_groups=None, random_state=None, fLOG=None): + must_groups=None, random_state=None): """ This split is for a specific case where data is linked in many ways. Let's assume we have three ids as we have @@ -144,7 +144,6 @@ def train_test_connex_split(df, groups, test_size=0.25, train_size=None, @param must_groups column name for ids which must not be shared by train/test partitions @param random_state seed for random generator - @param fLOG logging function @return Two @see cl StreamingDataFrame, one for train, one for test. @@ -384,8 +383,7 @@ def double_merge(d): def train_test_apart_stratify(df, group, test_size=0.25, train_size=None, - stratify=None, force=False, random_state=None, - fLOG=None): + stratify=None, force=False, random_state=None): """ This split is for a specific case where data is linked in one way. Let's assume we have two ids as we have @@ -403,7 +401,6 @@ def train_test_apart_stratify(df, group, test_size=0.25, train_size=None, @param force if True, tries to get at least one example on the test side for each value of the column *stratify* @param random_state seed for random generators - @param fLOG logging function @return Two @see cl StreamingDataFrame, one for train, one for test. diff --git a/pandas_streaming/df/dataframe_io_helpers.py b/pandas_streaming/df/dataframe_io_helpers.py index 46f55e7..4ae503b 100644 --- a/pandas_streaming/df/dataframe_io_helpers.py +++ b/pandas_streaming/df/dataframe_io_helpers.py @@ -144,7 +144,7 @@ def _flatten(obj, key): return flattened_dict -def enumerate_json_items(filename, encoding=None, lines=False, flatten=False, fLOG=None): +def enumerate_json_items(filename, encoding=None, lines=False, flatten=False): """ Enumerates items from a :epkg:`JSON` file or string. @@ -152,7 +152,6 @@ def enumerate_json_items(filename, encoding=None, lines=False, flatten=False, fL :param encoding: encoding :param lines: one record per row :param flatten: call @see fn flatten_dictionary - :param fLOG: logging function :return: iterator on records at first level. 
It assumes the syntax follows the format: ``[ {"id":1, ...}, {"id": 2, ...}, ...]``. @@ -236,24 +235,23 @@ def enumerate_json_items(filename, encoding=None, lines=False, flatten=False, fL with open(filename, "r", encoding=encoding) as f: for el in enumerate_json_items( f, encoding=encoding, lines=lines, - flatten=flatten, fLOG=fLOG): + flatten=flatten): yield el else: st = StringIO(filename) for el in enumerate_json_items( st, encoding=encoding, lines=lines, - flatten=flatten, fLOG=fLOG): + flatten=flatten): yield el elif isinstance(filename, bytes): st = BytesIO(filename) for el in enumerate_json_items( - st, encoding=encoding, lines=lines, flatten=flatten, - fLOG=fLOG): + st, encoding=encoding, lines=lines, flatten=flatten): yield el elif lines: for el in enumerate_json_items( JsonPerRowsStream(filename), - encoding=encoding, lines=False, flatten=flatten, fLOG=fLOG): + encoding=encoding, lines=False, flatten=flatten): yield el else: if hasattr(filename, 'seek'): diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..bad7f7a --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,31 @@ +[tool.rstcheck] +report_level = "INFO" +ignore_directives = [ + "autoclass", + "autofunction", + "automodule", + "gdot", + "image-sg", + "runpython", +] +ignore_roles = ["epkg"] + +[tool.ruff] + +# Exclude a variety of commonly ignored directories. +exclude = [ + ".eggs", + ".git", + "build", + "dist", +] + +# Same as Black. +line-length = 88 + +[tool.ruff.mccabe] +# Unlike Flake8, default to a complexity level of 10. +max-complexity = 10 + +[tool.ruff.per-file-ignores] +"_doc/examples/plot_first_example.py" = ["E402", "F811"] diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..5ab8605 --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,20 @@ +autopep8 +coverage +ijson +jupyter_sphinx +jyquickhelper +matplotlib +pandas>=1.1.0 +pandocfilters +Pillow +pycodestyle +pylint>=2.14.0 +pyquickhelper>=1.10 +pyquicksetup +scikit-learn +scipy +sphinx +sphinxcontrib.imagesvg +sphinx_gallery +ujson +wheel diff --git a/requirements.txt b/requirements.txt index 5ab8605..fb6c7ed 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,20 +1 @@ -autopep8 -coverage -ijson -jupyter_sphinx -jyquickhelper -matplotlib -pandas>=1.1.0 -pandocfilters -Pillow -pycodestyle -pylint>=2.14.0 -pyquickhelper>=1.10 -pyquicksetup -scikit-learn -scipy -sphinx -sphinxcontrib.imagesvg -sphinx_gallery -ujson -wheel +pandas diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..c544d66 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,5 @@ +[options] +packages = find: + +[options.packages.find] +include = pandas_streaming* diff --git a/setup.py b/setup.py index e8a706a..6968009 100644 --- a/setup.py +++ b/setup.py @@ -1,57 +1,71 @@ # -*- coding: utf-8 -*- -from __future__ import print_function -import sys import os -from setuptools import setup, Extension, find_packages -from pyquicksetup import read_version, read_readme, default_cmdclass - -######### -# settings -######### - -project_var_name = "pandas_streaming" -versionPython = f"{sys.version_info.major}.{sys.version_info.minor}" -path = "Lib/site-packages/" + project_var_name -readme = 'README.rst' -history = "HISTORY.rst" -requirements = None - -KEYWORDS = [project_var_name, 'Xavier Dupré', 'pandas', 'streaming'] -DESCRIPTION = """Streaming operations with pandas.""" -CLASSIFIERS = [ - 'Programming Language :: Python :: 3', - 'Intended Audience :: Developers', - 'Topic :: Scientific/Engineering', - 'Topic :: Education', - 
'License :: OSI Approved :: MIT License', - 'Development Status :: 5 - Production/Stable' -] - -####### -# data -####### - -packages = find_packages() -package_dir = {k: os.path.join('.', k.replace(".", "/")) for k in packages} -package_data = {} + +from setuptools import setup + +###################### +# beginning of setup +###################### + + +here = os.path.dirname(__file__) +if here == "": + here = "." +package_data = {"pandas_streaming.validation": ["*.css", "*.js"]} + +try: + with open(os.path.join(here, "requirements.txt"), "r") as f: + requirements = f.read().strip(" \n\r\t").split("\n") +except FileNotFoundError: + requirements = [] +if len(requirements) == 0 or requirements == [""]: + requirements = ["pandas"] + +try: + with open(os.path.join(here, "README.rst"), "r", encoding="utf-8") as f: + long_description = "pandas-streaming:" + f.read().split("pandas-streaming:")[1] +except FileNotFoundError: + long_description = "" + +version_str = "0.1.0" +with open(os.path.join(here, "pandas_streaming/__init__.py"), "r") as f: + line = [ + _ + for _ in [_.strip("\r\n ") for _ in f.readlines()] + if _.startswith("__version__") + ] + if len(line) > 0: + version_str = line[0].split("=")[1].strip('" ') setup( - name=project_var_name, - version=read_version(__file__, project_var_name), - author='Xavier Dupré', - author_email='xavier.dupre@gmail.com', - license="MIT", - url="http://www.xavierdupre.fr/app/pandas_streaming/helpsphinx/index.html", - download_url="https://github.com/sdpython/pandas_streaming/", - description=DESCRIPTION, - long_description=read_readme(__file__), - cmdclass=default_cmdclass(), - keywords=KEYWORDS, - classifiers=CLASSIFIERS, - packages=packages, - package_dir=package_dir, + name="pandas-streaming", + version=version_str, + description="Array (and numpy) API for ONNX", + long_description=long_description, + author="Xavier Dupré", + author_email="xavier.dupre@gmail.com", + url="https://github.com/sdpython/pandas-streaming", package_data=package_data, - setup_requires=["pyquicksetup"], - install_requires=['numpy', 'pandas', 'ijson'], + setup_requires=["numpy", "scipy"], + install_requires=requirements, + classifiers=[ + "Intended Audience :: Science/Research", + "Intended Audience :: Developers", + "License :: OSI Approved :: MIT License", + "Programming Language :: C", + "Programming Language :: Python", + "Topic :: Software Development", + "Topic :: Scientific/Engineering", + "Development Status :: 5 - Production/Stable", + "Operating System :: Microsoft :: Windows", + "Operating System :: POSIX", + "Operating System :: Unix", + "Operating System :: MacOS", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + ], ) From c461afed9018d3c5133263608dafca03d4839f32 Mon Sep 17 00:00:00 2001 From: Xavier Dupre Date: Sat, 22 Jul 2023 20:00:37 +0200 Subject: [PATCH 02/16] refactoring --- _doc/notebooks/first_steps.ipynb | 1730 +++++++++-------- _doc/sphinxdoc/source/_static/my-styles.css | 41 - _doc/sphinxdoc/source/_templates/blogtoc.html | 4 - _doc/sphinxdoc/source/_templates/layout.html | 5 - .../sphinxdoc/source/_templates/my-styles.css | 41 - _doc/sphinxdoc/source/_templates/page.html | 4 - _doc/sphinxdoc/source/conf.py | 224 ++- _unittests/ut_df/test_connex_split.py | 243 ++- _unittests/ut_df/test_connex_split_big.py | 45 +- _unittests/ut_df/test_connex_split_cat.py | 81 +- 
_unittests/ut_df/test_dataframe_helpers.py | 19 +- .../ut_df/test_dataframe_helpers_simple.py | 36 +- _unittests/ut_df/test_dataframe_io.py | 29 +- _unittests/ut_df/test_dataframe_io_helpers.py | 216 +- _unittests/ut_df/test_dataframe_sort.py | 87 +- _unittests/ut_df/test_pandas_groupbynan.py | 79 +- _unittests/ut_df/test_streaming_dataframe.py | 215 +- .../ut_documentation/test_run_notebooks.py | 14 +- _unittests/ut_module/test_sklearn.py | 10 +- pandas_streaming/data/__init__.py | 5 - pandas_streaming/data/dummy.py | 18 +- pandas_streaming/df/__init__.py | 17 +- pandas_streaming/df/connex_split.py | 222 ++- pandas_streaming/df/dataframe.py | 535 +++-- pandas_streaming/df/dataframe_helpers.py | 155 +- pandas_streaming/df/dataframe_io.py | 40 +- pandas_streaming/df/dataframe_io_helpers.py | 48 +- pandas_streaming/df/dataframe_split.py | 91 +- pandas_streaming/exc/__init__.py | 7 +- pandas_streaming/exc/exc_streaming.py | 10 +- pyproject.toml | 4 + requirements-dev.txt | 3 +- 32 files changed, 2302 insertions(+), 1976 deletions(-) delete mode 100644 _doc/sphinxdoc/source/_static/my-styles.css delete mode 100644 _doc/sphinxdoc/source/_templates/blogtoc.html delete mode 100644 _doc/sphinxdoc/source/_templates/layout.html delete mode 100644 _doc/sphinxdoc/source/_templates/my-styles.css delete mode 100644 _doc/sphinxdoc/source/_templates/page.html diff --git a/_doc/notebooks/first_steps.ipynb b/_doc/notebooks/first_steps.ipynb index 63ff017..735ede9 100644 --- a/_doc/notebooks/first_steps.ipynb +++ b/_doc/notebooks/first_steps.ipynb @@ -1,902 +1,906 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# First steps with pandas_streaming\n", - "\n", - "A few difference between [pandas](http://pandas.pydata.org/) and *pandas_streaming*." - ] - }, + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# First steps with pandas_streaming\n", + "\n", + "A few difference between [pandas](http://pandas.pydata.org/) and *pandas_streaming*." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
run previous cell, wait for 2 seconds
\n", - "" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } + "data": { + "text/html": [ + "
run previous cell, wait for 2 seconds
\n", + "" ], - "source": [ - "from jyquickhelper import add_notebook_menu\n", - "add_notebook_menu()" + "text/plain": [ + "" ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## pandas to pandas_streaming" - ] - }, + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from jyquickhelper import add_notebook_menu\n", + "\n", + "add_notebook_menu()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## pandas to pandas_streaming" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
XY
04.5a
16.0b
27.0c
\n", - "
" - ], - "text/plain": [ - " X Y\n", - "0 4.5 a\n", - "1 6.0 b\n", - "2 7.0 c" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
XY
04.5a
16.0b
27.0c
\n", + "
" ], - "source": [ - "from pandas import DataFrame\n", - "df = DataFrame(data=dict(X=[4.5, 6, 7], Y=[\"a\", \"b\", \"c\"]))\n", - "df" + "text/plain": [ + " X Y\n", + "0 4.5 a\n", + "1 6.0 b\n", + "2 7.0 c" ] - }, + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from pandas import DataFrame\n", + "\n", + "df = DataFrame(data=dict(X=[4.5, 6, 7], Y=[\"a\", \"b\", \"c\"]))\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "We create a streaming dataframe:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "collapsed": true - }, - "source": [ - "We create a streaming dataframe:" + "data": { + "text/plain": [ + "" ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from pandas_streaming.df import StreamingDataFrame\n", - "sdf = StreamingDataFrame.read_df(df)\n", - "sdf" - ] - }, + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from pandas_streaming.df import StreamingDataFrame\n", + "\n", + "sdf = StreamingDataFrame.read_df(df)\n", + "sdf" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
XY
04.5a
16.0b
27.0c
\n", - "
" - ], - "text/plain": [ - " X Y\n", - "0 4.5 a\n", - "1 6.0 b\n", - "2 7.0 c" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
XY
04.5a
16.0b
27.0c
\n", + "
" ], - "source": [ - "sdf.to_dataframe()" + "text/plain": [ + " X Y\n", + "0 4.5 a\n", + "1 6.0 b\n", + "2 7.0 c" ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Internally, StreamingDataFrame implements an iterator on dataframes and then tries to replicate the same interface as [pandas.DataFrame](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html) possibly wherever it is possible to manipulate data without loading everything into memory." - ] - }, + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sdf.to_dataframe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Internally, StreamingDataFrame implements an iterator on dataframes and then tries to replicate the same interface as [pandas.DataFrame](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html) possibly wherever it is possible to manipulate data without loading everything into memory." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
XY
04.5a
16.0b
27.0c
04.5a
16.0b
27.0c
\n", - "
" - ], - "text/plain": [ - " X Y\n", - "0 4.5 a\n", - "1 6.0 b\n", - "2 7.0 c\n", - "0 4.5 a\n", - "1 6.0 b\n", - "2 7.0 c" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
XY
04.5a
16.0b
27.0c
04.5a
16.0b
27.0c
\n", + "
" ], - "source": [ - "sdf2 = sdf.concat(sdf)\n", - "sdf2.to_dataframe()" + "text/plain": [ + " X Y\n", + "0 4.5 a\n", + "1 6.0 b\n", + "2 7.0 c\n", + "0 4.5 a\n", + "1 6.0 b\n", + "2 7.0 c" ] - }, + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sdf2 = sdf.concat(sdf)\n", + "sdf2.to_dataframe()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
YZ
0a10
1b20
\n", - "
" - ], - "text/plain": [ - " Y Z\n", - "0 a 10\n", - "1 b 20" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
YZ
0a10
1b20
\n", + "
" ], - "source": [ - "m = DataFrame(dict(Y=[\"a\", \"b\"], Z=[10, 20]))\n", - "m" + "text/plain": [ + " Y Z\n", + "0 a 10\n", + "1 b 20" ] - }, + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "m = DataFrame(dict(Y=[\"a\", \"b\"], Z=[10, 20]))\n", + "m" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
XYZ
04.5a10.0
16.0b20.0
27.0cNaN
04.5a10.0
16.0b20.0
27.0cNaN
\n", - "
" - ], - "text/plain": [ - " X Y Z\n", - "0 4.5 a 10.0\n", - "1 6.0 b 20.0\n", - "2 7.0 c NaN\n", - "0 4.5 a 10.0\n", - "1 6.0 b 20.0\n", - "2 7.0 c NaN" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
XYZ
04.5a10.0
16.0b20.0
27.0cNaN
04.5a10.0
16.0b20.0
27.0cNaN
\n", + "
" ], - "source": [ - "sdf3 = sdf2.merge(m, left_on=\"Y\", right_on=\"Y\", how=\"outer\")\n", - "sdf3.to_dataframe()" + "text/plain": [ + " X Y Z\n", + "0 4.5 a 10.0\n", + "1 6.0 b 20.0\n", + "2 7.0 c NaN\n", + "0 4.5 a 10.0\n", + "1 6.0 b 20.0\n", + "2 7.0 c NaN" ] - }, + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sdf3 = sdf2.merge(m, left_on=\"Y\", right_on=\"Y\", how=\"outer\")\n", + "sdf3.to_dataframe()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
XYZ
04.5a10.0
14.5a10.0
26.0b20.0
36.0b20.0
47.0cNaN
57.0cNaN
\n", - "
" - ], - "text/plain": [ - " X Y Z\n", - "0 4.5 a 10.0\n", - "1 4.5 a 10.0\n", - "2 6.0 b 20.0\n", - "3 6.0 b 20.0\n", - "4 7.0 c NaN\n", - "5 7.0 c NaN" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
XYZ
04.5a10.0
14.5a10.0
26.0b20.0
36.0b20.0
47.0cNaN
57.0cNaN
\n", + "
" ], - "source": [ - "sdf2.to_dataframe().merge(m, left_on=\"Y\", right_on=\"Y\", how=\"outer\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The order might be different." + "text/plain": [ + " X Y Z\n", + "0 4.5 a 10.0\n", + "1 4.5 a 10.0\n", + "2 6.0 b 20.0\n", + "3 6.0 b 20.0\n", + "4 7.0 c NaN\n", + "5 7.0 c NaN" ] - }, + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sdf2.to_dataframe().merge(m, left_on=\"Y\", right_on=\"Y\", how=\"outer\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The order might be different." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
XY
04.5a
14.5a
\n", - "
" - ], - "text/plain": [ - " X Y\n", - "0 4.5 a\n", - "1 4.5 a" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
XY
04.5a
14.5a
\n", + "
" ], - "source": [ - "sdftr, sdfte = sdf2.train_test_split(test_size=0.5)\n", - "sdfte.head()" + "text/plain": [ + " X Y\n", + "0 4.5 a\n", + "1 4.5 a" ] - }, + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sdftr, sdfte = sdf2.train_test_split(test_size=0.5)\n", + "sdfte.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
XY
06.0b
17.0c
26.0b
07.0c
\n", - "
" - ], - "text/plain": [ - " X Y\n", - "0 6.0 b\n", - "1 7.0 c\n", - "2 6.0 b\n", - "0 7.0 c" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
XY
06.0b
17.0c
26.0b
07.0c
\n", + "
" ], - "source": [ - "sdftr.head()" + "text/plain": [ + " X Y\n", + "0 6.0 b\n", + "1 7.0 c\n", + "2 6.0 b\n", + "0 7.0 c" ] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": true - }, - "source": [ - "## split a big file" - ] - }, + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sdftr.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "## split a big file" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'example.txt'" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sdf2.to_csv(\"example.txt\")" + "data": { + "text/plain": [ + "'example.txt'" ] - }, + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sdf2.to_csv(\"example.txt\")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['example.train.txt', 'example.test.txt']" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "new_sdf = StreamingDataFrame.read_csv(\"example.txt\")\n", - "new_sdf.train_test_split(\"example.{}.txt\", streaming=False)" + "data": { + "text/plain": [ + "['example.train.txt', 'example.test.txt']" ] - }, + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "new_sdf = StreamingDataFrame.read_csv(\"example.txt\")\n", + "new_sdf.train_test_split(\"example.{}.txt\", streaming=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['example.test.txt', 'example.train.txt', 'example.txt']" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import glob\n", - "glob.glob(\"ex*.txt\")" + "data": { + "text/plain": [ + "['example.test.txt', 'example.train.txt', 'example.txt']" ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.1" + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" } + ], + "source": [ + "import glob\n", + "\n", + "glob.glob(\"ex*.txt\")" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" }, - "nbformat": 4, - "nbformat_minor": 2 + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.1" + } + }, + 
"nbformat": 4, + "nbformat_minor": 2 } \ No newline at end of file diff --git a/_doc/sphinxdoc/source/_static/my-styles.css b/_doc/sphinxdoc/source/_static/my-styles.css deleted file mode 100644 index 57b29ed..0000000 --- a/_doc/sphinxdoc/source/_static/my-styles.css +++ /dev/null @@ -1,41 +0,0 @@ - -.highlight-ipython3 { - background-color: #f8f8c8; -} - -div.highlight-ipython3 pre { - background-color: #f8f8c8; -} - -.wy-nav-top { - background-color: #FF0040; -} - -.wy-side-nav-search { - background-color: #FF0040; -} - -pre.highlight-default { - background-color: #b5b5b5; -} - -table { - border: solid 1px #DDEEEE; - border-collapse: collapse; - border-spacing: 0; - font: normal 13px Arial, sans-serif; -} -thead th { - background-color: #DDEFEF; - border: solid 1px #DDEEEE; - color: #336B6B; - padding: 10px; - text-align: left; - text-shadow: 1px 1px 1px #fff; -} -tbody td { - border: solid 1px #DDEEEE; - color: #333; - padding: 10px; - text-shadow: 1px 1px 1px #fff; -} diff --git a/_doc/sphinxdoc/source/_templates/blogtoc.html b/_doc/sphinxdoc/source/_templates/blogtoc.html deleted file mode 100644 index 02a6b01..0000000 --- a/_doc/sphinxdoc/source/_templates/blogtoc.html +++ /dev/null @@ -1,4 +0,0 @@ -Index -Module -

Blog

-2017-09-17 - Why pandas_streaming? \ No newline at end of file diff --git a/_doc/sphinxdoc/source/_templates/layout.html b/_doc/sphinxdoc/source/_templates/layout.html deleted file mode 100644 index 08baa3e..0000000 --- a/_doc/sphinxdoc/source/_templates/layout.html +++ /dev/null @@ -1,5 +0,0 @@ -{# Import the theme's layout. #} -{% extends "!layout.html" %} - -{# Custom CSS overrides #} -{% set bootswatch_css_custom = ['_static/my-styles.css'] %} \ No newline at end of file diff --git a/_doc/sphinxdoc/source/_templates/my-styles.css b/_doc/sphinxdoc/source/_templates/my-styles.css deleted file mode 100644 index 57b29ed..0000000 --- a/_doc/sphinxdoc/source/_templates/my-styles.css +++ /dev/null @@ -1,41 +0,0 @@ - -.highlight-ipython3 { - background-color: #f8f8c8; -} - -div.highlight-ipython3 pre { - background-color: #f8f8c8; -} - -.wy-nav-top { - background-color: #FF0040; -} - -.wy-side-nav-search { - background-color: #FF0040; -} - -pre.highlight-default { - background-color: #b5b5b5; -} - -table { - border: solid 1px #DDEEEE; - border-collapse: collapse; - border-spacing: 0; - font: normal 13px Arial, sans-serif; -} -thead th { - background-color: #DDEFEF; - border: solid 1px #DDEEEE; - color: #336B6B; - padding: 10px; - text-align: left; - text-shadow: 1px 1px 1px #fff; -} -tbody td { - border: solid 1px #DDEEEE; - color: #333; - padding: 10px; - text-shadow: 1px 1px 1px #fff; -} diff --git a/_doc/sphinxdoc/source/_templates/page.html b/_doc/sphinxdoc/source/_templates/page.html deleted file mode 100644 index 1be6020..0000000 --- a/_doc/sphinxdoc/source/_templates/page.html +++ /dev/null @@ -1,4 +0,0 @@ -{% extends "layout.html" %} -{% block body %} -{{ body }} -{% endblock body %} diff --git a/_doc/sphinxdoc/source/conf.py b/_doc/sphinxdoc/source/conf.py index eed8a1c..f298be6 100644 --- a/_doc/sphinxdoc/source/conf.py +++ b/_doc/sphinxdoc/source/conf.py @@ -1,82 +1,204 @@ # -*- coding: utf-8 -*- import sys import os -import alabaster -from pyquickhelper.helpgen.default_conf import set_sphinx_variables +from sphinx_runpython.github_link import make_linkcode_resolve +from sphinx_runpython.conf_helper import has_dvipng, has_dvisvgm +from pandas_streaming import __version__ -sys.path.insert(0, os.path.abspath(os.path.join(os.path.split(__file__)[0]))) +extensions = [ + "nbsphinx", + "sphinx.ext.autodoc", + "sphinx.ext.coverage", + "sphinx.ext.githubpages", + "sphinx.ext.ifconfig", + "sphinx.ext.intersphinx", + "sphinx.ext.linkcode", + "sphinx.ext.viewcode", + "sphinx.ext.napoleon", + "sphinx.ext.todo", + "sphinx_gallery.gen_gallery", + "sphinx_issues", + "sphinx_runpython.blocdefs.sphinx_exref_extension", + "sphinx_runpython.blocdefs.sphinx_mathdef_extension", + "sphinx_runpython.epkg", + "sphinx_runpython.gdot", + "sphinx_runpython.runpython", + "matplotlib.sphinxext.plot_directive", +] -local_template = os.path.join(os.path.abspath( - os.path.dirname(__file__)), "_templates") +if has_dvisvgm(): + extensions.append("sphinx.ext.imgmath") + imgmath_image_format = "svg" +elif has_dvipng(): + extensions.append("sphinx.ext.pngmath") + imgmath_image_format = "png" +else: + extensions.append("sphinx.ext.mathjax") -set_sphinx_variables(__file__, "pandas_streaming", "Xavier Dupré", 2023, - "alabaster", alabaster.get_path(), - locals(), extlinks=dict(issue=( - 'https://github.com/sdpython/pandas_streaming/issues/%s', - 'issue %s')), - title="Streaming functionalities for pandas", book=True) +templates_path = ["_templates"] +html_logo = "_static/project_ico.png" +source_suffix = ".rst" 
+master_doc = "index" +project = "pandas-streaming" +copyright = "2016-2023, Xavier Dupré" +author = "Xavier Dupré" +version = __version__ +release = __version__ +language = "en" +exclude_patterns = ["auto_examples/*.ipynb"] +pygments_style = "sphinx" +todo_include_todos = True +nbsphinx_execute = "never" -blog_root = "http://www.xavierdupre.fr/app/pandas_streaming/helpsphinx/" +html_theme = "furo" +html_theme_path = ["_static"] +html_theme_options = {} +html_sourcelink_suffix = "" +html_static_path = ["_static"] -html_css_files = ['my-styles.css'] +issues_github_path = "sdpython/pandas-streaming" -html_logo = "_static/project_ico.png" +# The following is used by sphinx.ext.linkcode to provide links to github +linkcode_resolve = make_linkcode_resolve( + "pandas_streaming", + ( + "https://github.com/sdpython/pandas-streaming/" + "blob/{revision}/{package}/" + "{path}#L{lineno}" + ), +) -html_sidebars = {} +latex_elements = { + "papersize": "a4", + "pointsize": "10pt", + "title": project, +} -language = "en" -custom_preamble = """\n +mathjax3_config = {"chtml": {"displayAlign": "left"}} + +intersphinx_mapping = { + "onnx": ("https://onnx.ai/onnx/", None), + "matplotlib": ("https://matplotlib.org/", None), + "numpy": ("https://numpy.org/doc/stable", None), + "pandas": ("https://pandas.pydata.org/pandas-docs/stable/", None), + "python": (f"https://docs.python.org/{sys.version_info.major}", None), + "scipy": ("https://docs.scipy.org/doc/scipy/reference", None), + "sklearn": ("https://scikit-learn.org/stable/", None), + "sklearn-onnx": ("https://onnx.ai/sklearn-onnx/", None), + "torch": ("https://pytorch.org/docs/stable/", None), +} + +# Check intersphinx reference targets exist +nitpicky = True +# See also scikit-learn/scikit-learn#26761 +nitpick_ignore = [ + ("py:class", "False"), + ("py:class", "True"), + ("py:class", "pipeline.Pipeline"), + ("py:class", "default=sklearn.utils.metadata_routing.UNCHANGED"), +] + +sphinx_gallery_conf = { + # path to your examples scripts + "examples_dirs": os.path.join(os.path.dirname(__file__), "examples"), + # path where to save gallery generated examples + "gallery_dirs": "auto_examples", +} + +# next + +preamble = """ +\\usepackage{etex} +\\usepackage{fixltx2e} % LaTeX patches, \\textsubscript +\\usepackage{cmap} % fix search and cut-and-paste in Acrobat +\\usepackage[raccourcis]{fast-diagram} +\\usepackage{titlesec} +\\usepackage{amsmath} +\\usepackage{amssymb} +\\usepackage{amsfonts} +\\usepackage{graphics} +\\usepackage{epic} +\\usepackage{eepic} +%\\usepackage{pict2e} +%%% Redefined titleformat +\\setlength{\\parindent}{0cm} +\\setlength{\\parskip}{1ex plus 0.5ex minus 0.2ex} +\\newcommand{\\hsp}{\\hspace{20pt}} +\\newcommand{\\acc}[1]{\\left\\{#1\\right\\}} +\\newcommand{\\cro}[1]{\\left[#1\\right]} +\\newcommand{\\pa}[1]{\\left(#1\\right)} +\\newcommand{\\R}{\\mathbb{R}} +\\newcommand{\\HRule}{\\rule{\\linewidth}{0.5mm}} +%\\titleformat{\\chapter}[hang]{\\Huge\\bfseries\\sffamily}{\\thechapter\\hsp}{0pt}{\\Huge\\bfseries\\sffamily} + +\\usepackage[all]{xy} \\newcommand{\\vecteur}[2]{\\pa{#1,\\dots,#2}} \\newcommand{\\N}[0]{\\mathbb{N}} -\\newcommand{\\indicatrice}[1]{\\mathbf{1\\!\\!1}_{\\acc{#1}}} -\\usepackage[all]{xy} +\\newcommand{\\indicatrice}[1]{ {1\\!\\!1}_{\\acc{#1}} } \\newcommand{\\infegal}[0]{\\leqslant} \\newcommand{\\supegal}[0]{\\geqslant} \\newcommand{\\ensemble}[2]{\\acc{#1,\\dots,#2}} \\newcommand{\\fleche}[1]{\\overrightarrow{ #1 }} \\newcommand{\\intervalle}[2]{\\left\\{#1,\\cdots,#2\\right\\}} -\\newcommand{\\loinormale}[2]{{\\cal 
N}\\pa{#1,#2}} -\\newcommand{\\independant}[0]{\\;\\makebox[3ex]{\\makebox[0ex]{\\rule[-0.2ex]{3ex}{.1ex}}\\!\\!\\!\\!\\makebox[.5ex][l]{\\rule[-.2ex]{.1ex}{2ex}}\\makebox[.5ex][l]{\\rule[-.2ex]{.1ex}{2ex}}} \\,\\,} +\\newcommand{\\independant}[0]{\\perp \\!\\!\\! \\perp} \\newcommand{\\esp}{\\mathbb{E}} +\\newcommand{\\espf}[2]{\\mathbb{E}_{#1}\\pa{#2}} \\newcommand{\\var}{\\mathbb{V}} \\newcommand{\\pr}[1]{\\mathbb{P}\\pa{#1}} \\newcommand{\\loi}[0]{{\\cal L}} \\newcommand{\\vecteurno}[2]{#1,\\dots,#2} \\newcommand{\\norm}[1]{\\left\\Vert#1\\right\\Vert} +\\newcommand{\\norme}[1]{\\left\\Vert#1\\right\\Vert} +\\newcommand{\\scal}[2]{\\left<#1,#2\\right>} \\newcommand{\\dans}[0]{\\rightarrow} \\newcommand{\\partialfrac}[2]{\\frac{\\partial #1}{\\partial #2}} \\newcommand{\\partialdfrac}[2]{\\dfrac{\\partial #1}{\\partial #2}} -\\newcommand{\\loimultinomiale}[1]{{\\cal M}\\pa{#1}} \\newcommand{\\trace}[1]{tr\\pa{#1}} +\\newcommand{\\sac}[0]{|} \\newcommand{\\abs}[1]{\\left|#1\\right|} +\\newcommand{\\loinormale}[2]{{\\cal N} \\pa{#1,#2}} +\\newcommand{\\loibinomialea}[1]{{\\cal B} \\pa{#1}} +\\newcommand{\\loibinomiale}[2]{{\\cal B} \\pa{#1,#2}} +\\newcommand{\\loimultinomiale}[1]{{\\cal M} \\pa{#1}} +\\newcommand{\\variance}[1]{\\mathbb{V}\\pa{#1}} +\\newcommand{\\intf}[1]{\\left\\lfloor #1 \\right\\rfloor} """ -# \\usepackage{eepic} - -imgmath_latex_preamble += custom_preamble -latex_elements['preamble'] += custom_preamble -mathdef_link_only = True - -epkg_dictionary.update({ - 'csv': 'https://en.wikipedia.org/wiki/Comma-separated_values', - 'dask': 'https://dask.pydata.org/en/latest/', - 'dataframe': 'https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html', - 'Dataframe': 'https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html', - 'DataFrame': 'https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html', - 'dataframes': 'https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html', - 'dill': 'https://dill.readthedocs.io/en/latest/dill.html', - 'Hadoop': 'http://hadoop.apache.org/', - 'ijson': 'https://github.com/ICRAR/ijson', - 'nan': 'https://numpy.org/doc/stable/reference/constants.html#numpy.NAN', - 'pandas': ('http://pandas.pydata.org/pandas-docs/stable/', - ('http://pandas.pydata.org/pandas-docs/stable/generated/pandas.{0}.html', 1), - ('http://pandas.pydata.org/pandas-docs/stable/generated/pandas.{0}.{1}.html', 2)), - 'pyarrow': 'https://arrow.apache.org/docs/python/', - 'pyspark': 'http://spark.apache.org/docs/2.1.1/api/python/index.html', - 'scikit-multiflow': 'https://scikit-multiflow.github.io/', - 'sklearn': ('http://scikit-learn.org/stable/', - ('http://scikit-learn.org/stable/modules/generated/{0}.html', 1), - ('http://scikit-learn.org/stable/modules/generated/{0}.{1}.html', 2)), - 'streamz': 'https://streamz.readthedocs.io/en/latest/index.html', - 'tornado': 'https://www.tornadoweb.org/en/stable/', -}) + +imgmath_latex_preamble = preamble +latex_elements["preamble"] = imgmath_latex_preamble + + +epkg_dictionary = { + "csv": "https://en.wikipedia.org/wiki/Comma-separated_values", + "dask": "https://dask.pydata.org/en/latest/", + "dataframe": "https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html", + "Dataframe": "https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html", + "DataFrame": "https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html", + "dataframes": 
"https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html", + "dill": "https://dill.readthedocs.io/en/latest/dill.html", + "Hadoop": "http://hadoop.apache.org/", + "ijson": "https://github.com/ICRAR/ijson", + "nan": "https://numpy.org/doc/stable/reference/constants.html#numpy.NAN", + "pandas": ( + "http://pandas.pydata.org/pandas-docs/stable/", + ( + "http://pandas.pydata.org/pandas-docs/stable/generated/pandas.{0}.html", + 1, + ), + ( + "http://pandas.pydata.org/pandas-docs/stable/generated/pandas.{0}.{1}.html", + 2, + ), + ), + "pyarrow": "https://arrow.apache.org/docs/python/", + "pyspark": "http://spark.apache.org/docs/2.1.1/api/python/index.html", + "scikit-multiflow": "https://scikit-multiflow.github.io/", + "sklearn": ( + "http://scikit-learn.org/stable/", + ("http://scikit-learn.org/stable/modules/generated/{0}.html", 1), + ("http://scikit-learn.org/stable/modules/generated/{0}.{1}.html", 2), + ), + "streamz": "https://streamz.readthedocs.io/en/latest/index.html", + "tornado": "https://www.tornadoweb.org/en/stable/", + } diff --git a/_unittests/ut_df/test_connex_split.py b/_unittests/ut_df/test_connex_split.py index 33bd03f..e373c9b 100644 --- a/_unittests/ut_df/test_connex_split.py +++ b/_unittests/ut_df/test_connex_split.py @@ -1,137 +1,174 @@ -# -*- coding: utf-8 -*- -""" -@brief test log(time=4s) -""" import unittest import pandas from pyquickhelper.pycode import ExtTestCase -from pandas_streaming.df import dataframe_shuffle, train_test_split_weights, train_test_connex_split +from pandas_streaming.df import ( + dataframe_shuffle, + train_test_split_weights, + train_test_connex_split, +) class TestConnexSplit(ExtTestCase): - def test_shuffle(self): - df = pandas.DataFrame([dict(a=1, b="e", c=5.6, ind="a1"), - dict(a=2, b="f", c=5.7, ind="a2"), - dict(a=4, b="g", c=5.8, ind="a3"), - dict(a=8, b="h", c=5.9, ind="a4"), - dict(a=16, b="i", c=6.2, ind="a5")]) + df = pandas.DataFrame( + [ + dict(a=1, b="e", c=5.6, ind="a1"), + dict(a=2, b="f", c=5.7, ind="a2"), + dict(a=4, b="g", c=5.8, ind="a3"), + dict(a=8, b="h", c=5.9, ind="a4"), + dict(a=16, b="i", c=6.2, ind="a5"), + ] + ) shuffled = dataframe_shuffle(df, random_state=0) - sorted_ = shuffled.sort_values('a') + sorted_ = shuffled.sort_values("a") self.assertEqualDataFrame(df, sorted_) - df2 = df.set_index('ind') + df2 = df.set_index("ind") shuffled = dataframe_shuffle(df2, random_state=0) - sorted_ = shuffled.sort_values('a') + sorted_ = shuffled.sort_values("a") self.assertEqualDataFrame(df2, sorted_) - df2 = df.set_index(['ind', 'c']) + df2 = df.set_index(["ind", "c"]) shuffled = dataframe_shuffle(df2, random_state=0) - sorted_ = shuffled.sort_values('a') + sorted_ = shuffled.sort_values("a") self.assertEqualDataFrame(df2, sorted_) def test_split_weights_errors(self): - df = pandas.DataFrame([dict(a=1, b="e", c=1), - dict(a=2, b="f", c=1), - dict(a=4, b="g", c=1), - dict(a=8, b="h", c=1), - dict(a=12, b="h", c=1), - dict(a=16, b="i", c=1)]) - - train, test = train_test_split_weights(df, train_size=0.5, weights='c') + df = pandas.DataFrame( + [ + dict(a=1, b="e", c=1), + dict(a=2, b="f", c=1), + dict(a=4, b="g", c=1), + dict(a=8, b="h", c=1), + dict(a=12, b="h", c=1), + dict(a=16, b="i", c=1), + ] + ) + + train, test = train_test_split_weights(df, train_size=0.5, weights="c") self.assertTrue(train is not None) self.assertTrue(test is not None) - self.assertRaise(lambda: train_test_split_weights( - df, test_size=0.5, weights=[0.5, 0.5]), ValueError, 'Dimension') - self.assertRaise(lambda: 
train_test_split_weights( - df, test_size=0), ValueError, 'null') - self.assertRaise(lambda: train_test_split_weights( - df, test_size=0, weights='c'), ValueError, 'null') + self.assertRaise( + lambda: train_test_split_weights(df, test_size=0.5, weights=[0.5, 0.5]), + ValueError, + "Dimension", + ) + self.assertRaise( + lambda: train_test_split_weights(df, test_size=0), ValueError, "null" + ) + self.assertRaise( + lambda: train_test_split_weights(df, test_size=0, weights="c"), + ValueError, + "null", + ) def test_split_weights(self): - df = pandas.DataFrame([dict(a=1, b="e", c=1), - dict(a=2, b="f", c=1), - dict(a=4, b="g", c=1), - dict(a=8, b="h", c=1), - dict(a=12, b="h", c=1), - dict(a=16, b="i", c=1)]) + df = pandas.DataFrame( + [ + dict(a=1, b="e", c=1), + dict(a=2, b="f", c=1), + dict(a=4, b="g", c=1), + dict(a=8, b="h", c=1), + dict(a=12, b="h", c=1), + dict(a=16, b="i", c=1), + ] + ) train, test = train_test_split_weights(df, test_size=0.5) self.assertEqual(train.shape[1], test.shape[1]) self.assertEqual(train.shape[0] + test.shape[0], df.shape[0]) - train, test = train_test_split_weights(df, test_size=0.5, weights='c') + train, test = train_test_split_weights(df, test_size=0.5, weights="c") self.assertEqual(train.shape[1], test.shape[1]) self.assertEqual(train.shape[0] + test.shape[0], df.shape[0]) - train, test = train_test_split_weights( - df, test_size=0.5, weights=df['c']) + train, test = train_test_split_weights(df, test_size=0.5, weights=df["c"]) self.assertEqual(train.shape[1], test.shape[1]) self.assertEqual(train.shape[0] + test.shape[0], df.shape[0]) - df = pandas.DataFrame([dict(a=1, b="e", c=1), - dict(a=2, b="f", c=2), - dict(a=4, b="g", c=3), - dict(a=8, b="h", c=1), - dict(a=12, b="h", c=2), - dict(a=16, b="i", c=3)]) + df = pandas.DataFrame( + [ + dict(a=1, b="e", c=1), + dict(a=2, b="f", c=2), + dict(a=4, b="g", c=3), + dict(a=8, b="h", c=1), + dict(a=12, b="h", c=2), + dict(a=16, b="i", c=3), + ] + ) - train, test = train_test_split_weights(df, test_size=0.5, weights='c', - fail_imbalanced=0.4) + train, test = train_test_split_weights( + df, test_size=0.5, weights="c", fail_imbalanced=0.4 + ) self.assertEqual(train.shape[1], test.shape[1]) self.assertEqual(train.shape[0] + test.shape[0], df.shape[0]) - w1, w2 = train['c'].sum(), test['c'].sum() + w1, w2 = train["c"].sum(), test["c"].sum() delta = abs(w1 - w2) / (w1 + w2) self.assertGreater(0.4, delta) def test_split_connex(self): - df = pandas.DataFrame([dict(user="UA", prod="PA", card="C1"), - dict(user="UA", prod="PB", card="C1"), - dict(user="UB", prod="PC", card="C2"), - dict(user="UB", prod="PD", card="C2"), - dict(user="UC", prod="PE", card="C3"), - dict(user="UC", prod="PF", card="C4"), - dict(user="UD", prod="PG", card="C5"), - ]) + df = pandas.DataFrame( + [ + dict(user="UA", prod="PA", card="C1"), + dict(user="UA", prod="PB", card="C1"), + dict(user="UB", prod="PC", card="C2"), + dict(user="UB", prod="PD", card="C2"), + dict(user="UC", prod="PE", card="C3"), + dict(user="UC", prod="PF", card="C4"), + dict(user="UD", prod="PG", card="C5"), + ] + ) train, test = train_test_connex_split( # pylint: disable=W0632 - df, test_size=0.5, groups=['user', 'prod', 'card'], - fail_imbalanced=0.4) + df, test_size=0.5, groups=["user", "prod", "card"], fail_imbalanced=0.4 + ) self.assertEqual(train.shape[0] + test.shape[0], df.shape[0]) - for col in ['user', 'prod', 'card']: + for col in ["user", "prod", "card"]: s1 = set(train[col]) s2 = set(test[col]) if s1 & s2: raise AssertionError( - f'Non empty intersection {s1} 
& {s2}\n{train}\n{test}') + f"Non empty intersection {s1} & {s2}\n{train}\n{test}" + ) - df['connex'] = 'ole' + df["connex"] = "ole" train, test = train_test_connex_split( # pylint: disable=W0632 - df, test_size=0.5, groups=['user', 'prod', 'card'], - fail_imbalanced=0.4) + df, test_size=0.5, groups=["user", "prod", "card"], fail_imbalanced=0.4 + ) self.assertEqual(train.shape[0] + test.shape[0], df.shape[0]) def test_split_connex2(self): - df = pandas.DataFrame([dict(user="UA", prod="PAA", card="C1"), - dict(user="UA", prod="PB", card="C1"), - dict(user="UB", prod="PC", card="C2"), - dict(user="UB", prod="PD", card="C2"), - dict(user="UC", prod="PAA", card="C3"), - dict(user="UC", prod="PF", card="C4"), - dict(user="UD", prod="PG", card="C5"), - ]) - - train_test_connex_split(df, test_size=0.5, groups=['user', 'prod', 'card'], - fail_imbalanced=0.5, return_cnx=True) - train, test, stats = train_test_connex_split(df, test_size=0.5, - groups=[ - 'user', 'prod', 'card'], - fail_imbalanced=0.5, - return_cnx=True, random_state=0) + df = pandas.DataFrame( + [ + dict(user="UA", prod="PAA", card="C1"), + dict(user="UA", prod="PB", card="C1"), + dict(user="UB", prod="PC", card="C2"), + dict(user="UB", prod="PD", card="C2"), + dict(user="UC", prod="PAA", card="C3"), + dict(user="UC", prod="PF", card="C4"), + dict(user="UD", prod="PG", card="C5"), + ] + ) + + train_test_connex_split( + df, + test_size=0.5, + groups=["user", "prod", "card"], + fail_imbalanced=0.5, + return_cnx=True, + ) + train, test, stats = train_test_connex_split( + df, + test_size=0.5, + groups=["user", "prod", "card"], + fail_imbalanced=0.5, + return_cnx=True, + random_state=0, + ) self.assertEqual(train.shape[0] + test.shape[0], df.shape[0]) - for col in ['user', 'prod', 'card']: + for col in ["user", "prod", "card"]: s1 = set(train[col]) s2 = set(test[col]) if s1 & s2: @@ -139,26 +176,35 @@ def test_split_connex2(self): for k, v in sorted(stats[0].items()): rows.append(f"{k}={v}") raise AssertionError( - 'Non empty intersection {0} & {1}\n{2}\n{3}\n{4}'.format(s1, s2, train, test, "\n".join(rows))) + "Non empty intersection {0} & {1}\n{2}\n{3}\n{4}".format( + s1, s2, train, test, "\n".join(rows) + ) + ) def test_split_connex_missing(self): - df = pandas.DataFrame([dict(user="UA", prod="PAA", card="C1"), - dict(user="UA", prod="PB", card="C1"), - dict(user="UB", prod="PC", card="C2"), - dict(user="UB", prod="PD", card="C2"), - dict(user="UC", prod="PAA", card="C3"), - dict(user="UC", card="C4"), - dict(user="UD", prod="PG"), - ]) - - train, test, stats = train_test_connex_split(df, test_size=0.5, - groups=[ - 'user', 'prod', 'card'], - fail_imbalanced=0.4, - return_cnx=True, random_state=0) + df = pandas.DataFrame( + [ + dict(user="UA", prod="PAA", card="C1"), + dict(user="UA", prod="PB", card="C1"), + dict(user="UB", prod="PC", card="C2"), + dict(user="UB", prod="PD", card="C2"), + dict(user="UC", prod="PAA", card="C3"), + dict(user="UC", card="C4"), + dict(user="UD", prod="PG"), + ] + ) + + train, test, stats = train_test_connex_split( + df, + test_size=0.5, + groups=["user", "prod", "card"], + fail_imbalanced=0.4, + return_cnx=True, + random_state=0, + ) self.assertEqual(train.shape[0] + test.shape[0], df.shape[0]) - for col in ['user', 'prod', 'card']: + for col in ["user", "prod", "card"]: s1 = set(train[col]) s2 = set(test[col]) if s1 & s2: @@ -166,7 +212,10 @@ def test_split_connex_missing(self): for k, v in sorted(stats[0].items()): rows.append(f"{k}={v}") raise AssertionError( - 'Non empty intersection {0} & 
{1}\n{2}\n{3}\n{4}'.format(s1, s2, train, test, "\n".join(rows))) + "Non empty intersection {0} & {1}\n{2}\n{3}\n{4}".format( + s1, s2, train, test, "\n".join(rows) + ) + ) if __name__ == "__main__": diff --git a/_unittests/ut_df/test_connex_split_big.py b/_unittests/ut_df/test_connex_split_big.py index 22292c5..f297ec8 100644 --- a/_unittests/ut_df/test_connex_split_big.py +++ b/_unittests/ut_df/test_connex_split_big.py @@ -1,7 +1,4 @@ # -*- coding: utf-8 -*- -""" -@brief test log(time=30s) -""" import os import unittest from collections import Counter @@ -11,18 +8,19 @@ class TestConnexSplitBig(ExtTestCase): - def test_connex_big(self): data = os.path.join(os.path.dirname(__file__), "data") name = os.path.join(data, "buggy_hash.csv") df = pandas.read_csv(name, sep="\t", encoding="utf-8") - train, test, stats = train_test_connex_split(df, - groups=[ - "cart_id", "mail", "product_id"], - fail_imbalanced=0.9, return_cnx=True) + train, test, stats = train_test_connex_split( + df, + groups=["cart_id", "mail", "product_id"], + fail_imbalanced=0.9, + return_cnx=True, + ) self.assertGreater(train.shape[0], 0) self.assertGreater(test.shape[0], 0) - elements = stats[1]['connex'] + elements = stats[1]["connex"] counts = Counter(elements) nbc = len(counts) maxi = max(counts.values()) @@ -33,14 +31,16 @@ def test_connex_big_approx(self): data = os.path.join(os.path.dirname(__file__), "data") name = os.path.join(data, "buggy_hash.csv") df = pandas.read_csv(name, sep="\t", encoding="utf-8") - train, test, stats = train_test_connex_split(df, - groups=[ - "cart_id", "mail", "product_id"], - stop_if_bigger=0.05, return_cnx=True, - keep_balance=0.8) + train, test, stats = train_test_connex_split( + df, + groups=["cart_id", "mail", "product_id"], + stop_if_bigger=0.05, + return_cnx=True, + keep_balance=0.8, + ) self.assertGreater(train.shape[0], 0) self.assertGreater(test.shape[0], 0) - elements = stats[1]['connex'] + elements = stats[1]["connex"] counts = Counter(elements) nbc = len(counts) maxi = max(counts.values()) @@ -51,14 +51,17 @@ def test_connex_big_approx_must(self): data = os.path.join(os.path.dirname(__file__), "data") name = os.path.join(data, "buggy_hash.csv") df = pandas.read_csv(name, sep="\t", encoding="utf-8") - train, test, stats = train_test_connex_split(df, - groups=[ - "cart_id", "mail", "product_id"], - stop_if_bigger=0.05, return_cnx=True, - keep_balance=0.8, must_groups=["product_id"]) + train, test, stats = train_test_connex_split( + df, + groups=["cart_id", "mail", "product_id"], + stop_if_bigger=0.05, + return_cnx=True, + keep_balance=0.8, + must_groups=["product_id"], + ) self.assertGreater(train.shape[0], 0) self.assertGreater(test.shape[0], 0) - elements = stats[1]['connex'] + elements = stats[1]["connex"] counts = Counter(elements) nbc = len(counts) maxi = max(counts.values()) diff --git a/_unittests/ut_df/test_connex_split_cat.py b/_unittests/ut_df/test_connex_split_cat.py index 27ed49e..3eb55e8 100644 --- a/_unittests/ut_df/test_connex_split_cat.py +++ b/_unittests/ut_df/test_connex_split_cat.py @@ -1,7 +1,5 @@ # -*- coding: utf-8 -*- -""" -@brief test log(time=4s) -""" + import unittest from collections import Counter import pandas @@ -10,63 +8,80 @@ class TestConnexSplitCat(ExtTestCase): - def test_cat_strat(self): - df = pandas.DataFrame([dict(a=1, b="e"), - dict(a=2, b="e"), - dict(a=4, b="f"), - dict(a=8, b="f"), - dict(a=32, b="f"), - dict(a=16, b="f")]) + df = pandas.DataFrame( + [ + dict(a=1, b="e"), + dict(a=2, b="e"), + dict(a=4, b="f"), + dict(a=8, b="f"), + 
dict(a=32, b="f"), + dict(a=16, b="f"), + ] + ) train, test = train_test_apart_stratify( - df, group="a", stratify="b", test_size=0.5) + df, group="a", stratify="b", test_size=0.5 + ) self.assertEqual(train.shape[1], test.shape[1]) self.assertEqual(train.shape[0] + test.shape[0], df.shape[0]) c1 = Counter(train["b"]) c2 = Counter(train["b"]) self.assertEqual(c1, c2) - self.assertRaise(lambda: train_test_apart_stratify(df, group=None, stratify="b", test_size=0.5), - ValueError) - self.assertRaise(lambda: train_test_apart_stratify(df, group="b", test_size=0.5), - ValueError) + self.assertRaise( + lambda: train_test_apart_stratify( + df, group=None, stratify="b", test_size=0.5 + ), + ValueError, + ) + self.assertRaise( + lambda: train_test_apart_stratify(df, group="b", test_size=0.5), ValueError + ) def test_cat_strat_multi(self): - df = pandas.DataFrame([dict(a=1, b="e"), - dict(a=1, b="f"), - dict(a=2, b="e"), - dict(a=2, b="f"), - ]) + df = pandas.DataFrame( + [ + dict(a=1, b="e"), + dict(a=1, b="f"), + dict(a=2, b="e"), + dict(a=2, b="f"), + ] + ) train, test = train_test_apart_stratify( - df, group="a", stratify="b", test_size=0.5) + df, group="a", stratify="b", test_size=0.5 + ) self.assertEqual(train.shape[1], test.shape[1]) self.assertEqual(train.shape[0] + test.shape[0], df.shape[0]) c1 = Counter(train["b"]) c2 = Counter(train["b"]) self.assertEqual(c1, c2) - self.assertEqual(len(set(train['a'])), 1) - self.assertEqual(len(set(test['a'])), 1) - self.assertTrue(set(train['a']) != set(test['a'])) + self.assertEqual(len(set(train["a"])), 1) + self.assertEqual(len(set(test["a"])), 1) + self.assertTrue(set(train["a"]) != set(test["a"])) def test_cat_strat_multi_force(self): - df = pandas.DataFrame([dict(a=1, b="e"), - dict(a=1, b="f"), - dict(a=2, b="e"), - dict(a=2, b="f"), - ]) + df = pandas.DataFrame( + [ + dict(a=1, b="e"), + dict(a=1, b="f"), + dict(a=2, b="e"), + dict(a=2, b="f"), + ] + ) train, test = train_test_apart_stratify( - df, group="a", stratify="b", test_size=0.1, force=True) + df, group="a", stratify="b", test_size=0.1, force=True + ) self.assertEqual(train.shape[1], test.shape[1]) self.assertEqual(train.shape[0] + test.shape[0], df.shape[0]) c1 = Counter(train["b"]) c2 = Counter(train["b"]) self.assertEqual(c1, c2) - self.assertEqual(len(set(train['a'])), 1) - self.assertEqual(len(set(test['a'])), 1) - self.assertTrue(set(train['a']) != set(test['a'])) + self.assertEqual(len(set(train["a"])), 1) + self.assertEqual(len(set(test["a"])), 1) + self.assertTrue(set(train["a"]) != set(test["a"])) if __name__ == "__main__": diff --git a/_unittests/ut_df/test_dataframe_helpers.py b/_unittests/ut_df/test_dataframe_helpers.py index 45f295d..edd0db6 100644 --- a/_unittests/ut_df/test_dataframe_helpers.py +++ b/_unittests/ut_df/test_dataframe_helpers.py @@ -1,7 +1,3 @@ -# -*- coding: utf-8 -*- -""" -@brief test log(time=4s) -""" import os import unittest import numpy @@ -11,13 +7,16 @@ class TestDataFrameHelpers(ExtTestCase): - def test_hash_columns(self): - df = pandas.DataFrame([dict(a=1, b="e", c=5.6, ind="a1", ai=1), - dict(b="f", c=5.7, ind="a2", ai=2), - dict(a=4, b="g", ind="a3", ai=3), - dict(a=8, b="h", c=5.9, ai=4), - dict(a=16, b="i", c=6.2, ind="a5", ai=5)]) + df = pandas.DataFrame( + [ + dict(a=1, b="e", c=5.6, ind="a1", ai=1), + dict(b="f", c=5.7, ind="a2", ai=2), + dict(a=4, b="g", ind="a3", ai=3), + dict(a=8, b="h", c=5.9, ai=4), + dict(a=16, b="i", c=6.2, ind="a5", ai=5), + ] + ) df2 = dataframe_hash_columns(df) self.assertEqual(df2.shape, df.shape) for j in 
range(df.shape[1]): diff --git a/_unittests/ut_df/test_dataframe_helpers_simple.py b/_unittests/ut_df/test_dataframe_helpers_simple.py index 79545c1..5d68296 100644 --- a/_unittests/ut_df/test_dataframe_helpers_simple.py +++ b/_unittests/ut_df/test_dataframe_helpers_simple.py @@ -1,7 +1,3 @@ -# -*- coding: utf-8 -*- -""" -@brief test log(time=4s) -""" import unittest import pandas import numpy @@ -11,34 +7,36 @@ class TestDataFrameHelpersSimple(ExtTestCase): - def test_unfold(self): - df = pandas.DataFrame([dict(a=1, b="e,f"), - dict(a=2, b="g"), - dict(a=3)]) + df = pandas.DataFrame([dict(a=1, b="e,f"), dict(a=2, b="g"), dict(a=3)]) df2 = dataframe_unfold(df, "b") - exp = pandas.DataFrame([dict(a=1, b="e,f", b_unfold="e"), - dict(a=1, b="e,f", b_unfold="f"), - dict(a=2, b="g", b_unfold="g"), - dict(a=3)]) + exp = pandas.DataFrame( + [ + dict(a=1, b="e,f", b_unfold="e"), + dict(a=1, b="e,f", b_unfold="f"), + dict(a=2, b="g", b_unfold="g"), + dict(a=3), + ] + ) self.assertEqualDataFrame(df2, exp) # fold - folded = df2.groupby('a').apply(lambda row: ','.join( - row['b_unfold'].dropna()) if len(row['b_unfold'].dropna()) > 0 else numpy.nan) + folded = df2.groupby("a").apply( + lambda row: ",".join(row["b_unfold"].dropna()) + if len(row["b_unfold"].dropna()) > 0 + else numpy.nan + ) bf = folded.reset_index(drop=False) - bf.columns = ['a', 'b'] + bf.columns = ["a", "b"] self.assertEqualDataFrame(df, bf) def test_hash_except(self): - self.assertRaise(lambda: hash_int(0.1, 3), - ValueError, "numpy.nan expected") + self.assertRaise(lambda: hash_int(0.1, 3), ValueError, "numpy.nan expected") r = hash_int(numpy.nan, 3) self.assertTrue(numpy.isnan(r)) - self.assertRaise(lambda: hash_str(0.1, 3), - ValueError, "numpy.nan expected") + self.assertRaise(lambda: hash_str(0.1, 3), ValueError, "numpy.nan expected") r = hash_str(numpy.nan, 3) self.assertTrue(numpy.isnan(r)) diff --git a/_unittests/ut_df/test_dataframe_io.py b/_unittests/ut_df/test_dataframe_io.py index d8e51a1..3e2125a 100644 --- a/_unittests/ut_df/test_dataframe_io.py +++ b/_unittests/ut_df/test_dataframe_io.py @@ -1,7 +1,3 @@ -# -*- coding: utf-8 -*- -""" -@brief test log(time=4s) -""" import os import unittest import io @@ -13,13 +9,16 @@ class TestDataFrameIO(ExtTestCase): - def test_zip_dataframe(self): - df = pandas.DataFrame([dict(a=1, b="eé", c=5.6, ind="a1", ai=1), - dict(b="f", c=5.7, ind="a2", ai=2), - dict(a=4, b="g", ind="a3", ai=3), - dict(a=8, b="h", c=5.9, ai=4), - dict(a=16, b="i", c=6.2, ind="a5", ai=5)]) + df = pandas.DataFrame( + [ + dict(a=1, b="eé", c=5.6, ind="a1", ai=1), + dict(b="f", c=5.7, ind="a2", ai=2), + dict(a=4, b="g", ind="a3", ai=3), + dict(a=8, b="h", c=5.9, ai=4), + dict(a=16, b="i", c=6.2, ind="a5", ai=5), + ] + ) temp = get_temp_folder(__file__, "temp_zip") name = os.path.join(temp, "df.zip") @@ -28,13 +27,13 @@ def test_zip_dataframe(self): self.assertEqualDataFrame(df, df2) st = io.BytesIO() - zp = zipfile.ZipFile(st, 'w') + zp = zipfile.ZipFile(st, "w") to_zip(df, zp, encoding="utf-8", index=False) zp.close() st = io.BytesIO(st.getvalue()) - zp = zipfile.ZipFile(st, 'r') - df3 = read_zip(zp, encoding='utf-8') + zp = zipfile.ZipFile(st, "r") + df3 = read_zip(zp, encoding="utf-8") zp.close() self.assertEqualDataFrame(df, df3) @@ -49,12 +48,12 @@ def test_zip_numpy(self): self.assertEqualArray(df, df2) st = io.BytesIO() - zp = zipfile.ZipFile(st, 'w') + zp = zipfile.ZipFile(st, "w") to_zip(df, zp, "arr.npy") zp.close() st = io.BytesIO(st.getvalue()) - zp = zipfile.ZipFile(st, 'r') + zp = 
zipfile.ZipFile(st, "r") df3 = read_zip(zp, "arr.npy") zp.close() self.assertEqualArray(df, df3) diff --git a/_unittests/ut_df/test_dataframe_io_helpers.py b/_unittests/ut_df/test_dataframe_io_helpers.py index c6102a0..403a087 100644 --- a/_unittests/ut_df/test_dataframe_io_helpers.py +++ b/_unittests/ut_df/test_dataframe_io_helpers.py @@ -1,21 +1,18 @@ -# -*- coding: utf-8 -*- -# pylint: disable=E1101 -""" -@brief test log(time=4s) -""" import unittest from io import StringIO, BytesIO from json import loads import pandas from pyquickhelper.pycode import ExtTestCase from pandas_streaming.df.dataframe_io_helpers import ( - enumerate_json_items, JsonPerRowsStream, JsonIterator2Stream) + enumerate_json_items, + JsonPerRowsStream, + JsonIterator2Stream, +) from pandas_streaming.df import StreamingDataFrame class TestDataFrameIOHelpers(ExtTestCase): - - text_json = b''' + text_json = b""" [ { "glossary": { @@ -62,28 +59,30 @@ class TestDataFrameIOHelpers(ExtTestCase): } } ] - ''' + """ text_json_exp = [ { "glossary": { "title": "example glossary", "GlossDiv": { "title": "S", - "GlossList": [{ - "GlossEntry": { - "ID": "SGML", - "SortAs": "SGML", - "GlossTerm": "Standard Generalized Markup Language", - "Acronym": "SGML", - "Abbrev": "ISO 8879:1986", - "GlossDef": { - "para": "A meta-markup language, used to create markup languages such as DocBook.", - "GlossSeeAlso": ["GML", "XML"] - }, - "GlossSee": "markup" + "GlossList": [ + { + "GlossEntry": { + "ID": "SGML", + "SortAs": "SGML", + "GlossTerm": "Standard Generalized Markup Language", + "Acronym": "SGML", + "Abbrev": "ISO 8879:1986", + "GlossDef": { + "para": "A meta-markup language, used to create markup languages such as DocBook.", + "GlossSeeAlso": ["GML", "XML"], + }, + "GlossSee": "markup", + } } - }] - } + ], + }, } }, { @@ -92,56 +91,65 @@ class TestDataFrameIOHelpers(ExtTestCase): "GlossDiv": { "title": "X", "GlossList": { - "GlossEntry": [{ - "ID": "SGML", - "SortAs": "SGML", - "GlossTerm": "Standard Generalized Markup Language", - "Acronym": "SGML", - "Abbrev": "ISO 8879:1986", - "GlossDef": { - "para": "A meta-markup language, used to create markup languages such as DocBook.", - "GlossSeeAlso": ["GML", "XML"] - }, - "GlossSee": "markup" - }] - } - } + "GlossEntry": [ + { + "ID": "SGML", + "SortAs": "SGML", + "GlossTerm": "Standard Generalized Markup Language", + "Acronym": "SGML", + "Abbrev": "ISO 8879:1986", + "GlossDef": { + "para": "A meta-markup language, used to create markup languages such as DocBook.", + "GlossSeeAlso": ["GML", "XML"], + }, + "GlossSee": "markup", + } + ] + }, + }, } - } + }, ] def test_enumerate_json_items(self): items = list(enumerate_json_items(TestDataFrameIOHelpers.text_json)) self.assertEqual(TestDataFrameIOHelpers.text_json_exp, items) - items = list(enumerate_json_items( - BytesIO(TestDataFrameIOHelpers.text_json))) + items = list(enumerate_json_items(BytesIO(TestDataFrameIOHelpers.text_json))) self.assertEqual(TestDataFrameIOHelpers.text_json_exp, items) - items = list(enumerate_json_items( - BytesIO(TestDataFrameIOHelpers.text_json))) + items = list(enumerate_json_items(BytesIO(TestDataFrameIOHelpers.text_json))) self.assertEqual(TestDataFrameIOHelpers.text_json_exp, items) def test_read_json_raw(self): - data = [{'id': 1, 'name': {'first': 'Coleen', 'last': 'Volk'}}, - {'name': {'given': 'Mose', 'family': 'Regner'}}, - {'id': 2, 'name': 'FayeRaker'}] + data = [ + {"id": 1, "name": {"first": "Coleen", "last": "Volk"}}, + {"name": {"given": "Mose", "family": "Regner"}}, + {"id": 2, "name": 
"FayeRaker"}, + ] exp = """[{"id":1.0,"name":null,"name.family":null,"name.first":"Coleen","name.given":null,"name.last":"Volk"}, {"id":null,"name":null,"name.family":"Regner","name.first":null,"name.given":"Mose","name.last":null}, {"id":2.0,"name":"FayeRaker","name.family":null,"name.first":null, - "name.given":null,"name.last":null}]""".replace(" ", "").replace("\n", "") - self.assertRaise(lambda: StreamingDataFrame.read_json( - data), NotImplementedError) + "name.given":null,"name.last":null}]""".replace( + " ", "" + ).replace( + "\n", "" + ) + self.assertRaise( + lambda: StreamingDataFrame.read_json(data), NotImplementedError + ) it = StreamingDataFrame.read_json(data, flatten=True) dfs = list(it) self.assertEqual(len(dfs), 1) - js = dfs[0].to_json(orient='records') + js = dfs[0].to_json(orient="records") js_read = loads(js) js_exp = loads(exp) self.assertEqual(js_exp, js_read) def test_read_json_raw_head(self): - data = [{'id': 1, 'name': {'first': 'Coleen', 'last': 'Volk'}}, - {'name': {'given': 'Mose', 'family': 'Regner'}}, - {'id': 2, 'name': 'FayeRaker'}] + data = [ + {"id": 1, "name": {"first": "Coleen", "last": "Volk"}}, + {"name": {"given": "Mose", "family": "Regner"}}, + {"id": 2, "name": "FayeRaker"}, + ] it = StreamingDataFrame.read_json(data, flatten=True, chunksize=1) h1 = it.head() h2 = it.head() @@ -150,36 +158,36 @@ def test_read_json_raw_head(self): self.assertGreater(h2.shape[0], 1) def test_pandas_json_chunksize(self): - jsonl = '''{"a": 1, "b": 2} - {"a": 3, "b": 4}''' + jsonl = """{"a": 1, "b": 2} + {"a": 3, "b": 4}""" df = pandas.read_json(jsonl, lines=True) idf = pandas.read_json(jsonl, lines=True, chunksize=2) ldf = list(idf) self.assertEqualDataFrame(df, ldf[0]) def test_read_json_rows(self): - data = '''{"a": 1, "b": 2} - {"a": 3, "b": 4}''' + data = """{"a": 1, "b": 2} + {"a": 3, "b": 4}""" it = StreamingDataFrame.read_json(StringIO(data), lines=True) dfs = list(it) self.assertEqual(len(dfs), 1) - js = dfs[0].to_json(orient='records') + js = dfs[0].to_json(orient="records") self.assertEqual(js, '[{"a":1,"b":2},{"a":3,"b":4}]') def test_read_json_rows2(self): - data = b'''{"a": 1, "b": 2} - {"a": 3, "b": 4}''' + data = b"""{"a": 1, "b": 2} + {"a": 3, "b": 4}""" dfs = pandas.read_json(BytesIO(data), lines=True) self.assertEqual(dfs.shape, (2, 2)) it = StreamingDataFrame.read_json(BytesIO(data), lines="stream") dfs = list(it) self.assertEqual(len(dfs), 1) - js = dfs[0].to_json(orient='records') + js = dfs[0].to_json(orient="records") self.assertEqual('[{"a":1,"b":2},{"a":3,"b":4}]', js) def test_read_json_rows2_head(self): - data = b'''{"a": 1, "b": 2} - {"a": 3, "b": 4}''' + data = b"""{"a": 1, "b": 2} + {"a": 3, "b": 4}""" dfs = pandas.read_json(BytesIO(data), lines=True) self.assertEqual(dfs.shape, (2, 2)) it = StreamingDataFrame.read_json(BytesIO(data), lines="stream") @@ -190,8 +198,8 @@ def test_read_json_rows2_head(self): self.assertEqualDataFrame(h1, h2) def test_read_json_rows_file_head(self): - data = self.abs_path_join(__file__, 'data', 'example2.json') - dfs = pandas.read_json(data, orient='records') + data = self.abs_path_join(__file__, "data", "example2.json") + dfs = pandas.read_json(data, orient="records") self.assertEqual(dfs.shape, (2, 2)) it = StreamingDataFrame.read_json(data) h1 = it.head() @@ -201,8 +209,8 @@ def test_read_json_rows_file_head(self): self.assertEqualDataFrame(h1, h2) def test_read_json_rows_file_lines_head(self): - data = self.abs_path_join(__file__, 'data', 'example.json') - dfs = pandas.read_json(data, orient='records', 
lines=True) + data = self.abs_path_join(__file__, "data", "example.json") + dfs = pandas.read_json(data, orient="records", lines=True) self.assertEqual(dfs.shape, (2, 2)) it = StreamingDataFrame.read_json(data, lines="stream") h1 = it.head() @@ -212,12 +220,11 @@ def test_read_json_rows_file_lines_head(self): self.assertEqualDataFrame(h1, h2) def test_read_json_ijson(self): - it = StreamingDataFrame.read_json( - BytesIO(TestDataFrameIOHelpers.text_json)) + it = StreamingDataFrame.read_json(BytesIO(TestDataFrameIOHelpers.text_json)) dfs = list(it) self.assertEqual(len(dfs), 1) - js = dfs[0].to_json(orient='records', lines=True) - jsjson = loads('[' + js.replace("\n", ",").strip(',') + ']') + js = dfs[0].to_json(orient="records", lines=True) + jsjson = loads("[" + js.replace("\n", ",").strip(",") + "]") self.assertEqual(jsjson, TestDataFrameIOHelpers.text_json_exp) def test_read_json_stream(self): @@ -239,33 +246,39 @@ def test_read_json_stream(self): self.assertEqual(val, exp) def test_enumerate_json_items_lines(self): - data = b'''{"a": 1, "b": 2} - {"a": 3, "b": 4}''' + data = b"""{"a": 1, "b": 2} + {"a": 3, "b": 4}""" items = list(enumerate_json_items(data, lines=True)) - self.assertEqual(items, [{'a': 1, 'b': 2}, {'a': 3, 'b': 4}]) + self.assertEqual(items, [{"a": 1, "b": 2}, {"a": 3, "b": 4}]) def test_read_json_file2(self): - data = b'''{"a": {"c": 1}, "b": [2, 3]} - {"a": {"a": 3}, "b": [4, 5, "r"]}''' + data = b"""{"a": {"c": 1}, "b": [2, 3]} + {"a": {"a": 3}, "b": [4, 5, "r"]}""" - obj1 = list(enumerate_json_items( - BytesIO(data), flatten=False, lines=True)) - obj2 = list(enumerate_json_items( - BytesIO(data), flatten=True, lines=True)) + obj1 = list(enumerate_json_items(BytesIO(data), flatten=False, lines=True)) + obj2 = list(enumerate_json_items(BytesIO(data), flatten=True, lines=True)) self.assertNotEqual(obj1, obj2) - self.assertEqual(obj2, [{'a_c': 1, 'b_0': 2, 'b_1': 3}, - {'a_a': 3, 'b_0': 4, 'b_1': 5, 'b_2': 'r'}]) + self.assertEqual( + obj2, + [ + {"a_c": 1, "b_0": 2, "b_1": 3}, + {"a_a": 3, "b_0": 4, "b_1": 5, "b_2": "r"}, + ], + ) - it = StreamingDataFrame.read_json( - BytesIO(data), lines="stream", flatten=True) + it = StreamingDataFrame.read_json(BytesIO(data), lines="stream", flatten=True) dfs = list(it) - self.assertEqual(['a_a', 'a_c', 'b_0', 'b_1', 'b_2'], - list(sorted(dfs[0].columns)), ) + self.assertEqual( + ["a_a", "a_c", "b_0", "b_1", "b_2"], + list(sorted(dfs[0].columns)), + ) self.assertEqual(len(dfs), 1) - js = dfs[0].to_json(orient='records', lines=True) - jsjson = loads('[' + js.replace("\n", ",").strip(',') + ']') - exp = [{'a_a': None, 'a_c': 1.0, 'b_0': 2, 'b_1': 3, 'b_2': None}, - {'a_a': 3.0, 'a_c': None, 'b_0': 4, 'b_1': 5, 'b_2': 'r'}] + js = dfs[0].to_json(orient="records", lines=True) + jsjson = loads("[" + js.replace("\n", ",").strip(",") + "]") + exp = [ + {"a_a": None, "a_c": 1.0, "b_0": 2, "b_1": 3, "b_2": None}, + {"a_a": 3.0, "a_c": None, "b_0": 4, "b_1": 5, "b_2": "r"}, + ] self.assertEqual(exp, jsjson) def test_read_json_item(self): @@ -282,18 +295,19 @@ def test_read_json_item(self): def test_bug_documentation(self): items = [] for item in JsonIterator2Stream( - lambda: enumerate_json_items(TestDataFrameIOHelpers.text_json)): + lambda: enumerate_json_items(TestDataFrameIOHelpers.text_json) + ): items.append(item) self.assertEqual(len(items), 2) def test_read_json_classic(self): - data = self.abs_path_join(__file__, 'data', 'classic.json') - dfs = pandas.read_json(data, orient='records') - dfs['ts2'] = dfs['ts'].apply(lambda t: t / 1e9) 
+ data = self.abs_path_join(__file__, "data", "classic.json") + dfs = pandas.read_json(data, orient="records") + dfs["ts2"] = dfs["ts"].apply(lambda t: t / 1e9) self.assertEqual(dfs.shape[1], 9) self.assertGreater(dfs.shape[0], 2) it = StreamingDataFrame.read_json(data) - it['ts2'] = it['ts'].apply(lambda t: t / 1e9) + it["ts2"] = it["ts"].apply(lambda t: t / 1e9) h1 = it.to_df() h2 = it.to_df() self.assertNotEmpty(h1) @@ -302,12 +316,12 @@ def test_read_json_classic(self): self.assertEqual(h1.shape[1], 9) def test_read_json_classic_file(self): - data = self.abs_path_join(__file__, 'data', 'classic.json') - dfs = pandas.read_json(data, orient='records') + data = self.abs_path_join(__file__, "data", "classic.json") + dfs = pandas.read_json(data, orient="records") self.assertEqual(dfs.shape[1], 8) self.assertGreater(dfs.shape[0], 2) with open(data, "r", encoding="utf-8") as f: - it = StreamingDataFrame.read_json(f, orient='records') + it = StreamingDataFrame.read_json(f, orient="records") h1 = it.to_df() h2 = it.to_df() self.assertNotEmpty(h1) @@ -316,14 +330,14 @@ def test_read_json_classic_file(self): self.assertEqual(h1.shape[1], 8) def test_read_json_classic_file_formula(self): - data = self.abs_path_join(__file__, 'data', 'classic.json') - dfs = pandas.read_json(data, orient='records') - dfs['ts2'] = dfs['ts'].apply(lambda t: t / 1e9) + data = self.abs_path_join(__file__, "data", "classic.json") + dfs = pandas.read_json(data, orient="records") + dfs["ts2"] = dfs["ts"].apply(lambda t: t / 1e9) self.assertEqual(dfs.shape[1], 9) self.assertGreater(dfs.shape[0], 2) with open(data, "r", encoding="utf-8") as f: it = StreamingDataFrame.read_json(f) - it['ts2'] = it['ts'].apply(lambda t: t / 1e9) + it["ts2"] = it["ts"].apply(lambda t: t / 1e9) h1 = it.to_df() h2 = it.to_df() self.assertNotEmpty(h1) diff --git a/_unittests/ut_df/test_dataframe_sort.py b/_unittests/ut_df/test_dataframe_sort.py index d6f1202..354e4d5 100644 --- a/_unittests/ut_df/test_dataframe_sort.py +++ b/_unittests/ut_df/test_dataframe_sort.py @@ -1,7 +1,3 @@ -# -*- coding: utf-8 -*- -""" -@brief test log(time=4s) -""" import os import unittest import pandas @@ -10,15 +6,18 @@ class TestDataFrameSort(ExtTestCase): - def test_sort_values(self): temp = get_temp_folder(__file__, "temp_sort_values") name = os.path.join(temp, "_data_") - df = pandas.DataFrame([dict(a=1, b="eé", c=5.6, ind="a1", ai=1), - dict(a=5, b="f", c=5.7, ind="a2", ai=2), - dict(a=4, b="g", ind="a3", ai=3), - dict(a=8, b="h", c=5.9, ai=4), - dict(a=16, b="i", c=6.2, ind="a5", ai=5)]) + df = pandas.DataFrame( + [ + dict(a=1, b="eé", c=5.6, ind="a1", ai=1), + dict(a=5, b="f", c=5.7, ind="a2", ai=2), + dict(a=4, b="g", ind="a3", ai=3), + dict(a=8, b="h", c=5.9, ai=4), + dict(a=16, b="i", c=6.2, ind="a5", ai=5), + ] + ) sdf = StreamingDataFrame.read_df(df, chunksize=2) sorted_df = df.sort_values(by="a") res = sdf.sort_values(by="a", temp_file=name) @@ -28,11 +27,15 @@ def test_sort_values(self): def test_sort_values_twice(self): temp = get_temp_folder(__file__, "temp_sort_values_twice") name = os.path.join(temp, "_data_") - df = pandas.DataFrame([dict(a=1, b="eé", c=5.6, ind="a1", ai=1), - dict(a=5, b="f", c=5.7, ind="a2", ai=2), - dict(a=4, b="g", ind="a3", ai=3), - dict(a=8, b="h", c=5.9, ai=4), - dict(a=16, b="i", c=6.2, ind="a5", ai=5)]) + df = pandas.DataFrame( + [ + dict(a=1, b="eé", c=5.6, ind="a1", ai=1), + dict(a=5, b="f", c=5.7, ind="a2", ai=2), + dict(a=4, b="g", ind="a3", ai=3), + dict(a=8, b="h", c=5.9, ai=4), + dict(a=16, b="i", c=6.2, ind="a5", 
ai=5), + ] + ) sdf = StreamingDataFrame.read_df(df, chunksize=2) sorted_df = df.sort_values(by="a") res = sdf.sort_values(by="a", temp_file=name) @@ -44,11 +47,15 @@ def test_sort_values_twice(self): def test_sort_values_reverse(self): temp = get_temp_folder(__file__, "temp_sort_values_reverse") name = os.path.join(temp, "_data_") - df = pandas.DataFrame([dict(a=1, b="eé", c=5.6, ind="a1", ai=1), - dict(a=5, b="f", c=5.7, ind="a2", ai=2), - dict(a=4, b="g", ind="a3", ai=3), - dict(a=8, b="h", c=5.9, ai=4), - dict(a=16, b="i", c=6.2, ind="a5", ai=5)]) + df = pandas.DataFrame( + [ + dict(a=1, b="eé", c=5.6, ind="a1", ai=1), + dict(a=5, b="f", c=5.7, ind="a2", ai=2), + dict(a=4, b="g", ind="a3", ai=3), + dict(a=8, b="h", c=5.9, ai=4), + dict(a=16, b="i", c=6.2, ind="a5", ai=5), + ] + ) sdf = StreamingDataFrame.read_df(df, chunksize=2) sorted_df = df.sort_values(by="a", ascending=False) res = sdf.sort_values(by="a", temp_file=name, ascending=False) @@ -58,30 +65,38 @@ def test_sort_values_reverse(self): def test_sort_values_nan_last(self): temp = get_temp_folder(__file__, "temp_sort_values_nan_last") name = os.path.join(temp, "_data_") - df = pandas.DataFrame([dict(a=1, b="eé", c=5.6, ind="a1", ai=1), - dict(b="f", c=5.7, ind="a2", ai=2), - dict(b="f", c=5.8, ind="a2", ai=2), - dict(a=4, b="g", ind="a3", ai=3), - dict(a=8, b="h", c=5.9, ai=4), - dict(a=16, b="i", c=6.2, ind="a5", ai=5)]) + df = pandas.DataFrame( + [ + dict(a=1, b="eé", c=5.6, ind="a1", ai=1), + dict(b="f", c=5.7, ind="a2", ai=2), + dict(b="f", c=5.8, ind="a2", ai=2), + dict(a=4, b="g", ind="a3", ai=3), + dict(a=8, b="h", c=5.9, ai=4), + dict(a=16, b="i", c=6.2, ind="a5", ai=5), + ] + ) sdf = StreamingDataFrame.read_df(df, chunksize=2) - sorted_df = df.sort_values(by="a", na_position='last') - res = sdf.sort_values(by="a", temp_file=name, na_position='last') + sorted_df = df.sort_values(by="a", na_position="last") + res = sdf.sort_values(by="a", temp_file=name, na_position="last") res_df = res.to_df() self.assertEqualDataFrame(sorted_df, res_df) def test_sort_values_nan_first(self): temp = get_temp_folder(__file__, "temp_sort_values_nan_first") name = os.path.join(temp, "_data_") - df = pandas.DataFrame([dict(a=1, b="eé", c=5.6, ind="a1", ai=1), - dict(b="f", c=5.7, ind="a2", ai=2), - dict(b="f", c=5.8, ind="a2", ai=2), - dict(a=4, b="g", ind="a3", ai=3), - dict(a=8, b="h", c=5.9, ai=4), - dict(a=16, b="i", c=6.2, ind="a5", ai=5)]) + df = pandas.DataFrame( + [ + dict(a=1, b="eé", c=5.6, ind="a1", ai=1), + dict(b="f", c=5.7, ind="a2", ai=2), + dict(b="f", c=5.8, ind="a2", ai=2), + dict(a=4, b="g", ind="a3", ai=3), + dict(a=8, b="h", c=5.9, ai=4), + dict(a=16, b="i", c=6.2, ind="a5", ai=5), + ] + ) sdf = StreamingDataFrame.read_df(df, chunksize=2) - sorted_df = df.sort_values(by="a", na_position='first') - res = sdf.sort_values(by="a", temp_file=name, na_position='first') + sorted_df = df.sort_values(by="a", na_position="first") + res = sdf.sort_values(by="a", temp_file=name, na_position="first") res_df = res.to_df() self.assertEqualDataFrame(sorted_df, res_df) diff --git a/_unittests/ut_df/test_pandas_groupbynan.py b/_unittests/ut_df/test_pandas_groupbynan.py index 94482d5..3d9a635 100644 --- a/_unittests/ut_df/test_pandas_groupbynan.py +++ b/_unittests/ut_df/test_pandas_groupbynan.py @@ -1,7 +1,3 @@ -# coding: utf-8 -""" -@brief test log(time=1s) -""" import unittest import pandas import numpy @@ -11,19 +7,18 @@ class TestPandasHelper(ExtTestCase): - def test_pandas_groupbynan(self): self.assertTrue(sparse_lsqr is not None) - 
types = [(int, -10), (float, -20.2), (str, "e"), - (bytes, bytes("a", "ascii"))] + types = [(int, -10), (float, -20.2), (str, "e"), (bytes, bytes("a", "ascii"))] skip = (numpy.bool_, numpy.complex64, numpy.complex128) types += [(_, _(5)) for _ in numpy_types() if _ not in skip] for ty in types: - data = [{"this": "cst", "type": "tt1=" + str(ty[0]), "value": ty[1]}, - {"this": "cst", "type": "tt2=" + - str(ty[0]), "value": ty[1]}, - {"this": "cst", "type": "row_for_nan"}] + data = [ + {"this": "cst", "type": "tt1=" + str(ty[0]), "value": ty[1]}, + {"this": "cst", "type": "tt2=" + str(ty[0]), "value": ty[1]}, + {"this": "cst", "type": "row_for_nan"}, + ] df = pandas.DataFrame(data) gr = pandas_groupby_nan(df, "value") co = gr.sum() @@ -37,13 +32,16 @@ def test_pandas_groupbynan(self): except AssertionError as e: raise AssertionError( "Issue with value {}\n--df--\n{}\n--gr--\n{}\n--co--\n{}".format( - li, df, gr.count(), co)) from e + li, df, gr.count(), co + ) + ) from e for ty in types: - data = [{"this": "cst", "type": "tt1=" + str(ty[0]), "value": ty[1]}, - {"this": "cst", "type": "tt2=" + - str(ty[0]), "value": ty[1]}, - {"this": "cst", "type": "row_for_nan"}] + data = [ + {"this": "cst", "type": "tt1=" + str(ty[0]), "value": ty[1]}, + {"this": "cst", "type": "tt2=" + str(ty[0]), "value": ty[1]}, + {"this": "cst", "type": "row_for_nan"}, + ] df = pandas.DataFrame(data) try: gr = pandas_groupby_nan(df, ("value", "this")) @@ -68,8 +66,12 @@ def test_pandas_groupbynan(self): self.assertEqual(len(li), 2) def test_pandas_groupbynan_tuple(self): - data = [dict(a="a", b="b", c="c", n=1), dict( - b="b", n=2), dict(a="a", n=3), dict(c="c", n=4)] + data = [ + dict(a="a", b="b", c="c", n=1), + dict(b="b", n=2), + dict(a="a", n=3), + dict(c="c", n=4), + ] df = pandas.DataFrame(data) gr = df.groupby(["a", "b", "c"]).sum() self.assertEqual(gr.shape, (1, 1)) @@ -77,7 +79,8 @@ def test_pandas_groupbynan_tuple(self): for nanback in [True, False]: try: gr2_ = pandas_groupby_nan( - df, ["a", "b", "c"], nanback=nanback, suffix="NAN") + df, ["a", "b", "c"], nanback=nanback, suffix="NAN" + ) except NotImplementedError: continue gr2 = gr2_.sum().sort_values("n") @@ -101,36 +104,42 @@ def test_pandas_groupbynan_regular_nanback(self): self.assertEqual(len(gr), 1) def test_pandas_groupbynan_doc(self): - data = [dict(a=2, ind="a", n=1), - dict(a=2, ind="a"), - dict(a=3, ind="b"), - dict(a=30)] + data = [ + dict(a=2, ind="a", n=1), + dict(a=2, ind="a"), + dict(a=3, ind="b"), + dict(a=30), + ] df = pandas.DataFrame(data) gr2 = pandas_groupby_nan(df, ["ind"]).sum() - ind = list(gr2['ind']) + ind = list(gr2["ind"]) self.assertTrue(numpy.isnan(ind[-1])) - val = list(gr2['a']) + val = list(gr2["a"]) self.assertEqual(val[-1], 30) @ignore_warnings(UserWarning) def test_pandas_groupbynan_doc2(self): - data = [dict(a=2, ind="a", n=1), - dict(a=2, ind="a"), - dict(a=3, ind="b"), - dict(a=30)] + data = [ + dict(a=2, ind="a", n=1), + dict(a=2, ind="a"), + dict(a=3, ind="b"), + dict(a=30), + ] df = pandas.DataFrame(data) gr2 = pandas_groupby_nan(df, ["ind", "a"], nanback=False).sum() - ind = list(gr2['ind']) + ind = list(gr2["ind"]) self.assertEqual(ind[-1], "²nan") def test_pandas_groupbynan_doc3(self): - data = [dict(a=2, ind="a", n=1), - dict(a=2, ind="a"), - dict(a=3, ind="b"), - dict(a=30)] + data = [ + dict(a=2, ind="a", n=1), + dict(a=2, ind="a"), + dict(a=3, ind="b"), + dict(a=30), + ] df = pandas.DataFrame(data) gr2 = pandas_groupby_nan(df, ["ind", "n"]).sum() - ind = list(gr2['ind']) + ind = list(gr2["ind"]) 
self.assertTrue(numpy.isnan(ind[-1])) diff --git a/_unittests/ut_df/test_streaming_dataframe.py b/_unittests/ut_df/test_streaming_dataframe.py index 11fdc51..b62f9a5 100644 --- a/_unittests/ut_df/test_streaming_dataframe.py +++ b/_unittests/ut_df/test_streaming_dataframe.py @@ -1,21 +1,15 @@ -# -*- coding: utf-8 -*- -""" -@brief test log(time=4s) -""" import os import unittest from io import StringIO import pandas import numpy -from pyquickhelper.pycode import ( - ExtTestCase, get_temp_folder, ignore_warnings) +from pyquickhelper.pycode import ExtTestCase, get_temp_folder, ignore_warnings from pandas_streaming.data import dummy_streaming_dataframe from pandas_streaming.df import StreamingDataFrame from pandas_streaming.df.dataframe import StreamingDataFrameSchemaError class TestStreamingDataFrame(ExtTestCase): - def test_shape(self): sdf = dummy_streaming_dataframe(100) dfs = list(sdf) @@ -34,11 +28,9 @@ def test_init(self): def test_to_csv(self): sdf = dummy_streaming_dataframe(100) st = sdf.to_csv() - self.assertStartsWith(",cint,cstr\n0,0,s0", - st.replace('\r', '')) + self.assertStartsWith(",cint,cstr\n0,0,s0", st.replace("\r", "")) st = sdf.to_csv() - self.assertStartsWith(",cint,cstr\n0,0,s0", - st.replace('\r', '')) + self.assertStartsWith(",cint,cstr\n0,0,s0", st.replace("\r", "")) def test_iterrows(self): sdf = dummy_streaming_dataframe(100) @@ -74,43 +66,42 @@ def test_read_csv(self): sdf = StreamingDataFrame.read_csv(name) text = sdf.to_csv(index=False) self.assertRaise( - lambda: StreamingDataFrame.read_csv( - name2, index_col=0, chunksize=None), - ValueError) + lambda: StreamingDataFrame.read_csv(name2, index_col=0, chunksize=None), + ValueError, + ) self.assertRaise( - lambda: StreamingDataFrame.read_csv( - name2, index_col=0, iterator=False), - ValueError) + lambda: StreamingDataFrame.read_csv(name2, index_col=0, iterator=False), + ValueError, + ) sdf2 = StreamingDataFrame.read_csv(name2, index_col=0) text2 = sdf2.to_csv(index=True) sdf2.to_csv(name3, index=True) - with open(name, "r", encoding='utf-8') as f: + with open(name, "r", encoding="utf-8") as f: exp = f.read() - with open(name2, "r", encoding='utf-8') as f: + with open(name2, "r", encoding="utf-8") as f: exp2 = f.read() - with open(name3, "r", encoding='utf-8') as f: + with open(name3, "r", encoding="utf-8") as f: text3 = f.read() - self.assertEqual(text.replace('\r', ''), exp) + self.assertEqual(text.replace("\r", ""), exp) sdf2 = StreamingDataFrame.read_df(df) self.assertEqualDataFrame(sdf.to_dataframe(), sdf2.to_dataframe()) - self.assertEqual(text2.replace('\r', ''), exp2) - self.assertEqual(text3.replace('\r', '').replace('\n\n', '\n'), - exp2.replace('\r', '')) + self.assertEqual(text2.replace("\r", ""), exp2) + self.assertEqual( + text3.replace("\r", "").replace("\n\n", "\n"), exp2.replace("\r", "") + ) def test_where(self): sdf = dummy_streaming_dataframe(100) cols = sdf.columns - self.assertEqual(list(cols), ['cint', 'cstr']) + self.assertEqual(list(cols), ["cint", "cstr"]) dts = sdf.dtypes self.assertEqual(len(dts), 2) res = sdf.where(lambda row: row["cint"] == 1) st = res.to_csv() - self.assertStartsWith(",cint,cstr\n0,,\n1,1.0,s1", - st.replace('\r', '')) + self.assertStartsWith(",cint,cstr\n0,,\n1,1.0,s1", st.replace("\r", "")) res = sdf.where(lambda row: row["cint"] == 1) st = res.to_csv() - self.assertStartsWith(",cint,cstr\n0,,\n1,1.0,s1", - st.replace('\r', '')) + self.assertStartsWith(",cint,cstr\n0,,\n1,1.0,s1", st.replace("\r", "")) def test_dataframe(self): sdf = dummy_streaming_dataframe(100) 
@@ -144,10 +135,12 @@ def test_sample_reservoir_cache(self): df2 = res.to_df() self.assertEqualDataFrame(df1, df2) self.assertEqual(df1.shape, (10, res.shape[1])) - self.assertRaise(lambda: sdf.sample(n=10, cache=False, reservoir=True), - ValueError) - self.assertRaise(lambda: sdf.sample(frac=0.1, cache=True, reservoir=True), - ValueError) + self.assertRaise( + lambda: sdf.sample(n=10, cache=False, reservoir=True), ValueError + ) + self.assertRaise( + lambda: sdf.sample(frac=0.1, cache=True, reservoir=True), ValueError + ) def test_apply(self): sdf = dummy_streaming_dataframe(100) @@ -157,19 +150,18 @@ def test_apply(self): sdf = sdf.apply(lambda row: row[["cint"]] + "r", axis=1) self.assertNotEmpty(list(sdf)) text = sdf.to_csv(header=False) - self.assertStartsWith("0,0r\n1,1r\n2,2r\n3,3r", - text.replace('\r', '')) + self.assertStartsWith("0,0r\n1,1r\n2,2r\n3,3r", text.replace("\r", "")) def test_train_test_split(self): sdf = dummy_streaming_dataframe(100) tr, te = sdf.train_test_split(index=False, streaming=False) self.assertRaise( - lambda: StreamingDataFrame.read_str(tr, chunksize=None), - ValueError) + lambda: StreamingDataFrame.read_str(tr, chunksize=None), ValueError + ) self.assertRaise( - lambda: StreamingDataFrame.read_str(tr, iterator=False), - ValueError) - StreamingDataFrame.read_str(tr.encode('utf-8')) + lambda: StreamingDataFrame.read_str(tr, iterator=False), ValueError + ) + StreamingDataFrame.read_str(tr.encode("utf-8")) trsdf = StreamingDataFrame.read_str(tr) tesdf = StreamingDataFrame.read_str(te) trdf = trsdf.to_dataframe() @@ -183,7 +175,8 @@ def test_train_test_split(self): def test_train_test_split_streaming(self): sdf = dummy_streaming_dataframe(100, asfloat=True) trsdf, tesdf = sdf.train_test_split( - streaming=True, unique_rows=True, partitions=[0.7, 0.3]) + streaming=True, unique_rows=True, partitions=[0.7, 0.3] + ) trdf = trsdf.to_dataframe() tedf = tesdf.to_dataframe() df_exp = sdf.to_dataframe() @@ -228,10 +221,12 @@ def test_train_test_split_streaming_tiny(self): self.assertEqualDataFrame(df1, df2) def test_train_test_split_streaming_strat(self): - sdf = dummy_streaming_dataframe(100, asfloat=True, - tify=["t1" if i % 3 else "t0" for i in range(0, 100)]) + sdf = dummy_streaming_dataframe( + 100, asfloat=True, tify=["t1" if i % 3 else "t0" for i in range(0, 100)] + ) trsdf, tesdf = sdf.train_test_split( - streaming=True, unique_rows=True, stratify="tify") + streaming=True, unique_rows=True, stratify="tify" + ) trdf = trsdf.to_dataframe() tedf = tesdf.to_dataframe() df_exp = sdf.to_dataframe() @@ -250,12 +245,11 @@ def test_train_test_split_streaming_strat(self): tegr = tedf.groupby("tify").count() tegr["part"] = 1 gr = pandas.concat([trgr, tegr]) - self.assertGreater(gr['cfloat'].min(), 4) + self.assertGreater(gr["cfloat"].min(), 4) def test_train_test_split_file(self): temp = get_temp_folder(__file__, "temp_train_test_split_file") - names = [os.path.join(temp, "train.txt"), - os.path.join(temp, "test.txt")] + names = [os.path.join(temp, "train.txt"), os.path.join(temp, "test.txt")] sdf = dummy_streaming_dataframe(100) sdf.train_test_split(names, index=False, streaming=False) trsdf = StreamingDataFrame.read_csv(names[0]) @@ -276,8 +270,10 @@ def test_train_test_split_file_pattern(self): temp = get_temp_folder(__file__, "temp_train_test_split_file_pattern") sdf = dummy_streaming_dataframe(100) names = os.path.join(temp, "spl_{0}.txt") - self.assertRaise(lambda: sdf.train_test_split( - names, index=False, streaming=False), ValueError) + self.assertRaise( + 
lambda: sdf.train_test_split(names, index=False, streaming=False), + ValueError, + ) names = os.path.join(temp, "spl_{}.txt") tr, te = sdf.train_test_split(names, index=False, streaming=False) trsdf = StreamingDataFrame.read_csv(tr) @@ -297,8 +293,9 @@ def compares(a, b, how): da = a.to_dataframe() db = b.to_dataframe() exp = da.merge(db, on="cint", indicator=True) - self.assertEqualDataFrame(dm.reset_index(drop=True), - exp.reset_index(drop=True)) + self.assertEqualDataFrame( + dm.reset_index(drop=True), exp.reset_index(drop=True) + ) sdf20 = dummy_streaming_dataframe(20) sdf30 = dummy_streaming_dataframe(30) @@ -332,11 +329,17 @@ def test_concatv(self): self.assertEqualDataFrame(m1.to_dataframe(), df) df30["g"] = 4 - self.assertRaise(lambda: sdf20.concat(df30).to_dataframe(), - ValueError, "Frame others[0] do not have the same column names") + self.assertRaise( + lambda: sdf20.concat(df30).to_dataframe(), + ValueError, + "Frame others[0] do not have the same column names", + ) df20["cint"] = df20["cint"].astype(float) - self.assertRaise(lambda: sdf20.concat(df20).to_dataframe(), - ValueError, "Frame others[0] do not have the same column types") + self.assertRaise( + lambda: sdf20.concat(df20).to_dataframe(), + ValueError, + "Frame others[0] do not have the same column types", + ) def test_concath(self): sdf20 = dummy_streaming_dataframe(20) @@ -349,8 +352,9 @@ def test_concath(self): self.assertEqualDataFrame(m1.to_dataframe(), df) sdf22 = dummy_streaming_dataframe(22) sdf25 = dummy_streaming_dataframe(25) - self.assertRaise(lambda: sdf22.concat(sdf25, axis=1).to_dataframe(), - RuntimeError) + self.assertRaise( + lambda: sdf22.concat(sdf25, axis=1).to_dataframe(), RuntimeError + ) def test_groupby(self): df20 = dummy_streaming_dataframe(20).to_dataframe() @@ -359,14 +363,19 @@ def test_groupby(self): gr = sdf20.groupby("key", lambda gr: gr.sum()) gr2 = df20.groupby("key").sum() self.assertEqualDataFrame(gr, gr2) - self.assertRaise(lambda: sdf20.groupby( - "key", in_memory=False), NotImplementedError) + self.assertRaise( + lambda: sdf20.groupby("key", in_memory=False), NotImplementedError + ) # Do not replace lambda c:sum(c) by sum or... 
- # pandas.core.base.SpecificationError: Function names must be unique, found multiple named sum - gr2 = df20.drop("cstr", axis=1).groupby("key").agg([numpy.sum, lambda c:sum(c)]) - gr = sdf20.drop("cstr", axis=1).groupby("key", lambda gr: gr.agg( - [numpy.sum, lambda c:sum(c)])) + # pandas.core.base.SpecificationError: Function names + # must be unique, found multiple named sum + gr2 = ( + df20.drop("cstr", axis=1).groupby("key").agg([numpy.sum, lambda c: sum(c)]) + ) + gr = sdf20.drop("cstr", axis=1).groupby( + "key", lambda gr: gr.agg([numpy.sum, lambda c: sum(c)]) + ) self.assertEqualDataFrame(gr, gr2) gr = sdf20.groupby("key", lambda gr: gr.count()) @@ -384,7 +393,8 @@ def test_groupby_cum(self): df20["key"] = df20["cint"].apply(lambda i: i % 3 == 0) sdf20 = StreamingDataFrame.read_df(df20, chunksize=5) sgr = sdf20.groupby_streaming( - "key", lambda gr: gr.sum(), strategy='cum', as_index=False) + "key", lambda gr: gr.sum(), strategy="cum", as_index=False + ) gr2 = df20.groupby("key", as_index=False).sum() lastgr = None for gr in sgr: @@ -397,7 +407,8 @@ def test_groupby_streaming(self): df20["key"] = df20["cint"].apply(lambda i: i % 3 == 0) sdf20 = StreamingDataFrame.read_df(df20, chunksize=5) sgr = sdf20.groupby_streaming( - "key", lambda gr: gr.sum(), strategy='streaming', as_index=False) + "key", lambda gr: gr.sum(), strategy="streaming", as_index=False + ) gr2 = df20.groupby("key", as_index=False).sum() grs = list(sgr) gr = pandas.concat(grs).groupby("key", as_index=False).sum() @@ -408,7 +419,8 @@ def test_groupby_cum_asindex(self): df20["key"] = df20["cint"].apply(lambda i: i % 3 == 0) sdf20 = StreamingDataFrame.read_df(df20, chunksize=5) sgr = sdf20.groupby_streaming( - "key", lambda gr: gr.sum(), strategy='cum', as_index=True) + "key", lambda gr: gr.sum(), strategy="cum", as_index=True + ) gr2 = df20.groupby("key", as_index=True).sum() lastgr = None for gr in sgr: @@ -426,13 +438,21 @@ def test_merge_2(self): m = pandas.DataFrame(dict(Y=["a", "b"], Z=[10, 20])) jm = df2.merge(m, left_on="Y", right_on="Y", how="outer") sjm = sdf2.merge(m, left_on="Y", right_on="Y", how="outer") - self.assertEqualDataFrame(jm.sort_values(["X", "Y"]).reset_index(drop=True), - sjm.to_dataframe().sort_values(["X", "Y"]).reset_index(drop=True)) + self.assertEqualDataFrame( + jm.sort_values(["X", "Y"]).reset_index(drop=True), + sjm.to_dataframe().sort_values(["X", "Y"]).reset_index(drop=True), + ) @ignore_warnings(ResourceWarning) def test_schema_consistent(self): - df = pandas.DataFrame([dict(cf=0, cint=0, cstr="0"), dict(cf=1, cint=1, cstr="1"), - dict(cf=2, cint="s2", cstr="2"), dict(cf=3, cint=3, cstr="3")]) + df = pandas.DataFrame( + [ + dict(cf=0, cint=0, cstr="0"), + dict(cf=1, cint=1, cstr="1"), + dict(cf=2, cint="s2", cstr="2"), + dict(cf=3, cint=3, cstr="3"), + ] + ) temp = get_temp_folder(__file__, "temp_schema_consistant") name = os.path.join(temp, "df.csv") stio = StringIO() @@ -442,8 +462,7 @@ def test_schema_consistent(self): self.assertEqual(df.shape, (4, 3)) sdf = StreamingDataFrame.read_csv(name, chunksize=2) self.assertRaise(lambda: list(sdf), StreamingDataFrameSchemaError) - sdf = StreamingDataFrame.read_csv( - name, chunksize=2, check_schema=False) + sdf = StreamingDataFrame.read_csv(name, chunksize=2, check_schema=False) pieces = list(sdf) self.assertEqual(len(pieces), 2) @@ -460,11 +479,10 @@ def test_getitem(self): def test_read_csv_names(self): this = os.path.abspath(os.path.dirname(__file__)) data = os.path.join(this, "data", "buggy_hash2.csv") - df = pandas.read_csv(data, 
sep="\t", - names=["A", "B", "C"], - header=None) + df = pandas.read_csv(data, sep="\t", names=["A", "B", "C"], header=None) sdf = StreamingDataFrame.read_csv( - data, sep="\t", names=["A", "B", "C"], chunksize=2, header=None) + data, sep="\t", names=["A", "B", "C"], chunksize=2, header=None + ) head = sdf.head(n=1) self.assertEqualDataFrame(df.head(n=1), head) @@ -489,18 +507,15 @@ def test_add_column(self): self.assertEqualDataFrame(df, dfB) def test_fillna(self): - df = pandas.DataFrame( - data=dict(X=[4.5, numpy.nan, 7], Y=["a", "b", numpy.nan])) + df = pandas.DataFrame(data=dict(X=[4.5, numpy.nan, 7], Y=["a", "b", numpy.nan])) sdf = StreamingDataFrame.read_df(df) - df2 = pandas.DataFrame( - data=dict(X=[4.5, 10.0, 7], Y=["a", "b", "NAN"])) + df2 = pandas.DataFrame(data=dict(X=[4.5, 10.0, 7], Y=["a", "b", "NAN"])) na = sdf.fillna(value=dict(X=10.0, Y="NAN")) ndf = na.to_df() self.assertEqual(ndf, df2) - df3 = pandas.DataFrame( - data=dict(X=[4.5, 10.0, 7], Y=["a", "b", numpy.nan])) + df3 = pandas.DataFrame(data=dict(X=[4.5, 10.0, 7], Y=["a", "b", numpy.nan])) na = sdf.fillna(value=dict(X=10.0)) ndf = na.to_df() self.assertEqual(ndf, df3) @@ -513,16 +528,16 @@ def test_describe(self): sdf = StreamingDataFrame.read_df(df) desc = sdf.describe() - self.assertEqual(['X', 'Y'], list(desc.columns)) - self.assertEqual(desc.loc['min', :].tolist(), [-0.5, 0]) - self.assertEqual(desc.loc['max', :].tolist(), [0.5, 100000]) + self.assertEqual(["X", "Y"], list(desc.columns)) + self.assertEqual(desc.loc["min", :].tolist(), [-0.5, 0]) + self.assertEqual(desc.loc["max", :].tolist(), [0.5, 100000]) + self.assertEqualArray(desc.loc["mean", :], numpy.array([0, 50000]), atol=1e-8) + self.assertEqualArray(desc.loc["25%", :], numpy.array([-0.25, 25000])) + self.assertEqualArray(desc.loc["50%", :], numpy.array([0.0, 50000])) + self.assertEqualArray(desc.loc["75%", :], numpy.array([0.25, 75000])) self.assertEqualArray( - desc.loc['mean', :], numpy.array([0, 50000]), atol=1e-8) - self.assertEqualArray(desc.loc['25%', :], numpy.array([-0.25, 25000])) - self.assertEqualArray(desc.loc['50%', :], numpy.array([0.0, 50000])) - self.assertEqualArray(desc.loc['75%', :], numpy.array([0.25, 75000])) - self.assertEqualArray(desc.loc['std', :], numpy.array( - [2.886795e-01, 28867.946472]), decimal=4) + desc.loc["std", :], numpy.array([2.886795e-01, 28867.946472]), decimal=4 + ) def test_set_item(self): df = pandas.DataFrame(data=dict(a=[4.5], b=[6], c=[7])) @@ -530,31 +545,31 @@ def test_set_item(self): sdf = StreamingDataFrame.read_df(df) def f(): - sdf[['a']] = 10 + sdf[["a"]] = 10 + self.assertRaise(f, ValueError) def g(): - sdf['a'] = [10] + sdf["a"] = [10] + self.assertRaise(g, NotImplementedError) - sdf['aa'] = 10 + sdf["aa"] = 10 df = sdf.to_df() ddf = pandas.DataFrame(data=dict(a=[4.5], b=[6], c=[7], aa=[10])) self.assertEqualDataFrame(df, ddf) - sdf['bb'] = sdf['b'] + 10 + sdf["bb"] = sdf["b"] + 10 df = sdf.to_df() - ddf = ddf = pandas.DataFrame( - data=dict(a=[4.5], b=[6], c=[7], aa=[10], bb=[16])) + ddf = ddf = pandas.DataFrame(data=dict(a=[4.5], b=[6], c=[7], aa=[10], bb=[16])) self.assertEqualDataFrame(df, ddf) def test_set_item_function(self): df = pandas.DataFrame(data=dict(a=[4.5], b=[6], c=[7])) self.assertRaise(lambda: StreamingDataFrame(df), TypeError) sdf = StreamingDataFrame.read_df(df) - sdf['bb'] = sdf['b'].apply(lambda x: x + 11) + sdf["bb"] = sdf["b"].apply(lambda x: x + 11) df = sdf.to_df() - ddf = ddf = pandas.DataFrame( - data=dict(a=[4.5], b=[6], c=[7], bb=[17])) + ddf = ddf = 
pandas.DataFrame(data=dict(a=[4.5], b=[6], c=[7], bb=[17])) self.assertEqualDataFrame(df, ddf) diff --git a/_unittests/ut_documentation/test_run_notebooks.py b/_unittests/ut_documentation/test_run_notebooks.py index 6f84e1c..aebe979 100644 --- a/_unittests/ut_documentation/test_run_notebooks.py +++ b/_unittests/ut_documentation/test_run_notebooks.py @@ -1,7 +1,3 @@ -# -*- coding: utf-8 -*- -""" -@brief test log(time=33s) -""" import os import unittest from pyquickhelper.pycode import ExtTestCase @@ -10,17 +6,19 @@ class TestRunNotebooksPython(ExtTestCase): - def setUp(self): import jyquickhelper # pylint: disable=C0415 + self.assertTrue(jyquickhelper is not None) def test_notebook_artificiel(self): self.assertTrue(pandas_streaming is not None) - folder = os.path.join(os.path.dirname(__file__), - "..", "..", "_doc", "notebooks") + folder = os.path.join( + os.path.dirname(__file__), "..", "..", "_doc", "notebooks" + ) test_notebook_execution_coverage( - __file__, "first_steps", folder, 'pandas_streaming', copy_files=[]) + __file__, "first_steps", folder, "pandas_streaming", copy_files=[] + ) if __name__ == "__main__": diff --git a/_unittests/ut_module/test_sklearn.py b/_unittests/ut_module/test_sklearn.py index 8ae2b79..c8bdbfc 100644 --- a/_unittests/ut_module/test_sklearn.py +++ b/_unittests/ut_module/test_sklearn.py @@ -1,7 +1,3 @@ -# -*- coding: utf-8 -*- -""" -@brief test log(time=2s) -""" import unittest import numpy import pandas @@ -10,12 +6,12 @@ class TestScikitLearn(ExtTestCase): - def test_logistic_regression_check(self): X = pandas.DataFrame(numpy.array([[0.1, 0.2], [-0.2, 0.3]])) Y = numpy.array([0, 1]) - clq = LogisticRegression(fit_intercept=False, solver="liblinear", - random_state=42) + clq = LogisticRegression( + fit_intercept=False, solver="liblinear", random_state=42 + ) clq.fit(X, Y) pred2 = clq.predict(X) self.assertEqual(numpy.array([0, 1]), pred2) diff --git a/pandas_streaming/data/__init__.py b/pandas_streaming/data/__init__.py index ea274fc..9c3a725 100644 --- a/pandas_streaming/data/__init__.py +++ b/pandas_streaming/data/__init__.py @@ -1,6 +1 @@ -""" -@file -@brief Shortcuts to *df*. -""" - from .dummy import dummy_streaming_dataframe diff --git a/pandas_streaming/data/dummy.py b/pandas_streaming/data/dummy.py index 0103d1f..8500e74 100644 --- a/pandas_streaming/data/dummy.py +++ b/pandas_streaming/data/dummy.py @@ -1,8 +1,3 @@ -# -*- coding: utf-8 -*- -""" -@file -@brief Dummy datasets. -""" from pandas import DataFrame from ..df import StreamingDataFrame @@ -19,11 +14,16 @@ def dummy_streaming_dataframe(n, chunksize=10, asfloat=False, **cols): :return: a @see cl StreamingDataFrame """ if asfloat: - df = DataFrame(dict(cfloat=[_ + 0.1 for _ in range(0, n)], cstr=[ - f"s{i}" for i in range(0, n)])) + df = DataFrame( + dict( + cfloat=[_ + 0.1 for _ in range(0, n)], + cstr=[f"s{i}" for i in range(0, n)], + ) + ) else: - df = DataFrame(dict(cint=list(range(0, n)), cstr=[ - f"s{i}" for i in range(0, n)])) + df = DataFrame( + dict(cint=list(range(0, n)), cstr=[f"s{i}" for i in range(0, n)]) + ) for k, v in cols.items(): df[k] = v return StreamingDataFrame.read_df(df, chunksize=chunksize) diff --git a/pandas_streaming/df/__init__.py b/pandas_streaming/df/__init__.py index 61e1b73..ac4996d 100644 --- a/pandas_streaming/df/__init__.py +++ b/pandas_streaming/df/__init__.py @@ -1,10 +1,13 @@ -""" -@file -@brief Shortcuts to *df*. 
-""" - -from .connex_split import train_test_split_weights, train_test_connex_split, train_test_apart_stratify +from .connex_split import ( + train_test_split_weights, + train_test_connex_split, + train_test_apart_stratify, +) from .dataframe import StreamingDataFrame -from .dataframe_helpers import dataframe_hash_columns, dataframe_unfold, dataframe_shuffle +from .dataframe_helpers import ( + dataframe_hash_columns, + dataframe_unfold, + dataframe_shuffle, +) from .dataframe_helpers import pandas_groupby_nan, numpy_types from .dataframe_io import to_zip, read_zip diff --git a/pandas_streaming/df/connex_split.py b/pandas_streaming/df/connex_split.py index ec01b02..bc68581 100644 --- a/pandas_streaming/df/connex_split.py +++ b/pandas_streaming/df/connex_split.py @@ -1,8 +1,3 @@ -# -*- coding: utf-8 -*- -""" -@file -@brief Implements a connex split between train and test. -""" from collections import Counter import pandas import numpy @@ -14,21 +9,31 @@ class ImbalancedSplitException(Exception): """ Raised when an imbalanced split is detected. """ + pass -def train_test_split_weights(df, weights=None, test_size=0.25, train_size=None, - shuffle=True, fail_imbalanced=0.05, random_state=None): +def train_test_split_weights( + df, + weights=None, + test_size=0.25, + train_size=None, + shuffle=True, + fail_imbalanced=0.05, + random_state=None, +): """ Splits a database in train/test given, every row can have a different weight. - @param df :epkg:`pandas:DataFrame` or @see cl StreamingDataFrame + @param df :epkg:`pandas:DataFrame` or see :class:`StreamingDataFrame` @param weights None or weights or weights column name - @param test_size ratio for the test partition (if *train_size* is not specified) + @param test_size ratio for the test partition + (if *train_size* is not specified) @param train_size ratio for the train partition @param shuffle shuffles before the split - @param fail_imbalanced raises an exception if relative weights difference is higher than this value + @param fail_imbalanced raises an exception if relative weights + difference is higher than this value @param random_state seed for random generators @return train and test :epkg:`pandas:DataFrame` @@ -37,21 +42,24 @@ def train_test_split_weights(df, weights=None, test_size=0.25, train_size=None, as the function tries to keep equal weights among both paths without using randomness. """ - if hasattr(df, 'iter_creation'): + if hasattr(df, "iter_creation"): raise NotImplementedError( # pragma: no cover - 'Not implemented yet for StreamingDataFrame.') + "Not implemented yet for StreamingDataFrame." + ) if isinstance(df, numpy.ndarray): raise NotImplementedError( # pragma: no cover - "Not implemented on numpy arrays.") + "Not implemented on numpy arrays." + ) if shuffle: df = dataframe_shuffle(df, random_state=random_state) if weights is None: if test_size == 0 or train_size == 0: raise ValueError( - f"test_size={test_size} or train_size={train_size} cannot be null (1).") - return train_test_split(df, test_size=test_size, - train_size=train_size, - random_state=random_state) + f"test_size={test_size} or train_size={train_size} cannot be null (1)." 
+ ) + return train_test_split( + df, test_size=test_size, train_size=train_size, random_state=random_state + ) if isinstance(weights, pandas.Series): weights = list(weights) @@ -60,7 +68,8 @@ def train_test_split_weights(df, weights=None, test_size=0.25, train_size=None, if len(weights) != df.shape[0]: raise ValueError( "Dimension mismatch between weights and dataframe " - "{0} != {1}".format(df.shape[0], len(weights))) + "{0} != {1}".format(df.shape[0], len(weights)) + ) p = (1 - test_size) if test_size else None if train_size is not None: @@ -68,7 +77,8 @@ def train_test_split_weights(df, weights=None, test_size=0.25, train_size=None, test_size = 1 - p if p is None or min(test_size, p) <= 0: raise ValueError( - f"test_size={test_size} or train_size={train_size} cannot be null (2).") + f"test_size={test_size} or train_size={train_size} cannot be null (2)." + ) ratio = test_size / p if random_state is None: @@ -98,21 +108,32 @@ def train_test_split_weights(df, weights=None, test_size=0.25, train_size=None, balance -= w * ratio train_weights += w * ratio - r = abs(train_weights - test_weights) / \ - (1.0 * (train_weights + test_weights)) + r = abs(train_weights - test_weights) / (1.0 * (train_weights + test_weights)) if r >= fail_imbalanced: raise ImbalancedSplitException( # pragma: no cover "Split is imbalanced: train_weights={0} test_weights={1} r={2}." - "".format(train_weights, test_weights, r)) + "".format(train_weights, test_weights, r) + ) return df.iloc[train_ids, :], df.iloc[test_ids, :] -def train_test_connex_split(df, groups, test_size=0.25, train_size=None, - stratify=None, hash_size=9, unique_rows=False, - shuffle=True, fail_imbalanced=0.05, keep_balance=None, - stop_if_bigger=None, return_cnx=False, - must_groups=None, random_state=None): +def train_test_connex_split( + df, + groups, + test_size=0.25, + train_size=None, + stratify=None, + hash_size=9, + unique_rows=False, + shuffle=True, + fail_imbalanced=0.05, + keep_balance=None, + stop_if_bigger=None, + return_cnx=False, + must_groups=None, + random_state=None, +): """ This split is for a specific case where data is linked in many ways. Let's assume we have three ids as we have @@ -124,7 +145,8 @@ def train_test_connex_split(df, groups, test_size=0.25, train_size=None, @param df :epkg:`pandas:DataFrame` @param groups columns name for the ids - @param test_size ratio for the test partition (if *train_size* is not specified) + @param test_size ratio for the test partition + (if *train_size* is not specified) @param train_size ratio for the train partition @param stratify column holding the stratification @param hash_size size of the hash to cache information about partition @@ -138,13 +160,13 @@ def train_test_connex_split(df, groups, test_size=0.25, train_size=None, but does not guarantee it returns the best cut, the value should be close to 0 @param keep_balance (float), if not None, does not merge connected components - if their relative sizes are too different, the value should be - close to 1 + if their relative sizes are too different, + the value should be close to 1 @param return_cnx returns connected components as a third results @param must_groups column name for ids which must not be shared by train/test partitions @param random_state seed for random generator - @return Two @see cl StreamingDataFrame, one + @return Two see :class:`StreamingDataFrame`, one for train, one for test. The list of ids must hold in memory. 
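To make the connex split described in this docstring concrete, here is a minimal, hypothetical sketch; the column names (user, prod, card) and the relaxed fail_imbalanced value are illustrative assumptions, not part of this patch, and the import path follows the pandas_streaming/df/__init__.py change shown above.

import pandas
from pandas_streaming.df import train_test_connex_split

# Rows sharing any id must end up on the same side of the split,
# so no user, prod or card appears in both train and test.
df = pandas.DataFrame(
    [
        dict(user="U1", prod="P1", card="C1"),
        dict(user="U1", prod="P2", card="C1"),
        dict(user="U2", prod="P3", card="C2"),
        dict(user="U3", prod="P3", card="C2"),
    ]
)

# fail_imbalanced is relaxed here only because such a tiny frame
# cannot be split close to the requested 75/25 ratio.
train, test = train_test_connex_split(
    df, groups=["user", "prod", "card"], fail_imbalanced=2.0, random_state=0
)
print(train)
print(test)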
@@ -213,16 +235,20 @@ def train_test_connex_split(df, groups, test_size=0.25, train_size=None, """ if stratify is not None: raise NotImplementedError( # pragma: no cover - "Option stratify is not implemented.") + "Option stratify is not implemented." + ) if groups is None or len(groups) == 0: raise ValueError( # pragma: no cover - "groups is empty. Use regular train_test_split.") - if hasattr(df, 'iter_creation'): + "groups is empty. Use regular train_test_split." + ) + if hasattr(df, "iter_creation"): raise NotImplementedError( # pragma: no cover - 'Not implemented yet for StreamingDataFrame.') + "Not implemented yet for StreamingDataFrame." + ) if isinstance(df, numpy.ndarray): raise NotImplementedError( # pragma: no cover - "Not implemented on numpy arrays.") + "Not implemented on numpy arrays." + ) if shuffle: df = dataframe_shuffle(df, random_state=random_state) @@ -250,13 +276,18 @@ def do_connex_components(dfrows, local_groups, kb, sib): while modif > 0 and itern < len(elements): if fLOG and df.shape[0] > 10000: - fLOG("[train_test_connex_split] iteration={0}-#nb connect={1} - " - "modif={2}".format(iter, len(set(elements)), modif)) + fLOG( + "[train_test_connex_split] iteration={0}-#nb connect={1} - " + "modif={2}".format(iter, len(set(elements)), modif) + ) modif = 0 itern += 1 for i, row in enumerate(dfrows.itertuples(index=False, name=None)): - vals = [val for val in zip(local_groups, row) if not isinstance( - val[1], float) or not numpy.isnan(val[1])] + vals = [ + val + for val in zip(local_groups, row) + if not isinstance(val[1], float) or not numpy.isnan(val[1]) + ] c = elements[i] @@ -276,27 +307,42 @@ def do_connex_components(dfrows, local_groups, kb, sib): if kb is not None: maxi = min(len(counts_cnx[new_c]), len(counts_cnx[c])) if maxi > 5: - diff = len(counts_cnx[new_c]) + \ - len(counts_cnx[c]) - maxi + diff = len(counts_cnx[new_c]) + len(counts_cnx[c]) - maxi r = diff / float(maxi) if r > kb: if fLOG: # pragma: no cover - fLOG('[train_test_connex_split] balance ' - 'r={0:0.00000}>{1:0.00}, #[{2}]={3}, ' - '#[{4}]={5}'.format(r, kb, new_c, - len(counts_cnx[new_c]), - c, len(counts_cnx[c]))) + fLOG( + "[train_test_connex_split] balance " + "r={0:0.00000}>{1:0.00}, #[{2}]={3}, " + "#[{4}]={5}".format( + r, + kb, + new_c, + len(counts_cnx[new_c]), + c, + len(counts_cnx[c]), + ) + ) continue if sib is not None: - r = (len(counts_cnx[new_c]) + - len(counts_cnx[c])) / float(len(elements)) + r = (len(counts_cnx[new_c]) + len(counts_cnx[c])) / float( + len(elements) + ) if r > sib: if fLOG: # pragma: no cover - fLOG('[train_test_connex_split] no merge ' - 'r={0:0.00000}>{1:0.00}, #[{2}]={3}, #[{4}]={5}' - ''.format(r, sib, new_c, len(counts_cnx[new_c]), - c, len(counts_cnx[c]))) + fLOG( + "[train_test_connex_split] no merge " + "r={0:0.00000}>{1:0.00}, #[{2}]={3}, #[{4}]={5}" + "".format( + r, + sib, + new_c, + len(counts_cnx[new_c]), + c, + len(counts_cnx[c]), + ) + ) avoids_merge[new_c, c] = i continue @@ -307,8 +353,7 @@ def do_connex_components(dfrows, local_groups, kb, sib): modif += len(counts_cnx[c]) for ii in counts_cnx[c]: elements[ii] = new_c - counts_cnx[new_c] = counts_cnx[new_c].union( - counts_cnx[c]) + counts_cnx[new_c] = counts_cnx[new_c].union(counts_cnx[c]) counts_cnx[c] = set() keys = list(vals) @@ -327,13 +372,12 @@ def do_connex_components(dfrows, local_groups, kb, sib): grsum = dfids[[name, one]].groupby(name, as_index=False).sum() if fLOG: for g in groups: - fLOG( - f"[train_test_connex_split] #nb in '{g}': {len(set(dfids[g]))}") - fLOG( - 
f"[train_test_connex_split] #connex {grsum.shape[0]}/{dfids.shape[0]}") + fLOG(f"[train_test_connex_split] #nb in '{g}': {len(set(dfids[g]))}") + fLOG(f"[train_test_connex_split] #connex {grsum.shape[0]}/{dfids.shape[0]}") if grsum.shape[0] <= 1: raise ValueError( # pragma: no cover - "Every element is in the same connected components.") + "Every element is in the same connected components." + ) # Statistics: top connected components if fLOG: @@ -342,28 +386,36 @@ def do_connex_components(dfrows, local_groups, kb, sib): cl = [(v, k) for k, v in counts.items()] cum = 0 maxc = None - fLOG("[train_test_connex_split] number of connected components: {0}" - "".format(len(set(elements)))) + fLOG( + "[train_test_connex_split] number of connected components: {0}" + "".format(len(set(elements))) + ) for i, (v, k) in enumerate(sorted(cl, reverse=True)): if i == 0: maxc = k, v if i >= 10: break cum += v - fLOG("[train_test_connex_split] c={0} #elements={1} cumulated" - "={2}/{3}".format(k, v, cum, len(elements))) + fLOG( + "[train_test_connex_split] c={0} #elements={1} cumulated" + "={2}/{3}".format(k, v, cum, len(elements)) + ) # Most important component - fLOG( - f'[train_test_connex_split] first row of the biggest component {maxc}') + fLOG(f"[train_test_connex_split] first row of the biggest component {maxc}") tdf = dfids[dfids[name] == maxc[0]] - fLOG(f'[train_test_connex_split] \n{tdf.head(n=10)}') + fLOG(f"[train_test_connex_split] \n{tdf.head(n=10)}") # Splits. train, test = train_test_split_weights( - grsum, weights=one, test_size=test_size, train_size=train_size, - shuffle=shuffle, fail_imbalanced=fail_imbalanced, - random_state=random_state) + grsum, + weights=one, + test_size=test_size, + train_size=train_size, + shuffle=shuffle, + fail_imbalanced=fail_imbalanced, + random_state=random_state, + ) train.drop(one, inplace=True, axis=1) test.drop(one, inplace=True, axis=1) @@ -382,8 +434,15 @@ def double_merge(d): return train_f, test_f -def train_test_apart_stratify(df, group, test_size=0.25, train_size=None, - stratify=None, force=False, random_state=None): +def train_test_apart_stratify( + df, + group, + test_size=0.25, + train_size=None, + stratify=None, + force=False, + random_state=None, +): """ This split is for a specific case where data is linked in one way. Let's assume we have two ids as we have @@ -401,7 +460,7 @@ def train_test_apart_stratify(df, group, test_size=0.25, train_size=None, @param force if True, tries to get at least one example on the test side for each value of the column *stratify* @param random_state seed for random generators - @return Two @see cl StreamingDataFrame, one + @return Two see :class:`StreamingDataFrame`, one for train, one for test. .. 
index:: multi-label @@ -434,14 +493,11 @@ def train_test_apart_stratify(df, group, test_size=0.25, train_size=None, print(test) """ if stratify is None: - raise ValueError( # pragma: no cover - "stratify must be specified.") + raise ValueError("stratify must be specified.") # pragma: no cover if group is None: - raise ValueError( # pragma: no cover - "group must be specified.") - if hasattr(df, 'iter_creation'): - raise NotImplementedError( - 'Not implemented yet for StreamingDataFrame.') + raise ValueError("group must be specified.") # pragma: no cover + if hasattr(df, "iter_creation"): + raise NotImplementedError("Not implemented yet for StreamingDataFrame.") if isinstance(df, numpy.ndarray): raise NotImplementedError("Not implemented on numpy arrays.") @@ -451,7 +507,8 @@ def train_test_apart_stratify(df, group, test_size=0.25, train_size=None, test_size = 1 - p if p is None or min(test_size, p) <= 0: raise ValueError( # pragma: no cover - f"test_size={test_size} or train_size={train_size} cannot be null") + f"test_size={test_size} or train_size={train_size} cannot be null" + ) couples = df[[group, stratify]].itertuples(name=None, index=False) hist = Counter(df[stratify]) @@ -475,8 +532,7 @@ def train_test_apart_stratify(df, group, test_size=0.25, train_size=None, continue assigned = [c for c in ids[k] if c in split] nb_test = sum(split[c] for c in assigned) - expected = min(len(ids[k]), int( - test_size * len(ids[k]) + 0.5)) - nb_test + expected = min(len(ids[k]), int(test_size * len(ids[k]) + 0.5)) - nb_test if force and expected == 0 and nb_test == 0: nb_train = len(assigned) - nb_test if nb_train > 0 or len(not_assigned) > 1: diff --git a/pandas_streaming/df/dataframe.py b/pandas_streaming/df/dataframe.py index 843e4da..db3d7b9 100644 --- a/pandas_streaming/df/dataframe.py +++ b/pandas_streaming/df/dataframe.py @@ -1,9 +1,3 @@ -# -*- coding: utf-8 -*- -# pylint: disable=W0102 -""" -@file -@brief Defines a streaming dataframe. -""" import pickle import os from io import StringIO, BytesIO @@ -12,11 +6,15 @@ import numpy.random as nrandom import pandas from pandas.testing import assert_frame_equal + try: from pandas import json_normalize except ImportError: from pandas.io.json import json_normalize -from .dataframe_split import sklearn_train_test_split, sklearn_train_test_split_streaming +from .dataframe_split import ( + sklearn_train_test_split, + sklearn_train_test_split_streaming, +) from .dataframe_io_helpers import enumerate_json_items, JsonIterator2Stream @@ -24,6 +22,7 @@ class StreamingDataFrameSchemaError(Exception): """ Reveals an issue with inconsistant schemas. """ + pass @@ -50,7 +49,7 @@ class StreamingDataFrame: Instead, the class takes a function which generates an iterator on :epkg:`DataFrame`. Most of the methods returns either a :epkg:`DataFrame` - either a @see cl StreamingDataFrame. In the second case, + either a see :class:`StreamingDataFrame`. In the second case, methods can be chained. By default, the object checks that the schema remains @@ -64,7 +63,7 @@ class StreamingDataFrame: is one of these cases. 
:param iter_creation: function which creates an iterator or an - instance of @see cl StreamingDataFrame + instance of see :class:`StreamingDataFrame` :param check_schema: checks that the schema is the same for every :epkg:`dataframe` :param stable: indicates if the :epkg:`dataframe` remains the same @@ -73,11 +72,11 @@ class StreamingDataFrame: def __init__(self, iter_creation, check_schema=True, stable=True): self._delete_ = [] - if isinstance(iter_creation, (pandas.DataFrame, dict, - numpy.ndarray, str)): + if isinstance(iter_creation, (pandas.DataFrame, dict, numpy.ndarray, str)): raise TypeError( "Unexpected type %r for iter_creation. It must " - "be an iterator." % type(iter_creation)) + "be an iterator." % type(iter_creation) + ) if isinstance(iter_creation, StreamingDataFrame): self.iter_creation = iter_creation.iter_creation self.stable = iter_creation.stable @@ -116,9 +115,15 @@ def get_kwargs(self): """ return dict(check_schema=self.check_schema) - def train_test_split(self, path_or_buf=None, export_method="to_csv", - names=None, streaming=True, partitions=None, - **kwargs): + def train_test_split( + self, + path_or_buf=None, + export_method="to_csv", + names=None, + streaming=True, + partitions=None, + **kwargs, + ): """ Randomly splits a :epkg:`dataframe` into smaller pieces. The function returns streams of file names. @@ -138,7 +143,7 @@ def train_test_split(self, path_or_buf=None, export_method="to_csv", streaming version of the algorithm. @param partitions splitting partitions @return outputs of the exports functions or two - @see cl StreamingDataFrame if path_or_buf is None. + see :class:`StreamingDataFrame` if path_or_buf is None. The streaming version of this algorithm is implemented by function @see fn sklearn_train_test_split_streaming. Its documentation @@ -150,14 +155,19 @@ def train_test_split(self, path_or_buf=None, export_method="to_csv", if len(partitions) != 2: raise NotImplementedError( # pragma: no cover "Only train and test split is allowed, *partitions* " - "must be of length 2.") + "must be of length 2." + ) kwargs = kwargs.copy() - kwargs['train_size'] = partitions[0] - kwargs['test_size'] = partitions[1] + kwargs["train_size"] = partitions[0] + kwargs["test_size"] = partitions[1] return sklearn_train_test_split_streaming(self, **kwargs) - return sklearn_train_test_split(self, path_or_buf=path_or_buf, - export_method=export_method, - names=names, **kwargs) + return sklearn_train_test_split( + self, + path_or_buf=path_or_buf, + export_method=export_method, + names=names, + **kwargs, + ) @staticmethod def _process_kwargs(kwargs): @@ -165,14 +175,16 @@ def _process_kwargs(kwargs): Filters out parameters for the constructor of this class. """ kw = {} - for k in ['check_schema']: + for k in ["check_schema"]: if k in kwargs: kw[k] = kwargs[k] del kwargs[k] return kw @staticmethod - def read_json(*args, chunksize=100000, flatten=False, **kwargs) -> 'StreamingDataFrame': + def read_json( + *args, chunksize=100000, flatten=False, **kwargs + ) -> "StreamingDataFrame": """ Reads a :epkg:`json` file or buffer as an iterator on :epkg:`DataFrame`. The signature is the same as @@ -225,25 +237,28 @@ def read_json(*args, chunksize=100000, flatten=False, **kwargs) -> 'StreamingDat `parse error: unallowed token at this point in JSON text`. 
""" if not isinstance(chunksize, int) or chunksize <= 0: - raise ValueError( # pragma: no cover - 'chunksize must be a positive integer') + raise ValueError("chunksize must be a positive integer") # pragma: no cover kwargs_create = StreamingDataFrame._process_kwargs(kwargs) if isinstance(args[0], (list, dict)): if flatten: return StreamingDataFrame.read_df( - json_normalize(args[0]), **kwargs_create) + json_normalize(args[0]), **kwargs_create + ) return StreamingDataFrame.read_df(args[0], **kwargs_create) - if kwargs.get('lines', None) == 'stream': - del kwargs['lines'] + if kwargs.get("lines", None) == "stream": + del kwargs["lines"] def localf(a0=args[0]): - if hasattr(a0, 'seek'): + if hasattr(a0, "seek"): a0.seek(0) return enumerate_json_items( - a0, encoding=kwargs.get('encoding', None), lines=True, - flatten=flatten) + a0, + encoding=kwargs.get("encoding", None), + lines=True, + flatten=flatten, + ) st = JsonIterator2Stream(localf) args = args[1:] @@ -251,57 +266,68 @@ def localf(a0=args[0]): if chunksize is None: return StreamingDataFrame( lambda: pandas.read_json( - st, *args, chunksize=None, lines=True, **kwargs), - **kwargs_create) + st, *args, chunksize=None, lines=True, **kwargs + ), + **kwargs_create, + ) def fct1(st=st, args=args, chunksize=chunksize, kw=kwargs.copy()): st.seek(0) for r in pandas.read_json( - st, *args, chunksize=chunksize, nrows=chunksize, - lines=True, **kw): + st, *args, chunksize=chunksize, nrows=chunksize, lines=True, **kw + ): yield r return StreamingDataFrame(fct1, **kwargs_create) - if kwargs.get('lines', False): + if kwargs.get("lines", False): if flatten: raise NotImplementedError( - "flatten==True is implemented with option lines='stream'") + "flatten==True is implemented with option lines='stream'" + ) if chunksize is None: return StreamingDataFrame( lambda: pandas.read_json(*args, chunksize=None, **kwargs), - **kwargs_create) + **kwargs_create, + ) def fct2(args=args, chunksize=chunksize, kw=kwargs.copy()): for r in pandas.read_json( - *args, chunksize=chunksize, nrows=chunksize, **kw): + *args, chunksize=chunksize, nrows=chunksize, **kw + ): yield r + return StreamingDataFrame(fct2, **kwargs_create) st = JsonIterator2Stream( lambda a0=args[0]: enumerate_json_items( - a0, encoding=kwargs.get('encoding', None), flatten=flatten)) + a0, encoding=kwargs.get("encoding", None), flatten=flatten + ) + ) args = args[1:] - if 'lines' in kwargs: - del kwargs['lines'] + if "lines" in kwargs: + del kwargs["lines"] if chunksize is None: return StreamingDataFrame( lambda: pandas.read_json( - st, *args, chunksize=chunksize, lines=True, **kwargs), - **kwargs_create) + st, *args, chunksize=chunksize, lines=True, **kwargs + ), + **kwargs_create, + ) def fct3(st=st, args=args, chunksize=chunksize, kw=kwargs.copy()): - if hasattr(st, 'seek'): + if hasattr(st, "seek"): st.seek(0) for r in pandas.read_json( - st, *args, chunksize=chunksize, nrows=chunksize, - lines=True, **kw): + st, *args, chunksize=chunksize, nrows=chunksize, lines=True, **kw + ): yield r + return StreamingDataFrame(fct3, **kwargs_create) @staticmethod - def read_csv(*args, **kwargs) -> 'StreamingDataFrame': + def read_csv(*args, **kwargs) -> "StreamingDataFrame": """ Reads a :epkg:`csv` file or buffer as an iterator on :epkg:`DataFrame`. @@ -310,41 +336,44 @@ def read_csv(*args, **kwargs) -> 'StreamingDataFrame': of rows to parse in a single bloc. If not specified, it will be equal to 100000. 
""" - if not kwargs.get('iterator', True): + if not kwargs.get("iterator", True): raise ValueError("If specified, iterator must be True.") - if not kwargs.get('chunksize', 100000): + if not kwargs.get("chunksize", 100000): raise ValueError("If specified, chunksize must not be None.") kwargs_create = StreamingDataFrame._process_kwargs(kwargs) - kwargs['iterator'] = True - if 'chunksize' not in kwargs: - kwargs['chunksize'] = 100000 - return StreamingDataFrame(lambda: pandas.read_csv(*args, **kwargs), **kwargs_create) + kwargs["iterator"] = True + if "chunksize" not in kwargs: + kwargs["chunksize"] = 100000 + return StreamingDataFrame( + lambda: pandas.read_csv(*args, **kwargs), **kwargs_create + ) @staticmethod - def read_str(text, **kwargs) -> 'StreamingDataFrame': + def read_str(text, **kwargs) -> "StreamingDataFrame": """ Reads a :epkg:`DataFrame` as an iterator on :epkg:`DataFrame`. The signature is the same as :epkg:`pandas:read_csv`. The important parameter is *chunksize* which defines the number of rows to parse in a single bloc. """ - if not kwargs.get('iterator', True): + if not kwargs.get("iterator", True): raise ValueError("If specified, iterator must be True.") - if not kwargs.get('chunksize', 100000): + if not kwargs.get("chunksize", 100000): raise ValueError("If specified, chunksize must not be None.") kwargs_create = StreamingDataFrame._process_kwargs(kwargs) - kwargs['iterator'] = True - if 'chunksize' not in kwargs: - kwargs['chunksize'] = 100000 + kwargs["iterator"] = True + if "chunksize" not in kwargs: + kwargs["chunksize"] = 100000 if isinstance(text, str): buffer = StringIO(text) else: buffer = BytesIO(text) return StreamingDataFrame( - lambda: pandas.read_csv(buffer, **kwargs), **kwargs_create) + lambda: pandas.read_csv(buffer, **kwargs), **kwargs_create + ) @staticmethod - def read_df(df, chunksize=None, check_schema=True) -> 'StreamingDataFrame': + def read_df(df, chunksize=None, check_schema=True) -> "StreamingDataFrame": """ Splits a :epkg:`DataFrame` into small chunks mostly for unit testing purposes. @@ -352,26 +381,29 @@ def read_df(df, chunksize=None, check_schema=True) -> 'StreamingDataFrame': @param df :epkg:`DataFrame` @param chunksize number rows per chunks (// 10 by default) @param check_schema check schema between two iterations - @return iterator on @see cl StreamingDataFrame + @return iterator on see :class:`StreamingDataFrame` """ if chunksize is None: - if hasattr(df, 'shape'): + if hasattr(df, "shape"): chunksize = df.shape[0] else: raise NotImplementedError( - f"Cannot retrieve size to infer chunksize for type={type(df)}.") + f"Cannot retrieve size to infer chunksize for type={type(df)}." + ) - if hasattr(df, 'shape'): + if hasattr(df, "shape"): size = df.shape[0] else: raise NotImplementedError( # pragma: no cover - f"Cannot retrieve size for type={type(df)}.") + f"Cannot retrieve size for type={type(df)}." 
+ ) def local_iterator(): "local iterator" for i in range(0, size, chunksize): end = min(size, i + chunksize) yield df[i:end].copy() + return StreamingDataFrame(local_iterator, check_schema=check_schema) def __iter__(self): @@ -403,21 +435,26 @@ def __iter__(self): elif self.check_schema: if list(it.columns) != sch[0]: # pylint: disable=E1136 raise StreamingDataFrameSchemaError( # pragma: no cover - 'Column names are different after row {0}\nFirst chunk: {1}' - '\nCurrent chunk: {2}'.format( - rows, sch[0], list(it.columns))) # pylint: disable=E1136 + "Column names are different after row {0}\nFirst chunk: {1}" + "\nCurrent chunk: {2}".format(rows, sch[0], list(it.columns)) + ) # pylint: disable=E1136 if list(it.dtypes) != sch[1]: # pylint: disable=E1136 errdf = pandas.DataFrame( - dict(names=sch[0], schema1=sch[1], # pylint: disable=E1136 - schema2=list(it.dtypes))) # pylint: disable=E1136 + dict( + names=sch[0], + schema1=sch[1], # pylint: disable=E1136 + schema2=list(it.dtypes), + ) + ) # pylint: disable=E1136 tdf = StringIO() - errdf['diff'] = errdf['schema2'] != errdf['schema1'] - errdf = errdf[errdf['diff']] + errdf["diff"] = errdf["schema2"] != errdf["schema1"] + errdf = errdf[errdf["diff"]] errdf.to_csv(tdf, sep=",", index=False) raise StreamingDataFrameSchemaError( - 'Column types are different after row {0}. You may use option ' + "Column types are different after row {0}. You may use option " 'dtype={{"column_name": str}} to force the type on this column.' - '\n---\n{1}'.format(rows, tdf.getvalue())) + "\n---\n{1}".format(rows, tdf.getvalue()) + ) rows += it.shape[0] yield it @@ -453,7 +490,7 @@ def dtypes(self): for it in self: return it.dtypes - def to_csv(self, path_or_buf=None, **kwargs) -> 'StreamingDataFrame': + def to_csv(self, path_or_buf=None, **kwargs) -> "StreamingDataFrame": """ Saves the :epkg:`DataFrame` into string. See :epkg:`pandas:DataFrame.to_csv`. @@ -463,7 +500,8 @@ def to_csv(self, path_or_buf=None, **kwargs) -> 'StreamingDataFrame': close = False elif isinstance(path_or_buf, str): st = open( # pylint: disable=R1732 - path_or_buf, "w", encoding=kwargs.get('encoding')) + path_or_buf, "w", encoding=kwargs.get("encoding") + ) close = True else: st = path_or_buf @@ -471,7 +509,7 @@ def to_csv(self, path_or_buf=None, **kwargs) -> 'StreamingDataFrame': for df in self: df.to_csv(st, **kwargs) - kwargs['header'] = False + kwargs["header"] = False if close: st.close() @@ -529,43 +567,51 @@ def tail(self, n=5) -> pandas.DataFrame: h = df.tail(n=n) return h - def where(self, *args, **kwargs) -> 'StreamingDataFrame': + def where(self, *args, **kwargs) -> "StreamingDataFrame": """ Applies :epkg:`pandas:DataFrame:where`. *inplace* must be False. - This function returns a @see cl StreamingDataFrame. + This function returns a see :class:`StreamingDataFrame`. """ - kwargs['inplace'] = False + kwargs["inplace"] = False return StreamingDataFrame( - lambda: map(lambda df: df.where(*args, **kwargs), self), - **self.get_kwargs()) + lambda: map(lambda df: df.where(*args, **kwargs), self), **self.get_kwargs() + ) - def sample(self, reservoir=False, cache=False, **kwargs) -> 'StreamingDataFrame': + def sample(self, reservoir=False, cache=False, **kwargs) -> "StreamingDataFrame": """ See :epkg:`pandas:DataFrame:sample`. Only *frac* is available, otherwise choose @see me reservoir_sampling. - This function returns a @see cl StreamingDataFrame. + This function returns a see :class:`StreamingDataFrame`. 
- @param reservoir use `reservoir sampling `_ - @param cache cache the sample - @param kwargs additional parameters for :epkg:`pandas:DataFrame:sample` + :param reservoir: use + `reservoir sampling `_ + :param cache: cache the sample + :param kwargs: additional parameters for :epkg:`pandas:DataFrame:sample` If *cache* is True, the sample is cached (assuming it holds in memory). The second time an iterator walks through the """ - if reservoir or 'n' in kwargs: - if 'frac' in kwargs: - raise ValueError( - 'frac cannot be specified for reservoir sampling.') - return self._reservoir_sampling(cache=cache, n=kwargs['n'], random_state=kwargs.get('random_state')) + if reservoir or "n" in kwargs: + if "frac" in kwargs: + raise ValueError("frac cannot be specified for reservoir sampling.") + return self._reservoir_sampling( + cache=cache, n=kwargs["n"], random_state=kwargs.get("random_state") + ) if cache: sdf = self.sample(cache=False, **kwargs) df = sdf.to_df() return StreamingDataFrame.read_df(df, chunksize=df.shape[0]) - return StreamingDataFrame(lambda: map(lambda df: df.sample(**kwargs), self), **self.get_kwargs(), stable=False) + return StreamingDataFrame( + lambda: map(lambda df: df.sample(**kwargs), self), + **self.get_kwargs(), + stable=False, + ) - def _reservoir_sampling(self, cache=True, n=1000, random_state=None) -> 'StreamingDataFrame': + def _reservoir_sampling( + self, cache=True, n=1000, random_state=None + ) -> "StreamingDataFrame": """ Uses the `reservoir sampling `_ algorithm to draw a random sample with exactly *n* samples. @@ -573,15 +619,14 @@ def _reservoir_sampling(self, cache=True, n=1000, random_state=None) -> 'Streami @param cache cache the sample @param n number of observations to keep @param random_state sets the random_state - @return @see cl StreamingDataFrame + @return see :class:`StreamingDataFrame` .. warning:: The sample is split by chunks of size 1000. This parameter is not yet exposed. """ if not cache: - raise ValueError( - "cache=False is not available for reservoir sampling.") + raise ValueError("cache=False is not available for reservoir sampling.") indices = [] seen = 0 for i, df in enumerate(self): @@ -610,51 +655,75 @@ def reservoir_iterate(sdf, indices, chunksize): yield pandas.DataFrame(buffer) return StreamingDataFrame( - lambda: reservoir_iterate(sdf=self, indices=indices, chunksize=1000)) - - def drop(self, labels=None, *, axis=0, index=None, columns=None, level=None, - inplace=False, errors='raise') -> 'StreamingDataFrame': + lambda: reservoir_iterate(sdf=self, indices=indices, chunksize=1000) + ) + + def drop( + self, + labels=None, + *, + axis=0, + index=None, + columns=None, + level=None, + inplace=False, + errors="raise", + ) -> "StreamingDataFrame": """ Applies :epkg:`pandas:DataFrame:drop`. - This function returns a @see cl StreamingDataFrame. + This function returns a see :class:`StreamingDataFrame`. 
""" if axis == 0: raise NotImplementedError(f"drop is not implemented for axis={axis}.") if inplace: raise NotImplementedError(f"drop is not implemented for inplace={inplace}.") return StreamingDataFrame( - lambda: map(lambda df: df.drop( - labels, axis=axis, index=index, columns=columns, - level=level, inplace=False, errors=errors), self), - **self.get_kwargs()) - - def apply(self, *args, **kwargs) -> 'StreamingDataFrame': + lambda: map( + lambda df: df.drop( + labels, + axis=axis, + index=index, + columns=columns, + level=level, + inplace=False, + errors=errors, + ), + self, + ), + **self.get_kwargs(), + ) + + def apply(self, *args, **kwargs) -> "StreamingDataFrame": """ Applies :epkg:`pandas:DataFrame:apply`. - This function returns a @see cl StreamingDataFrame. + This function returns a see :class:`StreamingDataFrame`. """ return StreamingDataFrame( - lambda: map(lambda df: df.apply(*args, **kwargs), self), - **self.get_kwargs()) + lambda: map(lambda df: df.apply(*args, **kwargs), self), **self.get_kwargs() + ) - def applymap(self, *args, **kwargs) -> 'StreamingDataFrame': + def applymap(self, *args, **kwargs) -> "StreamingDataFrame": """ Applies :epkg:`pandas:DataFrame:applymap`. - This function returns a @see cl StreamingDataFrame. + This function returns a see :class:`StreamingDataFrame`. """ return StreamingDataFrame( lambda: map(lambda df: df.applymap(*args, **kwargs), self), - **self.get_kwargs()) + **self.get_kwargs(), + ) - def merge(self, right, **kwargs) -> 'StreamingDataFrame': + def merge(self, right, **kwargs) -> "StreamingDataFrame": """ - Merges two @see cl StreamingDataFrame and returns @see cl StreamingDataFrame. - *right* can be either a @see cl StreamingDataFrame or simply + Merges two see :class:`StreamingDataFrame` + and returns see :class:`StreamingDataFrame`. + *right* can be either a see :class:`StreamingDataFrame` or simply a :epkg:`pandas:DataFrame`. It calls :epkg:`pandas:DataFrame:merge` in a double loop, loop on *self*, loop on *right*. """ if isinstance(right, pandas.DataFrame): - return self.merge(StreamingDataFrame.read_df(right, chunksize=right.shape[0]), **kwargs) + return self.merge( + StreamingDataFrame.read_df(right, chunksize=right.shape[0]), **kwargs + ) def iterator_merge(sdf1, sdf2, **kw): "iterate on dataframes" @@ -664,18 +733,20 @@ def iterator_merge(sdf1, sdf2, **kw): yield df return StreamingDataFrame( - lambda: iterator_merge(self, right, **kwargs), **self.get_kwargs()) + lambda: iterator_merge(self, right, **kwargs), **self.get_kwargs() + ) - def concat(self, others, axis=0) -> 'StreamingDataFrame': + def concat(self, others, axis=0) -> "StreamingDataFrame": """ - Concatenates :epkg:`dataframes`. The function ensures all :epkg:`pandas:DataFrame` - or @see cl StreamingDataFrame share the same columns (name and type). + Concatenates :epkg:`dataframes`. + The function ensures all :epkg:`pandas:DataFrame` + or see :class:`StreamingDataFrame` share the same columns (name and type). Otherwise, the function fails as it cannot guess the schema without walking through all :epkg:`dataframes`. 
:param others: list, enumeration, :epkg:`pandas:DataFrame` :param axis: concatenate by rows (0) or by columns (1) - :return: @see cl StreamingDataFrame + :return: see :class:`StreamingDataFrame` """ if axis == 1: return self._concath(others) @@ -693,13 +764,14 @@ def iterateh(self, others): nrows = [_.shape[0] for _ in dfs] if min(nrows) != max(nrows): raise RuntimeError( - "StreamingDataFram cannot merge DataFrame with different size or chunksize") + "StreamingDataFram cannot merge DataFrame " + "with different size or chunksize" + ) yield pandas.concat(list(dfs), axis=1) return StreamingDataFrame(lambda: iterateh(self, others), **self.get_kwargs()) def _concatv(self, others): - def iterator_concat(this, lothers): "iterator on dataframes" columns = None @@ -715,10 +787,13 @@ def iterator_concat(this, lothers): if check: if list(columns) != list(df.columns): raise ValueError( - f"Frame others[{i}] do not have the same column names or the same order.") + f"Frame others[{i}] do not have the " + f"same column names or the same order." + ) if list(dtypes) != list(df.dtypes): raise ValueError( - f"Frame others[{i}] do not have the same column types.") + f"Frame others[{i}] do not have the same column types." + ) check = False yield df @@ -736,23 +811,25 @@ def change_type(obj): others = list(map(change_type, others)) return StreamingDataFrame( - lambda: iterator_concat(self, others), **self.get_kwargs()) + lambda: iterator_concat(self, others), **self.get_kwargs() + ) - def groupby(self, by=None, lambda_agg=None, lambda_agg_agg=None, - in_memory=True, **kwargs) -> pandas.DataFrame: + def groupby( + self, by=None, lambda_agg=None, lambda_agg_agg=None, in_memory=True, **kwargs + ) -> pandas.DataFrame: """ Implements the streaming :epkg:`pandas:DataFrame:groupby`. We assume the result holds in memory. The out-of-memory is not implemented yet. - @param by see :epkg:`pandas:DataFrame:groupby` - @param in_memory in-memory algorithm - @param lambda_agg aggregation function, *sum* by default - @param lambda_agg_agg to aggregate the aggregations, *sum* by default - @param kwargs additional parameters for :epkg:`pandas:DataFrame:groupby` - @return :epkg:`pandas:DataFrame` + :param by: see :epkg:`pandas:DataFrame:groupby` + :param in_memory: in-memory algorithm + :param lambda_agg: aggregation function, *sum* by default + :param lambda_agg_agg: to aggregate the aggregations, *sum* by default + :param kwargs: additional parameters for :epkg:`pandas:DataFrame:groupby` + :return: :epkg:`pandas:DataFrame` - As the input @see cl StreamingDataFrame does not necessarily hold + As the input see :class:`StreamingDataFrame` does not necessarily hold in memory, the aggregation must be done at every iteration. There are two levels of aggregation: one to reduce every iterated :epkg:`dataframe`, another one to combine all the reduced :epkg:`dataframes`. @@ -772,7 +849,7 @@ def groupby(self, by=None, lambda_agg=None, lambda_agg_agg=None, :tag: streaming Here is an example which shows how to write a simple *groupby* - with :epkg:`pandas` and @see cl StreamingDataFrame. + with :epkg:`pandas` and see :class:`StreamingDataFrame`. .. 
runpython:: :showcode: @@ -790,17 +867,20 @@ def groupby(self, by=None, lambda_agg=None, lambda_agg_agg=None, print(df.groupby("A").sum()) """ if not in_memory: - raise NotImplementedError( - "Out-of-memory group by is not implemented.") + raise NotImplementedError("Out-of-memory group by is not implemented.") if lambda_agg is None: + def lambda_agg_(gr): "sum" return gr.sum() + lambda_agg = lambda_agg_ if lambda_agg_agg is None: + def lambda_agg_agg_(gr): "sum" return gr.sum() + lambda_agg_agg = lambda_agg_agg_ ckw = kwargs.copy() ckw["as_index"] = False @@ -812,8 +892,15 @@ def lambda_agg_agg_(gr): conc = pandas.concat(agg, sort=False) return lambda_agg_agg(conc.groupby(by=by, **kwargs)) - def groupby_streaming(self, by=None, lambda_agg=None, lambda_agg_agg=None, in_memory=True, - strategy='cum', **kwargs) -> pandas.DataFrame: + def groupby_streaming( + self, + by=None, + lambda_agg=None, + lambda_agg_agg=None, + in_memory=True, + strategy="cum", + **kwargs, + ) -> pandas.DataFrame: """ Implements the streaming :epkg:`pandas:DataFrame:groupby`. We assume the result holds in memory. The out-of-memory is @@ -827,7 +914,7 @@ def groupby_streaming(self, by=None, lambda_agg=None, lambda_agg_agg=None, in_me :param strategy: ``'cum'``, or ``'streaming'``, see below :return: :epkg:`pandas:DataFrame` - As the input @see cl StreamingDataFrame does not necessarily hold + As the input see :class:`StreamingDataFrame` does not necessarily hold in memory, the aggregation must be done at every iteration. There are two levels of aggregation: one to reduce every iterated :epkg:`dataframe`, another one to combine all the reduced :epkg:`dataframes`. @@ -846,7 +933,7 @@ def groupby_streaming(self, by=None, lambda_agg=None, lambda_agg_agg=None, in_me First one if ``strategy is None`` goes through the whole datasets to produce a final :epkg:`DataFrame`. Second if ``strategy=='cum'`` returns a - @see cl StreamingDataFrame, each iteration produces + see :class:`StreamingDataFrame`, each iteration produces the current status of the *group by*. Last case, ``strategy=='streaming'`` produces :epkg:`DataFrame` which must be concatenated into a single :epkg:`DataFrame` @@ -857,7 +944,7 @@ def groupby_streaming(self, by=None, lambda_agg=None, lambda_agg_agg=None, in_me :tag: streaming Here is an example which shows how to write a simple *groupby* - with :epkg:`pandas` and @see cl StreamingDataFrame. + with :epkg:`pandas` and see :class:`StreamingDataFrame`. .. 
runpython:: :showcode: @@ -876,22 +963,26 @@ def groupby_streaming(self, by=None, lambda_agg=None, lambda_agg_agg=None, in_me print(gr) """ if not in_memory: - raise NotImplementedError( - "Out-of-memory group by is not implemented.") + raise NotImplementedError("Out-of-memory group by is not implemented.") if lambda_agg is None: + def lambda_agg_(gr): "sum" return gr.sum() + lambda_agg = lambda_agg_ if lambda_agg_agg is None: + def lambda_agg_agg_(gr): "sum" return gr.sum() + lambda_agg_agg = lambda_agg_agg_ ckw = kwargs.copy() ckw["as_index"] = False - if strategy == 'cum': + if strategy == "cum": + def iterate_cum(): agg = None for df in self: @@ -904,18 +995,20 @@ def iterate_cum(): lagg = pandas.concat([agg, gragg], sort=False) yield lambda_agg_agg(lagg.groupby(by=by, **kwargs)) agg = lagg + return StreamingDataFrame(lambda: iterate_cum(), **self.get_kwargs()) - if strategy == 'streaming': + if strategy == "streaming": + def iterate_streaming(): for df in self: gr = df.groupby(by=by, **ckw) gragg = lambda_agg(gr) yield lambda_agg(gragg.groupby(by=by, **kwargs)) + return StreamingDataFrame(lambda: iterate_streaming(), **self.get_kwargs()) - raise ValueError( # pragma: no cover - f"Unknown strategy '{strategy}'") + raise ValueError(f"Unknown strategy '{strategy}'") # pragma: no cover def ensure_dtype(self, df, dtypes): """ @@ -942,7 +1035,8 @@ def __getitem__(self, *args): """ if len(args) != 1: raise NotImplementedError( # pragma: no cover - "Only a list of columns is supported.") + "Only a list of columns is supported." + ) cols = args[0] if isinstance(cols, str): # One column. @@ -953,6 +1047,7 @@ def iterate_col(): one_col = [cols] for df in iter_creation(): yield df[one_col] + return StreamingSeries(iterate_col, **self.get_kwargs()) if not isinstance(cols, list): @@ -970,8 +1065,7 @@ def __setitem__(self, index, value): Limited set of operators are supported. """ if not isinstance(index, str): - raise ValueError( - f"Only column affected are supported but index={index!r}.") + raise ValueError(f"Only column affected are supported but index={index!r}.") if isinstance(value, (int, float, numpy.number, str)): # Is is equivalent to add_column. iter_creation = self.iter_creation @@ -997,7 +1091,8 @@ def iterate_fct(): raise RuntimeError( "Chunksize or shape are different when " "iterating on two StreamDataFrame at the same " - "time: %r != %r." % (df.shape[0], dfs.shape[0])) + "time: %r != %r." % (df.shape[0], dfs.shape[0]) + ) dfc = df.copy() dfc[index] = dfs yield dfc @@ -1005,8 +1100,9 @@ def iterate_fct(): self.iter_creation = iterate_fct else: raise NotImplementedError( - "Not implemented for type(index)=%r and type(value)=%r." % ( - type(index), type(value))) + "Not implemented for type(index)=%r and type(value)=%r." + % (type(index), type(value)) + ) def add_column(self, col, value): """ @@ -1014,12 +1110,12 @@ def add_column(self, col, value): offers for the operator ``[]``. @param col new column - @param value @see cl StreamingDataFrame or a lambda function - @return @see cl StreamingDataFrame + @param value see :class:`StreamingDataFrame` or a lambda function + @return see :class:`StreamingDataFrame` ..note:: - If value is a @see cl StreamingDataFrame, + If value is a see :class:`StreamingDataFrame`, *chunksize* must be the same for both. .. exref:: @@ -1043,9 +1139,11 @@ def add_column(self, col, value): """ if not isinstance(col, str): raise NotImplementedError( # pragma: no cover - "Only a column as a string is supported.") + "Only a column as a string is supported." 
+            )
         if isfunction(value):
+
             def iterate_fct(self, value, col):
                 "iterate on rows"
                 for df in self:
@@ -1053,11 +1151,12 @@ def iterate_fct(self, value, col):
                     dfc.insert(dfc.shape[1], col, dfc.apply(value, axis=1))
                     yield dfc
 
-            return StreamingDataFrame(lambda: iterate_fct(self, value, col), **self.get_kwargs())
+            return StreamingDataFrame(
+                lambda: iterate_fct(self, value, col), **self.get_kwargs()
+            )
 
         if isinstance(value, (pandas.Series, pandas.DataFrame, StreamingDataFrame)):
-            raise NotImplementedError(
-                "Unable set a new column based on a datadframe.")
+            raise NotImplementedError("Unable to set a new column based on a dataframe.")
 
         def iterate_cst(self, value, col):
             "iterate on rows"
@@ -1067,7 +1166,8 @@ def iterate_cst(self, value, col):
                 yield dfc
 
         return StreamingDataFrame(
-            lambda: iterate_cst(self, value, col), **self.get_kwargs())
+            lambda: iterate_cst(self, value, col), **self.get_kwargs()
+        )
 
     def fillna(self, **kwargs):
         """
@@ -1075,7 +1175,7 @@ def fillna(self, **kwargs):
         :epkg:`pandas:DataFrame:fillna`.
 
         @param      kwargs      see :epkg:`pandas:DataFrame:fillna`
-        @return                 @see cl StreamingDataFrame
+        @return                 see :class:`StreamingDataFrame`
 
         .. warning::
             The function does not check what happens at the
@@ -1085,8 +1185,8 @@ def fillna(self, **kwargs):
 
         def iterate_na(self, **kwargs):
             "iterate on rows"
-            if kwargs.get('inplace', True):
-                kwargs['inplace'] = True
+            if kwargs.get("inplace", True):
+                kwargs["inplace"] = True
                 for df in self:
                     df.fillna(**kwargs)
                     yield df
@@ -1095,7 +1195,8 @@ def iterate_na(self, **kwargs):
                     yield df.fillna(**kwargs)
 
         return StreamingDataFrame(
-            lambda: iterate_na(self, **kwargs), **self.get_kwargs())
+            lambda: iterate_na(self, **kwargs), **self.get_kwargs()
+        )
 
     def describe(self, percentiles=None, include=None, exclude=None):
         """
@@ -1115,31 +1216,36 @@ def describe(self, percentiles=None, include=None, exclude=None):
         """
         merged = None
         stack = []
-        notper = ['count', 'mean', 'std']
+        notper = ["count", "mean", "std"]
         for df in self:
             desc = df.describe(
-                percentiles=percentiles, include=include, exclude=exclude)
-            count = desc.loc['count', :]
+                percentiles=percentiles, include=include, exclude=exclude
+            )
+            count = desc.loc["count", :]
             rows = [name for name in desc.index if name not in notper]
             stack.append(desc.loc[rows, :])
             if merged is None:
                 merged = desc
-                merged.loc['std', :] = (
-                    merged.loc['std', :] ** 2 + merged.loc['mean', :] ** 2) * count
-                merged.loc['mean', :] *= count
+                merged.loc["std", :] = (
+                    merged.loc["std", :] ** 2 + merged.loc["mean", :] ** 2
+                ) * count
+                merged.loc["mean", :] *= count
             else:
-                merged.loc['count', :] += desc.loc['count', :]
-                merged.loc['mean', :] += desc.loc['mean', :] * count
-                merged.loc['std', :] += (
-                    desc.loc['std', :] ** 2 + desc.loc['mean', :] ** 2) * count
-                merged.loc['max', :] = numpy.maximum(
-                    merged.loc['max', :], desc.loc['max', :])
-                merged.loc['min', :] = numpy.maximum(
-                    merged.loc['min', :], desc.loc['min', :])
-        merged.loc['mean', :] /= merged.loc['count', :]
-        merged.loc['std', :] = (
-            merged.loc['std', :] / merged.loc['count', :] -
-            merged.loc['mean', :] ** 2) ** 0.5
+                merged.loc["count", :] += desc.loc["count", :]
+                merged.loc["mean", :] += desc.loc["mean", :] * count
+                merged.loc["std", :] += (
+                    desc.loc["std", :] ** 2 + desc.loc["mean", :] ** 2
+                ) * count
+                merged.loc["max", :] = numpy.maximum(
+                    merged.loc["max", :], desc.loc["max", :]
+                )
+                merged.loc["min", :] = numpy.minimum(
+                    merged.loc["min", :], desc.loc["min", :]
+                )
+        merged.loc["mean", :] /= merged.loc["count", :]
+        merged.loc["std", :] = 
( + merged.loc["std", :] / merged.loc["count", :] - merged.loc["mean", :] ** 2 + ) ** 0.5 values = pandas.concat(stack) summary = values.describe(percentiles=percentiles) merged = merged.loc[notper, :] @@ -1147,9 +1253,15 @@ def describe(self, percentiles=None, include=None, exclude=None): summary = summary.loc[rows, :] return pandas.concat([merged, summary]) - def sort_values(self, by, axis=0, ascending=True, kind='quicksort', - na_position='last', - temp_file='_pandas_streaming_sort_values_'): + def sort_values( + self, + by, + axis=0, + ascending=True, + kind="quicksort", + na_position="last", + temp_file="_pandas_streaming_sort_values_", + ): """ Sorts the streaming dataframe by values. @@ -1166,14 +1278,16 @@ def sort_values(self, by, axis=0, ascending=True, kind='quicksort', """ if not isinstance(by, str): raise NotImplementedError( # pragma: no cover - f"Only one column can be used to sort not {by!r}.") + f"Only one column can be used to sort not {by!r}." + ) keys = {} nans = [] indices = [] - with open(temp_file, 'wb') as f: + with open(temp_file, "wb") as f: for df in self: - dfs = df.sort_values(by, ascending=ascending, kind=kind, - na_position=na_position) + dfs = df.sort_values( + by, ascending=ascending, kind=kind, na_position=na_position + ) for tu in dfs[by]: if isinstance(tu, float) and numpy.isnan(tu): nans.append(len(indices)) @@ -1192,10 +1306,8 @@ def sort_values(self, by, axis=0, ascending=True, kind='quicksort', values.sort(reverse=not ascending) def iterate(): - - with open(temp_file, 'rb') as f: - - if na_position == 'first': + with open(temp_file, "rb") as f: + if na_position == "first": for p in nans: f.seek(indices[p]) length = indices[p + 1] - indices[p] @@ -1213,7 +1325,7 @@ def iterate(): sub = dfs[dfs[by] == key] yield sub - if na_position == 'last': + if na_position == "last": for p in nans: f.seek(indices[p]) length = indices[p + 1] - indices[p] @@ -1222,8 +1334,7 @@ def iterate(): sub = dfs[numpy.isnan(dfs[by])] yield sub - res = StreamingDataFrame( - lambda: iterate(), **self.get_kwargs()) + res = StreamingDataFrame(lambda: iterate(), **self.get_kwargs()) res._delete_.append(lambda: os.remove(temp_file)) return res @@ -1237,25 +1348,26 @@ def __del__(self): class StreamingSeries(StreamingDataFrame): """ - Seens as a @see cl StreamingDataFrame of one column. + Seens as a see :class:`StreamingDataFrame` of one column. """ def __init__(self, iter_creation, check_schema=True, stable=True): StreamingDataFrame.__init__( - self, iter_creation, check_schema=check_schema, stable=stable) + self, iter_creation, check_schema=check_schema, stable=stable + ) if len(self.columns) != 1: raise RuntimeError( # pragma: no cover - f"A series can contain only one column not " - f"{len(self.columns)!r}.") + f"A series can contain only one column not " f"{len(self.columns)!r}." + ) - def apply(self, *args, **kwargs) -> 'StreamingDataFrame': + def apply(self, *args, **kwargs) -> "StreamingDataFrame": """ Applies :epkg:`pandas:Series:apply`. This function returns a @see cl StreamingSeries. 
""" return StreamingSeries( - lambda: map(lambda df: df.apply(*args, **kwargs), self), - **self.get_kwargs()) + lambda: map(lambda df: df.apply(*args, **kwargs), self), **self.get_kwargs() + ) def __add__(self, value): """ @@ -1264,6 +1376,7 @@ def __add__(self, value): :param value: any value which makes sense :return: a new series """ + def iterate(): for df in self: yield df + value diff --git a/pandas_streaming/df/dataframe_helpers.py b/pandas_streaming/df/dataframe_helpers.py index 3dc8f3a..b85d78a 100644 --- a/pandas_streaming/df/dataframe_helpers.py +++ b/pandas_streaming/df/dataframe_helpers.py @@ -1,8 +1,3 @@ -# -*- coding: utf-8 -*- -""" -@file -@brief Helpers for dataframes. -""" import hashlib import struct import warnings @@ -17,25 +12,27 @@ def numpy_types(): :return: list of types """ - return [numpy.bool_, - numpy.int_, - numpy.intc, - numpy.intp, - numpy.int8, - numpy.int16, - numpy.int32, - numpy.int64, - numpy.uint8, - numpy.uint16, - numpy.uint32, - numpy.uint64, - numpy.float_, - numpy.float16, - numpy.float32, - numpy.float64, - numpy.complex_, - numpy.complex64, - numpy.complex128] + return [ + numpy.bool_, + numpy.int_, + numpy.intc, + numpy.intp, + numpy.int8, + numpy.int16, + numpy.int32, + numpy.int64, + numpy.uint8, + numpy.uint16, + numpy.uint32, + numpy.uint64, + numpy.float_, + numpy.float16, + numpy.float32, + numpy.float64, + numpy.complex_, + numpy.complex64, + numpy.complex128, + ] def hash_str(c, hash_length): @@ -78,7 +75,7 @@ def hash_int(c, hash_length): r = m.hexdigest() if len(r) >= hash_length: r = r[:hash_length] - return int(r, 16) % (10 ** 8) + return int(r, 16) % (10**8) def hash_float(c, hash_length): @@ -98,7 +95,7 @@ def hash_float(c, hash_length): r = m.hexdigest() if len(r) >= hash_length: r = r[:hash_length] - i = int(r, 16) % (2 ** 53) + i = int(r, 16) % (2**53) return float(i) @@ -153,8 +150,9 @@ def hash_floatl(c): "hash float" return hash_float(c, hash_length) - coltype = {n: t for n, t in zip( # pylint: disable=R1721 - df.columns, df.dtypes)} # pylint: disable=R1721 + coltype = { + n: t for n, t in zip(df.columns, df.dtypes) # pylint: disable=R1721 + } # pylint: disable=R1721 for c in cols: t = coltype[c] if t == int: @@ -167,7 +165,8 @@ def hash_floatl(c): df[c] = df[c].apply(hash_strl) else: raise NotImplementedError( # pragma: no cover - f"Conversion of type {t} in column '{c}' is not implemented") + f"Conversion of type {t} in column '{c}' is not implemented" + ) return df @@ -204,8 +203,9 @@ def dataframe_unfold(df, col, new_col=None, sep=","): print(df2) # To fold: - folded = df2.groupby('a').apply(lambda row: ','.join(row['b_unfold'].dropna()) \\ - if len(row['b_unfold'].dropna()) > 0 else numpy.nan) + folded = df2.groupby('a').apply( + lambda row: ','.join(row['b_unfold'].dropna()) + if len(row['b_unfold'].dropna()) > 0 else numpy.nan) print('----------') print(folded) """ @@ -213,7 +213,7 @@ def dataframe_unfold(df, col, new_col=None, sep=","): col_name = col + "_unfold" else: col_name = new_col - temp_col = '__index__' + temp_col = "__index__" while temp_col in df.columns: temp_col += "_" rows = [] @@ -306,7 +306,9 @@ def pandas_fillna(df, by, hasna=None, suffix=None): else: raise TypeError( # pragma: no cover "Unable to determine a constant for type='{0}' dtype='{1}'".format( - val, df[c].dtype)) + val, df[c].dtype + ) + ) val += cst while val in se: val += suffix @@ -318,17 +320,20 @@ def pandas_fillna(df, by, hasna=None, suffix=None): ma = abs(dr.max()) val = ma + mi if val == ma and not isinstance(val, str): - val += ma 
+ 1. + val += ma + 1.0 if val <= ma: raise ValueError( # pragma: no cover "Unable to find a different value for column '{}' v='{}: " - "min={} max={}".format(c, val, mi, ma)) + "min={} max={}".format(c, val, mi, ma) + ) df[c].fillna(val, inplace=True) rep[c] = val return rep, df -def pandas_groupby_nan(df, by, axis=0, as_index=False, suffix=None, nanback=True, **kwargs): +def pandas_groupby_nan( + df, by, axis=0, as_index=False, suffix=None, nanback=True, **kwargs +): """ Does a *groupby* including keeping missing values (:epkg:`nan`). @@ -391,8 +396,7 @@ def pandas_groupby_nan(df, by, axis=0, as_index=False, suffix=None, nanback=True """ if nanback and suffix is None: try: - res = df.groupby(by, axis=axis, as_index=as_index, - dropna=False, **kwargs) + res = df.groupby(by, axis=axis, as_index=as_index, dropna=False, **kwargs) except TypeError: # old version of pandas res = None @@ -421,71 +425,91 @@ def pandas_groupby_nan(df, by, axis=0, as_index=False, suffix=None, nanback=True if not nanback: dummy = DataFrame([{"a": "a"}]) do = dummy.dtypes[0] - typ = {c: t for c, t in zip( # pylint: disable=R1721 - df.columns, df.dtypes)} # pylint: disable=R1721 + typ = { + c: t for c, t in zip(df.columns, df.dtypes) # pylint: disable=R1721 + } # pylint: disable=R1721 if typ[by[0]] != do: warnings.warn( # pragma: no cover - f"[pandas_groupby_nan] NaN value: {rep}") + f"[pandas_groupby_nan] NaN value: {rep}" + ) return res for b in by: fnan = rep[b] if fnan in res.grouper.groups: res.grouper.groups[numpy.nan] = res.grouper.groups[fnan] del res.grouper.groups[fnan] - new_val = list((numpy.nan if b == fnan else b) - for b in res.grouper.result_index) + new_val = list( + (numpy.nan if b == fnan else b) for b in res.grouper.result_index + ) res.grouper.groupings[0]._group_index = Index(new_val) - res.grouper.groupings[0].obj[b].replace( - fnan, numpy.nan, inplace=True) - if hasattr(res.grouper, 'grouping'): + res.grouper.groupings[0].obj[b].replace(fnan, numpy.nan, inplace=True) + if hasattr(res.grouper, "grouping"): if isinstance(res.grouper.groupings[0].grouper, numpy.ndarray): arr = numpy.array(new_val) res.grouper.groupings[0].grouper = arr - if (hasattr(res.grouper.groupings[0], '_cache') and - 'result_index' in res.grouper.groupings[0]._cache): - del res.grouper.groupings[0]._cache['result_index'] + if ( + hasattr(res.grouper.groupings[0], "_cache") + and "result_index" in res.grouper.groupings[0]._cache + ): + del res.grouper.groupings[0]._cache["result_index"] else: - raise NotImplementedError("Not implemented for type: {0}".format( - type(res.grouper.groupings[0].grouper))) + raise NotImplementedError( + "Not implemented for type: {0}".format( + type(res.grouper.groupings[0].grouper) + ) + ) else: grouper = res.grouper._get_grouper() if isinstance(grouper, numpy.ndarray): arr = numpy.array(new_val) res.grouper.groupings[0].grouping_vector = arr - if (hasattr(res.grouper.groupings[0], '_cache') and - 'result_index' in res.grouper.groupings[0]._cache): - index = res.grouper.groupings[0]._cache['result_index'] + if ( + hasattr(res.grouper.groupings[0], "_cache") + and "result_index" in res.grouper.groupings[0]._cache + ): + index = res.grouper.groupings[0]._cache["result_index"] if len(rep) == 1: key = list(rep.values())[0] new_index = numpy.array(index) - for i in range(0, len(new_index)): # pylint: disable=C0200 + for i in range( + 0, len(new_index) + ): # pylint: disable=C0200 if new_index[i] == key: new_index[i] = numpy.nan - res.grouper.groupings[0]._cache['result_index'] = ( - 
index.__class__(new_index)) + res.grouper.groupings[0]._cache[ + "result_index" + ] = index.__class__(new_index) else: raise NotImplementedError( # pragma: no cover - "NaN values not implemented for multiindex.") + "NaN values not implemented for multiindex." + ) else: raise NotImplementedError( # pragma: no cover "Not implemented for type: {0}".format( - type(res.grouper.groupings[0].grouper))) - res.grouper._cache['result_index'] = res.grouper.groupings[0]._group_index + type(res.grouper.groupings[0].grouper) + ) + ) + res.grouper._cache["result_index"] = res.grouper.groupings[ + 0 + ]._group_index else: if not nanback: dummy = DataFrame([{"a": "a"}]) do = dummy.dtypes[0] - typ = {c: t for c, t in zip( # pylint: disable=R1721 - df.columns, df.dtypes)} # pylint: disable=R1721 + typ = { + c: t for c, t in zip(df.columns, df.dtypes) # pylint: disable=R1721 + } # pylint: disable=R1721 for b in by: if typ[b] != do: warnings.warn( # pragma: no cover - f"[pandas_groupby_nan] NaN values: {rep}") + f"[pandas_groupby_nan] NaN values: {rep}" + ) break return res raise NotImplementedError( "Not yet implemented. Replacing pseudo nan values by real nan " - "values is not as easy as it looks. Use nanback=False") + "values is not as easy as it looks. Use nanback=False" + ) # keys = list(res.grouper.groups.keys()) # didit = False @@ -528,7 +552,8 @@ def pandas_groupby_nan(df, by, axis=0, as_index=False, suffix=None, nanback=True # grou.grouper = numpy.array(new_val) # else: # raise NotImplementedError( - # "Not implemented for type: {0}".format(type(grou.grouper))) + # "Not implemented for type: {0}".format( + # type(grou.grouper))) # del res.grouper._cache return res return df.groupby(by, axis=axis, **kwargs) diff --git a/pandas_streaming/df/dataframe_io.py b/pandas_streaming/df/dataframe_io.py index 30d0fb8..532a2bf 100644 --- a/pandas_streaming/df/dataframe_io.py +++ b/pandas_streaming/df/dataframe_io.py @@ -1,8 +1,3 @@ -# -*- coding: utf-8 -*- -""" -@file -@brief Saves and reads a :epkg:`dataframe` into a :epkg:`zip` file. -""" import io import os import zipfile @@ -66,35 +61,38 @@ def to_zip(df, zipfilename, zname="df.csv", **kwargs): if isinstance(df, pandas.DataFrame): stb = io.StringIO() ext = os.path.splitext(zname)[-1] - if ext == '.npy': + if ext == ".npy": raise ValueError( # pragma: no cover - "Extension '.npy' cannot be used to save a dataframe.") + "Extension '.npy' cannot be used to save a dataframe." + ) df.to_csv(stb, **kwargs) elif isinstance(df, numpy.ndarray): stb = io.BytesIO() ext = os.path.splitext(zname)[-1] - if ext != '.npy': + if ext != ".npy": raise ValueError( # pragma: no cover - "Extension '.npy' is required when saving a numpy array.") + "Extension '.npy' is required when saving a numpy array." + ) numpy.save(stb, df, **kwargs) else: - raise TypeError( # pragma: no cover - f"Type not handled {type(df)}") + raise TypeError(f"Type not handled {type(df)}") # pragma: no cover text = stb.getvalue() if isinstance(zipfilename, str): ext = os.path.splitext(zipfilename)[-1] - if ext != '.zip': + if ext != ".zip": raise NotImplementedError( # pragma: no cover - f"Only zip file are implemented not '{ext}'.") - zf = zipfile.ZipFile(zipfilename, 'w') # pylint: disable=R1732 + f"Only zip file are implemented not '{ext}'." 
+ ) + zf = zipfile.ZipFile(zipfilename, "w") # pylint: disable=R1732 close = True elif isinstance(zipfilename, zipfile.ZipFile): zf = zipfilename close = False else: raise TypeError( # pragma: no cover - f"No implementation for type '{type(zipfilename)}'") + f"No implementation for type '{type(zipfilename)}'" + ) zf.writestr(zname, text) if close: @@ -113,24 +111,26 @@ def read_zip(zipfilename, zname=None, **kwargs): """ if isinstance(zipfilename, str): ext = os.path.splitext(zipfilename)[-1] - if ext != '.zip': + if ext != ".zip": raise NotImplementedError( # pragma: no cover - f"Only zip files are supported not '{ext}'.") - zf = zipfile.ZipFile(zipfilename, 'r') # pylint: disable=R1732 + f"Only zip files are supported not '{ext}'." + ) + zf = zipfile.ZipFile(zipfilename, "r") # pylint: disable=R1732 close = True elif isinstance(zipfilename, zipfile.ZipFile): zf = zipfilename close = False else: raise TypeError( # pragma: no cover - f"No implementation for type '{type(zipfilename)}'") + f"No implementation for type '{type(zipfilename)}'" + ) if zname is None: zname = zf.namelist()[0] content = zf.read(zname) stb = io.BytesIO(content) ext = os.path.splitext(zname)[-1] - if ext == '.npy': + if ext == ".npy": df = numpy.load(stb, **kwargs) else: df = pandas.read_csv(stb, **kwargs) diff --git a/pandas_streaming/df/dataframe_io_helpers.py b/pandas_streaming/df/dataframe_io_helpers.py index 4ae503b..d956cf9 100644 --- a/pandas_streaming/df/dataframe_io_helpers.py +++ b/pandas_streaming/df/dataframe_io_helpers.py @@ -1,10 +1,6 @@ -# -*- coding: utf-8 -*- -""" -@file -@brief Saves and reads a :epkg:`dataframe` into a :epkg:`zip` file. -""" import os from io import StringIO, BytesIO + try: from ujson import dumps except ImportError: # pragma: no cover @@ -46,10 +42,10 @@ def readline(self, size=-1): if size == 0: return text if self.newline: - text = ',' + text + text = "," + text self.newline = False elif self.begin: - text = '[' + text + text = "[" + text self.begin = False if text.endswith("\n"): @@ -58,7 +54,7 @@ def readline(self, size=-1): if len(text) == 0 or len(text) < size: if self.end: self.end = False - return text + ']' + return text + "]" return text return text @@ -76,7 +72,7 @@ def read(self, size=-1): if size == 0: return text if len(text) > 1: - t1, t2 = text[:len(text) - 1], text[len(text) - 1:] + t1, t2 = text[: len(text) - 1], text[len(text) - 1 :] t1 = t1.replace(cst[0], cst[1]) text = t1 + t2 @@ -101,11 +97,13 @@ def getvalue(self): """ Returns the whole stream content. 
""" + def byline(): line = self.readline() while line: yield line line = self.readline() + return "".join(byline()) @@ -129,8 +127,7 @@ def _flatten(obj, key): elif isinstance(obj, dict): for k, v in obj.items(): if not isinstance(k, str): - raise TypeError( - "All keys must a string.") # pragma: no cover + raise TypeError("All keys must a string.") # pragma: no cover k2 = k if key is None else f"{key}{sep}{k}" _flatten(v, k2) elif isinstance(obj, (list, set)): @@ -234,27 +231,28 @@ def enumerate_json_items(filename, encoding=None, lines=False, flatten=False): if "{" not in filename and os.path.exists(filename): with open(filename, "r", encoding=encoding) as f: for el in enumerate_json_items( - f, encoding=encoding, lines=lines, - flatten=flatten): + f, encoding=encoding, lines=lines, flatten=flatten + ): yield el else: st = StringIO(filename) for el in enumerate_json_items( - st, encoding=encoding, lines=lines, - flatten=flatten): + st, encoding=encoding, lines=lines, flatten=flatten + ): yield el elif isinstance(filename, bytes): st = BytesIO(filename) for el in enumerate_json_items( - st, encoding=encoding, lines=lines, flatten=flatten): + st, encoding=encoding, lines=lines, flatten=flatten + ): yield el elif lines: for el in enumerate_json_items( - JsonPerRowsStream(filename), - encoding=encoding, lines=False, flatten=flatten): + JsonPerRowsStream(filename), encoding=encoding, lines=False, flatten=flatten + ): yield el else: - if hasattr(filename, 'seek'): + if hasattr(filename, "seek"): filename.seek(0) parser = ijson.parse(filename) current = None @@ -264,14 +262,16 @@ def enumerate_json_items(filename, encoding=None, lines=False, flatten=False): for i, (_, event, value) in enumerate(parser): if i % 1000000 == 0 and fLOG is not None: fLOG( # pragma: no cover - f"[enumerate_json_items] i={i} yielded={nbyield}") + f"[enumerate_json_items] i={i} yielded={nbyield}" + ) if event == "start_array": if curkey is None: current = [] else: if not isinstance(current, dict): raise RuntimeError( # pragma: no cover - f"Type issue {type(current)}") + f"Type issue {type(current)}" + ) c = [] current[curkey] = c # pylint: disable=E1137 current = c @@ -321,8 +321,7 @@ def enumerate_json_items(filename, encoding=None, lines=False, flatten=False): current[curkey] = None # pylint: disable=E1137 curkey = None else: - raise ValueError( - f"Unknown event '{event}'") # pragma: no cover + raise ValueError(f"Unknown event '{event}'") # pragma: no cover class JsonIterator2Stream: @@ -418,8 +417,7 @@ def seek(self, offset): :param offset: offset, only 0 is implemented """ if offset != 0: - raise NotImplementedError( - "The iterator can only return at the beginning.") + raise NotImplementedError("The iterator can only return at the beginning.") self.it0 = self.it() def write(self): diff --git a/pandas_streaming/df/dataframe_split.py b/pandas_streaming/df/dataframe_split.py index 197d4ff..bb7ea33 100644 --- a/pandas_streaming/df/dataframe_split.py +++ b/pandas_streaming/df/dataframe_split.py @@ -1,8 +1,3 @@ -# -*- coding: utf-8 -*- -""" -@file -@brief Implements different methods to split a dataframe. -""" import hashlib import pickle import random @@ -11,15 +6,16 @@ import pandas -def sklearn_train_test_split(self, path_or_buf=None, export_method="to_csv", - names=None, **kwargs): +def sklearn_train_test_split( + self, path_or_buf=None, export_method="to_csv", names=None, **kwargs +): """ Randomly splits a dataframe into smaller pieces. The function returns streams of file names. 
The function relies on :epkg:`sklearn:model_selection:train_test_split`. It does not handle stratified version of it. - @param self @see cl StreamingDataFrame + @param self see :class:`StreamingDataFrame` @param path_or_buf a string, a list of strings or buffers, if it is a string, it must contain ``{}`` like ``partition{}.txt`` @param export_method method used to store the partitions, by default @@ -30,7 +26,7 @@ def sklearn_train_test_split(self, path_or_buf=None, export_method="to_csv", @return outputs of the exports functions The function cannot return two iterators or two - @see cl StreamingDataFrame because running through one + see :class:`StreamingDataFrame` because running through one means running through the other. We can assume both splits do not hold in memory and we cannot run through the same iterator again as random draws would be different. @@ -42,13 +38,13 @@ def sklearn_train_test_split(self, path_or_buf=None, export_method="to_csv", """ if kwargs.get("stratify") is not None: raise NotImplementedError( # pragma: no cover - "No implementation yet for the stratified version.") + "No implementation yet for the stratified version." + ) with warnings.catch_warnings(): warnings.filterwarnings("ignore", category=ImportWarning) from sklearn.model_selection import train_test_split # pylint: disable=C0415 - opts = ['test_size', 'train_size', - 'random_state', 'shuffle', 'stratify'] + opts = ["test_size", "train_size", "random_state", "shuffle", "stratify"] split_ops = {} for o in opts: if o in kwargs: @@ -56,27 +52,28 @@ def sklearn_train_test_split(self, path_or_buf=None, export_method="to_csv", del kwargs[o] exportf_ = getattr(pandas.DataFrame, export_method) - if export_method == 'to_csv' and 'mode' not in kwargs: - exportf = lambda *a, **kw: exportf_(*a, mode='a', **kw) + if export_method == "to_csv" and "mode" not in kwargs: + exportf = lambda *a, **kw: exportf_(*a, mode="a", **kw) # noqa: E731 else: exportf = exportf_ if isinstance(path_or_buf, str): if "{}" not in path_or_buf: - raise ValueError( - "path_or_buf must contain {} to insert the partition name") + raise ValueError("path_or_buf must contain {} to insert the partition name") if names is None: - names = ['train', 'test'] + names = ["train", "test"] elif len(names) != len(path_or_buf): raise ValueError( # pragma: no cover - 'names and path_or_buf must have the same length') + "names and path_or_buf must have the same length" + ) path_or_buf = [path_or_buf.format(n) for n in names] elif path_or_buf is None: path_or_buf = [None, None] else: if not isinstance(path_or_buf, list): raise TypeError( # pragma: no cover - 'path_or_buf must be a list or a string') + "path_or_buf must be a list or a string" + ) bufs = [] close = [] @@ -85,8 +82,7 @@ def sklearn_train_test_split(self, path_or_buf=None, export_method="to_csv", st = StringIO() cl = False elif isinstance(p, str): - st = open( # pylint: disable=R1732 - p, "w", encoding=kwargs.get('encoding')) + st = open(p, "w", encoding=kwargs.get("encoding")) # pylint: disable=R1732 cl = True else: st = p @@ -98,34 +94,38 @@ def sklearn_train_test_split(self, path_or_buf=None, export_method="to_csv", train, test = train_test_split(df, **split_ops) exportf(train, bufs[0], **kwargs) exportf(test, bufs[1], **kwargs) - kwargs['header'] = False + kwargs["header"] = False for b, c in zip(bufs, close): if c: b.close() - return [st.getvalue() if isinstance(st, StringIO) else p - for st, p in zip(bufs, path_or_buf)] + return [ + st.getvalue() if isinstance(st, StringIO) else p + for st, p 
in zip(bufs, path_or_buf) + ] -def sklearn_train_test_split_streaming(self, test_size=0.25, train_size=None, - stratify=None, hash_size=9, unique_rows=False): +def sklearn_train_test_split_streaming( + self, test_size=0.25, train_size=None, stratify=None, hash_size=9, unique_rows=False +): """ Randomly splits a dataframe into smaller pieces. The function returns streams of file names. The function relies on :epkg:`sklearn:model_selection:train_test_split`. It handles the stratified version of it. - @param self @see cl StreamingDataFrame - @param test_size ratio for the test partition (if *train_size* is not specified) - @param train_size ratio for the train partition - @param stratify column holding the stratification - @param hash_size size of the hash to cache information about partition - @param unique_rows ensures that rows are unique - @return Two @see cl StreamingDataFrame, one + :param self: see :class:`StreamingDataFrame` + :param test_size: ratio for the test partition + (if *train_size* is not specified) + :param train_size: ratio for the train partition + :param stratify: column holding the stratification + :param hash_size: size of the hash to cache information about partition + :param unique_rows: ensures that rows are unique + :return: Two see :class:`StreamingDataFrame`, one for train, one for test. The function returns two iterators or two - @see cl StreamingDataFrame. It + see :class:`StreamingDataFrame`. It tries to do everything without writing anything on disk but it requires to store the repartition somehow. This function hashes every row and maps the hash with a part @@ -173,7 +173,7 @@ def iterator_rows(): random.shuffle(vr) if (0, k) in counts: tt = counts[1, k] + counts[0, k] - delta = - int(counts[0, k] - tt * p + 0.5) + delta = -int(counts[0, k] - tt * p + 0.5) else: delta = 0 i = int(len(v) * p + 0.5) @@ -199,7 +199,7 @@ def iterator_rows(): random.shuffle(vr) if (0, k) in counts: tt = counts[1, k] + counts[0, k] - delta = - int(counts[0, k] - tt * p + 0.5) + delta = -int(counts[0, k] - tt * p + 0.5) else: delta = 0 i = int(len(v) * p + 0.5) @@ -234,7 +234,8 @@ def iterator_internal(part_requested): raise ValueError( # pragma: no cover "A row or at least its hash is already cached. " "Increase hash_size or check for duplicates " - "('{0}')\n{1}.".format(h, obs)) + "('{0}')\n{1}.".format(h, obs) + ) if h not in cache: cache[h] = part else: @@ -242,8 +243,7 @@ def iterator_internal(part_requested): if part == part_requested: accumul.append(obs) if len(accumul) >= static_schema[2]: - dfo = pandas.DataFrame( - accumul, columns=static_schema[0]) + dfo = pandas.DataFrame(accumul, columns=static_schema[0]) self.ensure_dtype(dfo, static_schema[1]) iy += dfo.shape[0] accumul.clear() @@ -255,12 +255,13 @@ def iterator_internal(part_requested): part = cache.get(h) if part is None: raise ValueError( # pragma: no cover - f"Second iteration. A row was never met in the first one\n{obs}") + f"Second iteration. 
A row was " + f"never met in the first one\n{obs}" + ) if part == part_requested: accumul.append(obs) if len(accumul) >= static_schema[2]: - dfo = pandas.DataFrame( - accumul, columns=static_schema[0]) + dfo = pandas.DataFrame(accumul, columns=static_schema[0]) self.ensure_dtype(dfo, static_schema[1]) iy += dfo.shape[0] accumul.clear() @@ -271,5 +272,7 @@ def iterator_internal(part_requested): iy += dfo.shape[0] yield dfo - return (self.__class__(lambda: iterator_internal(0)), - self.__class__(lambda: iterator_internal(1))) + return ( + self.__class__(lambda: iterator_internal(0)), + self.__class__(lambda: iterator_internal(1)), + ) diff --git a/pandas_streaming/exc/__init__.py b/pandas_streaming/exc/__init__.py index 9979b62..a5e114d 100644 --- a/pandas_streaming/exc/__init__.py +++ b/pandas_streaming/exc/__init__.py @@ -1,6 +1 @@ -""" -@file -@brief Shortcuts to *exc*. -""" - -from .exc_streaming import StreamingInefficientException +from .exc_streaming import StreamingInefficientException # noqa: F401 diff --git a/pandas_streaming/exc/exc_streaming.py b/pandas_streaming/exc/exc_streaming.py index c7094e5..be0bd36 100644 --- a/pandas_streaming/exc/exc_streaming.py +++ b/pandas_streaming/exc/exc_streaming.py @@ -1,10 +1,3 @@ -# -*- coding: utf-8 -*- -""" -@file -@brief Defines a streming dataframe. -""" - - class StreamingInefficientException(Exception): """ Kind of operations doable with a :epkg:`pandas:DataFrame` @@ -18,5 +11,4 @@ def __init__(self, meth): :param meth: inefficient method """ - Exception.__init__( - self, f"{meth} should not be done in streaming mode.") + Exception.__init__(self, f"{meth} should not be done in streaming mode.") diff --git a/pyproject.toml b/pyproject.toml index bad7f7a..91b4010 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,3 +29,7 @@ max-complexity = 10 [tool.ruff.per-file-ignores] "_doc/examples/plot_first_example.py" = ["E402", "F811"] +"_unittests/ut_df/test_dataframe_io_helpers.py" = ["E501"] +"pandas_streaming/data/__init__.py" = ["F401"] +"pandas_streaming/df/__init__.py" = ["F401"] +"pandas_streaming/df/dataframe_io_helpers.py" = ["E501"] diff --git a/requirements-dev.txt b/requirements-dev.txt index 5ab8605..16ed1c5 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,5 +1,6 @@ autopep8 coverage +furo ijson jupyter_sphinx jyquickhelper @@ -10,10 +11,10 @@ Pillow pycodestyle pylint>=2.14.0 pyquickhelper>=1.10 -pyquicksetup scikit-learn scipy sphinx +sphinx-runpython sphinxcontrib.imagesvg sphinx_gallery ujson From 2525d9f4b16c553c846e414594d2a3d007c00ec6 Mon Sep 17 00:00:00 2001 From: Xavier Dupre Date: Sat, 22 Jul 2023 20:01:13 +0200 Subject: [PATCH 03/16] move --- _doc/{sphinxdoc/source => }/_static/git_logo.png | Bin _doc/{sphinxdoc/source => }/_static/project_ico.ico | Bin _doc/{sphinxdoc/source => }/_static/project_ico.png | Bin _doc/{sphinxdoc/source => }/api/index.rst | 0 _doc/{sphinxdoc/source => }/api/rdata.rst | 0 _doc/{sphinxdoc/source => }/api/rdf.rst | 0 _doc/{sphinxdoc/source => }/api/rexc.rst | 0 _doc/{sphinxdoc/source => }/api/rio.rst | 0 .../source => }/blog/2017/2017-09-21_first_day.rst | 0 .../source => }/blog/2018/2018-08-19_streamz.rst | 0 _doc/{sphinxdoc/source => }/conf.py | 0 _doc/{sphinxdoc/source => }/glossary.rst | 0 _doc/{sphinxdoc/source => }/i_ex.rst | 0 _doc/{sphinxdoc/source => }/i_examples.rst | 0 _doc/{sphinxdoc/source => }/i_faq.rst | 0 _doc/{sphinxdoc/source => }/i_index.rst | 0 _doc/{sphinxdoc/source => }/index.rst | 0 _doc/{sphinxdoc/source => }/license.rst | 0 _doc/{sphinxdoc/source => 
}/tutorial/index.rst | 0 19 files changed, 0 insertions(+), 0 deletions(-) rename _doc/{sphinxdoc/source => }/_static/git_logo.png (100%) rename _doc/{sphinxdoc/source => }/_static/project_ico.ico (100%) rename _doc/{sphinxdoc/source => }/_static/project_ico.png (100%) rename _doc/{sphinxdoc/source => }/api/index.rst (100%) rename _doc/{sphinxdoc/source => }/api/rdata.rst (100%) rename _doc/{sphinxdoc/source => }/api/rdf.rst (100%) rename _doc/{sphinxdoc/source => }/api/rexc.rst (100%) rename _doc/{sphinxdoc/source => }/api/rio.rst (100%) rename _doc/{sphinxdoc/source => }/blog/2017/2017-09-21_first_day.rst (100%) rename _doc/{sphinxdoc/source => }/blog/2018/2018-08-19_streamz.rst (100%) rename _doc/{sphinxdoc/source => }/conf.py (100%) rename _doc/{sphinxdoc/source => }/glossary.rst (100%) rename _doc/{sphinxdoc/source => }/i_ex.rst (100%) rename _doc/{sphinxdoc/source => }/i_examples.rst (100%) rename _doc/{sphinxdoc/source => }/i_faq.rst (100%) rename _doc/{sphinxdoc/source => }/i_index.rst (100%) rename _doc/{sphinxdoc/source => }/index.rst (100%) rename _doc/{sphinxdoc/source => }/license.rst (100%) rename _doc/{sphinxdoc/source => }/tutorial/index.rst (100%) diff --git a/_doc/sphinxdoc/source/_static/git_logo.png b/_doc/_static/git_logo.png similarity index 100% rename from _doc/sphinxdoc/source/_static/git_logo.png rename to _doc/_static/git_logo.png diff --git a/_doc/sphinxdoc/source/_static/project_ico.ico b/_doc/_static/project_ico.ico similarity index 100% rename from _doc/sphinxdoc/source/_static/project_ico.ico rename to _doc/_static/project_ico.ico diff --git a/_doc/sphinxdoc/source/_static/project_ico.png b/_doc/_static/project_ico.png similarity index 100% rename from _doc/sphinxdoc/source/_static/project_ico.png rename to _doc/_static/project_ico.png diff --git a/_doc/sphinxdoc/source/api/index.rst b/_doc/api/index.rst similarity index 100% rename from _doc/sphinxdoc/source/api/index.rst rename to _doc/api/index.rst diff --git a/_doc/sphinxdoc/source/api/rdata.rst b/_doc/api/rdata.rst similarity index 100% rename from _doc/sphinxdoc/source/api/rdata.rst rename to _doc/api/rdata.rst diff --git a/_doc/sphinxdoc/source/api/rdf.rst b/_doc/api/rdf.rst similarity index 100% rename from _doc/sphinxdoc/source/api/rdf.rst rename to _doc/api/rdf.rst diff --git a/_doc/sphinxdoc/source/api/rexc.rst b/_doc/api/rexc.rst similarity index 100% rename from _doc/sphinxdoc/source/api/rexc.rst rename to _doc/api/rexc.rst diff --git a/_doc/sphinxdoc/source/api/rio.rst b/_doc/api/rio.rst similarity index 100% rename from _doc/sphinxdoc/source/api/rio.rst rename to _doc/api/rio.rst diff --git a/_doc/sphinxdoc/source/blog/2017/2017-09-21_first_day.rst b/_doc/blog/2017/2017-09-21_first_day.rst similarity index 100% rename from _doc/sphinxdoc/source/blog/2017/2017-09-21_first_day.rst rename to _doc/blog/2017/2017-09-21_first_day.rst diff --git a/_doc/sphinxdoc/source/blog/2018/2018-08-19_streamz.rst b/_doc/blog/2018/2018-08-19_streamz.rst similarity index 100% rename from _doc/sphinxdoc/source/blog/2018/2018-08-19_streamz.rst rename to _doc/blog/2018/2018-08-19_streamz.rst diff --git a/_doc/sphinxdoc/source/conf.py b/_doc/conf.py similarity index 100% rename from _doc/sphinxdoc/source/conf.py rename to _doc/conf.py diff --git a/_doc/sphinxdoc/source/glossary.rst b/_doc/glossary.rst similarity index 100% rename from _doc/sphinxdoc/source/glossary.rst rename to _doc/glossary.rst diff --git a/_doc/sphinxdoc/source/i_ex.rst b/_doc/i_ex.rst similarity index 100% rename from 
_doc/sphinxdoc/source/i_ex.rst rename to _doc/i_ex.rst diff --git a/_doc/sphinxdoc/source/i_examples.rst b/_doc/i_examples.rst similarity index 100% rename from _doc/sphinxdoc/source/i_examples.rst rename to _doc/i_examples.rst diff --git a/_doc/sphinxdoc/source/i_faq.rst b/_doc/i_faq.rst similarity index 100% rename from _doc/sphinxdoc/source/i_faq.rst rename to _doc/i_faq.rst diff --git a/_doc/sphinxdoc/source/i_index.rst b/_doc/i_index.rst similarity index 100% rename from _doc/sphinxdoc/source/i_index.rst rename to _doc/i_index.rst diff --git a/_doc/sphinxdoc/source/index.rst b/_doc/index.rst similarity index 100% rename from _doc/sphinxdoc/source/index.rst rename to _doc/index.rst diff --git a/_doc/sphinxdoc/source/license.rst b/_doc/license.rst similarity index 100% rename from _doc/sphinxdoc/source/license.rst rename to _doc/license.rst diff --git a/_doc/sphinxdoc/source/tutorial/index.rst b/_doc/tutorial/index.rst similarity index 100% rename from _doc/sphinxdoc/source/tutorial/index.rst rename to _doc/tutorial/index.rst From 9e470d016bc119335eef753b45d307fee944d27d Mon Sep 17 00:00:00 2001 From: Xavier Dupre Date: Sat, 22 Jul 2023 20:01:33 +0200 Subject: [PATCH 04/16] doc conf --- _doc/conf.py | 60 ++++++++++++++++++++++++++-------------------------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/_doc/conf.py b/_doc/conf.py index f298be6..066ee6c 100644 --- a/_doc/conf.py +++ b/_doc/conf.py @@ -170,35 +170,35 @@ epkg_dictionary = { - "csv": "https://en.wikipedia.org/wiki/Comma-separated_values", - "dask": "https://dask.pydata.org/en/latest/", - "dataframe": "https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html", - "Dataframe": "https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html", - "DataFrame": "https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html", - "dataframes": "https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html", - "dill": "https://dill.readthedocs.io/en/latest/dill.html", - "Hadoop": "http://hadoop.apache.org/", - "ijson": "https://github.com/ICRAR/ijson", - "nan": "https://numpy.org/doc/stable/reference/constants.html#numpy.NAN", - "pandas": ( - "http://pandas.pydata.org/pandas-docs/stable/", - ( - "http://pandas.pydata.org/pandas-docs/stable/generated/pandas.{0}.html", - 1, - ), - ( - "http://pandas.pydata.org/pandas-docs/stable/generated/pandas.{0}.{1}.html", - 2, - ), + "csv": "https://en.wikipedia.org/wiki/Comma-separated_values", + "dask": "https://dask.pydata.org/en/latest/", + "dataframe": "https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html", + "Dataframe": "https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html", + "DataFrame": "https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html", + "dataframes": "https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html", + "dill": "https://dill.readthedocs.io/en/latest/dill.html", + "Hadoop": "http://hadoop.apache.org/", + "ijson": "https://github.com/ICRAR/ijson", + "nan": "https://numpy.org/doc/stable/reference/constants.html#numpy.NAN", + "pandas": ( + "http://pandas.pydata.org/pandas-docs/stable/", + ( + "http://pandas.pydata.org/pandas-docs/stable/generated/pandas.{0}.html", + 1, ), - "pyarrow": "https://arrow.apache.org/docs/python/", - "pyspark": "http://spark.apache.org/docs/2.1.1/api/python/index.html", - "scikit-multiflow": "https://scikit-multiflow.github.io/", - "sklearn": ( - "http://scikit-learn.org/stable/", - 
("http://scikit-learn.org/stable/modules/generated/{0}.html", 1), - ("http://scikit-learn.org/stable/modules/generated/{0}.{1}.html", 2), + ( + "http://pandas.pydata.org/pandas-docs/stable/generated/pandas.{0}.{1}.html", + 2, ), - "streamz": "https://streamz.readthedocs.io/en/latest/index.html", - "tornado": "https://www.tornadoweb.org/en/stable/", - } + ), + "pyarrow": "https://arrow.apache.org/docs/python/", + "pyspark": "http://spark.apache.org/docs/2.1.1/api/python/index.html", + "scikit-multiflow": "https://scikit-multiflow.github.io/", + "sklearn": ( + "http://scikit-learn.org/stable/", + ("http://scikit-learn.org/stable/modules/generated/{0}.html", 1), + ("http://scikit-learn.org/stable/modules/generated/{0}.{1}.html", 2), + ), + "streamz": "https://streamz.readthedocs.io/en/latest/index.html", + "tornado": "https://www.tornadoweb.org/en/stable/", +} From fbfc01a26740a94437be3c6d6ad16b91bfd4dd6e Mon Sep 17 00:00:00 2001 From: Xavier Dupre Date: Sat, 22 Jul 2023 20:02:22 +0200 Subject: [PATCH 05/16] upgrade version --- pandas_streaming/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas_streaming/__init__.py b/pandas_streaming/__init__.py index a4a6c0c..e0193cc 100644 --- a/pandas_streaming/__init__.py +++ b/pandas_streaming/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.4.218" +__version__ = "0.5.0" __author__ = "Xavier Dupré" __github__ = "https://github.com/sdpython/pandas_streaming" __url__ = "http://www.xavierdupre.fr/app/pandas_streaming/helpsphinx/index.html" From ad93df9dedfe4dc644457b378b2c0ec716dac2fb Mon Sep 17 00:00:00 2001 From: Xavier Dupre Date: Sun, 23 Jul 2023 09:51:48 +0200 Subject: [PATCH 06/16] doc --- _doc/conf.py | 5 +- pandas_streaming/df/connex_split.py | 96 ++++++++++++--------- pandas_streaming/df/dataframe.py | 2 +- pandas_streaming/df/dataframe_helpers.py | 3 +- pandas_streaming/df/dataframe_io_helpers.py | 19 ++-- pandas_streaming/df/dataframe_split.py | 6 +- pyproject.toml | 1 + 7 files changed, 75 insertions(+), 57 deletions(-) diff --git a/_doc/conf.py b/_doc/conf.py index 066ee6c..446bf9d 100644 --- a/_doc/conf.py +++ b/_doc/conf.py @@ -177,6 +177,7 @@ "DataFrame": "https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html", "dataframes": "https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html", "dill": "https://dill.readthedocs.io/en/latest/dill.html", + "groupby and missing values": "https://pandas.pydata.org/pandas-docs/stable/user_guide/missing_data.html", "Hadoop": "http://hadoop.apache.org/", "ijson": "https://github.com/ICRAR/ijson", "nan": "https://numpy.org/doc/stable/reference/constants.html#numpy.NAN", @@ -196,8 +197,8 @@ "scikit-multiflow": "https://scikit-multiflow.github.io/", "sklearn": ( "http://scikit-learn.org/stable/", - ("http://scikit-learn.org/stable/modules/generated/{0}.html", 1), - ("http://scikit-learn.org/stable/modules/generated/{0}.{1}.html", 2), + ("https://scikit-learn.org/stable/modules/generated/{0}.html", 1), + ("https://scikit-learn.org/stable/modules/generated/{0}.{1}.html", 2), ), "streamz": "https://streamz.readthedocs.io/en/latest/index.html", "tornado": "https://www.tornadoweb.org/en/stable/", diff --git a/pandas_streaming/df/connex_split.py b/pandas_streaming/df/connex_split.py index bc68581..c3b6150 100644 --- a/pandas_streaming/df/connex_split.py +++ b/pandas_streaming/df/connex_split.py @@ -1,9 +1,12 @@ from collections import Counter +from logging import getLogger import pandas import numpy from sklearn.model_selection import 
train_test_split from .dataframe_helpers import dataframe_shuffle +logger = getLogger("pandas-streaming") + class ImbalancedSplitException(Exception): """ @@ -133,6 +136,7 @@ def train_test_connex_split( return_cnx=False, must_groups=None, random_state=None, + verbose=0, ): """ This split is for a specific case where data is linked @@ -166,6 +170,7 @@ def train_test_connex_split( @param must_groups column name for ids which must not be shared by train/test partitions @param random_state seed for random generator + @param verbose verbosity (uses logging) @return Two see :class:`StreamingDataFrame`, one for train, one for test. @@ -275,11 +280,15 @@ def do_connex_components(dfrows, local_groups, kb, sib): modif = 1 while modif > 0 and itern < len(elements): - if fLOG and df.shape[0] > 10000: - fLOG( - "[train_test_connex_split] iteration={0}-#nb connect={1} - " - "modif={2}".format(iter, len(set(elements)), modif) + if df.shape[0] > 10000: + logger.info( + "[train_test_connex_split] iteration=%d-#nb connect=%d - " + "modif=%s", + itern, + len(set(elements)), + modif, ) + modif = 0 itern += 1 for i, row in enumerate(dfrows.itertuples(index=False, name=None)): @@ -310,19 +319,18 @@ def do_connex_components(dfrows, local_groups, kb, sib): diff = len(counts_cnx[new_c]) + len(counts_cnx[c]) - maxi r = diff / float(maxi) if r > kb: - if fLOG: # pragma: no cover - fLOG( + if verbose: # pragma: no cover + logger.info( "[train_test_connex_split] balance " - "r={0:0.00000}>{1:0.00}, #[{2}]={3}, " - "#[{4}]={5}".format( - r, - kb, - new_c, - len(counts_cnx[new_c]), - c, - len(counts_cnx[c]), - ) + "r=%1.4f>%1.2f, #[%d]=%d, #[%d]=%d", + r, + kb, + new_c, + len(counts_cnx[new_c]), + c, + len(counts_cnx[c]), ) + continue if sib is not None: @@ -330,19 +338,16 @@ def do_connex_components(dfrows, local_groups, kb, sib): len(elements) ) if r > sib: - if fLOG: # pragma: no cover - fLOG( - "[train_test_connex_split] no merge " - "r={0:0.00000}>{1:0.00}, #[{2}]={3}, #[{4}]={5}" - "".format( - r, - sib, - new_c, - len(counts_cnx[new_c]), - c, - len(counts_cnx[c]), - ) - ) + logger.info( + "[train_test_connex_split] " + "no merge r=%1.4f>%1.2f, #[%d]=%d, #[%d]=%d", + r, + sib, + new_c, + len(counts_cnx[new_c]), + c, + len(counts_cnx[c]), + ) avoids_merge[new_c, c] = i continue @@ -370,25 +375,26 @@ def do_connex_components(dfrows, local_groups, kb, sib): dfids[name] = elements dfids[one] = 1 grsum = dfids[[name, one]].groupby(name, as_index=False).sum() - if fLOG: - for g in groups: - fLOG(f"[train_test_connex_split] #nb in '{g}': {len(set(dfids[g]))}") - fLOG(f"[train_test_connex_split] #connex {grsum.shape[0]}/{dfids.shape[0]}") + for g in groups: + logger.info("[train_test_connex_split] #nb in '%d':", len(set(dfids[g]))) + logger.info( + "[train_test_connex_split] #connex %d/%d", grsum.shape[0], dfids.shape[0] + ) if grsum.shape[0] <= 1: raise ValueError( # pragma: no cover "Every element is in the same connected components." 
) # Statistics: top connected components - if fLOG: + if verbose: # Global statistics counts = Counter(elements) cl = [(v, k) for k, v in counts.items()] cum = 0 maxc = None - fLOG( - "[train_test_connex_split] number of connected components: {0}" - "".format(len(set(elements))) + logger.info( + "[train_test_connex_split] number of connected components: %d", + len(set(elements)), ) for i, (v, k) in enumerate(sorted(cl, reverse=True)): if i == 0: @@ -396,15 +402,20 @@ def do_connex_components(dfrows, local_groups, kb, sib): if i >= 10: break cum += v - fLOG( - "[train_test_connex_split] c={0} #elements={1} cumulated" - "={2}/{3}".format(k, v, cum, len(elements)) + logger.info( + "[train_test_connex_split] c=%s #elements=%s cumulated=%d/%d", + k, + v, + cum, + len(elements), ) # Most important component - fLOG(f"[train_test_connex_split] first row of the biggest component {maxc}") + logger.info( + "[train_test_connex_split] first row of the biggest component %d", maxc + ) tdf = dfids[dfids[name] == maxc[0]] - fLOG(f"[train_test_connex_split] \n{tdf.head(n=10)}") + logger.info("[train_test_connex_split] % s", tdf.head(n=10)) # Splits. train, test = train_test_split_weights( @@ -471,8 +482,7 @@ def train_test_apart_stratify( classification. A category (*stratify*) is not exclusive and an observation can be assigned to multiple categories. In that particular case, the method - `train_test_split `_ + :func:`sklearn.model_selection.train_test_split` can not directly be used. .. runpython:: diff --git a/pandas_streaming/df/dataframe.py b/pandas_streaming/df/dataframe.py index db3d7b9..b519102 100644 --- a/pandas_streaming/df/dataframe.py +++ b/pandas_streaming/df/dataframe.py @@ -138,7 +138,7 @@ def train_test_split( will be given to that function @param names partitions names, by default ``('train', 'test')`` @param kwargs parameters for the export function and - :epkg:`sklearn:model_selection:train_test_split`. + :func:`sklearn.model_selection.train_test_split`. @param streaming the function switches to a streaming version of the algorithm. @param partitions splitting partitions diff --git a/pandas_streaming/df/dataframe_helpers.py b/pandas_streaming/df/dataframe_helpers.py index b85d78a..5771e80 100644 --- a/pandas_streaming/df/dataframe_helpers.py +++ b/pandas_streaming/df/dataframe_helpers.py @@ -350,8 +350,7 @@ def pandas_groupby_nan( generated/pandas.DataFrame.groupby.html>`_ :return: groupby results - See `groupby and missing values `_. + See :epkg:`groupby and missing values`. If no :epkg:`nan` is detected, the function falls back in regular :epkg:`pandas:DataFrame:groupby` which has the following behavior. diff --git a/pandas_streaming/df/dataframe_io_helpers.py b/pandas_streaming/df/dataframe_io_helpers.py index d956cf9..c8a7776 100644 --- a/pandas_streaming/df/dataframe_io_helpers.py +++ b/pandas_streaming/df/dataframe_io_helpers.py @@ -141,7 +141,9 @@ def _flatten(obj, key): return flattened_dict -def enumerate_json_items(filename, encoding=None, lines=False, flatten=False): +def enumerate_json_items( + filename, encoding=None, lines=False, flatten=False, verbose=0 +): """ Enumerates items from a :epkg:`JSON` file or string. @@ -149,6 +151,7 @@ def enumerate_json_items(filename, encoding=None, lines=False, flatten=False): :param encoding: encoding :param lines: one record per row :param flatten: call @see fn flatten_dictionary + :param verbose: verbosity (based on :epkg:`tqdm`) :return: iterator on records at first level. 
It assumes the syntax follows the format: ``[ {"id":1, ...}, {"id": 2, ...}, ...]``.
@@ -259,11 +262,15 @@ def enumerate_json_items(filename, encoding=None, lines=False, flatten=False): curkey = None stack = [] nbyield = 0 - for i, (_, event, value) in enumerate(parser): - if i % 1000000 == 0 and fLOG is not None: - fLOG( # pragma: no cover - f"[enumerate_json_items] i={i} yielded={nbyield}" - ) + if verbose: + from tqdm import tqdm + + loop = tqdm(enumerate(parser)) + else: + loop = enumerate(parser) + for i, (_, event, value) in loop: + if verbose: + loop.set_description(f"process row {i}-event={event!r}") if event == "start_array": if curkey is None: current = []
diff --git a/pandas_streaming/df/dataframe_split.py b/pandas_streaming/df/dataframe_split.py index bb7ea33..ec4a579 100644 --- a/pandas_streaming/df/dataframe_split.py +++ b/pandas_streaming/df/dataframe_split.py
@@ -12,7 +12,7 @@ def sklearn_train_test_split( """ Randomly splits a dataframe into smaller pieces. The function returns streams of file names. - The function relies on :epkg:`sklearn:model_selection:train_test_split`. + The function relies on :func:`sklearn.model_selection.train_test_split`. It does not handle stratified version of it. @param self see :class:`StreamingDataFrame`
@@ -22,7 +22,7 @@ def sklearn_train_test_split( :epkg:`pandas:DataFrame:to_csv` @param names partitions names, by default ``('train', 'test')`` @param kwargs parameters for the export function and - :epkg:`sklearn:model_selection:train_test_split`. + :func:`sklearn.model_selection.train_test_split`. @return outputs of the exports functions The function cannot return two iterators or two
@@ -111,7 +111,7 @@ def sklearn_train_test_split_streaming( """ Randomly splits a dataframe into smaller pieces. The function returns streams of file names. - The function relies on :epkg:`sklearn:model_selection:train_test_split`. + The function relies on :func:`sklearn.model_selection.train_test_split`. It handles the stratified version of it.
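As a rough usage sketch of the new ``verbose`` flag added above (the import path comes from the hunk, while the file name and toy JSON content are assumptions, not taken from the patch)::

    from pandas_streaming.df.dataframe_io_helpers import enumerate_json_items

    # hypothetical input file following the expected ``[ {...}, {...} ]`` layout
    with open("items.json", "w", encoding="utf-8") as f:
        f.write('[{"id": 1, "x": {"a": 4}}, {"id": 2, "x": {"a": 5}}]')

    # verbose=1 wraps the parsing loop with tqdm and updates its description per event,
    # flatten=True collapses the nested dictionaries into flat records
    for item in enumerate_json_items("items.json", flatten=True, verbose=1):
        print(item)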
:param self: see :class:`StreamingDataFrame` diff --git a/pyproject.toml b/pyproject.toml index 91b4010..50db37b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,6 +4,7 @@ ignore_directives = [ "autoclass", "autofunction", "automodule", + "exreflist", "gdot", "image-sg", "runpython", From 04a6d33180afa14441dfe9b2f9f4bffe350d2d5f Mon Sep 17 00:00:00 2001 From: Xavier Dupre Date: Sun, 23 Jul 2023 10:05:56 +0200 Subject: [PATCH 07/16] doc --- _doc/api/rdata.rst | 2 +- _doc/api/rdf.rst | 23 +- _doc/api/rexc.rst | 2 +- _doc/api/rio.rst | 4 +- _doc/blog/2017/2017-09-21_first_day.rst | 10 - _doc/blog/2018/2018-08-19_streamz.rst | 10 - _doc/examples/README.txt | 3 + _doc/examples/first_step.py | 100 +++ _doc/i_examples.rst | 11 - _doc/i_faq.rst | 11 - _doc/i_index.rst | 18 - _doc/index.rst | 13 +- _doc/notebooks/first_steps.ipynb | 906 ------------------------ _doc/tutorial/index.rst | 7 +- pyproject.toml | 1 + 15 files changed, 127 insertions(+), 994 deletions(-) delete mode 100644 _doc/blog/2017/2017-09-21_first_day.rst delete mode 100644 _doc/blog/2018/2018-08-19_streamz.rst create mode 100644 _doc/examples/README.txt create mode 100644 _doc/examples/first_step.py delete mode 100644 _doc/i_examples.rst delete mode 100644 _doc/i_faq.rst delete mode 100644 _doc/i_index.rst delete mode 100644 _doc/notebooks/first_steps.ipynb diff --git a/_doc/api/rdata.rst b/_doc/api/rdata.rst index 32c9e1b..3f87481 100644 --- a/_doc/api/rdata.rst +++ b/_doc/api/rdata.rst @@ -5,4 +5,4 @@ pandas_streaming.data Collection of functions which produces :class:`StreamingDataFrame `. -.. autosignature:: pandas_streaming.data.dummy.dummy_streaming_dataframe +.. autofunction:: pandas_streaming.data.dummy.dummy_streaming_dataframe diff --git a/_doc/api/rdf.rst b/_doc/api/rdf.rst index d8bcf5c..09bf139 100644 --- a/_doc/api/rdf.rst +++ b/_doc/api/rdf.rst @@ -17,29 +17,30 @@ of an instance is fast as long as the data is not processed. Iterators can be chained as many map reduce framework does. -.. autosignature:: pandas_streaming.df.dataframe.StreamingDataFrame +.. autoclass:: pandas_streaming.df.dataframe.StreamingDataFrame + :members: The module implements additional and useful functions not necessarily for the streaming version of the dataframes. Many methods have been rewritten to support streaming. Among them, IO methods: -.. autosignature:: pandas_streaming.df.dataframe.StreamingDataFrame.read_csv +.. autofunction:: pandas_streaming.df.dataframe.StreamingDataFrame.read_csv -.. autosignature:: pandas_streaming.df.dataframe.StreamingDataFrame.read_df +.. autofunction:: pandas_streaming.df.dataframe.StreamingDataFrame.read_df -.. autosignature:: pandas_streaming.df.dataframe.StreamingDataFrame.read_json +.. autofunction:: pandas_streaming.df.dataframe.StreamingDataFrame.read_json Data Manipulation +++++++++++++++++ -.. autosignature:: pandas_streaming.df.dataframe_helpers.dataframe_hash_columns +.. autofunction:: pandas_streaming.df.dataframe_helpers.dataframe_hash_columns -.. autosignature:: pandas_streaming.df.connex_split.dataframe_shuffle +.. autofunction:: pandas_streaming.df.connex_split.dataframe_shuffle -.. autosignature:: pandas_streaming.df.dataframe_helpers.dataframe_unfold +.. autofunction:: pandas_streaming.df.dataframe_helpers.dataframe_unfold -.. autosignature:: pandas_streaming.df.dataframe_helpers.pandas_groupby_nan +.. autofunction:: pandas_streaming.df.dataframe_helpers.pandas_groupby_nan Complex splits ++++++++++++++ @@ -49,8 +50,8 @@ if rows are not independant and share some ids. 
In that case, the following functions will try to build two partitions keeping ids separate or separate as much as possible. -.. autosignature:: pandas_streaming.df.connex_split.train_test_apart_stratify +.. autofunction:: pandas_streaming.df.connex_split.train_test_apart_stratify -.. autosignature:: pandas_streaming.df.connex_split.train_test_connex_split +.. autofunction:: pandas_streaming.df.connex_split.train_test_connex_split -.. autosignature:: pandas_streaming.df.connex_split.train_test_split_weights +.. autofunction:: pandas_streaming.df.connex_split.train_test_split_weights diff --git a/_doc/api/rexc.rst b/_doc/api/rexc.rst index 1896765..5528b0d 100644 --- a/_doc/api/rexc.rst +++ b/_doc/api/rexc.rst @@ -4,4 +4,4 @@ pandas_streaming.exc Exceptions. -.. autosignature:: pandas_streaming.exc.exc_streaming.StreamingInefficientException +.. autoclass:: pandas_streaming.exc.exc_streaming.StreamingInefficientException diff --git a/_doc/api/rio.rst b/_doc/api/rio.rst index f11c081..357f6cc 100644 --- a/_doc/api/rio.rst +++ b/_doc/api/rio.rst @@ -14,6 +14,6 @@ to exchange with other people and other environments. The two following functions makes it easier to collapse many dataframes or numpy arrays into one single file. The data can be unzipped afterwards. -.. autosignature:: pandas_streaming.df.dataframe_io.read_zip +.. autofunction:: pandas_streaming.df.dataframe_io.read_zip -.. autosignature:: pandas_streaming.df.dataframe_io.to_zip +.. autofunction:: pandas_streaming.df.dataframe_io.to_zip diff --git a/_doc/blog/2017/2017-09-21_first_day.rst b/_doc/blog/2017/2017-09-21_first_day.rst deleted file mode 100644 index 3bbf7d3..0000000 --- a/_doc/blog/2017/2017-09-21_first_day.rst +++ /dev/null @@ -1,10 +0,0 @@ - -.. blogpost:: - :title: Why pandas_streaming? - :keywords: pandas - :date: 2017-09-17 - :categories: documentation - - The module aims at using a similar APIs to - :epkg:`pandas` for out-of-memory dataframe. - See :ref:`l-objective`. diff --git a/_doc/blog/2018/2018-08-19_streamz.rst b/_doc/blog/2018/2018-08-19_streamz.rst deleted file mode 100644 index 2ac15a9..0000000 --- a/_doc/blog/2018/2018-08-19_streamz.rst +++ /dev/null @@ -1,10 +0,0 @@ - -.. blogpost:: - :title: Streaming dataframes with streamz - :keywords: streamz - :date: 2018-08-19 - :categories: alternatives - - :epkg:`streamz` is the most promising - initiative which implements streaming - dataframes so far. diff --git a/_doc/examples/README.txt b/_doc/examples/README.txt new file mode 100644 index 0000000..cb523b0 --- /dev/null +++ b/_doc/examples/README.txt @@ -0,0 +1,3 @@ +Gallery of Examples +=================== + diff --git a/_doc/examples/first_step.py b/_doc/examples/first_step.py new file mode 100644 index 0000000..1c446a2 --- /dev/null +++ b/_doc/examples/first_step.py @@ -0,0 +1,100 @@ +""" +First steps with pandas_streaming +================================= + +A few difference between :epkg:`pandas` and *pandas_streaming*. 
+ +pandas to pandas_streaming +++++++++++++++++++++++++++ +""" + +from pandas import DataFrame + +df = DataFrame(data=dict(X=[4.5, 6, 7], Y=["a", "b", "c"])) +df + + +############################# +# We create a streaming dataframe: + + +from pandas_streaming.df import StreamingDataFrame + +sdf = StreamingDataFrame.read_df(df) +sdf + + +################################ +# + +sdf.to_dataframe() + + +######################################## +# Internally, StreamingDataFrame implements an iterator on +# dataframes and then tries to replicate the same interface as +# :class:`pandas.DataFrame` possibly wherever it is possible to +# manipulate data without loading everything into memory. + + +sdf2 = sdf.concat(sdf) +sdf2.to_dataframe() + + +############################### +# + +m = DataFrame(dict(Y=["a", "b"], Z=[10, 20])) +m + + +########################################## +# + +sdf3 = sdf2.merge(m, left_on="Y", right_on="Y", how="outer") +sdf3.to_dataframe() + + +############################################ +# + +sdf2.to_dataframe().merge(m, left_on="Y", right_on="Y", how="outer") + + +############################################ +# The order might be different. + + +sdftr, sdfte = sdf2.train_test_split(test_size=0.5) +sdfte.head() + + +############################################ +# + + +sdftr.head() + + +############################################ +# split a big file +# ++++++++++++++++ + + +sdf2.to_csv("example.txt") + + +############################################ +# + + +new_sdf = StreamingDataFrame.read_csv("example.txt") +new_sdf.train_test_split("example.{}.txt", streaming=False) + + +############################################ +# + +import glob + +glob.glob("ex*.txt") diff --git a/_doc/i_examples.rst b/_doc/i_examples.rst deleted file mode 100644 index 1b16057..0000000 --- a/_doc/i_examples.rst +++ /dev/null @@ -1,11 +0,0 @@ - -.. _l-EX2: - -Examples -======== - -.. toctree:: - - i_ex - gyexamples/index - all_notebooks diff --git a/_doc/i_faq.rst b/_doc/i_faq.rst deleted file mode 100644 index 26ded95..0000000 --- a/_doc/i_faq.rst +++ /dev/null @@ -1,11 +0,0 @@ - -.. _l-FAQ2: - -FAQ -=== - -.. contents:: - :local: - -.. faqreflist:: - :contents: diff --git a/_doc/i_index.rst b/_doc/i_index.rst deleted file mode 100644 index ec0ffec..0000000 --- a/_doc/i_index.rst +++ /dev/null @@ -1,18 +0,0 @@ - -===== -Index -===== - -.. toctree:: - :maxdepth: 2 - - gyexamples/index - gynotebooks/index - issues_todoextlist - completed_todoextlist - filechanges - all_report - glossary - README - license - blog/blogindex diff --git a/_doc/index.rst b/_doc/index.rst index 15a7eb7..1d8755b 100644 --- a/_doc/index.rst +++ b/_doc/index.rst @@ -2,7 +2,7 @@ .. |gitlogo| image:: _static/git_logo.png :height: 20 -pandas_streaming: streaming API over pandas +pandas-streaming: streaming API over pandas =========================================== .. image:: https://github.com/sdpython/pandas_streaming/blob/master/_doc/sphinxdoc/source/_static/project_ico.png?raw=true @@ -56,20 +56,19 @@ pandas_streaming: streaming API over pandas :target: https://github.com/sdpython/pandas_streaming/ :alt: size -*pandas_streaming* aims at processing big files with `pandas `_, +*pandas_streaming* aims at processing big files with :epkg:`pandas`, too big to hold in memory, too small to be parallelized with a significant gain. -The module replicates a subset of `pandas `_ API +The module replicates a subset of :epkg:`pandas` API and implements other functionalities for machine learning. .. 
toctree:: :maxdepth: 1 tutorial/index + auto_examples/index api/index - i_examples - blog/blogindex - index_modules - i_index + i_ex + **Links:** `github `_, `documentation `_, diff --git a/_doc/notebooks/first_steps.ipynb b/_doc/notebooks/first_steps.ipynb deleted file mode 100644 index 735ede9..0000000 --- a/_doc/notebooks/first_steps.ipynb +++ /dev/null @@ -1,906 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# First steps with pandas_streaming\n", - "\n", - "A few difference between [pandas](http://pandas.pydata.org/) and *pandas_streaming*." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
[Remainder of the deleted notebook ``_doc/notebooks/first_steps.ipynb``: its markdown and code cells duplicate ``_doc/examples/first_step.py`` above (read_df, concat, merge, train_test_split, to_csv/read_csv, glob), followed by their rendered HTML table outputs and the notebook metadata.]
diff --git a/_doc/tutorial/index.rst b/_doc/tutorial/index.rst index 5828dd9..856b12b 100644 --- a/_doc/tutorial/index.rst +++ b/_doc/tutorial/index.rst
@@ -31,8 +31,6 @@ when it does not fit into memory. .. contents:: :local: -.. _l-objective: - Objectives and Competitors ++++++++++++++++++++++++++
@@ -111,10 +109,7 @@ A user can either choose to draw the same sample every time he is going through the data. He could also choose that a different sample should be drawn each time. The following method indicates which kinds of sample the :class:`StreamingDataFrame <pandas_streaming.df.dataframe.StreamingDataFrame>` -is producing. - -.. autosignature:: pandas_streaming.df.dataframe.StreamingDataFrame - :members: is_table +is producing (see :meth:`pandas_streaming.df.dataframe.StreamingDataFrame.is_table`).
Check the schema consistency of a large file ++++++++++++++++++++++++++++++++++++++++++++ diff --git a/pyproject.toml b/pyproject.toml index 50db37b..c1472ad 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,6 +7,7 @@ ignore_directives = [ "exreflist", "gdot", "image-sg", + "pr", "runpython", ] ignore_roles = ["epkg"] From 0971f57153840ebc7391ff2f942461da4fe8b346 Mon Sep 17 00:00:00 2001 From: Xavier Dupre Date: Sun, 23 Jul 2023 10:11:29 +0200 Subject: [PATCH 08/16] req --- .circleci/config.yml | 2 +- README.rst | 12 ++++++------ _doc/index.rst | 12 ++++++------ pandas_streaming/df/dataframe_io_helpers.py | 2 +- requirements-dev.txt | 2 +- 5 files changed, 15 insertions(+), 15 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 712677b..e76a1b7 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -54,7 +54,7 @@ jobs: - run: name: run tests command: | - python setup.py unittests + python -m pytest - run: name: wheel diff --git a/README.rst b/README.rst index 1096a34..8f0f162 100644 --- a/README.rst +++ b/README.rst @@ -1,10 +1,10 @@ pandas-streaming: streaming API over pandas =========================================== -.. image:: https://github.com/sdpython/pandas_streaming/blob/master/_doc/sphinxdoc/source/_static/project_ico.png?raw=true +.. image:: https://github.com/sdpython/pandas_streaming/blob/main/_doc/sphinxdoc/source/_static/project_ico.png?raw=true :target: https://github.com/sdpython/pandas_streaming/ -.. image:: https://travis-ci.com/sdpython/pandas_streaming.svg?branch=master +.. image:: https://travis-ci.com/sdpython/pandas_streaming.svg?branch=main :target: https://app.travis-ci.com/github/sdpython/pandas_streaming :alt: Build status @@ -12,8 +12,8 @@ pandas-streaming: streaming API over pandas :target: https://ci.appveyor.com/project/sdpython/pandas-streaming :alt: Build Status Windows -.. image:: https://circleci.com/gh/sdpython/pandas_streaming/tree/master.svg?style=svg - :target: https://circleci.com/gh/sdpython/pandas_streaming/tree/master +.. image:: https://circleci.com/gh/sdpython/pandas_streaming/tree/main.svg?style=svg + :target: https://circleci.com/gh/sdpython/pandas_streaming/tree/main .. image:: https://dev.azure.com/xavierdupre3/pandas_streaming/_apis/build/status/sdpython.pandas_streaming :target: https://dev.azure.com/xavierdupre3/pandas_streaming/ @@ -25,8 +25,8 @@ pandas-streaming: streaming API over pandas :alt: MIT License :target: http://opensource.org/licenses/MIT -.. image:: https://codecov.io/github/sdpython/pandas_streaming/coverage.svg?branch=master - :target: https://codecov.io/github/sdpython/pandas_streaming?branch=master +.. image:: https://codecov.io/github/sdpython/pandas_streaming/coverage.svg?branch=main + :target: https://codecov.io/github/sdpython/pandas_streaming?branch=main .. image:: http://img.shields.io/github/issues/sdpython/pandas_streaming.png :alt: GitHub Issues diff --git a/_doc/index.rst b/_doc/index.rst index 1d8755b..345d6a8 100644 --- a/_doc/index.rst +++ b/_doc/index.rst @@ -5,10 +5,10 @@ pandas-streaming: streaming API over pandas =========================================== -.. image:: https://github.com/sdpython/pandas_streaming/blob/master/_doc/sphinxdoc/source/_static/project_ico.png?raw=true +.. image:: https://github.com/sdpython/pandas_streaming/blob/main/_doc/sphinxdoc/source/_static/project_ico.png?raw=true :target: https://github.com/sdpython/pandas_streaming/ -.. image:: https://travis-ci.com/sdpython/pandas_streaming.svg?branch=master +.. 
image:: https://travis-ci.com/sdpython/pandas_streaming.svg?branch=main :target: https://app.travis-ci.com/github/sdpython/pandas_streaming :alt: Build status @@ -16,8 +16,8 @@ pandas-streaming: streaming API over pandas :target: https://ci.appveyor.com/project/sdpython/pandas-streaming :alt: Build Status Windows -.. image:: https://circleci.com/gh/sdpython/pandas_streaming/tree/master.svg?style=svg - :target: https://circleci.com/gh/sdpython/pandas_streaming/tree/master +.. image:: https://circleci.com/gh/sdpython/pandas_streaming/tree/main.svg?style=svg + :target: https://circleci.com/gh/sdpython/pandas_streaming/tree/main .. image:: https://dev.azure.com/xavierdupre3/pandas_streaming/_apis/build/status/sdpython.pandas_streaming :target: https://dev.azure.com/xavierdupre3/pandas_streaming/ @@ -29,8 +29,8 @@ pandas-streaming: streaming API over pandas :alt: MIT License :target: http://opensource.org/licenses/MIT -.. image:: https://codecov.io/github/sdpython/pandas_streaming/coverage.svg?branch=master - :target: https://codecov.io/github/sdpython/pandas_streaming?branch=master +.. image:: https://codecov.io/github/sdpython/pandas_streaming/coverage.svg?branch=main + :target: https://codecov.io/github/sdpython/pandas_streaming?branch=main .. image:: http://img.shields.io/github/issues/sdpython/pandas_streaming.png :alt: GitHub Issues diff --git a/pandas_streaming/df/dataframe_io_helpers.py b/pandas_streaming/df/dataframe_io_helpers.py index c8a7776..4f502fc 100644 --- a/pandas_streaming/df/dataframe_io_helpers.py +++ b/pandas_streaming/df/dataframe_io_helpers.py @@ -117,7 +117,7 @@ def flatten_dictionary(dico, sep="_"): :return: flattened dictionary Inspired from `flatten_json - `_. + `_. """ flattened_dict = {} diff --git a/requirements-dev.txt b/requirements-dev.txt index 16ed1c5..d5ec849 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -14,7 +14,7 @@ pyquickhelper>=1.10 scikit-learn scipy sphinx -sphinx-runpython +git+https://github.com/sdpython/sphinx-runpython.git sphinxcontrib.imagesvg sphinx_gallery ujson From 16368e7e209963497ebf094af218edee830d3a3f Mon Sep 17 00:00:00 2001 From: Xavier Dupre Date: Sun, 23 Jul 2023 11:00:25 +0200 Subject: [PATCH 09/16] req --- _doc/i_ex.rst | 2 -- requirements-dev.txt | 1 + 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/_doc/i_ex.rst b/_doc/i_ex.rst index 1fcf475..15f2342 100644 --- a/_doc/i_ex.rst +++ b/_doc/i_ex.rst @@ -1,6 +1,4 @@ -.. 
_l-EX2: - Examples ======== diff --git a/requirements-dev.txt b/requirements-dev.txt index d5ec849..35ed6ab 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -5,6 +5,7 @@ ijson jupyter_sphinx jyquickhelper matplotlib +nbsphinx pandas>=1.1.0 pandocfilters Pillow From 8f768dba2003a514606524e2e56de9920a423b9d Mon Sep 17 00:00:00 2001 From: Xavier Dupre Date: Sun, 23 Jul 2023 11:04:25 +0200 Subject: [PATCH 10/16] req --- requirements-dev.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements-dev.txt b/requirements-dev.txt index 35ed6ab..52d6e9a 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -15,6 +15,7 @@ pyquickhelper>=1.10 scikit-learn scipy sphinx +sphinx-issues git+https://github.com/sdpython/sphinx-runpython.git sphinxcontrib.imagesvg sphinx_gallery From 955251e0eabc93bf2e41ee2bda59d11692ce6129 Mon Sep 17 00:00:00 2001 From: Xavier Dupre Date: Sun, 23 Jul 2023 11:12:51 +0200 Subject: [PATCH 11/16] req --- requirements-dev.txt | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index 52d6e9a..a10fbfd 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,4 +1,5 @@ autopep8 +black coverage furo ijson @@ -11,13 +12,16 @@ pandocfilters Pillow pycodestyle pylint>=2.14.0 +pytest +pytest-cov pyquickhelper>=1.10 +rstcheck[sphinx,toml] +ruff scikit-learn scipy sphinx sphinx-issues git+https://github.com/sdpython/sphinx-runpython.git -sphinxcontrib.imagesvg sphinx_gallery ujson wheel From 9fe59a69251369dbfb2f720f61adbb64e659c94c Mon Sep 17 00:00:00 2001 From: Xavier Dupre Date: Sun, 23 Jul 2023 11:33:36 +0200 Subject: [PATCH 12/16] doc --- _doc/api/rdf.rst | 3 + _doc/conf.py | 1 + _doc/examples/first_step.py | 8 +-- pandas_streaming/df/connex_split.py | 75 +++++++++++---------- pandas_streaming/df/dataframe.py | 44 ++++++------ pandas_streaming/df/dataframe_io.py | 4 +- pandas_streaming/df/dataframe_io_helpers.py | 2 +- 7 files changed, 69 insertions(+), 68 deletions(-) diff --git a/_doc/api/rdf.rst b/_doc/api/rdf.rst index 09bf139..1a41bd1 100644 --- a/_doc/api/rdf.rst +++ b/_doc/api/rdf.rst @@ -26,10 +26,13 @@ Many methods have been rewritten to support streaming. Among them, IO methods: .. autofunction:: pandas_streaming.df.dataframe.StreamingDataFrame.read_csv + :noindex: .. autofunction:: pandas_streaming.df.dataframe.StreamingDataFrame.read_df + :noindex: .. 
autofunction:: pandas_streaming.df.dataframe.StreamingDataFrame.read_json + :noindex: Data Manipulation +++++++++++++++++ diff --git a/_doc/conf.py b/_doc/conf.py index 446bf9d..746d91f 100644 --- a/_doc/conf.py +++ b/_doc/conf.py @@ -180,6 +180,7 @@ "groupby and missing values": "https://pandas.pydata.org/pandas-docs/stable/user_guide/missing_data.html", "Hadoop": "http://hadoop.apache.org/", "ijson": "https://github.com/ICRAR/ijson", + "json": "https://docs.python.org/3/library/json.html", "nan": "https://numpy.org/doc/stable/reference/constants.html#numpy.NAN", "pandas": ( "http://pandas.pydata.org/pandas-docs/stable/", diff --git a/_doc/examples/first_step.py b/_doc/examples/first_step.py index 1c446a2..bd6870c 100644 --- a/_doc/examples/first_step.py +++ b/_doc/examples/first_step.py @@ -7,8 +7,10 @@ pandas to pandas_streaming ++++++++++++++++++++++++++ """ - +import glob from pandas import DataFrame +from pandas_streaming.df import StreamingDataFrame + df = DataFrame(data=dict(X=[4.5, 6, 7], Y=["a", "b", "c"])) df @@ -18,8 +20,6 @@ # We create a streaming dataframe: -from pandas_streaming.df import StreamingDataFrame - sdf = StreamingDataFrame.read_df(df) sdf @@ -95,6 +95,4 @@ ############################################ # -import glob - glob.glob("ex*.txt") diff --git a/pandas_streaming/df/connex_split.py b/pandas_streaming/df/connex_split.py index c3b6150..1636d13 100644 --- a/pandas_streaming/df/connex_split.py +++ b/pandas_streaming/df/connex_split.py @@ -147,32 +147,32 @@ def train_test_connex_split( test set. The function computes the connected components and breaks each of them in two parts for train and test. - @param df :epkg:`pandas:DataFrame` - @param groups columns name for the ids - @param test_size ratio for the test partition - (if *train_size* is not specified) - @param train_size ratio for the train partition - @param stratify column holding the stratification - @param hash_size size of the hash to cache information about partition - @param unique_rows ensures that rows are unique - @param shuffle shuffles before the split - @param fail_imbalanced raises an exception if relative weights difference - is higher than this value - @param stop_if_bigger (float) stops a connected components from being - bigger than this ratio of elements, this should not be used - unless a big components emerges, the algorithm stops merging - but does not guarantee it returns the best cut, - the value should be close to 0 - @param keep_balance (float), if not None, does not merge connected components - if their relative sizes are too different, - the value should be close to 1 - @param return_cnx returns connected components as a third results - @param must_groups column name for ids which must not be shared by - train/test partitions - @param random_state seed for random generator - @param verbose verbosity (uses logging) - @return Two see :class:`StreamingDataFrame`, one - for train, one for test. 
+ :param df: :epkg:`pandas:DataFrame` + :param groups: columns name for the ids + :param test_size: ratio for the test partition + (if *train_size* is not specified) + :param train_size: ratio for the train partition + :param stratify: column holding the stratification + :param hash_size: size of the hash to cache information about partition + :param unique_rows: ensures that rows are unique + :param shuffle: shuffles before the split + :param fail_imbalanced: raises an exception if relative weights difference + is higher than this value + :param stop_if_bigger: (float) stops a connected components from being + bigger than this ratio of elements, this should not be used + unless a big components emerges, the algorithm stops merging + but does not guarantee it returns the best cut, + the value should be close to 0 + :param keep_balance: (float), if not None, does not merge connected components + if their relative sizes are too different, + the value should be close to 1 + :param return_cnx: returns connected components as a third results + :param must_groups: column name for ids which must not be shared by + train/test partitions + :param random_state: seed for random generator + :param verbose: verbosity (uses logging) + :return: Two see :class:`StreamingDataFrame`, one + for train, one for test. The list of ids must hold in memory. There is no streaming implementation for the ids. @@ -462,17 +462,17 @@ def train_test_apart_stratify( distinct products on train and test but common categories on both sides. - @param df :epkg:`pandas:DataFrame` - @param group columns name for the ids - @param test_size ratio for the test partition - (if *train_size* is not specified) - @param train_size ratio for the train partition - @param stratify column holding the stratification - @param force if True, tries to get at least one example on the test side - for each value of the column *stratify* - @param random_state seed for random generators - @return Two see :class:`StreamingDataFrame`, one - for train, one for test. + :param df: :epkg:`pandas:DataFrame` + :param group: columns name for the ids + :param test_size: ratio for the test partition + (if *train_size* is not specified) + :param train_size: ratio for the train partition + :param stratify: column holding the stratification + :param force: if True, tries to get at least one example on the test side + for each value of the column *stratify* + :param random_state: seed for random generators + :return: Two see :class:`StreamingDataFrame`, one + for train, one for test. .. index:: multi-label @@ -501,6 +501,7 @@ def train_test_apart_stratify( print(train) print('-----------') print(test) + """ if stratify is None: raise ValueError("stratify must be specified.") # pragma: no cover diff --git a/pandas_streaming/df/dataframe.py b/pandas_streaming/df/dataframe.py index b519102..3e79c55 100644 --- a/pandas_streaming/df/dataframe.py +++ b/pandas_streaming/df/dataframe.py @@ -44,7 +44,7 @@ class StreamingDataFrame: The constructor cannot receive an iterator otherwise this class would be able to walk through the data only once. The main reason is it is impossible to - :epkg:`*py:pickle` (or :epkg:`dill`) + :mod:`pickle` (or :epkg:`dill`) an iterator: it cannot be replicated. Instead, the class takes a function which generates an iterator on :epkg:`DataFrame`. @@ -89,10 +89,10 @@ def is_stable(self, do_check=False, n=10): """ Tells if the :epkg:`dataframe` is supposed to be stable. 
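To illustrate why the constructor described above takes a callable rather than a bare iterator, a small sketch (the two-chunk generator is invented; ``to_dataframe`` appears elsewhere in this patch)::

    from pandas import DataFrame
    from pandas_streaming.df import StreamingDataFrame

    def iterate_chunks():
        # a callable can be invoked again for every pass over the data,
        # which a bare iterator (impossible to pickle or replay) cannot offer
        yield DataFrame(dict(X=[0, 1], Y=["a", "b"]))
        yield DataFrame(dict(X=[2, 3], Y=["c", "d"]))

    sdf = StreamingDataFrame(iterate_chunks)
    print(sdf.to_dataframe())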
- @param do_check do not trust the value sent to the constructor - @param n number of rows used to check the stability, - None for all rows - @return boolean + :param do_check: do not trust the value sent to the constructor + :param n: number of rows used to check the stability, + None for all rows + :return: boolean *do_check=True* means the methods checks the first *n* rows remains the same for two iterations. @@ -130,23 +130,23 @@ def train_test_split( It chooses one of the options from module :mod:`dataframe_split `. - @param path_or_buf a string, a list of strings or buffers, if it is a - string, it must contain ``{}`` like ``partition{}.txt``, - if None, the function returns strings. - @param export_method method used to store the partitions, by default - :epkg:`pandas:DataFrame:to_csv`, additional parameters - will be given to that function - @param names partitions names, by default ``('train', 'test')`` - @param kwargs parameters for the export function and - :func:`sklearn.model_selection.train_test_split`. - @param streaming the function switches to a - streaming version of the algorithm. - @param partitions splitting partitions - @return outputs of the exports functions or two - see :class:`StreamingDataFrame` if path_or_buf is None. + :param path_or_buf: a string, a list of strings or buffers, if it is a + string, it must contain ``{}`` like ``partition{}.txt``, + if None, the function returns strings. + :param export_method: method used to store the partitions, by default + :epkg:`pandas:DataFrame:to_csv`, additional parameters + will be given to that function + :param names: partitions names, by default ``('train', 'test')`` + :param kwargs: parameters for the export function and + :func:`sklearn.model_selection.train_test_split`. + :param streaming: the function switches to a + streaming version of the algorithm. + :param partitions: splitting partitions + :return: outputs of the exports functions or two + see :class:`StreamingDataFrame` if *path_or_buf* is None. The streaming version of this algorithm is implemented by function - @see fn sklearn_train_test_split_streaming. Its documentation + :func:`sklearn_train_test_split_streaming`. Its documentation indicates the limitation of the streaming version and gives some insights about the additional parameters. """ @@ -229,11 +229,9 @@ def read_json( dfs = list(it) print(dfs) - .. index:: IncompleteJSONError - The parsed json must have an empty line at the end otherwise the following exception is raised: - `ijson.common.IncompleteJSONError: ` + `ijson.common.IncompleteJSONError`: `parse error: unallowed token at this point in JSON text`. """ if not isinstance(chunksize, int) or chunksize <= 0: diff --git a/pandas_streaming/df/dataframe_io.py b/pandas_streaming/df/dataframe_io.py index 532a2bf..bf13cc2 100644 --- a/pandas_streaming/df/dataframe_io.py +++ b/pandas_streaming/df/dataframe_io.py @@ -11,7 +11,7 @@ def to_zip(df, zipfilename, zname="df.csv", **kwargs): It can be read by @see fn to_zip. :param df: :epkg:`dataframe` or :epkg:`numpy:array` - :param zipfilename: a :epkg:`*py:zipfile:ZipFile` or a filename + :param zipfilename: a :class:`zipfile:ZipFile` or a filename :param zname: a filename in th zipfile :param kwargs: parameters for :epkg:`pandas:to_csv` or :epkg:`numpy:save` @@ -104,7 +104,7 @@ def read_zip(zipfilename, zname=None, **kwargs): Reads a :epkg:`dataframe` from a :epkg:`zip` file. It can be saved by @see fn read_zip. 
- :param zipfilename: a :epkg:`*py:zipfile:ZipFile` or a filename + :param zipfilename: a :class:`zipfile.ZipFile` or a filename :param zname: a filename in zipfile, if None, takes the first one :param kwargs: parameters for :func:`pandas.read_csv` :return: :func:`pandas.DataFrame` or :epkg:`numpy:array` diff --git a/pandas_streaming/df/dataframe_io_helpers.py b/pandas_streaming/df/dataframe_io_helpers.py index 4f502fc..8c00ba2 100644 --- a/pandas_streaming/df/dataframe_io_helpers.py +++ b/pandas_streaming/df/dataframe_io_helpers.py @@ -339,7 +339,7 @@ class JsonIterator2Stream: The iterator could be one returned by @see fn enumerate_json_items. :param it: iterator - :param kwargs: arguments to :epkg:`*py:json:dumps` + :param kwargs: arguments to :class:`json.dumps` .. exref:: :title: Reshape a json file From a132c58f3eab352514e7c83d89596659d75745a4 Mon Sep 17 00:00:00 2001 From: Xavier Dupre Date: Sun, 23 Jul 2023 12:50:45 +0200 Subject: [PATCH 13/16] doc --- README.rst | 4 - _doc/api/connex_split.rst | 6 ++ _doc/api/dataframe.rst | 22 +++++ _doc/api/dataframe_io.rst | 6 ++ _doc/api/dataframe_split.rst | 6 ++ _doc/api/rdf.rst | 38 ++++----- _doc/api/rio.rst | 11 +-- _doc/conf.py | 7 +- _doc/glossary.rst | 13 --- _doc/i_ex.rst | 3 - _doc/index.rst | 19 +---- _doc/tutorial/index.rst | 8 +- pandas_streaming/df/connex_split.py | 29 +++---- pandas_streaming/df/dataframe.py | 109 +++++++++++++++---------- pandas_streaming/df/dataframe_io.py | 20 ++--- pandas_streaming/df/dataframe_split.py | 34 ++++---- 16 files changed, 185 insertions(+), 150 deletions(-) create mode 100644 _doc/api/connex_split.rst create mode 100644 _doc/api/dataframe.rst create mode 100644 _doc/api/dataframe_io.rst create mode 100644 _doc/api/dataframe_split.rst delete mode 100644 _doc/glossary.rst diff --git a/README.rst b/README.rst index 8f0f162..f3fdde7 100644 --- a/README.rst +++ b/README.rst @@ -32,10 +32,6 @@ pandas-streaming: streaming API over pandas :alt: GitHub Issues :target: https://github.com/sdpython/pandas_streaming/issues -.. image:: http://www.xavierdupre.fr/app/pandas_streaming/helpsphinx/_images/nbcov.png - :target: http://www.xavierdupre.fr/app/pandas_streaming/helpsphinx/all_notebooks_coverage.html - :alt: Notebook Coverage - .. image:: https://pepy.tech/badge/pandas_streaming/month :target: https://pepy.tech/project/pandas_streaming/month :alt: Downloads diff --git a/_doc/api/connex_split.rst b/_doc/api/connex_split.rst new file mode 100644 index 0000000..1612130 --- /dev/null +++ b/_doc/api/connex_split.rst @@ -0,0 +1,6 @@ + +pandas_streaming.df.connex_split +================================ + +.. automodule:: pandas_streaming.df.connex_split + :members: diff --git a/_doc/api/dataframe.rst b/_doc/api/dataframe.rst new file mode 100644 index 0000000..143c558 --- /dev/null +++ b/_doc/api/dataframe.rst @@ -0,0 +1,22 @@ + +pandas_streaming.df.dataframe +============================= + +StreamingDataFrameSchemaError ++++++++++++++++++++++++++++++ + +.. autoclass:: pandas_streaming.df.dataframe.StreamingDataFrameSchemaError + :members: + +StreamingDataFrame +++++++++++++++++++ + +.. autoclass:: pandas_streaming.df.dataframe.StreamingDataFrame + :members: + :special-members: + +StreamingSeries ++++++++++++++++ + +.. 
autoclass:: pandas_streaming.df.dataframe.StreamingSeries + :members: diff --git a/_doc/api/dataframe_io.rst b/_doc/api/dataframe_io.rst new file mode 100644 index 0000000..c346af5 --- /dev/null +++ b/_doc/api/dataframe_io.rst @@ -0,0 +1,6 @@ + +pandas_streaming.df.dataframe_io +================================ + +.. automodule:: pandas_streaming.df.dataframe_io + :members: diff --git a/_doc/api/dataframe_split.rst b/_doc/api/dataframe_split.rst new file mode 100644 index 0000000..30e6dee --- /dev/null +++ b/_doc/api/dataframe_split.rst @@ -0,0 +1,6 @@ + +pandas_streaming.df.dataframe_split +=================================== + +.. automodule:: pandas_streaming.df.dataframe_split + :members: diff --git a/_doc/api/rdf.rst b/_doc/api/rdf.rst index 1a41bd1..751e353 100644 --- a/_doc/api/rdf.rst +++ b/_doc/api/rdf.rst @@ -2,14 +2,11 @@ pandas_streaming.df =================== -.. contents:: - :local: - Streaming +++++++++ The main class is an interface which mimic -:epkg:`pandas:DataFrame` interface to offer +:class:`pandas.DataFrame` interface to offer a short list of methods which apply on an iterator of dataframes. This provides somehow a streaming version of it. As a result, the creation @@ -17,22 +14,18 @@ of an instance is fast as long as the data is not processed. Iterators can be chained as many map reduce framework does. -.. autoclass:: pandas_streaming.df.dataframe.StreamingDataFrame - :members: +.. toctree:: + :maxdepth: 2 + + dataframe The module implements additional and useful functions not necessarily for the streaming version of the dataframes. Many methods have been rewritten to support streaming. Among them, IO methods: - -.. autofunction:: pandas_streaming.df.dataframe.StreamingDataFrame.read_csv - :noindex: - -.. autofunction:: pandas_streaming.df.dataframe.StreamingDataFrame.read_df - :noindex: - -.. autofunction:: pandas_streaming.df.dataframe.StreamingDataFrame.read_json - :noindex: +:meth:`read_csv `, +:meth:`read_df `, +:meth:`read_json `. Data Manipulation +++++++++++++++++ @@ -51,10 +44,17 @@ Complex splits Splitting a database into train and test is usually simple except if rows are not independant and share some ids. In that case, the following functions will try to build two partitions keeping -ids separate or separate as much as possible. +ids separate or separate as much as possible: +:func:`train_test_apart_stratify `, +:func:`train_test_connex_split `, +:func:`train_test_split_weights `. -.. autofunction:: pandas_streaming.df.connex_split.train_test_apart_stratify +Extensions +++++++++++ -.. autofunction:: pandas_streaming.df.connex_split.train_test_connex_split +.. toctree:: + :maxdepth: 1 -.. autofunction:: pandas_streaming.df.connex_split.train_test_split_weights + connex_split + dataframe_io + dataframe_split diff --git a/_doc/api/rio.rst b/_doc/api/rio.rst index 357f6cc..4de1211 100644 --- a/_doc/api/rio.rst +++ b/_doc/api/rio.rst @@ -2,9 +2,6 @@ Inputs / Outputs ================ -.. contents:: - :local: - Dataframes / Numpy arrays +++++++++++++++++++++++++ @@ -12,8 +9,6 @@ Dataframes / Numpy arrays is easy to manipulate in the :epkg:`Python` world but difficult to exchange with other people and other environments. The two following functions makes it easier to collapse many dataframes -or numpy arrays into one single file. The data can be unzipped afterwards. - -.. autofunction:: pandas_streaming.df.dataframe_io.read_zip - -.. autofunction:: pandas_streaming.df.dataframe_io.to_zip +or numpy arrays into one single file. 
The data can be unzipped afterwards, +see :func:`read_zip `, +:func:`to_zip `. diff --git a/_doc/conf.py b/_doc/conf.py index 746d91f..811119c 100644 --- a/_doc/conf.py +++ b/_doc/conf.py @@ -178,10 +178,12 @@ "dataframes": "https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html", "dill": "https://dill.readthedocs.io/en/latest/dill.html", "groupby and missing values": "https://pandas.pydata.org/pandas-docs/stable/user_guide/missing_data.html", + "Jupyter": "https://jupyter.org/", "Hadoop": "http://hadoop.apache.org/", "ijson": "https://github.com/ICRAR/ijson", "json": "https://docs.python.org/3/library/json.html", "nan": "https://numpy.org/doc/stable/reference/constants.html#numpy.NAN", + "numpy": "https://numpy.org/", "pandas": ( "http://pandas.pydata.org/pandas-docs/stable/", ( @@ -195,12 +197,15 @@ ), "pyarrow": "https://arrow.apache.org/docs/python/", "pyspark": "http://spark.apache.org/docs/2.1.1/api/python/index.html", + "Python": "https://www.python.org/", + "scikit-learn": "https://scikit-learn.org/stable/", "scikit-multiflow": "https://scikit-multiflow.github.io/", "sklearn": ( - "http://scikit-learn.org/stable/", + "https://scikit-learn.org/stable/", ("https://scikit-learn.org/stable/modules/generated/{0}.html", 1), ("https://scikit-learn.org/stable/modules/generated/{0}.{1}.html", 2), ), "streamz": "https://streamz.readthedocs.io/en/latest/index.html", "tornado": "https://www.tornadoweb.org/en/stable/", + "zip": "https://en.wikipedia.org/wiki/ZIP_(file_format)", } diff --git a/_doc/glossary.rst b/_doc/glossary.rst deleted file mode 100644 index cf8651d..0000000 --- a/_doc/glossary.rst +++ /dev/null @@ -1,13 +0,0 @@ - -.. index:: glossary - -Glossary -======== - -.. glossary:: - - Jupyter - See :epkg:`Jupyter` - - pandas - See :epkg:`pandas`. diff --git a/_doc/i_ex.rst b/_doc/i_ex.rst index 15f2342..43a0265 100644 --- a/_doc/i_ex.rst +++ b/_doc/i_ex.rst @@ -2,9 +2,6 @@ Examples ======== -.. contents:: - :local: - About array +++++++++++ diff --git a/_doc/index.rst b/_doc/index.rst index 345d6a8..fa91aea 100644 --- a/_doc/index.rst +++ b/_doc/index.rst @@ -36,10 +36,6 @@ pandas-streaming: streaming API over pandas :alt: GitHub Issues :target: https://github.com/sdpython/pandas_streaming/issues -.. image:: nbcov.png - :target: http://www.xavierdupre.fr/app/pandas_streaming/helpsphinx/all_notebooks_coverage.html - :alt: Notebook Coverage - .. image:: https://pepy.tech/badge/pandas_streaming :target: https://pypi.org/project/pandas_streaming/ :alt: Downloads @@ -68,17 +64,4 @@ and implements other functionalities for machine learning. 
auto_examples/index api/index i_ex - - -**Links:** `github `_, -`documentation `_, -:ref:`l-README`, -:ref:`blog ` - -+----------------------+---------------------+---------------------+--------------------+------------------------+------------------------------------------------+ -| :ref:`l-modules` | :ref:`l-functions` | :ref:`l-classes` | :ref:`l-methods` | :ref:`l-staticmethods` | :ref:`l-properties` | -+----------------------+---------------------+---------------------+--------------------+------------------------+------------------------------------------------+ -| :ref:`modindex` | :ref:`l-EX2` | :ref:`search` | :ref:`l-license` | :ref:`l-changes` | :ref:`l-README` | -+----------------------+---------------------+---------------------+--------------------+------------------------+------------------------------------------------+ -| :ref:`genindex` | :ref:`l-FAQ2` | :ref:`l-notebooks` | | :ref:`l-statcode` | `Unit Test Coverage `_ | -+----------------------+---------------------+---------------------+--------------------+------------------------+------------------------------------------------+ + license diff --git a/_doc/tutorial/index.rst b/_doc/tutorial/index.rst index 856b12b..1e693c7 100644 --- a/_doc/tutorial/index.rst +++ b/_doc/tutorial/index.rst @@ -28,9 +28,6 @@ when it does not fit into memory. >>> ['dataset_split_train.txt', 'dataset_split_test.txt'] -.. contents:: - :local: - Objectives and Competitors ++++++++++++++++++++++++++ @@ -109,7 +106,7 @@ A user can either choose to draw the same sample every time he is going through the data. He could also choose that a different sample should be drawn each time. The following method indicates which kinds of sample the :class:`StreamingDataFrame ` -is producing (see :meth:`pandas_streaming.df.dataframe.StreamingDataFrame.is_table`). +is producing. Check the schema consistency of a large file ++++++++++++++++++++++++++++++++++++++++++++ @@ -138,7 +135,8 @@ an idea of where we could find the error. except Exception as e: print("ERROR:", e) -The method :py:meth:`__iter__ ` +The method :meth:`__iter__ +` checks that the schema does not change between two iterations. It can be disabled by adding *check_schema=False* when the constructor is called. diff --git a/pandas_streaming/df/connex_split.py b/pandas_streaming/df/connex_split.py index 1636d13..1df251f 100644 --- a/pandas_streaming/df/connex_split.py +++ b/pandas_streaming/df/connex_split.py @@ -29,16 +29,17 @@ def train_test_split_weights( Splits a database in train/test given, every row can have a different weight. 
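A hedged usage sketch of the weighted split whose docstring starts above; the top-level import and the toy weights are assumptions, not taken from the patch::

    from pandas import DataFrame
    from pandas_streaming.df import train_test_split_weights  # assumed export

    df = DataFrame(dict(X=range(8), w=[1, 1, 1, 1, 2, 2, 2, 2]))
    # weights can be given as a column name; fail_imbalanced tolerates up to
    # 50% relative difference between the weighted sizes of the two partitions
    train, test = train_test_split_weights(
        df, weights="w", test_size=0.25, fail_imbalanced=0.5
    )
    print(train.shape, test.shape)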
- @param df :epkg:`pandas:DataFrame` or see :class:`StreamingDataFrame` - @param weights None or weights or weights column name - @param test_size ratio for the test partition - (if *train_size* is not specified) - @param train_size ratio for the train partition - @param shuffle shuffles before the split - @param fail_imbalanced raises an exception if relative weights - difference is higher than this value - @param random_state seed for random generators - @return train and test :epkg:`pandas:DataFrame` + :param df: :class:`pandas.DataFrame` or see + :class:`StreamingDataFrame ` + :param weights: None or weights or weights column name + :param test_size: ratio for the test partition + (if *train_size* is not specified) + :param train_size: ratio for the train partition + :param shuffle: shuffles before the split + :param fail_imbalanced: raises an exception if relative weights + difference is higher than this value + :param random_state: seed for random generators + :return: train and test :class:`pandas.DataFrame` If the dataframe is not shuffled first, the function will produce two datasets which are unlikely to be randomized @@ -171,7 +172,8 @@ def train_test_connex_split( train/test partitions :param random_state: seed for random generator :param verbose: verbosity (uses logging) - :return: Two see :class:`StreamingDataFrame`, one + :return: Two see :class:`StreamingDataFrame + `, one for train, one for test. The list of ids must hold in memory. @@ -471,11 +473,10 @@ def train_test_apart_stratify( :param force: if True, tries to get at least one example on the test side for each value of the column *stratify* :param random_state: seed for random generators - :return: Two see :class:`StreamingDataFrame`, one + :return: Two see :class:`StreamingDataFrame + `, one for train, one for test. - .. index:: multi-label - The list of ids must hold in memory. There is no streaming implementation for the ids. This split was implemented for a case of a multi-label diff --git a/pandas_streaming/df/dataframe.py b/pandas_streaming/df/dataframe.py index 3e79c55..6537d1f 100644 --- a/pandas_streaming/df/dataframe.py +++ b/pandas_streaming/df/dataframe.py @@ -49,7 +49,8 @@ class StreamingDataFrame: Instead, the class takes a function which generates an iterator on :epkg:`DataFrame`. Most of the methods returns either a :epkg:`DataFrame` - either a see :class:`StreamingDataFrame`. In the second case, + either a see :class:`StreamingDataFrame + `. In the second case, methods can be chained. By default, the object checks that the schema remains @@ -63,7 +64,8 @@ class StreamingDataFrame: is one of these cases. :param iter_creation: function which creates an iterator or an - instance of see :class:`StreamingDataFrame` + instance of see :class:`StreamingDataFrame + ` :param check_schema: checks that the schema is the same for every :epkg:`dataframe` :param stable: indicates if the :epkg:`dataframe` remains the same @@ -134,7 +136,7 @@ def train_test_split( string, it must contain ``{}`` like ``partition{}.txt``, if None, the function returns strings. :param export_method: method used to store the partitions, by default - :epkg:`pandas:DataFrame:to_csv`, additional parameters + :meth:`pandas.DataFrame.to_csv`, additional parameters will be given to that function :param names: partitions names, by default ``('train', 'test')`` :param kwargs: parameters for the export function and @@ -143,12 +145,14 @@ def train_test_split( streaming version of the algorithm. 
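A short usage sketch for the signature converted just above; the import path and parameter names come from this diff, the data and values are made up::

    import pandas
    from pandas_streaming.df.connex_split import train_test_split_weights

    df = pandas.DataFrame({
        "x": range(10),
        "w": [1, 1, 1, 1, 1, 5, 5, 5, 5, 5],  # per-row weights
    })

    # weights may be a column name or explicit weights; fail_imbalanced bounds
    # the relative weight difference tolerated between the two partitions
    train, test = train_test_split_weights(
        df, weights="w", test_size=0.25, shuffle=True,
        fail_imbalanced=0.05, random_state=0,
    )
    print(train.shape, test.shape)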
:param partitions: splitting partitions :return: outputs of the exports functions or two - see :class:`StreamingDataFrame` if *path_or_buf* is None. + see class `StreamingDataFrame` + if *path_or_buf* is None. The streaming version of this algorithm is implemented by function - :func:`sklearn_train_test_split_streaming`. Its documentation - indicates the limitation of the streaming version and gives some - insights about the additional parameters. + :func:`sklearn_train_test_split_streaming + `. + Its documentation indicates the limitation of the streaming version + and gives some insights about the additional parameters. """ if streaming: if partitions is not None: @@ -376,10 +380,11 @@ def read_df(df, chunksize=None, check_schema=True) -> "StreamingDataFrame": Splits a :epkg:`DataFrame` into small chunks mostly for unit testing purposes. - @param df :epkg:`DataFrame` - @param chunksize number rows per chunks (// 10 by default) - @param check_schema check schema between two iterations - @return iterator on see :class:`StreamingDataFrame` + :param df: :class:`pandas.DataFrame` + :param chunksize: number rows per chunks (// 10 by default) + :param check_schema: check schema between two iterations + :return: iterator on see :class:`StreamingDataFrame + ` """ if chunksize is None: if hasattr(df, "shape"): @@ -569,7 +574,8 @@ def where(self, *args, **kwargs) -> "StreamingDataFrame": """ Applies :epkg:`pandas:DataFrame:where`. *inplace* must be False. - This function returns a see :class:`StreamingDataFrame`. + This function returns a see :class:`StreamingDataFrame + `. """ kwargs["inplace"] = False return StreamingDataFrame( @@ -578,15 +584,15 @@ def where(self, *args, **kwargs) -> "StreamingDataFrame": def sample(self, reservoir=False, cache=False, **kwargs) -> "StreamingDataFrame": """ - See :epkg:`pandas:DataFrame:sample`. - Only *frac* is available, otherwise choose - @see me reservoir_sampling. - This function returns a see :class:`StreamingDataFrame`. + See :meth:`pandas.DataFrame.sample`. + Only *frac* is available, otherwise choose :meth`reservoir_sampling`. + This function returns a see :class:`StreamingDataFrame + `. :param reservoir: use `reservoir sampling `_ :param cache: cache the sample - :param kwargs: additional parameters for :epkg:`pandas:DataFrame:sample` + :param kwargs: additional parameters for :meth:`pandas.DataFrame.sample` If *cache* is True, the sample is cached (assuming it holds in memory). The second time an iterator walks through the @@ -614,10 +620,11 @@ def _reservoir_sampling( Uses the `reservoir sampling `_ algorithm to draw a random sample with exactly *n* samples. - @param cache cache the sample - @param n number of observations to keep - @param random_state sets the random_state - @return see :class:`StreamingDataFrame` + :param cache: cache the sample + :param n: number of observations to keep + :param random_state: sets the random_state + :return: see :class:`StreamingDataFrame + ` .. warning:: The sample is split by chunks of size 1000. @@ -669,7 +676,8 @@ def drop( ) -> "StreamingDataFrame": """ Applies :epkg:`pandas:DataFrame:drop`. - This function returns a see :class:`StreamingDataFrame`. + This function returns a see :class:`StreamingDataFrame + `. """ if axis == 0: raise NotImplementedError(f"drop is not implemented for axis={axis}.") @@ -694,7 +702,8 @@ def drop( def apply(self, *args, **kwargs) -> "StreamingDataFrame": """ Applies :epkg:`pandas:DataFrame:apply`. - This function returns a see :class:`StreamingDataFrame`. 
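Based on the ``train_test_split`` and ``read_df`` docstrings above, a minimal sketch of the non-streaming path; the file pattern is a placeholder and the extra keyword argument is assumed to be dispatched to ``sklearn.model_selection.train_test_split`` as the docstring states::

    import pandas
    from pandas_streaming.df.dataframe import read_df

    df = pandas.DataFrame({"x": range(100), "y": ["a", "b"] * 50})

    # wrap the in-memory DataFrame into chunks of 20 rows
    sdf = read_df(df, chunksize=20)

    # path_or_buf must contain "{}"; partitions are written with the default
    # export method (DataFrame.to_csv) and named after ("train", "test")
    outputs = sdf.train_test_split("partition_{}.csv", test_size=0.2)
    print(outputs)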
+ This function returns a see :class:`StreamingDataFrame + `. """ return StreamingDataFrame( lambda: map(lambda df: df.apply(*args, **kwargs), self), **self.get_kwargs() @@ -703,7 +712,8 @@ def apply(self, *args, **kwargs) -> "StreamingDataFrame": def applymap(self, *args, **kwargs) -> "StreamingDataFrame": """ Applies :epkg:`pandas:DataFrame:applymap`. - This function returns a see :class:`StreamingDataFrame`. + This function returns a see :class:`StreamingDataFrame + `. """ return StreamingDataFrame( lambda: map(lambda df: df.applymap(*args, **kwargs), self), @@ -712,9 +722,12 @@ def applymap(self, *args, **kwargs) -> "StreamingDataFrame": def merge(self, right, **kwargs) -> "StreamingDataFrame": """ - Merges two see :class:`StreamingDataFrame` - and returns see :class:`StreamingDataFrame`. - *right* can be either a see :class:`StreamingDataFrame` or simply + Merges two see :class:`StreamingDataFrame + ` + and returns see :class:`StreamingDataFrame + `. + *right* can be either a see :class:`StreamingDataFrame + ` or simply a :epkg:`pandas:DataFrame`. It calls :epkg:`pandas:DataFrame:merge` in a double loop, loop on *self*, loop on *right*. """ @@ -738,13 +751,16 @@ def concat(self, others, axis=0) -> "StreamingDataFrame": """ Concatenates :epkg:`dataframes`. The function ensures all :epkg:`pandas:DataFrame` - or see :class:`StreamingDataFrame` share the same columns (name and type). + or see :class:`StreamingDataFrame + ` + share the same columns (name and type). Otherwise, the function fails as it cannot guess the schema without walking through all :epkg:`dataframes`. :param others: list, enumeration, :epkg:`pandas:DataFrame` :param axis: concatenate by rows (0) or by columns (1) - :return: see :class:`StreamingDataFrame` + :return: see :class:`StreamingDataFrame + ` """ if axis == 1: return self._concath(others) @@ -827,7 +843,8 @@ def groupby( :param kwargs: additional parameters for :epkg:`pandas:DataFrame:groupby` :return: :epkg:`pandas:DataFrame` - As the input see :class:`StreamingDataFrame` does not necessarily hold + As the input see :class:`StreamingDataFrame + ` does not necessarily hold in memory, the aggregation must be done at every iteration. There are two levels of aggregation: one to reduce every iterated :epkg:`dataframe`, another one to combine all the reduced :epkg:`dataframes`. @@ -847,7 +864,8 @@ def groupby( :tag: streaming Here is an example which shows how to write a simple *groupby* - with :epkg:`pandas` and see :class:`StreamingDataFrame`. + with :epkg:`pandas` and see :class:`StreamingDataFrame + `. .. runpython:: :showcode: @@ -912,7 +930,8 @@ def groupby_streaming( :param strategy: ``'cum'``, or ``'streaming'``, see below :return: :epkg:`pandas:DataFrame` - As the input see :class:`StreamingDataFrame` does not necessarily hold + As the input see :class:`StreamingDataFrame + ` does not necessarily hold in memory, the aggregation must be done at every iteration. There are two levels of aggregation: one to reduce every iterated :epkg:`dataframe`, another one to combine all the reduced :epkg:`dataframes`. @@ -931,7 +950,9 @@ def groupby_streaming( First one if ``strategy is None`` goes through the whole datasets to produce a final :epkg:`DataFrame`. Second if ``strategy=='cum'`` returns a - see :class:`StreamingDataFrame`, each iteration produces + see :class:`StreamingDataFrame + `, + each iteration produces the current status of the *group by*. 
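To illustrate the strategies described above, a sketch which assumes ``groupby_streaming`` accepts the grouping column followed by an aggregation lambda; that argument is not visible in this hunk, so treat it as an assumption and check the full signature in ``dataframe.py``::

    import pandas
    from pandas_streaming.df.dataframe import read_df

    df = pandas.DataFrame({"A": ["a", "b"] * 50, "X": range(100)})
    sdf = read_df(df, chunksize=10)

    # strategy="cum": every iterated DataFrame is the aggregation of all
    # chunks seen so far, i.e. the current status of the group by
    for partial in sdf.groupby_streaming("A", lambda gr: gr.sum(), strategy="cum"):
        print(partial)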
Last case, ``strategy=='streaming'`` produces :epkg:`DataFrame` which must be concatenated into a single :epkg:`DataFrame` @@ -942,7 +963,8 @@ def groupby_streaming( :tag: streaming Here is an example which shows how to write a simple *groupby* - with :epkg:`pandas` and see :class:`StreamingDataFrame`. + with :epkg:`pandas` and see :class:`StreamingDataFrame + `. .. runpython:: :showcode: @@ -1107,13 +1129,16 @@ def add_column(self, col, value): Implements some of the functionalities :epkg:`pandas` offers for the operator ``[]``. - @param col new column - @param value see :class:`StreamingDataFrame` or a lambda function - @return see :class:`StreamingDataFrame` + :param col: new column + :param value: see :class:`StreamingDataFrame + ` or a lambda function + :return: see :class:`StreamingDataFrame + ` ..note:: - If value is a see :class:`StreamingDataFrame`, + If value is a see :class:`StreamingDataFrame + `, *chunksize* must be the same for both. .. exref:: @@ -1172,8 +1197,9 @@ def fillna(self, **kwargs): Replaces the missing values, calls :epkg:`pandas:DataFrame:fillna`. - @param kwargs see :epkg:`pandas:DataFrame:fillna` - @return see :class:`StreamingDataFrame` + :param kwargs: see :meth:`pandas.DataFrame.fillna` + :return: see :class:`StreamingDataFrame + ` .. warning:: The function does not check what happens at the @@ -1346,7 +1372,8 @@ def __del__(self): class StreamingSeries(StreamingDataFrame): """ - Seens as a see :class:`StreamingDataFrame` of one column. + Seens as a see :class:`StreamingDataFrame + ` of one column. """ def __init__(self, iter_creation, check_schema=True, stable=True): diff --git a/pandas_streaming/df/dataframe_io.py b/pandas_streaming/df/dataframe_io.py index bf13cc2..7b589c1 100644 --- a/pandas_streaming/df/dataframe_io.py +++ b/pandas_streaming/df/dataframe_io.py @@ -8,13 +8,13 @@ def to_zip(df, zipfilename, zname="df.csv", **kwargs): """ Saves a :epkg:`Dataframe` into a :epkg:`zip` file. - It can be read by @see fn to_zip. + It can be read by :meth:`read_zip`. - :param df: :epkg:`dataframe` or :epkg:`numpy:array` - :param zipfilename: a :class:`zipfile:ZipFile` or a filename - :param zname: a filename in th zipfile - :param kwargs: parameters for :epkg:`pandas:to_csv` or - :epkg:`numpy:save` + :param df: :epkg:`dataframe` or :class:`numpy.ndarray` + :param zipfilename: a :class:`zipfile.ZipFile` or a filename + :param zname: a filename in the zipfile + :param kwargs: parameters for :meth:`pandas.DataFrame.to_csv` or + :func:`numpy.save` :return: zipfilename .. exref:: @@ -22,7 +22,7 @@ def to_zip(df, zipfilename, zname="df.csv", **kwargs): :tag: dataframe This shows an example on how to save and read a - :epkg:`pandas:dataframe` directly into a zip file. + :class:`pandas.DataFrame` directly into a zip file. .. runpython:: :showcode: @@ -43,7 +43,7 @@ def to_zip(df, zipfilename, zname="df.csv", **kwargs): :tag: array This shows an example on how to save and read a - :epkg:`numpy:ndarray` directly into a zip file. + :class:`numpy.ndarray` directly into a zip file. .. runpython:: :showcode: @@ -102,12 +102,12 @@ def to_zip(df, zipfilename, zname="df.csv", **kwargs): def read_zip(zipfilename, zname=None, **kwargs): """ Reads a :epkg:`dataframe` from a :epkg:`zip` file. - It can be saved by @see fn read_zip. + It can be saved by :meth:`to_zip`. 
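The two helpers documented above amount to a simple round trip; the file and member names below are placeholders::

    import pandas
    from pandas_streaming.df.dataframe_io import to_zip, read_zip

    df = pandas.DataFrame([dict(a=1, b="e"), dict(b="f", a=5.7)])

    # store the DataFrame as df.csv inside data.zip (extra kwargs go to to_csv)
    to_zip(df, "data.zip", "df.csv", index=False)

    # read it back; zname=None would pick the first member of the archive
    df2 = read_zip("data.zip", "df.csv")
    print(df.equals(df2))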
:param zipfilename: a :class:`zipfile.ZipFile` or a filename :param zname: a filename in zipfile, if None, takes the first one :param kwargs: parameters for :func:`pandas.read_csv` - :return: :func:`pandas.DataFrame` or :epkg:`numpy:array` + :return: :class:`pandas.DataFrame` or :class:`numpy.ndarray` """ if isinstance(zipfilename, str): ext = os.path.splitext(zipfilename)[-1] diff --git a/pandas_streaming/df/dataframe_split.py b/pandas_streaming/df/dataframe_split.py index ec4a579..7c2d191 100644 --- a/pandas_streaming/df/dataframe_split.py +++ b/pandas_streaming/df/dataframe_split.py @@ -15,18 +15,21 @@ def sklearn_train_test_split( The function relies on :func:`sklearn.model_selection.train_test_split`. It does not handle stratified version of it. - @param self see :class:`StreamingDataFrame` - @param path_or_buf a string, a list of strings or buffers, if it is a - string, it must contain ``{}`` like ``partition{}.txt`` - @param export_method method used to store the partitions, by default - :epkg:`pandas:DataFrame:to_csv` - @param names partitions names, by default ``('train', 'test')`` - @param kwargs parameters for the export function and - :fund:`sklearn.model_selection.train_test_split`. - @return outputs of the exports functions + :param self: see :class:`StreamingDataFrame + ` + :param path_or_buf: a string, a list of strings or buffers, if it is a + string, it must contain ``{}`` like ``partition{}.txt`` + :param export_method: method used to store the partitions, by default + :meth:`pandas.DataFrame.to_csv` + :param names: partitions names, by default ``('train', 'test')`` + :param kwargs: parameters for the export function and + :func:`sklearn.model_selection.train_test_split`. + :return: outputs of the exports functions The function cannot return two iterators or two - see :class:`StreamingDataFrame` because running through one + see :class:`StreamingDataFrame + ` + because running through one means running through the other. We can assume both splits do not hold in memory and we cannot run through the same iterator again as random draws would be different. @@ -114,18 +117,21 @@ def sklearn_train_test_split_streaming( The function relies on :func:`sklearn.model_selection.train_test_split`. It handles the stratified version of it. - :param self: see :class:`StreamingDataFrame` + :param self: see :class:`StreamingDataFrame + ` :param test_size: ratio for the test partition (if *train_size* is not specified) :param train_size: ratio for the train partition :param stratify: column holding the stratification :param hash_size: size of the hash to cache information about partition :param unique_rows: ensures that rows are unique - :return: Two see :class:`StreamingDataFrame`, one - for train, one for test. + :return: Two see :class:`StreamingDataFrame + `, + one for train, one for test. The function returns two iterators or two - see :class:`StreamingDataFrame`. It + see :class:`StreamingDataFrame + `. It tries to do everything without writing anything on disk but it requires to store the repartition somehow. 
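Given the parameters listed above, a hedged sketch of the streaming split; the assumption is that these keyword arguments reach ``sklearn_train_test_split_streaming`` through ``StreamingDataFrame.train_test_split(..., streaming=True)``, as the earlier docstring suggests::

    import pandas
    from pandas_streaming.df.dataframe import read_df

    df = pandas.DataFrame({"x": range(100), "label": ["a", "b", "b", "b"] * 25})
    sdf = read_df(df, chunksize=10)

    # streaming=True returns two StreamingDataFrame instead of writing files;
    # stratify and unique_rows are assumed to be forwarded to the streaming split
    train_sdf, test_sdf = sdf.train_test_split(
        streaming=True, test_size=0.25, stratify="label", unique_rows=True,
    )
    print(sum(len(chunk) for chunk in train_sdf),
          sum(len(chunk) for chunk in test_sdf))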
This function hashes every row and maps the hash with a part From 38fe46b806198a90bcb9267d47248cb51ddd659e Mon Sep 17 00:00:00 2001 From: Xavier Dupre Date: Sun, 23 Jul 2023 12:57:40 +0200 Subject: [PATCH 14/16] doc --- .circleci/config.yml | 2 +- _doc/index.rst | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index e76a1b7..0d766bc 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -39,7 +39,7 @@ jobs: - run: name: install dependencies (2) command: | - pip install -r requirements.txt + pip install -r requirements-dev.txt - save_cache: paths: diff --git a/_doc/index.rst b/_doc/index.rst index fa91aea..dca5620 100644 --- a/_doc/index.rst +++ b/_doc/index.rst @@ -64,4 +64,5 @@ and implements other functionalities for machine learning. auto_examples/index api/index i_ex + CHANGELOGS license From adde860f015fcb5a06397a51d1f203b6765d43bd Mon Sep 17 00:00:00 2001 From: Xavier Dupre Date: Sun, 23 Jul 2023 12:59:09 +0200 Subject: [PATCH 15/16] lambda --- pandas_streaming/df/dataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas_streaming/df/dataframe.py b/pandas_streaming/df/dataframe.py index 6537d1f..1cc87a2 100644 --- a/pandas_streaming/df/dataframe.py +++ b/pandas_streaming/df/dataframe.py @@ -1358,7 +1358,7 @@ def iterate(): sub = dfs[numpy.isnan(dfs[by])] yield sub - res = StreamingDataFrame(lambda: iterate(), **self.get_kwargs()) + res = StreamingDataFrame(iterate, **self.get_kwargs()) res._delete_.append(lambda: os.remove(temp_file)) return res From fb88a708ab07fec458d68cc638c0b64d4f44c9b0 Mon Sep 17 00:00:00 2001 From: Xavier Dupre Date: Sun, 23 Jul 2023 13:05:23 +0200 Subject: [PATCH 16/16] remove unnecessary unit tests --- .../ut_documentation/test_run_notebooks.py | 25 ------------------- 1 file changed, 25 deletions(-) delete mode 100644 _unittests/ut_documentation/test_run_notebooks.py diff --git a/_unittests/ut_documentation/test_run_notebooks.py b/_unittests/ut_documentation/test_run_notebooks.py deleted file mode 100644 index aebe979..0000000 --- a/_unittests/ut_documentation/test_run_notebooks.py +++ /dev/null @@ -1,25 +0,0 @@ -import os -import unittest -from pyquickhelper.pycode import ExtTestCase -from pyquickhelper.ipythonhelper import test_notebook_execution_coverage -import pandas_streaming - - -class TestRunNotebooksPython(ExtTestCase): - def setUp(self): - import jyquickhelper # pylint: disable=C0415 - - self.assertTrue(jyquickhelper is not None) - - def test_notebook_artificiel(self): - self.assertTrue(pandas_streaming is not None) - folder = os.path.join( - os.path.dirname(__file__), "..", "..", "_doc", "notebooks" - ) - test_notebook_execution_coverage( - __file__, "first_steps", folder, "pandas_streaming", copy_files=[] - ) - - -if __name__ == "__main__": - unittest.main()
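The last functional change (passing ``iterate`` instead of ``lambda: iterate()``) works because ``StreamingDataFrame`` only needs a zero-argument callable returning an iterator of DataFrames, as its ``iter_creation`` parameter describes; a minimal sketch of that contract with made-up data::

    import pandas
    from pandas_streaming.df.dataframe import StreamingDataFrame

    def make_chunks():
        # called again every time the StreamingDataFrame is walked through
        for i in range(3):
            yield pandas.DataFrame({"x": [i, i + 1], "y": ["a", "b"]})

    # the generator function itself is a valid iter_creation argument,
    # so wrapping it as "lambda: make_chunks()" adds nothing
    sdf = StreamingDataFrame(make_chunks)
    for chunk in sdf:
        print(chunk)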