From 740f1c8cb8860fa545d341aea9ddbe4c58070e02 Mon Sep 17 00:00:00 2001 From: Xavier Dupre Date: Sat, 22 Jul 2023 19:41:40 +0200 Subject: [PATCH 01/16] refactoring --- .circleci/config.yml | 4 +- .github/workflows/black.yml | 11 + .github/workflows/check-urls.yml | 47 +++ .github/workflows/codeql.yml | 61 ++++ .github/workflows/documentation.yml | 88 +++++ .github/workflows/rstcheck.yml | 27 ++ .github/workflows/wheels-any.yml | 29 ++ .gitignore | 300 ++---------------- .landscape.yml | 15 - .local.jenkins.lin.yml | 1 + .local.jenkins.win.yml | 26 -- .travis.yml | 15 - CHANGELOGS.rst | 35 ++ HISTORY.rst | 37 --- MANIFEST.in | 2 - README.rst | 7 +- _unittests/ut_df/test_connex_split.py | 26 +- _unittests/ut_df/test_connex_split_big.py | 22 +- .../ut_documentation/test_run_notebooks.py | 8 +- _unittests/ut_module/test_check.py | 26 -- _unittests/ut_module/test_code_style.py | 37 --- .../ut_module/test_convert_notebooks.py | 38 --- _unittests/ut_module/test_readme.py | 35 -- appveyor.yml | 2 +- azure-pipelines.yml | 180 +++++++++-- build_script.bat | 13 - pandas_streaming/__init__.py | 50 +-- pandas_streaming/df/connex_split.py | 7 +- pandas_streaming/df/dataframe_io_helpers.py | 12 +- pyproject.toml | 31 ++ requirements-dev.txt | 20 ++ requirements.txt | 21 +- setup.cfg | 5 + setup.py | 114 ++++--- 34 files changed, 610 insertions(+), 742 deletions(-) create mode 100644 .github/workflows/black.yml create mode 100644 .github/workflows/check-urls.yml create mode 100644 .github/workflows/codeql.yml create mode 100644 .github/workflows/documentation.yml create mode 100644 .github/workflows/rstcheck.yml create mode 100644 .github/workflows/wheels-any.yml delete mode 100644 .landscape.yml delete mode 100644 .local.jenkins.win.yml delete mode 100644 .travis.yml create mode 100644 CHANGELOGS.rst delete mode 100644 HISTORY.rst delete mode 100644 _unittests/ut_module/test_check.py delete mode 100644 _unittests/ut_module/test_code_style.py delete mode 100644 _unittests/ut_module/test_convert_notebooks.py delete mode 100644 _unittests/ut_module/test_readme.py delete mode 100644 build_script.bat create mode 100644 pyproject.toml create mode 100644 requirements-dev.txt create mode 100644 setup.cfg diff --git a/.circleci/config.yml b/.circleci/config.yml index e764c48..712677b 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -11,7 +11,7 @@ jobs: - restore_cache: keys: - - v3-dependencies-{{ checksum "requirements.txt" }} + - v3-dependencies-{{ checksum "requirements-dev.txt" }} - v3-dependencies- - run: @@ -44,7 +44,7 @@ jobs: - save_cache: paths: - ./venv - key: v3-dependencies-{{ checksum "requirements.txt" }} + key: v3-dependencies-{{ checksum "requirements-dev.txt" }} - run: name: compile and build diff --git a/.github/workflows/black.yml b/.github/workflows/black.yml new file mode 100644 index 0000000..fe99e3c --- /dev/null +++ b/.github/workflows/black.yml @@ -0,0 +1,11 @@ +name: Black Format Checker +on: [push, pull_request] +jobs: + black-format-check: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - uses: psf/black@stable + with: + options: "--diff --check" + src: "." 
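Note on the new Black workflow above: it only verifies formatting (the job fails when files would be reformatted, but it rewrites nothing). A minimal sketch of the equivalent local check, assuming black is installed in the active environment:

    pip install black
    black --diff --check .

These are the same options ("--diff --check" on src ".") passed to psf/black@stable, so a clean local run should correspond to a green check in CI.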
diff --git a/.github/workflows/check-urls.yml b/.github/workflows/check-urls.yml new file mode 100644 index 0000000..f235903 --- /dev/null +++ b/.github/workflows/check-urls.yml @@ -0,0 +1,47 @@ +name: Check URLs + +on: + pull_request: + branches: [main] + schedule: + # ┌───────────── minute (0 - 59) + # │ ┌───────────── hour (0 - 23) + # │ │ ┌───────────── day of the month (1 - 31) + # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) + # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) + # │ │ │ │ │ + # │ │ │ │ │ + # │ │ │ │ │ + # * * * * * + - cron: '30 1 * * 0' + +jobs: + build: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + + - name: urls-checker-code + uses: urlstechie/urlchecker-action@master + with: + subfolder: pandas_streaming + file_types: .md,.py,.rst,.ipynb + print_all: false + timeout: 2 + retry_count# : 2 + # exclude_urls: https://dumps.wikimedia.org/other/pageviews/%Y/%Y-%m/pageviews-%Y%m%d-%H0000.gz,https://dumps.wikimedia.org/frwiki/latest/latest-all-titles-in-ns0.gz + # exclude_patterns: https://dumps.wikimedia.org/ + # force_pass : true + + - name: urls-checker-docs + uses: urlstechie/urlchecker-action@master + with: + subfolder: _doc + file_types: .md,.py,.rst,.ipynb + print_all: false + timeout: 2 + retry_count# : 2 + # exclude_urls: https://hal.archives-ouvertes.fr/hal-00990252/document + # exclude_patterns: https://www.data.gouv.fr/fr/datasets/r/e3d83ab3-dc52-4c99-abaf-8a38050cc68c,https://dev.azure.com/ + # force_pass : true diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml new file mode 100644 index 0000000..bea1259 --- /dev/null +++ b/.github/workflows/codeql.yml @@ -0,0 +1,61 @@ +name: "Code Scanning - Action" + +on: + push: + branches: [main] + pull_request: + branches: [main] + schedule: + # ┌───────────── minute (0 - 59) + # │ ┌───────────── hour (0 - 23) + # │ │ ┌───────────── day of the month (1 - 31) + # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) + # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) + # │ │ │ │ │ + # │ │ │ │ │ + # │ │ │ │ │ + # * * * * * + - cron: '30 1 * * 0' + +jobs: + CodeQL-Build: + # CodeQL runs on ubuntu-latest, windows-latest, and macos-latest + runs-on: ubuntu-latest + + permissions: + # required for all workflows + security-events: write + + # only required for workflows in private repositories + actions: read + contents: read + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + + # Initializes the CodeQL tools for scanning. + - name: Initialize CodeQL + uses: github/codeql-action/init@v2 + # Override language selection by uncommenting this and choosing your languages + # with: + # languages: go, javascript, csharp, python, cpp, java, ruby + + # Autobuild attempts to build any compiled languages (C/C++, C#, Go, or Java). + # If this step fails, then you should remove it and run the build manually (see below). + - name: Autobuild + uses: github/codeql-action/autobuild@v2 + + # ℹ️ Command-line programs to run using the OS shell. 
+ # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun + + # ✏️ If the Autobuild fails above, remove it and uncomment the following + # three lines and modify them (or add more) to build your code if your + # project uses a compiled language + + #- run: | + # make bootstrap + # make release + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v2 diff --git a/.github/workflows/documentation.yml b/.github/workflows/documentation.yml new file mode 100644 index 0000000..a7a5be1 --- /dev/null +++ b/.github/workflows/documentation.yml @@ -0,0 +1,88 @@ +name: Documentation and Code Coverage + +on: + push: + pull_request: + types: + - closed + branches: + - main + +jobs: + run: + name: Build documentation on ${{ matrix.os }} + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [ubuntu-latest] + + steps: + - uses: actions/checkout@v3 + + - uses: actions/setup-python@v4 + with: + python-version: '3.11' + + - uses: tlylt/install-graphviz@v1 + + - name: Install pandoc + run: sudo apt-get install -y pandoc + + - name: Install requirements + run: python -m pip install -r requirements.txt + + - name: Install requirements dev + run: python -m pip install -r requirements-dev.txt + + - name: Cache pip + uses: actions/cache@v2 + with: + path: ~/.cache/pip + key: ${{ runner.os }}-pip-${{ hashFiles('requirements-dev.txt') }} + restore-keys: | + ${{ runner.os }}-pip- + ${{ runner.os }}- + + - name: Generate coverage report + run: | + pip install pytest + pip install pytest-cov + export PYTHONPATH=. + pytest --cov=./pandas_streaming/ --cov-report=xml --durations=10 --ignore-glob=**LONG*.py --ignore-glob=**notebook*.py + export PYTHONPATH= + + - name: Upload coverage reports to Codecov + uses: codecov/codecov-action@v3 + env: + CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} + + - name: Install + run: python setup.py install + + - name: Copy license, changelogs + run: | + cp LICENSE* ./_doc + cp CHANGELOGS* ./_doc + + - name: Documentation + run: python -m sphinx ./_doc ./dist/html -n -w doc.txt + + - name: Summary + run: cat doc.txt + + - name: Check for errors and warnings + run: | + if [[ $(grep ERROR doc.txt) ]]; then + echo "Documentation produces errors." + grep ERROR doc.txt + exit 1 + fi + if [[ $(grep WARNING doc.txt) ]]; then + echo "Documentation produces warnings." 
+ grep WARNING doc.txt + exit 1 + fi + + - uses: actions/upload-artifact@v3 + with: + path: ./dist/html/** diff --git a/.github/workflows/rstcheck.yml b/.github/workflows/rstcheck.yml new file mode 100644 index 0000000..44e2a48 --- /dev/null +++ b/.github/workflows/rstcheck.yml @@ -0,0 +1,27 @@ +name: RST Check + +on: [push, pull_request] + +jobs: + build_wheels: + name: rstcheck ${{ matrix.os }} + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [ubuntu-latest] + + steps: + - uses: actions/checkout@v3 + + - uses: actions/setup-python@v4 + with: + python-version: '3.11' + + - name: Install requirements + run: python -m pip install -r requirements.txt + + - name: Install rstcheck + run: python -m pip install sphinx tomli rstcheck[toml,sphinx] + + - name: rstcheck + run: rstcheck -r _doc pandas_streaming diff --git a/.github/workflows/wheels-any.yml b/.github/workflows/wheels-any.yml new file mode 100644 index 0000000..2547b0b --- /dev/null +++ b/.github/workflows/wheels-any.yml @@ -0,0 +1,29 @@ +name: Build Any Wheel + +on: + push: + branches: + - main + - 'releases/**' + +jobs: + build_wheels: + name: Build wheels on ${{ matrix.os }} + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [ubuntu-latest] + + steps: + - uses: actions/checkout@v3 + + - uses: actions/setup-python@v4 + with: + python-version: '3.11' + + - name: build wheel + run: python -m pip wheel . + + - uses: actions/upload-artifact@v3 + with: + path: ./pandas_streaming*.whl diff --git a/.gitignore b/.gitignore index 6bd1306..fbe3a19 100644 --- a/.gitignore +++ b/.gitignore @@ -1,279 +1,25 @@ -################# -## Eclipse -################# - -*.pydevproject -.project -.metadata -bin/ -tmp/ -_virtualenv/ -*.tmp -*.bak -*.swp -*~.nib -local.properties -.classpath -.settings/ -.loadpath -*.pyproj - -# External tool builders -.externalToolBuilders/ - -# Locally stored "Eclipse launch configurations" -*.launch - -# CDT-specific -.cproject - -# PDT-specific -.buildpath - - -################# -## Visual Studio -################# - -## Ignore Visual Studio temporary files, build results, and -## files generated by popular Visual Studio add-ons. 
- -# User-specific files -*.suo -*.user -*.sln.docstates - -# Build results - -[Dd]ebug/ -[Rr]elease/ -x64/ -build/ -[Bb]in/ -[Oo]bj/ - -# MSTest test Results -[Tt]est[Rr]esult*/ -[Bb]uild[Ll]og.* - -*_i.c -*_p.c -*.ilk -*.meta -*.obj -*.pch -*.pdb -*.pgc -*.pgd -*.rsp -*.sbr -*.tlb -*.tli -*.tlh -*.tmp -*.tmp_proj -*.log -*.vspscc -*.vssscc -.builds -*.pidb -*.log -*.scc +*.pyc *.pyd - -# Visual C++ cache files -ipch/ -*.aps -*.ncb -*.opensdf -*.sdf -*.cachefile - -# Visual Studio profiler -*.psess -*.vsp -*.vspx - -# Guidance Automation Toolkit -*.gpState - -# ReSharper is a .NET coding add-in -_ReSharper*/ -*.[Rr]e[Ss]harper - -# TeamCity is a build add-in -_TeamCity* - -# DotCover is a Code Coverage Tool -*.dotCover - -# NCrunch -*.ncrunch* -.*crunch*.local.xml - -# Installshield output folder -[Ee]xpress/ - -# DocProject is a documentation generator add-in -DocProject/buildhelp/ -DocProject/Help/*.HxT -DocProject/Help/*.HxC -DocProject/Help/*.hhc -DocProject/Help/*.hhk -DocProject/Help/*.hhp -DocProject/Help/Html2 -DocProject/Help/html - -# Click-Once directory -publish/ - -# Publish Web Output -*.Publish.xml -*.pubxml - -# NuGet Packages Directory -## TODO: If you have NuGet Package Restore enabled, uncomment the next line -#packages/ - -# Windows Azure Build Output -csx -*.build.csdef - -# Windows Store app package directory -AppPackages/ - -# Others -sql/ -*.Cache -ClientBin/ -[Ss]tyle[Cc]op.* -~$* -*~ -*.dbmdl -*.[Pp]ublish.xml -*.pfx -*.publishsettings - -# RIA/Silverlight projects -Generated_Code/ - -# Backup & report files from converting an old project file to a newer -# Visual Studio version. Backup files are not needed, because we have git ;-) -_UpgradeReport_Files/ -Backup*/ -UpgradeLog*.XML -UpgradeLog*.htm - -# SQL Server files -App_Data/*.mdf -App_Data/*.ldf - -############# -## Windows detritus -############# - -# Windows image file caches -Thumbs.db -ehthumbs.db - -# Folder config file -Desktop.ini - -# Recycle Bin used on file shares -$RECYCLE.BIN/ - -# Mac crap -.DS_Store - - -############# -## Python -############# - -*.py[co] - -# Packages -*.egg -*.egg-info -dist/ -build/ -eggs/ -parts/ -var/ -sdist/ -develop-eggs/ -__pycache__/ -.installed.cfg - -# Installer logs -pip-log.txt - -# Unit test / coverage reports +*.dylib +*.so +*.whl +coverage.html/* +_cache/* .coverage -.tox - -#Translations -*.mo - -#Mr Developer -.mr.developer.cfg - -# py* packages -temp_* -out_* -*/sphinxdoc/source/index_* -*/sphinxdoc/source/readme.* -*/sphinxdoc/source/LICENSE.txt -*/sphinxdoc/source/filechanges.* -version.txt -_doc/sphinxdoc/source/python_template/*box.html -_doc/sphinxdoc/source/python_template/*toc.html -_doc/sphinxdoc/source/jyquickhelper/ -_doc/sphinxdoc/source/coverage/* -*/sphinxdoc/source/all*.rst -_doc/sphinxdoc/source/notebooks/* -*/sphinxdoc/source/gynotebooks/* -_doc/sphinxdoc/source/gyexamples/* -_doc/sphinxdoc/source/examples/* -_doc/sphinxdoc/source/gallery/* -_doc/sphinxdoc/source/gallerynb/* -build_help.bat -_doc/sphinxdoc/source/blog/*.rst -_doc/sphinxdoc/source/blog/rss.xml -_doc/sphinxdoc/source/_templates/*toc.html -_doc/sphinxdoc/source/_templates/*box.html -_doc/sphinxdoc/source/blog/feed-icon*.png -_doc/sphinxdoc/source/_static/reveal.js/* -_doc/notebooks/.ipynb_checkpoints/* -dist_module27/* -auto_*.bat -auto_*.sh -auto_*.py -auto_*.xml -auto_*.db3 -_doc/sphinxdoc/source/_static/require.js -_doc/sphinxdoc/require.js -ex.* -m.temp -_doc/notebooks/*/.ipynb_checkpoints -_doc/notebooks/nlp/frwiki-latest-all-titles-in-ns0 -_doc/notebooks/nlp/sample*.txt 
-_doc/notebooks/nlp/completion.prof -_doc/notebooks/nlp/profile.png -_doc/notebooks/nlp/completion.dot -_doc/notebooks/nlp/completion.png -_doc/notebooks/nlp/completion.pstat -_unittests/run_unittests.py.out -*.err -_doc/sphinxdoc/source/_static/style_notebook_snippet.css -dist -_doc/sphinxdoc/source/pandas_streaming -_doc/sphinxdoc/source/nbcov.png -_doc/notebooks/example.test.txt -_doc/notebooks/example.txt -_doc/notebooks/example.train.txt -_unittests/ut_df/buggy_hash.csv -_doc/sphinxdoc/dfs.zip -_doc/sphinxdoc/dfsa.zip -_doc/sphinxdoc/source/nbcov-*.png -.eggs +dist/* +build/* +.eggs/* +.hypothesis/* +*egg-info/* +prof +_doc/auto_examples/* +_doc/examples/_cache/* +_doc/examples/plot_*.png +_doc/examples/plot_*.xlsx +_doc/examples/*.html +_doc/_static/require.js +_doc/_static/viz.js +_unittests/ut__main/*.png +_unittests/ut__main/_cache/* +_unittests/ut__main/*.html +_unittests/.hypothesis/* diff --git a/.landscape.yml b/.landscape.yml deleted file mode 100644 index 3b83d70..0000000 --- a/.landscape.yml +++ /dev/null @@ -1,15 +0,0 @@ -doc-warnings: yes -test-warnings: no -strictness: veryhigh -max-line-length: 120 -autodetect: yes -requirements: - - requirement.txt -ignore-paths: - - _unittests - - _doc - - dist - - build -ignore-patterns: - - .*Parser\.py$ - - .*Lexer\.py$ diff --git a/.local.jenkins.lin.yml b/.local.jenkins.lin.yml index c3e0dee..9ab574d 100644 --- a/.local.jenkins.lin.yml +++ b/.local.jenkins.lin.yml @@ -11,6 +11,7 @@ install: - $PYINT -m pip install --upgrade pip - $PYINT -m pip install --upgrade --no-cache-dir --no-deps --index http://localhost:8067/simple/ jyquickhelper pyquickhelper pandas_streaming --extra-index-url=https://pypi.python.org/simple/ - $PYINT -m pip install -r requirements.txt + - $PYINT -m pip install -r requirements-dev.txt - $PYINT --version - $PYINT -m pip freeze diff --git a/.local.jenkins.win.yml b/.local.jenkins.win.yml deleted file mode 100644 index 7f8c60a..0000000 --- a/.local.jenkins.win.yml +++ /dev/null @@ -1,26 +0,0 @@ - -language: python - -python: - - { PATH: "{{replace(Python39, '\\', '\\\\')}}", VERSION: 3.9, DIST: std } - -virtualenv: - - path: {{ospathjoin(root_path, pickname("%NAME_JENKINS%", project_name + "_%VERSION%_%DIST%_%NAME%"), "_venv")}} - -install: - - pip install --upgrade pip - - pip install --no-cache-dir --no-deps --index http://localhost:8067/simple/ jyquickhelper --extra-index-url=https://pypi.python.org/simple/ - - pip install --no-cache-dir --no-deps --index http://localhost:8067/simple/ pyquickhelper --extra-index-url=https://pypi.python.org/simple/ - - pip install -r requirements.txt - - pip freeze - - pip freeze > pip_freeze.txt -before_script: - - python -u setup.py build_ext --inplace -script: - - { CMD: "python -u setup.py unittests", NAME: "UT" } -after_script: - - python setup.py bdist_wheel - - if [ ${DIST} != "conda" and ${NAME} == "UT" ] then copy dist\*.whl {{root_path}}\..\..\local_pypi\local_pypi_server fi -documentation: - - if [ ${NAME} == "UT" ] then python -u setup.py build_sphinx fi - - if [ ${NAME} == "UT" ] then xcopy /E /C /I /Y _doc\sphinxdoc\build\html dist\html fi diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index df69072..0000000 --- a/.travis.yml +++ /dev/null @@ -1,15 +0,0 @@ -dist: focal -sudo: true -language: python -matrix: - include: - - python: 3.10 - name: "Py310-skl022+" - env: sklearnc=">=0.22" -install: - - pip install -r requirements.txt - - python -c "import sklearn;print(sklearn.__version__)" -before_script: - - python setup.py build_ext --inplace 
-script: - - python setup.py unittests diff --git a/CHANGELOGS.rst b/CHANGELOGS.rst new file mode 100644 index 0000000..985bb0a --- /dev/null +++ b/CHANGELOGS.rst @@ -0,0 +1,35 @@ + +=========== +Change Logs +=========== + +current - 2021-10-26 - 0.00Mb +============================= + +* :pr:`27`: Fixes json parser when input is a stream (2021-10-26) +* :pr:`26`: Fixes bug while reading json (iterator failed to be created twice) (2021-10-26) +* :pr:`25`: Fixes documentation (2021-10-18) +* :pr:`24`: Implements a first version of sort_values. (2021-10-18) +* :pr:`23`: First version of operator __setitem__ (2021-10-16) +* :pr:`22`: Fixes nan values after pandas update, add documentation example to the unit tests (2021-07-11) +* :pr:`21`: Fixes grouping by nan values after update pandas to 1.3.0 (2021-07-10) +* :pr:`17`: Implements method describe (2021-04-08) + +0.2.175 - 2020-08-06 - 0.03Mb +============================= + +* :pr:`16`: Unit tests failing with pandas 1.1.0. (2020-08-06) +* :pr:`15`: implements parameter lines, flatten for read_json (2018-11-21) +* :pr:`14`: implements fillna (2018-10-29) +* :pr:`13`: implement concat for axis=0,1 (2018-10-26) +* :pr:`12`: add groupby_streaming (2018-10-26) +* :pr:`11`: add method add_column (2018-10-26) +* :pr:`10`: plan B to bypass a bug in pandas about read_csv when iterator=True --> closed, pandas has a weird behaviour when names is too small compare to the number of columns (2018-10-26) +* :pr:`9`: head is very slow (2018-10-26) +* :pr:`8`: fix pandas_streaming for pandas 0.23.1 (2018-07-31) +* :pr:`7`: implement read_json (2018-05-17) +* :pr:`6`: add pandas_groupby_nan from pyensae (2018-05-17) +* :pr:`5`: add random_state parameter to splitting functions (2018-02-04) +* :pr:`2`: add method sample, resevoir sampling (2017-11-05) +* :pr:`3`: method train_test_split for out-of-memory datasets (2017-10-21) +* :pr:`1`: Excited for your project (2017-10-10) diff --git a/HISTORY.rst b/HISTORY.rst deleted file mode 100644 index 7a41d02..0000000 --- a/HISTORY.rst +++ /dev/null @@ -1,37 +0,0 @@ - -.. _l-HISTORY: - -======= -History -======= - -current - 2021-10-26 - 0.00Mb -============================= - -* #27: Fixes json parser when input is a stream (2021-10-26) -* #26: Fixes bug while reading json (iterator failed to be created twice) (2021-10-26) -* #25: Fixes documentation (2021-10-18) -* #24: Implements a first version of sort_values. (2021-10-18) -* #23: First version of operator __setitem__ (2021-10-16) -* #22: Fixes nan values after pandas update, add documentation example to the unit tests (2021-07-11) -* #21: Fixes grouping by nan values after update pandas to 1.3.0 (2021-07-10) -* #17: Implements method describe (2021-04-08) - -0.2.175 - 2020-08-06 - 0.03Mb -============================= - -* #16: Unit tests failing with pandas 1.1.0. 
(2020-08-06) -* #15: implements parameter lines, flatten for read_json (2018-11-21) -* #14: implements fillna (2018-10-29) -* #13: implement concat for axis=0,1 (2018-10-26) -* #12: add groupby_streaming (2018-10-26) -* #11: add method add_column (2018-10-26) -* #10: plan B to bypass a bug in pandas about read_csv when iterator=True --> closed, pandas has a weird behaviour when names is too small compare to the number of columns (2018-10-26) -* #9: head is very slow (2018-10-26) -* #8: fix pandas_streaming for pandas 0.23.1 (2018-07-31) -* #7: implement read_json (2018-05-17) -* #6: add pandas_groupby_nan from pyensae (2018-05-17) -* #5: add random_state parameter to splitting functions (2018-02-04) -* #2: add method sample, resevoir sampling (2017-11-05) -* #3: method train_test_split for out-of-memory datasets (2017-10-21) -* #1: Excited for your project (2017-10-10) diff --git a/MANIFEST.in b/MANIFEST.in index a782640..66ddca8 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,7 +1,5 @@ prune _doc prune _unittests -prune bin -prune .circleci exclude *.bat exclude *.yml exclude *.git* diff --git a/README.rst b/README.rst index 12bcec5..1096a34 100644 --- a/README.rst +++ b/README.rst @@ -1,12 +1,9 @@ +pandas-streaming: streaming API over pandas +=========================================== .. image:: https://github.com/sdpython/pandas_streaming/blob/master/_doc/sphinxdoc/source/_static/project_ico.png?raw=true :target: https://github.com/sdpython/pandas_streaming/ -.. _l-README: - -pandas_streaming: streaming API over pandas -=========================================== - .. image:: https://travis-ci.com/sdpython/pandas_streaming.svg?branch=master :target: https://app.travis-ci.com/github/sdpython/pandas_streaming :alt: Build status diff --git a/_unittests/ut_df/test_connex_split.py b/_unittests/ut_df/test_connex_split.py index f0ab09c..33bd03f 100644 --- a/_unittests/ut_df/test_connex_split.py +++ b/_unittests/ut_df/test_connex_split.py @@ -4,7 +4,6 @@ """ import unittest import pandas -from pyquickhelper.loghelper import fLOG from pyquickhelper.pycode import ExtTestCase from pandas_streaming.df import dataframe_shuffle, train_test_split_weights, train_test_connex_split @@ -86,11 +85,6 @@ def test_split_weights(self): self.assertGreater(0.4, delta) def test_split_connex(self): - fLOG( - __file__, - self._testMethodName, - OutputPrint=__name__ == "__main__") - df = pandas.DataFrame([dict(user="UA", prod="PA", card="C1"), dict(user="UA", prod="PB", card="C1"), dict(user="UB", prod="PC", card="C2"), @@ -102,7 +96,7 @@ def test_split_connex(self): train, test = train_test_connex_split( # pylint: disable=W0632 df, test_size=0.5, groups=['user', 'prod', 'card'], - fail_imbalanced=0.4, fLOG=fLOG) + fail_imbalanced=0.4) self.assertEqual(train.shape[0] + test.shape[0], df.shape[0]) for col in ['user', 'prod', 'card']: @@ -115,15 +109,10 @@ def test_split_connex(self): df['connex'] = 'ole' train, test = train_test_connex_split( # pylint: disable=W0632 df, test_size=0.5, groups=['user', 'prod', 'card'], - fail_imbalanced=0.4, fLOG=fLOG) + fail_imbalanced=0.4) self.assertEqual(train.shape[0] + test.shape[0], df.shape[0]) def test_split_connex2(self): - fLOG( - __file__, - self._testMethodName, - OutputPrint=__name__ == "__main__") - df = pandas.DataFrame([dict(user="UA", prod="PAA", card="C1"), dict(user="UA", prod="PB", card="C1"), dict(user="UB", prod="PC", card="C2"), @@ -134,11 +123,11 @@ def test_split_connex2(self): ]) train_test_connex_split(df, test_size=0.5, groups=['user', 'prod', 'card'], - 
fail_imbalanced=0.5, fLOG=fLOG, return_cnx=True) + fail_imbalanced=0.5, return_cnx=True) train, test, stats = train_test_connex_split(df, test_size=0.5, groups=[ 'user', 'prod', 'card'], - fail_imbalanced=0.5, fLOG=fLOG, + fail_imbalanced=0.5, return_cnx=True, random_state=0) self.assertEqual(train.shape[0] + test.shape[0], df.shape[0]) @@ -153,11 +142,6 @@ def test_split_connex2(self): 'Non empty intersection {0} & {1}\n{2}\n{3}\n{4}'.format(s1, s2, train, test, "\n".join(rows))) def test_split_connex_missing(self): - fLOG( - __file__, - self._testMethodName, - OutputPrint=__name__ == "__main__") - df = pandas.DataFrame([dict(user="UA", prod="PAA", card="C1"), dict(user="UA", prod="PB", card="C1"), dict(user="UB", prod="PC", card="C2"), @@ -170,7 +154,7 @@ def test_split_connex_missing(self): train, test, stats = train_test_connex_split(df, test_size=0.5, groups=[ 'user', 'prod', 'card'], - fail_imbalanced=0.4, fLOG=fLOG, + fail_imbalanced=0.4, return_cnx=True, random_state=0) self.assertEqual(train.shape[0] + test.shape[0], df.shape[0]) diff --git a/_unittests/ut_df/test_connex_split_big.py b/_unittests/ut_df/test_connex_split_big.py index b21c9b4..22292c5 100644 --- a/_unittests/ut_df/test_connex_split_big.py +++ b/_unittests/ut_df/test_connex_split_big.py @@ -6,7 +6,6 @@ import unittest from collections import Counter import pandas -from pyquickhelper.loghelper import fLOG from pyquickhelper.pycode import ExtTestCase from pandas_streaming.df import train_test_connex_split @@ -14,15 +13,10 @@ class TestConnexSplitBig(ExtTestCase): def test_connex_big(self): - fLOG( - __file__, - self._testMethodName, - OutputPrint=__name__ == "__main__") - data = os.path.join(os.path.dirname(__file__), "data") name = os.path.join(data, "buggy_hash.csv") df = pandas.read_csv(name, sep="\t", encoding="utf-8") - train, test, stats = train_test_connex_split(df, fLOG=fLOG, + train, test, stats = train_test_connex_split(df, groups=[ "cart_id", "mail", "product_id"], fail_imbalanced=0.9, return_cnx=True) @@ -36,15 +30,10 @@ def test_connex_big(self): self.assertEqual(maxi, 14181) def test_connex_big_approx(self): - fLOG( - __file__, - self._testMethodName, - OutputPrint=__name__ == "__main__") - data = os.path.join(os.path.dirname(__file__), "data") name = os.path.join(data, "buggy_hash.csv") df = pandas.read_csv(name, sep="\t", encoding="utf-8") - train, test, stats = train_test_connex_split(df, fLOG=fLOG, + train, test, stats = train_test_connex_split(df, groups=[ "cart_id", "mail", "product_id"], stop_if_bigger=0.05, return_cnx=True, @@ -59,15 +48,10 @@ def test_connex_big_approx(self): self.assertLesser(maxi, 14181) def test_connex_big_approx_must(self): - fLOG( - __file__, - self._testMethodName, - OutputPrint=__name__ == "__main__") - data = os.path.join(os.path.dirname(__file__), "data") name = os.path.join(data, "buggy_hash.csv") df = pandas.read_csv(name, sep="\t", encoding="utf-8") - train, test, stats = train_test_connex_split(df, fLOG=fLOG, + train, test, stats = train_test_connex_split(df, groups=[ "cart_id", "mail", "product_id"], stop_if_bigger=0.05, return_cnx=True, diff --git a/_unittests/ut_documentation/test_run_notebooks.py b/_unittests/ut_documentation/test_run_notebooks.py index 486cb45..6f84e1c 100644 --- a/_unittests/ut_documentation/test_run_notebooks.py +++ b/_unittests/ut_documentation/test_run_notebooks.py @@ -4,7 +4,6 @@ """ import os import unittest -from pyquickhelper.loghelper import fLOG from pyquickhelper.pycode import ExtTestCase from pyquickhelper.ipythonhelper import 
test_notebook_execution_coverage import pandas_streaming @@ -17,16 +16,11 @@ def setUp(self): self.assertTrue(jyquickhelper is not None) def test_notebook_artificiel(self): - fLOG( - __file__, - self._testMethodName, - OutputPrint=__name__ == "__main__") - self.assertTrue(pandas_streaming is not None) folder = os.path.join(os.path.dirname(__file__), "..", "..", "_doc", "notebooks") test_notebook_execution_coverage( - __file__, "first_steps", folder, 'pandas_streaming', copy_files=[], fLOG=fLOG) + __file__, "first_steps", folder, 'pandas_streaming', copy_files=[]) if __name__ == "__main__": diff --git a/_unittests/ut_module/test_check.py b/_unittests/ut_module/test_check.py deleted file mode 100644 index 8b08b34..0000000 --- a/_unittests/ut_module/test_check.py +++ /dev/null @@ -1,26 +0,0 @@ -""" -@brief test log(time=0s) -""" -import io -import unittest -from contextlib import redirect_stdout -from pyquickhelper.pycode import ExtTestCase -from pandas_streaming import check, _setup_hook - - -class TestCheck(ExtTestCase): - """Test style.""" - - def test_check(self): - self.assertTrue(check()) - - def test_setup_hook(self): - f = io.StringIO() - with redirect_stdout(f): - _setup_hook(True) - out = f.getvalue() - self.assertIn('Success:', out) - - -if __name__ == "__main__": - unittest.main() diff --git a/_unittests/ut_module/test_code_style.py b/_unittests/ut_module/test_code_style.py deleted file mode 100644 index 3c87deb..0000000 --- a/_unittests/ut_module/test_code_style.py +++ /dev/null @@ -1,37 +0,0 @@ -""" -@brief test log(time=0s) -""" -import os -import unittest -from pyquickhelper.loghelper import fLOG -from pyquickhelper.pycode import check_pep8, ExtTestCase - - -class TestCodeStyle(ExtTestCase): - """Test style.""" - - def test_style_src(self): - thi = os.path.abspath(os.path.dirname(__file__)) - src_ = os.path.normpath(os.path.join( - thi, "..", "..", "pandas_streaming")) - check_pep8(src_, fLOG=fLOG, - pylint_ignore=('C0103', 'C1801', 'R1705', 'W0108', 'W0613', - 'W0212', 'W0703', 'W0107', 'C0302', 'C0209', - 'C3001', 'R1735'), - skip=["Too many nested blocks", - "Module 'numpy.random' has no 'RandomState' member", - "dataframe_split.py:60: [E731]", - ]) - - def test_style_test(self): - thi = os.path.abspath(os.path.dirname(__file__)) - test = os.path.normpath(os.path.join(thi, "..", )) - check_pep8(test, fLOG=fLOG, neg_pattern="temp_.*", - pylint_ignore=('C0103', 'C1801', 'R1705', 'W0108', 'W0613', - 'C0111', 'W0107', 'C0302', 'R1732', 'C0209', - 'C3001', 'R1735'), - skip=[]) - - -if __name__ == "__main__": - unittest.main() diff --git a/_unittests/ut_module/test_convert_notebooks.py b/_unittests/ut_module/test_convert_notebooks.py deleted file mode 100644 index 12fe82a..0000000 --- a/_unittests/ut_module/test_convert_notebooks.py +++ /dev/null @@ -1,38 +0,0 @@ -""" -@brief test log(time=0s) -""" -import os -import unittest -from pyquickhelper.loghelper import fLOG -from pyquickhelper.filehelper import explore_folder_iterfile -from pyquickhelper.pycode import ExtTestCase -from pyquickhelper.ipythonhelper import upgrade_notebook, remove_execution_number - - -class TestConvertNotebooks(ExtTestCase): - - def test_convert_notebooks(self): - fLOG( - __file__, - self._testMethodName, - OutputPrint=__name__ == "__main__") - - fold = os.path.abspath(os.path.dirname(__file__)) - fold2 = os.path.normpath( - os.path.join(fold, "..", "..", "_doc", "notebooks")) - for nbf in explore_folder_iterfile(fold2, pattern=".*[.]ipynb"): - t = upgrade_notebook(nbf) - if t: - fLOG("modified", nbf) - # 
remove numbers - remove_execution_number(nbf, nbf) - - fold2 = os.path.normpath(os.path.join(fold, "..", "..", "_unittests")) - for nbf in explore_folder_iterfile(fold2, pattern=".*[.]ipynb"): - t = upgrade_notebook(nbf) - if t: - fLOG("modified", nbf) - - -if __name__ == "__main__": - unittest.main() diff --git a/_unittests/ut_module/test_readme.py b/_unittests/ut_module/test_readme.py deleted file mode 100644 index 95c9fb7..0000000 --- a/_unittests/ut_module/test_readme.py +++ /dev/null @@ -1,35 +0,0 @@ -""" -@brief test tree node (time=50s) -""" -import os -import unittest -from pyquickhelper.loghelper import fLOG -from pyquickhelper.pycode import get_temp_folder, ExtTestCase, check_readme_syntax - - -class TestReadme(ExtTestCase): - - def test_venv_docutils08_readme(self): - fLOG( - __file__, - self._testMethodName, - OutputPrint=__name__ == "__main__") - - fold = os.path.dirname(os.path.abspath(__file__)) - readme = os.path.join(fold, "..", "..", "README.rst") - self.assertTrue(os.path.exists(readme)) - with open(readme, "r", encoding="utf8") as f: - content = f.read() - - self.assertTrue(len(content) > 0) - temp = get_temp_folder(__file__, "temp_readme") - - if __name__ != "__main__": - # does not work from a virtual environment - return - - check_readme_syntax(readme, folder=temp, fLOG=fLOG) - - -if __name__ == "__main__": - unittest.main() diff --git a/appveyor.yml b/appveyor.yml index e8294be..cb1fa0e 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -10,7 +10,7 @@ init: install: - "%PYTHON%\\python -m pip install --upgrade pip" - - "%PYTHON%\\Scripts\\pip install -r requirements.txt" + - "%PYTHON%\\Scripts\\pip install -r requirements-dev.txt" build: off before_test: diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 1d95231..040a297 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -1,11 +1,55 @@ jobs: -- job: 'TestLinux' +- job: 'TestLinuxWheelPip' + pool: + vmImage: 'ubuntu-latest' + strategy: + matrix: + Python311-Linux: + python.version: '3.11' + maxParallel: 3 + + steps: + - task: UsePythonVersion@0 + inputs: + versionSpec: '$(python.version)' + architecture: 'x64' + - script: sudo apt-get update + displayName: 'AptGet Update' + - script: sudo apt-get install -y graphviz + displayName: 'Install Graphviz' + - script: python -m pip install --upgrade pip setuptools wheel + displayName: 'Install tools' + - script: pip install -r requirements.txt + displayName: 'Install Requirements' + - script: pip install -r requirements-dev.txt + displayName: 'Install Requirements dev' + - script: | + ruff . + displayName: 'Ruff' + - script: | + black --diff . + displayName: 'Black' + - script: | + python -m pip wheel . --wheel-dir dist -v -v -v + displayName: 'build wheel' + - script: | + python -m pip install . 
-v -v -v + displayName: 'install wheel' + - script: | + python -m pytest + displayName: 'Runs Unit Tests' + - task: PublishPipelineArtifact@0 + inputs: + artifactName: 'wheel-linux-wheel-$(python.version)' + targetPath: 'dist' + +- job: 'TestLinuxNightly' pool: vmImage: 'ubuntu-latest' strategy: matrix: - Python310-Linux: - python.version: '3.10' + Python311-Linux: + python.version: '3.11' maxParallel: 3 steps: @@ -17,10 +61,6 @@ jobs: displayName: 'AptGet Update' - script: sudo apt-get install -y pandoc displayName: 'Install Pandoc' - - script: sudo apt-get install -y texlive texlive-latex-extra texlive-xetex dvipng - displayName: 'Install Latex' - - script: sudo apt-get install -y p7zip-full - displayName: 'Install 7z, rar' - script: sudo apt-get install -y inkscape displayName: 'Install Inkscape' - script: sudo apt-get install -y graphviz @@ -29,30 +69,114 @@ jobs: displayName: 'Install tools' - script: pip install -r requirements.txt displayName: 'Install Requirements' + - script: pip install -r requirements-dev.txt + displayName: 'Install Requirements dev' + - script: pip uninstall -y scikit-learn + displayName: 'Uninstall scikit-learn' + - script: pip install --pre --extra-index https://pypi.anaconda.org/scipy-wheels-nightly/simple scikit-learn + displayName: 'Install scikit-learn nightly' + - script: | + ruff . + displayName: 'Ruff' + - script: | + rstcheck -r ./_doc ./pandas_streaming + displayName: 'rstcheck' + - script: | + black --diff . + displayName: 'Black' - script: | - python -u setup.py build_ext --inplace + python -m pytest displayName: 'Runs Unit Tests' + +- job: 'TestLinux' + pool: + vmImage: 'ubuntu-latest' + strategy: + matrix: + Python311-Linux: + python.version: '3.11' + maxParallel: 3 + + steps: + - task: UsePythonVersion@0 + inputs: + versionSpec: '$(python.version)' + architecture: 'x64' + - script: sudo apt-get update + displayName: 'AptGet Update' + - script: sudo apt-get install -y pandoc + displayName: 'Install Pandoc' + - script: sudo apt-get install -y inkscape + displayName: 'Install Inkscape' + - script: sudo apt-get install -y graphviz + displayName: 'Install Graphviz' + - script: python -m pip install --upgrade pip setuptools wheel + displayName: 'Install tools' + - script: pip install -r requirements.txt + displayName: 'Install Requirements' + - script: pip install -r requirements-dev.txt + displayName: 'Install Requirements dev' + - script: | + ruff . + displayName: 'Ruff' + - script: | + rstcheck -r ./_doc ./pandas_streaming + displayName: 'rstcheck' + - script: | + black --diff . 
+ displayName: 'Black' - script: | - python -u setup.py unittests + python -m pytest --cov displayName: 'Runs Unit Tests' - script: | python -u setup.py bdist_wheel displayName: 'Build Package' -# - script: | -# python -u setup.py build_sphinx -# displayName: 'Builds Documentation' + #- script: | + # python -m sphinx _doc dist/html + # displayName: 'Builds Documentation' - task: PublishPipelineArtifact@0 inputs: artifactName: 'wheel-linux-$(python.version)' targetPath: 'dist' +- job: 'TestWindows' + pool: + vmImage: 'windows-latest' + strategy: + matrix: + Python311-Windows: + python.version: '3.11' + maxParallel: 3 + + steps: + - task: UsePythonVersion@0 + inputs: + versionSpec: '$(python.version)' + architecture: 'x64' + - script: python -m pip install --upgrade pip setuptools wheel + displayName: 'Install tools' + - script: pip install -r requirements.txt + displayName: 'Install Requirements' + - script: pip install -r requirements-dev.txt + displayName: 'Install Requirements dev' + - script: | + python -m pytest + displayName: 'Runs Unit Tests' + - script: | + python -u setup.py bdist_wheel + displayName: 'Build Package' + - task: PublishPipelineArtifact@0 + inputs: + artifactName: 'wheel-windows-$(python.version)' + targetPath: 'dist' + - job: 'TestMac' pool: vmImage: 'macOS-latest' strategy: matrix: - Python310-Mac: - python.version: '3.10' + Python311-Mac: + python.version: '3.11' maxParallel: 3 steps: @@ -62,36 +186,22 @@ jobs: architecture: 'x64' - script: gcc --version displayName: 'gcc version' - - script: | - brew update - displayName: 'brew update' + #- script: brew upgrade + # displayName: 'brew upgrade' + #- script: brew update + # displayName: 'brew update' - script: export displayName: 'export' - script: gcc --version displayName: 'gcc version' - - script: brew install llvm - displayName: 'install llvm' - - script: brew install p7zip - displayName: 'Install p7zip' - - script: brew install pandoc - displayName: 'Install Pandoc' - - script: brew install graphviz - continueOnError: true - displayName: 'Install Graphviz' - - script: brew install --cask mactex - displayName: 'Install latex' - script: python -m pip install --upgrade pip setuptools wheel displayName: 'Install tools' - - script: brew install pybind11 - displayName: 'Install pybind11' - script: pip install -r requirements.txt displayName: 'Install Requirements' + - script: pip install -r requirements-dev.txt + displayName: 'Install Requirements dev' - script: | - export MACOSX_DEPLOYMENT_TARGET=10.13 - python setup.py build_ext --inplace - displayName: 'Build package' - - script: | - python -u setup.py unittests + python -m pytest displayName: 'Runs Unit Tests' - script: | python -u setup.py bdist_wheel diff --git a/build_script.bat b/build_script.bat deleted file mode 100644 index 415ae38..0000000 --- a/build_script.bat +++ /dev/null @@ -1,13 +0,0 @@ -@echo off -if "%1"=="" goto default_value_python: -set pythonexe="%1" -%pythonexe% setup.py write_version -goto custom_python: - -:default_value_python: -set pythonexe="c:\Python395_x64\python.exe" -if not exist %pythonexe% set pythonexe="c:\Python391_x64\python.exe" -:custom_python: -@echo [python] %pythonexe% -%pythonexe% -u setup.py build_script -if %errorlevel% neq 0 exit /b %errorlevel% \ No newline at end of file diff --git a/pandas_streaming/__init__.py b/pandas_streaming/__init__.py index 8450d74..a4a6c0c 100644 --- a/pandas_streaming/__init__.py +++ b/pandas_streaming/__init__.py @@ -1,53 +1,5 @@ -# -*- coding: utf-8 -*- -""" -@file -@brief Module 
*pandas_streaming*. -Processes large datasets with :epkg:`pandas` by -reimplementing streeaming versions of -:epkg:`pandas` functionalites. -""" - -__version__ = "0.3.218" +__version__ = "0.4.218" __author__ = "Xavier Dupré" __github__ = "https://github.com/sdpython/pandas_streaming" __url__ = "http://www.xavierdupre.fr/app/pandas_streaming/helpsphinx/index.html" __license__ = "MIT License" -__blog__ = """ - - - - blog - - - - - -""" - - -def check(log=False): - """ - Checks the library is working. - It raises an exception. - If you want to disable the logs: - - :param log: if True, display information, otherwise none - :return: 0 or exception - """ - return True - - -def _setup_hook(use_print=False): - """ - if this function is added to the module, - the help automation and unit tests call it first before - anything goes on as an initialization step. - """ - # we can check many things, needed module - # any others things before unit tests are started - if use_print: - print("Success: _setup_hook") diff --git a/pandas_streaming/df/connex_split.py b/pandas_streaming/df/connex_split.py index e78891a..ec01b02 100644 --- a/pandas_streaming/df/connex_split.py +++ b/pandas_streaming/df/connex_split.py @@ -112,7 +112,7 @@ def train_test_connex_split(df, groups, test_size=0.25, train_size=None, stratify=None, hash_size=9, unique_rows=False, shuffle=True, fail_imbalanced=0.05, keep_balance=None, stop_if_bigger=None, return_cnx=False, - must_groups=None, random_state=None, fLOG=None): + must_groups=None, random_state=None): """ This split is for a specific case where data is linked in many ways. Let's assume we have three ids as we have @@ -144,7 +144,6 @@ def train_test_connex_split(df, groups, test_size=0.25, train_size=None, @param must_groups column name for ids which must not be shared by train/test partitions @param random_state seed for random generator - @param fLOG logging function @return Two @see cl StreamingDataFrame, one for train, one for test. @@ -384,8 +383,7 @@ def double_merge(d): def train_test_apart_stratify(df, group, test_size=0.25, train_size=None, - stratify=None, force=False, random_state=None, - fLOG=None): + stratify=None, force=False, random_state=None): """ This split is for a specific case where data is linked in one way. Let's assume we have two ids as we have @@ -403,7 +401,6 @@ def train_test_apart_stratify(df, group, test_size=0.25, train_size=None, @param force if True, tries to get at least one example on the test side for each value of the column *stratify* @param random_state seed for random generators - @param fLOG logging function @return Two @see cl StreamingDataFrame, one for train, one for test. diff --git a/pandas_streaming/df/dataframe_io_helpers.py b/pandas_streaming/df/dataframe_io_helpers.py index 46f55e7..4ae503b 100644 --- a/pandas_streaming/df/dataframe_io_helpers.py +++ b/pandas_streaming/df/dataframe_io_helpers.py @@ -144,7 +144,7 @@ def _flatten(obj, key): return flattened_dict -def enumerate_json_items(filename, encoding=None, lines=False, flatten=False, fLOG=None): +def enumerate_json_items(filename, encoding=None, lines=False, flatten=False): """ Enumerates items from a :epkg:`JSON` file or string. @@ -152,7 +152,6 @@ def enumerate_json_items(filename, encoding=None, lines=False, flatten=False, fL :param encoding: encoding :param lines: one record per row :param flatten: call @see fn flatten_dictionary - :param fLOG: logging function :return: iterator on records at first level. 
It assumes the syntax follows the format: ``[ {"id":1, ...}, {"id": 2, ...}, ...]``. @@ -236,24 +235,23 @@ def enumerate_json_items(filename, encoding=None, lines=False, flatten=False, fL with open(filename, "r", encoding=encoding) as f: for el in enumerate_json_items( f, encoding=encoding, lines=lines, - flatten=flatten, fLOG=fLOG): + flatten=flatten): yield el else: st = StringIO(filename) for el in enumerate_json_items( st, encoding=encoding, lines=lines, - flatten=flatten, fLOG=fLOG): + flatten=flatten): yield el elif isinstance(filename, bytes): st = BytesIO(filename) for el in enumerate_json_items( - st, encoding=encoding, lines=lines, flatten=flatten, - fLOG=fLOG): + st, encoding=encoding, lines=lines, flatten=flatten): yield el elif lines: for el in enumerate_json_items( JsonPerRowsStream(filename), - encoding=encoding, lines=False, flatten=flatten, fLOG=fLOG): + encoding=encoding, lines=False, flatten=flatten): yield el else: if hasattr(filename, 'seek'): diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..bad7f7a --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,31 @@ +[tool.rstcheck] +report_level = "INFO" +ignore_directives = [ + "autoclass", + "autofunction", + "automodule", + "gdot", + "image-sg", + "runpython", +] +ignore_roles = ["epkg"] + +[tool.ruff] + +# Exclude a variety of commonly ignored directories. +exclude = [ + ".eggs", + ".git", + "build", + "dist", +] + +# Same as Black. +line-length = 88 + +[tool.ruff.mccabe] +# Unlike Flake8, default to a complexity level of 10. +max-complexity = 10 + +[tool.ruff.per-file-ignores] +"_doc/examples/plot_first_example.py" = ["E402", "F811"] diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..5ab8605 --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,20 @@ +autopep8 +coverage +ijson +jupyter_sphinx +jyquickhelper +matplotlib +pandas>=1.1.0 +pandocfilters +Pillow +pycodestyle +pylint>=2.14.0 +pyquickhelper>=1.10 +pyquicksetup +scikit-learn +scipy +sphinx +sphinxcontrib.imagesvg +sphinx_gallery +ujson +wheel diff --git a/requirements.txt b/requirements.txt index 5ab8605..fb6c7ed 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,20 +1 @@ -autopep8 -coverage -ijson -jupyter_sphinx -jyquickhelper -matplotlib -pandas>=1.1.0 -pandocfilters -Pillow -pycodestyle -pylint>=2.14.0 -pyquickhelper>=1.10 -pyquicksetup -scikit-learn -scipy -sphinx -sphinxcontrib.imagesvg -sphinx_gallery -ujson -wheel +pandas diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..c544d66 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,5 @@ +[options] +packages = find: + +[options.packages.find] +include = pandas_streaming* diff --git a/setup.py b/setup.py index e8a706a..6968009 100644 --- a/setup.py +++ b/setup.py @@ -1,57 +1,71 @@ # -*- coding: utf-8 -*- -from __future__ import print_function -import sys import os -from setuptools import setup, Extension, find_packages -from pyquicksetup import read_version, read_readme, default_cmdclass - -######### -# settings -######### - -project_var_name = "pandas_streaming" -versionPython = f"{sys.version_info.major}.{sys.version_info.minor}" -path = "Lib/site-packages/" + project_var_name -readme = 'README.rst' -history = "HISTORY.rst" -requirements = None - -KEYWORDS = [project_var_name, 'Xavier Dupré', 'pandas', 'streaming'] -DESCRIPTION = """Streaming operations with pandas.""" -CLASSIFIERS = [ - 'Programming Language :: Python :: 3', - 'Intended Audience :: Developers', - 'Topic :: Scientific/Engineering', - 'Topic :: Education', - 
'License :: OSI Approved :: MIT License', - 'Development Status :: 5 - Production/Stable' -] - -####### -# data -####### - -packages = find_packages() -package_dir = {k: os.path.join('.', k.replace(".", "/")) for k in packages} -package_data = {} + +from setuptools import setup + +###################### +# beginning of setup +###################### + + +here = os.path.dirname(__file__) +if here == "": + here = "." +package_data = {"pandas_streaming.validation": ["*.css", "*.js"]} + +try: + with open(os.path.join(here, "requirements.txt"), "r") as f: + requirements = f.read().strip(" \n\r\t").split("\n") +except FileNotFoundError: + requirements = [] +if len(requirements) == 0 or requirements == [""]: + requirements = ["pandas"] + +try: + with open(os.path.join(here, "README.rst"), "r", encoding="utf-8") as f: + long_description = "pandas-streaming:" + f.read().split("pandas-streaming:")[1] +except FileNotFoundError: + long_description = "" + +version_str = "0.1.0" +with open(os.path.join(here, "pandas_streaming/__init__.py"), "r") as f: + line = [ + _ + for _ in [_.strip("\r\n ") for _ in f.readlines()] + if _.startswith("__version__") + ] + if len(line) > 0: + version_str = line[0].split("=")[1].strip('" ') setup( - name=project_var_name, - version=read_version(__file__, project_var_name), - author='Xavier Dupré', - author_email='xavier.dupre@gmail.com', - license="MIT", - url="http://www.xavierdupre.fr/app/pandas_streaming/helpsphinx/index.html", - download_url="https://github.com/sdpython/pandas_streaming/", - description=DESCRIPTION, - long_description=read_readme(__file__), - cmdclass=default_cmdclass(), - keywords=KEYWORDS, - classifiers=CLASSIFIERS, - packages=packages, - package_dir=package_dir, + name="pandas-streaming", + version=version_str, + description="Array (and numpy) API for ONNX", + long_description=long_description, + author="Xavier Dupré", + author_email="xavier.dupre@gmail.com", + url="https://github.com/sdpython/pandas-streaming", package_data=package_data, - setup_requires=["pyquicksetup"], - install_requires=['numpy', 'pandas', 'ijson'], + setup_requires=["numpy", "scipy"], + install_requires=requirements, + classifiers=[ + "Intended Audience :: Science/Research", + "Intended Audience :: Developers", + "License :: OSI Approved :: MIT License", + "Programming Language :: C", + "Programming Language :: Python", + "Topic :: Software Development", + "Topic :: Scientific/Engineering", + "Development Status :: 5 - Production/Stable", + "Operating System :: Microsoft :: Windows", + "Operating System :: POSIX", + "Operating System :: Unix", + "Operating System :: MacOS", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + ], ) From c461afed9018d3c5133263608dafca03d4839f32 Mon Sep 17 00:00:00 2001 From: Xavier Dupre Date: Sat, 22 Jul 2023 20:00:37 +0200 Subject: [PATCH 02/16] refactoring --- _doc/notebooks/first_steps.ipynb | 1730 +++++++++-------- _doc/sphinxdoc/source/_static/my-styles.css | 41 - _doc/sphinxdoc/source/_templates/blogtoc.html | 4 - _doc/sphinxdoc/source/_templates/layout.html | 5 - .../sphinxdoc/source/_templates/my-styles.css | 41 - _doc/sphinxdoc/source/_templates/page.html | 4 - _doc/sphinxdoc/source/conf.py | 224 ++- _unittests/ut_df/test_connex_split.py | 243 ++- _unittests/ut_df/test_connex_split_big.py | 45 +- _unittests/ut_df/test_connex_split_cat.py | 81 +- 
_unittests/ut_df/test_dataframe_helpers.py | 19 +- .../ut_df/test_dataframe_helpers_simple.py | 36 +- _unittests/ut_df/test_dataframe_io.py | 29 +- _unittests/ut_df/test_dataframe_io_helpers.py | 216 +- _unittests/ut_df/test_dataframe_sort.py | 87 +- _unittests/ut_df/test_pandas_groupbynan.py | 79 +- _unittests/ut_df/test_streaming_dataframe.py | 215 +- .../ut_documentation/test_run_notebooks.py | 14 +- _unittests/ut_module/test_sklearn.py | 10 +- pandas_streaming/data/__init__.py | 5 - pandas_streaming/data/dummy.py | 18 +- pandas_streaming/df/__init__.py | 17 +- pandas_streaming/df/connex_split.py | 222 ++- pandas_streaming/df/dataframe.py | 535 +++-- pandas_streaming/df/dataframe_helpers.py | 155 +- pandas_streaming/df/dataframe_io.py | 40 +- pandas_streaming/df/dataframe_io_helpers.py | 48 +- pandas_streaming/df/dataframe_split.py | 91 +- pandas_streaming/exc/__init__.py | 7 +- pandas_streaming/exc/exc_streaming.py | 10 +- pyproject.toml | 4 + requirements-dev.txt | 3 +- 32 files changed, 2302 insertions(+), 1976 deletions(-) delete mode 100644 _doc/sphinxdoc/source/_static/my-styles.css delete mode 100644 _doc/sphinxdoc/source/_templates/blogtoc.html delete mode 100644 _doc/sphinxdoc/source/_templates/layout.html delete mode 100644 _doc/sphinxdoc/source/_templates/my-styles.css delete mode 100644 _doc/sphinxdoc/source/_templates/page.html diff --git a/_doc/notebooks/first_steps.ipynb b/_doc/notebooks/first_steps.ipynb index 63ff017..735ede9 100644 --- a/_doc/notebooks/first_steps.ipynb +++ b/_doc/notebooks/first_steps.ipynb @@ -1,902 +1,906 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# First steps with pandas_streaming\n", - "\n", - "A few difference between [pandas](http://pandas.pydata.org/) and *pandas_streaming*." - ] - }, + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# First steps with pandas_streaming\n", + "\n", + "A few difference between [pandas](http://pandas.pydata.org/) and *pandas_streaming*." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
run previous cell, wait for 2 seconds
\n", - "" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } + "data": { + "text/html": [ + "
run previous cell, wait for 2 seconds
\n", + "" ], - "source": [ - "from jyquickhelper import add_notebook_menu\n", - "add_notebook_menu()" + "text/plain": [ + "" ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## pandas to pandas_streaming" - ] - }, + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from jyquickhelper import add_notebook_menu\n", + "\n", + "add_notebook_menu()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## pandas to pandas_streaming" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
XY
04.5a
16.0b
27.0c
\n", - "
" - ], - "text/plain": [ - " X Y\n", - "0 4.5 a\n", - "1 6.0 b\n", - "2 7.0 c" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
XY
04.5a
16.0b
27.0c
\n", + "
" ], - "source": [ - "from pandas import DataFrame\n", - "df = DataFrame(data=dict(X=[4.5, 6, 7], Y=[\"a\", \"b\", \"c\"]))\n", - "df" + "text/plain": [ + " X Y\n", + "0 4.5 a\n", + "1 6.0 b\n", + "2 7.0 c" ] - }, + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from pandas import DataFrame\n", + "\n", + "df = DataFrame(data=dict(X=[4.5, 6, 7], Y=[\"a\", \"b\", \"c\"]))\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "We create a streaming dataframe:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "collapsed": true - }, - "source": [ - "We create a streaming dataframe:" + "data": { + "text/plain": [ + "" ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from pandas_streaming.df import StreamingDataFrame\n", - "sdf = StreamingDataFrame.read_df(df)\n", - "sdf" - ] - }, + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from pandas_streaming.df import StreamingDataFrame\n", + "\n", + "sdf = StreamingDataFrame.read_df(df)\n", + "sdf" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
XY
04.5a
16.0b
27.0c
\n", - "
" - ], - "text/plain": [ - " X Y\n", - "0 4.5 a\n", - "1 6.0 b\n", - "2 7.0 c" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
XY
04.5a
16.0b
27.0c
\n", + "
" ], - "source": [ - "sdf.to_dataframe()" + "text/plain": [ + " X Y\n", + "0 4.5 a\n", + "1 6.0 b\n", + "2 7.0 c" ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Internally, StreamingDataFrame implements an iterator on dataframes and then tries to replicate the same interface as [pandas.DataFrame](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html) possibly wherever it is possible to manipulate data without loading everything into memory." - ] - }, + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sdf.to_dataframe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Internally, StreamingDataFrame implements an iterator on dataframes and then tries to replicate the same interface as [pandas.DataFrame](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html) possibly wherever it is possible to manipulate data without loading everything into memory." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
XY
04.5a
16.0b
27.0c
04.5a
16.0b
27.0c
\n", - "
" - ], - "text/plain": [ - " X Y\n", - "0 4.5 a\n", - "1 6.0 b\n", - "2 7.0 c\n", - "0 4.5 a\n", - "1 6.0 b\n", - "2 7.0 c" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
XY
04.5a
16.0b
27.0c
04.5a
16.0b
27.0c
\n", + "
" ], - "source": [ - "sdf2 = sdf.concat(sdf)\n", - "sdf2.to_dataframe()" + "text/plain": [ + " X Y\n", + "0 4.5 a\n", + "1 6.0 b\n", + "2 7.0 c\n", + "0 4.5 a\n", + "1 6.0 b\n", + "2 7.0 c" ] - }, + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sdf2 = sdf.concat(sdf)\n", + "sdf2.to_dataframe()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
YZ
0a10
1b20
\n", - "
" - ], - "text/plain": [ - " Y Z\n", - "0 a 10\n", - "1 b 20" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
YZ
0a10
1b20
\n", + "
" ], - "source": [ - "m = DataFrame(dict(Y=[\"a\", \"b\"], Z=[10, 20]))\n", - "m" + "text/plain": [ + " Y Z\n", + "0 a 10\n", + "1 b 20" ] - }, + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "m = DataFrame(dict(Y=[\"a\", \"b\"], Z=[10, 20]))\n", + "m" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
XYZ
04.5a10.0
16.0b20.0
27.0cNaN
04.5a10.0
16.0b20.0
27.0cNaN
\n", - "
" - ], - "text/plain": [ - " X Y Z\n", - "0 4.5 a 10.0\n", - "1 6.0 b 20.0\n", - "2 7.0 c NaN\n", - "0 4.5 a 10.0\n", - "1 6.0 b 20.0\n", - "2 7.0 c NaN" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
XYZ
04.5a10.0
16.0b20.0
27.0cNaN
04.5a10.0
16.0b20.0
27.0cNaN
\n", + "
" ], - "source": [ - "sdf3 = sdf2.merge(m, left_on=\"Y\", right_on=\"Y\", how=\"outer\")\n", - "sdf3.to_dataframe()" + "text/plain": [ + " X Y Z\n", + "0 4.5 a 10.0\n", + "1 6.0 b 20.0\n", + "2 7.0 c NaN\n", + "0 4.5 a 10.0\n", + "1 6.0 b 20.0\n", + "2 7.0 c NaN" ] - }, + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sdf3 = sdf2.merge(m, left_on=\"Y\", right_on=\"Y\", how=\"outer\")\n", + "sdf3.to_dataframe()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
XYZ
04.5a10.0
14.5a10.0
26.0b20.0
36.0b20.0
47.0cNaN
57.0cNaN
\n", - "
" - ], - "text/plain": [ - " X Y Z\n", - "0 4.5 a 10.0\n", - "1 4.5 a 10.0\n", - "2 6.0 b 20.0\n", - "3 6.0 b 20.0\n", - "4 7.0 c NaN\n", - "5 7.0 c NaN" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
XYZ
04.5a10.0
14.5a10.0
26.0b20.0
36.0b20.0
47.0cNaN
57.0cNaN
\n", + "
" ], - "source": [ - "sdf2.to_dataframe().merge(m, left_on=\"Y\", right_on=\"Y\", how=\"outer\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The order might be different." + "text/plain": [ + " X Y Z\n", + "0 4.5 a 10.0\n", + "1 4.5 a 10.0\n", + "2 6.0 b 20.0\n", + "3 6.0 b 20.0\n", + "4 7.0 c NaN\n", + "5 7.0 c NaN" ] - }, + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sdf2.to_dataframe().merge(m, left_on=\"Y\", right_on=\"Y\", how=\"outer\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The order might be different." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
XY
04.5a
14.5a
\n", - "
" - ], - "text/plain": [ - " X Y\n", - "0 4.5 a\n", - "1 4.5 a" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
XY
04.5a
14.5a
\n", + "
" ], - "source": [ - "sdftr, sdfte = sdf2.train_test_split(test_size=0.5)\n", - "sdfte.head()" + "text/plain": [ + " X Y\n", + "0 4.5 a\n", + "1 4.5 a" ] - }, + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sdftr, sdfte = sdf2.train_test_split(test_size=0.5)\n", + "sdfte.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
XY
06.0b
17.0c
26.0b
07.0c
\n", - "
" - ], - "text/plain": [ - " X Y\n", - "0 6.0 b\n", - "1 7.0 c\n", - "2 6.0 b\n", - "0 7.0 c" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
XY
06.0b
17.0c
26.0b
07.0c
\n", + "
" ], - "source": [ - "sdftr.head()" + "text/plain": [ + " X Y\n", + "0 6.0 b\n", + "1 7.0 c\n", + "2 6.0 b\n", + "0 7.0 c" ] - }, - { - "cell_type": "markdown", - "metadata": { - "collapsed": true - }, - "source": [ - "## split a big file" - ] - }, + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sdftr.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "## split a big file" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'example.txt'" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sdf2.to_csv(\"example.txt\")" + "data": { + "text/plain": [ + "'example.txt'" ] - }, + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sdf2.to_csv(\"example.txt\")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['example.train.txt', 'example.test.txt']" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "new_sdf = StreamingDataFrame.read_csv(\"example.txt\")\n", - "new_sdf.train_test_split(\"example.{}.txt\", streaming=False)" + "data": { + "text/plain": [ + "['example.train.txt', 'example.test.txt']" ] - }, + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "new_sdf = StreamingDataFrame.read_csv(\"example.txt\")\n", + "new_sdf.train_test_split(\"example.{}.txt\", streaming=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['example.test.txt', 'example.train.txt', 'example.txt']" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import glob\n", - "glob.glob(\"ex*.txt\")" + "data": { + "text/plain": [ + "['example.test.txt', 'example.train.txt', 'example.txt']" ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.1" + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" } + ], + "source": [ + "import glob\n", + "\n", + "glob.glob(\"ex*.txt\")" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" }, - "nbformat": 4, - "nbformat_minor": 2 + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.1" + } + }, + 
"nbformat": 4, + "nbformat_minor": 2 } \ No newline at end of file diff --git a/_doc/sphinxdoc/source/_static/my-styles.css b/_doc/sphinxdoc/source/_static/my-styles.css deleted file mode 100644 index 57b29ed..0000000 --- a/_doc/sphinxdoc/source/_static/my-styles.css +++ /dev/null @@ -1,41 +0,0 @@ - -.highlight-ipython3 { - background-color: #f8f8c8; -} - -div.highlight-ipython3 pre { - background-color: #f8f8c8; -} - -.wy-nav-top { - background-color: #FF0040; -} - -.wy-side-nav-search { - background-color: #FF0040; -} - -pre.highlight-default { - background-color: #b5b5b5; -} - -table { - border: solid 1px #DDEEEE; - border-collapse: collapse; - border-spacing: 0; - font: normal 13px Arial, sans-serif; -} -thead th { - background-color: #DDEFEF; - border: solid 1px #DDEEEE; - color: #336B6B; - padding: 10px; - text-align: left; - text-shadow: 1px 1px 1px #fff; -} -tbody td { - border: solid 1px #DDEEEE; - color: #333; - padding: 10px; - text-shadow: 1px 1px 1px #fff; -} diff --git a/_doc/sphinxdoc/source/_templates/blogtoc.html b/_doc/sphinxdoc/source/_templates/blogtoc.html deleted file mode 100644 index 02a6b01..0000000 --- a/_doc/sphinxdoc/source/_templates/blogtoc.html +++ /dev/null @@ -1,4 +0,0 @@ -Index -Module -

Blog

-2017-09-17 - Why pandas_streaming? \ No newline at end of file diff --git a/_doc/sphinxdoc/source/_templates/layout.html b/_doc/sphinxdoc/source/_templates/layout.html deleted file mode 100644 index 08baa3e..0000000 --- a/_doc/sphinxdoc/source/_templates/layout.html +++ /dev/null @@ -1,5 +0,0 @@ -{# Import the theme's layout. #} -{% extends "!layout.html" %} - -{# Custom CSS overrides #} -{% set bootswatch_css_custom = ['_static/my-styles.css'] %} \ No newline at end of file diff --git a/_doc/sphinxdoc/source/_templates/my-styles.css b/_doc/sphinxdoc/source/_templates/my-styles.css deleted file mode 100644 index 57b29ed..0000000 --- a/_doc/sphinxdoc/source/_templates/my-styles.css +++ /dev/null @@ -1,41 +0,0 @@ - -.highlight-ipython3 { - background-color: #f8f8c8; -} - -div.highlight-ipython3 pre { - background-color: #f8f8c8; -} - -.wy-nav-top { - background-color: #FF0040; -} - -.wy-side-nav-search { - background-color: #FF0040; -} - -pre.highlight-default { - background-color: #b5b5b5; -} - -table { - border: solid 1px #DDEEEE; - border-collapse: collapse; - border-spacing: 0; - font: normal 13px Arial, sans-serif; -} -thead th { - background-color: #DDEFEF; - border: solid 1px #DDEEEE; - color: #336B6B; - padding: 10px; - text-align: left; - text-shadow: 1px 1px 1px #fff; -} -tbody td { - border: solid 1px #DDEEEE; - color: #333; - padding: 10px; - text-shadow: 1px 1px 1px #fff; -} diff --git a/_doc/sphinxdoc/source/_templates/page.html b/_doc/sphinxdoc/source/_templates/page.html deleted file mode 100644 index 1be6020..0000000 --- a/_doc/sphinxdoc/source/_templates/page.html +++ /dev/null @@ -1,4 +0,0 @@ -{% extends "layout.html" %} -{% block body %} -{{ body }} -{% endblock body %} diff --git a/_doc/sphinxdoc/source/conf.py b/_doc/sphinxdoc/source/conf.py index eed8a1c..f298be6 100644 --- a/_doc/sphinxdoc/source/conf.py +++ b/_doc/sphinxdoc/source/conf.py @@ -1,82 +1,204 @@ # -*- coding: utf-8 -*- import sys import os -import alabaster -from pyquickhelper.helpgen.default_conf import set_sphinx_variables +from sphinx_runpython.github_link import make_linkcode_resolve +from sphinx_runpython.conf_helper import has_dvipng, has_dvisvgm +from pandas_streaming import __version__ -sys.path.insert(0, os.path.abspath(os.path.join(os.path.split(__file__)[0]))) +extensions = [ + "nbsphinx", + "sphinx.ext.autodoc", + "sphinx.ext.coverage", + "sphinx.ext.githubpages", + "sphinx.ext.ifconfig", + "sphinx.ext.intersphinx", + "sphinx.ext.linkcode", + "sphinx.ext.viewcode", + "sphinx.ext.napoleon", + "sphinx.ext.todo", + "sphinx_gallery.gen_gallery", + "sphinx_issues", + "sphinx_runpython.blocdefs.sphinx_exref_extension", + "sphinx_runpython.blocdefs.sphinx_mathdef_extension", + "sphinx_runpython.epkg", + "sphinx_runpython.gdot", + "sphinx_runpython.runpython", + "matplotlib.sphinxext.plot_directive", +] -local_template = os.path.join(os.path.abspath( - os.path.dirname(__file__)), "_templates") +if has_dvisvgm(): + extensions.append("sphinx.ext.imgmath") + imgmath_image_format = "svg" +elif has_dvipng(): + extensions.append("sphinx.ext.pngmath") + imgmath_image_format = "png" +else: + extensions.append("sphinx.ext.mathjax") -set_sphinx_variables(__file__, "pandas_streaming", "Xavier Dupré", 2023, - "alabaster", alabaster.get_path(), - locals(), extlinks=dict(issue=( - 'https://github.com/sdpython/pandas_streaming/issues/%s', - 'issue %s')), - title="Streaming functionalities for pandas", book=True) +templates_path = ["_templates"] +html_logo = "_static/project_ico.png" +source_suffix = ".rst" 
+master_doc = "index" +project = "pandas-streaming" +copyright = "2016-2023, Xavier Dupré" +author = "Xavier Dupré" +version = __version__ +release = __version__ +language = "en" +exclude_patterns = ["auto_examples/*.ipynb"] +pygments_style = "sphinx" +todo_include_todos = True +nbsphinx_execute = "never" -blog_root = "http://www.xavierdupre.fr/app/pandas_streaming/helpsphinx/" +html_theme = "furo" +html_theme_path = ["_static"] +html_theme_options = {} +html_sourcelink_suffix = "" +html_static_path = ["_static"] -html_css_files = ['my-styles.css'] +issues_github_path = "sdpython/pandas-streaming" -html_logo = "_static/project_ico.png" +# The following is used by sphinx.ext.linkcode to provide links to github +linkcode_resolve = make_linkcode_resolve( + "pandas_streaming", + ( + "https://github.com/sdpython/pandas-streaming/" + "blob/{revision}/{package}/" + "{path}#L{lineno}" + ), +) -html_sidebars = {} +latex_elements = { + "papersize": "a4", + "pointsize": "10pt", + "title": project, +} -language = "en" -custom_preamble = """\n +mathjax3_config = {"chtml": {"displayAlign": "left"}} + +intersphinx_mapping = { + "onnx": ("https://onnx.ai/onnx/", None), + "matplotlib": ("https://matplotlib.org/", None), + "numpy": ("https://numpy.org/doc/stable", None), + "pandas": ("https://pandas.pydata.org/pandas-docs/stable/", None), + "python": (f"https://docs.python.org/{sys.version_info.major}", None), + "scipy": ("https://docs.scipy.org/doc/scipy/reference", None), + "sklearn": ("https://scikit-learn.org/stable/", None), + "sklearn-onnx": ("https://onnx.ai/sklearn-onnx/", None), + "torch": ("https://pytorch.org/docs/stable/", None), +} + +# Check intersphinx reference targets exist +nitpicky = True +# See also scikit-learn/scikit-learn#26761 +nitpick_ignore = [ + ("py:class", "False"), + ("py:class", "True"), + ("py:class", "pipeline.Pipeline"), + ("py:class", "default=sklearn.utils.metadata_routing.UNCHANGED"), +] + +sphinx_gallery_conf = { + # path to your examples scripts + "examples_dirs": os.path.join(os.path.dirname(__file__), "examples"), + # path where to save gallery generated examples + "gallery_dirs": "auto_examples", +} + +# next + +preamble = """ +\\usepackage{etex} +\\usepackage{fixltx2e} % LaTeX patches, \\textsubscript +\\usepackage{cmap} % fix search and cut-and-paste in Acrobat +\\usepackage[raccourcis]{fast-diagram} +\\usepackage{titlesec} +\\usepackage{amsmath} +\\usepackage{amssymb} +\\usepackage{amsfonts} +\\usepackage{graphics} +\\usepackage{epic} +\\usepackage{eepic} +%\\usepackage{pict2e} +%%% Redefined titleformat +\\setlength{\\parindent}{0cm} +\\setlength{\\parskip}{1ex plus 0.5ex minus 0.2ex} +\\newcommand{\\hsp}{\\hspace{20pt}} +\\newcommand{\\acc}[1]{\\left\\{#1\\right\\}} +\\newcommand{\\cro}[1]{\\left[#1\\right]} +\\newcommand{\\pa}[1]{\\left(#1\\right)} +\\newcommand{\\R}{\\mathbb{R}} +\\newcommand{\\HRule}{\\rule{\\linewidth}{0.5mm}} +%\\titleformat{\\chapter}[hang]{\\Huge\\bfseries\\sffamily}{\\thechapter\\hsp}{0pt}{\\Huge\\bfseries\\sffamily} + +\\usepackage[all]{xy} \\newcommand{\\vecteur}[2]{\\pa{#1,\\dots,#2}} \\newcommand{\\N}[0]{\\mathbb{N}} -\\newcommand{\\indicatrice}[1]{\\mathbf{1\\!\\!1}_{\\acc{#1}}} -\\usepackage[all]{xy} +\\newcommand{\\indicatrice}[1]{ {1\\!\\!1}_{\\acc{#1}} } \\newcommand{\\infegal}[0]{\\leqslant} \\newcommand{\\supegal}[0]{\\geqslant} \\newcommand{\\ensemble}[2]{\\acc{#1,\\dots,#2}} \\newcommand{\\fleche}[1]{\\overrightarrow{ #1 }} \\newcommand{\\intervalle}[2]{\\left\\{#1,\\cdots,#2\\right\\}} -\\newcommand{\\loinormale}[2]{{\\cal 
N}\\pa{#1,#2}} -\\newcommand{\\independant}[0]{\\;\\makebox[3ex]{\\makebox[0ex]{\\rule[-0.2ex]{3ex}{.1ex}}\\!\\!\\!\\!\\makebox[.5ex][l]{\\rule[-.2ex]{.1ex}{2ex}}\\makebox[.5ex][l]{\\rule[-.2ex]{.1ex}{2ex}}} \\,\\,} +\\newcommand{\\independant}[0]{\\perp \\!\\!\\! \\perp} \\newcommand{\\esp}{\\mathbb{E}} +\\newcommand{\\espf}[2]{\\mathbb{E}_{#1}\\pa{#2}} \\newcommand{\\var}{\\mathbb{V}} \\newcommand{\\pr}[1]{\\mathbb{P}\\pa{#1}} \\newcommand{\\loi}[0]{{\\cal L}} \\newcommand{\\vecteurno}[2]{#1,\\dots,#2} \\newcommand{\\norm}[1]{\\left\\Vert#1\\right\\Vert} +\\newcommand{\\norme}[1]{\\left\\Vert#1\\right\\Vert} +\\newcommand{\\scal}[2]{\\left<#1,#2\\right>} \\newcommand{\\dans}[0]{\\rightarrow} \\newcommand{\\partialfrac}[2]{\\frac{\\partial #1}{\\partial #2}} \\newcommand{\\partialdfrac}[2]{\\dfrac{\\partial #1}{\\partial #2}} -\\newcommand{\\loimultinomiale}[1]{{\\cal M}\\pa{#1}} \\newcommand{\\trace}[1]{tr\\pa{#1}} +\\newcommand{\\sac}[0]{|} \\newcommand{\\abs}[1]{\\left|#1\\right|} +\\newcommand{\\loinormale}[2]{{\\cal N} \\pa{#1,#2}} +\\newcommand{\\loibinomialea}[1]{{\\cal B} \\pa{#1}} +\\newcommand{\\loibinomiale}[2]{{\\cal B} \\pa{#1,#2}} +\\newcommand{\\loimultinomiale}[1]{{\\cal M} \\pa{#1}} +\\newcommand{\\variance}[1]{\\mathbb{V}\\pa{#1}} +\\newcommand{\\intf}[1]{\\left\\lfloor #1 \\right\\rfloor} """ -# \\usepackage{eepic} - -imgmath_latex_preamble += custom_preamble -latex_elements['preamble'] += custom_preamble -mathdef_link_only = True - -epkg_dictionary.update({ - 'csv': 'https://en.wikipedia.org/wiki/Comma-separated_values', - 'dask': 'https://dask.pydata.org/en/latest/', - 'dataframe': 'https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html', - 'Dataframe': 'https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html', - 'DataFrame': 'https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html', - 'dataframes': 'https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html', - 'dill': 'https://dill.readthedocs.io/en/latest/dill.html', - 'Hadoop': 'http://hadoop.apache.org/', - 'ijson': 'https://github.com/ICRAR/ijson', - 'nan': 'https://numpy.org/doc/stable/reference/constants.html#numpy.NAN', - 'pandas': ('http://pandas.pydata.org/pandas-docs/stable/', - ('http://pandas.pydata.org/pandas-docs/stable/generated/pandas.{0}.html', 1), - ('http://pandas.pydata.org/pandas-docs/stable/generated/pandas.{0}.{1}.html', 2)), - 'pyarrow': 'https://arrow.apache.org/docs/python/', - 'pyspark': 'http://spark.apache.org/docs/2.1.1/api/python/index.html', - 'scikit-multiflow': 'https://scikit-multiflow.github.io/', - 'sklearn': ('http://scikit-learn.org/stable/', - ('http://scikit-learn.org/stable/modules/generated/{0}.html', 1), - ('http://scikit-learn.org/stable/modules/generated/{0}.{1}.html', 2)), - 'streamz': 'https://streamz.readthedocs.io/en/latest/index.html', - 'tornado': 'https://www.tornadoweb.org/en/stable/', -}) + +imgmath_latex_preamble = preamble +latex_elements["preamble"] = imgmath_latex_preamble + + +epkg_dictionary = { + "csv": "https://en.wikipedia.org/wiki/Comma-separated_values", + "dask": "https://dask.pydata.org/en/latest/", + "dataframe": "https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html", + "Dataframe": "https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html", + "DataFrame": "https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html", + "dataframes": 
"https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html", + "dill": "https://dill.readthedocs.io/en/latest/dill.html", + "Hadoop": "http://hadoop.apache.org/", + "ijson": "https://github.com/ICRAR/ijson", + "nan": "https://numpy.org/doc/stable/reference/constants.html#numpy.NAN", + "pandas": ( + "http://pandas.pydata.org/pandas-docs/stable/", + ( + "http://pandas.pydata.org/pandas-docs/stable/generated/pandas.{0}.html", + 1, + ), + ( + "http://pandas.pydata.org/pandas-docs/stable/generated/pandas.{0}.{1}.html", + 2, + ), + ), + "pyarrow": "https://arrow.apache.org/docs/python/", + "pyspark": "http://spark.apache.org/docs/2.1.1/api/python/index.html", + "scikit-multiflow": "https://scikit-multiflow.github.io/", + "sklearn": ( + "http://scikit-learn.org/stable/", + ("http://scikit-learn.org/stable/modules/generated/{0}.html", 1), + ("http://scikit-learn.org/stable/modules/generated/{0}.{1}.html", 2), + ), + "streamz": "https://streamz.readthedocs.io/en/latest/index.html", + "tornado": "https://www.tornadoweb.org/en/stable/", + } diff --git a/_unittests/ut_df/test_connex_split.py b/_unittests/ut_df/test_connex_split.py index 33bd03f..e373c9b 100644 --- a/_unittests/ut_df/test_connex_split.py +++ b/_unittests/ut_df/test_connex_split.py @@ -1,137 +1,174 @@ -# -*- coding: utf-8 -*- -""" -@brief test log(time=4s) -""" import unittest import pandas from pyquickhelper.pycode import ExtTestCase -from pandas_streaming.df import dataframe_shuffle, train_test_split_weights, train_test_connex_split +from pandas_streaming.df import ( + dataframe_shuffle, + train_test_split_weights, + train_test_connex_split, +) class TestConnexSplit(ExtTestCase): - def test_shuffle(self): - df = pandas.DataFrame([dict(a=1, b="e", c=5.6, ind="a1"), - dict(a=2, b="f", c=5.7, ind="a2"), - dict(a=4, b="g", c=5.8, ind="a3"), - dict(a=8, b="h", c=5.9, ind="a4"), - dict(a=16, b="i", c=6.2, ind="a5")]) + df = pandas.DataFrame( + [ + dict(a=1, b="e", c=5.6, ind="a1"), + dict(a=2, b="f", c=5.7, ind="a2"), + dict(a=4, b="g", c=5.8, ind="a3"), + dict(a=8, b="h", c=5.9, ind="a4"), + dict(a=16, b="i", c=6.2, ind="a5"), + ] + ) shuffled = dataframe_shuffle(df, random_state=0) - sorted_ = shuffled.sort_values('a') + sorted_ = shuffled.sort_values("a") self.assertEqualDataFrame(df, sorted_) - df2 = df.set_index('ind') + df2 = df.set_index("ind") shuffled = dataframe_shuffle(df2, random_state=0) - sorted_ = shuffled.sort_values('a') + sorted_ = shuffled.sort_values("a") self.assertEqualDataFrame(df2, sorted_) - df2 = df.set_index(['ind', 'c']) + df2 = df.set_index(["ind", "c"]) shuffled = dataframe_shuffle(df2, random_state=0) - sorted_ = shuffled.sort_values('a') + sorted_ = shuffled.sort_values("a") self.assertEqualDataFrame(df2, sorted_) def test_split_weights_errors(self): - df = pandas.DataFrame([dict(a=1, b="e", c=1), - dict(a=2, b="f", c=1), - dict(a=4, b="g", c=1), - dict(a=8, b="h", c=1), - dict(a=12, b="h", c=1), - dict(a=16, b="i", c=1)]) - - train, test = train_test_split_weights(df, train_size=0.5, weights='c') + df = pandas.DataFrame( + [ + dict(a=1, b="e", c=1), + dict(a=2, b="f", c=1), + dict(a=4, b="g", c=1), + dict(a=8, b="h", c=1), + dict(a=12, b="h", c=1), + dict(a=16, b="i", c=1), + ] + ) + + train, test = train_test_split_weights(df, train_size=0.5, weights="c") self.assertTrue(train is not None) self.assertTrue(test is not None) - self.assertRaise(lambda: train_test_split_weights( - df, test_size=0.5, weights=[0.5, 0.5]), ValueError, 'Dimension') - self.assertRaise(lambda: 
train_test_split_weights( - df, test_size=0), ValueError, 'null') - self.assertRaise(lambda: train_test_split_weights( - df, test_size=0, weights='c'), ValueError, 'null') + self.assertRaise( + lambda: train_test_split_weights(df, test_size=0.5, weights=[0.5, 0.5]), + ValueError, + "Dimension", + ) + self.assertRaise( + lambda: train_test_split_weights(df, test_size=0), ValueError, "null" + ) + self.assertRaise( + lambda: train_test_split_weights(df, test_size=0, weights="c"), + ValueError, + "null", + ) def test_split_weights(self): - df = pandas.DataFrame([dict(a=1, b="e", c=1), - dict(a=2, b="f", c=1), - dict(a=4, b="g", c=1), - dict(a=8, b="h", c=1), - dict(a=12, b="h", c=1), - dict(a=16, b="i", c=1)]) + df = pandas.DataFrame( + [ + dict(a=1, b="e", c=1), + dict(a=2, b="f", c=1), + dict(a=4, b="g", c=1), + dict(a=8, b="h", c=1), + dict(a=12, b="h", c=1), + dict(a=16, b="i", c=1), + ] + ) train, test = train_test_split_weights(df, test_size=0.5) self.assertEqual(train.shape[1], test.shape[1]) self.assertEqual(train.shape[0] + test.shape[0], df.shape[0]) - train, test = train_test_split_weights(df, test_size=0.5, weights='c') + train, test = train_test_split_weights(df, test_size=0.5, weights="c") self.assertEqual(train.shape[1], test.shape[1]) self.assertEqual(train.shape[0] + test.shape[0], df.shape[0]) - train, test = train_test_split_weights( - df, test_size=0.5, weights=df['c']) + train, test = train_test_split_weights(df, test_size=0.5, weights=df["c"]) self.assertEqual(train.shape[1], test.shape[1]) self.assertEqual(train.shape[0] + test.shape[0], df.shape[0]) - df = pandas.DataFrame([dict(a=1, b="e", c=1), - dict(a=2, b="f", c=2), - dict(a=4, b="g", c=3), - dict(a=8, b="h", c=1), - dict(a=12, b="h", c=2), - dict(a=16, b="i", c=3)]) + df = pandas.DataFrame( + [ + dict(a=1, b="e", c=1), + dict(a=2, b="f", c=2), + dict(a=4, b="g", c=3), + dict(a=8, b="h", c=1), + dict(a=12, b="h", c=2), + dict(a=16, b="i", c=3), + ] + ) - train, test = train_test_split_weights(df, test_size=0.5, weights='c', - fail_imbalanced=0.4) + train, test = train_test_split_weights( + df, test_size=0.5, weights="c", fail_imbalanced=0.4 + ) self.assertEqual(train.shape[1], test.shape[1]) self.assertEqual(train.shape[0] + test.shape[0], df.shape[0]) - w1, w2 = train['c'].sum(), test['c'].sum() + w1, w2 = train["c"].sum(), test["c"].sum() delta = abs(w1 - w2) / (w1 + w2) self.assertGreater(0.4, delta) def test_split_connex(self): - df = pandas.DataFrame([dict(user="UA", prod="PA", card="C1"), - dict(user="UA", prod="PB", card="C1"), - dict(user="UB", prod="PC", card="C2"), - dict(user="UB", prod="PD", card="C2"), - dict(user="UC", prod="PE", card="C3"), - dict(user="UC", prod="PF", card="C4"), - dict(user="UD", prod="PG", card="C5"), - ]) + df = pandas.DataFrame( + [ + dict(user="UA", prod="PA", card="C1"), + dict(user="UA", prod="PB", card="C1"), + dict(user="UB", prod="PC", card="C2"), + dict(user="UB", prod="PD", card="C2"), + dict(user="UC", prod="PE", card="C3"), + dict(user="UC", prod="PF", card="C4"), + dict(user="UD", prod="PG", card="C5"), + ] + ) train, test = train_test_connex_split( # pylint: disable=W0632 - df, test_size=0.5, groups=['user', 'prod', 'card'], - fail_imbalanced=0.4) + df, test_size=0.5, groups=["user", "prod", "card"], fail_imbalanced=0.4 + ) self.assertEqual(train.shape[0] + test.shape[0], df.shape[0]) - for col in ['user', 'prod', 'card']: + for col in ["user", "prod", "card"]: s1 = set(train[col]) s2 = set(test[col]) if s1 & s2: raise AssertionError( - f'Non empty intersection {s1} 
& {s2}\n{train}\n{test}') + f"Non empty intersection {s1} & {s2}\n{train}\n{test}" + ) - df['connex'] = 'ole' + df["connex"] = "ole" train, test = train_test_connex_split( # pylint: disable=W0632 - df, test_size=0.5, groups=['user', 'prod', 'card'], - fail_imbalanced=0.4) + df, test_size=0.5, groups=["user", "prod", "card"], fail_imbalanced=0.4 + ) self.assertEqual(train.shape[0] + test.shape[0], df.shape[0]) def test_split_connex2(self): - df = pandas.DataFrame([dict(user="UA", prod="PAA", card="C1"), - dict(user="UA", prod="PB", card="C1"), - dict(user="UB", prod="PC", card="C2"), - dict(user="UB", prod="PD", card="C2"), - dict(user="UC", prod="PAA", card="C3"), - dict(user="UC", prod="PF", card="C4"), - dict(user="UD", prod="PG", card="C5"), - ]) - - train_test_connex_split(df, test_size=0.5, groups=['user', 'prod', 'card'], - fail_imbalanced=0.5, return_cnx=True) - train, test, stats = train_test_connex_split(df, test_size=0.5, - groups=[ - 'user', 'prod', 'card'], - fail_imbalanced=0.5, - return_cnx=True, random_state=0) + df = pandas.DataFrame( + [ + dict(user="UA", prod="PAA", card="C1"), + dict(user="UA", prod="PB", card="C1"), + dict(user="UB", prod="PC", card="C2"), + dict(user="UB", prod="PD", card="C2"), + dict(user="UC", prod="PAA", card="C3"), + dict(user="UC", prod="PF", card="C4"), + dict(user="UD", prod="PG", card="C5"), + ] + ) + + train_test_connex_split( + df, + test_size=0.5, + groups=["user", "prod", "card"], + fail_imbalanced=0.5, + return_cnx=True, + ) + train, test, stats = train_test_connex_split( + df, + test_size=0.5, + groups=["user", "prod", "card"], + fail_imbalanced=0.5, + return_cnx=True, + random_state=0, + ) self.assertEqual(train.shape[0] + test.shape[0], df.shape[0]) - for col in ['user', 'prod', 'card']: + for col in ["user", "prod", "card"]: s1 = set(train[col]) s2 = set(test[col]) if s1 & s2: @@ -139,26 +176,35 @@ def test_split_connex2(self): for k, v in sorted(stats[0].items()): rows.append(f"{k}={v}") raise AssertionError( - 'Non empty intersection {0} & {1}\n{2}\n{3}\n{4}'.format(s1, s2, train, test, "\n".join(rows))) + "Non empty intersection {0} & {1}\n{2}\n{3}\n{4}".format( + s1, s2, train, test, "\n".join(rows) + ) + ) def test_split_connex_missing(self): - df = pandas.DataFrame([dict(user="UA", prod="PAA", card="C1"), - dict(user="UA", prod="PB", card="C1"), - dict(user="UB", prod="PC", card="C2"), - dict(user="UB", prod="PD", card="C2"), - dict(user="UC", prod="PAA", card="C3"), - dict(user="UC", card="C4"), - dict(user="UD", prod="PG"), - ]) - - train, test, stats = train_test_connex_split(df, test_size=0.5, - groups=[ - 'user', 'prod', 'card'], - fail_imbalanced=0.4, - return_cnx=True, random_state=0) + df = pandas.DataFrame( + [ + dict(user="UA", prod="PAA", card="C1"), + dict(user="UA", prod="PB", card="C1"), + dict(user="UB", prod="PC", card="C2"), + dict(user="UB", prod="PD", card="C2"), + dict(user="UC", prod="PAA", card="C3"), + dict(user="UC", card="C4"), + dict(user="UD", prod="PG"), + ] + ) + + train, test, stats = train_test_connex_split( + df, + test_size=0.5, + groups=["user", "prod", "card"], + fail_imbalanced=0.4, + return_cnx=True, + random_state=0, + ) self.assertEqual(train.shape[0] + test.shape[0], df.shape[0]) - for col in ['user', 'prod', 'card']: + for col in ["user", "prod", "card"]: s1 = set(train[col]) s2 = set(test[col]) if s1 & s2: @@ -166,7 +212,10 @@ def test_split_connex_missing(self): for k, v in sorted(stats[0].items()): rows.append(f"{k}={v}") raise AssertionError( - 'Non empty intersection {0} & 
{1}\n{2}\n{3}\n{4}'.format(s1, s2, train, test, "\n".join(rows))) + "Non empty intersection {0} & {1}\n{2}\n{3}\n{4}".format( + s1, s2, train, test, "\n".join(rows) + ) + ) if __name__ == "__main__": diff --git a/_unittests/ut_df/test_connex_split_big.py b/_unittests/ut_df/test_connex_split_big.py index 22292c5..f297ec8 100644 --- a/_unittests/ut_df/test_connex_split_big.py +++ b/_unittests/ut_df/test_connex_split_big.py @@ -1,7 +1,4 @@ # -*- coding: utf-8 -*- -""" -@brief test log(time=30s) -""" import os import unittest from collections import Counter @@ -11,18 +8,19 @@ class TestConnexSplitBig(ExtTestCase): - def test_connex_big(self): data = os.path.join(os.path.dirname(__file__), "data") name = os.path.join(data, "buggy_hash.csv") df = pandas.read_csv(name, sep="\t", encoding="utf-8") - train, test, stats = train_test_connex_split(df, - groups=[ - "cart_id", "mail", "product_id"], - fail_imbalanced=0.9, return_cnx=True) + train, test, stats = train_test_connex_split( + df, + groups=["cart_id", "mail", "product_id"], + fail_imbalanced=0.9, + return_cnx=True, + ) self.assertGreater(train.shape[0], 0) self.assertGreater(test.shape[0], 0) - elements = stats[1]['connex'] + elements = stats[1]["connex"] counts = Counter(elements) nbc = len(counts) maxi = max(counts.values()) @@ -33,14 +31,16 @@ def test_connex_big_approx(self): data = os.path.join(os.path.dirname(__file__), "data") name = os.path.join(data, "buggy_hash.csv") df = pandas.read_csv(name, sep="\t", encoding="utf-8") - train, test, stats = train_test_connex_split(df, - groups=[ - "cart_id", "mail", "product_id"], - stop_if_bigger=0.05, return_cnx=True, - keep_balance=0.8) + train, test, stats = train_test_connex_split( + df, + groups=["cart_id", "mail", "product_id"], + stop_if_bigger=0.05, + return_cnx=True, + keep_balance=0.8, + ) self.assertGreater(train.shape[0], 0) self.assertGreater(test.shape[0], 0) - elements = stats[1]['connex'] + elements = stats[1]["connex"] counts = Counter(elements) nbc = len(counts) maxi = max(counts.values()) @@ -51,14 +51,17 @@ def test_connex_big_approx_must(self): data = os.path.join(os.path.dirname(__file__), "data") name = os.path.join(data, "buggy_hash.csv") df = pandas.read_csv(name, sep="\t", encoding="utf-8") - train, test, stats = train_test_connex_split(df, - groups=[ - "cart_id", "mail", "product_id"], - stop_if_bigger=0.05, return_cnx=True, - keep_balance=0.8, must_groups=["product_id"]) + train, test, stats = train_test_connex_split( + df, + groups=["cart_id", "mail", "product_id"], + stop_if_bigger=0.05, + return_cnx=True, + keep_balance=0.8, + must_groups=["product_id"], + ) self.assertGreater(train.shape[0], 0) self.assertGreater(test.shape[0], 0) - elements = stats[1]['connex'] + elements = stats[1]["connex"] counts = Counter(elements) nbc = len(counts) maxi = max(counts.values()) diff --git a/_unittests/ut_df/test_connex_split_cat.py b/_unittests/ut_df/test_connex_split_cat.py index 27ed49e..3eb55e8 100644 --- a/_unittests/ut_df/test_connex_split_cat.py +++ b/_unittests/ut_df/test_connex_split_cat.py @@ -1,7 +1,5 @@ # -*- coding: utf-8 -*- -""" -@brief test log(time=4s) -""" + import unittest from collections import Counter import pandas @@ -10,63 +8,80 @@ class TestConnexSplitCat(ExtTestCase): - def test_cat_strat(self): - df = pandas.DataFrame([dict(a=1, b="e"), - dict(a=2, b="e"), - dict(a=4, b="f"), - dict(a=8, b="f"), - dict(a=32, b="f"), - dict(a=16, b="f")]) + df = pandas.DataFrame( + [ + dict(a=1, b="e"), + dict(a=2, b="e"), + dict(a=4, b="f"), + dict(a=8, b="f"), + 
dict(a=32, b="f"), + dict(a=16, b="f"), + ] + ) train, test = train_test_apart_stratify( - df, group="a", stratify="b", test_size=0.5) + df, group="a", stratify="b", test_size=0.5 + ) self.assertEqual(train.shape[1], test.shape[1]) self.assertEqual(train.shape[0] + test.shape[0], df.shape[0]) c1 = Counter(train["b"]) c2 = Counter(train["b"]) self.assertEqual(c1, c2) - self.assertRaise(lambda: train_test_apart_stratify(df, group=None, stratify="b", test_size=0.5), - ValueError) - self.assertRaise(lambda: train_test_apart_stratify(df, group="b", test_size=0.5), - ValueError) + self.assertRaise( + lambda: train_test_apart_stratify( + df, group=None, stratify="b", test_size=0.5 + ), + ValueError, + ) + self.assertRaise( + lambda: train_test_apart_stratify(df, group="b", test_size=0.5), ValueError + ) def test_cat_strat_multi(self): - df = pandas.DataFrame([dict(a=1, b="e"), - dict(a=1, b="f"), - dict(a=2, b="e"), - dict(a=2, b="f"), - ]) + df = pandas.DataFrame( + [ + dict(a=1, b="e"), + dict(a=1, b="f"), + dict(a=2, b="e"), + dict(a=2, b="f"), + ] + ) train, test = train_test_apart_stratify( - df, group="a", stratify="b", test_size=0.5) + df, group="a", stratify="b", test_size=0.5 + ) self.assertEqual(train.shape[1], test.shape[1]) self.assertEqual(train.shape[0] + test.shape[0], df.shape[0]) c1 = Counter(train["b"]) c2 = Counter(train["b"]) self.assertEqual(c1, c2) - self.assertEqual(len(set(train['a'])), 1) - self.assertEqual(len(set(test['a'])), 1) - self.assertTrue(set(train['a']) != set(test['a'])) + self.assertEqual(len(set(train["a"])), 1) + self.assertEqual(len(set(test["a"])), 1) + self.assertTrue(set(train["a"]) != set(test["a"])) def test_cat_strat_multi_force(self): - df = pandas.DataFrame([dict(a=1, b="e"), - dict(a=1, b="f"), - dict(a=2, b="e"), - dict(a=2, b="f"), - ]) + df = pandas.DataFrame( + [ + dict(a=1, b="e"), + dict(a=1, b="f"), + dict(a=2, b="e"), + dict(a=2, b="f"), + ] + ) train, test = train_test_apart_stratify( - df, group="a", stratify="b", test_size=0.1, force=True) + df, group="a", stratify="b", test_size=0.1, force=True + ) self.assertEqual(train.shape[1], test.shape[1]) self.assertEqual(train.shape[0] + test.shape[0], df.shape[0]) c1 = Counter(train["b"]) c2 = Counter(train["b"]) self.assertEqual(c1, c2) - self.assertEqual(len(set(train['a'])), 1) - self.assertEqual(len(set(test['a'])), 1) - self.assertTrue(set(train['a']) != set(test['a'])) + self.assertEqual(len(set(train["a"])), 1) + self.assertEqual(len(set(test["a"])), 1) + self.assertTrue(set(train["a"]) != set(test["a"])) if __name__ == "__main__": diff --git a/_unittests/ut_df/test_dataframe_helpers.py b/_unittests/ut_df/test_dataframe_helpers.py index 45f295d..edd0db6 100644 --- a/_unittests/ut_df/test_dataframe_helpers.py +++ b/_unittests/ut_df/test_dataframe_helpers.py @@ -1,7 +1,3 @@ -# -*- coding: utf-8 -*- -""" -@brief test log(time=4s) -""" import os import unittest import numpy @@ -11,13 +7,16 @@ class TestDataFrameHelpers(ExtTestCase): - def test_hash_columns(self): - df = pandas.DataFrame([dict(a=1, b="e", c=5.6, ind="a1", ai=1), - dict(b="f", c=5.7, ind="a2", ai=2), - dict(a=4, b="g", ind="a3", ai=3), - dict(a=8, b="h", c=5.9, ai=4), - dict(a=16, b="i", c=6.2, ind="a5", ai=5)]) + df = pandas.DataFrame( + [ + dict(a=1, b="e", c=5.6, ind="a1", ai=1), + dict(b="f", c=5.7, ind="a2", ai=2), + dict(a=4, b="g", ind="a3", ai=3), + dict(a=8, b="h", c=5.9, ai=4), + dict(a=16, b="i", c=6.2, ind="a5", ai=5), + ] + ) df2 = dataframe_hash_columns(df) self.assertEqual(df2.shape, df.shape) for j in 
range(df.shape[1]): diff --git a/_unittests/ut_df/test_dataframe_helpers_simple.py b/_unittests/ut_df/test_dataframe_helpers_simple.py index 79545c1..5d68296 100644 --- a/_unittests/ut_df/test_dataframe_helpers_simple.py +++ b/_unittests/ut_df/test_dataframe_helpers_simple.py @@ -1,7 +1,3 @@ -# -*- coding: utf-8 -*- -""" -@brief test log(time=4s) -""" import unittest import pandas import numpy @@ -11,34 +7,36 @@ class TestDataFrameHelpersSimple(ExtTestCase): - def test_unfold(self): - df = pandas.DataFrame([dict(a=1, b="e,f"), - dict(a=2, b="g"), - dict(a=3)]) + df = pandas.DataFrame([dict(a=1, b="e,f"), dict(a=2, b="g"), dict(a=3)]) df2 = dataframe_unfold(df, "b") - exp = pandas.DataFrame([dict(a=1, b="e,f", b_unfold="e"), - dict(a=1, b="e,f", b_unfold="f"), - dict(a=2, b="g", b_unfold="g"), - dict(a=3)]) + exp = pandas.DataFrame( + [ + dict(a=1, b="e,f", b_unfold="e"), + dict(a=1, b="e,f", b_unfold="f"), + dict(a=2, b="g", b_unfold="g"), + dict(a=3), + ] + ) self.assertEqualDataFrame(df2, exp) # fold - folded = df2.groupby('a').apply(lambda row: ','.join( - row['b_unfold'].dropna()) if len(row['b_unfold'].dropna()) > 0 else numpy.nan) + folded = df2.groupby("a").apply( + lambda row: ",".join(row["b_unfold"].dropna()) + if len(row["b_unfold"].dropna()) > 0 + else numpy.nan + ) bf = folded.reset_index(drop=False) - bf.columns = ['a', 'b'] + bf.columns = ["a", "b"] self.assertEqualDataFrame(df, bf) def test_hash_except(self): - self.assertRaise(lambda: hash_int(0.1, 3), - ValueError, "numpy.nan expected") + self.assertRaise(lambda: hash_int(0.1, 3), ValueError, "numpy.nan expected") r = hash_int(numpy.nan, 3) self.assertTrue(numpy.isnan(r)) - self.assertRaise(lambda: hash_str(0.1, 3), - ValueError, "numpy.nan expected") + self.assertRaise(lambda: hash_str(0.1, 3), ValueError, "numpy.nan expected") r = hash_str(numpy.nan, 3) self.assertTrue(numpy.isnan(r)) diff --git a/_unittests/ut_df/test_dataframe_io.py b/_unittests/ut_df/test_dataframe_io.py index d8e51a1..3e2125a 100644 --- a/_unittests/ut_df/test_dataframe_io.py +++ b/_unittests/ut_df/test_dataframe_io.py @@ -1,7 +1,3 @@ -# -*- coding: utf-8 -*- -""" -@brief test log(time=4s) -""" import os import unittest import io @@ -13,13 +9,16 @@ class TestDataFrameIO(ExtTestCase): - def test_zip_dataframe(self): - df = pandas.DataFrame([dict(a=1, b="eé", c=5.6, ind="a1", ai=1), - dict(b="f", c=5.7, ind="a2", ai=2), - dict(a=4, b="g", ind="a3", ai=3), - dict(a=8, b="h", c=5.9, ai=4), - dict(a=16, b="i", c=6.2, ind="a5", ai=5)]) + df = pandas.DataFrame( + [ + dict(a=1, b="eé", c=5.6, ind="a1", ai=1), + dict(b="f", c=5.7, ind="a2", ai=2), + dict(a=4, b="g", ind="a3", ai=3), + dict(a=8, b="h", c=5.9, ai=4), + dict(a=16, b="i", c=6.2, ind="a5", ai=5), + ] + ) temp = get_temp_folder(__file__, "temp_zip") name = os.path.join(temp, "df.zip") @@ -28,13 +27,13 @@ def test_zip_dataframe(self): self.assertEqualDataFrame(df, df2) st = io.BytesIO() - zp = zipfile.ZipFile(st, 'w') + zp = zipfile.ZipFile(st, "w") to_zip(df, zp, encoding="utf-8", index=False) zp.close() st = io.BytesIO(st.getvalue()) - zp = zipfile.ZipFile(st, 'r') - df3 = read_zip(zp, encoding='utf-8') + zp = zipfile.ZipFile(st, "r") + df3 = read_zip(zp, encoding="utf-8") zp.close() self.assertEqualDataFrame(df, df3) @@ -49,12 +48,12 @@ def test_zip_numpy(self): self.assertEqualArray(df, df2) st = io.BytesIO() - zp = zipfile.ZipFile(st, 'w') + zp = zipfile.ZipFile(st, "w") to_zip(df, zp, "arr.npy") zp.close() st = io.BytesIO(st.getvalue()) - zp = zipfile.ZipFile(st, 'r') + zp = 
zipfile.ZipFile(st, "r") df3 = read_zip(zp, "arr.npy") zp.close() self.assertEqualArray(df, df3) diff --git a/_unittests/ut_df/test_dataframe_io_helpers.py b/_unittests/ut_df/test_dataframe_io_helpers.py index c6102a0..403a087 100644 --- a/_unittests/ut_df/test_dataframe_io_helpers.py +++ b/_unittests/ut_df/test_dataframe_io_helpers.py @@ -1,21 +1,18 @@ -# -*- coding: utf-8 -*- -# pylint: disable=E1101 -""" -@brief test log(time=4s) -""" import unittest from io import StringIO, BytesIO from json import loads import pandas from pyquickhelper.pycode import ExtTestCase from pandas_streaming.df.dataframe_io_helpers import ( - enumerate_json_items, JsonPerRowsStream, JsonIterator2Stream) + enumerate_json_items, + JsonPerRowsStream, + JsonIterator2Stream, +) from pandas_streaming.df import StreamingDataFrame class TestDataFrameIOHelpers(ExtTestCase): - - text_json = b''' + text_json = b""" [ { "glossary": { @@ -62,28 +59,30 @@ class TestDataFrameIOHelpers(ExtTestCase): } } ] - ''' + """ text_json_exp = [ { "glossary": { "title": "example glossary", "GlossDiv": { "title": "S", - "GlossList": [{ - "GlossEntry": { - "ID": "SGML", - "SortAs": "SGML", - "GlossTerm": "Standard Generalized Markup Language", - "Acronym": "SGML", - "Abbrev": "ISO 8879:1986", - "GlossDef": { - "para": "A meta-markup language, used to create markup languages such as DocBook.", - "GlossSeeAlso": ["GML", "XML"] - }, - "GlossSee": "markup" + "GlossList": [ + { + "GlossEntry": { + "ID": "SGML", + "SortAs": "SGML", + "GlossTerm": "Standard Generalized Markup Language", + "Acronym": "SGML", + "Abbrev": "ISO 8879:1986", + "GlossDef": { + "para": "A meta-markup language, used to create markup languages such as DocBook.", + "GlossSeeAlso": ["GML", "XML"], + }, + "GlossSee": "markup", + } } - }] - } + ], + }, } }, { @@ -92,56 +91,65 @@ class TestDataFrameIOHelpers(ExtTestCase): "GlossDiv": { "title": "X", "GlossList": { - "GlossEntry": [{ - "ID": "SGML", - "SortAs": "SGML", - "GlossTerm": "Standard Generalized Markup Language", - "Acronym": "SGML", - "Abbrev": "ISO 8879:1986", - "GlossDef": { - "para": "A meta-markup language, used to create markup languages such as DocBook.", - "GlossSeeAlso": ["GML", "XML"] - }, - "GlossSee": "markup" - }] - } - } + "GlossEntry": [ + { + "ID": "SGML", + "SortAs": "SGML", + "GlossTerm": "Standard Generalized Markup Language", + "Acronym": "SGML", + "Abbrev": "ISO 8879:1986", + "GlossDef": { + "para": "A meta-markup language, used to create markup languages such as DocBook.", + "GlossSeeAlso": ["GML", "XML"], + }, + "GlossSee": "markup", + } + ] + }, + }, } - } + }, ] def test_enumerate_json_items(self): items = list(enumerate_json_items(TestDataFrameIOHelpers.text_json)) self.assertEqual(TestDataFrameIOHelpers.text_json_exp, items) - items = list(enumerate_json_items( - BytesIO(TestDataFrameIOHelpers.text_json))) + items = list(enumerate_json_items(BytesIO(TestDataFrameIOHelpers.text_json))) self.assertEqual(TestDataFrameIOHelpers.text_json_exp, items) - items = list(enumerate_json_items( - BytesIO(TestDataFrameIOHelpers.text_json))) + items = list(enumerate_json_items(BytesIO(TestDataFrameIOHelpers.text_json))) self.assertEqual(TestDataFrameIOHelpers.text_json_exp, items) def test_read_json_raw(self): - data = [{'id': 1, 'name': {'first': 'Coleen', 'last': 'Volk'}}, - {'name': {'given': 'Mose', 'family': 'Regner'}}, - {'id': 2, 'name': 'FayeRaker'}] + data = [ + {"id": 1, "name": {"first": "Coleen", "last": "Volk"}}, + {"name": {"given": "Mose", "family": "Regner"}}, + {"id": 2, "name": 
"FayeRaker"}, + ] exp = """[{"id":1.0,"name":null,"name.family":null,"name.first":"Coleen","name.given":null,"name.last":"Volk"}, {"id":null,"name":null,"name.family":"Regner","name.first":null,"name.given":"Mose","name.last":null}, {"id":2.0,"name":"FayeRaker","name.family":null,"name.first":null, - "name.given":null,"name.last":null}]""".replace(" ", "").replace("\n", "") - self.assertRaise(lambda: StreamingDataFrame.read_json( - data), NotImplementedError) + "name.given":null,"name.last":null}]""".replace( + " ", "" + ).replace( + "\n", "" + ) + self.assertRaise( + lambda: StreamingDataFrame.read_json(data), NotImplementedError + ) it = StreamingDataFrame.read_json(data, flatten=True) dfs = list(it) self.assertEqual(len(dfs), 1) - js = dfs[0].to_json(orient='records') + js = dfs[0].to_json(orient="records") js_read = loads(js) js_exp = loads(exp) self.assertEqual(js_exp, js_read) def test_read_json_raw_head(self): - data = [{'id': 1, 'name': {'first': 'Coleen', 'last': 'Volk'}}, - {'name': {'given': 'Mose', 'family': 'Regner'}}, - {'id': 2, 'name': 'FayeRaker'}] + data = [ + {"id": 1, "name": {"first": "Coleen", "last": "Volk"}}, + {"name": {"given": "Mose", "family": "Regner"}}, + {"id": 2, "name": "FayeRaker"}, + ] it = StreamingDataFrame.read_json(data, flatten=True, chunksize=1) h1 = it.head() h2 = it.head() @@ -150,36 +158,36 @@ def test_read_json_raw_head(self): self.assertGreater(h2.shape[0], 1) def test_pandas_json_chunksize(self): - jsonl = '''{"a": 1, "b": 2} - {"a": 3, "b": 4}''' + jsonl = """{"a": 1, "b": 2} + {"a": 3, "b": 4}""" df = pandas.read_json(jsonl, lines=True) idf = pandas.read_json(jsonl, lines=True, chunksize=2) ldf = list(idf) self.assertEqualDataFrame(df, ldf[0]) def test_read_json_rows(self): - data = '''{"a": 1, "b": 2} - {"a": 3, "b": 4}''' + data = """{"a": 1, "b": 2} + {"a": 3, "b": 4}""" it = StreamingDataFrame.read_json(StringIO(data), lines=True) dfs = list(it) self.assertEqual(len(dfs), 1) - js = dfs[0].to_json(orient='records') + js = dfs[0].to_json(orient="records") self.assertEqual(js, '[{"a":1,"b":2},{"a":3,"b":4}]') def test_read_json_rows2(self): - data = b'''{"a": 1, "b": 2} - {"a": 3, "b": 4}''' + data = b"""{"a": 1, "b": 2} + {"a": 3, "b": 4}""" dfs = pandas.read_json(BytesIO(data), lines=True) self.assertEqual(dfs.shape, (2, 2)) it = StreamingDataFrame.read_json(BytesIO(data), lines="stream") dfs = list(it) self.assertEqual(len(dfs), 1) - js = dfs[0].to_json(orient='records') + js = dfs[0].to_json(orient="records") self.assertEqual('[{"a":1,"b":2},{"a":3,"b":4}]', js) def test_read_json_rows2_head(self): - data = b'''{"a": 1, "b": 2} - {"a": 3, "b": 4}''' + data = b"""{"a": 1, "b": 2} + {"a": 3, "b": 4}""" dfs = pandas.read_json(BytesIO(data), lines=True) self.assertEqual(dfs.shape, (2, 2)) it = StreamingDataFrame.read_json(BytesIO(data), lines="stream") @@ -190,8 +198,8 @@ def test_read_json_rows2_head(self): self.assertEqualDataFrame(h1, h2) def test_read_json_rows_file_head(self): - data = self.abs_path_join(__file__, 'data', 'example2.json') - dfs = pandas.read_json(data, orient='records') + data = self.abs_path_join(__file__, "data", "example2.json") + dfs = pandas.read_json(data, orient="records") self.assertEqual(dfs.shape, (2, 2)) it = StreamingDataFrame.read_json(data) h1 = it.head() @@ -201,8 +209,8 @@ def test_read_json_rows_file_head(self): self.assertEqualDataFrame(h1, h2) def test_read_json_rows_file_lines_head(self): - data = self.abs_path_join(__file__, 'data', 'example.json') - dfs = pandas.read_json(data, orient='records', 
lines=True) + data = self.abs_path_join(__file__, "data", "example.json") + dfs = pandas.read_json(data, orient="records", lines=True) self.assertEqual(dfs.shape, (2, 2)) it = StreamingDataFrame.read_json(data, lines="stream") h1 = it.head() @@ -212,12 +220,11 @@ def test_read_json_rows_file_lines_head(self): self.assertEqualDataFrame(h1, h2) def test_read_json_ijson(self): - it = StreamingDataFrame.read_json( - BytesIO(TestDataFrameIOHelpers.text_json)) + it = StreamingDataFrame.read_json(BytesIO(TestDataFrameIOHelpers.text_json)) dfs = list(it) self.assertEqual(len(dfs), 1) - js = dfs[0].to_json(orient='records', lines=True) - jsjson = loads('[' + js.replace("\n", ",").strip(',') + ']') + js = dfs[0].to_json(orient="records", lines=True) + jsjson = loads("[" + js.replace("\n", ",").strip(",") + "]") self.assertEqual(jsjson, TestDataFrameIOHelpers.text_json_exp) def test_read_json_stream(self): @@ -239,33 +246,39 @@ def test_read_json_stream(self): self.assertEqual(val, exp) def test_enumerate_json_items_lines(self): - data = b'''{"a": 1, "b": 2} - {"a": 3, "b": 4}''' + data = b"""{"a": 1, "b": 2} + {"a": 3, "b": 4}""" items = list(enumerate_json_items(data, lines=True)) - self.assertEqual(items, [{'a': 1, 'b': 2}, {'a': 3, 'b': 4}]) + self.assertEqual(items, [{"a": 1, "b": 2}, {"a": 3, "b": 4}]) def test_read_json_file2(self): - data = b'''{"a": {"c": 1}, "b": [2, 3]} - {"a": {"a": 3}, "b": [4, 5, "r"]}''' + data = b"""{"a": {"c": 1}, "b": [2, 3]} + {"a": {"a": 3}, "b": [4, 5, "r"]}""" - obj1 = list(enumerate_json_items( - BytesIO(data), flatten=False, lines=True)) - obj2 = list(enumerate_json_items( - BytesIO(data), flatten=True, lines=True)) + obj1 = list(enumerate_json_items(BytesIO(data), flatten=False, lines=True)) + obj2 = list(enumerate_json_items(BytesIO(data), flatten=True, lines=True)) self.assertNotEqual(obj1, obj2) - self.assertEqual(obj2, [{'a_c': 1, 'b_0': 2, 'b_1': 3}, - {'a_a': 3, 'b_0': 4, 'b_1': 5, 'b_2': 'r'}]) + self.assertEqual( + obj2, + [ + {"a_c": 1, "b_0": 2, "b_1": 3}, + {"a_a": 3, "b_0": 4, "b_1": 5, "b_2": "r"}, + ], + ) - it = StreamingDataFrame.read_json( - BytesIO(data), lines="stream", flatten=True) + it = StreamingDataFrame.read_json(BytesIO(data), lines="stream", flatten=True) dfs = list(it) - self.assertEqual(['a_a', 'a_c', 'b_0', 'b_1', 'b_2'], - list(sorted(dfs[0].columns)), ) + self.assertEqual( + ["a_a", "a_c", "b_0", "b_1", "b_2"], + list(sorted(dfs[0].columns)), + ) self.assertEqual(len(dfs), 1) - js = dfs[0].to_json(orient='records', lines=True) - jsjson = loads('[' + js.replace("\n", ",").strip(',') + ']') - exp = [{'a_a': None, 'a_c': 1.0, 'b_0': 2, 'b_1': 3, 'b_2': None}, - {'a_a': 3.0, 'a_c': None, 'b_0': 4, 'b_1': 5, 'b_2': 'r'}] + js = dfs[0].to_json(orient="records", lines=True) + jsjson = loads("[" + js.replace("\n", ",").strip(",") + "]") + exp = [ + {"a_a": None, "a_c": 1.0, "b_0": 2, "b_1": 3, "b_2": None}, + {"a_a": 3.0, "a_c": None, "b_0": 4, "b_1": 5, "b_2": "r"}, + ] self.assertEqual(exp, jsjson) def test_read_json_item(self): @@ -282,18 +295,19 @@ def test_read_json_item(self): def test_bug_documentation(self): items = [] for item in JsonIterator2Stream( - lambda: enumerate_json_items(TestDataFrameIOHelpers.text_json)): + lambda: enumerate_json_items(TestDataFrameIOHelpers.text_json) + ): items.append(item) self.assertEqual(len(items), 2) def test_read_json_classic(self): - data = self.abs_path_join(__file__, 'data', 'classic.json') - dfs = pandas.read_json(data, orient='records') - dfs['ts2'] = dfs['ts'].apply(lambda t: t / 1e9) 
+ data = self.abs_path_join(__file__, "data", "classic.json") + dfs = pandas.read_json(data, orient="records") + dfs["ts2"] = dfs["ts"].apply(lambda t: t / 1e9) self.assertEqual(dfs.shape[1], 9) self.assertGreater(dfs.shape[0], 2) it = StreamingDataFrame.read_json(data) - it['ts2'] = it['ts'].apply(lambda t: t / 1e9) + it["ts2"] = it["ts"].apply(lambda t: t / 1e9) h1 = it.to_df() h2 = it.to_df() self.assertNotEmpty(h1) @@ -302,12 +316,12 @@ def test_read_json_classic(self): self.assertEqual(h1.shape[1], 9) def test_read_json_classic_file(self): - data = self.abs_path_join(__file__, 'data', 'classic.json') - dfs = pandas.read_json(data, orient='records') + data = self.abs_path_join(__file__, "data", "classic.json") + dfs = pandas.read_json(data, orient="records") self.assertEqual(dfs.shape[1], 8) self.assertGreater(dfs.shape[0], 2) with open(data, "r", encoding="utf-8") as f: - it = StreamingDataFrame.read_json(f, orient='records') + it = StreamingDataFrame.read_json(f, orient="records") h1 = it.to_df() h2 = it.to_df() self.assertNotEmpty(h1) @@ -316,14 +330,14 @@ def test_read_json_classic_file(self): self.assertEqual(h1.shape[1], 8) def test_read_json_classic_file_formula(self): - data = self.abs_path_join(__file__, 'data', 'classic.json') - dfs = pandas.read_json(data, orient='records') - dfs['ts2'] = dfs['ts'].apply(lambda t: t / 1e9) + data = self.abs_path_join(__file__, "data", "classic.json") + dfs = pandas.read_json(data, orient="records") + dfs["ts2"] = dfs["ts"].apply(lambda t: t / 1e9) self.assertEqual(dfs.shape[1], 9) self.assertGreater(dfs.shape[0], 2) with open(data, "r", encoding="utf-8") as f: it = StreamingDataFrame.read_json(f) - it['ts2'] = it['ts'].apply(lambda t: t / 1e9) + it["ts2"] = it["ts"].apply(lambda t: t / 1e9) h1 = it.to_df() h2 = it.to_df() self.assertNotEmpty(h1) diff --git a/_unittests/ut_df/test_dataframe_sort.py b/_unittests/ut_df/test_dataframe_sort.py index d6f1202..354e4d5 100644 --- a/_unittests/ut_df/test_dataframe_sort.py +++ b/_unittests/ut_df/test_dataframe_sort.py @@ -1,7 +1,3 @@ -# -*- coding: utf-8 -*- -""" -@brief test log(time=4s) -""" import os import unittest import pandas @@ -10,15 +6,18 @@ class TestDataFrameSort(ExtTestCase): - def test_sort_values(self): temp = get_temp_folder(__file__, "temp_sort_values") name = os.path.join(temp, "_data_") - df = pandas.DataFrame([dict(a=1, b="eé", c=5.6, ind="a1", ai=1), - dict(a=5, b="f", c=5.7, ind="a2", ai=2), - dict(a=4, b="g", ind="a3", ai=3), - dict(a=8, b="h", c=5.9, ai=4), - dict(a=16, b="i", c=6.2, ind="a5", ai=5)]) + df = pandas.DataFrame( + [ + dict(a=1, b="eé", c=5.6, ind="a1", ai=1), + dict(a=5, b="f", c=5.7, ind="a2", ai=2), + dict(a=4, b="g", ind="a3", ai=3), + dict(a=8, b="h", c=5.9, ai=4), + dict(a=16, b="i", c=6.2, ind="a5", ai=5), + ] + ) sdf = StreamingDataFrame.read_df(df, chunksize=2) sorted_df = df.sort_values(by="a") res = sdf.sort_values(by="a", temp_file=name) @@ -28,11 +27,15 @@ def test_sort_values(self): def test_sort_values_twice(self): temp = get_temp_folder(__file__, "temp_sort_values_twice") name = os.path.join(temp, "_data_") - df = pandas.DataFrame([dict(a=1, b="eé", c=5.6, ind="a1", ai=1), - dict(a=5, b="f", c=5.7, ind="a2", ai=2), - dict(a=4, b="g", ind="a3", ai=3), - dict(a=8, b="h", c=5.9, ai=4), - dict(a=16, b="i", c=6.2, ind="a5", ai=5)]) + df = pandas.DataFrame( + [ + dict(a=1, b="eé", c=5.6, ind="a1", ai=1), + dict(a=5, b="f", c=5.7, ind="a2", ai=2), + dict(a=4, b="g", ind="a3", ai=3), + dict(a=8, b="h", c=5.9, ai=4), + dict(a=16, b="i", c=6.2, ind="a5", 
ai=5), + ] + ) sdf = StreamingDataFrame.read_df(df, chunksize=2) sorted_df = df.sort_values(by="a") res = sdf.sort_values(by="a", temp_file=name) @@ -44,11 +47,15 @@ def test_sort_values_twice(self): def test_sort_values_reverse(self): temp = get_temp_folder(__file__, "temp_sort_values_reverse") name = os.path.join(temp, "_data_") - df = pandas.DataFrame([dict(a=1, b="eé", c=5.6, ind="a1", ai=1), - dict(a=5, b="f", c=5.7, ind="a2", ai=2), - dict(a=4, b="g", ind="a3", ai=3), - dict(a=8, b="h", c=5.9, ai=4), - dict(a=16, b="i", c=6.2, ind="a5", ai=5)]) + df = pandas.DataFrame( + [ + dict(a=1, b="eé", c=5.6, ind="a1", ai=1), + dict(a=5, b="f", c=5.7, ind="a2", ai=2), + dict(a=4, b="g", ind="a3", ai=3), + dict(a=8, b="h", c=5.9, ai=4), + dict(a=16, b="i", c=6.2, ind="a5", ai=5), + ] + ) sdf = StreamingDataFrame.read_df(df, chunksize=2) sorted_df = df.sort_values(by="a", ascending=False) res = sdf.sort_values(by="a", temp_file=name, ascending=False) @@ -58,30 +65,38 @@ def test_sort_values_reverse(self): def test_sort_values_nan_last(self): temp = get_temp_folder(__file__, "temp_sort_values_nan_last") name = os.path.join(temp, "_data_") - df = pandas.DataFrame([dict(a=1, b="eé", c=5.6, ind="a1", ai=1), - dict(b="f", c=5.7, ind="a2", ai=2), - dict(b="f", c=5.8, ind="a2", ai=2), - dict(a=4, b="g", ind="a3", ai=3), - dict(a=8, b="h", c=5.9, ai=4), - dict(a=16, b="i", c=6.2, ind="a5", ai=5)]) + df = pandas.DataFrame( + [ + dict(a=1, b="eé", c=5.6, ind="a1", ai=1), + dict(b="f", c=5.7, ind="a2", ai=2), + dict(b="f", c=5.8, ind="a2", ai=2), + dict(a=4, b="g", ind="a3", ai=3), + dict(a=8, b="h", c=5.9, ai=4), + dict(a=16, b="i", c=6.2, ind="a5", ai=5), + ] + ) sdf = StreamingDataFrame.read_df(df, chunksize=2) - sorted_df = df.sort_values(by="a", na_position='last') - res = sdf.sort_values(by="a", temp_file=name, na_position='last') + sorted_df = df.sort_values(by="a", na_position="last") + res = sdf.sort_values(by="a", temp_file=name, na_position="last") res_df = res.to_df() self.assertEqualDataFrame(sorted_df, res_df) def test_sort_values_nan_first(self): temp = get_temp_folder(__file__, "temp_sort_values_nan_first") name = os.path.join(temp, "_data_") - df = pandas.DataFrame([dict(a=1, b="eé", c=5.6, ind="a1", ai=1), - dict(b="f", c=5.7, ind="a2", ai=2), - dict(b="f", c=5.8, ind="a2", ai=2), - dict(a=4, b="g", ind="a3", ai=3), - dict(a=8, b="h", c=5.9, ai=4), - dict(a=16, b="i", c=6.2, ind="a5", ai=5)]) + df = pandas.DataFrame( + [ + dict(a=1, b="eé", c=5.6, ind="a1", ai=1), + dict(b="f", c=5.7, ind="a2", ai=2), + dict(b="f", c=5.8, ind="a2", ai=2), + dict(a=4, b="g", ind="a3", ai=3), + dict(a=8, b="h", c=5.9, ai=4), + dict(a=16, b="i", c=6.2, ind="a5", ai=5), + ] + ) sdf = StreamingDataFrame.read_df(df, chunksize=2) - sorted_df = df.sort_values(by="a", na_position='first') - res = sdf.sort_values(by="a", temp_file=name, na_position='first') + sorted_df = df.sort_values(by="a", na_position="first") + res = sdf.sort_values(by="a", temp_file=name, na_position="first") res_df = res.to_df() self.assertEqualDataFrame(sorted_df, res_df) diff --git a/_unittests/ut_df/test_pandas_groupbynan.py b/_unittests/ut_df/test_pandas_groupbynan.py index 94482d5..3d9a635 100644 --- a/_unittests/ut_df/test_pandas_groupbynan.py +++ b/_unittests/ut_df/test_pandas_groupbynan.py @@ -1,7 +1,3 @@ -# coding: utf-8 -""" -@brief test log(time=1s) -""" import unittest import pandas import numpy @@ -11,19 +7,18 @@ class TestPandasHelper(ExtTestCase): - def test_pandas_groupbynan(self): self.assertTrue(sparse_lsqr is not None) - 
types = [(int, -10), (float, -20.2), (str, "e"), - (bytes, bytes("a", "ascii"))] + types = [(int, -10), (float, -20.2), (str, "e"), (bytes, bytes("a", "ascii"))] skip = (numpy.bool_, numpy.complex64, numpy.complex128) types += [(_, _(5)) for _ in numpy_types() if _ not in skip] for ty in types: - data = [{"this": "cst", "type": "tt1=" + str(ty[0]), "value": ty[1]}, - {"this": "cst", "type": "tt2=" + - str(ty[0]), "value": ty[1]}, - {"this": "cst", "type": "row_for_nan"}] + data = [ + {"this": "cst", "type": "tt1=" + str(ty[0]), "value": ty[1]}, + {"this": "cst", "type": "tt2=" + str(ty[0]), "value": ty[1]}, + {"this": "cst", "type": "row_for_nan"}, + ] df = pandas.DataFrame(data) gr = pandas_groupby_nan(df, "value") co = gr.sum() @@ -37,13 +32,16 @@ def test_pandas_groupbynan(self): except AssertionError as e: raise AssertionError( "Issue with value {}\n--df--\n{}\n--gr--\n{}\n--co--\n{}".format( - li, df, gr.count(), co)) from e + li, df, gr.count(), co + ) + ) from e for ty in types: - data = [{"this": "cst", "type": "tt1=" + str(ty[0]), "value": ty[1]}, - {"this": "cst", "type": "tt2=" + - str(ty[0]), "value": ty[1]}, - {"this": "cst", "type": "row_for_nan"}] + data = [ + {"this": "cst", "type": "tt1=" + str(ty[0]), "value": ty[1]}, + {"this": "cst", "type": "tt2=" + str(ty[0]), "value": ty[1]}, + {"this": "cst", "type": "row_for_nan"}, + ] df = pandas.DataFrame(data) try: gr = pandas_groupby_nan(df, ("value", "this")) @@ -68,8 +66,12 @@ def test_pandas_groupbynan(self): self.assertEqual(len(li), 2) def test_pandas_groupbynan_tuple(self): - data = [dict(a="a", b="b", c="c", n=1), dict( - b="b", n=2), dict(a="a", n=3), dict(c="c", n=4)] + data = [ + dict(a="a", b="b", c="c", n=1), + dict(b="b", n=2), + dict(a="a", n=3), + dict(c="c", n=4), + ] df = pandas.DataFrame(data) gr = df.groupby(["a", "b", "c"]).sum() self.assertEqual(gr.shape, (1, 1)) @@ -77,7 +79,8 @@ def test_pandas_groupbynan_tuple(self): for nanback in [True, False]: try: gr2_ = pandas_groupby_nan( - df, ["a", "b", "c"], nanback=nanback, suffix="NAN") + df, ["a", "b", "c"], nanback=nanback, suffix="NAN" + ) except NotImplementedError: continue gr2 = gr2_.sum().sort_values("n") @@ -101,36 +104,42 @@ def test_pandas_groupbynan_regular_nanback(self): self.assertEqual(len(gr), 1) def test_pandas_groupbynan_doc(self): - data = [dict(a=2, ind="a", n=1), - dict(a=2, ind="a"), - dict(a=3, ind="b"), - dict(a=30)] + data = [ + dict(a=2, ind="a", n=1), + dict(a=2, ind="a"), + dict(a=3, ind="b"), + dict(a=30), + ] df = pandas.DataFrame(data) gr2 = pandas_groupby_nan(df, ["ind"]).sum() - ind = list(gr2['ind']) + ind = list(gr2["ind"]) self.assertTrue(numpy.isnan(ind[-1])) - val = list(gr2['a']) + val = list(gr2["a"]) self.assertEqual(val[-1], 30) @ignore_warnings(UserWarning) def test_pandas_groupbynan_doc2(self): - data = [dict(a=2, ind="a", n=1), - dict(a=2, ind="a"), - dict(a=3, ind="b"), - dict(a=30)] + data = [ + dict(a=2, ind="a", n=1), + dict(a=2, ind="a"), + dict(a=3, ind="b"), + dict(a=30), + ] df = pandas.DataFrame(data) gr2 = pandas_groupby_nan(df, ["ind", "a"], nanback=False).sum() - ind = list(gr2['ind']) + ind = list(gr2["ind"]) self.assertEqual(ind[-1], "²nan") def test_pandas_groupbynan_doc3(self): - data = [dict(a=2, ind="a", n=1), - dict(a=2, ind="a"), - dict(a=3, ind="b"), - dict(a=30)] + data = [ + dict(a=2, ind="a", n=1), + dict(a=2, ind="a"), + dict(a=3, ind="b"), + dict(a=30), + ] df = pandas.DataFrame(data) gr2 = pandas_groupby_nan(df, ["ind", "n"]).sum() - ind = list(gr2['ind']) + ind = list(gr2["ind"]) 
self.assertTrue(numpy.isnan(ind[-1])) diff --git a/_unittests/ut_df/test_streaming_dataframe.py b/_unittests/ut_df/test_streaming_dataframe.py index 11fdc51..b62f9a5 100644 --- a/_unittests/ut_df/test_streaming_dataframe.py +++ b/_unittests/ut_df/test_streaming_dataframe.py @@ -1,21 +1,15 @@ -# -*- coding: utf-8 -*- -""" -@brief test log(time=4s) -""" import os import unittest from io import StringIO import pandas import numpy -from pyquickhelper.pycode import ( - ExtTestCase, get_temp_folder, ignore_warnings) +from pyquickhelper.pycode import ExtTestCase, get_temp_folder, ignore_warnings from pandas_streaming.data import dummy_streaming_dataframe from pandas_streaming.df import StreamingDataFrame from pandas_streaming.df.dataframe import StreamingDataFrameSchemaError class TestStreamingDataFrame(ExtTestCase): - def test_shape(self): sdf = dummy_streaming_dataframe(100) dfs = list(sdf) @@ -34,11 +28,9 @@ def test_init(self): def test_to_csv(self): sdf = dummy_streaming_dataframe(100) st = sdf.to_csv() - self.assertStartsWith(",cint,cstr\n0,0,s0", - st.replace('\r', '')) + self.assertStartsWith(",cint,cstr\n0,0,s0", st.replace("\r", "")) st = sdf.to_csv() - self.assertStartsWith(",cint,cstr\n0,0,s0", - st.replace('\r', '')) + self.assertStartsWith(",cint,cstr\n0,0,s0", st.replace("\r", "")) def test_iterrows(self): sdf = dummy_streaming_dataframe(100) @@ -74,43 +66,42 @@ def test_read_csv(self): sdf = StreamingDataFrame.read_csv(name) text = sdf.to_csv(index=False) self.assertRaise( - lambda: StreamingDataFrame.read_csv( - name2, index_col=0, chunksize=None), - ValueError) + lambda: StreamingDataFrame.read_csv(name2, index_col=0, chunksize=None), + ValueError, + ) self.assertRaise( - lambda: StreamingDataFrame.read_csv( - name2, index_col=0, iterator=False), - ValueError) + lambda: StreamingDataFrame.read_csv(name2, index_col=0, iterator=False), + ValueError, + ) sdf2 = StreamingDataFrame.read_csv(name2, index_col=0) text2 = sdf2.to_csv(index=True) sdf2.to_csv(name3, index=True) - with open(name, "r", encoding='utf-8') as f: + with open(name, "r", encoding="utf-8") as f: exp = f.read() - with open(name2, "r", encoding='utf-8') as f: + with open(name2, "r", encoding="utf-8") as f: exp2 = f.read() - with open(name3, "r", encoding='utf-8') as f: + with open(name3, "r", encoding="utf-8") as f: text3 = f.read() - self.assertEqual(text.replace('\r', ''), exp) + self.assertEqual(text.replace("\r", ""), exp) sdf2 = StreamingDataFrame.read_df(df) self.assertEqualDataFrame(sdf.to_dataframe(), sdf2.to_dataframe()) - self.assertEqual(text2.replace('\r', ''), exp2) - self.assertEqual(text3.replace('\r', '').replace('\n\n', '\n'), - exp2.replace('\r', '')) + self.assertEqual(text2.replace("\r", ""), exp2) + self.assertEqual( + text3.replace("\r", "").replace("\n\n", "\n"), exp2.replace("\r", "") + ) def test_where(self): sdf = dummy_streaming_dataframe(100) cols = sdf.columns - self.assertEqual(list(cols), ['cint', 'cstr']) + self.assertEqual(list(cols), ["cint", "cstr"]) dts = sdf.dtypes self.assertEqual(len(dts), 2) res = sdf.where(lambda row: row["cint"] == 1) st = res.to_csv() - self.assertStartsWith(",cint,cstr\n0,,\n1,1.0,s1", - st.replace('\r', '')) + self.assertStartsWith(",cint,cstr\n0,,\n1,1.0,s1", st.replace("\r", "")) res = sdf.where(lambda row: row["cint"] == 1) st = res.to_csv() - self.assertStartsWith(",cint,cstr\n0,,\n1,1.0,s1", - st.replace('\r', '')) + self.assertStartsWith(",cint,cstr\n0,,\n1,1.0,s1", st.replace("\r", "")) def test_dataframe(self): sdf = dummy_streaming_dataframe(100) 
@@ -144,10 +135,12 @@ def test_sample_reservoir_cache(self): df2 = res.to_df() self.assertEqualDataFrame(df1, df2) self.assertEqual(df1.shape, (10, res.shape[1])) - self.assertRaise(lambda: sdf.sample(n=10, cache=False, reservoir=True), - ValueError) - self.assertRaise(lambda: sdf.sample(frac=0.1, cache=True, reservoir=True), - ValueError) + self.assertRaise( + lambda: sdf.sample(n=10, cache=False, reservoir=True), ValueError + ) + self.assertRaise( + lambda: sdf.sample(frac=0.1, cache=True, reservoir=True), ValueError + ) def test_apply(self): sdf = dummy_streaming_dataframe(100) @@ -157,19 +150,18 @@ def test_apply(self): sdf = sdf.apply(lambda row: row[["cint"]] + "r", axis=1) self.assertNotEmpty(list(sdf)) text = sdf.to_csv(header=False) - self.assertStartsWith("0,0r\n1,1r\n2,2r\n3,3r", - text.replace('\r', '')) + self.assertStartsWith("0,0r\n1,1r\n2,2r\n3,3r", text.replace("\r", "")) def test_train_test_split(self): sdf = dummy_streaming_dataframe(100) tr, te = sdf.train_test_split(index=False, streaming=False) self.assertRaise( - lambda: StreamingDataFrame.read_str(tr, chunksize=None), - ValueError) + lambda: StreamingDataFrame.read_str(tr, chunksize=None), ValueError + ) self.assertRaise( - lambda: StreamingDataFrame.read_str(tr, iterator=False), - ValueError) - StreamingDataFrame.read_str(tr.encode('utf-8')) + lambda: StreamingDataFrame.read_str(tr, iterator=False), ValueError + ) + StreamingDataFrame.read_str(tr.encode("utf-8")) trsdf = StreamingDataFrame.read_str(tr) tesdf = StreamingDataFrame.read_str(te) trdf = trsdf.to_dataframe() @@ -183,7 +175,8 @@ def test_train_test_split(self): def test_train_test_split_streaming(self): sdf = dummy_streaming_dataframe(100, asfloat=True) trsdf, tesdf = sdf.train_test_split( - streaming=True, unique_rows=True, partitions=[0.7, 0.3]) + streaming=True, unique_rows=True, partitions=[0.7, 0.3] + ) trdf = trsdf.to_dataframe() tedf = tesdf.to_dataframe() df_exp = sdf.to_dataframe() @@ -228,10 +221,12 @@ def test_train_test_split_streaming_tiny(self): self.assertEqualDataFrame(df1, df2) def test_train_test_split_streaming_strat(self): - sdf = dummy_streaming_dataframe(100, asfloat=True, - tify=["t1" if i % 3 else "t0" for i in range(0, 100)]) + sdf = dummy_streaming_dataframe( + 100, asfloat=True, tify=["t1" if i % 3 else "t0" for i in range(0, 100)] + ) trsdf, tesdf = sdf.train_test_split( - streaming=True, unique_rows=True, stratify="tify") + streaming=True, unique_rows=True, stratify="tify" + ) trdf = trsdf.to_dataframe() tedf = tesdf.to_dataframe() df_exp = sdf.to_dataframe() @@ -250,12 +245,11 @@ def test_train_test_split_streaming_strat(self): tegr = tedf.groupby("tify").count() tegr["part"] = 1 gr = pandas.concat([trgr, tegr]) - self.assertGreater(gr['cfloat'].min(), 4) + self.assertGreater(gr["cfloat"].min(), 4) def test_train_test_split_file(self): temp = get_temp_folder(__file__, "temp_train_test_split_file") - names = [os.path.join(temp, "train.txt"), - os.path.join(temp, "test.txt")] + names = [os.path.join(temp, "train.txt"), os.path.join(temp, "test.txt")] sdf = dummy_streaming_dataframe(100) sdf.train_test_split(names, index=False, streaming=False) trsdf = StreamingDataFrame.read_csv(names[0]) @@ -276,8 +270,10 @@ def test_train_test_split_file_pattern(self): temp = get_temp_folder(__file__, "temp_train_test_split_file_pattern") sdf = dummy_streaming_dataframe(100) names = os.path.join(temp, "spl_{0}.txt") - self.assertRaise(lambda: sdf.train_test_split( - names, index=False, streaming=False), ValueError) + self.assertRaise( + 
lambda: sdf.train_test_split(names, index=False, streaming=False), + ValueError, + ) names = os.path.join(temp, "spl_{}.txt") tr, te = sdf.train_test_split(names, index=False, streaming=False) trsdf = StreamingDataFrame.read_csv(tr) @@ -297,8 +293,9 @@ def compares(a, b, how): da = a.to_dataframe() db = b.to_dataframe() exp = da.merge(db, on="cint", indicator=True) - self.assertEqualDataFrame(dm.reset_index(drop=True), - exp.reset_index(drop=True)) + self.assertEqualDataFrame( + dm.reset_index(drop=True), exp.reset_index(drop=True) + ) sdf20 = dummy_streaming_dataframe(20) sdf30 = dummy_streaming_dataframe(30) @@ -332,11 +329,17 @@ def test_concatv(self): self.assertEqualDataFrame(m1.to_dataframe(), df) df30["g"] = 4 - self.assertRaise(lambda: sdf20.concat(df30).to_dataframe(), - ValueError, "Frame others[0] do not have the same column names") + self.assertRaise( + lambda: sdf20.concat(df30).to_dataframe(), + ValueError, + "Frame others[0] do not have the same column names", + ) df20["cint"] = df20["cint"].astype(float) - self.assertRaise(lambda: sdf20.concat(df20).to_dataframe(), - ValueError, "Frame others[0] do not have the same column types") + self.assertRaise( + lambda: sdf20.concat(df20).to_dataframe(), + ValueError, + "Frame others[0] do not have the same column types", + ) def test_concath(self): sdf20 = dummy_streaming_dataframe(20) @@ -349,8 +352,9 @@ def test_concath(self): self.assertEqualDataFrame(m1.to_dataframe(), df) sdf22 = dummy_streaming_dataframe(22) sdf25 = dummy_streaming_dataframe(25) - self.assertRaise(lambda: sdf22.concat(sdf25, axis=1).to_dataframe(), - RuntimeError) + self.assertRaise( + lambda: sdf22.concat(sdf25, axis=1).to_dataframe(), RuntimeError + ) def test_groupby(self): df20 = dummy_streaming_dataframe(20).to_dataframe() @@ -359,14 +363,19 @@ def test_groupby(self): gr = sdf20.groupby("key", lambda gr: gr.sum()) gr2 = df20.groupby("key").sum() self.assertEqualDataFrame(gr, gr2) - self.assertRaise(lambda: sdf20.groupby( - "key", in_memory=False), NotImplementedError) + self.assertRaise( + lambda: sdf20.groupby("key", in_memory=False), NotImplementedError + ) # Do not replace lambda c:sum(c) by sum or... 
- # pandas.core.base.SpecificationError: Function names must be unique, found multiple named sum - gr2 = df20.drop("cstr", axis=1).groupby("key").agg([numpy.sum, lambda c:sum(c)]) - gr = sdf20.drop("cstr", axis=1).groupby("key", lambda gr: gr.agg( - [numpy.sum, lambda c:sum(c)])) + # pandas.core.base.SpecificationError: Function names + # must be unique, found multiple named sum + gr2 = ( + df20.drop("cstr", axis=1).groupby("key").agg([numpy.sum, lambda c: sum(c)]) + ) + gr = sdf20.drop("cstr", axis=1).groupby( + "key", lambda gr: gr.agg([numpy.sum, lambda c: sum(c)]) + ) self.assertEqualDataFrame(gr, gr2) gr = sdf20.groupby("key", lambda gr: gr.count()) @@ -384,7 +393,8 @@ def test_groupby_cum(self): df20["key"] = df20["cint"].apply(lambda i: i % 3 == 0) sdf20 = StreamingDataFrame.read_df(df20, chunksize=5) sgr = sdf20.groupby_streaming( - "key", lambda gr: gr.sum(), strategy='cum', as_index=False) + "key", lambda gr: gr.sum(), strategy="cum", as_index=False + ) gr2 = df20.groupby("key", as_index=False).sum() lastgr = None for gr in sgr: @@ -397,7 +407,8 @@ def test_groupby_streaming(self): df20["key"] = df20["cint"].apply(lambda i: i % 3 == 0) sdf20 = StreamingDataFrame.read_df(df20, chunksize=5) sgr = sdf20.groupby_streaming( - "key", lambda gr: gr.sum(), strategy='streaming', as_index=False) + "key", lambda gr: gr.sum(), strategy="streaming", as_index=False + ) gr2 = df20.groupby("key", as_index=False).sum() grs = list(sgr) gr = pandas.concat(grs).groupby("key", as_index=False).sum() @@ -408,7 +419,8 @@ def test_groupby_cum_asindex(self): df20["key"] = df20["cint"].apply(lambda i: i % 3 == 0) sdf20 = StreamingDataFrame.read_df(df20, chunksize=5) sgr = sdf20.groupby_streaming( - "key", lambda gr: gr.sum(), strategy='cum', as_index=True) + "key", lambda gr: gr.sum(), strategy="cum", as_index=True + ) gr2 = df20.groupby("key", as_index=True).sum() lastgr = None for gr in sgr: @@ -426,13 +438,21 @@ def test_merge_2(self): m = pandas.DataFrame(dict(Y=["a", "b"], Z=[10, 20])) jm = df2.merge(m, left_on="Y", right_on="Y", how="outer") sjm = sdf2.merge(m, left_on="Y", right_on="Y", how="outer") - self.assertEqualDataFrame(jm.sort_values(["X", "Y"]).reset_index(drop=True), - sjm.to_dataframe().sort_values(["X", "Y"]).reset_index(drop=True)) + self.assertEqualDataFrame( + jm.sort_values(["X", "Y"]).reset_index(drop=True), + sjm.to_dataframe().sort_values(["X", "Y"]).reset_index(drop=True), + ) @ignore_warnings(ResourceWarning) def test_schema_consistent(self): - df = pandas.DataFrame([dict(cf=0, cint=0, cstr="0"), dict(cf=1, cint=1, cstr="1"), - dict(cf=2, cint="s2", cstr="2"), dict(cf=3, cint=3, cstr="3")]) + df = pandas.DataFrame( + [ + dict(cf=0, cint=0, cstr="0"), + dict(cf=1, cint=1, cstr="1"), + dict(cf=2, cint="s2", cstr="2"), + dict(cf=3, cint=3, cstr="3"), + ] + ) temp = get_temp_folder(__file__, "temp_schema_consistant") name = os.path.join(temp, "df.csv") stio = StringIO() @@ -442,8 +462,7 @@ def test_schema_consistent(self): self.assertEqual(df.shape, (4, 3)) sdf = StreamingDataFrame.read_csv(name, chunksize=2) self.assertRaise(lambda: list(sdf), StreamingDataFrameSchemaError) - sdf = StreamingDataFrame.read_csv( - name, chunksize=2, check_schema=False) + sdf = StreamingDataFrame.read_csv(name, chunksize=2, check_schema=False) pieces = list(sdf) self.assertEqual(len(pieces), 2) @@ -460,11 +479,10 @@ def test_getitem(self): def test_read_csv_names(self): this = os.path.abspath(os.path.dirname(__file__)) data = os.path.join(this, "data", "buggy_hash2.csv") - df = pandas.read_csv(data, 
sep="\t", - names=["A", "B", "C"], - header=None) + df = pandas.read_csv(data, sep="\t", names=["A", "B", "C"], header=None) sdf = StreamingDataFrame.read_csv( - data, sep="\t", names=["A", "B", "C"], chunksize=2, header=None) + data, sep="\t", names=["A", "B", "C"], chunksize=2, header=None + ) head = sdf.head(n=1) self.assertEqualDataFrame(df.head(n=1), head) @@ -489,18 +507,15 @@ def test_add_column(self): self.assertEqualDataFrame(df, dfB) def test_fillna(self): - df = pandas.DataFrame( - data=dict(X=[4.5, numpy.nan, 7], Y=["a", "b", numpy.nan])) + df = pandas.DataFrame(data=dict(X=[4.5, numpy.nan, 7], Y=["a", "b", numpy.nan])) sdf = StreamingDataFrame.read_df(df) - df2 = pandas.DataFrame( - data=dict(X=[4.5, 10.0, 7], Y=["a", "b", "NAN"])) + df2 = pandas.DataFrame(data=dict(X=[4.5, 10.0, 7], Y=["a", "b", "NAN"])) na = sdf.fillna(value=dict(X=10.0, Y="NAN")) ndf = na.to_df() self.assertEqual(ndf, df2) - df3 = pandas.DataFrame( - data=dict(X=[4.5, 10.0, 7], Y=["a", "b", numpy.nan])) + df3 = pandas.DataFrame(data=dict(X=[4.5, 10.0, 7], Y=["a", "b", numpy.nan])) na = sdf.fillna(value=dict(X=10.0)) ndf = na.to_df() self.assertEqual(ndf, df3) @@ -513,16 +528,16 @@ def test_describe(self): sdf = StreamingDataFrame.read_df(df) desc = sdf.describe() - self.assertEqual(['X', 'Y'], list(desc.columns)) - self.assertEqual(desc.loc['min', :].tolist(), [-0.5, 0]) - self.assertEqual(desc.loc['max', :].tolist(), [0.5, 100000]) + self.assertEqual(["X", "Y"], list(desc.columns)) + self.assertEqual(desc.loc["min", :].tolist(), [-0.5, 0]) + self.assertEqual(desc.loc["max", :].tolist(), [0.5, 100000]) + self.assertEqualArray(desc.loc["mean", :], numpy.array([0, 50000]), atol=1e-8) + self.assertEqualArray(desc.loc["25%", :], numpy.array([-0.25, 25000])) + self.assertEqualArray(desc.loc["50%", :], numpy.array([0.0, 50000])) + self.assertEqualArray(desc.loc["75%", :], numpy.array([0.25, 75000])) self.assertEqualArray( - desc.loc['mean', :], numpy.array([0, 50000]), atol=1e-8) - self.assertEqualArray(desc.loc['25%', :], numpy.array([-0.25, 25000])) - self.assertEqualArray(desc.loc['50%', :], numpy.array([0.0, 50000])) - self.assertEqualArray(desc.loc['75%', :], numpy.array([0.25, 75000])) - self.assertEqualArray(desc.loc['std', :], numpy.array( - [2.886795e-01, 28867.946472]), decimal=4) + desc.loc["std", :], numpy.array([2.886795e-01, 28867.946472]), decimal=4 + ) def test_set_item(self): df = pandas.DataFrame(data=dict(a=[4.5], b=[6], c=[7])) @@ -530,31 +545,31 @@ def test_set_item(self): sdf = StreamingDataFrame.read_df(df) def f(): - sdf[['a']] = 10 + sdf[["a"]] = 10 + self.assertRaise(f, ValueError) def g(): - sdf['a'] = [10] + sdf["a"] = [10] + self.assertRaise(g, NotImplementedError) - sdf['aa'] = 10 + sdf["aa"] = 10 df = sdf.to_df() ddf = pandas.DataFrame(data=dict(a=[4.5], b=[6], c=[7], aa=[10])) self.assertEqualDataFrame(df, ddf) - sdf['bb'] = sdf['b'] + 10 + sdf["bb"] = sdf["b"] + 10 df = sdf.to_df() - ddf = ddf = pandas.DataFrame( - data=dict(a=[4.5], b=[6], c=[7], aa=[10], bb=[16])) + ddf = ddf = pandas.DataFrame(data=dict(a=[4.5], b=[6], c=[7], aa=[10], bb=[16])) self.assertEqualDataFrame(df, ddf) def test_set_item_function(self): df = pandas.DataFrame(data=dict(a=[4.5], b=[6], c=[7])) self.assertRaise(lambda: StreamingDataFrame(df), TypeError) sdf = StreamingDataFrame.read_df(df) - sdf['bb'] = sdf['b'].apply(lambda x: x + 11) + sdf["bb"] = sdf["b"].apply(lambda x: x + 11) df = sdf.to_df() - ddf = ddf = pandas.DataFrame( - data=dict(a=[4.5], b=[6], c=[7], bb=[17])) + ddf = ddf = 
pandas.DataFrame(data=dict(a=[4.5], b=[6], c=[7], bb=[17])) self.assertEqualDataFrame(df, ddf) diff --git a/_unittests/ut_documentation/test_run_notebooks.py b/_unittests/ut_documentation/test_run_notebooks.py index 6f84e1c..aebe979 100644 --- a/_unittests/ut_documentation/test_run_notebooks.py +++ b/_unittests/ut_documentation/test_run_notebooks.py @@ -1,7 +1,3 @@ -# -*- coding: utf-8 -*- -""" -@brief test log(time=33s) -""" import os import unittest from pyquickhelper.pycode import ExtTestCase @@ -10,17 +6,19 @@ class TestRunNotebooksPython(ExtTestCase): - def setUp(self): import jyquickhelper # pylint: disable=C0415 + self.assertTrue(jyquickhelper is not None) def test_notebook_artificiel(self): self.assertTrue(pandas_streaming is not None) - folder = os.path.join(os.path.dirname(__file__), - "..", "..", "_doc", "notebooks") + folder = os.path.join( + os.path.dirname(__file__), "..", "..", "_doc", "notebooks" + ) test_notebook_execution_coverage( - __file__, "first_steps", folder, 'pandas_streaming', copy_files=[]) + __file__, "first_steps", folder, "pandas_streaming", copy_files=[] + ) if __name__ == "__main__": diff --git a/_unittests/ut_module/test_sklearn.py b/_unittests/ut_module/test_sklearn.py index 8ae2b79..c8bdbfc 100644 --- a/_unittests/ut_module/test_sklearn.py +++ b/_unittests/ut_module/test_sklearn.py @@ -1,7 +1,3 @@ -# -*- coding: utf-8 -*- -""" -@brief test log(time=2s) -""" import unittest import numpy import pandas @@ -10,12 +6,12 @@ class TestScikitLearn(ExtTestCase): - def test_logistic_regression_check(self): X = pandas.DataFrame(numpy.array([[0.1, 0.2], [-0.2, 0.3]])) Y = numpy.array([0, 1]) - clq = LogisticRegression(fit_intercept=False, solver="liblinear", - random_state=42) + clq = LogisticRegression( + fit_intercept=False, solver="liblinear", random_state=42 + ) clq.fit(X, Y) pred2 = clq.predict(X) self.assertEqual(numpy.array([0, 1]), pred2) diff --git a/pandas_streaming/data/__init__.py b/pandas_streaming/data/__init__.py index ea274fc..9c3a725 100644 --- a/pandas_streaming/data/__init__.py +++ b/pandas_streaming/data/__init__.py @@ -1,6 +1 @@ -""" -@file -@brief Shortcuts to *df*. -""" - from .dummy import dummy_streaming_dataframe diff --git a/pandas_streaming/data/dummy.py b/pandas_streaming/data/dummy.py index 0103d1f..8500e74 100644 --- a/pandas_streaming/data/dummy.py +++ b/pandas_streaming/data/dummy.py @@ -1,8 +1,3 @@ -# -*- coding: utf-8 -*- -""" -@file -@brief Dummy datasets. -""" from pandas import DataFrame from ..df import StreamingDataFrame @@ -19,11 +14,16 @@ def dummy_streaming_dataframe(n, chunksize=10, asfloat=False, **cols): :return: a @see cl StreamingDataFrame """ if asfloat: - df = DataFrame(dict(cfloat=[_ + 0.1 for _ in range(0, n)], cstr=[ - f"s{i}" for i in range(0, n)])) + df = DataFrame( + dict( + cfloat=[_ + 0.1 for _ in range(0, n)], + cstr=[f"s{i}" for i in range(0, n)], + ) + ) else: - df = DataFrame(dict(cint=list(range(0, n)), cstr=[ - f"s{i}" for i in range(0, n)])) + df = DataFrame( + dict(cint=list(range(0, n)), cstr=[f"s{i}" for i in range(0, n)]) + ) for k, v in cols.items(): df[k] = v return StreamingDataFrame.read_df(df, chunksize=chunksize) diff --git a/pandas_streaming/df/__init__.py b/pandas_streaming/df/__init__.py index 61e1b73..ac4996d 100644 --- a/pandas_streaming/df/__init__.py +++ b/pandas_streaming/df/__init__.py @@ -1,10 +1,13 @@ -""" -@file -@brief Shortcuts to *df*. 
-""" - -from .connex_split import train_test_split_weights, train_test_connex_split, train_test_apart_stratify +from .connex_split import ( + train_test_split_weights, + train_test_connex_split, + train_test_apart_stratify, +) from .dataframe import StreamingDataFrame -from .dataframe_helpers import dataframe_hash_columns, dataframe_unfold, dataframe_shuffle +from .dataframe_helpers import ( + dataframe_hash_columns, + dataframe_unfold, + dataframe_shuffle, +) from .dataframe_helpers import pandas_groupby_nan, numpy_types from .dataframe_io import to_zip, read_zip diff --git a/pandas_streaming/df/connex_split.py b/pandas_streaming/df/connex_split.py index ec01b02..bc68581 100644 --- a/pandas_streaming/df/connex_split.py +++ b/pandas_streaming/df/connex_split.py @@ -1,8 +1,3 @@ -# -*- coding: utf-8 -*- -""" -@file -@brief Implements a connex split between train and test. -""" from collections import Counter import pandas import numpy @@ -14,21 +9,31 @@ class ImbalancedSplitException(Exception): """ Raised when an imbalanced split is detected. """ + pass -def train_test_split_weights(df, weights=None, test_size=0.25, train_size=None, - shuffle=True, fail_imbalanced=0.05, random_state=None): +def train_test_split_weights( + df, + weights=None, + test_size=0.25, + train_size=None, + shuffle=True, + fail_imbalanced=0.05, + random_state=None, +): """ Splits a database in train/test given, every row can have a different weight. - @param df :epkg:`pandas:DataFrame` or @see cl StreamingDataFrame + @param df :epkg:`pandas:DataFrame` or see :class:`StreamingDataFrame` @param weights None or weights or weights column name - @param test_size ratio for the test partition (if *train_size* is not specified) + @param test_size ratio for the test partition + (if *train_size* is not specified) @param train_size ratio for the train partition @param shuffle shuffles before the split - @param fail_imbalanced raises an exception if relative weights difference is higher than this value + @param fail_imbalanced raises an exception if relative weights + difference is higher than this value @param random_state seed for random generators @return train and test :epkg:`pandas:DataFrame` @@ -37,21 +42,24 @@ def train_test_split_weights(df, weights=None, test_size=0.25, train_size=None, as the function tries to keep equal weights among both paths without using randomness. """ - if hasattr(df, 'iter_creation'): + if hasattr(df, "iter_creation"): raise NotImplementedError( # pragma: no cover - 'Not implemented yet for StreamingDataFrame.') + "Not implemented yet for StreamingDataFrame." + ) if isinstance(df, numpy.ndarray): raise NotImplementedError( # pragma: no cover - "Not implemented on numpy arrays.") + "Not implemented on numpy arrays." + ) if shuffle: df = dataframe_shuffle(df, random_state=random_state) if weights is None: if test_size == 0 or train_size == 0: raise ValueError( - f"test_size={test_size} or train_size={train_size} cannot be null (1).") - return train_test_split(df, test_size=test_size, - train_size=train_size, - random_state=random_state) + f"test_size={test_size} or train_size={train_size} cannot be null (1)." 
+ ) + return train_test_split( + df, test_size=test_size, train_size=train_size, random_state=random_state + ) if isinstance(weights, pandas.Series): weights = list(weights) @@ -60,7 +68,8 @@ def train_test_split_weights(df, weights=None, test_size=0.25, train_size=None, if len(weights) != df.shape[0]: raise ValueError( "Dimension mismatch between weights and dataframe " - "{0} != {1}".format(df.shape[0], len(weights))) + "{0} != {1}".format(df.shape[0], len(weights)) + ) p = (1 - test_size) if test_size else None if train_size is not None: @@ -68,7 +77,8 @@ def train_test_split_weights(df, weights=None, test_size=0.25, train_size=None, test_size = 1 - p if p is None or min(test_size, p) <= 0: raise ValueError( - f"test_size={test_size} or train_size={train_size} cannot be null (2).") + f"test_size={test_size} or train_size={train_size} cannot be null (2)." + ) ratio = test_size / p if random_state is None: @@ -98,21 +108,32 @@ def train_test_split_weights(df, weights=None, test_size=0.25, train_size=None, balance -= w * ratio train_weights += w * ratio - r = abs(train_weights - test_weights) / \ - (1.0 * (train_weights + test_weights)) + r = abs(train_weights - test_weights) / (1.0 * (train_weights + test_weights)) if r >= fail_imbalanced: raise ImbalancedSplitException( # pragma: no cover "Split is imbalanced: train_weights={0} test_weights={1} r={2}." - "".format(train_weights, test_weights, r)) + "".format(train_weights, test_weights, r) + ) return df.iloc[train_ids, :], df.iloc[test_ids, :] -def train_test_connex_split(df, groups, test_size=0.25, train_size=None, - stratify=None, hash_size=9, unique_rows=False, - shuffle=True, fail_imbalanced=0.05, keep_balance=None, - stop_if_bigger=None, return_cnx=False, - must_groups=None, random_state=None): +def train_test_connex_split( + df, + groups, + test_size=0.25, + train_size=None, + stratify=None, + hash_size=9, + unique_rows=False, + shuffle=True, + fail_imbalanced=0.05, + keep_balance=None, + stop_if_bigger=None, + return_cnx=False, + must_groups=None, + random_state=None, +): """ This split is for a specific case where data is linked in many ways. Let's assume we have three ids as we have @@ -124,7 +145,8 @@ def train_test_connex_split(df, groups, test_size=0.25, train_size=None, @param df :epkg:`pandas:DataFrame` @param groups columns name for the ids - @param test_size ratio for the test partition (if *train_size* is not specified) + @param test_size ratio for the test partition + (if *train_size* is not specified) @param train_size ratio for the train partition @param stratify column holding the stratification @param hash_size size of the hash to cache information about partition @@ -138,13 +160,13 @@ def train_test_connex_split(df, groups, test_size=0.25, train_size=None, but does not guarantee it returns the best cut, the value should be close to 0 @param keep_balance (float), if not None, does not merge connected components - if their relative sizes are too different, the value should be - close to 1 + if their relative sizes are too different, + the value should be close to 1 @param return_cnx returns connected components as a third results @param must_groups column name for ids which must not be shared by train/test partitions @param random_state seed for random generator - @return Two @see cl StreamingDataFrame, one + @return Two see :class:`StreamingDataFrame`, one for train, one for test. The list of ids must hold in memory. 
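To make the connex split described in this docstring concrete, here is a minimal, hypothetical sketch; the column names (user, prod, card) and the relaxed fail_imbalanced value are illustrative assumptions, not part of this patch, and the import path follows the pandas_streaming/df/__init__.py change shown above.

import pandas
from pandas_streaming.df import train_test_connex_split

# Rows sharing any id must end up on the same side of the split,
# so no user, prod or card appears in both train and test.
df = pandas.DataFrame(
    [
        dict(user="U1", prod="P1", card="C1"),
        dict(user="U1", prod="P2", card="C1"),
        dict(user="U2", prod="P3", card="C2"),
        dict(user="U3", prod="P3", card="C2"),
    ]
)

# fail_imbalanced is relaxed here only because such a tiny frame
# cannot be split close to the requested 75/25 ratio.
train, test = train_test_connex_split(
    df, groups=["user", "prod", "card"], fail_imbalanced=2.0, random_state=0
)
print(train)
print(test)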
@@ -213,16 +235,20 @@ def train_test_connex_split(df, groups, test_size=0.25, train_size=None, """ if stratify is not None: raise NotImplementedError( # pragma: no cover - "Option stratify is not implemented.") + "Option stratify is not implemented." + ) if groups is None or len(groups) == 0: raise ValueError( # pragma: no cover - "groups is empty. Use regular train_test_split.") - if hasattr(df, 'iter_creation'): + "groups is empty. Use regular train_test_split." + ) + if hasattr(df, "iter_creation"): raise NotImplementedError( # pragma: no cover - 'Not implemented yet for StreamingDataFrame.') + "Not implemented yet for StreamingDataFrame." + ) if isinstance(df, numpy.ndarray): raise NotImplementedError( # pragma: no cover - "Not implemented on numpy arrays.") + "Not implemented on numpy arrays." + ) if shuffle: df = dataframe_shuffle(df, random_state=random_state) @@ -250,13 +276,18 @@ def do_connex_components(dfrows, local_groups, kb, sib): while modif > 0 and itern < len(elements): if fLOG and df.shape[0] > 10000: - fLOG("[train_test_connex_split] iteration={0}-#nb connect={1} - " - "modif={2}".format(iter, len(set(elements)), modif)) + fLOG( + "[train_test_connex_split] iteration={0}-#nb connect={1} - " + "modif={2}".format(iter, len(set(elements)), modif) + ) modif = 0 itern += 1 for i, row in enumerate(dfrows.itertuples(index=False, name=None)): - vals = [val for val in zip(local_groups, row) if not isinstance( - val[1], float) or not numpy.isnan(val[1])] + vals = [ + val + for val in zip(local_groups, row) + if not isinstance(val[1], float) or not numpy.isnan(val[1]) + ] c = elements[i] @@ -276,27 +307,42 @@ def do_connex_components(dfrows, local_groups, kb, sib): if kb is not None: maxi = min(len(counts_cnx[new_c]), len(counts_cnx[c])) if maxi > 5: - diff = len(counts_cnx[new_c]) + \ - len(counts_cnx[c]) - maxi + diff = len(counts_cnx[new_c]) + len(counts_cnx[c]) - maxi r = diff / float(maxi) if r > kb: if fLOG: # pragma: no cover - fLOG('[train_test_connex_split] balance ' - 'r={0:0.00000}>{1:0.00}, #[{2}]={3}, ' - '#[{4}]={5}'.format(r, kb, new_c, - len(counts_cnx[new_c]), - c, len(counts_cnx[c]))) + fLOG( + "[train_test_connex_split] balance " + "r={0:0.00000}>{1:0.00}, #[{2}]={3}, " + "#[{4}]={5}".format( + r, + kb, + new_c, + len(counts_cnx[new_c]), + c, + len(counts_cnx[c]), + ) + ) continue if sib is not None: - r = (len(counts_cnx[new_c]) + - len(counts_cnx[c])) / float(len(elements)) + r = (len(counts_cnx[new_c]) + len(counts_cnx[c])) / float( + len(elements) + ) if r > sib: if fLOG: # pragma: no cover - fLOG('[train_test_connex_split] no merge ' - 'r={0:0.00000}>{1:0.00}, #[{2}]={3}, #[{4}]={5}' - ''.format(r, sib, new_c, len(counts_cnx[new_c]), - c, len(counts_cnx[c]))) + fLOG( + "[train_test_connex_split] no merge " + "r={0:0.00000}>{1:0.00}, #[{2}]={3}, #[{4}]={5}" + "".format( + r, + sib, + new_c, + len(counts_cnx[new_c]), + c, + len(counts_cnx[c]), + ) + ) avoids_merge[new_c, c] = i continue @@ -307,8 +353,7 @@ def do_connex_components(dfrows, local_groups, kb, sib): modif += len(counts_cnx[c]) for ii in counts_cnx[c]: elements[ii] = new_c - counts_cnx[new_c] = counts_cnx[new_c].union( - counts_cnx[c]) + counts_cnx[new_c] = counts_cnx[new_c].union(counts_cnx[c]) counts_cnx[c] = set() keys = list(vals) @@ -327,13 +372,12 @@ def do_connex_components(dfrows, local_groups, kb, sib): grsum = dfids[[name, one]].groupby(name, as_index=False).sum() if fLOG: for g in groups: - fLOG( - f"[train_test_connex_split] #nb in '{g}': {len(set(dfids[g]))}") - fLOG( - 
f"[train_test_connex_split] #connex {grsum.shape[0]}/{dfids.shape[0]}") + fLOG(f"[train_test_connex_split] #nb in '{g}': {len(set(dfids[g]))}") + fLOG(f"[train_test_connex_split] #connex {grsum.shape[0]}/{dfids.shape[0]}") if grsum.shape[0] <= 1: raise ValueError( # pragma: no cover - "Every element is in the same connected components.") + "Every element is in the same connected components." + ) # Statistics: top connected components if fLOG: @@ -342,28 +386,36 @@ def do_connex_components(dfrows, local_groups, kb, sib): cl = [(v, k) for k, v in counts.items()] cum = 0 maxc = None - fLOG("[train_test_connex_split] number of connected components: {0}" - "".format(len(set(elements)))) + fLOG( + "[train_test_connex_split] number of connected components: {0}" + "".format(len(set(elements))) + ) for i, (v, k) in enumerate(sorted(cl, reverse=True)): if i == 0: maxc = k, v if i >= 10: break cum += v - fLOG("[train_test_connex_split] c={0} #elements={1} cumulated" - "={2}/{3}".format(k, v, cum, len(elements))) + fLOG( + "[train_test_connex_split] c={0} #elements={1} cumulated" + "={2}/{3}".format(k, v, cum, len(elements)) + ) # Most important component - fLOG( - f'[train_test_connex_split] first row of the biggest component {maxc}') + fLOG(f"[train_test_connex_split] first row of the biggest component {maxc}") tdf = dfids[dfids[name] == maxc[0]] - fLOG(f'[train_test_connex_split] \n{tdf.head(n=10)}') + fLOG(f"[train_test_connex_split] \n{tdf.head(n=10)}") # Splits. train, test = train_test_split_weights( - grsum, weights=one, test_size=test_size, train_size=train_size, - shuffle=shuffle, fail_imbalanced=fail_imbalanced, - random_state=random_state) + grsum, + weights=one, + test_size=test_size, + train_size=train_size, + shuffle=shuffle, + fail_imbalanced=fail_imbalanced, + random_state=random_state, + ) train.drop(one, inplace=True, axis=1) test.drop(one, inplace=True, axis=1) @@ -382,8 +434,15 @@ def double_merge(d): return train_f, test_f -def train_test_apart_stratify(df, group, test_size=0.25, train_size=None, - stratify=None, force=False, random_state=None): +def train_test_apart_stratify( + df, + group, + test_size=0.25, + train_size=None, + stratify=None, + force=False, + random_state=None, +): """ This split is for a specific case where data is linked in one way. Let's assume we have two ids as we have @@ -401,7 +460,7 @@ def train_test_apart_stratify(df, group, test_size=0.25, train_size=None, @param force if True, tries to get at least one example on the test side for each value of the column *stratify* @param random_state seed for random generators - @return Two @see cl StreamingDataFrame, one + @return Two see :class:`StreamingDataFrame`, one for train, one for test. .. 
index:: multi-label @@ -434,14 +493,11 @@ def train_test_apart_stratify(df, group, test_size=0.25, train_size=None, print(test) """ if stratify is None: - raise ValueError( # pragma: no cover - "stratify must be specified.") + raise ValueError("stratify must be specified.") # pragma: no cover if group is None: - raise ValueError( # pragma: no cover - "group must be specified.") - if hasattr(df, 'iter_creation'): - raise NotImplementedError( - 'Not implemented yet for StreamingDataFrame.') + raise ValueError("group must be specified.") # pragma: no cover + if hasattr(df, "iter_creation"): + raise NotImplementedError("Not implemented yet for StreamingDataFrame.") if isinstance(df, numpy.ndarray): raise NotImplementedError("Not implemented on numpy arrays.") @@ -451,7 +507,8 @@ def train_test_apart_stratify(df, group, test_size=0.25, train_size=None, test_size = 1 - p if p is None or min(test_size, p) <= 0: raise ValueError( # pragma: no cover - f"test_size={test_size} or train_size={train_size} cannot be null") + f"test_size={test_size} or train_size={train_size} cannot be null" + ) couples = df[[group, stratify]].itertuples(name=None, index=False) hist = Counter(df[stratify]) @@ -475,8 +532,7 @@ def train_test_apart_stratify(df, group, test_size=0.25, train_size=None, continue assigned = [c for c in ids[k] if c in split] nb_test = sum(split[c] for c in assigned) - expected = min(len(ids[k]), int( - test_size * len(ids[k]) + 0.5)) - nb_test + expected = min(len(ids[k]), int(test_size * len(ids[k]) + 0.5)) - nb_test if force and expected == 0 and nb_test == 0: nb_train = len(assigned) - nb_test if nb_train > 0 or len(not_assigned) > 1: diff --git a/pandas_streaming/df/dataframe.py b/pandas_streaming/df/dataframe.py index 843e4da..db3d7b9 100644 --- a/pandas_streaming/df/dataframe.py +++ b/pandas_streaming/df/dataframe.py @@ -1,9 +1,3 @@ -# -*- coding: utf-8 -*- -# pylint: disable=W0102 -""" -@file -@brief Defines a streaming dataframe. -""" import pickle import os from io import StringIO, BytesIO @@ -12,11 +6,15 @@ import numpy.random as nrandom import pandas from pandas.testing import assert_frame_equal + try: from pandas import json_normalize except ImportError: from pandas.io.json import json_normalize -from .dataframe_split import sklearn_train_test_split, sklearn_train_test_split_streaming +from .dataframe_split import ( + sklearn_train_test_split, + sklearn_train_test_split_streaming, +) from .dataframe_io_helpers import enumerate_json_items, JsonIterator2Stream @@ -24,6 +22,7 @@ class StreamingDataFrameSchemaError(Exception): """ Reveals an issue with inconsistant schemas. """ + pass @@ -50,7 +49,7 @@ class StreamingDataFrame: Instead, the class takes a function which generates an iterator on :epkg:`DataFrame`. Most of the methods returns either a :epkg:`DataFrame` - either a @see cl StreamingDataFrame. In the second case, + either a see :class:`StreamingDataFrame`. In the second case, methods can be chained. By default, the object checks that the schema remains @@ -64,7 +63,7 @@ class StreamingDataFrame: is one of these cases. 
:param iter_creation: function which creates an iterator or an - instance of @see cl StreamingDataFrame + instance of see :class:`StreamingDataFrame` :param check_schema: checks that the schema is the same for every :epkg:`dataframe` :param stable: indicates if the :epkg:`dataframe` remains the same @@ -73,11 +72,11 @@ class StreamingDataFrame: def __init__(self, iter_creation, check_schema=True, stable=True): self._delete_ = [] - if isinstance(iter_creation, (pandas.DataFrame, dict, - numpy.ndarray, str)): + if isinstance(iter_creation, (pandas.DataFrame, dict, numpy.ndarray, str)): raise TypeError( "Unexpected type %r for iter_creation. It must " - "be an iterator." % type(iter_creation)) + "be an iterator." % type(iter_creation) + ) if isinstance(iter_creation, StreamingDataFrame): self.iter_creation = iter_creation.iter_creation self.stable = iter_creation.stable @@ -116,9 +115,15 @@ def get_kwargs(self): """ return dict(check_schema=self.check_schema) - def train_test_split(self, path_or_buf=None, export_method="to_csv", - names=None, streaming=True, partitions=None, - **kwargs): + def train_test_split( + self, + path_or_buf=None, + export_method="to_csv", + names=None, + streaming=True, + partitions=None, + **kwargs, + ): """ Randomly splits a :epkg:`dataframe` into smaller pieces. The function returns streams of file names. @@ -138,7 +143,7 @@ def train_test_split(self, path_or_buf=None, export_method="to_csv", streaming version of the algorithm. @param partitions splitting partitions @return outputs of the exports functions or two - @see cl StreamingDataFrame if path_or_buf is None. + see :class:`StreamingDataFrame` if path_or_buf is None. The streaming version of this algorithm is implemented by function @see fn sklearn_train_test_split_streaming. Its documentation @@ -150,14 +155,19 @@ def train_test_split(self, path_or_buf=None, export_method="to_csv", if len(partitions) != 2: raise NotImplementedError( # pragma: no cover "Only train and test split is allowed, *partitions* " - "must be of length 2.") + "must be of length 2." + ) kwargs = kwargs.copy() - kwargs['train_size'] = partitions[0] - kwargs['test_size'] = partitions[1] + kwargs["train_size"] = partitions[0] + kwargs["test_size"] = partitions[1] return sklearn_train_test_split_streaming(self, **kwargs) - return sklearn_train_test_split(self, path_or_buf=path_or_buf, - export_method=export_method, - names=names, **kwargs) + return sklearn_train_test_split( + self, + path_or_buf=path_or_buf, + export_method=export_method, + names=names, + **kwargs, + ) @staticmethod def _process_kwargs(kwargs): @@ -165,14 +175,16 @@ def _process_kwargs(kwargs): Filters out parameters for the constructor of this class. """ kw = {} - for k in ['check_schema']: + for k in ["check_schema"]: if k in kwargs: kw[k] = kwargs[k] del kwargs[k] return kw @staticmethod - def read_json(*args, chunksize=100000, flatten=False, **kwargs) -> 'StreamingDataFrame': + def read_json( + *args, chunksize=100000, flatten=False, **kwargs + ) -> "StreamingDataFrame": """ Reads a :epkg:`json` file or buffer as an iterator on :epkg:`DataFrame`. The signature is the same as @@ -225,25 +237,28 @@ def read_json(*args, chunksize=100000, flatten=False, **kwargs) -> 'StreamingDat `parse error: unallowed token at this point in JSON text`. 
""" if not isinstance(chunksize, int) or chunksize <= 0: - raise ValueError( # pragma: no cover - 'chunksize must be a positive integer') + raise ValueError("chunksize must be a positive integer") # pragma: no cover kwargs_create = StreamingDataFrame._process_kwargs(kwargs) if isinstance(args[0], (list, dict)): if flatten: return StreamingDataFrame.read_df( - json_normalize(args[0]), **kwargs_create) + json_normalize(args[0]), **kwargs_create + ) return StreamingDataFrame.read_df(args[0], **kwargs_create) - if kwargs.get('lines', None) == 'stream': - del kwargs['lines'] + if kwargs.get("lines", None) == "stream": + del kwargs["lines"] def localf(a0=args[0]): - if hasattr(a0, 'seek'): + if hasattr(a0, "seek"): a0.seek(0) return enumerate_json_items( - a0, encoding=kwargs.get('encoding', None), lines=True, - flatten=flatten) + a0, + encoding=kwargs.get("encoding", None), + lines=True, + flatten=flatten, + ) st = JsonIterator2Stream(localf) args = args[1:] @@ -251,57 +266,68 @@ def localf(a0=args[0]): if chunksize is None: return StreamingDataFrame( lambda: pandas.read_json( - st, *args, chunksize=None, lines=True, **kwargs), - **kwargs_create) + st, *args, chunksize=None, lines=True, **kwargs + ), + **kwargs_create, + ) def fct1(st=st, args=args, chunksize=chunksize, kw=kwargs.copy()): st.seek(0) for r in pandas.read_json( - st, *args, chunksize=chunksize, nrows=chunksize, - lines=True, **kw): + st, *args, chunksize=chunksize, nrows=chunksize, lines=True, **kw + ): yield r return StreamingDataFrame(fct1, **kwargs_create) - if kwargs.get('lines', False): + if kwargs.get("lines", False): if flatten: raise NotImplementedError( - "flatten==True is implemented with option lines='stream'") + "flatten==True is implemented with option lines='stream'" + ) if chunksize is None: return StreamingDataFrame( lambda: pandas.read_json(*args, chunksize=None, **kwargs), - **kwargs_create) + **kwargs_create, + ) def fct2(args=args, chunksize=chunksize, kw=kwargs.copy()): for r in pandas.read_json( - *args, chunksize=chunksize, nrows=chunksize, **kw): + *args, chunksize=chunksize, nrows=chunksize, **kw + ): yield r + return StreamingDataFrame(fct2, **kwargs_create) st = JsonIterator2Stream( lambda a0=args[0]: enumerate_json_items( - a0, encoding=kwargs.get('encoding', None), flatten=flatten)) + a0, encoding=kwargs.get("encoding", None), flatten=flatten + ) + ) args = args[1:] - if 'lines' in kwargs: - del kwargs['lines'] + if "lines" in kwargs: + del kwargs["lines"] if chunksize is None: return StreamingDataFrame( lambda: pandas.read_json( - st, *args, chunksize=chunksize, lines=True, **kwargs), - **kwargs_create) + st, *args, chunksize=chunksize, lines=True, **kwargs + ), + **kwargs_create, + ) def fct3(st=st, args=args, chunksize=chunksize, kw=kwargs.copy()): - if hasattr(st, 'seek'): + if hasattr(st, "seek"): st.seek(0) for r in pandas.read_json( - st, *args, chunksize=chunksize, nrows=chunksize, - lines=True, **kw): + st, *args, chunksize=chunksize, nrows=chunksize, lines=True, **kw + ): yield r + return StreamingDataFrame(fct3, **kwargs_create) @staticmethod - def read_csv(*args, **kwargs) -> 'StreamingDataFrame': + def read_csv(*args, **kwargs) -> "StreamingDataFrame": """ Reads a :epkg:`csv` file or buffer as an iterator on :epkg:`DataFrame`. @@ -310,41 +336,44 @@ def read_csv(*args, **kwargs) -> 'StreamingDataFrame': of rows to parse in a single bloc. If not specified, it will be equal to 100000. 
""" - if not kwargs.get('iterator', True): + if not kwargs.get("iterator", True): raise ValueError("If specified, iterator must be True.") - if not kwargs.get('chunksize', 100000): + if not kwargs.get("chunksize", 100000): raise ValueError("If specified, chunksize must not be None.") kwargs_create = StreamingDataFrame._process_kwargs(kwargs) - kwargs['iterator'] = True - if 'chunksize' not in kwargs: - kwargs['chunksize'] = 100000 - return StreamingDataFrame(lambda: pandas.read_csv(*args, **kwargs), **kwargs_create) + kwargs["iterator"] = True + if "chunksize" not in kwargs: + kwargs["chunksize"] = 100000 + return StreamingDataFrame( + lambda: pandas.read_csv(*args, **kwargs), **kwargs_create + ) @staticmethod - def read_str(text, **kwargs) -> 'StreamingDataFrame': + def read_str(text, **kwargs) -> "StreamingDataFrame": """ Reads a :epkg:`DataFrame` as an iterator on :epkg:`DataFrame`. The signature is the same as :epkg:`pandas:read_csv`. The important parameter is *chunksize* which defines the number of rows to parse in a single bloc. """ - if not kwargs.get('iterator', True): + if not kwargs.get("iterator", True): raise ValueError("If specified, iterator must be True.") - if not kwargs.get('chunksize', 100000): + if not kwargs.get("chunksize", 100000): raise ValueError("If specified, chunksize must not be None.") kwargs_create = StreamingDataFrame._process_kwargs(kwargs) - kwargs['iterator'] = True - if 'chunksize' not in kwargs: - kwargs['chunksize'] = 100000 + kwargs["iterator"] = True + if "chunksize" not in kwargs: + kwargs["chunksize"] = 100000 if isinstance(text, str): buffer = StringIO(text) else: buffer = BytesIO(text) return StreamingDataFrame( - lambda: pandas.read_csv(buffer, **kwargs), **kwargs_create) + lambda: pandas.read_csv(buffer, **kwargs), **kwargs_create + ) @staticmethod - def read_df(df, chunksize=None, check_schema=True) -> 'StreamingDataFrame': + def read_df(df, chunksize=None, check_schema=True) -> "StreamingDataFrame": """ Splits a :epkg:`DataFrame` into small chunks mostly for unit testing purposes. @@ -352,26 +381,29 @@ def read_df(df, chunksize=None, check_schema=True) -> 'StreamingDataFrame': @param df :epkg:`DataFrame` @param chunksize number rows per chunks (// 10 by default) @param check_schema check schema between two iterations - @return iterator on @see cl StreamingDataFrame + @return iterator on see :class:`StreamingDataFrame` """ if chunksize is None: - if hasattr(df, 'shape'): + if hasattr(df, "shape"): chunksize = df.shape[0] else: raise NotImplementedError( - f"Cannot retrieve size to infer chunksize for type={type(df)}.") + f"Cannot retrieve size to infer chunksize for type={type(df)}." + ) - if hasattr(df, 'shape'): + if hasattr(df, "shape"): size = df.shape[0] else: raise NotImplementedError( # pragma: no cover - f"Cannot retrieve size for type={type(df)}.") + f"Cannot retrieve size for type={type(df)}." 
+ ) def local_iterator(): "local iterator" for i in range(0, size, chunksize): end = min(size, i + chunksize) yield df[i:end].copy() + return StreamingDataFrame(local_iterator, check_schema=check_schema) def __iter__(self): @@ -403,21 +435,26 @@ def __iter__(self): elif self.check_schema: if list(it.columns) != sch[0]: # pylint: disable=E1136 raise StreamingDataFrameSchemaError( # pragma: no cover - 'Column names are different after row {0}\nFirst chunk: {1}' - '\nCurrent chunk: {2}'.format( - rows, sch[0], list(it.columns))) # pylint: disable=E1136 + "Column names are different after row {0}\nFirst chunk: {1}" + "\nCurrent chunk: {2}".format(rows, sch[0], list(it.columns)) + ) # pylint: disable=E1136 if list(it.dtypes) != sch[1]: # pylint: disable=E1136 errdf = pandas.DataFrame( - dict(names=sch[0], schema1=sch[1], # pylint: disable=E1136 - schema2=list(it.dtypes))) # pylint: disable=E1136 + dict( + names=sch[0], + schema1=sch[1], # pylint: disable=E1136 + schema2=list(it.dtypes), + ) + ) # pylint: disable=E1136 tdf = StringIO() - errdf['diff'] = errdf['schema2'] != errdf['schema1'] - errdf = errdf[errdf['diff']] + errdf["diff"] = errdf["schema2"] != errdf["schema1"] + errdf = errdf[errdf["diff"]] errdf.to_csv(tdf, sep=",", index=False) raise StreamingDataFrameSchemaError( - 'Column types are different after row {0}. You may use option ' + "Column types are different after row {0}. You may use option " 'dtype={{"column_name": str}} to force the type on this column.' - '\n---\n{1}'.format(rows, tdf.getvalue())) + "\n---\n{1}".format(rows, tdf.getvalue()) + ) rows += it.shape[0] yield it @@ -453,7 +490,7 @@ def dtypes(self): for it in self: return it.dtypes - def to_csv(self, path_or_buf=None, **kwargs) -> 'StreamingDataFrame': + def to_csv(self, path_or_buf=None, **kwargs) -> "StreamingDataFrame": """ Saves the :epkg:`DataFrame` into string. See :epkg:`pandas:DataFrame.to_csv`. @@ -463,7 +500,8 @@ def to_csv(self, path_or_buf=None, **kwargs) -> 'StreamingDataFrame': close = False elif isinstance(path_or_buf, str): st = open( # pylint: disable=R1732 - path_or_buf, "w", encoding=kwargs.get('encoding')) + path_or_buf, "w", encoding=kwargs.get("encoding") + ) close = True else: st = path_or_buf @@ -471,7 +509,7 @@ def to_csv(self, path_or_buf=None, **kwargs) -> 'StreamingDataFrame': for df in self: df.to_csv(st, **kwargs) - kwargs['header'] = False + kwargs["header"] = False if close: st.close() @@ -529,43 +567,51 @@ def tail(self, n=5) -> pandas.DataFrame: h = df.tail(n=n) return h - def where(self, *args, **kwargs) -> 'StreamingDataFrame': + def where(self, *args, **kwargs) -> "StreamingDataFrame": """ Applies :epkg:`pandas:DataFrame:where`. *inplace* must be False. - This function returns a @see cl StreamingDataFrame. + This function returns a see :class:`StreamingDataFrame`. """ - kwargs['inplace'] = False + kwargs["inplace"] = False return StreamingDataFrame( - lambda: map(lambda df: df.where(*args, **kwargs), self), - **self.get_kwargs()) + lambda: map(lambda df: df.where(*args, **kwargs), self), **self.get_kwargs() + ) - def sample(self, reservoir=False, cache=False, **kwargs) -> 'StreamingDataFrame': + def sample(self, reservoir=False, cache=False, **kwargs) -> "StreamingDataFrame": """ See :epkg:`pandas:DataFrame:sample`. Only *frac* is available, otherwise choose @see me reservoir_sampling. - This function returns a @see cl StreamingDataFrame. + This function returns a see :class:`StreamingDataFrame`. 
- @param reservoir use `reservoir sampling `_ - @param cache cache the sample - @param kwargs additional parameters for :epkg:`pandas:DataFrame:sample` + :param reservoir: use + `reservoir sampling `_ + :param cache: cache the sample + :param kwargs: additional parameters for :epkg:`pandas:DataFrame:sample` If *cache* is True, the sample is cached (assuming it holds in memory). The second time an iterator walks through the """ - if reservoir or 'n' in kwargs: - if 'frac' in kwargs: - raise ValueError( - 'frac cannot be specified for reservoir sampling.') - return self._reservoir_sampling(cache=cache, n=kwargs['n'], random_state=kwargs.get('random_state')) + if reservoir or "n" in kwargs: + if "frac" in kwargs: + raise ValueError("frac cannot be specified for reservoir sampling.") + return self._reservoir_sampling( + cache=cache, n=kwargs["n"], random_state=kwargs.get("random_state") + ) if cache: sdf = self.sample(cache=False, **kwargs) df = sdf.to_df() return StreamingDataFrame.read_df(df, chunksize=df.shape[0]) - return StreamingDataFrame(lambda: map(lambda df: df.sample(**kwargs), self), **self.get_kwargs(), stable=False) + return StreamingDataFrame( + lambda: map(lambda df: df.sample(**kwargs), self), + **self.get_kwargs(), + stable=False, + ) - def _reservoir_sampling(self, cache=True, n=1000, random_state=None) -> 'StreamingDataFrame': + def _reservoir_sampling( + self, cache=True, n=1000, random_state=None + ) -> "StreamingDataFrame": """ Uses the `reservoir sampling `_ algorithm to draw a random sample with exactly *n* samples. @@ -573,15 +619,14 @@ def _reservoir_sampling(self, cache=True, n=1000, random_state=None) -> 'Streami @param cache cache the sample @param n number of observations to keep @param random_state sets the random_state - @return @see cl StreamingDataFrame + @return see :class:`StreamingDataFrame` .. warning:: The sample is split by chunks of size 1000. This parameter is not yet exposed. """ if not cache: - raise ValueError( - "cache=False is not available for reservoir sampling.") + raise ValueError("cache=False is not available for reservoir sampling.") indices = [] seen = 0 for i, df in enumerate(self): @@ -610,51 +655,75 @@ def reservoir_iterate(sdf, indices, chunksize): yield pandas.DataFrame(buffer) return StreamingDataFrame( - lambda: reservoir_iterate(sdf=self, indices=indices, chunksize=1000)) - - def drop(self, labels=None, *, axis=0, index=None, columns=None, level=None, - inplace=False, errors='raise') -> 'StreamingDataFrame': + lambda: reservoir_iterate(sdf=self, indices=indices, chunksize=1000) + ) + + def drop( + self, + labels=None, + *, + axis=0, + index=None, + columns=None, + level=None, + inplace=False, + errors="raise", + ) -> "StreamingDataFrame": """ Applies :epkg:`pandas:DataFrame:drop`. - This function returns a @see cl StreamingDataFrame. + This function returns a see :class:`StreamingDataFrame`. 
""" if axis == 0: raise NotImplementedError(f"drop is not implemented for axis={axis}.") if inplace: raise NotImplementedError(f"drop is not implemented for inplace={inplace}.") return StreamingDataFrame( - lambda: map(lambda df: df.drop( - labels, axis=axis, index=index, columns=columns, - level=level, inplace=False, errors=errors), self), - **self.get_kwargs()) - - def apply(self, *args, **kwargs) -> 'StreamingDataFrame': + lambda: map( + lambda df: df.drop( + labels, + axis=axis, + index=index, + columns=columns, + level=level, + inplace=False, + errors=errors, + ), + self, + ), + **self.get_kwargs(), + ) + + def apply(self, *args, **kwargs) -> "StreamingDataFrame": """ Applies :epkg:`pandas:DataFrame:apply`. - This function returns a @see cl StreamingDataFrame. + This function returns a see :class:`StreamingDataFrame`. """ return StreamingDataFrame( - lambda: map(lambda df: df.apply(*args, **kwargs), self), - **self.get_kwargs()) + lambda: map(lambda df: df.apply(*args, **kwargs), self), **self.get_kwargs() + ) - def applymap(self, *args, **kwargs) -> 'StreamingDataFrame': + def applymap(self, *args, **kwargs) -> "StreamingDataFrame": """ Applies :epkg:`pandas:DataFrame:applymap`. - This function returns a @see cl StreamingDataFrame. + This function returns a see :class:`StreamingDataFrame`. """ return StreamingDataFrame( lambda: map(lambda df: df.applymap(*args, **kwargs), self), - **self.get_kwargs()) + **self.get_kwargs(), + ) - def merge(self, right, **kwargs) -> 'StreamingDataFrame': + def merge(self, right, **kwargs) -> "StreamingDataFrame": """ - Merges two @see cl StreamingDataFrame and returns @see cl StreamingDataFrame. - *right* can be either a @see cl StreamingDataFrame or simply + Merges two see :class:`StreamingDataFrame` + and returns see :class:`StreamingDataFrame`. + *right* can be either a see :class:`StreamingDataFrame` or simply a :epkg:`pandas:DataFrame`. It calls :epkg:`pandas:DataFrame:merge` in a double loop, loop on *self*, loop on *right*. """ if isinstance(right, pandas.DataFrame): - return self.merge(StreamingDataFrame.read_df(right, chunksize=right.shape[0]), **kwargs) + return self.merge( + StreamingDataFrame.read_df(right, chunksize=right.shape[0]), **kwargs + ) def iterator_merge(sdf1, sdf2, **kw): "iterate on dataframes" @@ -664,18 +733,20 @@ def iterator_merge(sdf1, sdf2, **kw): yield df return StreamingDataFrame( - lambda: iterator_merge(self, right, **kwargs), **self.get_kwargs()) + lambda: iterator_merge(self, right, **kwargs), **self.get_kwargs() + ) - def concat(self, others, axis=0) -> 'StreamingDataFrame': + def concat(self, others, axis=0) -> "StreamingDataFrame": """ - Concatenates :epkg:`dataframes`. The function ensures all :epkg:`pandas:DataFrame` - or @see cl StreamingDataFrame share the same columns (name and type). + Concatenates :epkg:`dataframes`. + The function ensures all :epkg:`pandas:DataFrame` + or see :class:`StreamingDataFrame` share the same columns (name and type). Otherwise, the function fails as it cannot guess the schema without walking through all :epkg:`dataframes`. 
:param others: list, enumeration, :epkg:`pandas:DataFrame` :param axis: concatenate by rows (0) or by columns (1) - :return: @see cl StreamingDataFrame + :return: see :class:`StreamingDataFrame` """ if axis == 1: return self._concath(others) @@ -693,13 +764,14 @@ def iterateh(self, others): nrows = [_.shape[0] for _ in dfs] if min(nrows) != max(nrows): raise RuntimeError( - "StreamingDataFram cannot merge DataFrame with different size or chunksize") + "StreamingDataFram cannot merge DataFrame " + "with different size or chunksize" + ) yield pandas.concat(list(dfs), axis=1) return StreamingDataFrame(lambda: iterateh(self, others), **self.get_kwargs()) def _concatv(self, others): - def iterator_concat(this, lothers): "iterator on dataframes" columns = None @@ -715,10 +787,13 @@ def iterator_concat(this, lothers): if check: if list(columns) != list(df.columns): raise ValueError( - f"Frame others[{i}] do not have the same column names or the same order.") + f"Frame others[{i}] do not have the " + f"same column names or the same order." + ) if list(dtypes) != list(df.dtypes): raise ValueError( - f"Frame others[{i}] do not have the same column types.") + f"Frame others[{i}] do not have the same column types." + ) check = False yield df @@ -736,23 +811,25 @@ def change_type(obj): others = list(map(change_type, others)) return StreamingDataFrame( - lambda: iterator_concat(self, others), **self.get_kwargs()) + lambda: iterator_concat(self, others), **self.get_kwargs() + ) - def groupby(self, by=None, lambda_agg=None, lambda_agg_agg=None, - in_memory=True, **kwargs) -> pandas.DataFrame: + def groupby( + self, by=None, lambda_agg=None, lambda_agg_agg=None, in_memory=True, **kwargs + ) -> pandas.DataFrame: """ Implements the streaming :epkg:`pandas:DataFrame:groupby`. We assume the result holds in memory. The out-of-memory is not implemented yet. - @param by see :epkg:`pandas:DataFrame:groupby` - @param in_memory in-memory algorithm - @param lambda_agg aggregation function, *sum* by default - @param lambda_agg_agg to aggregate the aggregations, *sum* by default - @param kwargs additional parameters for :epkg:`pandas:DataFrame:groupby` - @return :epkg:`pandas:DataFrame` + :param by: see :epkg:`pandas:DataFrame:groupby` + :param in_memory: in-memory algorithm + :param lambda_agg: aggregation function, *sum* by default + :param lambda_agg_agg: to aggregate the aggregations, *sum* by default + :param kwargs: additional parameters for :epkg:`pandas:DataFrame:groupby` + :return: :epkg:`pandas:DataFrame` - As the input @see cl StreamingDataFrame does not necessarily hold + As the input see :class:`StreamingDataFrame` does not necessarily hold in memory, the aggregation must be done at every iteration. There are two levels of aggregation: one to reduce every iterated :epkg:`dataframe`, another one to combine all the reduced :epkg:`dataframes`. @@ -772,7 +849,7 @@ def groupby(self, by=None, lambda_agg=None, lambda_agg_agg=None, :tag: streaming Here is an example which shows how to write a simple *groupby* - with :epkg:`pandas` and @see cl StreamingDataFrame. + with :epkg:`pandas` and see :class:`StreamingDataFrame`. .. 
runpython:: :showcode: @@ -790,17 +867,20 @@ def groupby(self, by=None, lambda_agg=None, lambda_agg_agg=None, print(df.groupby("A").sum()) """ if not in_memory: - raise NotImplementedError( - "Out-of-memory group by is not implemented.") + raise NotImplementedError("Out-of-memory group by is not implemented.") if lambda_agg is None: + def lambda_agg_(gr): "sum" return gr.sum() + lambda_agg = lambda_agg_ if lambda_agg_agg is None: + def lambda_agg_agg_(gr): "sum" return gr.sum() + lambda_agg_agg = lambda_agg_agg_ ckw = kwargs.copy() ckw["as_index"] = False @@ -812,8 +892,15 @@ def lambda_agg_agg_(gr): conc = pandas.concat(agg, sort=False) return lambda_agg_agg(conc.groupby(by=by, **kwargs)) - def groupby_streaming(self, by=None, lambda_agg=None, lambda_agg_agg=None, in_memory=True, - strategy='cum', **kwargs) -> pandas.DataFrame: + def groupby_streaming( + self, + by=None, + lambda_agg=None, + lambda_agg_agg=None, + in_memory=True, + strategy="cum", + **kwargs, + ) -> pandas.DataFrame: """ Implements the streaming :epkg:`pandas:DataFrame:groupby`. We assume the result holds in memory. The out-of-memory is @@ -827,7 +914,7 @@ def groupby_streaming(self, by=None, lambda_agg=None, lambda_agg_agg=None, in_me :param strategy: ``'cum'``, or ``'streaming'``, see below :return: :epkg:`pandas:DataFrame` - As the input @see cl StreamingDataFrame does not necessarily hold + As the input see :class:`StreamingDataFrame` does not necessarily hold in memory, the aggregation must be done at every iteration. There are two levels of aggregation: one to reduce every iterated :epkg:`dataframe`, another one to combine all the reduced :epkg:`dataframes`. @@ -846,7 +933,7 @@ def groupby_streaming(self, by=None, lambda_agg=None, lambda_agg_agg=None, in_me First one if ``strategy is None`` goes through the whole datasets to produce a final :epkg:`DataFrame`. Second if ``strategy=='cum'`` returns a - @see cl StreamingDataFrame, each iteration produces + see :class:`StreamingDataFrame`, each iteration produces the current status of the *group by*. Last case, ``strategy=='streaming'`` produces :epkg:`DataFrame` which must be concatenated into a single :epkg:`DataFrame` @@ -857,7 +944,7 @@ def groupby_streaming(self, by=None, lambda_agg=None, lambda_agg_agg=None, in_me :tag: streaming Here is an example which shows how to write a simple *groupby* - with :epkg:`pandas` and @see cl StreamingDataFrame. + with :epkg:`pandas` and see :class:`StreamingDataFrame`. .. 
runpython:: :showcode: @@ -876,22 +963,26 @@ def groupby_streaming(self, by=None, lambda_agg=None, lambda_agg_agg=None, in_me print(gr) """ if not in_memory: - raise NotImplementedError( - "Out-of-memory group by is not implemented.") + raise NotImplementedError("Out-of-memory group by is not implemented.") if lambda_agg is None: + def lambda_agg_(gr): "sum" return gr.sum() + lambda_agg = lambda_agg_ if lambda_agg_agg is None: + def lambda_agg_agg_(gr): "sum" return gr.sum() + lambda_agg_agg = lambda_agg_agg_ ckw = kwargs.copy() ckw["as_index"] = False - if strategy == 'cum': + if strategy == "cum": + def iterate_cum(): agg = None for df in self: @@ -904,18 +995,20 @@ def iterate_cum(): lagg = pandas.concat([agg, gragg], sort=False) yield lambda_agg_agg(lagg.groupby(by=by, **kwargs)) agg = lagg + return StreamingDataFrame(lambda: iterate_cum(), **self.get_kwargs()) - if strategy == 'streaming': + if strategy == "streaming": + def iterate_streaming(): for df in self: gr = df.groupby(by=by, **ckw) gragg = lambda_agg(gr) yield lambda_agg(gragg.groupby(by=by, **kwargs)) + return StreamingDataFrame(lambda: iterate_streaming(), **self.get_kwargs()) - raise ValueError( # pragma: no cover - f"Unknown strategy '{strategy}'") + raise ValueError(f"Unknown strategy '{strategy}'") # pragma: no cover def ensure_dtype(self, df, dtypes): """ @@ -942,7 +1035,8 @@ def __getitem__(self, *args): """ if len(args) != 1: raise NotImplementedError( # pragma: no cover - "Only a list of columns is supported.") + "Only a list of columns is supported." + ) cols = args[0] if isinstance(cols, str): # One column. @@ -953,6 +1047,7 @@ def iterate_col(): one_col = [cols] for df in iter_creation(): yield df[one_col] + return StreamingSeries(iterate_col, **self.get_kwargs()) if not isinstance(cols, list): @@ -970,8 +1065,7 @@ def __setitem__(self, index, value): Limited set of operators are supported. """ if not isinstance(index, str): - raise ValueError( - f"Only column affected are supported but index={index!r}.") + raise ValueError(f"Only column affected are supported but index={index!r}.") if isinstance(value, (int, float, numpy.number, str)): # Is is equivalent to add_column. iter_creation = self.iter_creation @@ -997,7 +1091,8 @@ def iterate_fct(): raise RuntimeError( "Chunksize or shape are different when " "iterating on two StreamDataFrame at the same " - "time: %r != %r." % (df.shape[0], dfs.shape[0])) + "time: %r != %r." % (df.shape[0], dfs.shape[0]) + ) dfc = df.copy() dfc[index] = dfs yield dfc @@ -1005,8 +1100,9 @@ def iterate_fct(): self.iter_creation = iterate_fct else: raise NotImplementedError( - "Not implemented for type(index)=%r and type(value)=%r." % ( - type(index), type(value))) + "Not implemented for type(index)=%r and type(value)=%r." + % (type(index), type(value)) + ) def add_column(self, col, value): """ @@ -1014,12 +1110,12 @@ def add_column(self, col, value): offers for the operator ``[]``. @param col new column - @param value @see cl StreamingDataFrame or a lambda function - @return @see cl StreamingDataFrame + @param value see :class:`StreamingDataFrame` or a lambda function + @return see :class:`StreamingDataFrame` ..note:: - If value is a @see cl StreamingDataFrame, + If value is a see :class:`StreamingDataFrame`, *chunksize* must be the same for both. .. exref:: @@ -1043,9 +1139,11 @@ def add_column(self, col, value): """ if not isinstance(col, str): raise NotImplementedError( # pragma: no cover - "Only a column as a string is supported.") + "Only a column as a string is supported." 
+            )
         if isfunction(value):
+
             def iterate_fct(self, value, col):
                 "iterate on rows"
                 for df in self:
@@ -1053,11 +1151,12 @@ def iterate_fct(self, value, col):
                     dfc.insert(dfc.shape[1], col, dfc.apply(value, axis=1))
                     yield dfc
 
-            return StreamingDataFrame(lambda: iterate_fct(self, value, col), **self.get_kwargs())
+            return StreamingDataFrame(
+                lambda: iterate_fct(self, value, col), **self.get_kwargs()
+            )
 
         if isinstance(value, (pandas.Series, pandas.DataFrame, StreamingDataFrame)):
-            raise NotImplementedError(
-                "Unable set a new column based on a datadframe.")
+            raise NotImplementedError("Unable to set a new column based on a dataframe.")
 
         def iterate_cst(self, value, col):
             "iterate on rows"
@@ -1067,7 +1166,8 @@ def iterate_cst(self, value, col):
                 yield dfc
 
         return StreamingDataFrame(
-            lambda: iterate_cst(self, value, col), **self.get_kwargs())
+            lambda: iterate_cst(self, value, col), **self.get_kwargs()
+        )
 
     def fillna(self, **kwargs):
         """
@@ -1075,7 +1175,7 @@ def fillna(self, **kwargs):
         :epkg:`pandas:DataFrame:fillna`.
 
         @param      kwargs      see :epkg:`pandas:DataFrame:fillna`
-        @return                 @see cl StreamingDataFrame
+        @return                 see :class:`StreamingDataFrame`
 
         .. warning::
             The function does not check what happens at the
@@ -1085,8 +1185,8 @@ def fillna(self, **kwargs):
 
         def iterate_na(self, **kwargs):
             "iterate on rows"
-            if kwargs.get('inplace', True):
-                kwargs['inplace'] = True
+            if kwargs.get("inplace", True):
+                kwargs["inplace"] = True
                 for df in self:
                     df.fillna(**kwargs)
                     yield df
@@ -1095,7 +1195,8 @@ def iterate_na(self, **kwargs):
                     yield df.fillna(**kwargs)
 
         return StreamingDataFrame(
-            lambda: iterate_na(self, **kwargs), **self.get_kwargs())
+            lambda: iterate_na(self, **kwargs), **self.get_kwargs()
+        )
 
     def describe(self, percentiles=None, include=None, exclude=None):
         """
@@ -1115,31 +1216,36 @@ def describe(self, percentiles=None, include=None, exclude=None):
         """
         merged = None
         stack = []
-        notper = ['count', 'mean', 'std']
+        notper = ["count", "mean", "std"]
         for df in self:
             desc = df.describe(
-                percentiles=percentiles, include=include, exclude=exclude)
-            count = desc.loc['count', :]
+                percentiles=percentiles, include=include, exclude=exclude
+            )
+            count = desc.loc["count", :]
             rows = [name for name in desc.index if name not in notper]
             stack.append(desc.loc[rows, :])
             if merged is None:
                 merged = desc
-                merged.loc['std', :] = (
-                    merged.loc['std', :] ** 2 + merged.loc['mean', :] ** 2) * count
-                merged.loc['mean', :] *= count
+                merged.loc["std", :] = (
+                    merged.loc["std", :] ** 2 + merged.loc["mean", :] ** 2
+                ) * count
+                merged.loc["mean", :] *= count
             else:
-                merged.loc['count', :] += desc.loc['count', :]
-                merged.loc['mean', :] += desc.loc['mean', :] * count
-                merged.loc['std', :] += (
-                    desc.loc['std', :] ** 2 + desc.loc['mean', :] ** 2) * count
-                merged.loc['max', :] = numpy.maximum(
-                    merged.loc['max', :], desc.loc['max', :])
-                merged.loc['min', :] = numpy.maximum(
-                    merged.loc['min', :], desc.loc['min', :])
-        merged.loc['mean', :] /= merged.loc['count', :]
-        merged.loc['std', :] = (
-            merged.loc['std', :] / merged.loc['count', :] -
-            merged.loc['mean', :] ** 2) ** 0.5
+                merged.loc["count", :] += desc.loc["count", :]
+                merged.loc["mean", :] += desc.loc["mean", :] * count
+                merged.loc["std", :] += (
+                    desc.loc["std", :] ** 2 + desc.loc["mean", :] ** 2
+                ) * count
+                merged.loc["max", :] = numpy.maximum(
+                    merged.loc["max", :], desc.loc["max", :]
+                )
+                merged.loc["min", :] = numpy.minimum(
+                    merged.loc["min", :], desc.loc["min", :]
+                )
+        merged.loc["mean", :] /= merged.loc["count", :]
+        merged.loc["std", :] = 
( + merged.loc["std", :] / merged.loc["count", :] - merged.loc["mean", :] ** 2 + ) ** 0.5 values = pandas.concat(stack) summary = values.describe(percentiles=percentiles) merged = merged.loc[notper, :] @@ -1147,9 +1253,15 @@ def describe(self, percentiles=None, include=None, exclude=None): summary = summary.loc[rows, :] return pandas.concat([merged, summary]) - def sort_values(self, by, axis=0, ascending=True, kind='quicksort', - na_position='last', - temp_file='_pandas_streaming_sort_values_'): + def sort_values( + self, + by, + axis=0, + ascending=True, + kind="quicksort", + na_position="last", + temp_file="_pandas_streaming_sort_values_", + ): """ Sorts the streaming dataframe by values. @@ -1166,14 +1278,16 @@ def sort_values(self, by, axis=0, ascending=True, kind='quicksort', """ if not isinstance(by, str): raise NotImplementedError( # pragma: no cover - f"Only one column can be used to sort not {by!r}.") + f"Only one column can be used to sort not {by!r}." + ) keys = {} nans = [] indices = [] - with open(temp_file, 'wb') as f: + with open(temp_file, "wb") as f: for df in self: - dfs = df.sort_values(by, ascending=ascending, kind=kind, - na_position=na_position) + dfs = df.sort_values( + by, ascending=ascending, kind=kind, na_position=na_position + ) for tu in dfs[by]: if isinstance(tu, float) and numpy.isnan(tu): nans.append(len(indices)) @@ -1192,10 +1306,8 @@ def sort_values(self, by, axis=0, ascending=True, kind='quicksort', values.sort(reverse=not ascending) def iterate(): - - with open(temp_file, 'rb') as f: - - if na_position == 'first': + with open(temp_file, "rb") as f: + if na_position == "first": for p in nans: f.seek(indices[p]) length = indices[p + 1] - indices[p] @@ -1213,7 +1325,7 @@ def iterate(): sub = dfs[dfs[by] == key] yield sub - if na_position == 'last': + if na_position == "last": for p in nans: f.seek(indices[p]) length = indices[p + 1] - indices[p] @@ -1222,8 +1334,7 @@ def iterate(): sub = dfs[numpy.isnan(dfs[by])] yield sub - res = StreamingDataFrame( - lambda: iterate(), **self.get_kwargs()) + res = StreamingDataFrame(lambda: iterate(), **self.get_kwargs()) res._delete_.append(lambda: os.remove(temp_file)) return res @@ -1237,25 +1348,26 @@ def __del__(self): class StreamingSeries(StreamingDataFrame): """ - Seens as a @see cl StreamingDataFrame of one column. + Seens as a see :class:`StreamingDataFrame` of one column. """ def __init__(self, iter_creation, check_schema=True, stable=True): StreamingDataFrame.__init__( - self, iter_creation, check_schema=check_schema, stable=stable) + self, iter_creation, check_schema=check_schema, stable=stable + ) if len(self.columns) != 1: raise RuntimeError( # pragma: no cover - f"A series can contain only one column not " - f"{len(self.columns)!r}.") + f"A series can contain only one column not " f"{len(self.columns)!r}." + ) - def apply(self, *args, **kwargs) -> 'StreamingDataFrame': + def apply(self, *args, **kwargs) -> "StreamingDataFrame": """ Applies :epkg:`pandas:Series:apply`. This function returns a @see cl StreamingSeries. 
""" return StreamingSeries( - lambda: map(lambda df: df.apply(*args, **kwargs), self), - **self.get_kwargs()) + lambda: map(lambda df: df.apply(*args, **kwargs), self), **self.get_kwargs() + ) def __add__(self, value): """ @@ -1264,6 +1376,7 @@ def __add__(self, value): :param value: any value which makes sense :return: a new series """ + def iterate(): for df in self: yield df + value diff --git a/pandas_streaming/df/dataframe_helpers.py b/pandas_streaming/df/dataframe_helpers.py index 3dc8f3a..b85d78a 100644 --- a/pandas_streaming/df/dataframe_helpers.py +++ b/pandas_streaming/df/dataframe_helpers.py @@ -1,8 +1,3 @@ -# -*- coding: utf-8 -*- -""" -@file -@brief Helpers for dataframes. -""" import hashlib import struct import warnings @@ -17,25 +12,27 @@ def numpy_types(): :return: list of types """ - return [numpy.bool_, - numpy.int_, - numpy.intc, - numpy.intp, - numpy.int8, - numpy.int16, - numpy.int32, - numpy.int64, - numpy.uint8, - numpy.uint16, - numpy.uint32, - numpy.uint64, - numpy.float_, - numpy.float16, - numpy.float32, - numpy.float64, - numpy.complex_, - numpy.complex64, - numpy.complex128] + return [ + numpy.bool_, + numpy.int_, + numpy.intc, + numpy.intp, + numpy.int8, + numpy.int16, + numpy.int32, + numpy.int64, + numpy.uint8, + numpy.uint16, + numpy.uint32, + numpy.uint64, + numpy.float_, + numpy.float16, + numpy.float32, + numpy.float64, + numpy.complex_, + numpy.complex64, + numpy.complex128, + ] def hash_str(c, hash_length): @@ -78,7 +75,7 @@ def hash_int(c, hash_length): r = m.hexdigest() if len(r) >= hash_length: r = r[:hash_length] - return int(r, 16) % (10 ** 8) + return int(r, 16) % (10**8) def hash_float(c, hash_length): @@ -98,7 +95,7 @@ def hash_float(c, hash_length): r = m.hexdigest() if len(r) >= hash_length: r = r[:hash_length] - i = int(r, 16) % (2 ** 53) + i = int(r, 16) % (2**53) return float(i) @@ -153,8 +150,9 @@ def hash_floatl(c): "hash float" return hash_float(c, hash_length) - coltype = {n: t for n, t in zip( # pylint: disable=R1721 - df.columns, df.dtypes)} # pylint: disable=R1721 + coltype = { + n: t for n, t in zip(df.columns, df.dtypes) # pylint: disable=R1721 + } # pylint: disable=R1721 for c in cols: t = coltype[c] if t == int: @@ -167,7 +165,8 @@ def hash_floatl(c): df[c] = df[c].apply(hash_strl) else: raise NotImplementedError( # pragma: no cover - f"Conversion of type {t} in column '{c}' is not implemented") + f"Conversion of type {t} in column '{c}' is not implemented" + ) return df @@ -204,8 +203,9 @@ def dataframe_unfold(df, col, new_col=None, sep=","): print(df2) # To fold: - folded = df2.groupby('a').apply(lambda row: ','.join(row['b_unfold'].dropna()) \\ - if len(row['b_unfold'].dropna()) > 0 else numpy.nan) + folded = df2.groupby('a').apply( + lambda row: ','.join(row['b_unfold'].dropna()) + if len(row['b_unfold'].dropna()) > 0 else numpy.nan) print('----------') print(folded) """ @@ -213,7 +213,7 @@ def dataframe_unfold(df, col, new_col=None, sep=","): col_name = col + "_unfold" else: col_name = new_col - temp_col = '__index__' + temp_col = "__index__" while temp_col in df.columns: temp_col += "_" rows = [] @@ -306,7 +306,9 @@ def pandas_fillna(df, by, hasna=None, suffix=None): else: raise TypeError( # pragma: no cover "Unable to determine a constant for type='{0}' dtype='{1}'".format( - val, df[c].dtype)) + val, df[c].dtype + ) + ) val += cst while val in se: val += suffix @@ -318,17 +320,20 @@ def pandas_fillna(df, by, hasna=None, suffix=None): ma = abs(dr.max()) val = ma + mi if val == ma and not isinstance(val, str): - val += ma 
+ 1. + val += ma + 1.0 if val <= ma: raise ValueError( # pragma: no cover "Unable to find a different value for column '{}' v='{}: " - "min={} max={}".format(c, val, mi, ma)) + "min={} max={}".format(c, val, mi, ma) + ) df[c].fillna(val, inplace=True) rep[c] = val return rep, df -def pandas_groupby_nan(df, by, axis=0, as_index=False, suffix=None, nanback=True, **kwargs): +def pandas_groupby_nan( + df, by, axis=0, as_index=False, suffix=None, nanback=True, **kwargs +): """ Does a *groupby* including keeping missing values (:epkg:`nan`). @@ -391,8 +396,7 @@ def pandas_groupby_nan(df, by, axis=0, as_index=False, suffix=None, nanback=True """ if nanback and suffix is None: try: - res = df.groupby(by, axis=axis, as_index=as_index, - dropna=False, **kwargs) + res = df.groupby(by, axis=axis, as_index=as_index, dropna=False, **kwargs) except TypeError: # old version of pandas res = None @@ -421,71 +425,91 @@ def pandas_groupby_nan(df, by, axis=0, as_index=False, suffix=None, nanback=True if not nanback: dummy = DataFrame([{"a": "a"}]) do = dummy.dtypes[0] - typ = {c: t for c, t in zip( # pylint: disable=R1721 - df.columns, df.dtypes)} # pylint: disable=R1721 + typ = { + c: t for c, t in zip(df.columns, df.dtypes) # pylint: disable=R1721 + } # pylint: disable=R1721 if typ[by[0]] != do: warnings.warn( # pragma: no cover - f"[pandas_groupby_nan] NaN value: {rep}") + f"[pandas_groupby_nan] NaN value: {rep}" + ) return res for b in by: fnan = rep[b] if fnan in res.grouper.groups: res.grouper.groups[numpy.nan] = res.grouper.groups[fnan] del res.grouper.groups[fnan] - new_val = list((numpy.nan if b == fnan else b) - for b in res.grouper.result_index) + new_val = list( + (numpy.nan if b == fnan else b) for b in res.grouper.result_index + ) res.grouper.groupings[0]._group_index = Index(new_val) - res.grouper.groupings[0].obj[b].replace( - fnan, numpy.nan, inplace=True) - if hasattr(res.grouper, 'grouping'): + res.grouper.groupings[0].obj[b].replace(fnan, numpy.nan, inplace=True) + if hasattr(res.grouper, "grouping"): if isinstance(res.grouper.groupings[0].grouper, numpy.ndarray): arr = numpy.array(new_val) res.grouper.groupings[0].grouper = arr - if (hasattr(res.grouper.groupings[0], '_cache') and - 'result_index' in res.grouper.groupings[0]._cache): - del res.grouper.groupings[0]._cache['result_index'] + if ( + hasattr(res.grouper.groupings[0], "_cache") + and "result_index" in res.grouper.groupings[0]._cache + ): + del res.grouper.groupings[0]._cache["result_index"] else: - raise NotImplementedError("Not implemented for type: {0}".format( - type(res.grouper.groupings[0].grouper))) + raise NotImplementedError( + "Not implemented for type: {0}".format( + type(res.grouper.groupings[0].grouper) + ) + ) else: grouper = res.grouper._get_grouper() if isinstance(grouper, numpy.ndarray): arr = numpy.array(new_val) res.grouper.groupings[0].grouping_vector = arr - if (hasattr(res.grouper.groupings[0], '_cache') and - 'result_index' in res.grouper.groupings[0]._cache): - index = res.grouper.groupings[0]._cache['result_index'] + if ( + hasattr(res.grouper.groupings[0], "_cache") + and "result_index" in res.grouper.groupings[0]._cache + ): + index = res.grouper.groupings[0]._cache["result_index"] if len(rep) == 1: key = list(rep.values())[0] new_index = numpy.array(index) - for i in range(0, len(new_index)): # pylint: disable=C0200 + for i in range( + 0, len(new_index) + ): # pylint: disable=C0200 if new_index[i] == key: new_index[i] = numpy.nan - res.grouper.groupings[0]._cache['result_index'] = ( - 
index.__class__(new_index)) + res.grouper.groupings[0]._cache[ + "result_index" + ] = index.__class__(new_index) else: raise NotImplementedError( # pragma: no cover - "NaN values not implemented for multiindex.") + "NaN values not implemented for multiindex." + ) else: raise NotImplementedError( # pragma: no cover "Not implemented for type: {0}".format( - type(res.grouper.groupings[0].grouper))) - res.grouper._cache['result_index'] = res.grouper.groupings[0]._group_index + type(res.grouper.groupings[0].grouper) + ) + ) + res.grouper._cache["result_index"] = res.grouper.groupings[ + 0 + ]._group_index else: if not nanback: dummy = DataFrame([{"a": "a"}]) do = dummy.dtypes[0] - typ = {c: t for c, t in zip( # pylint: disable=R1721 - df.columns, df.dtypes)} # pylint: disable=R1721 + typ = { + c: t for c, t in zip(df.columns, df.dtypes) # pylint: disable=R1721 + } # pylint: disable=R1721 for b in by: if typ[b] != do: warnings.warn( # pragma: no cover - f"[pandas_groupby_nan] NaN values: {rep}") + f"[pandas_groupby_nan] NaN values: {rep}" + ) break return res raise NotImplementedError( "Not yet implemented. Replacing pseudo nan values by real nan " - "values is not as easy as it looks. Use nanback=False") + "values is not as easy as it looks. Use nanback=False" + ) # keys = list(res.grouper.groups.keys()) # didit = False @@ -528,7 +552,8 @@ def pandas_groupby_nan(df, by, axis=0, as_index=False, suffix=None, nanback=True # grou.grouper = numpy.array(new_val) # else: # raise NotImplementedError( - # "Not implemented for type: {0}".format(type(grou.grouper))) + # "Not implemented for type: {0}".format( + # type(grou.grouper))) # del res.grouper._cache return res return df.groupby(by, axis=axis, **kwargs) diff --git a/pandas_streaming/df/dataframe_io.py b/pandas_streaming/df/dataframe_io.py index 30d0fb8..532a2bf 100644 --- a/pandas_streaming/df/dataframe_io.py +++ b/pandas_streaming/df/dataframe_io.py @@ -1,8 +1,3 @@ -# -*- coding: utf-8 -*- -""" -@file -@brief Saves and reads a :epkg:`dataframe` into a :epkg:`zip` file. -""" import io import os import zipfile @@ -66,35 +61,38 @@ def to_zip(df, zipfilename, zname="df.csv", **kwargs): if isinstance(df, pandas.DataFrame): stb = io.StringIO() ext = os.path.splitext(zname)[-1] - if ext == '.npy': + if ext == ".npy": raise ValueError( # pragma: no cover - "Extension '.npy' cannot be used to save a dataframe.") + "Extension '.npy' cannot be used to save a dataframe." + ) df.to_csv(stb, **kwargs) elif isinstance(df, numpy.ndarray): stb = io.BytesIO() ext = os.path.splitext(zname)[-1] - if ext != '.npy': + if ext != ".npy": raise ValueError( # pragma: no cover - "Extension '.npy' is required when saving a numpy array.") + "Extension '.npy' is required when saving a numpy array." + ) numpy.save(stb, df, **kwargs) else: - raise TypeError( # pragma: no cover - f"Type not handled {type(df)}") + raise TypeError(f"Type not handled {type(df)}") # pragma: no cover text = stb.getvalue() if isinstance(zipfilename, str): ext = os.path.splitext(zipfilename)[-1] - if ext != '.zip': + if ext != ".zip": raise NotImplementedError( # pragma: no cover - f"Only zip file are implemented not '{ext}'.") - zf = zipfile.ZipFile(zipfilename, 'w') # pylint: disable=R1732 + f"Only zip file are implemented not '{ext}'." 
+ ) + zf = zipfile.ZipFile(zipfilename, "w") # pylint: disable=R1732 close = True elif isinstance(zipfilename, zipfile.ZipFile): zf = zipfilename close = False else: raise TypeError( # pragma: no cover - f"No implementation for type '{type(zipfilename)}'") + f"No implementation for type '{type(zipfilename)}'" + ) zf.writestr(zname, text) if close: @@ -113,24 +111,26 @@ def read_zip(zipfilename, zname=None, **kwargs): """ if isinstance(zipfilename, str): ext = os.path.splitext(zipfilename)[-1] - if ext != '.zip': + if ext != ".zip": raise NotImplementedError( # pragma: no cover - f"Only zip files are supported not '{ext}'.") - zf = zipfile.ZipFile(zipfilename, 'r') # pylint: disable=R1732 + f"Only zip files are supported not '{ext}'." + ) + zf = zipfile.ZipFile(zipfilename, "r") # pylint: disable=R1732 close = True elif isinstance(zipfilename, zipfile.ZipFile): zf = zipfilename close = False else: raise TypeError( # pragma: no cover - f"No implementation for type '{type(zipfilename)}'") + f"No implementation for type '{type(zipfilename)}'" + ) if zname is None: zname = zf.namelist()[0] content = zf.read(zname) stb = io.BytesIO(content) ext = os.path.splitext(zname)[-1] - if ext == '.npy': + if ext == ".npy": df = numpy.load(stb, **kwargs) else: df = pandas.read_csv(stb, **kwargs) diff --git a/pandas_streaming/df/dataframe_io_helpers.py b/pandas_streaming/df/dataframe_io_helpers.py index 4ae503b..d956cf9 100644 --- a/pandas_streaming/df/dataframe_io_helpers.py +++ b/pandas_streaming/df/dataframe_io_helpers.py @@ -1,10 +1,6 @@ -# -*- coding: utf-8 -*- -""" -@file -@brief Saves and reads a :epkg:`dataframe` into a :epkg:`zip` file. -""" import os from io import StringIO, BytesIO + try: from ujson import dumps except ImportError: # pragma: no cover @@ -46,10 +42,10 @@ def readline(self, size=-1): if size == 0: return text if self.newline: - text = ',' + text + text = "," + text self.newline = False elif self.begin: - text = '[' + text + text = "[" + text self.begin = False if text.endswith("\n"): @@ -58,7 +54,7 @@ def readline(self, size=-1): if len(text) == 0 or len(text) < size: if self.end: self.end = False - return text + ']' + return text + "]" return text return text @@ -76,7 +72,7 @@ def read(self, size=-1): if size == 0: return text if len(text) > 1: - t1, t2 = text[:len(text) - 1], text[len(text) - 1:] + t1, t2 = text[: len(text) - 1], text[len(text) - 1 :] t1 = t1.replace(cst[0], cst[1]) text = t1 + t2 @@ -101,11 +97,13 @@ def getvalue(self): """ Returns the whole stream content. 
""" + def byline(): line = self.readline() while line: yield line line = self.readline() + return "".join(byline()) @@ -129,8 +127,7 @@ def _flatten(obj, key): elif isinstance(obj, dict): for k, v in obj.items(): if not isinstance(k, str): - raise TypeError( - "All keys must a string.") # pragma: no cover + raise TypeError("All keys must a string.") # pragma: no cover k2 = k if key is None else f"{key}{sep}{k}" _flatten(v, k2) elif isinstance(obj, (list, set)): @@ -234,27 +231,28 @@ def enumerate_json_items(filename, encoding=None, lines=False, flatten=False): if "{" not in filename and os.path.exists(filename): with open(filename, "r", encoding=encoding) as f: for el in enumerate_json_items( - f, encoding=encoding, lines=lines, - flatten=flatten): + f, encoding=encoding, lines=lines, flatten=flatten + ): yield el else: st = StringIO(filename) for el in enumerate_json_items( - st, encoding=encoding, lines=lines, - flatten=flatten): + st, encoding=encoding, lines=lines, flatten=flatten + ): yield el elif isinstance(filename, bytes): st = BytesIO(filename) for el in enumerate_json_items( - st, encoding=encoding, lines=lines, flatten=flatten): + st, encoding=encoding, lines=lines, flatten=flatten + ): yield el elif lines: for el in enumerate_json_items( - JsonPerRowsStream(filename), - encoding=encoding, lines=False, flatten=flatten): + JsonPerRowsStream(filename), encoding=encoding, lines=False, flatten=flatten + ): yield el else: - if hasattr(filename, 'seek'): + if hasattr(filename, "seek"): filename.seek(0) parser = ijson.parse(filename) current = None @@ -264,14 +262,16 @@ def enumerate_json_items(filename, encoding=None, lines=False, flatten=False): for i, (_, event, value) in enumerate(parser): if i % 1000000 == 0 and fLOG is not None: fLOG( # pragma: no cover - f"[enumerate_json_items] i={i} yielded={nbyield}") + f"[enumerate_json_items] i={i} yielded={nbyield}" + ) if event == "start_array": if curkey is None: current = [] else: if not isinstance(current, dict): raise RuntimeError( # pragma: no cover - f"Type issue {type(current)}") + f"Type issue {type(current)}" + ) c = [] current[curkey] = c # pylint: disable=E1137 current = c @@ -321,8 +321,7 @@ def enumerate_json_items(filename, encoding=None, lines=False, flatten=False): current[curkey] = None # pylint: disable=E1137 curkey = None else: - raise ValueError( - f"Unknown event '{event}'") # pragma: no cover + raise ValueError(f"Unknown event '{event}'") # pragma: no cover class JsonIterator2Stream: @@ -418,8 +417,7 @@ def seek(self, offset): :param offset: offset, only 0 is implemented """ if offset != 0: - raise NotImplementedError( - "The iterator can only return at the beginning.") + raise NotImplementedError("The iterator can only return at the beginning.") self.it0 = self.it() def write(self): diff --git a/pandas_streaming/df/dataframe_split.py b/pandas_streaming/df/dataframe_split.py index 197d4ff..bb7ea33 100644 --- a/pandas_streaming/df/dataframe_split.py +++ b/pandas_streaming/df/dataframe_split.py @@ -1,8 +1,3 @@ -# -*- coding: utf-8 -*- -""" -@file -@brief Implements different methods to split a dataframe. -""" import hashlib import pickle import random @@ -11,15 +6,16 @@ import pandas -def sklearn_train_test_split(self, path_or_buf=None, export_method="to_csv", - names=None, **kwargs): +def sklearn_train_test_split( + self, path_or_buf=None, export_method="to_csv", names=None, **kwargs +): """ Randomly splits a dataframe into smaller pieces. The function returns streams of file names. 
The function relies on :epkg:`sklearn:model_selection:train_test_split`. It does not handle stratified version of it. - @param self @see cl StreamingDataFrame + @param self see :class:`StreamingDataFrame` @param path_or_buf a string, a list of strings or buffers, if it is a string, it must contain ``{}`` like ``partition{}.txt`` @param export_method method used to store the partitions, by default @@ -30,7 +26,7 @@ def sklearn_train_test_split(self, path_or_buf=None, export_method="to_csv", @return outputs of the exports functions The function cannot return two iterators or two - @see cl StreamingDataFrame because running through one + see :class:`StreamingDataFrame` because running through one means running through the other. We can assume both splits do not hold in memory and we cannot run through the same iterator again as random draws would be different. @@ -42,13 +38,13 @@ def sklearn_train_test_split(self, path_or_buf=None, export_method="to_csv", """ if kwargs.get("stratify") is not None: raise NotImplementedError( # pragma: no cover - "No implementation yet for the stratified version.") + "No implementation yet for the stratified version." + ) with warnings.catch_warnings(): warnings.filterwarnings("ignore", category=ImportWarning) from sklearn.model_selection import train_test_split # pylint: disable=C0415 - opts = ['test_size', 'train_size', - 'random_state', 'shuffle', 'stratify'] + opts = ["test_size", "train_size", "random_state", "shuffle", "stratify"] split_ops = {} for o in opts: if o in kwargs: @@ -56,27 +52,28 @@ def sklearn_train_test_split(self, path_or_buf=None, export_method="to_csv", del kwargs[o] exportf_ = getattr(pandas.DataFrame, export_method) - if export_method == 'to_csv' and 'mode' not in kwargs: - exportf = lambda *a, **kw: exportf_(*a, mode='a', **kw) + if export_method == "to_csv" and "mode" not in kwargs: + exportf = lambda *a, **kw: exportf_(*a, mode="a", **kw) # noqa: E731 else: exportf = exportf_ if isinstance(path_or_buf, str): if "{}" not in path_or_buf: - raise ValueError( - "path_or_buf must contain {} to insert the partition name") + raise ValueError("path_or_buf must contain {} to insert the partition name") if names is None: - names = ['train', 'test'] + names = ["train", "test"] elif len(names) != len(path_or_buf): raise ValueError( # pragma: no cover - 'names and path_or_buf must have the same length') + "names and path_or_buf must have the same length" + ) path_or_buf = [path_or_buf.format(n) for n in names] elif path_or_buf is None: path_or_buf = [None, None] else: if not isinstance(path_or_buf, list): raise TypeError( # pragma: no cover - 'path_or_buf must be a list or a string') + "path_or_buf must be a list or a string" + ) bufs = [] close = [] @@ -85,8 +82,7 @@ def sklearn_train_test_split(self, path_or_buf=None, export_method="to_csv", st = StringIO() cl = False elif isinstance(p, str): - st = open( # pylint: disable=R1732 - p, "w", encoding=kwargs.get('encoding')) + st = open(p, "w", encoding=kwargs.get("encoding")) # pylint: disable=R1732 cl = True else: st = p @@ -98,34 +94,38 @@ def sklearn_train_test_split(self, path_or_buf=None, export_method="to_csv", train, test = train_test_split(df, **split_ops) exportf(train, bufs[0], **kwargs) exportf(test, bufs[1], **kwargs) - kwargs['header'] = False + kwargs["header"] = False for b, c in zip(bufs, close): if c: b.close() - return [st.getvalue() if isinstance(st, StringIO) else p - for st, p in zip(bufs, path_or_buf)] + return [ + st.getvalue() if isinstance(st, StringIO) else p + for st, p 
in zip(bufs, path_or_buf) + ] -def sklearn_train_test_split_streaming(self, test_size=0.25, train_size=None, - stratify=None, hash_size=9, unique_rows=False): +def sklearn_train_test_split_streaming( + self, test_size=0.25, train_size=None, stratify=None, hash_size=9, unique_rows=False +): """ Randomly splits a dataframe into smaller pieces. The function returns streams of file names. The function relies on :epkg:`sklearn:model_selection:train_test_split`. It handles the stratified version of it. - @param self @see cl StreamingDataFrame - @param test_size ratio for the test partition (if *train_size* is not specified) - @param train_size ratio for the train partition - @param stratify column holding the stratification - @param hash_size size of the hash to cache information about partition - @param unique_rows ensures that rows are unique - @return Two @see cl StreamingDataFrame, one + :param self: see :class:`StreamingDataFrame` + :param test_size: ratio for the test partition + (if *train_size* is not specified) + :param train_size: ratio for the train partition + :param stratify: column holding the stratification + :param hash_size: size of the hash to cache information about partition + :param unique_rows: ensures that rows are unique + :return: Two see :class:`StreamingDataFrame`, one for train, one for test. The function returns two iterators or two - @see cl StreamingDataFrame. It + see :class:`StreamingDataFrame`. It tries to do everything without writing anything on disk but it requires to store the repartition somehow. This function hashes every row and maps the hash with a part @@ -173,7 +173,7 @@ def iterator_rows(): random.shuffle(vr) if (0, k) in counts: tt = counts[1, k] + counts[0, k] - delta = - int(counts[0, k] - tt * p + 0.5) + delta = -int(counts[0, k] - tt * p + 0.5) else: delta = 0 i = int(len(v) * p + 0.5) @@ -199,7 +199,7 @@ def iterator_rows(): random.shuffle(vr) if (0, k) in counts: tt = counts[1, k] + counts[0, k] - delta = - int(counts[0, k] - tt * p + 0.5) + delta = -int(counts[0, k] - tt * p + 0.5) else: delta = 0 i = int(len(v) * p + 0.5) @@ -234,7 +234,8 @@ def iterator_internal(part_requested): raise ValueError( # pragma: no cover "A row or at least its hash is already cached. " "Increase hash_size or check for duplicates " - "('{0}')\n{1}.".format(h, obs)) + "('{0}')\n{1}.".format(h, obs) + ) if h not in cache: cache[h] = part else: @@ -242,8 +243,7 @@ def iterator_internal(part_requested): if part == part_requested: accumul.append(obs) if len(accumul) >= static_schema[2]: - dfo = pandas.DataFrame( - accumul, columns=static_schema[0]) + dfo = pandas.DataFrame(accumul, columns=static_schema[0]) self.ensure_dtype(dfo, static_schema[1]) iy += dfo.shape[0] accumul.clear() @@ -255,12 +255,13 @@ def iterator_internal(part_requested): part = cache.get(h) if part is None: raise ValueError( # pragma: no cover - f"Second iteration. A row was never met in the first one\n{obs}") + f"Second iteration. 
A row was " + f"never met in the first one\n{obs}" + ) if part == part_requested: accumul.append(obs) if len(accumul) >= static_schema[2]: - dfo = pandas.DataFrame( - accumul, columns=static_schema[0]) + dfo = pandas.DataFrame(accumul, columns=static_schema[0]) self.ensure_dtype(dfo, static_schema[1]) iy += dfo.shape[0] accumul.clear() @@ -271,5 +272,7 @@ def iterator_internal(part_requested): iy += dfo.shape[0] yield dfo - return (self.__class__(lambda: iterator_internal(0)), - self.__class__(lambda: iterator_internal(1))) + return ( + self.__class__(lambda: iterator_internal(0)), + self.__class__(lambda: iterator_internal(1)), + ) diff --git a/pandas_streaming/exc/__init__.py b/pandas_streaming/exc/__init__.py index 9979b62..a5e114d 100644 --- a/pandas_streaming/exc/__init__.py +++ b/pandas_streaming/exc/__init__.py @@ -1,6 +1 @@ -""" -@file -@brief Shortcuts to *exc*. -""" - -from .exc_streaming import StreamingInefficientException +from .exc_streaming import StreamingInefficientException # noqa: F401 diff --git a/pandas_streaming/exc/exc_streaming.py b/pandas_streaming/exc/exc_streaming.py index c7094e5..be0bd36 100644 --- a/pandas_streaming/exc/exc_streaming.py +++ b/pandas_streaming/exc/exc_streaming.py @@ -1,10 +1,3 @@ -# -*- coding: utf-8 -*- -""" -@file -@brief Defines a streming dataframe. -""" - - class StreamingInefficientException(Exception): """ Kind of operations doable with a :epkg:`pandas:DataFrame` @@ -18,5 +11,4 @@ def __init__(self, meth): :param meth: inefficient method """ - Exception.__init__( - self, f"{meth} should not be done in streaming mode.") + Exception.__init__(self, f"{meth} should not be done in streaming mode.") diff --git a/pyproject.toml b/pyproject.toml index bad7f7a..91b4010 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,3 +29,7 @@ max-complexity = 10 [tool.ruff.per-file-ignores] "_doc/examples/plot_first_example.py" = ["E402", "F811"] +"_unittests/ut_df/test_dataframe_io_helpers.py" = ["E501"] +"pandas_streaming/data/__init__.py" = ["F401"] +"pandas_streaming/df/__init__.py" = ["F401"] +"pandas_streaming/df/dataframe_io_helpers.py" = ["E501"] diff --git a/requirements-dev.txt b/requirements-dev.txt index 5ab8605..16ed1c5 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,5 +1,6 @@ autopep8 coverage +furo ijson jupyter_sphinx jyquickhelper @@ -10,10 +11,10 @@ Pillow pycodestyle pylint>=2.14.0 pyquickhelper>=1.10 -pyquicksetup scikit-learn scipy sphinx +sphinx-runpython sphinxcontrib.imagesvg sphinx_gallery ujson From 2525d9f4b16c553c846e414594d2a3d007c00ec6 Mon Sep 17 00:00:00 2001 From: Xavier Dupre Date: Sat, 22 Jul 2023 20:01:13 +0200 Subject: [PATCH 03/16] move --- _doc/{sphinxdoc/source => }/_static/git_logo.png | Bin _doc/{sphinxdoc/source => }/_static/project_ico.ico | Bin _doc/{sphinxdoc/source => }/_static/project_ico.png | Bin _doc/{sphinxdoc/source => }/api/index.rst | 0 _doc/{sphinxdoc/source => }/api/rdata.rst | 0 _doc/{sphinxdoc/source => }/api/rdf.rst | 0 _doc/{sphinxdoc/source => }/api/rexc.rst | 0 _doc/{sphinxdoc/source => }/api/rio.rst | 0 .../source => }/blog/2017/2017-09-21_first_day.rst | 0 .../source => }/blog/2018/2018-08-19_streamz.rst | 0 _doc/{sphinxdoc/source => }/conf.py | 0 _doc/{sphinxdoc/source => }/glossary.rst | 0 _doc/{sphinxdoc/source => }/i_ex.rst | 0 _doc/{sphinxdoc/source => }/i_examples.rst | 0 _doc/{sphinxdoc/source => }/i_faq.rst | 0 _doc/{sphinxdoc/source => }/i_index.rst | 0 _doc/{sphinxdoc/source => }/index.rst | 0 _doc/{sphinxdoc/source => }/license.rst | 0 _doc/{sphinxdoc/source => 
}/tutorial/index.rst | 0 19 files changed, 0 insertions(+), 0 deletions(-) rename _doc/{sphinxdoc/source => }/_static/git_logo.png (100%) rename _doc/{sphinxdoc/source => }/_static/project_ico.ico (100%) rename _doc/{sphinxdoc/source => }/_static/project_ico.png (100%) rename _doc/{sphinxdoc/source => }/api/index.rst (100%) rename _doc/{sphinxdoc/source => }/api/rdata.rst (100%) rename _doc/{sphinxdoc/source => }/api/rdf.rst (100%) rename _doc/{sphinxdoc/source => }/api/rexc.rst (100%) rename _doc/{sphinxdoc/source => }/api/rio.rst (100%) rename _doc/{sphinxdoc/source => }/blog/2017/2017-09-21_first_day.rst (100%) rename _doc/{sphinxdoc/source => }/blog/2018/2018-08-19_streamz.rst (100%) rename _doc/{sphinxdoc/source => }/conf.py (100%) rename _doc/{sphinxdoc/source => }/glossary.rst (100%) rename _doc/{sphinxdoc/source => }/i_ex.rst (100%) rename _doc/{sphinxdoc/source => }/i_examples.rst (100%) rename _doc/{sphinxdoc/source => }/i_faq.rst (100%) rename _doc/{sphinxdoc/source => }/i_index.rst (100%) rename _doc/{sphinxdoc/source => }/index.rst (100%) rename _doc/{sphinxdoc/source => }/license.rst (100%) rename _doc/{sphinxdoc/source => }/tutorial/index.rst (100%) diff --git a/_doc/sphinxdoc/source/_static/git_logo.png b/_doc/_static/git_logo.png similarity index 100% rename from _doc/sphinxdoc/source/_static/git_logo.png rename to _doc/_static/git_logo.png diff --git a/_doc/sphinxdoc/source/_static/project_ico.ico b/_doc/_static/project_ico.ico similarity index 100% rename from _doc/sphinxdoc/source/_static/project_ico.ico rename to _doc/_static/project_ico.ico diff --git a/_doc/sphinxdoc/source/_static/project_ico.png b/_doc/_static/project_ico.png similarity index 100% rename from _doc/sphinxdoc/source/_static/project_ico.png rename to _doc/_static/project_ico.png diff --git a/_doc/sphinxdoc/source/api/index.rst b/_doc/api/index.rst similarity index 100% rename from _doc/sphinxdoc/source/api/index.rst rename to _doc/api/index.rst diff --git a/_doc/sphinxdoc/source/api/rdata.rst b/_doc/api/rdata.rst similarity index 100% rename from _doc/sphinxdoc/source/api/rdata.rst rename to _doc/api/rdata.rst diff --git a/_doc/sphinxdoc/source/api/rdf.rst b/_doc/api/rdf.rst similarity index 100% rename from _doc/sphinxdoc/source/api/rdf.rst rename to _doc/api/rdf.rst diff --git a/_doc/sphinxdoc/source/api/rexc.rst b/_doc/api/rexc.rst similarity index 100% rename from _doc/sphinxdoc/source/api/rexc.rst rename to _doc/api/rexc.rst diff --git a/_doc/sphinxdoc/source/api/rio.rst b/_doc/api/rio.rst similarity index 100% rename from _doc/sphinxdoc/source/api/rio.rst rename to _doc/api/rio.rst diff --git a/_doc/sphinxdoc/source/blog/2017/2017-09-21_first_day.rst b/_doc/blog/2017/2017-09-21_first_day.rst similarity index 100% rename from _doc/sphinxdoc/source/blog/2017/2017-09-21_first_day.rst rename to _doc/blog/2017/2017-09-21_first_day.rst diff --git a/_doc/sphinxdoc/source/blog/2018/2018-08-19_streamz.rst b/_doc/blog/2018/2018-08-19_streamz.rst similarity index 100% rename from _doc/sphinxdoc/source/blog/2018/2018-08-19_streamz.rst rename to _doc/blog/2018/2018-08-19_streamz.rst diff --git a/_doc/sphinxdoc/source/conf.py b/_doc/conf.py similarity index 100% rename from _doc/sphinxdoc/source/conf.py rename to _doc/conf.py diff --git a/_doc/sphinxdoc/source/glossary.rst b/_doc/glossary.rst similarity index 100% rename from _doc/sphinxdoc/source/glossary.rst rename to _doc/glossary.rst diff --git a/_doc/sphinxdoc/source/i_ex.rst b/_doc/i_ex.rst similarity index 100% rename from 
_doc/sphinxdoc/source/i_ex.rst rename to _doc/i_ex.rst diff --git a/_doc/sphinxdoc/source/i_examples.rst b/_doc/i_examples.rst similarity index 100% rename from _doc/sphinxdoc/source/i_examples.rst rename to _doc/i_examples.rst diff --git a/_doc/sphinxdoc/source/i_faq.rst b/_doc/i_faq.rst similarity index 100% rename from _doc/sphinxdoc/source/i_faq.rst rename to _doc/i_faq.rst diff --git a/_doc/sphinxdoc/source/i_index.rst b/_doc/i_index.rst similarity index 100% rename from _doc/sphinxdoc/source/i_index.rst rename to _doc/i_index.rst diff --git a/_doc/sphinxdoc/source/index.rst b/_doc/index.rst similarity index 100% rename from _doc/sphinxdoc/source/index.rst rename to _doc/index.rst diff --git a/_doc/sphinxdoc/source/license.rst b/_doc/license.rst similarity index 100% rename from _doc/sphinxdoc/source/license.rst rename to _doc/license.rst diff --git a/_doc/sphinxdoc/source/tutorial/index.rst b/_doc/tutorial/index.rst similarity index 100% rename from _doc/sphinxdoc/source/tutorial/index.rst rename to _doc/tutorial/index.rst From 9e470d016bc119335eef753b45d307fee944d27d Mon Sep 17 00:00:00 2001 From: Xavier Dupre Date: Sat, 22 Jul 2023 20:01:33 +0200 Subject: [PATCH 04/16] doc conf --- _doc/conf.py | 60 ++++++++++++++++++++++++++-------------------------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/_doc/conf.py b/_doc/conf.py index f298be6..066ee6c 100644 --- a/_doc/conf.py +++ b/_doc/conf.py @@ -170,35 +170,35 @@ epkg_dictionary = { - "csv": "https://en.wikipedia.org/wiki/Comma-separated_values", - "dask": "https://dask.pydata.org/en/latest/", - "dataframe": "https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html", - "Dataframe": "https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html", - "DataFrame": "https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html", - "dataframes": "https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html", - "dill": "https://dill.readthedocs.io/en/latest/dill.html", - "Hadoop": "http://hadoop.apache.org/", - "ijson": "https://github.com/ICRAR/ijson", - "nan": "https://numpy.org/doc/stable/reference/constants.html#numpy.NAN", - "pandas": ( - "http://pandas.pydata.org/pandas-docs/stable/", - ( - "http://pandas.pydata.org/pandas-docs/stable/generated/pandas.{0}.html", - 1, - ), - ( - "http://pandas.pydata.org/pandas-docs/stable/generated/pandas.{0}.{1}.html", - 2, - ), + "csv": "https://en.wikipedia.org/wiki/Comma-separated_values", + "dask": "https://dask.pydata.org/en/latest/", + "dataframe": "https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html", + "Dataframe": "https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html", + "DataFrame": "https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html", + "dataframes": "https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html", + "dill": "https://dill.readthedocs.io/en/latest/dill.html", + "Hadoop": "http://hadoop.apache.org/", + "ijson": "https://github.com/ICRAR/ijson", + "nan": "https://numpy.org/doc/stable/reference/constants.html#numpy.NAN", + "pandas": ( + "http://pandas.pydata.org/pandas-docs/stable/", + ( + "http://pandas.pydata.org/pandas-docs/stable/generated/pandas.{0}.html", + 1, ), - "pyarrow": "https://arrow.apache.org/docs/python/", - "pyspark": "http://spark.apache.org/docs/2.1.1/api/python/index.html", - "scikit-multiflow": "https://scikit-multiflow.github.io/", - "sklearn": ( - "http://scikit-learn.org/stable/", - 
("http://scikit-learn.org/stable/modules/generated/{0}.html", 1), - ("http://scikit-learn.org/stable/modules/generated/{0}.{1}.html", 2), + ( + "http://pandas.pydata.org/pandas-docs/stable/generated/pandas.{0}.{1}.html", + 2, ), - "streamz": "https://streamz.readthedocs.io/en/latest/index.html", - "tornado": "https://www.tornadoweb.org/en/stable/", - } + ), + "pyarrow": "https://arrow.apache.org/docs/python/", + "pyspark": "http://spark.apache.org/docs/2.1.1/api/python/index.html", + "scikit-multiflow": "https://scikit-multiflow.github.io/", + "sklearn": ( + "http://scikit-learn.org/stable/", + ("http://scikit-learn.org/stable/modules/generated/{0}.html", 1), + ("http://scikit-learn.org/stable/modules/generated/{0}.{1}.html", 2), + ), + "streamz": "https://streamz.readthedocs.io/en/latest/index.html", + "tornado": "https://www.tornadoweb.org/en/stable/", +} From fbfc01a26740a94437be3c6d6ad16b91bfd4dd6e Mon Sep 17 00:00:00 2001 From: Xavier Dupre Date: Sat, 22 Jul 2023 20:02:22 +0200 Subject: [PATCH 05/16] upgrade version --- pandas_streaming/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas_streaming/__init__.py b/pandas_streaming/__init__.py index a4a6c0c..e0193cc 100644 --- a/pandas_streaming/__init__.py +++ b/pandas_streaming/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.4.218" +__version__ = "0.5.0" __author__ = "Xavier Dupré" __github__ = "https://github.com/sdpython/pandas_streaming" __url__ = "http://www.xavierdupre.fr/app/pandas_streaming/helpsphinx/index.html" From ad93df9dedfe4dc644457b378b2c0ec716dac2fb Mon Sep 17 00:00:00 2001 From: Xavier Dupre Date: Sun, 23 Jul 2023 09:51:48 +0200 Subject: [PATCH 06/16] doc --- _doc/conf.py | 5 +- pandas_streaming/df/connex_split.py | 96 ++++++++++++--------- pandas_streaming/df/dataframe.py | 2 +- pandas_streaming/df/dataframe_helpers.py | 3 +- pandas_streaming/df/dataframe_io_helpers.py | 19 ++-- pandas_streaming/df/dataframe_split.py | 6 +- pyproject.toml | 1 + 7 files changed, 75 insertions(+), 57 deletions(-) diff --git a/_doc/conf.py b/_doc/conf.py index 066ee6c..446bf9d 100644 --- a/_doc/conf.py +++ b/_doc/conf.py @@ -177,6 +177,7 @@ "DataFrame": "https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html", "dataframes": "https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html", "dill": "https://dill.readthedocs.io/en/latest/dill.html", + "groupby and missing values": "https://pandas.pydata.org/pandas-docs/stable/user_guide/missing_data.html", "Hadoop": "http://hadoop.apache.org/", "ijson": "https://github.com/ICRAR/ijson", "nan": "https://numpy.org/doc/stable/reference/constants.html#numpy.NAN", @@ -196,8 +197,8 @@ "scikit-multiflow": "https://scikit-multiflow.github.io/", "sklearn": ( "http://scikit-learn.org/stable/", - ("http://scikit-learn.org/stable/modules/generated/{0}.html", 1), - ("http://scikit-learn.org/stable/modules/generated/{0}.{1}.html", 2), + ("https://scikit-learn.org/stable/modules/generated/{0}.html", 1), + ("https://scikit-learn.org/stable/modules/generated/{0}.{1}.html", 2), ), "streamz": "https://streamz.readthedocs.io/en/latest/index.html", "tornado": "https://www.tornadoweb.org/en/stable/", diff --git a/pandas_streaming/df/connex_split.py b/pandas_streaming/df/connex_split.py index bc68581..c3b6150 100644 --- a/pandas_streaming/df/connex_split.py +++ b/pandas_streaming/df/connex_split.py @@ -1,9 +1,12 @@ from collections import Counter +from logging import getLogger import pandas import numpy from sklearn.model_selection import 
train_test_split from .dataframe_helpers import dataframe_shuffle +logger = getLogger("pandas-streaming") + class ImbalancedSplitException(Exception): """ @@ -133,6 +136,7 @@ def train_test_connex_split( return_cnx=False, must_groups=None, random_state=None, + verbose=0, ): """ This split is for a specific case where data is linked @@ -166,6 +170,7 @@ def train_test_connex_split( @param must_groups column name for ids which must not be shared by train/test partitions @param random_state seed for random generator + @param verbose verbosity (uses logging) @return Two see :class:`StreamingDataFrame`, one for train, one for test. @@ -275,11 +280,15 @@ def do_connex_components(dfrows, local_groups, kb, sib): modif = 1 while modif > 0 and itern < len(elements): - if fLOG and df.shape[0] > 10000: - fLOG( - "[train_test_connex_split] iteration={0}-#nb connect={1} - " - "modif={2}".format(iter, len(set(elements)), modif) + if df.shape[0] > 10000: + logger.info( + "[train_test_connex_split] iteration=%d-#nb connect=%d - " + "modif=%s", + itern, + len(set(elements)), + modif, ) + modif = 0 itern += 1 for i, row in enumerate(dfrows.itertuples(index=False, name=None)): @@ -310,19 +319,18 @@ def do_connex_components(dfrows, local_groups, kb, sib): diff = len(counts_cnx[new_c]) + len(counts_cnx[c]) - maxi r = diff / float(maxi) if r > kb: - if fLOG: # pragma: no cover - fLOG( + if verbose: # pragma: no cover + logger.info( "[train_test_connex_split] balance " - "r={0:0.00000}>{1:0.00}, #[{2}]={3}, " - "#[{4}]={5}".format( - r, - kb, - new_c, - len(counts_cnx[new_c]), - c, - len(counts_cnx[c]), - ) + "r=%1.4f>%1.2f, #[%d]=%d, #[%d]=%d", + r, + kb, + new_c, + len(counts_cnx[new_c]), + c, + len(counts_cnx[c]), ) + continue if sib is not None: @@ -330,19 +338,16 @@ def do_connex_components(dfrows, local_groups, kb, sib): len(elements) ) if r > sib: - if fLOG: # pragma: no cover - fLOG( - "[train_test_connex_split] no merge " - "r={0:0.00000}>{1:0.00}, #[{2}]={3}, #[{4}]={5}" - "".format( - r, - sib, - new_c, - len(counts_cnx[new_c]), - c, - len(counts_cnx[c]), - ) - ) + logger.info( + "[train_test_connex_split] " + "no merge r=%1.4f>%1.2f, #[%d]=%d, #[%d]=%d", + r, + sib, + new_c, + len(counts_cnx[new_c]), + c, + len(counts_cnx[c]), + ) avoids_merge[new_c, c] = i continue @@ -370,25 +375,26 @@ def do_connex_components(dfrows, local_groups, kb, sib): dfids[name] = elements dfids[one] = 1 grsum = dfids[[name, one]].groupby(name, as_index=False).sum() - if fLOG: - for g in groups: - fLOG(f"[train_test_connex_split] #nb in '{g}': {len(set(dfids[g]))}") - fLOG(f"[train_test_connex_split] #connex {grsum.shape[0]}/{dfids.shape[0]}") + for g in groups: + logger.info("[train_test_connex_split] #nb in '%d':", len(set(dfids[g]))) + logger.info( + "[train_test_connex_split] #connex %d/%d", grsum.shape[0], dfids.shape[0] + ) if grsum.shape[0] <= 1: raise ValueError( # pragma: no cover "Every element is in the same connected components." 
) # Statistics: top connected components - if fLOG: + if verbose: # Global statistics counts = Counter(elements) cl = [(v, k) for k, v in counts.items()] cum = 0 maxc = None - fLOG( - "[train_test_connex_split] number of connected components: {0}" - "".format(len(set(elements))) + logger.info( + "[train_test_connex_split] number of connected components: %d", + len(set(elements)), ) for i, (v, k) in enumerate(sorted(cl, reverse=True)): if i == 0: @@ -396,15 +402,20 @@ def do_connex_components(dfrows, local_groups, kb, sib): if i >= 10: break cum += v - fLOG( - "[train_test_connex_split] c={0} #elements={1} cumulated" - "={2}/{3}".format(k, v, cum, len(elements)) + logger.info( + "[train_test_connex_split] c=%s #elements=%s cumulated=%d/%d", + k, + v, + cum, + len(elements), ) # Most important component - fLOG(f"[train_test_connex_split] first row of the biggest component {maxc}") + logger.info( + "[train_test_connex_split] first row of the biggest component %d", maxc + ) tdf = dfids[dfids[name] == maxc[0]] - fLOG(f"[train_test_connex_split] \n{tdf.head(n=10)}") + logger.info("[train_test_connex_split] % s", tdf.head(n=10)) # Splits. train, test = train_test_split_weights( @@ -471,8 +482,7 @@ def train_test_apart_stratify( classification. A category (*stratify*) is not exclusive and an observation can be assigned to multiple categories. In that particular case, the method - `train_test_split `_ + :func:`sklearn.model_selection.train_test_split` can not directly be used. .. runpython:: diff --git a/pandas_streaming/df/dataframe.py b/pandas_streaming/df/dataframe.py index db3d7b9..b519102 100644 --- a/pandas_streaming/df/dataframe.py +++ b/pandas_streaming/df/dataframe.py @@ -138,7 +138,7 @@ def train_test_split( will be given to that function @param names partitions names, by default ``('train', 'test')`` @param kwargs parameters for the export function and - :epkg:`sklearn:model_selection:train_test_split`. + :func:`sklearn.model_selection.train_test_split`. @param streaming the function switches to a streaming version of the algorithm. @param partitions splitting partitions diff --git a/pandas_streaming/df/dataframe_helpers.py b/pandas_streaming/df/dataframe_helpers.py index b85d78a..5771e80 100644 --- a/pandas_streaming/df/dataframe_helpers.py +++ b/pandas_streaming/df/dataframe_helpers.py @@ -350,8 +350,7 @@ def pandas_groupby_nan( generated/pandas.DataFrame.groupby.html>`_ :return: groupby results - See `groupby and missing values `_. + See :epkg:`groupby and missing values`. If no :epkg:`nan` is detected, the function falls back in regular :epkg:`pandas:DataFrame:groupby` which has the following behavior. diff --git a/pandas_streaming/df/dataframe_io_helpers.py b/pandas_streaming/df/dataframe_io_helpers.py index d956cf9..c8a7776 100644 --- a/pandas_streaming/df/dataframe_io_helpers.py +++ b/pandas_streaming/df/dataframe_io_helpers.py @@ -141,7 +141,9 @@ def _flatten(obj, key): return flattened_dict -def enumerate_json_items(filename, encoding=None, lines=False, flatten=False): +def enumerate_json_items( + filename, encoding=None, lines=False, flatten=False, verbose=0 +): """ Enumerates items from a :epkg:`JSON` file or string. @@ -149,6 +151,7 @@ def enumerate_json_items(filename, encoding=None, lines=False, flatten=False): :param encoding: encoding :param lines: one record per row :param flatten: call @see fn flatten_dictionary + :param verbose: verbosity (based on :epkg:`tqdm`) :return: iterator on records at first level. 
It assumes the syntax follows the format: ``[ {"id":1, ...}, {"id": 2, ...}, ...]``.
@@ -259,11 +262,15 @@ def enumerate_json_items(filename, encoding=None, lines=False, flatten=False): curkey = None stack = [] nbyield = 0 - for i, (_, event, value) in enumerate(parser): - if i % 1000000 == 0 and fLOG is not None: - fLOG( # pragma: no cover - f"[enumerate_json_items] i={i} yielded={nbyield}" - ) + if verbose: + from tqdm import tqdm + + loop = tqdm(enumerate(parser)) + else: + loop = enumerate(parser) + for i, (_, event, value) in loop: + if verbose: + loop.set_description(f"process row {i}-event={event!r}") if event == "start_array": if curkey is None: current = []
diff --git a/pandas_streaming/df/dataframe_split.py b/pandas_streaming/df/dataframe_split.py index bb7ea33..ec4a579 100644 --- a/pandas_streaming/df/dataframe_split.py +++ b/pandas_streaming/df/dataframe_split.py
@@ -12,7 +12,7 @@ def sklearn_train_test_split( """ Randomly splits a dataframe into smaller pieces. The function returns streams of file names. - The function relies on :epkg:`sklearn:model_selection:train_test_split`. + The function relies on :func:`sklearn.model_selection.train_test_split`. It does not handle stratified version of it. @param self see :class:`StreamingDataFrame`
@@ -22,7 +22,7 @@ def sklearn_train_test_split( :epkg:`pandas:DataFrame:to_csv` @param names partitions names, by default ``('train', 'test')`` @param kwargs parameters for the export function and - :epkg:`sklearn:model_selection:train_test_split`. + :func:`sklearn.model_selection.train_test_split`. @return outputs of the exports functions The function cannot return two iterators or two
@@ -111,7 +111,7 @@ def sklearn_train_test_split_streaming( """ Randomly splits a dataframe into smaller pieces. The function returns streams of file names. - The function relies on :epkg:`sklearn:model_selection:train_test_split`. + The function relies on :func:`sklearn.model_selection.train_test_split`. It handles the stratified version of it.
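As a rough usage sketch of the new ``verbose`` flag added above (the import path comes from the hunk, while the file name and toy JSON content are assumptions, not taken from the patch)::

    from pandas_streaming.df.dataframe_io_helpers import enumerate_json_items

    # hypothetical input file following the expected ``[ {...}, {...} ]`` layout
    with open("items.json", "w", encoding="utf-8") as f:
        f.write('[{"id": 1, "x": {"a": 4}}, {"id": 2, "x": {"a": 5}}]')

    # verbose=1 wraps the parsing loop with tqdm and updates its description per event,
    # flatten=True collapses the nested dictionaries into flat records
    for item in enumerate_json_items("items.json", flatten=True, verbose=1):
        print(item)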
:param self: see :class:`StreamingDataFrame` diff --git a/pyproject.toml b/pyproject.toml index 91b4010..50db37b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,6 +4,7 @@ ignore_directives = [ "autoclass", "autofunction", "automodule", + "exreflist", "gdot", "image-sg", "runpython", From 04a6d33180afa14441dfe9b2f9f4bffe350d2d5f Mon Sep 17 00:00:00 2001 From: Xavier Dupre Date: Sun, 23 Jul 2023 10:05:56 +0200 Subject: [PATCH 07/16] doc --- _doc/api/rdata.rst | 2 +- _doc/api/rdf.rst | 23 +- _doc/api/rexc.rst | 2 +- _doc/api/rio.rst | 4 +- _doc/blog/2017/2017-09-21_first_day.rst | 10 - _doc/blog/2018/2018-08-19_streamz.rst | 10 - _doc/examples/README.txt | 3 + _doc/examples/first_step.py | 100 +++ _doc/i_examples.rst | 11 - _doc/i_faq.rst | 11 - _doc/i_index.rst | 18 - _doc/index.rst | 13 +- _doc/notebooks/first_steps.ipynb | 906 ------------------------ _doc/tutorial/index.rst | 7 +- pyproject.toml | 1 + 15 files changed, 127 insertions(+), 994 deletions(-) delete mode 100644 _doc/blog/2017/2017-09-21_first_day.rst delete mode 100644 _doc/blog/2018/2018-08-19_streamz.rst create mode 100644 _doc/examples/README.txt create mode 100644 _doc/examples/first_step.py delete mode 100644 _doc/i_examples.rst delete mode 100644 _doc/i_faq.rst delete mode 100644 _doc/i_index.rst delete mode 100644 _doc/notebooks/first_steps.ipynb diff --git a/_doc/api/rdata.rst b/_doc/api/rdata.rst index 32c9e1b..3f87481 100644 --- a/_doc/api/rdata.rst +++ b/_doc/api/rdata.rst @@ -5,4 +5,4 @@ pandas_streaming.data Collection of functions which produces :class:`StreamingDataFrame `. -.. autosignature:: pandas_streaming.data.dummy.dummy_streaming_dataframe +.. autofunction:: pandas_streaming.data.dummy.dummy_streaming_dataframe diff --git a/_doc/api/rdf.rst b/_doc/api/rdf.rst index d8bcf5c..09bf139 100644 --- a/_doc/api/rdf.rst +++ b/_doc/api/rdf.rst @@ -17,29 +17,30 @@ of an instance is fast as long as the data is not processed. Iterators can be chained as many map reduce framework does. -.. autosignature:: pandas_streaming.df.dataframe.StreamingDataFrame +.. autoclass:: pandas_streaming.df.dataframe.StreamingDataFrame + :members: The module implements additional and useful functions not necessarily for the streaming version of the dataframes. Many methods have been rewritten to support streaming. Among them, IO methods: -.. autosignature:: pandas_streaming.df.dataframe.StreamingDataFrame.read_csv +.. autofunction:: pandas_streaming.df.dataframe.StreamingDataFrame.read_csv -.. autosignature:: pandas_streaming.df.dataframe.StreamingDataFrame.read_df +.. autofunction:: pandas_streaming.df.dataframe.StreamingDataFrame.read_df -.. autosignature:: pandas_streaming.df.dataframe.StreamingDataFrame.read_json +.. autofunction:: pandas_streaming.df.dataframe.StreamingDataFrame.read_json Data Manipulation +++++++++++++++++ -.. autosignature:: pandas_streaming.df.dataframe_helpers.dataframe_hash_columns +.. autofunction:: pandas_streaming.df.dataframe_helpers.dataframe_hash_columns -.. autosignature:: pandas_streaming.df.connex_split.dataframe_shuffle +.. autofunction:: pandas_streaming.df.connex_split.dataframe_shuffle -.. autosignature:: pandas_streaming.df.dataframe_helpers.dataframe_unfold +.. autofunction:: pandas_streaming.df.dataframe_helpers.dataframe_unfold -.. autosignature:: pandas_streaming.df.dataframe_helpers.pandas_groupby_nan +.. autofunction:: pandas_streaming.df.dataframe_helpers.pandas_groupby_nan Complex splits ++++++++++++++ @@ -49,8 +50,8 @@ if rows are not independant and share some ids. 
In that case, the following functions will try to build two partitions keeping ids separate or separate as much as possible. -.. autosignature:: pandas_streaming.df.connex_split.train_test_apart_stratify +.. autofunction:: pandas_streaming.df.connex_split.train_test_apart_stratify -.. autosignature:: pandas_streaming.df.connex_split.train_test_connex_split +.. autofunction:: pandas_streaming.df.connex_split.train_test_connex_split -.. autosignature:: pandas_streaming.df.connex_split.train_test_split_weights +.. autofunction:: pandas_streaming.df.connex_split.train_test_split_weights diff --git a/_doc/api/rexc.rst b/_doc/api/rexc.rst index 1896765..5528b0d 100644 --- a/_doc/api/rexc.rst +++ b/_doc/api/rexc.rst @@ -4,4 +4,4 @@ pandas_streaming.exc Exceptions. -.. autosignature:: pandas_streaming.exc.exc_streaming.StreamingInefficientException +.. autoclass:: pandas_streaming.exc.exc_streaming.StreamingInefficientException diff --git a/_doc/api/rio.rst b/_doc/api/rio.rst index f11c081..357f6cc 100644 --- a/_doc/api/rio.rst +++ b/_doc/api/rio.rst @@ -14,6 +14,6 @@ to exchange with other people and other environments. The two following functions makes it easier to collapse many dataframes or numpy arrays into one single file. The data can be unzipped afterwards. -.. autosignature:: pandas_streaming.df.dataframe_io.read_zip +.. autofunction:: pandas_streaming.df.dataframe_io.read_zip -.. autosignature:: pandas_streaming.df.dataframe_io.to_zip +.. autofunction:: pandas_streaming.df.dataframe_io.to_zip diff --git a/_doc/blog/2017/2017-09-21_first_day.rst b/_doc/blog/2017/2017-09-21_first_day.rst deleted file mode 100644 index 3bbf7d3..0000000 --- a/_doc/blog/2017/2017-09-21_first_day.rst +++ /dev/null @@ -1,10 +0,0 @@ - -.. blogpost:: - :title: Why pandas_streaming? - :keywords: pandas - :date: 2017-09-17 - :categories: documentation - - The module aims at using a similar APIs to - :epkg:`pandas` for out-of-memory dataframe. - See :ref:`l-objective`. diff --git a/_doc/blog/2018/2018-08-19_streamz.rst b/_doc/blog/2018/2018-08-19_streamz.rst deleted file mode 100644 index 2ac15a9..0000000 --- a/_doc/blog/2018/2018-08-19_streamz.rst +++ /dev/null @@ -1,10 +0,0 @@ - -.. blogpost:: - :title: Streaming dataframes with streamz - :keywords: streamz - :date: 2018-08-19 - :categories: alternatives - - :epkg:`streamz` is the most promising - initiative which implements streaming - dataframes so far. diff --git a/_doc/examples/README.txt b/_doc/examples/README.txt new file mode 100644 index 0000000..cb523b0 --- /dev/null +++ b/_doc/examples/README.txt @@ -0,0 +1,3 @@ +Gallery of Examples +=================== + diff --git a/_doc/examples/first_step.py b/_doc/examples/first_step.py new file mode 100644 index 0000000..1c446a2 --- /dev/null +++ b/_doc/examples/first_step.py @@ -0,0 +1,100 @@ +""" +First steps with pandas_streaming +================================= + +A few difference between :epkg:`pandas` and *pandas_streaming*. 
+ +pandas to pandas_streaming +++++++++++++++++++++++++++ +""" + +from pandas import DataFrame + +df = DataFrame(data=dict(X=[4.5, 6, 7], Y=["a", "b", "c"])) +df + + +############################# +# We create a streaming dataframe: + + +from pandas_streaming.df import StreamingDataFrame + +sdf = StreamingDataFrame.read_df(df) +sdf + + +################################ +# + +sdf.to_dataframe() + + +######################################## +# Internally, StreamingDataFrame implements an iterator on +# dataframes and then tries to replicate the same interface as +# :class:`pandas.DataFrame` possibly wherever it is possible to +# manipulate data without loading everything into memory. + + +sdf2 = sdf.concat(sdf) +sdf2.to_dataframe() + + +############################### +# + +m = DataFrame(dict(Y=["a", "b"], Z=[10, 20])) +m + + +########################################## +# + +sdf3 = sdf2.merge(m, left_on="Y", right_on="Y", how="outer") +sdf3.to_dataframe() + + +############################################ +# + +sdf2.to_dataframe().merge(m, left_on="Y", right_on="Y", how="outer") + + +############################################ +# The order might be different. + + +sdftr, sdfte = sdf2.train_test_split(test_size=0.5) +sdfte.head() + + +############################################ +# + + +sdftr.head() + + +############################################ +# split a big file +# ++++++++++++++++ + + +sdf2.to_csv("example.txt") + + +############################################ +# + + +new_sdf = StreamingDataFrame.read_csv("example.txt") +new_sdf.train_test_split("example.{}.txt", streaming=False) + + +############################################ +# + +import glob + +glob.glob("ex*.txt") diff --git a/_doc/i_examples.rst b/_doc/i_examples.rst deleted file mode 100644 index 1b16057..0000000 --- a/_doc/i_examples.rst +++ /dev/null @@ -1,11 +0,0 @@ - -.. _l-EX2: - -Examples -======== - -.. toctree:: - - i_ex - gyexamples/index - all_notebooks diff --git a/_doc/i_faq.rst b/_doc/i_faq.rst deleted file mode 100644 index 26ded95..0000000 --- a/_doc/i_faq.rst +++ /dev/null @@ -1,11 +0,0 @@ - -.. _l-FAQ2: - -FAQ -=== - -.. contents:: - :local: - -.. faqreflist:: - :contents: diff --git a/_doc/i_index.rst b/_doc/i_index.rst deleted file mode 100644 index ec0ffec..0000000 --- a/_doc/i_index.rst +++ /dev/null @@ -1,18 +0,0 @@ - -===== -Index -===== - -.. toctree:: - :maxdepth: 2 - - gyexamples/index - gynotebooks/index - issues_todoextlist - completed_todoextlist - filechanges - all_report - glossary - README - license - blog/blogindex diff --git a/_doc/index.rst b/_doc/index.rst index 15a7eb7..1d8755b 100644 --- a/_doc/index.rst +++ b/_doc/index.rst @@ -2,7 +2,7 @@ .. |gitlogo| image:: _static/git_logo.png :height: 20 -pandas_streaming: streaming API over pandas +pandas-streaming: streaming API over pandas =========================================== .. image:: https://github.com/sdpython/pandas_streaming/blob/master/_doc/sphinxdoc/source/_static/project_ico.png?raw=true @@ -56,20 +56,19 @@ pandas_streaming: streaming API over pandas :target: https://github.com/sdpython/pandas_streaming/ :alt: size -*pandas_streaming* aims at processing big files with `pandas `_, +*pandas_streaming* aims at processing big files with :epkg:`pandas`, too big to hold in memory, too small to be parallelized with a significant gain. -The module replicates a subset of `pandas `_ API +The module replicates a subset of :epkg:`pandas` API and implements other functionalities for machine learning. .. 
toctree:: :maxdepth: 1 tutorial/index + auto_examples/index api/index - i_examples - blog/blogindex - index_modules - i_index + i_ex + **Links:** `github `_, `documentation `_, diff --git a/_doc/notebooks/first_steps.ipynb b/_doc/notebooks/first_steps.ipynb deleted file mode 100644 index 735ede9..0000000 --- a/_doc/notebooks/first_steps.ipynb +++ /dev/null @@ -1,906 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# First steps with pandas_streaming\n", - "\n", - "A few difference between [pandas](http://pandas.pydata.org/) and *pandas_streaming*." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
[Remainder of the deleted notebook ``_doc/notebooks/first_steps.ipynb``: its markdown and code cells duplicate ``_doc/examples/first_step.py`` above (read_df, concat, merge, train_test_split, to_csv/read_csv, glob), followed by their rendered HTML table outputs and the notebook metadata.]
diff --git a/_doc/tutorial/index.rst b/_doc/tutorial/index.rst index 5828dd9..856b12b 100644 --- a/_doc/tutorial/index.rst +++ b/_doc/tutorial/index.rst
@@ -31,8 +31,6 @@ when it does not fit into memory. .. contents:: :local: -.. _l-objective: - Objectives and Competitors ++++++++++++++++++++++++++
@@ -111,10 +109,7 @@ A user can either choose to draw the same sample every time he is going through the data. He could also choose that a different sample should be drawn each time. The following method indicates which kinds of sample the :class:`StreamingDataFrame <pandas_streaming.df.dataframe.StreamingDataFrame>` -is producing. - -.. autosignature:: pandas_streaming.df.dataframe.StreamingDataFrame - :members: is_table +is producing (see :meth:`pandas_streaming.df.dataframe.StreamingDataFrame.is_table`).
Check the schema consistency of a large file ++++++++++++++++++++++++++++++++++++++++++++ diff --git a/pyproject.toml b/pyproject.toml index 50db37b..c1472ad 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,6 +7,7 @@ ignore_directives = [ "exreflist", "gdot", "image-sg", + "pr", "runpython", ] ignore_roles = ["epkg"] From 0971f57153840ebc7391ff2f942461da4fe8b346 Mon Sep 17 00:00:00 2001 From: Xavier Dupre Date: Sun, 23 Jul 2023 10:11:29 +0200 Subject: [PATCH 08/16] req --- .circleci/config.yml | 2 +- README.rst | 12 ++++++------ _doc/index.rst | 12 ++++++------ pandas_streaming/df/dataframe_io_helpers.py | 2 +- requirements-dev.txt | 2 +- 5 files changed, 15 insertions(+), 15 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 712677b..e76a1b7 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -54,7 +54,7 @@ jobs: - run: name: run tests command: | - python setup.py unittests + python -m pytest - run: name: wheel diff --git a/README.rst b/README.rst index 1096a34..8f0f162 100644 --- a/README.rst +++ b/README.rst @@ -1,10 +1,10 @@ pandas-streaming: streaming API over pandas =========================================== -.. image:: https://github.com/sdpython/pandas_streaming/blob/master/_doc/sphinxdoc/source/_static/project_ico.png?raw=true +.. image:: https://github.com/sdpython/pandas_streaming/blob/main/_doc/sphinxdoc/source/_static/project_ico.png?raw=true :target: https://github.com/sdpython/pandas_streaming/ -.. image:: https://travis-ci.com/sdpython/pandas_streaming.svg?branch=master +.. image:: https://travis-ci.com/sdpython/pandas_streaming.svg?branch=main :target: https://app.travis-ci.com/github/sdpython/pandas_streaming :alt: Build status @@ -12,8 +12,8 @@ pandas-streaming: streaming API over pandas :target: https://ci.appveyor.com/project/sdpython/pandas-streaming :alt: Build Status Windows -.. image:: https://circleci.com/gh/sdpython/pandas_streaming/tree/master.svg?style=svg - :target: https://circleci.com/gh/sdpython/pandas_streaming/tree/master +.. image:: https://circleci.com/gh/sdpython/pandas_streaming/tree/main.svg?style=svg + :target: https://circleci.com/gh/sdpython/pandas_streaming/tree/main .. image:: https://dev.azure.com/xavierdupre3/pandas_streaming/_apis/build/status/sdpython.pandas_streaming :target: https://dev.azure.com/xavierdupre3/pandas_streaming/ @@ -25,8 +25,8 @@ pandas-streaming: streaming API over pandas :alt: MIT License :target: http://opensource.org/licenses/MIT -.. image:: https://codecov.io/github/sdpython/pandas_streaming/coverage.svg?branch=master - :target: https://codecov.io/github/sdpython/pandas_streaming?branch=master +.. image:: https://codecov.io/github/sdpython/pandas_streaming/coverage.svg?branch=main + :target: https://codecov.io/github/sdpython/pandas_streaming?branch=main .. image:: http://img.shields.io/github/issues/sdpython/pandas_streaming.png :alt: GitHub Issues diff --git a/_doc/index.rst b/_doc/index.rst index 1d8755b..345d6a8 100644 --- a/_doc/index.rst +++ b/_doc/index.rst @@ -5,10 +5,10 @@ pandas-streaming: streaming API over pandas =========================================== -.. image:: https://github.com/sdpython/pandas_streaming/blob/master/_doc/sphinxdoc/source/_static/project_ico.png?raw=true +.. image:: https://github.com/sdpython/pandas_streaming/blob/main/_doc/sphinxdoc/source/_static/project_ico.png?raw=true :target: https://github.com/sdpython/pandas_streaming/ -.. image:: https://travis-ci.com/sdpython/pandas_streaming.svg?branch=master +.. 
image:: https://travis-ci.com/sdpython/pandas_streaming.svg?branch=main :target: https://app.travis-ci.com/github/sdpython/pandas_streaming :alt: Build status @@ -16,8 +16,8 @@ pandas-streaming: streaming API over pandas :target: https://ci.appveyor.com/project/sdpython/pandas-streaming :alt: Build Status Windows -.. image:: https://circleci.com/gh/sdpython/pandas_streaming/tree/master.svg?style=svg - :target: https://circleci.com/gh/sdpython/pandas_streaming/tree/master +.. image:: https://circleci.com/gh/sdpython/pandas_streaming/tree/main.svg?style=svg + :target: https://circleci.com/gh/sdpython/pandas_streaming/tree/main .. image:: https://dev.azure.com/xavierdupre3/pandas_streaming/_apis/build/status/sdpython.pandas_streaming :target: https://dev.azure.com/xavierdupre3/pandas_streaming/ @@ -29,8 +29,8 @@ pandas-streaming: streaming API over pandas :alt: MIT License :target: http://opensource.org/licenses/MIT -.. image:: https://codecov.io/github/sdpython/pandas_streaming/coverage.svg?branch=master - :target: https://codecov.io/github/sdpython/pandas_streaming?branch=master +.. image:: https://codecov.io/github/sdpython/pandas_streaming/coverage.svg?branch=main + :target: https://codecov.io/github/sdpython/pandas_streaming?branch=main .. image:: http://img.shields.io/github/issues/sdpython/pandas_streaming.png :alt: GitHub Issues diff --git a/pandas_streaming/df/dataframe_io_helpers.py b/pandas_streaming/df/dataframe_io_helpers.py index c8a7776..4f502fc 100644 --- a/pandas_streaming/df/dataframe_io_helpers.py +++ b/pandas_streaming/df/dataframe_io_helpers.py @@ -117,7 +117,7 @@ def flatten_dictionary(dico, sep="_"): :return: flattened dictionary Inspired from `flatten_json - `_. + `_. """ flattened_dict = {} diff --git a/requirements-dev.txt b/requirements-dev.txt index 16ed1c5..d5ec849 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -14,7 +14,7 @@ pyquickhelper>=1.10 scikit-learn scipy sphinx -sphinx-runpython +git+https://github.com/sdpython/sphinx-runpython.git sphinxcontrib.imagesvg sphinx_gallery ujson From 16368e7e209963497ebf094af218edee830d3a3f Mon Sep 17 00:00:00 2001 From: Xavier Dupre Date: Sun, 23 Jul 2023 11:00:25 +0200 Subject: [PATCH 09/16] req --- _doc/i_ex.rst | 2 -- requirements-dev.txt | 1 + 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/_doc/i_ex.rst b/_doc/i_ex.rst index 1fcf475..15f2342 100644 --- a/_doc/i_ex.rst +++ b/_doc/i_ex.rst @@ -1,6 +1,4 @@ -.. 
_l-EX2: - Examples ======== diff --git a/requirements-dev.txt b/requirements-dev.txt index d5ec849..35ed6ab 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -5,6 +5,7 @@ ijson jupyter_sphinx jyquickhelper matplotlib +nbsphinx pandas>=1.1.0 pandocfilters Pillow From 8f768dba2003a514606524e2e56de9920a423b9d Mon Sep 17 00:00:00 2001 From: Xavier Dupre Date: Sun, 23 Jul 2023 11:04:25 +0200 Subject: [PATCH 10/16] req --- requirements-dev.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements-dev.txt b/requirements-dev.txt index 35ed6ab..52d6e9a 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -15,6 +15,7 @@ pyquickhelper>=1.10 scikit-learn scipy sphinx +sphinx-issues git+https://github.com/sdpython/sphinx-runpython.git sphinxcontrib.imagesvg sphinx_gallery From 955251e0eabc93bf2e41ee2bda59d11692ce6129 Mon Sep 17 00:00:00 2001 From: Xavier Dupre Date: Sun, 23 Jul 2023 11:12:51 +0200 Subject: [PATCH 11/16] req --- requirements-dev.txt | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index 52d6e9a..a10fbfd 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,4 +1,5 @@ autopep8 +black coverage furo ijson @@ -11,13 +12,16 @@ pandocfilters Pillow pycodestyle pylint>=2.14.0 +pytest +pytest-cov pyquickhelper>=1.10 +rstcheck[sphinx,toml] +ruff scikit-learn scipy sphinx sphinx-issues git+https://github.com/sdpython/sphinx-runpython.git -sphinxcontrib.imagesvg sphinx_gallery ujson wheel From 9fe59a69251369dbfb2f720f61adbb64e659c94c Mon Sep 17 00:00:00 2001 From: Xavier Dupre Date: Sun, 23 Jul 2023 11:33:36 +0200 Subject: [PATCH 12/16] doc --- _doc/api/rdf.rst | 3 + _doc/conf.py | 1 + _doc/examples/first_step.py | 8 +-- pandas_streaming/df/connex_split.py | 75 +++++++++++---------- pandas_streaming/df/dataframe.py | 44 ++++++------ pandas_streaming/df/dataframe_io.py | 4 +- pandas_streaming/df/dataframe_io_helpers.py | 2 +- 7 files changed, 69 insertions(+), 68 deletions(-) diff --git a/_doc/api/rdf.rst b/_doc/api/rdf.rst index 09bf139..1a41bd1 100644 --- a/_doc/api/rdf.rst +++ b/_doc/api/rdf.rst @@ -26,10 +26,13 @@ Many methods have been rewritten to support streaming. Among them, IO methods: .. autofunction:: pandas_streaming.df.dataframe.StreamingDataFrame.read_csv + :noindex: .. autofunction:: pandas_streaming.df.dataframe.StreamingDataFrame.read_df + :noindex: .. 
autofunction:: pandas_streaming.df.dataframe.StreamingDataFrame.read_json + :noindex: Data Manipulation +++++++++++++++++ diff --git a/_doc/conf.py b/_doc/conf.py index 446bf9d..746d91f 100644 --- a/_doc/conf.py +++ b/_doc/conf.py @@ -180,6 +180,7 @@ "groupby and missing values": "https://pandas.pydata.org/pandas-docs/stable/user_guide/missing_data.html", "Hadoop": "http://hadoop.apache.org/", "ijson": "https://github.com/ICRAR/ijson", + "json": "https://docs.python.org/3/library/json.html", "nan": "https://numpy.org/doc/stable/reference/constants.html#numpy.NAN", "pandas": ( "http://pandas.pydata.org/pandas-docs/stable/", diff --git a/_doc/examples/first_step.py b/_doc/examples/first_step.py index 1c446a2..bd6870c 100644 --- a/_doc/examples/first_step.py +++ b/_doc/examples/first_step.py @@ -7,8 +7,10 @@ pandas to pandas_streaming ++++++++++++++++++++++++++ """ - +import glob from pandas import DataFrame +from pandas_streaming.df import StreamingDataFrame + df = DataFrame(data=dict(X=[4.5, 6, 7], Y=["a", "b", "c"])) df @@ -18,8 +20,6 @@ # We create a streaming dataframe: -from pandas_streaming.df import StreamingDataFrame - sdf = StreamingDataFrame.read_df(df) sdf @@ -95,6 +95,4 @@ ############################################ # -import glob - glob.glob("ex*.txt") diff --git a/pandas_streaming/df/connex_split.py b/pandas_streaming/df/connex_split.py index c3b6150..1636d13 100644 --- a/pandas_streaming/df/connex_split.py +++ b/pandas_streaming/df/connex_split.py @@ -147,32 +147,32 @@ def train_test_connex_split( test set. The function computes the connected components and breaks each of them in two parts for train and test. - @param df :epkg:`pandas:DataFrame` - @param groups columns name for the ids - @param test_size ratio for the test partition - (if *train_size* is not specified) - @param train_size ratio for the train partition - @param stratify column holding the stratification - @param hash_size size of the hash to cache information about partition - @param unique_rows ensures that rows are unique - @param shuffle shuffles before the split - @param fail_imbalanced raises an exception if relative weights difference - is higher than this value - @param stop_if_bigger (float) stops a connected components from being - bigger than this ratio of elements, this should not be used - unless a big components emerges, the algorithm stops merging - but does not guarantee it returns the best cut, - the value should be close to 0 - @param keep_balance (float), if not None, does not merge connected components - if their relative sizes are too different, - the value should be close to 1 - @param return_cnx returns connected components as a third results - @param must_groups column name for ids which must not be shared by - train/test partitions - @param random_state seed for random generator - @param verbose verbosity (uses logging) - @return Two see :class:`StreamingDataFrame`, one - for train, one for test. 
+ :param df: :epkg:`pandas:DataFrame` + :param groups: columns name for the ids + :param test_size: ratio for the test partition + (if *train_size* is not specified) + :param train_size: ratio for the train partition + :param stratify: column holding the stratification + :param hash_size: size of the hash to cache information about partition + :param unique_rows: ensures that rows are unique + :param shuffle: shuffles before the split + :param fail_imbalanced: raises an exception if relative weights difference + is higher than this value + :param stop_if_bigger: (float) stops a connected components from being + bigger than this ratio of elements, this should not be used + unless a big components emerges, the algorithm stops merging + but does not guarantee it returns the best cut, + the value should be close to 0 + :param keep_balance: (float), if not None, does not merge connected components + if their relative sizes are too different, + the value should be close to 1 + :param return_cnx: returns connected components as a third results + :param must_groups: column name for ids which must not be shared by + train/test partitions + :param random_state: seed for random generator + :param verbose: verbosity (uses logging) + :return: Two see :class:`StreamingDataFrame`, one + for train, one for test. The list of ids must hold in memory. There is no streaming implementation for the ids. @@ -462,17 +462,17 @@ def train_test_apart_stratify( distinct products on train and test but common categories on both sides. - @param df :epkg:`pandas:DataFrame` - @param group columns name for the ids - @param test_size ratio for the test partition - (if *train_size* is not specified) - @param train_size ratio for the train partition - @param stratify column holding the stratification - @param force if True, tries to get at least one example on the test side - for each value of the column *stratify* - @param random_state seed for random generators - @return Two see :class:`StreamingDataFrame`, one - for train, one for test. + :param df: :epkg:`pandas:DataFrame` + :param group: columns name for the ids + :param test_size: ratio for the test partition + (if *train_size* is not specified) + :param train_size: ratio for the train partition + :param stratify: column holding the stratification + :param force: if True, tries to get at least one example on the test side + for each value of the column *stratify* + :param random_state: seed for random generators + :return: Two see :class:`StreamingDataFrame`, one + for train, one for test. .. index:: multi-label @@ -501,6 +501,7 @@ def train_test_apart_stratify( print(train) print('-----------') print(test) + """ if stratify is None: raise ValueError("stratify must be specified.") # pragma: no cover diff --git a/pandas_streaming/df/dataframe.py b/pandas_streaming/df/dataframe.py index b519102..3e79c55 100644 --- a/pandas_streaming/df/dataframe.py +++ b/pandas_streaming/df/dataframe.py @@ -44,7 +44,7 @@ class StreamingDataFrame: The constructor cannot receive an iterator otherwise this class would be able to walk through the data only once. The main reason is it is impossible to - :epkg:`*py:pickle` (or :epkg:`dill`) + :mod:`pickle` (or :epkg:`dill`) an iterator: it cannot be replicated. Instead, the class takes a function which generates an iterator on :epkg:`DataFrame`. @@ -89,10 +89,10 @@ def is_stable(self, do_check=False, n=10): """ Tells if the :epkg:`dataframe` is supposed to be stable. 
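To illustrate why the constructor described above takes a callable rather than a bare iterator, a small sketch (the two-chunk generator is invented; ``to_dataframe`` appears elsewhere in this patch)::

    from pandas import DataFrame
    from pandas_streaming.df import StreamingDataFrame

    def iterate_chunks():
        # a callable can be invoked again for every pass over the data,
        # which a bare iterator (impossible to pickle or replay) cannot offer
        yield DataFrame(dict(X=[0, 1], Y=["a", "b"]))
        yield DataFrame(dict(X=[2, 3], Y=["c", "d"]))

    sdf = StreamingDataFrame(iterate_chunks)
    print(sdf.to_dataframe())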
- @param do_check do not trust the value sent to the constructor - @param n number of rows used to check the stability, - None for all rows - @return boolean + :param do_check: do not trust the value sent to the constructor + :param n: number of rows used to check the stability, + None for all rows + :return: boolean *do_check=True* means the methods checks the first *n* rows remains the same for two iterations. @@ -130,23 +130,23 @@ def train_test_split( It chooses one of the options from module :mod:`dataframe_split `. - @param path_or_buf a string, a list of strings or buffers, if it is a - string, it must contain ``{}`` like ``partition{}.txt``, - if None, the function returns strings. - @param export_method method used to store the partitions, by default - :epkg:`pandas:DataFrame:to_csv`, additional parameters - will be given to that function - @param names partitions names, by default ``('train', 'test')`` - @param kwargs parameters for the export function and - :func:`sklearn.model_selection.train_test_split`. - @param streaming the function switches to a - streaming version of the algorithm. - @param partitions splitting partitions - @return outputs of the exports functions or two - see :class:`StreamingDataFrame` if path_or_buf is None. + :param path_or_buf: a string, a list of strings or buffers, if it is a + string, it must contain ``{}`` like ``partition{}.txt``, + if None, the function returns strings. + :param export_method: method used to store the partitions, by default + :epkg:`pandas:DataFrame:to_csv`, additional parameters + will be given to that function + :param names: partitions names, by default ``('train', 'test')`` + :param kwargs: parameters for the export function and + :func:`sklearn.model_selection.train_test_split`. + :param streaming: the function switches to a + streaming version of the algorithm. + :param partitions: splitting partitions + :return: outputs of the exports functions or two + see :class:`StreamingDataFrame` if *path_or_buf* is None. The streaming version of this algorithm is implemented by function - @see fn sklearn_train_test_split_streaming. Its documentation + :func:`sklearn_train_test_split_streaming`. Its documentation indicates the limitation of the streaming version and gives some insights about the additional parameters. """ @@ -229,11 +229,9 @@ def read_json( dfs = list(it) print(dfs) - .. index:: IncompleteJSONError - The parsed json must have an empty line at the end otherwise the following exception is raised: - `ijson.common.IncompleteJSONError: ` + `ijson.common.IncompleteJSONError`: `parse error: unallowed token at this point in JSON text`. """ if not isinstance(chunksize, int) or chunksize <= 0: diff --git a/pandas_streaming/df/dataframe_io.py b/pandas_streaming/df/dataframe_io.py index 532a2bf..bf13cc2 100644 --- a/pandas_streaming/df/dataframe_io.py +++ b/pandas_streaming/df/dataframe_io.py @@ -11,7 +11,7 @@ def to_zip(df, zipfilename, zname="df.csv", **kwargs): It can be read by @see fn to_zip. :param df: :epkg:`dataframe` or :epkg:`numpy:array` - :param zipfilename: a :epkg:`*py:zipfile:ZipFile` or a filename + :param zipfilename: a :class:`zipfile:ZipFile` or a filename :param zname: a filename in th zipfile :param kwargs: parameters for :epkg:`pandas:to_csv` or :epkg:`numpy:save` @@ -104,7 +104,7 @@ def read_zip(zipfilename, zname=None, **kwargs): Reads a :epkg:`dataframe` from a :epkg:`zip` file. It can be saved by @see fn read_zip. 
- :param zipfilename: a :epkg:`*py:zipfile:ZipFile` or a filename + :param zipfilename: a :class:`zipfile.ZipFile` or a filename :param zname: a filename in zipfile, if None, takes the first one :param kwargs: parameters for :func:`pandas.read_csv` :return: :func:`pandas.DataFrame` or :epkg:`numpy:array` diff --git a/pandas_streaming/df/dataframe_io_helpers.py b/pandas_streaming/df/dataframe_io_helpers.py index 4f502fc..8c00ba2 100644 --- a/pandas_streaming/df/dataframe_io_helpers.py +++ b/pandas_streaming/df/dataframe_io_helpers.py @@ -339,7 +339,7 @@ class JsonIterator2Stream: The iterator could be one returned by @see fn enumerate_json_items. :param it: iterator - :param kwargs: arguments to :epkg:`*py:json:dumps` + :param kwargs: arguments to :class:`json.dumps` .. exref:: :title: Reshape a json file From a132c58f3eab352514e7c83d89596659d75745a4 Mon Sep 17 00:00:00 2001 From: Xavier Dupre Date: Sun, 23 Jul 2023 12:50:45 +0200 Subject: [PATCH 13/16] doc --- README.rst | 4 - _doc/api/connex_split.rst | 6 ++ _doc/api/dataframe.rst | 22 +++++ _doc/api/dataframe_io.rst | 6 ++ _doc/api/dataframe_split.rst | 6 ++ _doc/api/rdf.rst | 38 ++++----- _doc/api/rio.rst | 11 +-- _doc/conf.py | 7 +- _doc/glossary.rst | 13 --- _doc/i_ex.rst | 3 - _doc/index.rst | 19 +---- _doc/tutorial/index.rst | 8 +- pandas_streaming/df/connex_split.py | 29 +++---- pandas_streaming/df/dataframe.py | 109 +++++++++++++++---------- pandas_streaming/df/dataframe_io.py | 20 ++--- pandas_streaming/df/dataframe_split.py | 34 ++++---- 16 files changed, 185 insertions(+), 150 deletions(-) create mode 100644 _doc/api/connex_split.rst create mode 100644 _doc/api/dataframe.rst create mode 100644 _doc/api/dataframe_io.rst create mode 100644 _doc/api/dataframe_split.rst delete mode 100644 _doc/glossary.rst diff --git a/README.rst b/README.rst index 8f0f162..f3fdde7 100644 --- a/README.rst +++ b/README.rst @@ -32,10 +32,6 @@ pandas-streaming: streaming API over pandas :alt: GitHub Issues :target: https://github.com/sdpython/pandas_streaming/issues -.. image:: http://www.xavierdupre.fr/app/pandas_streaming/helpsphinx/_images/nbcov.png - :target: http://www.xavierdupre.fr/app/pandas_streaming/helpsphinx/all_notebooks_coverage.html - :alt: Notebook Coverage - .. image:: https://pepy.tech/badge/pandas_streaming/month :target: https://pepy.tech/project/pandas_streaming/month :alt: Downloads diff --git a/_doc/api/connex_split.rst b/_doc/api/connex_split.rst new file mode 100644 index 0000000..1612130 --- /dev/null +++ b/_doc/api/connex_split.rst @@ -0,0 +1,6 @@ + +pandas_streaming.df.connex_split +================================ + +.. automodule:: pandas_streaming.df.connex_split + :members: diff --git a/_doc/api/dataframe.rst b/_doc/api/dataframe.rst new file mode 100644 index 0000000..143c558 --- /dev/null +++ b/_doc/api/dataframe.rst @@ -0,0 +1,22 @@ + +pandas_streaming.df.dataframe +============================= + +StreamingDataFrameSchemaError ++++++++++++++++++++++++++++++ + +.. autoclass:: pandas_streaming.df.dataframe.StreamingDataFrameSchemaError + :members: + +StreamingDataFrame +++++++++++++++++++ + +.. autoclass:: pandas_streaming.df.dataframe.StreamingDataFrame + :members: + :special-members: + +StreamingSeries ++++++++++++++++ + +.. 
autoclass:: pandas_streaming.df.dataframe.StreamingSeries + :members: diff --git a/_doc/api/dataframe_io.rst b/_doc/api/dataframe_io.rst new file mode 100644 index 0000000..c346af5 --- /dev/null +++ b/_doc/api/dataframe_io.rst @@ -0,0 +1,6 @@ + +pandas_streaming.df.dataframe_io +================================ + +.. automodule:: pandas_streaming.df.dataframe_io + :members: diff --git a/_doc/api/dataframe_split.rst b/_doc/api/dataframe_split.rst new file mode 100644 index 0000000..30e6dee --- /dev/null +++ b/_doc/api/dataframe_split.rst @@ -0,0 +1,6 @@ + +pandas_streaming.df.dataframe_split +=================================== + +.. automodule:: pandas_streaming.df.dataframe_split + :members: diff --git a/_doc/api/rdf.rst b/_doc/api/rdf.rst index 1a41bd1..751e353 100644 --- a/_doc/api/rdf.rst +++ b/_doc/api/rdf.rst @@ -2,14 +2,11 @@ pandas_streaming.df =================== -.. contents:: - :local: - Streaming +++++++++ The main class is an interface which mimic -:epkg:`pandas:DataFrame` interface to offer +:class:`pandas.DataFrame` interface to offer a short list of methods which apply on an iterator of dataframes. This provides somehow a streaming version of it. As a result, the creation @@ -17,22 +14,18 @@ of an instance is fast as long as the data is not processed. Iterators can be chained as many map reduce framework does. -.. autoclass:: pandas_streaming.df.dataframe.StreamingDataFrame - :members: +.. toctree:: + :maxdepth: 2 + + dataframe The module implements additional and useful functions not necessarily for the streaming version of the dataframes. Many methods have been rewritten to support streaming. Among them, IO methods: - -.. autofunction:: pandas_streaming.df.dataframe.StreamingDataFrame.read_csv - :noindex: - -.. autofunction:: pandas_streaming.df.dataframe.StreamingDataFrame.read_df - :noindex: - -.. autofunction:: pandas_streaming.df.dataframe.StreamingDataFrame.read_json - :noindex: +:meth:`read_csv `, +:meth:`read_df `, +:meth:`read_json `. Data Manipulation +++++++++++++++++ @@ -51,10 +44,17 @@ Complex splits Splitting a database into train and test is usually simple except if rows are not independant and share some ids. In that case, the following functions will try to build two partitions keeping -ids separate or separate as much as possible. +ids separate or separate as much as possible: +:func:`train_test_apart_stratify `, +:func:`train_test_connex_split `, +:func:`train_test_split_weights `. -.. autofunction:: pandas_streaming.df.connex_split.train_test_apart_stratify +Extensions +++++++++++ -.. autofunction:: pandas_streaming.df.connex_split.train_test_connex_split +.. toctree:: + :maxdepth: 1 -.. autofunction:: pandas_streaming.df.connex_split.train_test_split_weights + connex_split + dataframe_io + dataframe_split diff --git a/_doc/api/rio.rst b/_doc/api/rio.rst index 357f6cc..4de1211 100644 --- a/_doc/api/rio.rst +++ b/_doc/api/rio.rst @@ -2,9 +2,6 @@ Inputs / Outputs ================ -.. contents:: - :local: - Dataframes / Numpy arrays +++++++++++++++++++++++++ @@ -12,8 +9,6 @@ Dataframes / Numpy arrays is easy to manipulate in the :epkg:`Python` world but difficult to exchange with other people and other environments. The two following functions makes it easier to collapse many dataframes -or numpy arrays into one single file. The data can be unzipped afterwards. - -.. autofunction:: pandas_streaming.df.dataframe_io.read_zip - -.. autofunction:: pandas_streaming.df.dataframe_io.to_zip +or numpy arrays into one single file. 
The data can be unzipped afterwards, +see :func:`read_zip `, +:func:`to_zip `. diff --git a/_doc/conf.py b/_doc/conf.py index 746d91f..811119c 100644 --- a/_doc/conf.py +++ b/_doc/conf.py @@ -178,10 +178,12 @@ "dataframes": "https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html", "dill": "https://dill.readthedocs.io/en/latest/dill.html", "groupby and missing values": "https://pandas.pydata.org/pandas-docs/stable/user_guide/missing_data.html", + "Jupyter": "https://jupyter.org/", "Hadoop": "http://hadoop.apache.org/", "ijson": "https://github.com/ICRAR/ijson", "json": "https://docs.python.org/3/library/json.html", "nan": "https://numpy.org/doc/stable/reference/constants.html#numpy.NAN", + "numpy": "https://numpy.org/", "pandas": ( "http://pandas.pydata.org/pandas-docs/stable/", ( @@ -195,12 +197,15 @@ ), "pyarrow": "https://arrow.apache.org/docs/python/", "pyspark": "http://spark.apache.org/docs/2.1.1/api/python/index.html", + "Python": "https://www.python.org/", + "scikit-learn": "https://scikit-learn.org/stable/", "scikit-multiflow": "https://scikit-multiflow.github.io/", "sklearn": ( - "http://scikit-learn.org/stable/", + "https://scikit-learn.org/stable/", ("https://scikit-learn.org/stable/modules/generated/{0}.html", 1), ("https://scikit-learn.org/stable/modules/generated/{0}.{1}.html", 2), ), "streamz": "https://streamz.readthedocs.io/en/latest/index.html", "tornado": "https://www.tornadoweb.org/en/stable/", + "zip": "https://en.wikipedia.org/wiki/ZIP_(file_format)", } diff --git a/_doc/glossary.rst b/_doc/glossary.rst deleted file mode 100644 index cf8651d..0000000 --- a/_doc/glossary.rst +++ /dev/null @@ -1,13 +0,0 @@ - -.. index:: glossary - -Glossary -======== - -.. glossary:: - - Jupyter - See :epkg:`Jupyter` - - pandas - See :epkg:`pandas`. diff --git a/_doc/i_ex.rst b/_doc/i_ex.rst index 15f2342..43a0265 100644 --- a/_doc/i_ex.rst +++ b/_doc/i_ex.rst @@ -2,9 +2,6 @@ Examples ======== -.. contents:: - :local: - About array +++++++++++ diff --git a/_doc/index.rst b/_doc/index.rst index 345d6a8..fa91aea 100644 --- a/_doc/index.rst +++ b/_doc/index.rst @@ -36,10 +36,6 @@ pandas-streaming: streaming API over pandas :alt: GitHub Issues :target: https://github.com/sdpython/pandas_streaming/issues -.. image:: nbcov.png - :target: http://www.xavierdupre.fr/app/pandas_streaming/helpsphinx/all_notebooks_coverage.html - :alt: Notebook Coverage - .. image:: https://pepy.tech/badge/pandas_streaming :target: https://pypi.org/project/pandas_streaming/ :alt: Downloads @@ -68,17 +64,4 @@ and implements other functionalities for machine learning. 
auto_examples/index api/index i_ex - - -**Links:** `github `_, -`documentation `_, -:ref:`l-README`, -:ref:`blog ` - -+----------------------+---------------------+---------------------+--------------------+------------------------+------------------------------------------------+ -| :ref:`l-modules` | :ref:`l-functions` | :ref:`l-classes` | :ref:`l-methods` | :ref:`l-staticmethods` | :ref:`l-properties` | -+----------------------+---------------------+---------------------+--------------------+------------------------+------------------------------------------------+ -| :ref:`modindex` | :ref:`l-EX2` | :ref:`search` | :ref:`l-license` | :ref:`l-changes` | :ref:`l-README` | -+----------------------+---------------------+---------------------+--------------------+------------------------+------------------------------------------------+ -| :ref:`genindex` | :ref:`l-FAQ2` | :ref:`l-notebooks` | | :ref:`l-statcode` | `Unit Test Coverage `_ | -+----------------------+---------------------+---------------------+--------------------+------------------------+------------------------------------------------+ + license diff --git a/_doc/tutorial/index.rst b/_doc/tutorial/index.rst index 856b12b..1e693c7 100644 --- a/_doc/tutorial/index.rst +++ b/_doc/tutorial/index.rst @@ -28,9 +28,6 @@ when it does not fit into memory. >>> ['dataset_split_train.txt', 'dataset_split_test.txt'] -.. contents:: - :local: - Objectives and Competitors ++++++++++++++++++++++++++ @@ -109,7 +106,7 @@ A user can either choose to draw the same sample every time he is going through the data. He could also choose that a different sample should be drawn each time. The following method indicates which kinds of sample the :class:`StreamingDataFrame ` -is producing (see :meth:`pandas_streaming.df.dataframe.StreamingDataFrame.is_table`). +is producing. Check the schema consistency of a large file ++++++++++++++++++++++++++++++++++++++++++++ @@ -138,7 +135,8 @@ an idea of where we could find the error. except Exception as e: print("ERROR:", e) -The method :py:meth:`__iter__ ` +The method :meth:`__iter__ +` checks that the schema does not change between two iterations. It can be disabled by adding *check_schema=False* when the constructor is called. diff --git a/pandas_streaming/df/connex_split.py b/pandas_streaming/df/connex_split.py index 1636d13..1df251f 100644 --- a/pandas_streaming/df/connex_split.py +++ b/pandas_streaming/df/connex_split.py @@ -29,16 +29,17 @@ def train_test_split_weights( Splits a database in train/test given, every row can have a different weight. 
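A hedged usage sketch of the weighted split whose docstring starts above; the top-level import and the toy weights are assumptions, not taken from the patch::

    from pandas import DataFrame
    from pandas_streaming.df import train_test_split_weights  # assumed export

    df = DataFrame(dict(X=range(8), w=[1, 1, 1, 1, 2, 2, 2, 2]))
    # weights can be given as a column name; fail_imbalanced tolerates up to
    # 50% relative difference between the weighted sizes of the two partitions
    train, test = train_test_split_weights(
        df, weights="w", test_size=0.25, fail_imbalanced=0.5
    )
    print(train.shape, test.shape)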
- @param df :epkg:`pandas:DataFrame` or see :class:`StreamingDataFrame` - @param weights None or weights or weights column name - @param test_size ratio for the test partition - (if *train_size* is not specified) - @param train_size ratio for the train partition - @param shuffle shuffles before the split - @param fail_imbalanced raises an exception if relative weights - difference is higher than this value - @param random_state seed for random generators - @return train and test :epkg:`pandas:DataFrame` + :param df: :class:`pandas.DataFrame` or see + :class:`StreamingDataFrame ` + :param weights: None or weights or weights column name + :param test_size: ratio for the test partition + (if *train_size* is not specified) + :param train_size: ratio for the train partition + :param shuffle: shuffles before the split + :param fail_imbalanced: raises an exception if relative weights + difference is higher than this value + :param random_state: seed for random generators + :return: train and test :class:`pandas.DataFrame` If the dataframe is not shuffled first, the function will produce two datasets which are unlikely to be randomized @@ -171,7 +172,8 @@ def train_test_connex_split( train/test partitions :param random_state: seed for random generator :param verbose: verbosity (uses logging) - :return: Two see :class:`StreamingDataFrame`, one + :return: Two see :class:`StreamingDataFrame + `, one for train, one for test. The list of ids must hold in memory. @@ -471,11 +473,10 @@ def train_test_apart_stratify( :param force: if True, tries to get at least one example on the test side for each value of the column *stratify* :param random_state: seed for random generators - :return: Two see :class:`StreamingDataFrame`, one + :return: Two see :class:`StreamingDataFrame + `, one for train, one for test. - .. index:: multi-label - The list of ids must hold in memory. There is no streaming implementation for the ids. This split was implemented for a case of a multi-label diff --git a/pandas_streaming/df/dataframe.py b/pandas_streaming/df/dataframe.py index 3e79c55..6537d1f 100644 --- a/pandas_streaming/df/dataframe.py +++ b/pandas_streaming/df/dataframe.py @@ -49,7 +49,8 @@ class StreamingDataFrame: Instead, the class takes a function which generates an iterator on :epkg:`DataFrame`. Most of the methods returns either a :epkg:`DataFrame` - either a see :class:`StreamingDataFrame`. In the second case, + either a see :class:`StreamingDataFrame + `. In the second case, methods can be chained. By default, the object checks that the schema remains @@ -63,7 +64,8 @@ class StreamingDataFrame: is one of these cases. :param iter_creation: function which creates an iterator or an - instance of see :class:`StreamingDataFrame` + instance of see :class:`StreamingDataFrame + ` :param check_schema: checks that the schema is the same for every :epkg:`dataframe` :param stable: indicates if the :epkg:`dataframe` remains the same @@ -134,7 +136,7 @@ def train_test_split( string, it must contain ``{}`` like ``partition{}.txt``, if None, the function returns strings. :param export_method: method used to store the partitions, by default - :epkg:`pandas:DataFrame:to_csv`, additional parameters + :meth:`pandas.DataFrame.to_csv`, additional parameters will be given to that function :param names: partitions names, by default ``('train', 'test')`` :param kwargs: parameters for the export function and @@ -143,12 +145,14 @@ def train_test_split( streaming version of the algorithm. 
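A short usage sketch for the signature converted just above; the import path and parameter names come from this diff, the data and values are made up::

    import pandas
    from pandas_streaming.df.connex_split import train_test_split_weights

    df = pandas.DataFrame({
        "x": range(10),
        "w": [1, 1, 1, 1, 1, 5, 5, 5, 5, 5],  # per-row weights
    })

    # weights may be a column name or explicit weights; fail_imbalanced bounds
    # the relative weight difference tolerated between the two partitions
    train, test = train_test_split_weights(
        df, weights="w", test_size=0.25, shuffle=True,
        fail_imbalanced=0.05, random_state=0,
    )
    print(train.shape, test.shape)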
:param partitions: splitting partitions :return: outputs of the exports functions or two - see :class:`StreamingDataFrame` if *path_or_buf* is None. + see class `StreamingDataFrame` + if *path_or_buf* is None. The streaming version of this algorithm is implemented by function - :func:`sklearn_train_test_split_streaming`. Its documentation - indicates the limitation of the streaming version and gives some - insights about the additional parameters. + :func:`sklearn_train_test_split_streaming + `. + Its documentation indicates the limitation of the streaming version + and gives some insights about the additional parameters. """ if streaming: if partitions is not None: @@ -376,10 +380,11 @@ def read_df(df, chunksize=None, check_schema=True) -> "StreamingDataFrame": Splits a :epkg:`DataFrame` into small chunks mostly for unit testing purposes. - @param df :epkg:`DataFrame` - @param chunksize number rows per chunks (// 10 by default) - @param check_schema check schema between two iterations - @return iterator on see :class:`StreamingDataFrame` + :param df: :class:`pandas.DataFrame` + :param chunksize: number rows per chunks (// 10 by default) + :param check_schema: check schema between two iterations + :return: iterator on see :class:`StreamingDataFrame + ` """ if chunksize is None: if hasattr(df, "shape"): @@ -569,7 +574,8 @@ def where(self, *args, **kwargs) -> "StreamingDataFrame": """ Applies :epkg:`pandas:DataFrame:where`. *inplace* must be False. - This function returns a see :class:`StreamingDataFrame`. + This function returns a see :class:`StreamingDataFrame + `. """ kwargs["inplace"] = False return StreamingDataFrame( @@ -578,15 +584,15 @@ def where(self, *args, **kwargs) -> "StreamingDataFrame": def sample(self, reservoir=False, cache=False, **kwargs) -> "StreamingDataFrame": """ - See :epkg:`pandas:DataFrame:sample`. - Only *frac* is available, otherwise choose - @see me reservoir_sampling. - This function returns a see :class:`StreamingDataFrame`. + See :meth:`pandas.DataFrame.sample`. + Only *frac* is available, otherwise choose :meth`reservoir_sampling`. + This function returns a see :class:`StreamingDataFrame + `. :param reservoir: use `reservoir sampling `_ :param cache: cache the sample - :param kwargs: additional parameters for :epkg:`pandas:DataFrame:sample` + :param kwargs: additional parameters for :meth:`pandas.DataFrame.sample` If *cache* is True, the sample is cached (assuming it holds in memory). The second time an iterator walks through the @@ -614,10 +620,11 @@ def _reservoir_sampling( Uses the `reservoir sampling `_ algorithm to draw a random sample with exactly *n* samples. - @param cache cache the sample - @param n number of observations to keep - @param random_state sets the random_state - @return see :class:`StreamingDataFrame` + :param cache: cache the sample + :param n: number of observations to keep + :param random_state: sets the random_state + :return: see :class:`StreamingDataFrame + ` .. warning:: The sample is split by chunks of size 1000. @@ -669,7 +676,8 @@ def drop( ) -> "StreamingDataFrame": """ Applies :epkg:`pandas:DataFrame:drop`. - This function returns a see :class:`StreamingDataFrame`. + This function returns a see :class:`StreamingDataFrame + `. """ if axis == 0: raise NotImplementedError(f"drop is not implemented for axis={axis}.") @@ -694,7 +702,8 @@ def drop( def apply(self, *args, **kwargs) -> "StreamingDataFrame": """ Applies :epkg:`pandas:DataFrame:apply`. - This function returns a see :class:`StreamingDataFrame`. 
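Based on the ``train_test_split`` and ``read_df`` docstrings above, a minimal sketch of the non-streaming path; the file pattern is a placeholder and the extra keyword argument is assumed to be dispatched to ``sklearn.model_selection.train_test_split`` as the docstring states::

    import pandas
    from pandas_streaming.df.dataframe import read_df

    df = pandas.DataFrame({"x": range(100), "y": ["a", "b"] * 50})

    # wrap the in-memory DataFrame into chunks of 20 rows
    sdf = read_df(df, chunksize=20)

    # path_or_buf must contain "{}"; partitions are written with the default
    # export method (DataFrame.to_csv) and named after ("train", "test")
    outputs = sdf.train_test_split("partition_{}.csv", test_size=0.2)
    print(outputs)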
+ This function returns a see :class:`StreamingDataFrame + `. """ return StreamingDataFrame( lambda: map(lambda df: df.apply(*args, **kwargs), self), **self.get_kwargs() @@ -703,7 +712,8 @@ def apply(self, *args, **kwargs) -> "StreamingDataFrame": def applymap(self, *args, **kwargs) -> "StreamingDataFrame": """ Applies :epkg:`pandas:DataFrame:applymap`. - This function returns a see :class:`StreamingDataFrame`. + This function returns a see :class:`StreamingDataFrame + `. """ return StreamingDataFrame( lambda: map(lambda df: df.applymap(*args, **kwargs), self), @@ -712,9 +722,12 @@ def applymap(self, *args, **kwargs) -> "StreamingDataFrame": def merge(self, right, **kwargs) -> "StreamingDataFrame": """ - Merges two see :class:`StreamingDataFrame` - and returns see :class:`StreamingDataFrame`. - *right* can be either a see :class:`StreamingDataFrame` or simply + Merges two see :class:`StreamingDataFrame + ` + and returns see :class:`StreamingDataFrame + `. + *right* can be either a see :class:`StreamingDataFrame + ` or simply a :epkg:`pandas:DataFrame`. It calls :epkg:`pandas:DataFrame:merge` in a double loop, loop on *self*, loop on *right*. """ @@ -738,13 +751,16 @@ def concat(self, others, axis=0) -> "StreamingDataFrame": """ Concatenates :epkg:`dataframes`. The function ensures all :epkg:`pandas:DataFrame` - or see :class:`StreamingDataFrame` share the same columns (name and type). + or see :class:`StreamingDataFrame + ` + share the same columns (name and type). Otherwise, the function fails as it cannot guess the schema without walking through all :epkg:`dataframes`. :param others: list, enumeration, :epkg:`pandas:DataFrame` :param axis: concatenate by rows (0) or by columns (1) - :return: see :class:`StreamingDataFrame` + :return: see :class:`StreamingDataFrame + ` """ if axis == 1: return self._concath(others) @@ -827,7 +843,8 @@ def groupby( :param kwargs: additional parameters for :epkg:`pandas:DataFrame:groupby` :return: :epkg:`pandas:DataFrame` - As the input see :class:`StreamingDataFrame` does not necessarily hold + As the input see :class:`StreamingDataFrame + ` does not necessarily hold in memory, the aggregation must be done at every iteration. There are two levels of aggregation: one to reduce every iterated :epkg:`dataframe`, another one to combine all the reduced :epkg:`dataframes`. @@ -847,7 +864,8 @@ def groupby( :tag: streaming Here is an example which shows how to write a simple *groupby* - with :epkg:`pandas` and see :class:`StreamingDataFrame`. + with :epkg:`pandas` and see :class:`StreamingDataFrame + `. .. runpython:: :showcode: @@ -912,7 +930,8 @@ def groupby_streaming( :param strategy: ``'cum'``, or ``'streaming'``, see below :return: :epkg:`pandas:DataFrame` - As the input see :class:`StreamingDataFrame` does not necessarily hold + As the input see :class:`StreamingDataFrame + ` does not necessarily hold in memory, the aggregation must be done at every iteration. There are two levels of aggregation: one to reduce every iterated :epkg:`dataframe`, another one to combine all the reduced :epkg:`dataframes`. @@ -931,7 +950,9 @@ def groupby_streaming( First one if ``strategy is None`` goes through the whole datasets to produce a final :epkg:`DataFrame`. Second if ``strategy=='cum'`` returns a - see :class:`StreamingDataFrame`, each iteration produces + see :class:`StreamingDataFrame + `, + each iteration produces the current status of the *group by*. 
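To illustrate the strategies described above, a sketch which assumes ``groupby_streaming`` accepts the grouping column followed by an aggregation lambda; that argument is not visible in this hunk, so treat it as an assumption and check the full signature in ``dataframe.py``::

    import pandas
    from pandas_streaming.df.dataframe import read_df

    df = pandas.DataFrame({"A": ["a", "b"] * 50, "X": range(100)})
    sdf = read_df(df, chunksize=10)

    # strategy="cum": every iterated DataFrame is the aggregation of all
    # chunks seen so far, i.e. the current status of the group by
    for partial in sdf.groupby_streaming("A", lambda gr: gr.sum(), strategy="cum"):
        print(partial)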
Last case, ``strategy=='streaming'`` produces :epkg:`DataFrame` which must be concatenated into a single :epkg:`DataFrame` @@ -942,7 +963,8 @@ def groupby_streaming( :tag: streaming Here is an example which shows how to write a simple *groupby* - with :epkg:`pandas` and see :class:`StreamingDataFrame`. + with :epkg:`pandas` and see :class:`StreamingDataFrame + `. .. runpython:: :showcode: @@ -1107,13 +1129,16 @@ def add_column(self, col, value): Implements some of the functionalities :epkg:`pandas` offers for the operator ``[]``. - @param col new column - @param value see :class:`StreamingDataFrame` or a lambda function - @return see :class:`StreamingDataFrame` + :param col: new column + :param value: see :class:`StreamingDataFrame + ` or a lambda function + :return: see :class:`StreamingDataFrame + ` ..note:: - If value is a see :class:`StreamingDataFrame`, + If value is a see :class:`StreamingDataFrame + `, *chunksize* must be the same for both. .. exref:: @@ -1172,8 +1197,9 @@ def fillna(self, **kwargs): Replaces the missing values, calls :epkg:`pandas:DataFrame:fillna`. - @param kwargs see :epkg:`pandas:DataFrame:fillna` - @return see :class:`StreamingDataFrame` + :param kwargs: see :meth:`pandas.DataFrame.fillna` + :return: see :class:`StreamingDataFrame + ` .. warning:: The function does not check what happens at the @@ -1346,7 +1372,8 @@ def __del__(self): class StreamingSeries(StreamingDataFrame): """ - Seens as a see :class:`StreamingDataFrame` of one column. + Seens as a see :class:`StreamingDataFrame + ` of one column. """ def __init__(self, iter_creation, check_schema=True, stable=True): diff --git a/pandas_streaming/df/dataframe_io.py b/pandas_streaming/df/dataframe_io.py index bf13cc2..7b589c1 100644 --- a/pandas_streaming/df/dataframe_io.py +++ b/pandas_streaming/df/dataframe_io.py @@ -8,13 +8,13 @@ def to_zip(df, zipfilename, zname="df.csv", **kwargs): """ Saves a :epkg:`Dataframe` into a :epkg:`zip` file. - It can be read by @see fn to_zip. + It can be read by :meth:`read_zip`. - :param df: :epkg:`dataframe` or :epkg:`numpy:array` - :param zipfilename: a :class:`zipfile:ZipFile` or a filename - :param zname: a filename in th zipfile - :param kwargs: parameters for :epkg:`pandas:to_csv` or - :epkg:`numpy:save` + :param df: :epkg:`dataframe` or :class:`numpy.ndarray` + :param zipfilename: a :class:`zipfile.ZipFile` or a filename + :param zname: a filename in the zipfile + :param kwargs: parameters for :meth:`pandas.DataFrame.to_csv` or + :func:`numpy.save` :return: zipfilename .. exref:: @@ -22,7 +22,7 @@ def to_zip(df, zipfilename, zname="df.csv", **kwargs): :tag: dataframe This shows an example on how to save and read a - :epkg:`pandas:dataframe` directly into a zip file. + :class:`pandas.DataFrame` directly into a zip file. .. runpython:: :showcode: @@ -43,7 +43,7 @@ def to_zip(df, zipfilename, zname="df.csv", **kwargs): :tag: array This shows an example on how to save and read a - :epkg:`numpy:ndarray` directly into a zip file. + :class:`numpy.ndarray` directly into a zip file. .. runpython:: :showcode: @@ -102,12 +102,12 @@ def to_zip(df, zipfilename, zname="df.csv", **kwargs): def read_zip(zipfilename, zname=None, **kwargs): """ Reads a :epkg:`dataframe` from a :epkg:`zip` file. - It can be saved by @see fn read_zip. + It can be saved by :meth:`to_zip`. 
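The two helpers documented above amount to a simple round trip; the file and member names below are placeholders::

    import pandas
    from pandas_streaming.df.dataframe_io import to_zip, read_zip

    df = pandas.DataFrame([dict(a=1, b="e"), dict(b="f", a=5.7)])

    # store the DataFrame as df.csv inside data.zip (extra kwargs go to to_csv)
    to_zip(df, "data.zip", "df.csv", index=False)

    # read it back; zname=None would pick the first member of the archive
    df2 = read_zip("data.zip", "df.csv")
    print(df.equals(df2))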
:param zipfilename: a :class:`zipfile.ZipFile` or a filename :param zname: a filename in zipfile, if None, takes the first one :param kwargs: parameters for :func:`pandas.read_csv` - :return: :func:`pandas.DataFrame` or :epkg:`numpy:array` + :return: :class:`pandas.DataFrame` or :class:`numpy.ndarray` """ if isinstance(zipfilename, str): ext = os.path.splitext(zipfilename)[-1] diff --git a/pandas_streaming/df/dataframe_split.py b/pandas_streaming/df/dataframe_split.py index ec4a579..7c2d191 100644 --- a/pandas_streaming/df/dataframe_split.py +++ b/pandas_streaming/df/dataframe_split.py @@ -15,18 +15,21 @@ def sklearn_train_test_split( The function relies on :func:`sklearn.model_selection.train_test_split`. It does not handle stratified version of it. - @param self see :class:`StreamingDataFrame` - @param path_or_buf a string, a list of strings or buffers, if it is a - string, it must contain ``{}`` like ``partition{}.txt`` - @param export_method method used to store the partitions, by default - :epkg:`pandas:DataFrame:to_csv` - @param names partitions names, by default ``('train', 'test')`` - @param kwargs parameters for the export function and - :fund:`sklearn.model_selection.train_test_split`. - @return outputs of the exports functions + :param self: see :class:`StreamingDataFrame + ` + :param path_or_buf: a string, a list of strings or buffers, if it is a + string, it must contain ``{}`` like ``partition{}.txt`` + :param export_method: method used to store the partitions, by default + :meth:`pandas.DataFrame.to_csv` + :param names: partitions names, by default ``('train', 'test')`` + :param kwargs: parameters for the export function and + :func:`sklearn.model_selection.train_test_split`. + :return: outputs of the exports functions The function cannot return two iterators or two - see :class:`StreamingDataFrame` because running through one + see :class:`StreamingDataFrame + ` + because running through one means running through the other. We can assume both splits do not hold in memory and we cannot run through the same iterator again as random draws would be different. @@ -114,18 +117,21 @@ def sklearn_train_test_split_streaming( The function relies on :func:`sklearn.model_selection.train_test_split`. It handles the stratified version of it. - :param self: see :class:`StreamingDataFrame` + :param self: see :class:`StreamingDataFrame + ` :param test_size: ratio for the test partition (if *train_size* is not specified) :param train_size: ratio for the train partition :param stratify: column holding the stratification :param hash_size: size of the hash to cache information about partition :param unique_rows: ensures that rows are unique - :return: Two see :class:`StreamingDataFrame`, one - for train, one for test. + :return: Two see :class:`StreamingDataFrame + `, + one for train, one for test. The function returns two iterators or two - see :class:`StreamingDataFrame`. It + see :class:`StreamingDataFrame + `. It tries to do everything without writing anything on disk but it requires to store the repartition somehow. 
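Given the parameters listed above, a hedged sketch of the streaming split; the assumption is that these keyword arguments reach ``sklearn_train_test_split_streaming`` through ``StreamingDataFrame.train_test_split(..., streaming=True)``, as the earlier docstring suggests::

    import pandas
    from pandas_streaming.df.dataframe import read_df

    df = pandas.DataFrame({"x": range(100), "label": ["a", "b", "b", "b"] * 25})
    sdf = read_df(df, chunksize=10)

    # streaming=True returns two StreamingDataFrame instead of writing files;
    # stratify and unique_rows are assumed to be forwarded to the streaming split
    train_sdf, test_sdf = sdf.train_test_split(
        streaming=True, test_size=0.25, stratify="label", unique_rows=True,
    )
    print(sum(len(chunk) for chunk in train_sdf),
          sum(len(chunk) for chunk in test_sdf))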
This function hashes every row and maps the hash with a part From 38fe46b806198a90bcb9267d47248cb51ddd659e Mon Sep 17 00:00:00 2001 From: Xavier Dupre Date: Sun, 23 Jul 2023 12:57:40 +0200 Subject: [PATCH 14/16] doc --- .circleci/config.yml | 2 +- _doc/index.rst | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index e76a1b7..0d766bc 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -39,7 +39,7 @@ jobs: - run: name: install dependencies (2) command: | - pip install -r requirements.txt + pip install -r requirements-dev.txt - save_cache: paths: diff --git a/_doc/index.rst b/_doc/index.rst index fa91aea..dca5620 100644 --- a/_doc/index.rst +++ b/_doc/index.rst @@ -64,4 +64,5 @@ and implements other functionalities for machine learning. auto_examples/index api/index i_ex + CHANGELOGS license From adde860f015fcb5a06397a51d1f203b6765d43bd Mon Sep 17 00:00:00 2001 From: Xavier Dupre Date: Sun, 23 Jul 2023 12:59:09 +0200 Subject: [PATCH 15/16] lambda --- pandas_streaming/df/dataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas_streaming/df/dataframe.py b/pandas_streaming/df/dataframe.py index 6537d1f..1cc87a2 100644 --- a/pandas_streaming/df/dataframe.py +++ b/pandas_streaming/df/dataframe.py @@ -1358,7 +1358,7 @@ def iterate(): sub = dfs[numpy.isnan(dfs[by])] yield sub - res = StreamingDataFrame(lambda: iterate(), **self.get_kwargs()) + res = StreamingDataFrame(iterate, **self.get_kwargs()) res._delete_.append(lambda: os.remove(temp_file)) return res From fb88a708ab07fec458d68cc638c0b64d4f44c9b0 Mon Sep 17 00:00:00 2001 From: Xavier Dupre Date: Sun, 23 Jul 2023 13:05:23 +0200 Subject: [PATCH 16/16] remove unnecessary unit tests --- .../ut_documentation/test_run_notebooks.py | 25 ------------------- 1 file changed, 25 deletions(-) delete mode 100644 _unittests/ut_documentation/test_run_notebooks.py diff --git a/_unittests/ut_documentation/test_run_notebooks.py b/_unittests/ut_documentation/test_run_notebooks.py deleted file mode 100644 index aebe979..0000000 --- a/_unittests/ut_documentation/test_run_notebooks.py +++ /dev/null @@ -1,25 +0,0 @@ -import os -import unittest -from pyquickhelper.pycode import ExtTestCase -from pyquickhelper.ipythonhelper import test_notebook_execution_coverage -import pandas_streaming - - -class TestRunNotebooksPython(ExtTestCase): - def setUp(self): - import jyquickhelper # pylint: disable=C0415 - - self.assertTrue(jyquickhelper is not None) - - def test_notebook_artificiel(self): - self.assertTrue(pandas_streaming is not None) - folder = os.path.join( - os.path.dirname(__file__), "..", "..", "_doc", "notebooks" - ) - test_notebook_execution_coverage( - __file__, "first_steps", folder, "pandas_streaming", copy_files=[] - ) - - -if __name__ == "__main__": - unittest.main()
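The last functional change (passing ``iterate`` instead of ``lambda: iterate()``) works because ``StreamingDataFrame`` only needs a zero-argument callable returning an iterator of DataFrames, as its ``iter_creation`` parameter describes; a minimal sketch of that contract with made-up data::

    import pandas
    from pandas_streaming.df.dataframe import StreamingDataFrame

    def make_chunks():
        # called again every time the StreamingDataFrame is walked through
        for i in range(3):
            yield pandas.DataFrame({"x": [i, i + 1], "y": ["a", "b"]})

    # the generator function itself is a valid iter_creation argument,
    # so wrapping it as "lambda: make_chunks()" adds nothing
    sdf = StreamingDataFrame(make_chunks)
    for chunk in sdf:
        print(chunk)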