From 1a83df42061bcf93315e40f0973e094b45e82e62 Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Mon, 28 Aug 2023 23:45:23 +0200 Subject: [PATCH] Apply `dprint` for Markdown formatting (#384) --- CONTRIBUTING.md | 5 +- docs/_build/scripts/people.py | 4 + docs/_build/snippets/under_construction.md | 2 +- docs/getting-started/expressions.md | 5 +- docs/getting-started/installation.md | 5 +- docs/getting-started/joins.md | 2 +- docs/getting-started/reading-writing.md | 5 +- docs/getting-started/series-dataframes.md | 9 +- docs/index.md | 17 +- docs/user-guide/concepts/contexts.md | 16 +- docs/user-guide/concepts/data-structures.md | 10 +- docs/user-guide/concepts/data-types.md | 48 ++-- docs/user-guide/concepts/expressions.md | 8 +- docs/user-guide/concepts/lazy-vs-eager.md | 10 +- docs/user-guide/concepts/streaming.md | 4 +- docs/user-guide/expressions/aggregation.md | 11 +- docs/user-guide/expressions/casting.md | 6 +- .../expressions/column_selections.md | 12 +- docs/user-guide/expressions/folds.md | 1 - docs/user-guide/expressions/functions.md | 18 +- docs/user-guide/expressions/lists.md | 12 +- docs/user-guide/expressions/null.md | 9 +- docs/user-guide/expressions/numpy.md | 2 +- docs/user-guide/expressions/operators.md | 2 +- docs/user-guide/expressions/strings.md | 7 +- docs/user-guide/expressions/structs.md | 17 +- .../expressions/user-defined-functions.md | 36 +-- docs/user-guide/expressions/window.md | 8 +- docs/user-guide/index.md | 4 +- docs/user-guide/installation.md | 225 +++++++++--------- docs/user-guide/io/aws.md | 8 +- docs/user-guide/io/bigquery.md | 8 +- docs/user-guide/io/csv.md | 4 +- docs/user-guide/io/database.md | 21 +- docs/user-guide/io/json_file.md | 4 +- docs/user-guide/io/multiple.md | 4 +- docs/user-guide/io/parquet.md | 5 +- docs/user-guide/lazy/execution.md | 5 +- docs/user-guide/lazy/optimizations.md | 6 +- docs/user-guide/lazy/query_plan.md | 2 +- docs/user-guide/lazy/schemas.md | 5 +- docs/user-guide/migration/pandas.md | 12 +- docs/user-guide/migration/spark.md | 2 +- docs/user-guide/misc/alternatives.md | 74 +++--- docs/user-guide/misc/contributing.md | 2 +- docs/user-guide/sql/cte.md | 7 +- docs/user-guide/sql/intro.md | 11 +- docs/user-guide/sql/select.md | 7 +- docs/user-guide/sql/show.md | 3 +- .../transformations/concatenation.md | 2 +- docs/user-guide/transformations/joins.md | 45 ++-- docs/user-guide/transformations/melt.md | 2 +- docs/user-guide/transformations/pivot.md | 2 - .../transformations/time-series/filter.md | 2 +- .../transformations/time-series/parsing.md | 3 +- .../transformations/time-series/resampling.md | 3 +- .../transformations/time-series/rolling.md | 15 +- .../transformations/time-series/timezones.md | 6 +- 58 files changed, 374 insertions(+), 416 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 7e8f33b56..0fb926e0a 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -27,10 +27,13 @@ To update your own repo with code pushed on the upstream repo: 1. `git push origin ` ### Building locally + To build the documentation locally you will need to install the python libraries defined in the `requirements.txt` file. + When these steps are done run `mkdocs serve` to run the server. You can then view the docs at http://localhost:8000/ + ### Want to discuss something? @@ -72,7 +75,7 @@ Find the correct placement for the functionality. Is it an expression add it to The `Markdown` file should roughly match the following structure: -1. A clear short title (for example: "*Interact with an AWS bucket*"). +1. 
A clear short title (for example: "_Interact with an AWS bucket_"). 1. A one-ish-liner to introduce the code snippet. 1. The code example itself under the corresponding folder (e.g. `docs/src/user-guide/expressions/...py), using the [Snippets](https://facelessuser.github.io/pymdown-extensions/extensions/snippets/) syntax. 1. The output of the example, using [markdown-exec](https://pawamoy.github.io/markdown-exec/) diff --git a/docs/_build/scripts/people.py b/docs/_build/scripts/people.py index 1921eb9be..81ba1982f 100644 --- a/docs/_build/scripts/people.py +++ b/docs/_build/scripts/people.py @@ -11,6 +11,10 @@ def get_people_md(): contributors = repo.get_contributors() with open("./docs/people.md", "w") as f: for c in itertools.islice(contributors, 50): + # We love dependabot, but he doesn't need a spot on our website + if c.login == "dependabot[bot]": + continue + f.write( ICON_TEMPLATE.format( login=c.login, diff --git a/docs/_build/snippets/under_construction.md b/docs/_build/snippets/under_construction.md index b9eb58223..094fdb1c6 100644 --- a/docs/_build/snippets/under_construction.md +++ b/docs/_build/snippets/under_construction.md @@ -1,4 +1,4 @@ !!! warning ":construction: Under Construction :construction: " This section is still under development. Want to help out? Consider contributing and making a [pull request](https://github.com/pola-rs/polars-book) to our repository. - Please read our [Contribution Guidelines](https://github.com/pola-rs/polars-book/blob/master/CONTRIBUTING.md) on how to proceed. \ No newline at end of file + Please read our [Contribution Guidelines](https://github.com/pola-rs/polars-book/blob/master/CONTRIBUTING.md) on how to proceed. diff --git a/docs/getting-started/expressions.md b/docs/getting-started/expressions.md index 86813f7a1..ecff07b80 100644 --- a/docs/getting-started/expressions.md +++ b/docs/getting-started/expressions.md @@ -52,7 +52,6 @@ print( ) ``` - ### Filter The `filter` option allows us to create a subset of the `DataFrame`. We use the same `DataFrame` as earlier and we filter between two specified dates. 
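The guide pulls the real snippet in through its `{{code_block}}` macros, which are not visible in this patch. As a rough, self-contained sketch of what filtering between two dates can look like (the column names and dates below are made up for illustration and are not the guide's dataset):

```python
import polars as pl
from datetime import datetime

df = pl.DataFrame(
    {
        "date": [datetime(2022, 1, 1), datetime(2022, 1, 5), datetime(2022, 1, 9)],
        "value": [1.0, 2.5, 3.0],
    }
)

# Keep only the rows whose date falls between the two bounds (inclusive on both sides).
filtered = df.filter(
    pl.col("date").is_between(datetime(2022, 1, 2), datetime(2022, 1, 8), closed="both")
)
print(filtered)
```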
@@ -81,7 +80,6 @@ print( {{code_block('getting-started/expressions','with_columns',['with_columns'])}} - ```python exec="on" result="text" session="getting-started/expressions" print( --8<-- "python/getting-started/expressions.py:with_columns" @@ -112,7 +110,7 @@ print( ```python exec="on" result="text" session="getting-started/expressions" print( --8<-- "python/getting-started/expressions.py:groupby2" -) +) ``` ### Combining operations @@ -124,6 +122,7 @@ Below are some examples on how to combine operations to create the `DataFrame` y ```python exec="on" result="text" session="getting-started/expressions" --8<-- "python/getting-started/expressions.py:combine" ``` + {{code_block('getting-started/expressions','combine2',['select','with_columns'])}} ```python exec="on" result="text" session="getting-started/expressions" diff --git a/docs/getting-started/installation.md b/docs/getting-started/installation.md index dc3d8b4fb..9ef8e10d9 100644 --- a/docs/getting-started/installation.md +++ b/docs/getting-started/installation.md @@ -13,6 +13,7 @@ Polars is a library and installation is as simple as invoking the package manage ``` shell cargo add polars ``` + === ":fontawesome-brands-node-js: NodeJS" ``` shell @@ -23,7 +24,6 @@ Polars is a library and installation is as simple as invoking the package manage To use the library import it into your project - === ":fontawesome-brands-python: Python" ``` python @@ -35,6 +35,7 @@ To use the library import it into your project ``` rust use polars::prelude::*; ``` + === ":fontawesome-brands-node-js: NodeJS" ``` javaScript @@ -43,4 +44,4 @@ To use the library import it into your project // require const pl = require('nodejs-polars'); - ``` \ No newline at end of file + ``` diff --git a/docs/getting-started/joins.md b/docs/getting-started/joins.md index f092b1677..42d875d79 100644 --- a/docs/getting-started/joins.md +++ b/docs/getting-started/joins.md @@ -23,4 +23,4 @@ We can also `concatenate` two `DataFrames`. Vertical concatenation will make the ```python exec="on" result="text" session="getting-started/joins" --8<-- "python/getting-started/joins.py:hstack" -``` \ No newline at end of file +``` diff --git a/docs/getting-started/reading-writing.md b/docs/getting-started/reading-writing.md index 9f652d4d6..67c772f11 100644 --- a/docs/getting-started/reading-writing.md +++ b/docs/getting-started/reading-writing.md @@ -2,7 +2,6 @@ Polars supports reading & writing to all common files (e.g. csv, json, parquet), cloud storage (S3, Azure Blob, BigQuery) and databases (e.g. postgres, mysql). In the following examples we will show how to operate on most common file formats. For the following dataframe - {{code_block('getting-started/reading-writing','dataframe',['DataFrame'])}} ```python exec="on" result="text" session="getting-started/reading" @@ -11,7 +10,7 @@ Polars supports reading & writing to all common files (e.g. csv, json, parquet), #### CSV -Polars has its own fast implementation for csv reading with many flexible configuration options. +Polars has its own fast implementation for csv reading with many flexible configuration options. {{code_block('getting-started/reading-writing','csv',['read_csv','write_csv'])}} @@ -43,4 +42,4 @@ As we can see above, Polars made the datetimes a `string`. We can tell Polars to --8<-- "python/getting-started/reading-writing.py:parquet" ``` -To see more examples and other data formats go to the [User Guide](../user-guide/io/csv.md), section IO. 
\ No newline at end of file +To see more examples and other data formats go to the [User Guide](../user-guide/io/csv.md), section IO. diff --git a/docs/getting-started/series-dataframes.md b/docs/getting-started/series-dataframes.md index d0c5ca93d..496653032 100644 --- a/docs/getting-started/series-dataframes.md +++ b/docs/getting-started/series-dataframes.md @@ -1,11 +1,11 @@ # Series & DataFrames -The core base data structures provided by Polars are `Series` and `DataFrames`. +The core base data structures provided by Polars are `Series` and `DataFrames`. ## Series -Series are a 1-dimensional data structure. Within a series all elements have the same data type (e.g. int, string). -The snippet below shows how to create a simple named `Series` object. In a later section of this getting started guide we will learn how to read data from external sources (e.g. files, database), for now lets keep it simple. +Series are a 1-dimensional data structure. Within a series all elements have the same data type (e.g. int, string). +The snippet below shows how to create a simple named `Series` object. In a later section of this getting started guide we will learn how to read data from external sources (e.g. files, database), for now lets keep it simple. {{code_block('getting-started/series-dataframes','series',['Series'])}} @@ -17,7 +17,6 @@ The snippet below shows how to create a simple named `Series` object. In a later Although it is more common to work directly on a `DataFrame` object, `Series` implement a number of base methods which make it easy to perform transformations. Below are some examples of common operations you might want to perform. Note that these are for illustration purposes and only show a small subset of what is available. - ##### Aggregations `Series` out of the box supports all basic aggregations (e.g. min, max, mean, mode, ...). @@ -84,7 +83,7 @@ The `tail` function shows the last 5 rows of a `DataFrame`. You can also specify #### Sample -If you want to get an impression of the data of your `DataFrame`, you can also use `sample`. With `sample` you get an *n* number of random rows from the `DataFrame`. +If you want to get an impression of the data of your `DataFrame`, you can also use `sample`. With `sample` you get an _n_ number of random rows from the `DataFrame`. {{code_block('getting-started/series-dataframes','sample',['sample'])}} diff --git a/docs/index.md b/docs/index.md index de1a8893a..b231494c4 100644 --- a/docs/index.md +++ b/docs/index.md @@ -2,6 +2,7 @@ hide: - navigation --- + # Polars ![logo](https://raw.githubusercontent.com/pola-rs/polars-static/master/logos/polars_github_logo_rect_dark_name.svg) @@ -30,12 +31,11 @@ hide: Polars is a highly performant DataFrame library for manipulating structured data. The core is written in Rust, but the library is available in Python, Rust & NodeJS. Its key features are: - -- **Fast**: Polars is written from the ground up, designed close to the machine and without external dependencies. -- **I/O**: First class support for all common data storage layers: local, cloud storage & databases. +- **Fast**: Polars is written from the ground up, designed close to the machine and without external dependencies. +- **I/O**: First class support for all common data storage layers: local, cloud storage & databases. - **Easy to use**: Write your queries the way they were intended. Polars, internally, will determine the most efficient way to execute using its query optimizer. 
- **Out of Core**: Polars supports out of core data transformation with its streaming API. Allowing you to process your results without requiring all your data to be in memory at the same time -- **Parallel**: Polars fully utilises the power of your machine by dividing the workload among the available CPU cores without any additional configuration. +- **Parallel**: Polars fully utilises the power of your machine by dividing the workload among the available CPU cores without any additional configuration. - **Vectorized Query Engine**: Polars uses [Apache Arrow](https://arrow.apache.org/), a columnar data format, to process your queries in a vectorized manner. It uses [SIMD](https://en.wikipedia.org/wiki/Single_instruction,_multiple_data) to optimize CPU usage. ## About this guide @@ -43,7 +43,7 @@ Polars is a highly performant DataFrame library for manipulating structured data The `Polars` user guide is intended to live alongside the API documentation. Its purpose is to explain (new) users how to use `Polars` and to provide meaningful examples. The guide is split into two parts: - [Getting Started](getting-started/intro.md): A 10 minute helicopter view of the library and its primary function. -- [User Guide](user-guide/index.md): A detailed explanation of how the library is setup and how to use it most effectively. +- [User Guide](user-guide/index.md): A detailed explanation of how the library is setup and how to use it most effectively. If you are looking for details on a specific level / object, it is probably best to go the API documentation: [Python](https://pola-rs.github.io/polars/py-polars/html/reference/index.html) | [NodeJS](https://pola-rs.github.io/nodejs-polars/index.html) | [Rust](https://docs.rs/polars/latest/polars/). @@ -54,7 +54,6 @@ See the results in h2oai's [db-benchmark](https://duckdblabs.github.io/db-benchm `Polars` [TPCH Benchmark results](https://www.pola.rs/benchmarks.html) are now available on the official website. - ## Example {{code_block('home/example','example',['scan_csv','filter','groupby','collect'])}} @@ -65,16 +64,14 @@ See the results in h2oai's [db-benchmark](https://duckdblabs.github.io/db-benchm ## Community -`Polars` has a very active community with frequent releases (approximately weekly). Below are some of the top contributors to the project: +`Polars` has a very active community with frequent releases (approximately weekly). Below are some of the top contributors to the project: --8<-- "docs/people.md" - -## Contribute +## Contribute Thanks for taking the time to contribute! We appreciate all contributions, from reporting bugs to implementing new features. If you're unclear on how to proceed read our [contribution guide](https://github.com/pola-rs/polars/blob/main/CONTRIBUTING.md) or contact us on [discord](https://discord.com/invite/4UfP5cfBE7). - ## License This project is licensed under the terms of the MIT license. diff --git a/docs/user-guide/concepts/contexts.md b/docs/user-guide/concepts/contexts.md index cf3a812fb..5cb5a9b71 100644 --- a/docs/user-guide/concepts/contexts.md +++ b/docs/user-guide/concepts/contexts.md @@ -1,8 +1,8 @@ # Contexts -Polars has developed its own Domain Specific Language (DSL) for transforming data. The language is very easy to use and allows for complex queries that remain human readable. The two core components of the language are Contexts and Expressions, the latter we will cover in the next section. +Polars has developed its own Domain Specific Language (DSL) for transforming data. 
The language is very easy to use and allows for complex queries that remain human readable. The two core components of the language are Contexts and Expressions, the latter we will cover in the next section. -A context, as implied by the name, refers to the context in which an expression needs to be evaluated. There are three main contexts [^1]: +A context, as implied by the name, refers to the context in which an expression needs to be evaluated. There are three main contexts [^1]: 1. Selection: `df.select([..])`, `df.with_columns([..])` 1. Filtering: `df.filter()` @@ -17,7 +17,7 @@ The examples below are performed on the following `DataFrame`: --8<-- "python/user-guide/concepts/contexts.py:dataframe" ``` -## Select +## Select In the `select` context the selection applies expressions over columns. The expressions in this context must produce `Series` that are all the same length or have a length of 1. @@ -29,7 +29,7 @@ A `Series` of a length of 1 will be broadcasted to match the height of the `Data --8<-- "python/user-guide/concepts/contexts.py:select" ``` -As you can see from the query the `select` context is very powerful and allows you to perform arbitrary expressions independent (and in parallel) of each other. +As you can see from the query the `select` context is very powerful and allows you to perform arbitrary expressions independent (and in parallel) of each other. Similarly to the `select` statement there is the `with_columns` statement which also is an entrance to the selection context. The main difference is that `with_columns` retains the original columns and adds new ones while `select` drops the original columns. @@ -39,9 +39,9 @@ Similarly to the `select` statement there is the `with_columns` statement which --8<-- "python/user-guide/concepts/contexts.py:with_columns" ``` -## Filter +## Filter -In the `filter` context you filter the existing dataframe based on arbritary expression which evaluates to the `Boolean` data type. +In the `filter` context you filter the existing dataframe based on arbritary expression which evaluates to the `Boolean` data type. {{code_block('user-guide/concepts/contexts','filter',['filter'])}} @@ -49,7 +49,7 @@ In the `filter` context you filter the existing dataframe based on arbritary exp --8<-- "python/user-guide/concepts/contexts.py:filter" ``` -## Groupby / Aggregation +## Groupby / Aggregation In the `groupby` context expressions work on groups and thus may yield results of any length (a group may have many members). @@ -61,4 +61,4 @@ In the `groupby` context expressions work on groups and thus may yield results o As you can see from the result all expressions are applied to the group defined by the `groupby` context. Besides the standard `groupby`, `groupby_dynamic`, and `groupby_rolling` are also entrances to the groupby context. -[^1]: There are additional List and SQL contexts which are covered later in this guide. But for simplicity, we leave them out of scope for now. +[^1]: There are additional List and SQL contexts which are covered later in this guide. But for simplicity, we leave them out of scope for now. diff --git a/docs/user-guide/concepts/data-structures.md b/docs/user-guide/concepts/data-structures.md index 4ebd708de..3389469b4 100644 --- a/docs/user-guide/concepts/data-structures.md +++ b/docs/user-guide/concepts/data-structures.md @@ -1,11 +1,11 @@ # Data Structures -The core base data structures provided by Polars are `Series` and `DataFrames`. 
+The core base data structures provided by Polars are `Series` and `DataFrames`. ## Series -Series are a 1-dimensional data structure. Within a series all elements have the same [Data Type](data-types.md) . -The snippet below shows how to create a simple named `Series` object. +Series are a 1-dimensional data structure. Within a series all elements have the same [Data Type](data-types.md) . +The snippet below shows how to create a simple named `Series` object. {{code_block('getting-started/series-dataframes','series',['Series'])}} @@ -33,7 +33,6 @@ The `head` function shows by default the first 5 rows of a `DataFrame`. You can {{code_block('getting-started/series-dataframes','head',['head'])}} - ```python exec="on" result="text" session="getting-started/series" --8<-- "python/getting-started/series-dataframes.py:head" ``` @@ -50,7 +49,7 @@ The `tail` function shows the last 5 rows of a `DataFrame`. You can also specify #### Sample -If you want to get an impression of the data of your `DataFrame`, you can also use `sample`. With `sample` you get an *n* number of random rows from the `DataFrame`. +If you want to get an impression of the data of your `DataFrame`, you can also use `sample`. With `sample` you get an _n_ number of random rows from the `DataFrame`. {{code_block('getting-started/series-dataframes','sample',['sample'])}} @@ -67,4 +66,3 @@ If you want to get an impression of the data of your `DataFrame`, you can also u ```python exec="on" result="text" session="getting-started/series" --8<-- "python/getting-started/series-dataframes.py:describe" ``` - diff --git a/docs/user-guide/concepts/data-types.md b/docs/user-guide/concepts/data-types.md index 3bfeedb5a..c63c9b4a3 100644 --- a/docs/user-guide/concepts/data-types.md +++ b/docs/user-guide/concepts/data-types.md @@ -4,28 +4,28 @@ cache-efficient and well-supported for Inter Process Communication. Most data types follow the exact implementation from `Arrow`, with the exception of `Utf8` (this is actually `LargeUtf8`), `Categorical`, and `Object` (support is limited). The data types are: -|Group| Type | Details | -|-----|-------|-------------| -|Numeric| `Int8`| 8-bit signed integer.| -|| `Int16`| 16-bit signed integer.| -|| `Int32`| 32-bit signed integer.| -|| `Int64`| 64-bit signed integer.| -|| `UInt8`| 8-bit unsigned integer.| -|| `UInt16`| 16-bit unsigned integer.| -|| `UInt32`| 32-bit unsigned integer.| -|| `UInt64`| 64-bit unsigned integer.| -|| `Float32`| 32-bit floating point.| -|| `Float64`| 64-bit floating point.| -|Nested| `Struct`| A struct array is represented as a `Vec` and is useful to pack multiple/heterogenous values in a single column.| -|| `List`| A list array contains a child array containing the list values and an offset array. (this is actually `Arrow` `LargeList` internally).| -|Temporal| `Date`| Date representation, internally represented as days since UNIX epoch encoded by a 32-bit signed integer.| -|| `Datetime`| Datetime representation, internally represented as microseconds since UNIX epoch encoded by a 64-bit signed integer.| -|| `Duration`| A timedelta type, internally represented as microseconds. 
Created when subtracting `Date/Datetime`.| -|| `Time`| Time representation, internally represented as nanoseconds since midnight.| -|Other| `Boolean`| Boolean type effectively bit packed.| -|| `Utf8`| String data (this is actually `Arrow` `LargeUtf8` internally).| -|| `Binary`| Store data as bytes.| -|| `Object`| A limited supported data type that can be any value.| -|| `Categorical` | A categorical encoding of a set of strings.| +| Group | Type | Details | +| -------- | ------------- | -------------------------------------------------------------------------------------------------------------------------------------- | +| Numeric | `Int8` | 8-bit signed integer. | +| | `Int16` | 16-bit signed integer. | +| | `Int32` | 32-bit signed integer. | +| | `Int64` | 64-bit signed integer. | +| | `UInt8` | 8-bit unsigned integer. | +| | `UInt16` | 16-bit unsigned integer. | +| | `UInt32` | 32-bit unsigned integer. | +| | `UInt64` | 64-bit unsigned integer. | +| | `Float32` | 32-bit floating point. | +| | `Float64` | 64-bit floating point. | +| Nested | `Struct` | A struct array is represented as a `Vec` and is useful to pack multiple/heterogenous values in a single column. | +| | `List` | A list array contains a child array containing the list values and an offset array. (this is actually `Arrow` `LargeList` internally). | +| Temporal | `Date` | Date representation, internally represented as days since UNIX epoch encoded by a 32-bit signed integer. | +| | `Datetime` | Datetime representation, internally represented as microseconds since UNIX epoch encoded by a 64-bit signed integer. | +| | `Duration` | A timedelta type, internally represented as microseconds. Created when subtracting `Date/Datetime`. | +| | `Time` | Time representation, internally represented as nanoseconds since midnight. | +| Other | `Boolean` | Boolean type effectively bit packed. | +| | `Utf8` | String data (this is actually `Arrow` `LargeUtf8` internally). | +| | `Binary` | Store data as bytes. | +| | `Object` | A limited supported data type that can be any value. | +| | `Categorical` | A categorical encoding of a set of strings. | -To learn more about the internal representation of these data types, check the [`Arrow` columnar format](https://arrow.apache.org/docs/format/Columnar.html). \ No newline at end of file +To learn more about the internal representation of these data types, check the [`Arrow` columnar format](https://arrow.apache.org/docs/format/Columnar.html). diff --git a/docs/user-guide/concepts/expressions.md b/docs/user-guide/concepts/expressions.md index 72f6deb73..82737d519 100644 --- a/docs/user-guide/concepts/expressions.md +++ b/docs/user-guide/concepts/expressions.md @@ -26,9 +26,8 @@ Polars expressions are a mapping from a series to a series (or mathematically `F ## Examples The following is an expression: - -{{code_block('user-guide/concepts/expressions','example1',['col','sort','head'])}} +{{code_block('user-guide/concepts/expressions','example1',['col','sort','head'])}} The snippet above says: @@ -37,15 +36,14 @@ The snippet above says: 1. Then take the first two values of the sorted output The power of expressions is that every expression produces a new expression, and that they -can be *piped* together. You can run an expression by passing them to one of `Polars` execution contexts. +can be _piped_ together. You can run an expression by passing them to one of `Polars` execution contexts. 
Here we run two expressions by running `df.select`: {{code_block('user-guide/concepts/expressions','example2',['select'])}} - All expressions are run in parallel, meaning that separate `Polars` expressions are **embarrassingly parallel**. Note that within an expression there may be more parallelization going on. ## Conclusion -This is the tip of the iceberg in terms of possible expressions. There are a ton more, and they can be combined in a variety of ways. This page is intended to get you familiar with the concept of expressions, in the section on [expressions](../expressions/operators.md) we will dive deeper. \ No newline at end of file +This is the tip of the iceberg in terms of possible expressions. There are a ton more, and they can be combined in a variety of ways. This page is intended to get you familiar with the concept of expressions, in the section on [expressions](../expressions/operators.md) we will dive deeper. diff --git a/docs/user-guide/concepts/lazy-vs-eager.md b/docs/user-guide/concepts/lazy-vs-eager.md index 99ba3361d..edc720785 100644 --- a/docs/user-guide/concepts/lazy-vs-eager.md +++ b/docs/user-guide/concepts/lazy-vs-eager.md @@ -4,10 +4,9 @@ {{code_block('user-guide/concepts/lazy-vs-eager','eager',['read_csv'])}} - In this example we use the eager API to: -1. Read the iris [dataset](https://archive.ics.uci.edu/ml/datasets/iris). +1. Read the iris [dataset](https://archive.ics.uci.edu/ml/datasets/iris). 1. Filter the dataset based on sepal length 1. Calculate the mean of the sepal width per species @@ -20,13 +19,10 @@ Every step is executed immediately returning the intermediate results. This can These will significantly lower the load on memory & CPU thus allowing you to fit bigger datasets in memory and process faster. Once the query is defined you call `collect` to inform `Polars` that you want to execute it. In the section on Lazy API we will go into more details on its implementation. - !!! info "Eager API" - In many cases the eager API is actually calling the lazy API under the hood and immediately collecting the result. This has the benefit that within the query itself optimization(s) made by the query planner can still take place. - + In many cases the eager API is actually calling the lazy API under the hood and immediately collecting the result. This has the benefit that within the query itself optimization(s) made by the query planner can still take place. ### When to use which -In general the lazy API should be preferred unless you are either interested in the intermediate results or are doing exploratory work and don't know yet what your query is going to look like. - +In general the lazy API should be preferred unless you are either interested in the intermediate results or are doing exploratory work and don't know yet what your query is going to look like. diff --git a/docs/user-guide/concepts/streaming.md b/docs/user-guide/concepts/streaming.md index a5c27ede3..b5e434576 100644 --- a/docs/user-guide/concepts/streaming.md +++ b/docs/user-guide/concepts/streaming.md @@ -1,6 +1,6 @@ # Streaming API -One additional benefit of the lazy API is that it allows queries to be executed in a streaming manner. Instead of processing the data all-at-once `Polars` can execute the query in batches allowing you to process datasets that are larger-than-memory. +One additional benefit of the lazy API is that it allows queries to be executed in a streaming manner. 
Instead of processing the data all-at-once `Polars` can execute the query in batches allowing you to process datasets that are larger-than-memory. To tell Polars we want to execute a query in streaming mode we pass the `streaming=True` argument to `collect` @@ -18,4 +18,4 @@ Streaming is supported for many operations including: - `join` - `sort` - `explode`,`melt` -- `scan_csv`,`scan_parquet`,`scan_ipc` \ No newline at end of file +- `scan_csv`,`scan_parquet`,`scan_ipc` diff --git a/docs/user-guide/expressions/aggregation.md b/docs/user-guide/expressions/aggregation.md index db398d9e2..98f156f19 100644 --- a/docs/user-guide/expressions/aggregation.md +++ b/docs/user-guide/expressions/aggregation.md @@ -3,7 +3,7 @@ `Polars` implements a powerful syntax defined not only in its lazy API, but also in its eager API. Let's take a look at what that means. We can start with the simple [US congress `dataset`](https://github.com/unitedstates/congress-legislators). - + {{code_block('user-guide/expressions/aggregation','dataframe',['DataFrame','Categorical'])}} #### Basic aggregations @@ -28,7 +28,6 @@ we have a nice summary overview. {{code_block('user-guide/expressions/aggregation','basic',['groupby'])}} - ```python exec="on" result="text" session="user-guide/expressions" --8<-- "python/user-guide/expressions/aggregation.py:setup" --8<-- "python/user-guide/expressions/aggregation.py:dataframe" @@ -64,7 +63,8 @@ rows from the `DataFrame` (because we need those rows for another aggregation). In the example below we show how this can be done. !!! note - Note that we can make `Python` functions for clarity. These functions don't cost us anything. That is because we only create `Polars` expressions, we don't apply a custom function over a `Series` during runtime of the query. Of course, you can make functions that return expressions in Rust, too. + + Note that we can make `Python` functions for clarity. These functions don't cost us anything. That is because we only create `Polars` expressions, we don't apply a custom function over a `Series` during runtime of the query. Of course, you can make functions that return expressions in Rust, too. {{code_block('user-guide/expressions/aggregation','filter',['groupby'])}} @@ -112,12 +112,11 @@ code preventing any multiple threads from executing the function. This all feels terribly limiting, especially because we often need those `lambda` functions in a `.groupby()` step, for example. This approach is still supported by `Polars`, but -keeping in mind bytecode **and** the GIL costs have to be paid. It is recommended to try to solve your queries using the expression syntax before moving to `lambdas`. If you want to learn more about using `lambdas`, go to the [user defined functions section](./user-defined-functions.md). - +keeping in mind bytecode **and** the GIL costs have to be paid. It is recommended to try to solve your queries using the expression syntax before moving to `lambdas`. If you want to learn more about using `lambdas`, go to the [user defined functions section](./user-defined-functions.md). ### Conclusion In the examples above we've seen that we can do a lot by combining expressions. By doing so we delay the use of custom `Python` functions that slow down the queries (by the slow nature of Python AND the GIL). If we are missing a type expression let us know by opening a -[feature request](https://github.com/pola-rs/polars/issues/new/choose)! \ No newline at end of file +[feature request](https://github.com/pola-rs/polars/issues/new/choose)! 
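As a small illustration of the conclusion above (solving an aggregation with expressions rather than `lambda`s), here is a self-contained sketch on made-up data; the column names are illustrative and are not from the congress dataset used in this section:

```python
import polars as pl

df = pl.DataFrame(
    {
        "group": ["a", "a", "b", "b", "b"],
        "value": [1, 2, 3, 4, 5],
    }
)

# Pure expression syntax: runs on the Polars engine, in parallel, with no Python callbacks.
out = df.groupby("group").agg(
    [
        pl.col("value").mean().alias("mean"),
        (pl.col("value") > 2).sum().alias("n_above_two"),
    ]
)

# The discouraged alternative would push a Python lambda into the aggregation
# (for example via `apply`), paying the bytecode and GIL costs described above.
print(out)
```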
diff --git a/docs/user-guide/expressions/casting.md b/docs/user-guide/expressions/casting.md index 4a0c3627b..cb06699fa 100644 --- a/docs/user-guide/expressions/casting.md +++ b/docs/user-guide/expressions/casting.md @@ -1,7 +1,6 @@ # Casting -Casting converts the underlying [`DataType`](../concepts/data-types.md) of a column to a new one. Polars uses Arrow to manage the data in memory and relies on the compute kernels in the [rust implementation](https://github.com/jorgecarleitao/arrow2) to do the conversion. Casting is available with the `cast()` method. - +Casting converts the underlying [`DataType`](../concepts/data-types.md) of a column to a new one. Polars uses Arrow to manage the data in memory and relies on the compute kernels in the [rust implementation](https://github.com/jorgecarleitao/arrow2) to do the conversion. Casting is available with the `cast()` method. The `cast` method includes a `strict` parameter that determines how Polars behaves when it encounters a value that can't be converted from the source `DataType` to the target `DataType`. By default, `strict=True`, which means that Polars will throw an error to notify the user of the failed conversion and provide details on the values that couldn't be cast. On the other hand, if `strict=False`, any values that can't be converted to the target `DataType` will be quietly converted to `null`. @@ -26,7 +25,6 @@ To perform casting operations between floats and integers, or vice versa, we can Note that in the case of decimal values these are rounded downwards when casting to an integer. - ##### Downcast Reducing the memory footprint is also achievable by modifying the number of bits allocated to an element. As an illustration, the code below demonstrates how casting from `Int64` to `Int16` and from `Float64` to `Float32` can be used to lower memory usage. @@ -100,5 +98,3 @@ To perform casting operations between strings and `Dates`/`Datetimes`, `strftime ```python exec="on" result="text" session="user-guide/cast" --8<-- "python/user-guide/expressions/casting.py:dates2" ``` - - diff --git a/docs/user-guide/expressions/column_selections.md b/docs/user-guide/expressions/column_selections.md index 298f49014..d7453470f 100644 --- a/docs/user-guide/expressions/column_selections.md +++ b/docs/user-guide/expressions/column_selections.md @@ -1,4 +1,3 @@ - # Column Selections Let's create a dataset to use in this section: @@ -12,7 +11,7 @@ Let's create a dataset to use in this section: ## Expression Expansion -As we've seen in the previous section, we can select specific columns using the `pl.col` method. It can also select multiple columns - both as a means of convenience, and to *expand* the expression. +As we've seen in the previous section, we can select specific columns using the `pl.col` method. It can also select multiple columns - both as a means of convenience, and to _expand_ the expression. This kind of convenience feature isn't just decorative or syntactic sugar. It allows for a very powerful application of [DRY](https://en.wikipedia.org/wiki/Don%27t_repeat_yourself) principles in your code: a single expression that specifies multiple columns expands into a list of expressions (depending on the DataFrame schema), resulting in being able to select multiple columns + run computation on them! @@ -21,11 +20,12 @@ This kind of convenience feature isn't just decorative or syntactic sugar. 
It al We can select all columns in the `DataFrame` object by providing the argument `*`: {{code_block('user-guide/expressions/column_selections', 'all',['all'])}} + ```python exec="on" result="text" session="user-guide/column_selections" --8<-- "python/user-guide/expressions/column_selections.py:all" ``` -Often, we don't just want to include all columns, but include all *while* excluding a few. This can be done easily as well: +Often, we don't just want to include all columns, but include all _while_ excluding a few. This can be done easily as well: {{code_block('user-guide/expressions/column_selections','exclude',['exclude'])}} @@ -35,7 +35,7 @@ Often, we don't just want to include all columns, but include all *while* exclud ### By multiple strings -Specifying multiple strings allows expressions to *expand* to all matching columns: +Specifying multiple strings allows expressions to _expand_ to all matching columns: {{code_block('user-guide/expressions/column_selections','expansion_by_names',['dt_to_string'])}} @@ -79,7 +79,7 @@ To select just the integer and string columns, we can do: ### Applying set operations -These *selectors* also allow for set based selection operations. For instance, to select the **numeric** columns **except** the **first** column that indicates row numbers: +These _selectors_ also allow for set based selection operations. For instance, to select the **numeric** columns **except** the **first** column that indicates row numbers: {{code_block('user-guide/expressions/column_selections','selectors_diff',['cs_first', 'cs_numeric'])}} @@ -97,7 +97,7 @@ We can also select the row number by name **and** any **non**-numeric columns: ### By patterns and substrings -*Selectors* can also be matched by substring and regex patterns: +_Selectors_ can also be matched by substring and regex patterns: {{code_block('user-guide/expressions/column_selections','selectors_by_name',['cs_contains', 'cs_matches'])}} diff --git a/docs/user-guide/expressions/folds.md b/docs/user-guide/expressions/folds.md index 842712604..5b0d0d7ef 100644 --- a/docs/user-guide/expressions/folds.md +++ b/docs/user-guide/expressions/folds.md @@ -16,7 +16,6 @@ Let's start with an example by implementing the `sum` operation ourselves, with --8<-- "python/user-guide/expressions/folds.py:mansum" ``` - The snippet above recursively applies the function `f(acc, x) -> acc` to an accumulator `acc` and a new column `x`. The function operates on columns individually and can take advantage of cache efficiency and vectorization. 
### Conditional diff --git a/docs/user-guide/expressions/functions.md b/docs/user-guide/expressions/functions.md index 335cb2ec1..4e3532c22 100644 --- a/docs/user-guide/expressions/functions.md +++ b/docs/user-guide/expressions/functions.md @@ -18,9 +18,10 @@ By default if you perform an expression it will keep the same name as the origin {{code_block('user-guide/expressions/functions','samename',[])}} === ":fontawesome-brands-python: Python" - ``` python - --8<-- "python/user-guide/expressions/functions.py:samename" - ``` + +```python +--8<-- "python/user-guide/expressions/functions.py:samename" +``` ```python exec="on" result="text" session="user-guide/functions" --8<-- "python/user-guide/expressions/functions.py:samename" @@ -34,7 +35,7 @@ This might get problematic in the case you use the same column multiple times in --8<-- "python/user-guide/expressions/functions.py:samenametwice" ``` -You can change the output name of an expression by using the `alias` function +You can change the output name of an expression by using the `alias` function {{code_block('user-guide/expressions/functions','samenamealias',['alias'])}} @@ -42,18 +43,17 @@ You can change the output name of an expression by using the `alias` function --8<-- "python/user-guide/expressions/functions.py:samenamealias" ``` -In case of multiple columns for example when using `all()` or `col(*)` you can apply a mapping function `map_alias` to change the original column name into something else. In case you want to add a suffix (`suffix()`) or prefix (`prefix()`) these are also built in. +In case of multiple columns for example when using `all()` or `col(*)` you can apply a mapping function `map_alias` to change the original column name into something else. In case you want to add a suffix (`suffix()`) or prefix (`prefix()`) these are also built in. === ":fontawesome-brands-python: Python" - [:material-api: `prefix`](https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.prefix.html) - [:material-api: `suffix`](https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.suffix.html) - [:material-api: `map_alias`](https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.map_alias.html) +[:material-api: `prefix`](https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.prefix.html) +[:material-api: `suffix`](https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.suffix.html) +[:material-api: `map_alias`](https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.Expr.map_alias.html) ## Count Unique Values There are two ways to count unique values in `Polars`: an exact methodology and an approximation. The approximation uses the [HyperLogLog++](https://en.wikipedia.org/wiki/HyperLogLog) algorithm to approximate the cardinality and is especially useful for very large datasets where an approximation is good enough. - {{code_block('user-guide/expressions/functions','countunique',['n_unique','approx_n_unique'])}} ```python exec="on" result="text" session="user-guide/functions" diff --git a/docs/user-guide/expressions/lists.md b/docs/user-guide/expressions/lists.md index 5850c31fb..b87b656f2 100644 --- a/docs/user-guide/expressions/lists.md +++ b/docs/user-guide/expressions/lists.md @@ -9,6 +9,7 @@ Note: this is different from Python's `list` object, where the elements can be o Let's say we had the following data from different weather stations across a state. 
When the weather station is unable to get a result, an error code is recorded instead of the actual temperature at that time. {{code_block('user-guide/expressions/lists','weather_df',['DataFrame'])}} + ```python exec="on" result="text" session="user-guide/lists" --8<-- "python/user-guide/expressions/lists.py:setup" --8<-- "python/user-guide/expressions/lists.py:weather_df" @@ -19,6 +20,7 @@ Let's say we had the following data from different weather stations across a sta For the `weather` `DataFrame` created above, it's very likely we need to run some analysis on the temperatures that are captured by each station. To make this happen, we need to first be able to get individual temperature measurements. This is done by: {{code_block('user-guide/expressions/lists','string_to_list',['str.split'])}} + ```python exec="on" result="text" session="user-guide/lists" --8<-- "python/user-guide/expressions/lists.py:string_to_list" ``` @@ -26,6 +28,7 @@ For the `weather` `DataFrame` created above, it's very likely we need to run som One way we could go post this would be to convert each temperature measurement into its own row: {{code_block('user-guide/expressions/lists','explode_to_atomic',['DataFrame.explode'])}} + ```python exec="on" result="text" session="user-guide/lists" --8<-- "python/user-guide/expressions/lists.py:explode_to_atomic" ``` @@ -37,6 +40,7 @@ However, in Polars, we often do not need to do this to operate on the `List` ele Polars provides several standard operations on `List` columns. If we want the first three measurements, we can do a `head(3)`. The last three can be obtained via a `tail(3)`, or alternately, via `slice` (negative indexing is supported). We can also identify the number of observations via `lengths`. Let's see them in action: {{code_block('user-guide/expressions/lists','list_ops',['Expr.List'])}} + ```python exec="on" result="text" session="user-guide/lists" --8<-- "python/user-guide/expressions/lists.py:list_ops" ``` @@ -57,13 +61,15 @@ If we need to identify the stations that are giving the most number of errors fr The third step requires a casting (or alternately, a regex pattern search) operation to be perform on each element of the list. We can do this using by applying the operation on each element by first referencing them in the `pl.element()` context, and then calling a suitable Polars expression on them. Let's see how: {{code_block('user-guide/expressions/lists','count_errors',['Expr.List', 'element'])}} + ```python exec="on" result="text" session="user-guide/lists" --8<-- "python/user-guide/expressions/lists.py:count_errors" ``` -What if we chose the regex route (i.e. recognizing the presence of *any* alphabetical character?) +What if we chose the regex route (i.e. recognizing the presence of _any_ alphabetical character?) {{code_block('user-guide/expressions/lists','count_errors_regex',['str.contains'])}} + ```python exec="on" result="text" session="user-guide/lists" --8<-- "python/user-guide/expressions/lists.py:count_errors_regex" ``` @@ -77,6 +83,7 @@ This context is ideal for computing in row orientation. We can apply **any** Polars operations on the elements of the list with the `list.eval` (`list().eval` in Rust) expression! These expressions run entirely on Polars' query engine and can run in parallel, so will be well optimized. 
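As a rough sketch of the `list.eval` pattern on made-up data (this is only an illustration assuming a list column named `temps`, not the weather example that follows):

```python
import polars as pl

df = pl.DataFrame({"temps": [[2, 4, 6], [1, 3]]})

# Inside list.eval, pl.element() refers to each element of the list,
# so ordinary expressions can be run on the list's contents, row by row.
out = df.with_columns(
    pl.col("temps").list.eval(pl.element().rank(), parallel=True).alias("temps_rank")
)
print(out)
```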
Let's say we have another set of weather data across three days, for different stations: {{code_block('user-guide/expressions/lists','weather_by_day',['DataFrame'])}} + ```python exec="on" result="text" session="user-guide/lists" --8<-- "python/user-guide/expressions/lists.py:weather_by_day" ``` @@ -84,6 +91,7 @@ We can apply **any** Polars operations on the elements of the list with the `lis Let's do something interesting, where we calculate the percentage rank of the temperatures by day, measured across stations. Pandas allows you to compute the percentages of the `rank` values. `Polars` doesn't provide a special function to do this directly, but because expressions are so versatile we can create our own percentage rank expression for highest temperature. Let's try that! {{code_block('user-guide/expressions/lists','weather_by_day_rank',['list.eval'])}} + ```python exec="on" result="text" session="user-guide/lists" --8<-- "python/user-guide/expressions/lists.py:weather_by_day_rank" ``` @@ -95,6 +103,7 @@ Let's do something interesting, where we calculate the percentage rank of the te We can define `Array` columns in this manner: {{code_block('user-guide/expressions/lists','array_df',['Array'])}} + ```python exec="on" result="text" session="user-guide/lists" --8<-- "python/user-guide/expressions/lists.py:array_df" ``` @@ -102,6 +111,7 @@ We can define `Array` columns in this manner: Basic operations are available on it: {{code_block('user-guide/expressions/lists','array_ops',['arr'])}} + ```python exec="on" result="text" session="user-guide/lists" --8<-- "python/user-guide/expressions/lists.py:array_ops" ``` diff --git a/docs/user-guide/expressions/null.md b/docs/user-guide/expressions/null.md index e22578452..5ded317ac 100644 --- a/docs/user-guide/expressions/null.md +++ b/docs/user-guide/expressions/null.md @@ -17,8 +17,8 @@ You can manually define a missing value with the python `None` value: --8<-- "python/user-guide/expressions/null.py:dataframe" ``` - !!! info + In `Pandas` the value for missing data depends on the dtype of the column. In `Polars` missing data is always represented as a `null` value. ## Missing data metadata @@ -35,7 +35,7 @@ The first piece of metadata is the `null_count` - this is the number of rows wit The `null_count` method can be called on a `DataFrame`, a column from a `DataFrame` or a `Series`. The `null_count` method is a cheap operation as `null_count` is already calculated for the underlying Arrow array. -The second piece of metadata is an array called a *validity bitmap* that indicates whether each data value is valid or missing. +The second piece of metadata is an array called a _validity bitmap_ that indicates whether each data value is valid or missing. The validity bitmap is memory efficient as it is bit encoded - each value is either a 0 or a 1. This bit encoding means the memory overhead per array is only (array length / 8) bytes. The validity bitmap is used by the `is_null` method in `Polars`. 
You can return a `Series` based on the validity bitmap for a column in a `DataFrame` or a `Series` with the `is_null` method: @@ -79,7 +79,6 @@ We can fill the missing data with a specified literal value with `pl.lit`: We can fill the missing data with a strategy such as filling forward: - {{code_block('user-guide/expressions/null','fillstrategy',['fill_null'])}} ```python exec="on" result="text" session="user-guide/null" @@ -111,7 +110,6 @@ In addition, we can fill nulls with interpolation (without using the `fill_null` --8<-- "python/user-guide/expressions/null.py:fillinterpolate" ``` - ## `NotaNumber` or `NaN` values Missing data in a `Series` has a `null` value. However, you can use `NotaNumber` or `NaN` values in columns with float datatypes. These `NaN` values can be created from Numpy's `np.nan` or the native python `float('nan')`: @@ -123,6 +121,7 @@ Missing data in a `Series` has a `null` value. However, you can use `NotaNumber` ``` !!! info + In `Pandas` by default a `NaN` value in an integer column causes the column to be cast to float. This does not happen in `Polars` - instead an exception is raised. `NaN` values are considered to be a type of floating point data and are **not considered to be missing data** in `Polars`. This means: @@ -138,4 +137,4 @@ One further difference between `null` and `NaN` values is that taking the `mean` ```python exec="on" result="text" session="user-guide/null" --8<-- "python/user-guide/expressions/null.py:nanfill" -``` \ No newline at end of file +``` diff --git a/docs/user-guide/expressions/numpy.md b/docs/user-guide/expressions/numpy.md index 0e9b99256..6449ffd63 100644 --- a/docs/user-guide/expressions/numpy.md +++ b/docs/user-guide/expressions/numpy.md @@ -1,4 +1,4 @@ -# Numpy +# Numpy `Polars` expressions support `NumPy` [ufuncs](https://numpy.org/doc/stable/reference/ufuncs.html). See [here](https://numpy.org/doc/stable/reference/ufuncs.html#available-ufuncs) for a list on all supported numpy functions. diff --git a/docs/user-guide/expressions/operators.md b/docs/user-guide/expressions/operators.md index 4fe2d9d18..47499af4f 100644 --- a/docs/user-guide/expressions/operators.md +++ b/docs/user-guide/expressions/operators.md @@ -27,4 +27,4 @@ This section describes how to use basic operators (e.g. addition, substraction) ```python exec="on" result="text" session="user-guide/operators" --8<-- "python/user-guide/expressions/operators.py:logical" -``` \ No newline at end of file +``` diff --git a/docs/user-guide/expressions/strings.md b/docs/user-guide/expressions/strings.md index 2a3fca7db..f1722ca77 100644 --- a/docs/user-guide/expressions/strings.md +++ b/docs/user-guide/expressions/strings.md @@ -15,12 +15,10 @@ The `str` namespace can be accessed through the `.str` attribute of a column wit --8<-- "python/user-guide/expressions/strings.py:df" ``` - #### String Parsing `Polars` offers multiple methods for checking and parsing elements of a string. Firstly, we can use the `contains` method to check whether a given pattern exists within a substring. Subsequently, we can extract these patterns and replace them using other methods, which will be demonstrated in upcoming examples. - ##### Check for existence of a pattern To check for the presence of a pattern within a string, we can use the contains method. The `contains` method accepts either a regular substring or a regex pattern, depending on the value of the `literal` parameter. 
If the pattern we're searching for is a simple substring located either at the beginning or end of the string, we can alternatively use the `starts_with` and `ends_with` functions. @@ -31,7 +29,6 @@ To check for the presence of a pattern within a string, we can use the contains --8<-- "python/user-guide/expressions/strings.py:existence" ``` - ##### Extract a pattern The `extract` method allows us to extract a pattern from a specified string. This method takes a regex pattern containing one or more capture groups, which are defined by parentheses `()` in the pattern. The group index indicates which capture group to output. @@ -50,7 +47,6 @@ To extract all occurrences of a pattern within a string, we can use the `extract --8<-- "python/user-guide/expressions/strings.py:extract_all" ``` - ##### Replace a pattern We have discussed two methods for pattern matching and extraction thus far, and now we will explore how to replace a pattern within a string. Similar to `extract` and `extract_all`, Polars provides the `replace` and `replace_all` methods for this purpose. In the example below we replace one match of `abc` at the end of a word (`\b`) by `ABC` and we replace all occurrence of `a` with `-`. @@ -61,7 +57,6 @@ We have discussed two methods for pattern matching and extraction thus far, and --8<-- "python/user-guide/expressions/strings.py:replace" ``` - #### API Documentation -In addition to the examples covered above, Polars offers various other string manipulation methods for tasks such as formatting, stripping, splitting, and more. To explore these additional methods, you can go to the API documentation of your chosen programming language for Polars. \ No newline at end of file +In addition to the examples covered above, Polars offers various other string manipulation methods for tasks such as formatting, stripping, splitting, and more. To explore these additional methods, you can go to the API documentation of your chosen programming language for Polars. diff --git a/docs/user-guide/expressions/structs.md b/docs/user-guide/expressions/structs.md index 750ce83a4..9973e61d4 100644 --- a/docs/user-guide/expressions/structs.md +++ b/docs/user-guide/expressions/structs.md @@ -2,9 +2,10 @@ Polars `Struct`s are the idiomatic way of working with multiple columns. It is also a free operation i.e. moving columns into `Struct`s does not copy any data! -For this section, let's start with a `DataFrame` that captures the average rating of a few movies across some states in the U.S.: +For this section, let's start with a `DataFrame` that captures the average rating of a few movies across some states in the U.S.: {{code_block('user-guide/expressions/structs','ratings_df',['DataFrame'])}} + ```python exec="on" result="text" session="user-guide/structs" --8<-- "python/user-guide/expressions/structs.py:setup" --8<-- "python/user-guide/expressions/structs.py:ratings_df" @@ -15,6 +16,7 @@ For this section, let's start with a `DataFrame` that captures the average ratin A common operation that will lead to a `Struct` column is the ever so popular `value_counts` function that is commonly used in exploratory data analysis. 
Checking the number of times a state appears the data will be done as so: {{code_block('user-guide/expressions/structs','state_value_counts',['value_counts'])}} + ```python exec="on" result="text" session="user-guide/structs" --8<-- "python/user-guide/expressions/structs.py:state_value_counts" ``` @@ -22,19 +24,21 @@ A common operation that will lead to a `Struct` column is the ever so popular `v Quite unexpected an output, especially if coming from tools that do not have such a data type. We're not in peril though, to get back to a more familiar output, all we need to do is `unnest` the `Struct` column into its constituent columns: {{code_block('user-guide/expressions/structs','struct_unnest',['unnest'])}} + ```python exec="on" result="text" session="user-guide/structs" --8<-- "python/user-guide/expressions/structs.py:struct_unnest" ``` !!! note "Why `value_counts` returns a `Struct`" - Polars expressions always have a `Fn(Series) -> Series` signature and `Struct` is thus the data type that allows us to provide multiple columns as input/ouput of an expression. In other words, all expressions have to return a `Series` object, and `Struct` allows us to stay consistent with that requirement. + Polars expressions always have a `Fn(Series) -> Series` signature and `Struct` is thus the data type that allows us to provide multiple columns as input/ouput of an expression. In other words, all expressions have to return a `Series` object, and `Struct` allows us to stay consistent with that requirement. ## Structs as `dict`s Polars will interpret a `dict` sent to the `Series` constructor as a `Struct`: {{code_block('user-guide/expressions/structs','series_struct',['Series'])}} + ```python exec="on" result="text" session="user-guide/structs" --8<-- "python/user-guide/expressions/structs.py:series_struct" ``` @@ -49,6 +53,7 @@ Polars will interpret a `dict` sent to the `Series` constructor as a `Struct`: Let's say that we needed to obtain just the `movie` value in the `Series` that we created above. We can use the `field` method to do so: {{code_block('user-guide/expressions/structs','series_struct_extract',['field'])}} + ```python exec="on" result="text" session="user-guide/structs" --8<-- "python/user-guide/expressions/structs.py:series_struct_extract" ``` @@ -58,6 +63,7 @@ Let's say that we needed to obtain just the `movie` value in the `Series` that w What if we need to rename individual `field`s of a `Struct` column? We first convert the `rating_Series` object to a `DataFrame` so that we can view the changes easily, and then use the `rename_fields` method: {{code_block('user-guide/expressions/structs','series_struct_rename',['rename_fields'])}} + ```python exec="on" result="text" session="user-guide/structs" --8<-- "python/user-guide/expressions/structs.py:series_struct_rename" ``` @@ -69,6 +75,7 @@ What if we need to rename individual `field`s of a `Struct` column? We first con Let's get back to the `ratings` data. We want to identify cases where there are duplicates at a `Movie` and `Theatre` level. This is where the `Struct` datatype shines: {{code_block('user-guide/expressions/structs','struct_duplicates',['is_duplicated', 'struct'])}} + ```python exec="on" result="text" session="user-guide/structs" --8<-- "python/user-guide/expressions/structs.py:struct_duplicates" ``` @@ -77,9 +84,10 @@ We can identify the unique cases at this level also with `is_unique`! ### Multi-column ranking -Suppose, given that we know there are duplicates, we want to choose which rank gets a higher priority. 
We define *Count* of ratings to be more important than the actual `Avg_Rating` themselves, and only use it to break a tie. We can then do: +Suppose, given that we know there are duplicates, we want to choose which rank gets a higher priority. We define _Count_ of ratings to be more important than the actual `Avg_Rating` themselves, and only use it to break a tie. We can then do: {{code_block('user-guide/expressions/structs','struct_ranking',['is_duplicated', 'struct'])}} + ```python exec="on" result="text" session="user-guide/structs" --8<-- "python/user-guide/expressions/structs.py:struct_ranking" ``` @@ -88,5 +96,4 @@ That's a pretty complex set of requirements done very elegantly in Polars! ### Using multi-column apply -This was discussed in the previous section on *User Defined Functions*. - +This was discussed in the previous section on _User Defined Functions_. diff --git a/docs/user-guide/expressions/user-defined-functions.md b/docs/user-guide/expressions/user-defined-functions.md index 942225128..2e78a8716 100644 --- a/docs/user-guide/expressions/user-defined-functions.md +++ b/docs/user-guide/expressions/user-defined-functions.md @@ -24,20 +24,22 @@ aggregated! Use cases for `map` are for instance passing the `Series` in an expression to a third party library. Below we show how we could use `map` to pass an expression column to a neural network model. - === ":fontawesome-brands-python: Python" - [:material-api: `map`](https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.map.html) - ``` python - df.with_columns([ - pl.col("features").map(lambda s: MyNeuralNetwork.forward(s.to_numpy())).alias("activations") - ]) - ``` +[:material-api: `map`](https://pola-rs.github.io/polars/py-polars/html/reference/expressions/api/polars.map.html) + +```python +df.with_columns([ + pl.col("features").map(lambda s: MyNeuralNetwork.forward(s.to_numpy())).alias("activations") +]) +``` + === ":fontawesome-brands-rust: Rust" - ``` rust - df.with_columns([ - col("features").map(|s| Ok(my_nn.forward(s))).alias("activations") - ]) - ``` + +```rust +df.with_columns([ + col("features").map(|s| Ok(my_nn.forward(s))).alias("activations") +]) +``` Use cases for `map` in the `groupby` context are slim. They are only used for performance reasons, but can quite easily lead to incorrect results. Let me explain why. @@ -109,8 +111,7 @@ And observe, a valid result! 🎉 In the `select` context, the `apply` expression passes elements of the column to the python function. -*Note that you are -now running python, this will be slow.* +_Note that you are now running Python, this will be slow._ Let's go through some examples to see what to expect. We will continue with the `DataFrame` we defined at the start of this section and show an example with the `apply` function and a counter example where we use the expression API to @@ -121,7 +122,7 @@ achieve the same goals. In this example we create a global `counter` and then add the integer `1` to the global state at every element processed. Every iteration the result of the increment will be added to the element value. -> Note, this example isn't provided in Rust. The reason is that the global `counter` value would lead to data races when this apply is evaluated in parallel. It would be possible to wrap it in a `Mutex` to protect the variable, but that would be obscuring the point of the example. This is a case where the Python Global Interpreter Lock's performance tradeoff provides some safety guarantees. +> Note, this example isn't provided in Rust. 
The reason is that the global `counter` value would lead to data races when this apply is evaluated in parallel. It would be possible to wrap it in a `Mutex` to protect the variable, but that would be obscuring the point of the example. This is a case where the Python Global Interpreter Lock's performance tradeoff provides some safety guarantees. {{code_block('user-guide/expressions/user-defined-functions','counter',['apply'])}} @@ -129,7 +130,6 @@ Every iteration the result of the increment will be added to the element value. --8<-- "python/user-guide/expressions/user-defined-functions.py:counter" ``` - ### Combining multiple column values If we want to have access to values of different columns in a single `apply` function call, we can create `struct` data @@ -144,7 +144,7 @@ type. This data type collects those columns as fields in the `struct`. So if we' ] ``` -In Python, those would be passed as `dict` to the calling python function and can thus be indexed by `field: str`. In rust, you'll get a `Series` with the `Struct` type. The fields of the struct can then be indexed and downcast. +In Python, those would be passed as `dict` to the calling python function and can thus be indexed by `field: str`. In rust, you'll get a `Series` with the `Struct` type. The fields of the struct can then be indexed and downcast. {{code_block('user-guide/expressions/user-defined-functions','combine',['apply','struct'])}} @@ -180,4 +180,4 @@ Rust types map as follows: - `f32` or `f64` -> `Float64` - `bool` -> `Boolean` - `String` or `str` -> `Utf8` -- `Vec` -> `List[tp]` (where the inner type is inferred with the same rules) \ No newline at end of file +- `Vec` -> `List[tp]` (where the inner type is inferred with the same rules) diff --git a/docs/user-guide/expressions/window.md b/docs/user-guide/expressions/window.md index 03d2c3a5f..bd3ecdf12 100644 --- a/docs/user-guide/expressions/window.md +++ b/docs/user-guide/expressions/window.md @@ -1,4 +1,4 @@ -# Window functions +# Window functions Window functions are expressions with superpowers. They allow you to perform aggregations on groups in the `select` context. Let's get a feel for what that means. First we create a dataset. The dataset loaded in the @@ -18,11 +18,10 @@ are projected back to the original rows. Therefore, a window function will almos We will discuss later the cases where a window function can change the numbers of rows in a `DataFrame`. -Note how we call `.over("Type 1")` and `.over(["Type 1", "Type 2"])`. Using window functions we can aggregate over different groups in a single `select` call! Note that, in Rust, the type of the argument to `over()` must be a collection, so even when you're only using one column, you must provided it in an array. +Note how we call `.over("Type 1")` and `.over(["Type 1", "Type 2"])`. Using window functions we can aggregate over different groups in a single `select` call! Note that, in Rust, the type of the argument to `over()` must be a collection, so even when you're only using one column, you must provided it in an array. The best part is, this won't cost you anything. The computed groups are cached and shared between different `window` expressions. - {{code_block('user-guide/expressions/window','groupby',['over'])}} ```python exec="on" result="text" session="user-guide/window" @@ -42,7 +41,6 @@ Let's filter out some rows to make this more clear. --8<-- "python/user-guide/expressions/window.py:operations" ``` - Observe that the group `Water` of column `Type 1` is not contiguous. 
There are two rows of `Grass` in between. Also note that each pokemon within a group are sorted by `Speed` in `ascending` order. Unfortunately, for this example we want them sorted in `descending` speed order. Luckily with window functions this is easy to accomplish. @@ -90,4 +88,4 @@ For more exercise, below are some window functions for us to compute: ```python exec="on" result="text" session="user-guide/window" --8<-- "python/user-guide/expressions/window.py:examples" -``` \ No newline at end of file +``` diff --git a/docs/user-guide/index.md b/docs/user-guide/index.md index b31358426..8fb27a98c 100644 --- a/docs/user-guide/index.md +++ b/docs/user-guide/index.md @@ -14,7 +14,7 @@ For [`Pandas`](https://pandas.pydata.org/) users, our [Python package](https://p The goal of `Polars` is to provide a lightning fast `DataFrame` library that: - Utilizes all available cores on your machine. -- Optimizes queries to reduce unneeded work/memory allocations. +- Optimizes queries to reduce unneeded work/memory allocations. - Handles datasets much larger than your available RAM. - Has an API that is consistent and predictable. - Has a strict schema (data-types should be known before running the query). @@ -28,4 +28,4 @@ As such `Polars` goes to great lengths to: - Traverse memory cache efficiently. - Minimize contention in parallelism. - Process data in chunks. -- Reuse memory allocations. \ No newline at end of file +- Reuse memory allocations. diff --git a/docs/user-guide/installation.md b/docs/user-guide/installation.md index a22e0f6c9..476908c30 100644 --- a/docs/user-guide/installation.md +++ b/docs/user-guide/installation.md @@ -17,6 +17,7 @@ Polars is a library and installation is as simple as invoking the package manage [dependencies] polars = { version = "x", features = ["lazy", ...]} ``` + === ":fontawesome-brands-node-js: NodeJS" ``` shell @@ -27,7 +28,6 @@ Polars is a library and installation is as simple as invoking the package manage To use the library import it into your project - === ":fontawesome-brands-python: Python" ``` python @@ -39,6 +39,7 @@ To use the library import it into your project ``` rust use polars::prelude::*; ``` + === ":fontawesome-brands-node-js: NodeJS" ``` javaScript @@ -49,14 +50,13 @@ To use the library import it into your project const pl = require('nodejs-polars'); ``` - ## Feature Flags By using the above command you install the core of `Polars` onto your system. However depending on your use case you might want to install the optional dependencies as well. These are made optional to minimize the footprint. The flags are different depending on the programming language. Throughout the user guide we will mention when a functionality is used that requires an additional dependency. ### Python -``` text +```text # For example pip install polars[numpy, fsspec] ``` @@ -75,115 +75,116 @@ pip install polars[numpy, fsspec] ### Rust -``` toml +```toml # Cargo.toml [dependencies] -polars = { version = "0.26.1", features = ["lazy","temporal","describe","json","parquet","dtype-datetime"]} +polars = { version = "0.26.1", features = ["lazy", "temporal", "describe", "json", "parquet", "dtype-datetime"] } ``` - The opt-in features are: - - - Additional data types: - - `dtype-date` - - `dtype-datetime` - - `dtype-time` - - `dtype-duration` - - `dtype-i8` - - `dtype-i16` - - `dtype-u8` - - `dtype-u16` - - `dtype-categorical` - - `dtype-struct` - - `performant` - Longer compile times more fast paths. 
- - `lazy` - Lazy API - - `lazy_regex` - Use regexes in [column selection](crate::lazy::dsl::col) - - `dot_diagram` - Create dot diagrams from lazy logical plans. - - `sql` - Pass SQL queries to polars. - - `streaming` - Be able to process datasets that are larger than RAM. - - `random` - Generate arrays with randomly sampled values - - `ndarray`- Convert from `DataFrame` to `ndarray` - - `temporal` - Conversions between [Chrono](https://docs.rs/chrono/) and Polars for temporal data types - - `timezones` - Activate timezone support. - - `strings` - Extra string utilities for `Utf8Chunked` - - `string_justify` - `zfill`, `ljust`, `rjust` - - `string_from_radix` - `parse_int` - - `object` - Support for generic ChunkedArrays called `ObjectChunked` (generic over `T`). - These are downcastable from Series through the [Any](https://doc.rust-lang.org/std/any/index.html) trait. - - Performance related: - - `nightly` - Several nightly only features such as SIMD and specialization. - - `performant` - more fast paths, slower compile times. - - `bigidx` - Activate this feature if you expect >> 2^32 rows. This has not been needed by anyone. - This allows polars to scale up way beyond that by using `u64` as an index. - Polars will be a bit slower with this feature activated as many data structures - are less cache efficient. - - `cse` - Activate common subplan elimination optimization - - IO related: - - - `serde` - Support for [serde](https://crates.io/crates/serde) serialization and deserialization. - Can be used for JSON and more serde supported serialization formats. - - `serde-lazy` - Support for [serde](https://crates.io/crates/serde) serialization and deserialization. - Can be used for JSON and more serde supported serialization formats. - - - `parquet` - Read Apache Parquet format - - `json` - JSON serialization - - `ipc` - Arrow's IPC format serialization - - `decompress` - Automatically infer compression of csvs and decompress them. - Supported compressions: - - zip - - gzip - - - `DataFrame` operations: - - `dynamic_groupby` - Groupby based on a time window instead of predefined keys. - Also activates rolling window group by operations. - - `sort_multiple` - Allow sorting a `DataFrame` on multiple columns - - `rows` - Create `DataFrame` from rows and extract rows from `DataFrames`. - And activates `pivot` and `transpose` operations - - `join_asof` - Join ASOF, to join on nearest keys instead of exact equality match. - - `cross_join` - Create the cartesian product of two DataFrames. - - `semi_anti_join` - SEMI and ANTI joins. - - `groupby_list` - Allow groupby operation on keys of type List. - - `row_hash` - Utility to hash DataFrame rows to UInt64Chunked - - `diagonal_concat` - Concat diagonally thereby combining different schemas. - - `horizontal_concat` - Concat horizontally and extend with null values if lengths don't match - - `dataframe_arithmetic` - Arithmetic on (Dataframe and DataFrames) and (DataFrame on Series) - - `partition_by` - Split into multiple DataFrames partitioned by groups. - - `Series`/`Expression` operations: - - `is_in` - [Check for membership in `Series`](crate::chunked_array::ops::IsIn) - - `zip_with` - [Zip two Series/ ChunkedArrays](crate::chunked_array::ops::ChunkZip) - - `round_series` - round underlying float types of `Series`. - - `repeat_by` - [Repeat element in an Array N times, where N is given by another array. - - `is_first` - Check if element is first unique value. - - `is_last` - Check if element is last unique value. 
- - `checked_arithmetic` - checked arithmetic/ returning `None` on invalid operations. - - `dot_product` - Dot/inner product on Series and Expressions. - - `concat_str` - Concat string data in linear time. - - `reinterpret` - Utility to reinterpret bits to signed/unsigned - - `take_opt_iter` - Take from a Series with `Iterator>` - - `mode` - [Return the most occurring value(s)](crate::chunked_array::ops::ChunkUnique::mode) - - `cum_agg` - cumsum, cummin, cummax aggregation. - - `rolling_window` - rolling window functions, like rolling_mean - - `interpolate` [interpolate None values](crate::chunked_array::ops::Interpolate) - - `extract_jsonpath` - [Run jsonpath queries on Utf8Chunked](https://goessner.net/articles/JsonPath/) - - `list` - List utils. - - `list_take` take sublist by multiple indices - - `rank` - Ranking algorithms. - - `moment` - kurtosis and skew statistics - - `ewma` - Exponential moving average windows - - `abs` - Get absolute values of Series - - `arange` - Range operation on Series - - `product` - Compute the product of a Series. - - `diff` - `diff` operation. - - `pct_change` - Compute change percentages. - - `unique_counts` - Count unique values in expressions. - - `log` - Logarithms for `Series`. - - `list_to_struct` - Convert `List` to `Struct` dtypes. - - `list_count` - Count elements in lists. - - `list_eval` - Apply expressions over list elements. - - `cumulative_eval` - Apply expressions over cumulatively increasing windows. - - `arg_where` - Get indices where condition holds. - - `search_sorted` - Find indices where elements should be inserted to maintain order. - - `date_offset` Add an offset to dates that take months and leap years into account. - - `trigonometry` Trigonometric functions. - - `sign` Compute the element-wise sign of a Series. - - `propagate_nans` NaN propagating min/max aggregations. - - `DataFrame` pretty printing - - `fmt` - Activate DataFrame formatting \ No newline at end of file + +The opt-in features are: + +- Additional data types: + - `dtype-date` + - `dtype-datetime` + - `dtype-time` + - `dtype-duration` + - `dtype-i8` + - `dtype-i16` + - `dtype-u8` + - `dtype-u16` + - `dtype-categorical` + - `dtype-struct` +- `performant` - Longer compile times more fast paths. +- `lazy` - Lazy API + - `lazy_regex` - Use regexes in [column selection](crate::lazy::dsl::col) + - `dot_diagram` - Create dot diagrams from lazy logical plans. +- `sql` - Pass SQL queries to polars. +- `streaming` - Be able to process datasets that are larger than RAM. +- `random` - Generate arrays with randomly sampled values +- `ndarray`- Convert from `DataFrame` to `ndarray` +- `temporal` - Conversions between [Chrono](https://docs.rs/chrono/) and Polars for temporal data types +- `timezones` - Activate timezone support. +- `strings` - Extra string utilities for `Utf8Chunked` + - `string_justify` - `zfill`, `ljust`, `rjust` + - `string_from_radix` - `parse_int` +- `object` - Support for generic ChunkedArrays called `ObjectChunked` (generic over `T`). + These are downcastable from Series through the [Any](https://doc.rust-lang.org/std/any/index.html) trait. +- Performance related: + - `nightly` - Several nightly only features such as SIMD and specialization. + - `performant` - more fast paths, slower compile times. + - `bigidx` - Activate this feature if you expect >> 2^32 rows. This has not been needed by anyone. + This allows polars to scale up way beyond that by using `u64` as an index. 
+ Polars will be a bit slower with this feature activated as many data structures + are less cache efficient. + - `cse` - Activate common subplan elimination optimization +- IO related: + + - `serde` - Support for [serde](https://crates.io/crates/serde) serialization and deserialization. + Can be used for JSON and more serde supported serialization formats. + - `serde-lazy` - Support for [serde](https://crates.io/crates/serde) serialization and deserialization. + Can be used for JSON and more serde supported serialization formats. + + - `parquet` - Read Apache Parquet format + - `json` - JSON serialization + - `ipc` - Arrow's IPC format serialization + - `decompress` - Automatically infer compression of csvs and decompress them. + Supported compressions: + - zip + - gzip + +- `DataFrame` operations: + - `dynamic_groupby` - Groupby based on a time window instead of predefined keys. + Also activates rolling window group by operations. + - `sort_multiple` - Allow sorting a `DataFrame` on multiple columns + - `rows` - Create `DataFrame` from rows and extract rows from `DataFrames`. + And activates `pivot` and `transpose` operations + - `join_asof` - Join ASOF, to join on nearest keys instead of exact equality match. + - `cross_join` - Create the cartesian product of two DataFrames. + - `semi_anti_join` - SEMI and ANTI joins. + - `groupby_list` - Allow groupby operation on keys of type List. + - `row_hash` - Utility to hash DataFrame rows to UInt64Chunked + - `diagonal_concat` - Concat diagonally thereby combining different schemas. + - `horizontal_concat` - Concat horizontally and extend with null values if lengths don't match + - `dataframe_arithmetic` - Arithmetic on (Dataframe and DataFrames) and (DataFrame on Series) + - `partition_by` - Split into multiple DataFrames partitioned by groups. +- `Series`/`Expression` operations: + - `is_in` - [Check for membership in `Series`](crate::chunked_array::ops::IsIn) + - `zip_with` - [Zip two Series/ ChunkedArrays](crate::chunked_array::ops::ChunkZip) + - `round_series` - round underlying float types of `Series`. + - `repeat_by` - [Repeat element in an Array N times, where N is given by another array. + - `is_first` - Check if element is first unique value. + - `is_last` - Check if element is last unique value. + - `checked_arithmetic` - checked arithmetic/ returning `None` on invalid operations. + - `dot_product` - Dot/inner product on Series and Expressions. + - `concat_str` - Concat string data in linear time. + - `reinterpret` - Utility to reinterpret bits to signed/unsigned + - `take_opt_iter` - Take from a Series with `Iterator>` + - `mode` - [Return the most occurring value(s)](crate::chunked_array::ops::ChunkUnique::mode) + - `cum_agg` - cumsum, cummin, cummax aggregation. + - `rolling_window` - rolling window functions, like rolling_mean + - `interpolate` [interpolate None values](crate::chunked_array::ops::Interpolate) + - `extract_jsonpath` - [Run jsonpath queries on Utf8Chunked](https://goessner.net/articles/JsonPath/) + - `list` - List utils. + - `list_take` take sublist by multiple indices + - `rank` - Ranking algorithms. + - `moment` - kurtosis and skew statistics + - `ewma` - Exponential moving average windows + - `abs` - Get absolute values of Series + - `arange` - Range operation on Series + - `product` - Compute the product of a Series. + - `diff` - `diff` operation. + - `pct_change` - Compute change percentages. + - `unique_counts` - Count unique values in expressions. + - `log` - Logarithms for `Series`. 
+ - `list_to_struct` - Convert `List` to `Struct` dtypes. + - `list_count` - Count elements in lists. + - `list_eval` - Apply expressions over list elements. + - `cumulative_eval` - Apply expressions over cumulatively increasing windows. + - `arg_where` - Get indices where condition holds. + - `search_sorted` - Find indices where elements should be inserted to maintain order. + - `date_offset` Add an offset to dates that take months and leap years into account. + - `trigonometry` Trigonometric functions. + - `sign` Compute the element-wise sign of a Series. + - `propagate_nans` NaN propagating min/max aggregations. +- `DataFrame` pretty printing + - `fmt` - Activate DataFrame formatting diff --git a/docs/user-guide/io/aws.md b/docs/user-guide/io/aws.md index 3c1f94ab9..e19efc74b 100644 --- a/docs/user-guide/io/aws.md +++ b/docs/user-guide/io/aws.md @@ -5,16 +5,16 @@ To read from or write to an AWS bucket, additional dependencies are needed in Rust: === ":fontawesome-brands-rust: Rust" - ``` shell-rust - $ cargo add aws_sdk_s3 aws_config tokio --features tokio/full - ``` + +```shell +$ cargo add aws_sdk_s3 aws_config tokio --features tokio/full +``` In the next few snippets we'll demonstrate interacting with a `Parquet` file located on an AWS bucket. ## Read - Load a `.parquet` file using: {{code_block('user-guide/io/aws','bucket',['from_arrow'])}} diff --git a/docs/user-guide/io/bigquery.md b/docs/user-guide/io/bigquery.md index 9a28b0ced..684497f80 100644 --- a/docs/user-guide/io/bigquery.md +++ b/docs/user-guide/io/bigquery.md @@ -3,9 +3,10 @@ To read or write from GBQ, additional dependencies are needed: === ":fontawesome-brands-python: Python" - ``` shell-python - $ pip install google-cloud-bigquery - ``` + +```shell +$ pip install google-cloud-bigquery +``` ## Read @@ -13,7 +14,6 @@ We can load a query into a `DataFrame` like this: {{code_block('user-guide/io/bigquery','read',['from_arrow'])}} - ## Write --8<-- "docs/_build/snippets/under_construction.md" diff --git a/docs/user-guide/io/csv.md b/docs/user-guide/io/csv.md index a1c22f533..91962e2c1 100644 --- a/docs/user-guide/io/csv.md +++ b/docs/user-guide/io/csv.md @@ -4,17 +4,15 @@ Reading a CSV file should look familiar: - {{code_block('user-guide/io/csv','read',['read_csv'])}} - Writing a CSV file is similar with the `write_csv` function: {{code_block('user-guide/io/csv','write',['write_csv'])}} ## Scan -`Polars` allows you to *scan* a CSV input. Scanning delays the actual parsing of the +`Polars` allows you to _scan_ a CSV input. Scanning delays the actual parsing of the file and instead returns a lazy computation holder called a `LazyFrame`. {{code_block('user-guide/io/csv','scan',['scan_csv'])}} diff --git a/docs/user-guide/io/database.md b/docs/user-guide/io/database.md index 3490d0d47..5d43e7367 100644 --- a/docs/user-guide/io/database.md +++ b/docs/user-guide/io/database.md @@ -10,9 +10,9 @@ For example, the following snippet shows the general patterns for reading all co ### Engines -Polars doesn't manage connections and data transfer from databases by itself. Instead external libraries (known as *engines*) handle this. At present Polars can use two engines to read from databases: +Polars doesn't manage connections and data transfer from databases by itself. Instead external libraries (known as _engines_) handle this. 
At present Polars can use two engines to read from databases: -- [ConnectorX](https://github.com/sfu-db/connector-x) and +- [ConnectorX](https://github.com/sfu-db/connector-x) and - [ADBC](https://arrow.apache.org/docs/format/ADBC.html) #### ConnectorX @@ -22,7 +22,7 @@ ConnectorX is the default engine and [supports numerous databases](https://githu To read from one of the supported databases with `ConnectorX` you need to activate the additional dependancy `ConnectorX` when installing Polars or install it manually with ```shell -$ pip install connectorx +$ pip install connectorx ``` #### ADBC @@ -32,7 +32,7 @@ ADBC (Arrow Database Connectivity) is an engine supported by the Apache Arrow pr It is still early days for ADBC so support for different databases is still limited. At present drivers for ADBC are only available for [Postgres and SQLite](https://arrow.apache.org/adbc/0.1.0/driver/cpp/index.html). To install ADBC you need to install the driver for your database. For example to install the driver for SQLite you run ```shell -$ pip install adbc-driver-sqlite +$ pip install adbc-driver-sqlite ``` As ADBC is not the default engine you must specify the engine as an argument to `pl.read_database` @@ -41,25 +41,30 @@ As ADBC is not the default engine you must specify the engine as an argument to ## Write to a database -We can write to a database with Polars using the `pl.write_database` function. +We can write to a database with Polars using the `pl.write_database` function. ### Engines -As with reading from a database above Polars uses an *engine* to write to a database. The currently supported engines are: + +As with reading from a database above Polars uses an _engine_ to write to a database. The currently supported engines are: - [SQLAlchemy](https://www.sqlalchemy.org/) and - Arrow Database Connectivity (ADBC) #### SQLAlchemy + With the default engine SQLAlchemy you can write to any database supported by SQLAlchemy. To use this engine you need to install SQLAlchemy and Pandas + ```shell -$ pip install SQLAlchemy pandas +$ pip install SQLAlchemy pandas ``` + In this example, we write the `DataFrame` to a table called `records` in the database {{code_block('user-guide/io/database','write',['write_database'])}} -In the SQLAlchemy approach Polars converts the `DataFrame` to a Pandas `DataFrame` backed by PyArrow and then uses SQLAlchemy methods on a Pandas `DataFrame` to write to the database. +In the SQLAlchemy approach Polars converts the `DataFrame` to a Pandas `DataFrame` backed by PyArrow and then uses SQLAlchemy methods on a Pandas `DataFrame` to write to the database. #### ADBC + As with reading from a database you can also use ADBC to write to a SQLite or Posgres database. As shown above you need to install the appropriate ADBC driver for your database. {{code_block('user-guide/io/database','write_adbc',['write_database'])}} diff --git a/docs/user-guide/io/json_file.md b/docs/user-guide/io/json_file.md index 36d4c877b..01697e825 100644 --- a/docs/user-guide/io/json_file.md +++ b/docs/user-guide/io/json_file.md @@ -8,7 +8,6 @@ Reading a JSON file should look familiar: {{code_block('user-guide/io/json-file','read',['read_json'])}} - ### Newline Delimited JSON JSON objects that are delimited by newlines can be read into polars in a much more performant way than standard json. 
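As a quick illustration of the idea, here is a minimal sketch of reading newline-delimited JSON both eagerly and lazily; the file name and its contents are made up for this example:

```python
import polars as pl

# Each line of an NDJSON file is a standalone JSON object.
ndjson = '{"id": 1, "name": "alpha"}\n{"id": 2, "name": "beta"}\n'
with open("example.ndjson", "w") as f:
    f.write(ndjson)

# Eagerly parse the file into a DataFrame.
df = pl.read_ndjson("example.ndjson")
print(df)

# Or scan it lazily: parsing is deferred until `collect` is called.
lf = pl.scan_ndjson("example.ndjson")
print(lf.collect())
```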
@@ -19,10 +18,9 @@ JSON objects that are delimited by newlines can be read into polars in a much mo {{code_block('user-guide/io/json-file','write',['write_json','write_ndjson'])}} - ## Scan `Polars` allows you to _scan_ a JSON input **only for newline delimited json**. Scanning delays the actual parsing of the file and instead returns a lazy computation holder called a `LazyFrame`. -{{code_block('user-guide/io/json-file','scan',['scan_ndjson'])}} \ No newline at end of file +{{code_block('user-guide/io/json-file','scan',['scan_ndjson'])}} diff --git a/docs/user-guide/io/multiple.md b/docs/user-guide/io/multiple.md index 72a79a9d0..0de026104 100644 --- a/docs/user-guide/io/multiple.md +++ b/docs/user-guide/io/multiple.md @@ -10,10 +10,8 @@ Let's create some files to give us some context: To read multiple files into a single `DataFrame`, we can use globbing patterns: - {{code_block('user-guide/io/multiple','read',['read_csv'])}} - ```python exec="on" result="text" session="user-guide/io/multiple" --8<-- "python/user-guide/io/multiple.py:create" --8<-- "python/user-guide/io/multiple.py:read" @@ -39,4 +37,4 @@ All query plan execution is embarrassingly parallel and doesn't require any comm ```python exec="on" result="text" session="user-guide/io/multiple" --8<-- "python/user-guide/io/multiple.py:glob" -``` \ No newline at end of file +``` diff --git a/docs/user-guide/io/parquet.md b/docs/user-guide/io/parquet.md index 983648a8e..71a5399bb 100644 --- a/docs/user-guide/io/parquet.md +++ b/docs/user-guide/io/parquet.md @@ -8,10 +8,8 @@ copying as we read `Parquet` directly into `Arrow` memory and _keep it there_. ## Read - {{code_block('user-guide/io/parquet','read',['read_parquet'])}} - ## Write {{code_block('user-guide/io/parquet','write',['write_parquet'])}} @@ -23,5 +21,4 @@ file and instead returns a lazy computation holder called a `LazyFrame`. {{code_block('user-guide/io/parquet','scan',['scan_parquet'])}} - -If you want to know why this is desirable, you can read more about those `Polars` optimizations [here](../concepts/lazy-vs-eager.md). \ No newline at end of file +If you want to know why this is desirable, you can read more about those `Polars` optimizations [here](../concepts/lazy-vs-eager.md). diff --git a/docs/user-guide/lazy/execution.md b/docs/user-guide/lazy/execution.md index 68b047585..522ceab2e 100644 --- a/docs/user-guide/lazy/execution.md +++ b/docs/user-guide/lazy/execution.md @@ -4,7 +4,6 @@ Our example query on the Reddit dataset is: {{code_block('user-guide/lazy/execution','df',['scan_csv'])}} - If we were to run the code above on the Reddit CSV the query would not be evaluated. Instead Polars takes each line of code, adds it to the internal query graph and optimizes the query graph. When we execute the code Polars executes the optimized query graph by default. @@ -44,7 +43,7 @@ With the default `collect` method Polars processes all of your data as one batch ### Execution on larger-than-memory data -If your data requires more memory than you have available Polars may be able to process the data in batches using *streaming* mode. To use streaming mode you simply pass the `streaming=True` argument to `collect` +If your data requires more memory than you have available Polars may be able to process the data in batches using _streaming_ mode. 
To use streaming mode you simply pass the `streaming=True` argument to `collect` {{code_block('user-guide/lazy/execution','stream',['scan_csv','collect'])}} @@ -77,4 +76,4 @@ shape: (27, 6) │ 77766 ┆ GENERICBOB ┆ 1137474000 ┆ 1536528276 ┆ 291 ┆ 14 │ │ 77768 ┆ TINHEADNED ┆ 1139665457 ┆ 1536497404 ┆ 4434 ┆ 103 │ └───────┴───────────────────────────┴─────────────┴────────────┴───────────────┴────────────┘ -``` \ No newline at end of file +``` diff --git a/docs/user-guide/lazy/optimizations.md b/docs/user-guide/lazy/optimizations.md index d785ee4d6..a48b68ddf 100644 --- a/docs/user-guide/lazy/optimizations.md +++ b/docs/user-guide/lazy/optimizations.md @@ -6,12 +6,12 @@ others are determined just in time as the materialized data comes in. Here is a non-complete overview of optimizations done by polars, what they do and how often they run. | Optimization | Explanation | runs | -|----------------------------|--------------------------------------------------------------------------------------------------------------|-------------------------------| +| -------------------------- | ------------------------------------------------------------------------------------------------------------ | ----------------------------- | | Predicate pushdown | Applies filters as early as possible/ at scan level. | 1 time | | Projection pushdown | Select only the columns that are needed at the scan level. | 1 time | -| Slice pushdown | Only load the required slice from the scan level. Don't materialize sliced outputs (e.g. join.head(10)). | 1 time | +| Slice pushdown | Only load the required slice from the scan level. Don't materialize sliced outputs (e.g. join.head(10)). | 1 time | | Common subplan elimination | Cache subtrees/file scans that are used by multiple subtrees in the query plan. | 1 time | | Simplify expressions | Various optimizations, such as constant folding and replacing expensive operations with faster alternatives. | until fixed point | | Join ordering | Estimates the branches of joins that should be executed first in order to reduce memory pressure. | 1 time | | Type coercion | Coerce types such that operations succeed and run on minimal required memory. | until fixed point | -| Cardinality estimation | Estimates cardinality in order to determine optimal groupby strategy. | 0/n times; dependent on query | \ No newline at end of file +| Cardinality estimation | Estimates cardinality in order to determine optimal groupby strategy. | 0/n times; dependent on query | diff --git a/docs/user-guide/lazy/query_plan.md b/docs/user-guide/lazy/query_plan.md index 00438aba6..a147a92c3 100644 --- a/docs/user-guide/lazy/query_plan.md +++ b/docs/user-guide/lazy/query_plan.md @@ -93,4 +93,4 @@ The optimized plan is to: - apply the filter on the `comment_karma` column while the CSV is being read line-by-line - transform the `name` column to uppercase -In this case the query optimizer has identified that the `filter` can be applied while the CSV is read from disk rather than reading the whole file into memory and then applying the filter. This optimization is called *Predicate Pushdown*. +In this case the query optimizer has identified that the `filter` can be applied while the CSV is read from disk rather than reading the whole file into memory and then applying the filter. This optimization is called _Predicate Pushdown_. 
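As a rough sketch of how this can be observed in code (the CSV path and column names below are placeholders mirroring the Reddit example, and a recent Polars version with `LazyFrame.explain` is assumed):

```python
import polars as pl

# Build a lazy query: transform one column and filter on another.
lazy_query = (
    pl.scan_csv("data/reddit.csv")
    .with_columns(pl.col("name").str.to_uppercase())
    .filter(pl.col("comment_karma") > 0)
)

# Print the optimized plan. The filter appears as a selection inside the
# CSV scan node rather than as a separate step, showing that the predicate
# has been pushed down to the scan level.
print(lazy_query.explain())
```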
diff --git a/docs/user-guide/lazy/schemas.md b/docs/user-guide/lazy/schemas.md index e5f959b2d..77d2be54b 100644 --- a/docs/user-guide/lazy/schemas.md +++ b/docs/user-guide/lazy/schemas.md @@ -4,7 +4,6 @@ The schema of a Polars `DataFrame` or `LazyFrame` sets out the names of the colu {{code_block('user-guide/lazy/schema','schema',['DataFrame','lazy'])}} - ```python exec="on" result="text" session="user-guide/lazy/schemas" --8<-- "python/user-guide/lazy/schema.py:setup" --8<-- "python/user-guide/lazy/schema.py:schema" @@ -22,7 +21,7 @@ We see how this works in the following simple example where we call the `.round` The `.round` expression is only valid for columns with a floating point dtype. Calling `.round` on an integer column means the operation will raise an `InvalidOperationError` when we evaluate the query with `collect`. This schema check happens before the data is processed when we call `collect`. -```python exec="on" result="text" session="user-guide/lazy/schemas"``` +`python exec="on" result="text" session="user-guide/lazy/schemas"` If we executed this query in eager mode the error would only be found once the data had been processed in all earlier steps. @@ -58,4 +57,4 @@ We show how to deal with a non-lazy operation in this example where we: ```python exec="on" result="text" session="user-guide/lazy/schemas" --8<-- "python/user-guide/lazy/schema.py:lazyeager" -``` \ No newline at end of file +``` diff --git a/docs/user-guide/migration/pandas.md b/docs/user-guide/migration/pandas.md index 89cd7c663..e0c6f7984 100644 --- a/docs/user-guide/migration/pandas.md +++ b/docs/user-guide/migration/pandas.md @@ -23,7 +23,6 @@ more explicit, more readable and less error-prone. Note that an 'index' data structure as known in databases will be used by polars as an optimization technique. - ### `Polars` uses Apache Arrow arrays to represent data in memory while `Pandas` uses `Numpy` arrays `Polars` represents data in memory with Arrow arrays while `Pandas` represents data in @@ -110,8 +109,8 @@ The CSV file has numerous columns but we just want to do a groupby on one of the columns (`id1`) and then sum by a value column (`v1`). In `Pandas` this would be: ```python - df = pd.read_csv(csv_file, usecols=['id1','v1']) - grouped_df = df.loc[:,['id1','v1']].groupby('id1').sum('v1') +df = pd.read_csv(csv_file, usecols=['id1','v1']) +grouped_df = df.loc[:,['id1','v1']].groupby('id1').sum('v1') ``` In `Polars` you can build this query in lazy mode with query optimization and evaluate @@ -119,8 +118,8 @@ it by replacing the eager `Pandas` function `read_csv` with the implicitly lazy function `scan_csv`: ```python - df = pl.scan_csv(csv_file) - grouped_df = df.groupby('id1').agg(pl.col('v1').sum()).collect() +df = pl.scan_csv(csv_file) +grouped_df = df.groupby('id1').agg(pl.col('v1').sum()).collect() ``` `Polars` optimizes this query by identifying that only the `id1` and `v1` columns are @@ -316,7 +315,6 @@ shape: (7, 5) ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ │ 2 ┆ n ┆ 4 ┆ 5 ┆ 1 │ └─────┴──────┴──────┴─────┴──────────────┘ - ``` ## Missing data @@ -327,4 +325,4 @@ For float columns `Polars` permits the use of `NaN` values. These `NaN` values a In `Pandas` an integer column with missing values is cast to be a float column with `NaN` values for the missing values (unless using optional nullable integer dtypes). In `Polars` any missing values in an integer column are simply `null` values and the column remains an integer column. 
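A minimal sketch of that behaviour on the Polars side (the column name is arbitrary):

```python
import polars as pl

# An integer column with a missing entry keeps its integer dtype;
# the missing value is stored as a null, not as NaN.
s = pl.Series("a", [1, 2, None])
print(s.dtype)         # Int64
print(s.null_count())  # 1
```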
-See the [missing data](../expressions/null.md) section for more details. \ No newline at end of file +See the [missing data](../expressions/null.md) section for more details. diff --git a/docs/user-guide/migration/spark.md b/docs/user-guide/migration/spark.md index 156693f43..ea1a41abb 100644 --- a/docs/user-guide/migration/spark.md +++ b/docs/user-guide/migration/spark.md @@ -155,4 +155,4 @@ Output: | a| 5| | b| 4| +---+---+ -``` \ No newline at end of file +``` diff --git a/docs/user-guide/misc/alternatives.md b/docs/user-guide/misc/alternatives.md index cc9c41e5f..a5544e7db 100644 --- a/docs/user-guide/misc/alternatives.md +++ b/docs/user-guide/misc/alternatives.md @@ -1,66 +1,66 @@ -# Alternatives +# Alternatives These are some tools that share similar functionality to what polars does. - Pandas - A very versatile tool for small data. Read [10 things I hate about pandas](https://wesmckinney.com/blog/apache-arrow-pandas-internals/) - written by the author himself. Polars has solved all those 10 things. - Polars is a versatile tool for small and large data with a more predictable, less ambiguous, and stricter API. + A very versatile tool for small data. Read [10 things I hate about pandas](https://wesmckinney.com/blog/apache-arrow-pandas-internals/) + written by the author himself. Polars has solved all those 10 things. + Polars is a versatile tool for small and large data with a more predictable, less ambiguous, and stricter API. - Pandas the API - The API of pandas was designed for in memory data. This makes it a poor fit for performant analysis on large data - (read anything that does not fit into RAM). Any tool that tries to distribute that API will likely have a - suboptimal query plan compared to plans that follow from a declarative API like SQL or Polars' API. + The API of pandas was designed for in memory data. This makes it a poor fit for performant analysis on large data + (read anything that does not fit into RAM). Any tool that tries to distribute that API will likely have a + suboptimal query plan compared to plans that follow from a declarative API like SQL or Polars' API. - Dask - Parallelizes existing single-threaded libraries like `NumPy` and `Pandas`. As a consumer of those libraries Dask - therefore has less control over low level performance and semantics. - Those libraries are treated like a black box. - On a single machine the parallelization effort can also be seriously stalled by pandas strings. - Pandas strings, by default, are stored as python objects in - numpy arrays meaning that any operation on them is GIL bound and therefore single threaded. This can be circumvented - by multi-processing but has a non-trivial cost. + Parallelizes existing single-threaded libraries like `NumPy` and `Pandas`. As a consumer of those libraries Dask + therefore has less control over low level performance and semantics. + Those libraries are treated like a black box. + On a single machine the parallelization effort can also be seriously stalled by pandas strings. + Pandas strings, by default, are stored as python objects in + numpy arrays meaning that any operation on them is GIL bound and therefore single threaded. This can be circumvented + by multi-processing but has a non-trivial cost. - Modin - Similar to Dask + Similar to Dask - Vaex - Vaexs method of out-of-core analysis is memory mapping files. This works until it doesn't. For instance parquet - or csv files first need to be read and converted to a file format that can be memory mapped. 
Another downside is - that the OS determines when pages will be swapped. Operations that need a full data shuffle, such as - sorts, have terrible performance on memory mapped data. - Polars' out of core processing is not based on memory mapping, but on streaming data in batches (and spilling to disk - if needed), we control which data must be hold in memory, not the OS, meaning that we don't have unexpected IO stalls. + Vaexs method of out-of-core analysis is memory mapping files. This works until it doesn't. For instance parquet + or csv files first need to be read and converted to a file format that can be memory mapped. Another downside is + that the OS determines when pages will be swapped. Operations that need a full data shuffle, such as + sorts, have terrible performance on memory mapped data. + Polars' out of core processing is not based on memory mapping, but on streaming data in batches (and spilling to disk + if needed), we control which data must be hold in memory, not the OS, meaning that we don't have unexpected IO stalls. - DuckDB - Polars and DuckDB have many similarities. DuckDB is focused on providing an in-process OLAP Sqlite alternative, - Polars is focused on providing a scalable `DataFrame` interface to many languages. Those different front-ends lead to - different optimization strategies and different algorithm prioritization. The interoperability between both is zero-copy. - See more: https://duckdb.org/docs/guides/python/polars + Polars and DuckDB have many similarities. DuckDB is focused on providing an in-process OLAP Sqlite alternative, + Polars is focused on providing a scalable `DataFrame` interface to many languages. Those different front-ends lead to + different optimization strategies and different algorithm prioritization. The interoperability between both is zero-copy. + See more: https://duckdb.org/docs/guides/python/polars - Spark - Spark is designed for distributed workloads and uses the JVM. The setup for spark is complicated and the startup-time - is slow. On a single machine Polars has much better performance characteristics. If you need to process TB's of data - Spark is a better choice. + Spark is designed for distributed workloads and uses the JVM. The setup for spark is complicated and the startup-time + is slow. On a single machine Polars has much better performance characteristics. If you need to process TB's of data + Spark is a better choice. - CuDF - GPU's and CuDF are fast! - However, GPU's are not readily available and expensive in production. The amount of memory available on a GPU - is often a fraction of the available RAM. - This (and out-of-core) processing means that Polars can handle much larger data-sets. - Next to that Polars can be close in [performance to CuDF](https://zakopilo.hatenablog.jp/entry/2023/02/04/220552). - CuDF doesn't optimize your query, so is not uncommon that on ETL jobs Polars will be faster because it can elide - unneeded work and materializations. + GPU's and CuDF are fast! + However, GPU's are not readily available and expensive in production. The amount of memory available on a GPU + is often a fraction of the available RAM. + This (and out-of-core) processing means that Polars can handle much larger data-sets. + Next to that Polars can be close in [performance to CuDF](https://zakopilo.hatenablog.jp/entry/2023/02/04/220552). + CuDF doesn't optimize your query, so is not uncommon that on ETL jobs Polars will be faster because it can elide + unneeded work and materializations. 
- Any - Polars is written in Rust. This gives it strong safety, performance and concurrency guarantees. - Polars is written in a modular manner. Parts of polars can be used in other query programs and can be added as a library. \ No newline at end of file + Polars is written in Rust. This gives it strong safety, performance and concurrency guarantees. + Polars is written in a modular manner. Parts of Polars can be used in other query programs and can be added as a library. diff --git a/docs/user-guide/misc/contributing.md b/docs/user-guide/misc/contributing.md index 2e3774367..b6df23be7 100644 --- a/docs/user-guide/misc/contributing.md +++ b/docs/user-guide/misc/contributing.md @@ -8,4 +8,4 @@ Here is an example [commit](https://github.com/pola-rs/polars/pull/3567/commits/ If you spot any gaps in this User Guide you can submit fixes to the [`pola-rs/polars-book`](https://github.com/pola-rs/polars-book) repo. -Happy hunting! \ No newline at end of file +Happy hunting! diff --git a/docs/user-guide/sql/cte.md b/docs/user-guide/sql/cte.md index 8df9fd143..1129f6d19 100644 --- a/docs/user-guide/sql/cte.md +++ b/docs/user-guide/sql/cte.md @@ -15,7 +15,7 @@ In this syntax, `cte_name` is the name of the CTE, and `subquery` is the subquer CTEs are particularly useful when working with complex queries that involve multiple levels of subqueries, as they allow you to break down the query into smaller, more manageable pieces that are easier to understand and debug. Additionally, CTEs can help improve query performance by allowing the database to optimize and cache the results of subqueries, reducing the number of times they need to be executed. -Polars supports Common Table Expressions (CTEs) using the WITH clause in SQL syntax. Below is an example +Polars supports Common Table Expressions (CTEs) using the WITH clause in SQL syntax. Below is an example {{code_block('user-guide/sql/cte','cte',['SQLregister','SQLexecute'])}} @@ -24,7 +24,4 @@ Polars supports Common Table Expressions (CTEs) using the WITH clause in SQL syn --8<-- "python/user-guide/sql/cte.py:cte" ``` - - - -In this example, we use the `execute()` method of the `SQLContext` to execute a SQL query that includes a CTE. The CTE selects all rows from the `my_table` LazyFrame where the `age` column is greater than 30 and gives it the alias `older_people`. We then execute a second SQL query that selects all rows from the `older_people` CTE where the `name` column starts with the letter 'C'. \ No newline at end of file +In this example, we use the `execute()` method of the `SQLContext` to execute a SQL query that includes a CTE. The CTE selects all rows from the `my_table` LazyFrame where the `age` column is greater than 30 and gives it the alias `older_people`. We then execute a second SQL query that selects all rows from the `older_people` CTE where the `name` column starts with the letter 'C'. diff --git a/docs/user-guide/sql/intro.md b/docs/user-guide/sql/intro.md index 48d056087..815231e3d 100644 --- a/docs/user-guide/sql/intro.md +++ b/docs/user-guide/sql/intro.md @@ -2,8 +2,8 @@ While Polars does support writing queries in SQL, it's recommended that users familiarize themselves with the [expression syntax](../concepts/expressions.md) for more readable and expressive code. As a primarily DataFrame library, new features will typically be added to the expression API first. However, if you already have an existing SQL codebase or prefer to use SQL, Polars also offers support for SQL queries. - !!! 
note Execution + In Polars, there is no separate SQL engine because Polars translates SQL queries into [expressions](../concepts/expressions.md), which are then executed using its built-in execution engine. This approach ensures that Polars maintains its performance and scalability advantages as a native DataFrame library while still providing users with the ability to work with SQL queries. ## Context @@ -39,6 +39,7 @@ We can also register Pandas DataFrames by converting them to Polars first. ``` !!! note Pandas + Converting a Pandas DataFrame backed by Numpy to Polars triggers a conversion to the Arrow format. This conversion has a computation cost. Converting a Pandas DataFrame backed by Arrow on the other hand will be free or almost free. Once the `SQLContext` is initialized, we can register additional Dataframes or unregister existing Dataframes with: @@ -72,7 +73,7 @@ In the example below, we register : - a NDJSON file loaded lazily - a Pandas DataFrame -And we join them together with SQL. +And we join them together with SQL. Lazy reading allows to only load the necessary rows and columns from the files. In the same way, it's possible to register cloud datalakes (S3, Azure Data Lake). A PyArrow dataset can point to the datalake, then Polars can read it with `scan_pyarrow_dataset`. @@ -85,9 +86,9 @@ In the same way, it's possible to register cloud datalakes (S3, Azure Data Lake) --8<-- "python/user-guide/sql/intro.py:clean_multiple_sources" ``` -[^1]: Additionally it also tracks the [common table expressions](./cte.md) as well. +[^1]: Additionally it also tracks the [common table expressions](./cte.md) as well. -## Compatibility +## Compatibility Polars does not support the full SQL language, in Polars you are allowed to: @@ -102,4 +103,4 @@ The following is not yet supported: - Table aliasing (e.g. `SELECT p.Name from pokemon AS p`) - Meta queries such as `ANALYZE`, `EXPLAIN` -In the upcoming sections we will cover each of the statements in more details. \ No newline at end of file +In the upcoming sections we will cover each of the statements in more details. diff --git a/docs/user-guide/sql/select.md b/docs/user-guide/sql/select.md index 240bcd4b8..dc94f6363 100644 --- a/docs/user-guide/sql/select.md +++ b/docs/user-guide/sql/select.md @@ -9,7 +9,6 @@ FROM table_name; Here, `column1`, `column2`, etc. are the columns that you want to select from the table. You can also use the wildcard `*` to select all columns. `table_name` is the name of the table or that you want to retrieve data from. In the sections below we will cover some of the more common SELECT variants - {{code_block('user-guide/sql/sql_select','df',['SQLregister','SQLexecute'])}} ```python exec="on" result="text" session="user-guide/sql/select" @@ -21,17 +20,15 @@ Here, `column1`, `column2`, etc. are the columns that you want to select from th The `GROUP BY` statement is used to group rows in a table by one or more columns and compute aggregate functions on each group. - {{code_block('user-guide/sql/sql_select','groupby',['SQLexecute'])}} ```python exec="on" result="text" session="user-guide/sql/select" --8<-- "python/user-guide/sql/sql_select.py:groupby" ``` - ### ORDER BY -The `ORDER BY` statement is used to sort the result set of a query by one or more columns in ascending or descending order. +The `ORDER BY` statement is used to sort the result set of a query by one or more columns in ascending or descending order. 
{{code_block('user-guide/sql/sql_select','orderby',['SQLexecute'])}} @@ -64,7 +61,6 @@ For a full list of supported functions go the [API documentation](https://docs.r --8<-- "python/user-guide/sql/sql_select.py:functions" ``` - ### Table Functions In the examples earlier we first generated a DataFrame which we registered in the `SQLContext`. Polars also support directly reading from CSV, Parquet, JSON and IPC in your SQL query using table functions `read_xxx`. @@ -74,4 +70,3 @@ In the examples earlier we first generated a DataFrame which we registered in th ```python exec="on" result="text" session="user-guide/sql/select" --8<-- "python/user-guide/sql/sql_select.py:tablefunctions" ``` - diff --git a/docs/user-guide/sql/show.md b/docs/user-guide/sql/show.md index 670d35b07..70453ebcb 100644 --- a/docs/user-guide/sql/show.md +++ b/docs/user-guide/sql/show.md @@ -17,7 +17,6 @@ Here's an example of how to use the `SHOW TABLES` statement in Polars: --8<-- "python/user-guide/sql/show.py:show" ``` - In this example, we create two DataFrames and register them with the `SQLContext` using different names. We then execute a `SHOW TABLES` statement using the `execute()` method of the `SQLContext` object, which returns a DataFrame containing a list of all the registered tables and their names. The resulting DataFrame is then printed using the `print()` function. -Note that the `SHOW TABLES` statement only lists tables that have been registered with the current `SQLContext`. If you register a DataFrame with a different `SQLContext` or in a different Python session, it will not appear in the list of tables returned by `SHOW TABLES`. \ No newline at end of file +Note that the `SHOW TABLES` statement only lists tables that have been registered with the current `SQLContext`. If you register a DataFrame with a different `SQLContext` or in a different Python session, it will not appear in the list of tables returned by `SHOW TABLES`. diff --git a/docs/user-guide/transformations/concatenation.md b/docs/user-guide/transformations/concatenation.md index 6e36e6804..8deff923a 100644 --- a/docs/user-guide/transformations/concatenation.md +++ b/docs/user-guide/transformations/concatenation.md @@ -48,4 +48,4 @@ When the dataframe shapes do not match and we have an overlapping semantic key t ## Rechunking Before a concatenation we have two dataframes `df1` and `df2`. Each column in `df1` and `df2` is in one or more chunks in memory. By default, during concatenation the chunks in each column are copied to a single new chunk - this is known as **rechunking**. Rechunking is an expensive operation, but is often worth it because future operations will be faster. -If you do not want Polars to rechunk the concatenated `DataFrame` you specify `rechunk = False` when doing the concatenation. \ No newline at end of file +If you do not want Polars to rechunk the concatenated `DataFrame` you specify `rechunk = False` when doing the concatenation. diff --git a/docs/user-guide/transformations/joins.md b/docs/user-guide/transformations/joins.md index 89ed545f1..39841910f 100644 --- a/docs/user-guide/transformations/joins.md +++ b/docs/user-guide/transformations/joins.md @@ -4,22 +4,20 @@ `Polars` supports the following join strategies by specifying the `strategy` argument: -Strategy | Description | -----------|-------------| -`inner` | Returns row with matching keys in *both* frames. Non-matching rows in either the left or right frame are discarded. 
-`left` | Returns all rows in the left dataframe, whether or not a match in the right-frame is found. Non-matching rows have their right columns null-filled. -`outer` | Returns all rows from both the left and right dataframe. If no match is found in one frame, columns from the other frame are null-filled. -`cross` | Returns the Cartesian product of all rows from the left frame with all rows from the right frame. Duplicates rows are retained; the table length of `A` cross-joined with `B` is always `len(A) × len(B)`. -`asof` | A left-join in which the match is performed on the _nearest_ key rather than on equal keys. -`semi` | Returns all rows from the left frame in which the join key is also present in the right frame. -`anti` | Returns all rows from the left frame in which the join key is _not_ present in the right frame. - +| Strategy | Description | +| -------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `inner` | Returns row with matching keys in _both_ frames. Non-matching rows in either the left or right frame are discarded. | +| `left` | Returns all rows in the left dataframe, whether or not a match in the right-frame is found. Non-matching rows have their right columns null-filled. | +| `outer` | Returns all rows from both the left and right dataframe. If no match is found in one frame, columns from the other frame are null-filled. | +| `cross` | Returns the Cartesian product of all rows from the left frame with all rows from the right frame. Duplicates rows are retained; the table length of `A` cross-joined with `B` is always `len(A) × len(B)`. | +| `asof` | A left-join in which the match is performed on the _nearest_ key rather than on equal keys. | +| `semi` | Returns all rows from the left frame in which the join key is also present in the right frame. | +| `anti` | Returns all rows from the left frame in which the join key is _not_ present in the right frame. | ### Inner join An `inner` join produces a `DataFrame` that contains only the rows where the join key exists in both `DataFrames`. Let's take for example the following two `DataFrames`: - {{code_block('user-guide/transformations/joins','innerdf',['DataFrame'])}} ```python exec="on" result="text" session="user-guide/transformations/joins" @@ -69,7 +67,6 @@ The `outer` join produces a `DataFrame` that contains all the rows from both `Da A `cross` join is a cartesian product of the two `DataFrames`. This means that every row in the left `DataFrame` is joined with every row in the right `DataFrame`. The `cross` join is useful for creating a `DataFrame` with all possible combinations of the columns in two `DataFrames`. Let's take for example the following two `DataFrames`. - {{code_block('user-guide/transformations/joins','df3',['DataFrame'])}} ```python exec="on" result="text" session="user-guide/transformations/joins" @@ -78,7 +75,6 @@ A `cross` join is a cartesian product of the two `DataFrames`. This means that e

- {{code_block('user-guide/transformations/joins','df4',['DataFrame'])}} ```python exec="on" result="text" session="user-guide/transformations/joins" @@ -87,7 +83,6 @@ A `cross` join is a cartesian product of the two `DataFrames`. This means that e We can now create a `DataFrame` containing all possible combinations of the colors and sizes with a `cross` join: - {{code_block('user-guide/transformations/joins','cross',['join'])}} ```python exec="on" result="text" session="user-guide/transformations/joins" @@ -102,7 +97,6 @@ The `inner`, `left`, `outer` and `cross` join strategies are standard amongst da The `semi` join retuns all rows from the left frame in which the join key is also present in the right frame. Consider the following scenario: a car rental company has a `DataFrame` showing the cars that it owns with each car having a unique `id`. - {{code_block('user-guide/transformations/joins','df5',['DataFrame'])}} ```python exec="on" result="text" session="user-guide/transformations/joins" @@ -111,7 +105,6 @@ The `semi` join retuns all rows from the left frame in which the join key is als The company has another `DataFrame` showing each repair job carried out on a vehicle. - {{code_block('user-guide/transformations/joins','df6',['DataFrame'])}} ```python exec="on" result="text" session="user-guide/transformations/joins" @@ -122,7 +115,6 @@ You want to answer this question: which of the cars have had repairs carried out An inner join does not answer this question directly as it produces a `DataFrame` with multiple rows for each car that has had multiple repair jobs: - {{code_block('user-guide/transformations/joins','inner2',['join'])}} ```python exec="on" result="text" session="user-guide/transformations/joins" @@ -131,10 +123,8 @@ An inner join does not answer this question directly as it produces a `DataFrame However, a semi join produces a single row for each car that has had a repair job carried out. - {{code_block('user-guide/transformations/joins','semi',['join'])}} - ```python exec="on" result="text" session="user-guide/transformations/joins" --8<-- "python/user-guide/transformations/joins.py:semi" ``` @@ -143,7 +133,6 @@ However, a semi join produces a single row for each car that has had a repair jo Continuing this example, an alternative question might be: which of the cars have **not** had a repair job carried out? An anti join produces a `DataFrame` showing all the cars from `df_cars` where the `id` is not present in the `df_repairs` `DataFrame`. - {{code_block('user-guide/transformations/joins','anti',['join'])}} ```python exec="on" result="text" session="user-guide/transformations/joins" @@ -157,14 +146,12 @@ In `Polars` we can do an asof join with the `join` method and specifying `strate Consider the following scenario: a stock market broker has a `DataFrame` called `df_trades` showing transactions it has made for different stocks. - {{code_block('user-guide/transformations/joins','df7',['DataFrame'])}} ```python exec="on" result="text" session="user-guide/transformations/joins" --8<-- "python/user-guide/transformations/joins.py:df7" ``` - The broker has another `DataFrame` called `df_quotes` showing prices it has quoted for these stocks. 
{{code_block('user-guide/transformations/joins','df8',['DataFrame'])}} @@ -173,26 +160,24 @@ The broker has another `DataFrame` called `df_quotes` showing prices it has quot --8<-- "python/user-guide/transformations/joins.py:df8" ``` - -You want to produce a `DataFrame` showing for each trade the most recent quote provided *before* the trade. You do this with `join_asof` (using the default `strategy = "backward"`). +You want to produce a `DataFrame` showing for each trade the most recent quote provided _before_ the trade. You do this with `join_asof` (using the default `strategy = "backward"`). To avoid joining between trades on one stock with a quote on another you must specify an exact preliminary join on the stock column with `by="stock"`. {{code_block('user-guide/transformations/joins','asof',['join_asof'])}} - ```python exec="on" result="text" session="user-guide/transformations/joins" --8<-- "python/user-guide/transformations/joins.py:asofpre" --8<-- "python/user-guide/transformations/joins.py:asof" ``` - If you want to make sure that only quotes within a certain time range are joined to the trades you can specify the `tolerance` argument. In this case we want to make sure that the last preceding quote is within 1 minute of the trade so we set `tolerance = "1m"`. === ":fontawesome-brands-python: Python" - ``` python - --8<-- "python/user-guide/transformations/joins.py:asof2" - ``` + +```python +--8<-- "python/user-guide/transformations/joins.py:asof2" +``` ```python exec="on" result="text" session="user-guide/transformations/joins" --8<-- "python/user-guide/transformations/joins.py:asof2" -``` \ No newline at end of file +``` diff --git a/docs/user-guide/transformations/melt.md b/docs/user-guide/transformations/melt.md index 36cbbe741..b004c1dc9 100644 --- a/docs/user-guide/transformations/melt.md +++ b/docs/user-guide/transformations/melt.md @@ -18,4 +18,4 @@ Melt operations unpivot a DataFrame from wide format to long format ```python exec="on" result="text" session="user-guide/transformations/melt" --8<-- "python/user-guide/transformations/melt.py:melt" -``` \ No newline at end of file +``` diff --git a/docs/user-guide/transformations/pivot.md b/docs/user-guide/transformations/pivot.md index 35bb5ba0b..9850dbed0 100644 --- a/docs/user-guide/transformations/pivot.md +++ b/docs/user-guide/transformations/pivot.md @@ -17,7 +17,6 @@ aggregation. {{code_block('user-guide/transformations/pivot','df',['DataFrame'])}} - ```python exec="on" result="text" session="user-guide/transformations/pivot" --8<-- "python/user-guide/transformations/pivot.py:setup" --8<-- "python/user-guide/transformations/pivot.py:df" @@ -27,7 +26,6 @@ aggregation. {{code_block('user-guide/transformations/pivot','eager',['pivot'])}} - ```python exec="on" result="text" session="user-guide/transformations/pivot" --8<-- "python/user-guide/transformations/pivot.py:eager" ``` diff --git a/docs/user-guide/transformations/time-series/filter.md b/docs/user-guide/transformations/time-series/filter.md index 7ed1445a2..326969c34 100644 --- a/docs/user-guide/transformations/time-series/filter.md +++ b/docs/user-guide/transformations/time-series/filter.md @@ -45,4 +45,4 @@ does not. 
So for filtering, you should use attributes in the `.dt` namespace: ```python exec="on" result="text" session="user-guide/transformations/ts/filter" --8<-- "python/user-guide/transformations/time-series/filter.py:negative" -``` \ No newline at end of file +``` diff --git a/docs/user-guide/transformations/time-series/parsing.md b/docs/user-guide/transformations/time-series/parsing.md index 920202474..a31095d07 100644 --- a/docs/user-guide/transformations/time-series/parsing.md +++ b/docs/user-guide/transformations/time-series/parsing.md @@ -51,9 +51,8 @@ You can extract data features such as the year or day from a date column using t If you have mixed offsets (say, due to crossing daylight saving time), then you can use `utc=True` and then convert to your time zone: - {{code_block('user-guide/transformations/time-series/parsing','mixed',['strptime','convert_time_zone'])}} ```python exec="on" result="text" session="user-guide/transformations/ts/parsing" --8<-- "python/user-guide/transformations/time-series/parsing.py:mixed" -``` \ No newline at end of file +``` diff --git a/docs/user-guide/transformations/time-series/resampling.md b/docs/user-guide/transformations/time-series/resampling.md index 80d1b80ca..cf9e724ec 100644 --- a/docs/user-guide/transformations/time-series/resampling.md +++ b/docs/user-guide/transformations/time-series/resampling.md @@ -29,7 +29,6 @@ In this example we upsample from the original 30 minutes to 15 minutes and then {{code_block('user-guide/transformations/time-series/resampling','upsample',['upsample'])}} - ```python exec="on" result="text" session="user-guide/transformations/ts/resampling" --8<-- "python/user-guide/transformations/time-series/resampling.py:upsample" ``` @@ -40,4 +39,4 @@ In this example we instead fill the nulls by linear interpolation: ```python exec="on" result="text" session="user-guide/transformations/ts/resampling" --8<-- "python/user-guide/transformations/time-series/resampling.py:upsample2" -``` \ No newline at end of file +``` diff --git a/docs/user-guide/transformations/time-series/rolling.md b/docs/user-guide/transformations/time-series/rolling.md index dfeb5d4e4..afe3ad501 100644 --- a/docs/user-guide/transformations/time-series/rolling.md +++ b/docs/user-guide/transformations/time-series/rolling.md @@ -15,7 +15,8 @@ In following simple example we calculate the annual average closing price of App --8<-- "python/user-guide/transformations/time-series/rolling.py:df" ``` -!!! info +!!! info + The dates are sorted in ascending order - if they are not sorted in this way the `groupby_dynamic` output will not be correct! To get the annual average closing price we tell `groupby_dynamic` that we want to: @@ -53,8 +54,6 @@ or leave boundaries between them. Let's see how the windows for some parameter combinations would look. Let's start out boring. 🥱 -> - - every: 1 day -> `"1d"` - period: 1 day -> `"1d"` @@ -65,8 +64,6 @@ this creates adjacent windows of the same size |--| ``` -> - - every: 1 day -> `"1d"` - period: 2 days -> `"2d"` @@ -77,8 +74,6 @@ these windows have an overlap of 1 day |----| ``` -> - - every: 2 days -> `"2d"` - period: 1 day -> `"1d"` @@ -114,7 +109,6 @@ Below we show an example where we use **groupby_dynamic** to compute: --8<-- "python/user-guide/transformations/time-series/rolling.py:groupbydyn" ``` - ## Grouping by rolling windows The rolling groupby, `groupby_rolling`, is another entrance to the `groupby` context. 
But different from the `groupby_dynamic` the windows are @@ -125,7 +119,6 @@ So imagine having a time column with the values `{2021-01-06, 2021-01-10}` and a windows: ```text - 2021-01-01 2021-01-06 |----------| @@ -143,7 +136,7 @@ Rolling and dynamic groupby's can be combined with normal groupby operations. Below is an example with a dynamic groupby. {{code_block('user-guide/transformations/time-series/rolling','groupbyroll',['DataFrame'])}} - + ```python exec="on" result="text" session="user-guide/transformations/ts/rolling" --8<-- "python/user-guide/transformations/time-series/rolling.py:groupbyroll" ``` @@ -152,4 +145,4 @@ Below is an example with a dynamic groupby. ```python exec="on" result="text" session="user-guide/transformations/ts/rolling" --8<-- "python/user-guide/transformations/time-series/rolling.py:groupbydyn2" -``` \ No newline at end of file +``` diff --git a/docs/user-guide/transformations/time-series/timezones.md b/docs/user-guide/transformations/time-series/timezones.md index 8ba4cee39..48f6870e8 100644 --- a/docs/user-guide/transformations/time-series/timezones.md +++ b/docs/user-guide/transformations/time-series/timezones.md @@ -2,10 +2,12 @@ hide: - toc --- + # Time zones !!! quote "Tom Scott" - You really should never, ever deal with time zones if you can help it + + You really should never, ever deal with time zones if you can help it. The `Datetime` datatype can have a time zone associated with it. Examples of valid time zones are: @@ -41,4 +43,4 @@ Let's look at some examples of common operations: ```python exec="on" result="text" session="user-guide/transformations/ts/timezones" --8<-- "python/user-guide/transformations/time-series/timezones.py:example2" -``` \ No newline at end of file +```
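As a rough companion to the time-zone hunks above, here is a minimal, hypothetical sketch (not one of the `python/user-guide/...` snippets the docs reference) of the `replace_time_zone` / `convert_time_zone` pattern the page describes; the column name `ts` and the chosen zones are illustrative only:

```python
import polars as pl

# Parse naive datetimes from strings.
df = pl.DataFrame({"ts": ["2021-03-27 03:00", "2021-03-28 03:00"]}).with_columns(
    pl.col("ts").str.strptime(pl.Datetime, "%Y-%m-%d %H:%M")
)

df = df.with_columns(
    # Attach a time zone: the wall-clock value stays the same, the instant changes meaning.
    pl.col("ts").dt.replace_time_zone("UTC").alias("ts_utc")
).with_columns(
    # Convert to another zone: the instant stays the same, the wall-clock value changes.
    pl.col("ts_utc").dt.convert_time_zone("Europe/Amsterdam").alias("ts_ams")
)

print(df)
```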