From c3990675d148f404fdd0331c036c9bf213ac73a8 Mon Sep 17 00:00:00 2001 From: Andy Stark Date: Fri, 6 Jun 2025 14:26:28 +0100 Subject: [PATCH 1/9] DOC-5282 started restructuring job file docs --- .../data-pipelines/data-pipelines.md | 129 +---------------- .../data-pipelines/deploy.md | 2 +- .../transform-examples/_index.md | 134 +++++++++++++++++- 3 files changed, 137 insertions(+), 128 deletions(-) diff --git a/content/integrate/redis-data-integration/data-pipelines/data-pipelines.md b/content/integrate/redis-data-integration/data-pipelines/data-pipelines.md index bad74e5dc9..448f78ebdc 100644 --- a/content/integrate/redis-data-integration/data-pipelines/data-pipelines.md +++ b/content/integrate/redis-data-integration/data-pipelines/data-pipelines.md @@ -281,129 +281,12 @@ sudo service k3s restart ## Job files -You can optionally supply one or more job files that specify how you want to -transform the captured data before writing it to the target. -Each job file contains a YAML -configuration that controls the transformation for a particular table from the source -database. You can also add a `default-job.yaml` file to provide -a default transformation for tables that don't have a specific job file of their own. - -The job files have a structure like the following example. This configures a default -job that: - -- Writes the data to a Redis hash -- Adds a field `app_code` to the hash with a value of `foo` -- Adds a prefix of `aws` and a suffix of `gcp` to the key - -```yaml -source: - table: "*" - row_format: full -transform: - - uses: add_field - with: - fields: - - field: after.app_code - expression: "`foo`" - language: jmespath -output: - - uses: redis.write - with: - data_type: hash - key: - expression: concat(['aws', '#', table, '#', keys(key)[0], '#', values(key)[0], '#gcp']) - language: jmespath -``` - -The main sections of these files are: - -- `source`: This is a mandatory section that specifies the data items that you want to - use. You can add the following properties here: - - `server_name`: Logical server name (optional). - - `db`: Database name (optional) - - `schema`: Database schema (optional) - - `table`: Database table name. This refers to a table name you supplied in `config.yaml`. The default - job doesn't apply to a specific table, so use "*" in place of the table name for this job only. - - `row_format`: Format of the data to be transformed. This can take the values `data_only` (default) to - use only the payload data, or `full` to use the complete change record. See the `transform` section below - for details of the extra data you can access when you use the `full` option. - - `case_insensitive`: This applies to the `server_name`, `db`, `schema`, and `table` properties - and is set to `true` by default. Set it to `false` if you need to use case-sensitive values for these - properties. - -- `transform`: This is an optional section describing the transformation that the pipeline - applies to the data before writing it to the target. The `uses` property specifies a - *transformation block* that will use the parameters supplied in the `with` section. See the - [data transformation reference]({{< relref "/integrate/redis-data-integration/reference/data-transformation" >}}) - for more details about the supported transformation blocks, and also the - [JMESPath custom functions]({{< relref "/integrate/redis-data-integration/reference/jmespath-custom-functions" >}}) reference. 
You can test your transformation logic using the [dry run]({{< relref "/integrate/redis-data-integration/reference/api-reference/#tag/secure/operation/job_dry_run_api_v1_pipelines_jobs_dry_run_post" >}}) feature in the API. - - {{< note >}}If you set `row_format` to `full` under the `source` settings, you can access extra data from the - change record in the transformation: - - Use the `key` object to access the attributes of the key. For example, `key.id` will give you the value of the `id` column as long as it is part of the primary key. - - Use `before.` to get the value of a field *before* it was updated in the source database - - Use `after.` to get the value of a field *after* it was updated in the source database - - Use `after.` when adding new fields during transformations - - See [Row Format]({{< relref "/integrate/redis-data-integration/data-pipelines/transform-examples/redis-row-format#full" >}}) for a more detailed explanation of the full format. - {{< /note >}} - -- `output`: This is a mandatory section to specify the data structure(s) that - RDI will write to - the target along with the text pattern for the key(s) that will access it. - Note that you can map one record to more than one key in Redis or nest - a record as a field of a JSON structure (see - [Data denormalization]({{< relref "/integrate/redis-data-integration/data-pipelines/data-denormalization" >}}) - for more information about nesting). You can add the following properties in the `output` section: - - `uses`: This must have the value `redis.write` to specify writing to a Redis data - structure. You can add more than one block of this type in the same job. - - `with`: - - `connection`: Connection name as defined in `config.yaml` (by default, the connection named `target` is used). - - `data_type`: Target data structure when writing data to Redis. The supported types are `hash`, `json`, `set`, - `sorted_set`, `stream` and `string`. - - `key`: This lets you override the default key for the data structure with custom logic: - - `expression`: Expression to generate the key. - - `language`: Expression language, which must be `jmespath` or `sql`. - - `expire`: Positive integer value indicating a number of seconds for the key to expire. - If you don't specify this property, the key will never expire. - -{{< note >}}In a job file, the `transform` section is optional, but if you don't specify -a `transform`, you must specify custom key logic in `output.with.key`. You can include -both of these sections if you want both a custom transform and a custom key.{{< /note >}} - -Another example below shows how you can rename the `fname` field to `first_name` in the table `emp` -using the -[`rename_field`]({{< relref "/integrate/redis-data-integration/reference/data-transformation/rename_field" >}}) block. It also demonstrates how you can set the key of this record instead of relying on -the default logic. (See the -[Transformation examples]({{< relref "/integrate/redis-data-integration/data-pipelines/transform-examples" >}}) -section for more examples of job files.) 
- -```yaml -source: - server_name: redislabs - schema: dbo - table: emp -transform: - - uses: rename_field - with: - from_field: fname - to_field: first_name -output: - - uses: redis.write - with: - connection: target - key: - expression: concat(['emp:fname:',fname,':lname:',lname]) - language: jmespath -``` - -See the -[RDI configuration file]({{< relref "/integrate/redis-data-integration/reference/config-yaml-reference" >}}) -reference for full details about the -available source, transform, and target configuration options and see -also the -[data transformation reference]({{< relref "/integrate/redis-data-integration/reference/data-transformation" >}}) -for details of all the available transformation blocks. +You can use one or more job files to configure which fields from the source tables +you want to use, and which data structure you want to write to the target. You +can also optionally specify a transformation to apply to the data before writing it +to the target. See the +[Job files]({{< relref "/integrate/redis-data-integration/data-pipelines/transform-examples" >}}) +section for full details of the file format and examples of common tasks for job files. ## Source preparation diff --git a/content/integrate/redis-data-integration/data-pipelines/deploy.md b/content/integrate/redis-data-integration/data-pipelines/deploy.md index e3c8e01074..255d01dd71 100644 --- a/content/integrate/redis-data-integration/data-pipelines/deploy.md +++ b/content/integrate/redis-data-integration/data-pipelines/deploy.md @@ -13,7 +13,7 @@ linkTitle: Deploy summary: Redis Data Integration keeps Redis in sync with the primary database in near real time. type: integration -weight: 2 +weight: 10 --- The sections below explain how to deploy a pipeline after you have created the required diff --git a/content/integrate/redis-data-integration/data-pipelines/transform-examples/_index.md b/content/integrate/redis-data-integration/data-pipelines/transform-examples/_index.md index 6e8dfab1cc..31630de59f 100644 --- a/content/integrate/redis-data-integration/data-pipelines/transform-examples/_index.md +++ b/content/integrate/redis-data-integration/data-pipelines/transform-examples/_index.md @@ -1,5 +1,5 @@ --- -Title: Transformation examples +Title: Job files aliases: /integrate/redis-data-integration/ingest/data-pipelines/transform-examples/ alwaysopen: false categories: @@ -7,12 +7,138 @@ categories: - integrate - rs - rdi -description: Explore some examples of common RDI transformations +description: Learn how to configure job files for data transformation. group: di hideListLinks: false -linkTitle: Transformation examples +linkTitle: Job files summary: Redis Data Integration keeps Redis in sync with the primary database in near real time. type: integration -weight: 30 +weight: 5 --- + +You can optionally supply one or more job files that specify how you want to +transform the captured data before writing it to the target. +Each job file contains a YAML +configuration that controls the transformation for a particular table from the source +database. You can also add a `default-job.yaml` file to provide +a default transformation for tables that don't have a specific job file of their own. + +The job files have a structure like the following example. 
This configures a default +job that: + +- Writes the data to a Redis hash +- Adds a field `app_code` to the hash with a value of `foo` +- Adds a prefix of `aws` and a suffix of `gcp` to the key + +```yaml +source: + table: "*" + row_format: full +transform: + - uses: add_field + with: + fields: + - field: after.app_code + expression: "`foo`" + language: jmespath +output: + - uses: redis.write + with: + data_type: hash + key: + expression: concat(['aws', '#', table, '#', keys(key)[0], '#', values(key)[0], '#gcp']) + language: jmespath +``` + +The main sections of these files are: + +- `source`: This is a mandatory section that specifies the data items that you want to + use. You can add the following properties here: + - `server_name`: Logical server name (optional). + - `db`: Database name (optional) + - `schema`: Database schema (optional) + - `table`: Database table name. This refers to a table name you supplied in `config.yaml`. The default + job doesn't apply to a specific table, so use "*" in place of the table name for this job only. + - `row_format`: Format of the data to be transformed. This can take the values `data_only` (default) to + use only the payload data, or `full` to use the complete change record. See the `transform` section below + for details of the extra data you can access when you use the `full` option. + - `case_insensitive`: This applies to the `server_name`, `db`, `schema`, and `table` properties + and is set to `true` by default. Set it to `false` if you need to use case-sensitive values for these + properties. + +- `transform`: This is an optional section describing the transformation that the pipeline + applies to the data before writing it to the target. The `uses` property specifies a + *transformation block* that will use the parameters supplied in the `with` section. See the + [data transformation reference]({{< relref "/integrate/redis-data-integration/reference/data-transformation" >}}) + for more details about the supported transformation blocks, and also the + [JMESPath custom functions]({{< relref "/integrate/redis-data-integration/reference/jmespath-custom-functions" >}}) reference. You can test your transformation logic using the [dry run]({{< relref "/integrate/redis-data-integration/reference/api-reference/#tag/secure/operation/job_dry_run_api_v1_pipelines_jobs_dry_run_post" >}}) feature in the API. + + {{< note >}}If you set `row_format` to `full` under the `source` settings, you can access extra data from the + change record in the transformation: + - Use the `key` object to access the attributes of the key. For example, `key.id` will give you the value of the `id` column as long as it is part of the primary key. + - Use `before.` to get the value of a field *before* it was updated in the source database + - Use `after.` to get the value of a field *after* it was updated in the source database + - Use `after.` when adding new fields during transformations + + See [Row Format]({{< relref "/integrate/redis-data-integration/data-pipelines/transform-examples/redis-row-format#full" >}}) for a more detailed explanation of the full format. + {{< /note >}} + +- `output`: This is a mandatory section to specify the data structure(s) that + RDI will write to + the target along with the text pattern for the key(s) that will access it. 
+ Note that you can map one record to more than one key in Redis or nest + a record as a field of a JSON structure (see + [Data denormalization]({{< relref "/integrate/redis-data-integration/data-pipelines/data-denormalization" >}}) + for more information about nesting). You can add the following properties in the `output` section: + - `uses`: This must have the value `redis.write` to specify writing to a Redis data + structure. You can add more than one block of this type in the same job. + - `with`: + - `connection`: Connection name as defined in `config.yaml` (by default, the connection named `target` is used). + - `data_type`: Target data structure when writing data to Redis. The supported types are `hash`, `json`, `set`, + `sorted_set`, `stream` and `string`. + - `key`: This lets you override the default key for the data structure with custom logic: + - `expression`: Expression to generate the key. + - `language`: Expression language, which must be `jmespath` or `sql`. + - `expire`: Positive integer value indicating a number of seconds for the key to expire. + If you don't specify this property, the key will never expire. + +{{< note >}}In a job file, the `transform` section is optional, but if you don't specify +a `transform`, you must specify custom key logic in `output.with.key`. You can include +both of these sections if you want both a custom transform and a custom key.{{< /note >}} + +Another example below shows how you can rename the `fname` field to `first_name` in the table `emp` +using the +[`rename_field`]({{< relref "/integrate/redis-data-integration/reference/data-transformation/rename_field" >}}) block. It also demonstrates how you can set the key of this record instead of relying on +the default logic. + +```yaml +source: + server_name: redislabs + schema: dbo + table: emp +transform: + - uses: rename_field + with: + from_field: fname + to_field: first_name +output: + - uses: redis.write + with: + connection: target + key: + expression: concat(['emp:fname:',fname,':lname:',lname]) + language: jmespath +``` + +See the +[RDI configuration file]({{< relref "/integrate/redis-data-integration/reference/config-yaml-reference" >}}) +reference for full details about the +available source, transform, and target configuration options and see +also the +[data transformation reference]({{< relref "/integrate/redis-data-integration/reference/data-transformation" >}}) +for details of all the available transformation blocks. + +## Examples + +The pages listed below show examples of typical job files for different use cases. 
From 75ac236d47d05d548c00a3eb67377b8d67e86e86 Mon Sep 17 00:00:00 2001 From: Andy Stark Date: Tue, 10 Jun 2025 11:15:01 +0100 Subject: [PATCH 2/9] DOC-5282 restructured pipeline/job config info --- .../data-pipelines/_index.md | 128 ++++++++++++++- .../data-pipelines/deploy.md | 8 +- .../{data-pipelines.md => pipeline-config.md} | 147 ++++-------------- .../data-pipelines/prepare-dbs/_index.md | 2 +- 4 files changed, 159 insertions(+), 126 deletions(-) rename content/integrate/redis-data-integration/data-pipelines/{data-pipelines.md => pipeline-config.md} (60%) diff --git a/content/integrate/redis-data-integration/data-pipelines/_index.md b/content/integrate/redis-data-integration/data-pipelines/_index.md index c9cc0609af..479b8b338c 100644 --- a/content/integrate/redis-data-integration/data-pipelines/_index.md +++ b/content/integrate/redis-data-integration/data-pipelines/_index.md @@ -1,13 +1,15 @@ --- Title: Data pipelines -aliases: /integrate/redis-data-integration/ingest/data-pipelines/ +aliases: +- /integrate/redis-data-integration/ingest/data-pipelines/ +- /integrate/redis-data-integration/data-pipelines/data-pipelines/ alwaysopen: false categories: - docs - integrate - rs - rdi -description: Learn how an RDI pipeline can transform source data before writing +description: Learn how to configure RDI for data capture and transformation. group: di hideListLinks: false linkTitle: Data pipelines @@ -16,3 +18,125 @@ summary: Redis Data Integration keeps Redis in sync with the primary database in type: integration weight: 30 --- + +RDI implements +[change data capture](https://en.wikipedia.org/wiki/Change_data_capture) (CDC) +with *pipelines*. (See the +[architecture overview]({{< relref "/integrate/redis-data-integration/architecture#overview" >}}) +for an introduction to pipelines.) + +## How a pipeline works + +An RDI pipeline captures change data records from the source database, and transforms them +into Redis data structures. It writes each of these new structures to a Redis target +database under its own key. + +By default, RDI transforms the source data into +[hashes]({{< relref "/develop/data-types/hashes" >}}) or +[JSON objects]({{< relref "/develop/data-types/json" >}}) for the target with a +standard data mapping and a standard format for the key. +However, you can also provide your own custom transformation [jobs](#job-files) +for each source table, using your own data mapping and key pattern. You specify these +jobs declaratively with YAML configuration files that require no coding. + +The data tranformation involves two separate stages: + +1. The data ingested during CDC is automatically transformed to an intermediate JSON + change event format. +1. This JSON change event data gets passed on to your custom transformation for further + processing. + +The diagram below shows the flow of data through the pipeline: + +{{< image filename="/images/rdi/ingest/RDIPipeDataflow.webp" >}} + +You can provide a job file for each source table for which you want to specify a custom +transformation. You can also add a *default job file* for any tables that don't have their own. +You must specify the full name of the source table in the job file (or the special +name "*" in the default job) and you +can also include filtering logic to skip data that matches a particular condition. 
+As part of the transformation, you can specify any of the following data types +to store the data in Redis: + +- [JSON objects]({{< relref "/develop/data-types/json" >}}) +- [Hashes]({{< relref "/develop/data-types/hashes" >}}) +- [Sets]({{< relref "/develop/data-types/sets" >}}) +- [Streams]({{< relref "/develop/data-types/streams" >}}) +- [Sorted sets]({{< relref "/develop/data-types/sorted-sets" >}}) +- [Strings]({{< relref "/develop/data-types/strings" >}}) + +### Pipeline lifecycle + +After you deploy a pipeline, it goes through the following phases: + +1. *Deploy* - when you deploy the pipeline, RDI first validates it before use. +Then, the [operator]({{< relref "/integrate/redis-data-integration/architecture#how-rdi-is-deployed">}}) creates and configures the collector and stream processor that will run the pipeline. +1. *Snapshot* - The collector starts the pipeline by creating a snapshot of the full +dataset. This involves reading all the relevant source data, transforming it and then +writing it into the Redis target. You should expect this phase to take minutes or +hours to complete if you have a lot of data. +1. *CDC* - Once the snapshot is complete, the collector starts listening for updates to +the source data. Whenever a change is committed to the source, the collector captures +it and adds it to the target through the pipeline. This phase continues indefinitely +unless you change the pipeline configuration. +1. *Update* - If you update the pipeline configuration, the operator applies it +to the collector and the stream processor. Note that the changes only affect newly-captured +data unless you reset the pipeline completely. Once RDI has accepted the updates, the +pipeline returns to the CDC phase with the new configuration. +1. *Reset* - There are circumstances where you might want to rebuild the dataset +completely. For example, you might want to apply a new transformation to all the source +data or refresh the dataset if RDI is disconnected from the +source for a long time. In situations like these, you can *reset* the pipeline back +to the snapshot phase. When this is complete, the pipeline continues with CDC as usual. + +## Using a pipeline + +Follow the steps described in the sections below to prepare and run an RDI pipeline. + +### 1. Prepare the source database + +Before using the pipeline you must first prepare your source database to use +the Debezium connector for *change data capture (CDC)*. See the +[architecture overview]({{< relref "/integrate/redis-data-integration/architecture#overview" >}}) +for more information about CDC. +Each database type has a different set of preparation steps. You can +find the preparation guides for the databases that RDI supports in the +[Prepare source databases]({{< relref "/integrate/redis-data-integration/data-pipelines/prepare-dbs" >}}) +section. + +### 2. Configure the pipeline + +RDI uses a set of [YAML](https://en.wikipedia.org/wiki/YAML) +files to configure each pipeline. The following diagram shows the folder +structure of the configuration: + +{{< image filename="images/rdi/ingest/ingest-config-folders.webp" width="600px" >}} + +The main configuration for the pipeline is in the `config.yaml` file. +This specifies the connection details for the source database (such +as host, username, and password) and also the queries that RDI will use +to extract the required data. You should place job configurations in the `Jobs` +folder if you want to specify your own data transformations. 
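For orientation, the sketch below shows the general shape of a `config.yaml` file with one
source and one target. Treat it as an illustrative outline rather than a working
configuration: the host names, ports, database name, and secret names are placeholders,
and the `${...}` references are secrets that you set separately during deployment.

```yaml
# Minimal outline only - property names and values are placeholders.
sources:
  mysql:                    # Logical name you choose for the source.
    type: cdc
    connection:
      type: mysql
      host: <source-db-host>            # Placeholder - use your own host.
      port: 3306
      database: chinook
      user: ${SOURCE_DB_USERNAME}       # Secret set at deployment time.
      password: ${SOURCE_DB_PASSWORD}   # Secret set at deployment time.

targets:
  target:                   # Logical name you choose for the target Redis database.
    connection:
      type: redis
      host: <target-redis-host>         # Placeholder - use your own host.
      port: 12000
      password: ${TARGET_DB_PASSWORD}   # Secret set at deployment time.
```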
+ +See +[Pipeline configuration file]({{< relref "/integrate/redis-data-integration/data-pipelines/pipeline-config" >}}) +for a full description of the `config.yaml` file and some example configurations. + +### 3. Create job files (optional) + +You can use one or more job files to configure which fields from the source tables +you want to use, and which data structure you want to write to the target. You +can also optionally specify a transformation to apply to the data before writing it +to the target. See the +[Job files]({{< relref "/integrate/redis-data-integration/data-pipelines/transform-examples" >}}) +section for full details of the file format and examples of common tasks for job files. + +### 4. Deploy the pipeline + +When your configuration is ready, you must deploy it to start using the pipeline. See +[Deploy a pipeline]({{< relref "/integrate/redis-data-integration/data-pipelines/deploy" >}}) +to learn how to do this. + +## More information + +See the other pages in this section for more information and examples: diff --git a/content/integrate/redis-data-integration/data-pipelines/deploy.md b/content/integrate/redis-data-integration/data-pipelines/deploy.md index 255d01dd71..81692155a4 100644 --- a/content/integrate/redis-data-integration/data-pipelines/deploy.md +++ b/content/integrate/redis-data-integration/data-pipelines/deploy.md @@ -17,7 +17,7 @@ weight: 10 --- The sections below explain how to deploy a pipeline after you have created the required -[configuration]({{< relref "/integrate/redis-data-integration/data-pipelines/data-pipelines" >}}). +[configuration]({{< relref "/integrate/redis-data-integration/data-pipelines" >}}). ## Set secrets @@ -26,7 +26,9 @@ source and target databases. Each secret has a name that you can pass to the [`redis-di set-secret`]({{< relref "/integrate/redis-data-integration/reference/cli/redis-di-set-secret" >}}) command (VM deployment) or the `rdi-secret.sh` script (K8s deployment) to set the secret value. You can then refer to these secrets in the `config.yaml` file using the syntax "`${SECRET_NAME}`" -(the sample [config.yaml file]({{< relref "/integrate/redis-data-integration/data-pipelines/data-pipelines#the-configyaml-file" >}}) shows these secrets in use). +(the sample +[config.yaml file]({{< relref "/integrate/redis-data-integration/data-pipelines/pipeline-config#example" >}}) +shows these secrets in use). The table below lists all valid secret names. Note that the username and password are required for the source and target, but the other @@ -252,7 +254,7 @@ Note that the certificate paths contained in the secrets `SOURCE_DB_CACERT`, `SO ## Deploy a pipeline -When you have created your configuration, including the [jobs]({{< relref "/integrate/redis-data-integration/data-pipelines/data-pipelines#job-files" >}}), you are +When you have created your configuration, including the [jobs]({{< relref "/integrate/redis-data-integration/data-pipelines/transform-examples" >}}), you are ready to deploy. Use [Redis Insight]({{< relref "/develop/tools/insight/rdi-connector" >}}) to configure and deploy pipelines for both VM and K8s installations. 
diff --git a/content/integrate/redis-data-integration/data-pipelines/data-pipelines.md b/content/integrate/redis-data-integration/data-pipelines/pipeline-config.md similarity index 60% rename from content/integrate/redis-data-integration/data-pipelines/data-pipelines.md rename to content/integrate/redis-data-integration/data-pipelines/pipeline-config.md index 448f78ebdc..0245af9fe8 100644 --- a/content/integrate/redis-data-integration/data-pipelines/data-pipelines.md +++ b/content/integrate/redis-data-integration/data-pipelines/pipeline-config.md @@ -1,74 +1,29 @@ --- -Title: Configure data pipelines -linkTitle: Configure -description: Learn how to configure ingest pipelines for data transformation -weight: 1 +Title: Pipeline configuration file alwaysopen: false -categories: ["redis-di"] -aliases: /integrate/redis-data-integration/ingest/data-pipelines/data-pipelines/ +categories: +- docs +- integrate +- rs +- rdi +description: Learn how to specify the main configuration details for an RDI pipeline. +group: di +linkTitle: Pipeline configuration file +summary: Redis Data Integration keeps Redis in sync with the primary database in near + real time. +type: integration +weight: 3 --- -RDI implements -[change data capture](https://en.wikipedia.org/wiki/Change_data_capture) (CDC) -with *pipelines*. (See the -[architecture overview]({{< relref "/integrate/redis-data-integration/architecture#overview" >}}) -for an introduction to pipelines.) +The main configuration details for an RDI pipeline are in the `config.yaml` file. +This file specifies the connection details for the source and target databases, +and also the set of tables you want to capture. You can also add one or more +[job files]({{< relref "/integrate/redis-data-integration/data-pipelines/transform-examples" >}}) +if you want to apply custom transformations to the captured data. -## Overview +## Example -An RDI pipeline captures change data records from the source database, and transforms them -into Redis data structures. It writes each of these new structures to a Redis target -database under its own key. - -By default, RDI transforms the source data into -[hashes]({{< relref "/develop/data-types/hashes" >}}) or -[JSON objects]({{< relref "/develop/data-types/json" >}}) for the target with a -standard data mapping and a standard format for the key. -However, you can also provide your own custom transformation [jobs](#job-files) -for each source table, using your own data mapping and key pattern. You specify these -jobs declaratively with YAML configuration files that require no coding. - -The data tranformation involves two separate stages. First, the data ingested -during CDC is automatically transformed to a JSON format. Then, -this JSON data gets passed on to your custom transformation for further processing. - -You can provide a job file for each source table you want to transform, but you -can also add a *default job* for any tables that don't have their own. -You must specify the full name of the source table in the job file (or the special -name "*" in the default job) and you -can also include filtering logic to skip data that matches a particular condition. 
-As part of the transformation, you can specify whether you want to store the -data in Redis as -[JSON objects]({{< relref "/develop/data-types/json" >}}), -[hashes]({{< relref "/develop/data-types/hashes" >}}), -[sets]({{< relref "/develop/data-types/sets" >}}), -[streams]({{< relref "/develop/data-types/streams" >}}), -[sorted sets]({{< relref "/develop/data-types/sorted-sets" >}}), or -[strings]({{< relref "/develop/data-types/strings" >}}). - -The diagram below shows the flow of data through the pipeline: - -{{< image filename="/images/rdi/ingest/RDIPipeDataflow.webp" >}} - -## Pipeline configuration - -RDI uses a set of [YAML](https://en.wikipedia.org/wiki/YAML) -files to configure each pipeline. The following diagram shows the folder -structure of the configuration: - -{{< image filename="images/rdi/ingest/ingest-config-folders.webp" width="600px" >}} - -The main configuration for the pipeline is in the `config.yaml` file. -This specifies the connection details for the source database (such -as host, username, and password) and also the queries that RDI will use -to extract the required data. You should place job configurations in the `Jobs` -folder if you want to specify your own data transformations. - -The sections below describe the two types of configuration file in more detail. - -## The `config.yaml` file - -Here is an example of a `config.yaml` file. Note that the values of the +Below is an example of a `config.yaml` file. Note that the values of the form "`${name}`" refer to secrets that you should set as described in [Set secrets]({{< relref "/integrate/redis-data-integration/data-pipelines/deploy#set-secrets" >}}). In particular, you should normally use secrets as shown to set the source @@ -201,18 +156,20 @@ processors: # error_handling: dlq ``` +## Sections + The main sections of the file configure [`sources`](#sources) and [`targets`](#targets). ### Sources The `sources` section has a subsection for the source that you need to configure. The source section starts with a unique name -to identify the source (in the example we have a source +to identify the source (in the example, there is a source called `mysql` but you can choose any name you like). The example configuration contains the following data: - `type`: The type of collector to use for the pipeline. - Currently, the only types we support are `cdc` and `external`. + Currently, the only types RDI supports are `cdc` and `external`. If the source type is set to `external`, no collector resources will be created by the operator, and all other source sections should be empty or not specified at all. - `connection`: The connection details for the source database: `type`, `host`, `port`, @@ -255,8 +212,8 @@ configuration contains the following data: Use this section to provide the connection details for the target Redis database(s). As with the sources, you should start each target section -with a unique name that you are free to choose (here, we have used -`target` as an example). In the `connection` section, you can specify the +with a unique name that you are free to choose (here, the example uses the name +`target`). In the `connection` section, you can specify the `type` of the target database, which must be `redis`, along with connection details such as `host`, `port`, and credentials (`username` and `password`). 
If you use [TLS](https://en.wikipedia.org/wiki/Transport_Layer_Security)/ @@ -269,7 +226,7 @@ that you should set as described in [Set secrets]({{< relref "/integrate/redis-d {{< note >}}If you specify `localhost` as the address of either the source or target server during installation then the connection will fail if the actual IP address changes for the local -VM. For this reason, we recommend that you don't use `localhost` for the address. However, +VM. For this reason, it is recommended that you don't use `localhost` for the address. However, if you do encounter this problem, you can fix it using the following commands on the VM that is running RDI itself: @@ -278,53 +235,3 @@ sudo k3s kubectl delete nodes --all sudo service k3s restart ``` {{< /note >}} - -## Job files - -You can use one or more job files to configure which fields from the source tables -you want to use, and which data structure you want to write to the target. You -can also optionally specify a transformation to apply to the data before writing it -to the target. See the -[Job files]({{< relref "/integrate/redis-data-integration/data-pipelines/transform-examples" >}}) -section for full details of the file format and examples of common tasks for job files. - -## Source preparation - -Before using the pipeline you must first prepare your source database to use -the Debezium connector for *change data capture (CDC)*. See the -[architecture overview]({{< relref "/integrate/redis-data-integration/architecture#overview" >}}) -for more information about CDC. -Each database type has a different set of preparation steps. You can -find the preparation guides for the databases that RDI supports in the -[Prepare source databases]({{< relref "/integrate/redis-data-integration/data-pipelines/prepare-dbs" >}}) -section. - -## Deploy a pipeline - -When your configuration is ready, you must deploy it to start using the pipeline. See -[Deploy a pipeline]({{< relref "/integrate/redis-data-integration/data-pipelines/deploy" >}}) -to learn how to do this. - -## Pipeline lifecycle - -A pipeline goes through the following phases: - -1. *Deploy* - when you deploy the pipeline, RDI first validates it before use. -Then, the [operator]({{< relref "/integrate/redis-data-integration/architecture#how-rdi-is-deployed">}}) creates and configures the collector and stream processor that will run the pipeline. -1. *Snapshot* - The collector starts the pipeline by creating a snapshot of the full -dataset. This involves reading all the relevant source data, transforming it and then -writing it into the Redis target. You should expect this phase to take minutes or -hours to complete if you have a lot of data. -1. *CDC* - Once the snapshot is complete, the collector starts listening for updates to -the source data. Whenever a change is committed to the source, the collector captures -it and adds it to the target through the pipeline. This phase continues indefinitely -unless you change the pipeline configuration. -1. *Update* - If you update the pipeline configuration, the operator applies it -to the collector and the stream processor. Note that the changes only affect newly-captured -data unless you reset the pipeline completely. Once RDI has accepted the updates, the -pipeline returns to the CDC phase with the new configuration. -1. *Reset* - There are circumstances where you might want to rebuild the dataset -completely. 
For example, you might want to apply a new transformation to all the source -data or refresh the dataset if RDI is disconnected from the -source for a long time. In situations like these, you can *reset* the pipeline back -to the snapshot phase. When this is complete, the pipeline continues with CDC as usual. diff --git a/content/integrate/redis-data-integration/data-pipelines/prepare-dbs/_index.md b/content/integrate/redis-data-integration/data-pipelines/prepare-dbs/_index.md index 5bdfbaa958..e1cef5a262 100644 --- a/content/integrate/redis-data-integration/data-pipelines/prepare-dbs/_index.md +++ b/content/integrate/redis-data-integration/data-pipelines/prepare-dbs/_index.md @@ -14,7 +14,7 @@ linkTitle: Prepare source databases summary: Redis Data Integration keeps Redis in sync with the primary database in near real time. type: integration -weight: 30 +weight: 1 --- Each database uses a different mechanism to track changes to its data and From 92823ca4d95cb3107b9a0e3f3f24ff16b66c2bb0 Mon Sep 17 00:00:00 2001 From: Andy Stark Date: Tue, 10 Jun 2025 12:12:06 +0100 Subject: [PATCH 3/9] DOC-5282 small changes suggested by AI --- .../data-pipelines/_index.md | 21 ++++++++++--------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/content/integrate/redis-data-integration/data-pipelines/_index.md b/content/integrate/redis-data-integration/data-pipelines/_index.md index 479b8b338c..26942b2e5b 100644 --- a/content/integrate/redis-data-integration/data-pipelines/_index.md +++ b/content/integrate/redis-data-integration/data-pipelines/_index.md @@ -19,11 +19,12 @@ type: integration weight: 30 --- -RDI implements -[change data capture](https://en.wikipedia.org/wiki/Change_data_capture) (CDC) -with *pipelines*. (See the +RDI uses *pipelines* to implement +[change data capture](https://en.wikipedia.org/wiki/Change_data_capture) (CDC). (See the [architecture overview]({{< relref "/integrate/redis-data-integration/architecture#overview" >}}) for an introduction to pipelines.) +The sections below explain how pipelines work and give an overview of how to configure and +deploy them. ## How a pipeline works @@ -39,18 +40,18 @@ However, you can also provide your own custom transformation [jobs](#job-files) for each source table, using your own data mapping and key pattern. You specify these jobs declaratively with YAML configuration files that require no coding. -The data tranformation involves two separate stages: +Data transformation involves two stages: 1. The data ingested during CDC is automatically transformed to an intermediate JSON change event format. -1. This JSON change event data gets passed on to your custom transformation for further +1. RDI passes this JSON change event data to your custom transformation for further processing. The diagram below shows the flow of data through the pipeline: {{< image filename="/images/rdi/ingest/RDIPipeDataflow.webp" >}} -You can provide a job file for each source table for which you want to specify a custom +You can provide a job file for each source table that needs a custom transformation. You can also add a *default job file* for any tables that don't have their own. 
You must specify the full name of the source table in the job file (or the special name "*" in the default job) and you @@ -58,7 +59,7 @@ can also include filtering logic to skip data that matches a particular conditio As part of the transformation, you can specify any of the following data types to store the data in Redis: -- [JSON objects]({{< relref "/develop/data-types/json" >}}) +- [JSON]({{< relref "/develop/data-types/json" >}}) - [Hashes]({{< relref "/develop/data-types/hashes" >}}) - [Sets]({{< relref "/develop/data-types/sets" >}}) - [Streams]({{< relref "/develop/data-types/streams" >}}) @@ -73,8 +74,8 @@ After you deploy a pipeline, it goes through the following phases: Then, the [operator]({{< relref "/integrate/redis-data-integration/architecture#how-rdi-is-deployed">}}) creates and configures the collector and stream processor that will run the pipeline. 1. *Snapshot* - The collector starts the pipeline by creating a snapshot of the full dataset. This involves reading all the relevant source data, transforming it and then -writing it into the Redis target. You should expect this phase to take minutes or -hours to complete if you have a lot of data. +writing it into the Redis target. This phase typically takes minutes to +hours if you have a lot of data. 1. *CDC* - Once the snapshot is complete, the collector starts listening for updates to the source data. Whenever a change is committed to the source, the collector captures it and adds it to the target through the pipeline. This phase continues indefinitely @@ -115,7 +116,7 @@ structure of the configuration: The main configuration for the pipeline is in the `config.yaml` file. This specifies the connection details for the source database (such as host, username, and password) and also the queries that RDI will use -to extract the required data. You should place job configurations in the `Jobs` +to extract the required data. You should place job files in the `Jobs` folder if you want to specify your own data transformations. See From 33821d03623300279e421c2aebd1103bb5481eb2 Mon Sep 17 00:00:00 2001 From: andy-stark-redis <164213578+andy-stark-redis@users.noreply.github.com> Date: Fri, 13 Jun 2025 14:44:30 +0100 Subject: [PATCH 4/9] Apply suggestions from code review Co-authored-by: Zdravko Donev --- .../data-pipelines/transform-examples/_index.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/content/integrate/redis-data-integration/data-pipelines/transform-examples/_index.md b/content/integrate/redis-data-integration/data-pipelines/transform-examples/_index.md index 31630de59f..a4f21f50fa 100644 --- a/content/integrate/redis-data-integration/data-pipelines/transform-examples/_index.md +++ b/content/integrate/redis-data-integration/data-pipelines/transform-examples/_index.md @@ -56,11 +56,11 @@ The main sections of these files are: - `source`: This is a mandatory section that specifies the data items that you want to use. You can add the following properties here: - `server_name`: Logical server name (optional). - - `db`: Database name (optional) - - `schema`: Database schema (optional) + - `db`: Database name (optional). This refers to a db name you supplied in config.yaml + - `schema`: Database schema (optional). This refers to a schema name you supplied in config.yaml - `table`: Database table name. This refers to a table name you supplied in `config.yaml`. The default job doesn't apply to a specific table, so use "*" in place of the table name for this job only. 
- - `row_format`: Format of the data to be transformed. This can take the values `data_only` (default) to + - `row_format`: Format of the data to be transformed. This can take the values `partial` (default) to use only the payload data, or `full` to use the complete change record. See the `transform` section below for details of the extra data you can access when you use the `full` option. - `case_insensitive`: This applies to the `server_name`, `db`, `schema`, and `table` properties From 5d22c468831165004878a2023facfe6547a37282 Mon Sep 17 00:00:00 2001 From: Andy Stark Date: Fri, 13 Jun 2025 15:52:26 +0100 Subject: [PATCH 5/9] DOC-5282 implemented PR feedback --- .../data-pipelines/data-type-handling.md | 48 ------------------- .../data-pipelines/deploy.md | 2 +- .../transform-examples/_index.md | 11 +++-- 3 files changed, 8 insertions(+), 53 deletions(-) delete mode 100644 content/integrate/redis-data-integration/data-pipelines/data-type-handling.md diff --git a/content/integrate/redis-data-integration/data-pipelines/data-type-handling.md b/content/integrate/redis-data-integration/data-pipelines/data-type-handling.md deleted file mode 100644 index fd845b7a00..0000000000 --- a/content/integrate/redis-data-integration/data-pipelines/data-type-handling.md +++ /dev/null @@ -1,48 +0,0 @@ ---- -Title: Data type handling -aliases: /integrate/redis-data-integration/ingest/data-pipelines/data-type-handling/ -alwaysopen: false -categories: -- docs -- integrate -- rs -- rdi -description: Learn how relational data types are converted to Redis data types -group: di -linkTitle: Data type handling -summary: Redis Data Integration keeps Redis in sync with the primary database in near - real time. -type: integration -weight: 20 ---- - -RDI automatically converts data that has a Debezium JSON schema into Redis types. -Some Debezium types require special conversion. For example: - -- Date and Time types are converted to epoch time. -- Decimal numeric types are converted to strings so your app can use them - without losing precision. - -The following Debezium logical types are supported: - -- double -- float -- io.debezium.data.Bits -- io.debezium.data.Json -- io.debezium.data.VariableScaleDecimal -- io.debezium.time.Date -- io.debezium.time.NanoTime -- io.debezium.time.NanoTimestamp -- io.debezium.time.MicroTime -- io.debezium.time.MicroTimestamp -- io.debezium.time.ZonedTime -- io.debezium.time.ZonedTimestamp -- org.apache.kafka.connect.data.Date -- org.apache.kafka.connect.data.Decimal -- org.apache.kafka.connect.data.Time - -These types are **not** supported and will return "Unsupported Error": - -- io.debezium.time.interval - -All other values are treated as plain strings. diff --git a/content/integrate/redis-data-integration/data-pipelines/deploy.md b/content/integrate/redis-data-integration/data-pipelines/deploy.md index 81692155a4..0da6ab87a6 100644 --- a/content/integrate/redis-data-integration/data-pipelines/deploy.md +++ b/content/integrate/redis-data-integration/data-pipelines/deploy.md @@ -13,7 +13,7 @@ linkTitle: Deploy summary: Redis Data Integration keeps Redis in sync with the primary database in near real time. 
type: integration -weight: 10 +weight: 50 --- The sections below explain how to deploy a pipeline after you have created the required diff --git a/content/integrate/redis-data-integration/data-pipelines/transform-examples/_index.md b/content/integrate/redis-data-integration/data-pipelines/transform-examples/_index.md index a4f21f50fa..ea454e8490 100644 --- a/content/integrate/redis-data-integration/data-pipelines/transform-examples/_index.md +++ b/content/integrate/redis-data-integration/data-pipelines/transform-examples/_index.md @@ -56,8 +56,10 @@ The main sections of these files are: - `source`: This is a mandatory section that specifies the data items that you want to use. You can add the following properties here: - `server_name`: Logical server name (optional). - - `db`: Database name (optional). This refers to a db name you supplied in config.yaml - - `schema`: Database schema (optional). This refers to a schema name you supplied in config.yaml + - `db`: Database name (optional). This refers to a database name you supplied in + [config.yaml]({{< relref "/integrate/redis-data-integration/data-pipelines/pipeline-config" >}}). + - `schema`: Database schema (optional). This refers to a schema name you supplied in + [config.yaml]({{< relref "/integrate/redis-data-integration/data-pipelines/pipeline-config" >}}). - `table`: Database table name. This refers to a table name you supplied in `config.yaml`. The default job doesn't apply to a specific table, so use "*" in place of the table name for this job only. - `row_format`: Format of the data to be transformed. This can take the values `partial` (default) to @@ -100,8 +102,9 @@ The main sections of these files are: - `key`: This lets you override the default key for the data structure with custom logic: - `expression`: Expression to generate the key. - `language`: Expression language, which must be `jmespath` or `sql`. - - `expire`: Positive integer value indicating a number of seconds for the key to expire. - If you don't specify this property, the key will never expire. + - `expire`: Positive integer value or SQL/JMESPath expression indicating a number of seconds + for the key to expire. If you don't specify this property, the key will never expire. + See [Set custom expiration times / TTL]({{< relref "/integrate/redis-data-integration/data-pipelines/transform-examples/redis-expiration-example" >}}) for more information and examples. {{< note >}}In a job file, the `transform` section is optional, but if you don't specify a `transform`, you must specify custom key logic in `output.with.key`. You can include From 6d942703e7a76b8b520e5a164a9b7ec3d0c62d33 Mon Sep 17 00:00:00 2001 From: Andy Stark Date: Fri, 13 Jun 2025 16:39:34 +0100 Subject: [PATCH 6/9] DOC-5282 add processors section to config docs --- .../data-pipelines/pipeline-config.md | 29 ++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/content/integrate/redis-data-integration/data-pipelines/pipeline-config.md b/content/integrate/redis-data-integration/data-pipelines/pipeline-config.md index 0245af9fe8..35faa74a26 100644 --- a/content/integrate/redis-data-integration/data-pipelines/pipeline-config.md +++ b/content/integrate/redis-data-integration/data-pipelines/pipeline-config.md @@ -158,7 +158,8 @@ processors: ## Sections -The main sections of the file configure [`sources`](#sources) and [`targets`](#targets). +The main sections of the file configure [`sources`](#sources), [`targets`](#targets), +and [`processors`](#processors). 
### Sources @@ -235,3 +236,29 @@ sudo k3s kubectl delete nodes --all sudo service k3s restart ``` {{< /note >}} + +### Processors + +The `processors` section configures the behavior of the pipeline. The [example](#example) +configuration above contains the following properties: + +- `on_failed_retry_interval`: Number of seconds to wait before retrying a failed operation. + The default is 5 seconds. +- `read_batch_size`: Maximum number of records to read from the source database. RDI will + wait for the batch to fill up to `read_batch_size` or for `duration` to elapse, + whichever happens first. The default is 2000. +- `duration`: Time (in ms) after which data will be read from the stream even if + `read_batch_size` was not reached. The default is 100 ms. +- `write_batch_size`: The batch size for writing data to the target Redis database. This should be + less than or equal to the `read_batch_size`. The default is 200. +- `dedup`: Boolean value to enable the deduplication mechanism. The default is `false`. +- `dedup_max_size`: Maximum size of the deduplication set. The default is 1024. +- `error_handling`: The strategy to use when an invalid record is encountered. The available + strategies are `ignore` and `dlq` (store rejected messages in a dead letter queue). + The default is `dlq`. See + [What does RDI do if the data is corrupted or invalid?]({{< relref "/integrate/redis-data-integration/faq#what-does-rdi-do-if-the-data-is-corrupted-or-invalid" >}}) + for more information about the dead letter queue. + +See also the +[RDI configuration file reference]({{< relref "/integrate/redis-data-integration/reference/config-yaml-reference#processors" >}}) +for full details of the other available properties. From d3bca2848d9316840bb2b9d64f56c57c4d471d65 Mon Sep 17 00:00:00 2001 From: Andy Stark Date: Fri, 13 Jun 2025 16:43:37 +0100 Subject: [PATCH 7/9] DOC-5282 corrected row format options everywhere --- .../data-pipelines/transform-examples/redis-row-format.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/content/integrate/redis-data-integration/data-pipelines/transform-examples/redis-row-format.md b/content/integrate/redis-data-integration/data-pipelines/transform-examples/redis-row-format.md index 1c500f738a..9278ee0a51 100644 --- a/content/integrate/redis-data-integration/data-pipelines/transform-examples/redis-row-format.md +++ b/content/integrate/redis-data-integration/data-pipelines/transform-examples/redis-row-format.md @@ -17,7 +17,7 @@ weight: 30 The RDI pipelines support two separate row formats which you can specify in the `source` section of the job file: -- `basic` - (Default) Contains the current value of the row only. +- `partial` - (Default) Contains the current value of the row only. - `full` - Contains all information available for the row, including the key, the before and after values, and the operation code. The `full` row format is useful when you want to access the metadata associated with the row, such as the operation code, and the before and after values. 
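For example, a job that opts into the `full` format might look like the sketch below. This
is an illustrative outline only: the table name, field names, and key pattern are
placeholders. It copies a column's pre-update value into a new field using the `before.`
and `after.` prefixes that the `full` format makes available, and builds a custom key from
the primary key value.

```yaml
source:
  table: invoice          # Placeholder table name.
  row_format: full
transform:
  - uses: add_field
    with:
      fields:
        # Keep the value the column had before the update in a new field.
        - field: after.previous_total
          expression: before.total
          language: jmespath
output:
  - uses: redis.write
    with:
      data_type: hash
      key:
        # Build the key from the table name and the primary key value.
        expression: concat(['invoice:', values(key)[0]])
        language: jmespath
```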
From d2fe5fb3abdb6d546950aa531eb2f4248080778e Mon Sep 17 00:00:00 2001 From: Andy Stark Date: Fri, 13 Jun 2025 16:52:12 +0100 Subject: [PATCH 8/9] DOC-5282 fixed links in examples --- .../data-pipelines/transform-examples/map-example.md | 4 ++-- .../transform-examples/redis-add-field-example.md | 2 +- .../transform-examples/redis-remove-field-example.md | 2 +- .../data-pipelines/transform-examples/redis-string-example.md | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/content/integrate/redis-data-integration/data-pipelines/transform-examples/map-example.md b/content/integrate/redis-data-integration/data-pipelines/transform-examples/map-example.md index 296297828c..75f83e4d56 100644 --- a/content/integrate/redis-data-integration/data-pipelines/transform-examples/map-example.md +++ b/content/integrate/redis-data-integration/data-pipelines/transform-examples/map-example.md @@ -27,14 +27,14 @@ transformation. ## Map to a new JSON structure The first -[job file]({{< relref "/integrate/redis-data-integration/data-pipelines/data-pipelines#job-files" >}}) +[job file]({{< relref "/integrate/redis-data-integration/data-pipelines/transform-examples" >}}) example creates a new [JSON]({{< relref "/develop/data-types/json" >}}) object structure to write to the target. The `source` section selects the `employee` table of the [`chinook`](https://github.com/Redislabs-Solution-Architects/rdi-quickstart-postgres) database (the optional `db` value here corresponds to the `sources..connection.database` value defined in -[`config.yaml`]({{< relref "/integrate/redis-data-integration/data-pipelines/data-pipelines#the-configyaml-file" >}})). +[`config.yaml`]({{< relref "/integrate/redis-data-integration/data-pipelines/pipeline-config" >}})). In the `transform` section, the `map` transformation uses a [JMESPath](https://jmespath.org/) expression to specify the new JSON format. (Note that the vertical bar "|" in the `expression` diff --git a/content/integrate/redis-data-integration/data-pipelines/transform-examples/redis-add-field-example.md b/content/integrate/redis-data-integration/data-pipelines/transform-examples/redis-add-field-example.md index 4ba3824e16..c64f25fa03 100644 --- a/content/integrate/redis-data-integration/data-pipelines/transform-examples/redis-add-field-example.md +++ b/content/integrate/redis-data-integration/data-pipelines/transform-examples/redis-add-field-example.md @@ -29,7 +29,7 @@ The `source` section selects the `customer` table of the [`chinook`](https://github.com/Redislabs-Solution-Architects/rdi-quickstart-postgres) database (the optional `db` value here corresponds to the `sources..connection.database` value defined in -[`config.yaml`]({{< relref "/integrate/redis-data-integration/data-pipelines/data-pipelines#the-configyaml-file" >}})). +[`config.yaml`]({{< relref "/integrate/redis-data-integration/data-pipelines/pipeline-config" >}})). 
In the `transform` section, the `add_field` transformation adds an extra field called `localphone` to the object, which is created by removing the country and area code from the `phone` diff --git a/content/integrate/redis-data-integration/data-pipelines/transform-examples/redis-remove-field-example.md b/content/integrate/redis-data-integration/data-pipelines/transform-examples/redis-remove-field-example.md index 45f58e3e67..b8321ed425 100644 --- a/content/integrate/redis-data-integration/data-pipelines/transform-examples/redis-remove-field-example.md +++ b/content/integrate/redis-data-integration/data-pipelines/transform-examples/redis-remove-field-example.md @@ -29,7 +29,7 @@ The `source` section selects the `employee` table of the [`chinook`](https://github.com/Redislabs-Solution-Architects/rdi-quickstart-postgres) database (the optional `db` field here corresponds to the `sources..connection.database` field defined in -[`config.yaml`]({{< relref "/integrate/redis-data-integration/data-pipelines/data-pipelines#the-configyaml-file" >}})). +[`config.yaml`]({{< relref "/integrate/redis-data-integration/data-pipelines/pipeline-config" >}})). In the `transform` section, the `remove_field` transformation removes the `hiredate` field. diff --git a/content/integrate/redis-data-integration/data-pipelines/transform-examples/redis-string-example.md b/content/integrate/redis-data-integration/data-pipelines/transform-examples/redis-string-example.md index 50d1091a6f..c615721269 100644 --- a/content/integrate/redis-data-integration/data-pipelines/transform-examples/redis-string-example.md +++ b/content/integrate/redis-data-integration/data-pipelines/transform-examples/redis-string-example.md @@ -24,7 +24,7 @@ The `title` is then written to the Redis target database as a string under a cus form `AlbumTitle:42`, where the `42` is the primary key value of the table (the `albumid` column). The `connection` is an optional parameter that refers to the corresponding connection name defined in -[`config.yaml`]({{< relref "integrate/redis-data-integration/data-pipelines/data-pipelines#the-configyaml-file" >}}). +[`config.yaml`]({{< relref "/integrate/redis-data-integration/data-pipelines/pipeline-config" >}}). When you specify the `data_type` parameter for the job, it overrides the system-wide setting `target_data_type` defined in `config.yaml`. Here, the `string` data type also requires an `args` subsection with a `value` argument that specifies the column you want to capture from the source table. From 342159c6a098037c1d9ebb18d96e816f3d3267b7 Mon Sep 17 00:00:00 2001 From: Andy Stark Date: Mon, 16 Jun 2025 11:19:37 +0100 Subject: [PATCH 9/9] DOC-5282 added description of target_data_type property --- .../data-pipelines/pipeline-config.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/content/integrate/redis-data-integration/data-pipelines/pipeline-config.md b/content/integrate/redis-data-integration/data-pipelines/pipeline-config.md index 35faa74a26..1383febf0a 100644 --- a/content/integrate/redis-data-integration/data-pipelines/pipeline-config.md +++ b/content/integrate/redis-data-integration/data-pipelines/pipeline-config.md @@ -144,6 +144,9 @@ processors: # Time (in ms) after which data will be read from stream even if # read_batch_size was not reached. # duration: 100 + # Data type to use in Redis target database: `hash` for Redis Hash, + # `json` for JSON (which requires the RedisJSON module). + # target_data_type: hash # The batch size for writing data to the target Redis database. 
Should be # less than or equal to the read_batch_size. # write_batch_size: 200 @@ -247,6 +250,13 @@ configuration above contains the following properties: - `read_batch_size`: Maximum number of records to read from the source database. RDI will wait for the batch to fill up to `read_batch_size` or for `duration` to elapse, whichever happens first. The default is 2000. +- `target_data_type`: Data type to use in the target Redis database. The options are `hash` + for Redis Hash (the default), or `json` for RedisJSON, which is available only if you have added the + RedisJSON module to the target database. Note that this setting is mainly useful when you + don't provide any custom jobs. When you do provide jobs, you can specify the + target data type in each job individually and choose from a wider range of data types. + See [Job files]({{< relref "/integrate/redis-data-integration/data-pipelines/transform-examples" >}}) + for more information. - `duration`: Time (in ms) after which data will be read from the stream even if `read_batch_size` was not reached. The default is 100 ms. - `write_batch_size`: The batch size for writing data to the target Redis database. This should be