# config.kinesis.extended.hocon
{
# Full license text available in LICENSE.md
"license": {
"accept": true
}
# Where to read collector payloads from
"input": {
"type": "Kinesis"
# Optional. Name of the application that the KCL daemon should use
"appName": "snowplow-enrich-kinesis"
# Name of the Kinesis stream to read from
"streamName": "collector-payloads"
# Optional. Region where the Kinesis stream is located
# This field is optional if it can be resolved with the AWS region provider chain.
# It checks places like env variables, system properties and the AWS profile file.
# https://sdk.amazonaws.com/java/api/latest/software/amazon/awssdk/regions/providers/DefaultAwsRegionProviderChain.html
"region": "eu-central-1"
# Optional, set the initial position to consume the Kinesis stream
# Must be TRIM_HORIZON, LATEST or AT_TIMESTAMP
# LATEST: most recent data.
# TRIM_HORIZON: oldest available data.
# AT_TIMESTAMP: start from the record at or after the specified timestamp
"initialPosition": {
"type": "TRIM_HORIZON"
}
# "initialPosition": {
# "type": "AT_TIMESTAMP"
# "timestamp": "2020-07-17T10:00:00Z" # Required for AT_TIMESTAMP
# }
# Optional, set the mode for retrieving records.
"retrievalMode": {
"type": "Polling"
# Maximum size of a batch returned by a call to getRecords.
# Records are checkpointed after a batch has been fully processed,
# so the smaller maxRecords is, the more often records are checkpointed
# into DynamoDB, possibly at the cost of throughput.
"maxRecords": 10000
}
# "retrievalMode": {
# "type": "FanOut"
# }
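# As a point of comparison (general Kinesis behaviour, not specific to this app):
# Polling shares each shard's read throughput with any other consumers of the
# stream, whereas FanOut uses Kinesis enhanced fan-out, which gives this
# consumer dedicated read throughput on every shard.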
# Optional. Size of the internal buffer used when reading messages from Kinesis;
# each buffered batch holds up to maxRecords (defined above)
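# As a rough sizing note, with the defaults here this means up to
# bufferSize * maxRecords = 3 * 10000 = 30000 records held in memory at once.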
"bufferSize": 3
# Optional. Settings for backoff policy for checkpointing.
# Records are checkpointed after all the records of the same chunk have been enriched
"checkpointBackoff": {
"minBackoff": 100 milliseconds
"maxBackoff": 10 seconds
"maxRetries": 10
}
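# As an illustration, assuming the usual exponential doubling between minBackoff
# and maxBackoff (a sketch; the exact backoff curve is an implementation detail),
# the successive retry delays would be roughly:
# 100ms, 200ms, 400ms, 800ms, 1.6s, 3.2s, 6.4s, 10s, 10s, 10s (then give up)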
# Optional, endpoint url configuration to override aws kinesis endpoints
# Can be used to specify local endpoint when using localstack
# "customEndpoint": "http://localhost:4566"
# Optional, endpoint url configuration to override the aws dynamodb endpoint for the Kinesis checkpoints lease table
# Can be used to specify local endpoint when using localstack
# "dynamodbCustomEndpoint": "http://localhost:4566"
# Optional, endpoint url configuration to override aws cloudwatch endpoint for metrics
# Can be used to specify local endpoint when using localstack
# "cloudwatchCustomEndpoint": "http://localhost:4566"
}
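# For reference, a minimal input section that keeps only the fields not marked
# Optional above might look like this (a sketch, assuming every optional field
# can fall back to its default or be resolved from the environment):
# "input": {
#   "type": "Kinesis"
#   "streamName": "collector-payloads"
# }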
"output": {
# Enriched events output
"good": {
"type": "Kinesis"
# Name of the Kinesis stream to write to
"streamName": "enriched"
# Optional. Region where the Kinesis stream is located
# This field is optional if it can be resolved with the AWS region provider chain.
# It checks places like env variables, system properties and the AWS profile file.
# https://sdk.amazonaws.com/java/api/latest/software/amazon/awssdk/regions/providers/DefaultAwsRegionProviderChain.html
"region": "eu-central-1"
# Optional. How the output stream/topic will be partitioned in Kinesis
# Possible partition keys are: event_id, event_fingerprint, domain_userid, network_userid,
# user_ipaddress, domain_sessionid, user_fingerprint
# Refer to https://github.com/snowplow/snowplow/wiki/canonical-event-model to know what the
# possible partition keys correspond to.
# Otherwise, the partition key will be a random UUID.
# "partitionKey": "domain_userid"
# Optional. Policy to retry if writing to kinesis fails with unexpected errors
"backoffPolicy": {
"minBackoff": 100 milliseconds
"maxBackoff": 10 seconds
"maxRetries": 10
}
# Optional. Policy to retry if writing to kinesis exceeds the provisioned throughput.
"throttledBackoffPolicy": {
"minBackoff": 100 milliseconds
"maxBackoff": 1 second
}
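# Note that, unlike backoffPolicy above, no maxRetries appears here: read
# literally, throttled writes keep being retried with this backoff and
# without a configured retry cap.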
# Optional. Limits the number of events in a single PutRecords request.
# Several requests are made in parallel
# Maximum allowed: 500
"recordLimit": 500
# Optional. Limits the number of bytes in a single PutRecords request,
# including records and partition keys.
# Several requests are made in parallel
# Maximum allowed: 5 MB
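# For reference, 5242880 = 5 * 1024 * 1024 bytes.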
"byteLimit": 5242880
# Optional. Use a custom Kinesis endpoint.
# Can be used for instance to work locally with localstack
# "customEndpoint": "https://localhost:4566"
}
# Pii events output
"pii": {
"type": "Kinesis"
# Name of the Kinesis stream to write to
"streamName": "pii"
# Optional. Region where the Kinesis stream is located
# This field is optional if it can be resolved with the AWS region provider chain.
# It checks places like env variables, system properties and the AWS profile file.
# https://sdk.amazonaws.com/java/api/latest/software/amazon/awssdk/regions/providers/DefaultAwsRegionProviderChain.html
"region": "eu-central-1"
# Optional. How the output stream/topic will be partitioned in Kinesis
# Possible partition keys are: event_id, event_fingerprint, domain_userid, network_userid,
# user_ipaddress, domain_sessionid, user_fingerprint
# Refer to https://github.com/snowplow/snowplow/wiki/canonical-event-model to know what the
# possible partition keys correspond to.
# Otherwise, the partition key will be a random UUID.
# "partitionKey": "domain_userid"
# Optional. Policy to retry if writing to kinesis fails with unexpected errors
"backoffPolicy": {
"minBackoff": 100 milliseconds
"maxBackoff": 10 seconds
"maxRetries": 10
}
# Optional. Policy to retry if writing to kinesis exceeds the provisioned throughput.
"throttledBackoffPolicy": {
"minBackoff": 100 milliseconds
"maxBackoff": 1 second
}
# Optional. Limits the number of events in a single PutRecords request.
# Several requests are made in parallel
# Maximum allowed: 500
"recordLimit": 500
# Optional. Limits the number of bytes in a single PutRecords request,
# including records and partition keys.
# Several requests are made in parallel
# Maximum allowed: 5 MB
"byteLimit": 5242880
# Optional. Use a custom Kinesis endpoint.
# Can be used for instance to work locally with localstack
# "customEndpoint": "https://localhost:4566"
}
# Bad rows output
"bad": {
"type": "Kinesis"
# Name of the Kinesis stream to write to
"streamName": "bad"
# Optional. Region where the Kinesis stream is located
# This field is optional if it can be resolved with the AWS region provider chain.
# It checks places like env variables, system properties and the AWS profile file.
# https://sdk.amazonaws.com/java/api/latest/software/amazon/awssdk/regions/providers/DefaultAwsRegionProviderChain.html
"region": "eu-central-1"
# Optional. Policy to retry if writing to kinesis fails with unexpected errors
"backoffPolicy": {
"minBackoff": 100 milliseconds
"maxBackoff": 10 seconds
"maxRetries": 10
}
# Optional. Policy to retry if writing to kinesis exceeds the provisioned throughput.
"throttledBackoffPolicy": {
"minBackoff": 100 milliseconds
"maxBackoff": 1 second
}
# Optional. Limits the number of events in a single PutRecords request.
# Several requests are made in parallel
# Maximum allowed: 500
"recordLimit": 500
# Optional. Limits the number of bytes in a single PutRecords request,
# including records and partition keys.
# Several requests are made in parallel
# Maximum allowed: 5 MB
"byteLimit": 5242880
# Optional. Use a custom Kinesis endpoint.
# Can be used for instance to work locally with localstack
# "customEndpoint": "https://localhost:4566"
}
}
# Optional. Concurrency of the app
"concurrency" : {
# Number of events that can be enriched at the same time within a chunk
"enrich": 256
# Number of chunks that can be sunk at the same time
# WARNING: if greater than 1, records can be checkpointed before they are sunk
"sink": 1
}
# Optional, period after which enrich assets should be checked for updates.
# No assets will be updated if the key is absent.
"assetsUpdatePeriod": "7 days"
# Optional, configuration of remote adapters
"remoteAdapters": {
# how long enrich waits to establish a connection to remote adapters
"connectionTimeout": "10 seconds",
# how long enrich waits to get a response from remote adapters
"readTimeout": "45 seconds",
# how many connections enrich opens at maximum for remote adapters
# increasing this could help with throughput in case of adapters with high latency
"maxConnections": 10,
# a list of remote adapter configs
"configs": [
{
"vendor": "com.example",
"version": "v1",
"url": "https://remote-adapter.com"
}
]
}
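# Reading the example above: collector payloads whose path matches the
# com.example vendor and v1 version would be forwarded to
# https://remote-adapter.com (placeholder values) to be turned into events.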
"monitoring": {
# Optional, for tracking runtime exceptions
"sentry": {
"dsn": "http://sentry.acme.com"
}
# Optional, configure how metrics are reported
"metrics": {
# Optional. Send metrics to a StatsD server on localhost
"statsd": {
"hostname": "localhost"
"port": 8125
# Required, how frequently to report metrics
"period": "10 seconds"
# Any key-value pairs to be tagged on every StatsD metric
"tags": {
"app": enrich
}
# Optional, override the default metric prefix
# "prefix": "snowplow.enrich."
}
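# With the settings above, metrics would be flushed to localhost:8125 every
# 10 seconds, each tagged with app=enrich and, under the default prefix,
# named like snowplow.enrich.<metric> (exact metric names vary by version).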
# Optional. Log to stdout using Slf4j
"stdout": {
"period": "10 seconds"
# Optional, override the default metric prefix
# "prefix": "snowplow.enrich."
}
# Optional. Send KCL and KPL metrics to Cloudwatch
"cloudwatch": true
}
}
# Optional, configure telemetry
# All the fields are optional
"telemetry": {
# Set to true to disable telemetry
"disable": false
# Interval for the heartbeat event
"interval": 15 minutes
# HTTP method used to send the heartbeat event
"method": POST
# URI of the collector receiving the heartbeat event
"collectorUri": collector-g.snowplowanalytics.com
# Port of the collector receiving the heartbeat event
"collectorPort": 443
# Whether to use https or not
"secure": true
# Identifier intended to tie events together across modules,
# infrastructure and apps when used consistently
"userProvidedId": my_pipeline
# ID automatically generated upon running a modules deployment script
# Intended to identify each independent module, and the infrastructure it controls
"autoGeneratedId": hfy67e5ydhtrd
# Unique identifier for the VM instance
# Unique for each instance of the app running within a module
"instanceId": 665bhft5u6udjf
# Name of the terraform module that deployed the app
"moduleName": enrich-kinesis-ce
# Version of the terraform module that deployed the app
"moduleVersion": 1.0.0
}
# Optional. To activate/deactivate enrich features that are still in beta
# or that are here for transition.
# This section might change in future versions
"featureFlags" : {
# Enrich 3.0.0 introduces validation of enriched events against the atomic schema
# before emitting.
# If set to false, a bad row will be emitted instead of the enriched event
# if validation fails.
# If set to true, invalid enriched events will be emitted, as before.
# WARNING: this feature flag will be removed in a future version
# and it will become impossible to emit invalid enriched events.
# More details: https://github.com/snowplow/enrich/issues/517#issuecomment-1033910690
"acceptInvalid": false
# In early versions of enrich-kinesis and enrich-pubsub (pre-3.1.4), the Javascript enrichment
# incorrectly ran before the currency, weather, and IP Lookups enrichments. Set this flag to true
# to keep the erroneous behaviour of those previous versions. This flag will be removed in a
# future version.
# More details: https://github.com/snowplow/enrich/issues/619
"legacyEnrichmentOrder": false
# Try to base64-decode the event if the initial Thrift deserialization fails
"tryBase64Decoding": false
}
# Optional. Configuration for experimental/preview features
"experimental": {
# Whether to export metadata using a webhook URL.
# Follows iglu-webhook protocol.
"metadata": {
"endpoint": "https://my_pipeline.my_domain.com/iglu"
"interval": 5 minutes
"organizationId": "c5f3a09f-75f8-4309-bec5-fea560f78455"
"pipelineId": "75a13583-5c99-40e3-81fc-541084dfc784"
}
}
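# Reading the settings above: observed metadata would be sent to the endpoint
# every 5 minutes, attributed to the given organization and pipeline IDs
# (a sketch of the behaviour; the feature is experimental and may change).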
# Optional. Configuration section for various validation-oriented settings.
"validation": {
# Optional. Configuration for custom maximum lengths of atomic fields (strings).
# Map-like structure whose keys are field names and whose values are their maximum allowed lengths
"atomicFieldsLimits": {
"app_id": 5
"mkt_clickid": 100000
# ...and any other 'atomic' field with a custom limit
}
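# Tying this to featureFlags.acceptInvalid above: with these limits, an event
# whose app_id exceeds 5 characters fails atomic validation and, with
# acceptInvalid = false, is emitted as a bad row instead of an enriched event.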
}
}