From 39f3eb19016af50d5a581aca623a1551934207a9 Mon Sep 17 00:00:00 2001 From: Anton Parkhomenko Date: Sun, 5 Apr 2020 17:01:26 +0700 Subject: [PATCH] Initial commit --- .github/ISSUE_TEMPLATE.md | 23 ++ .github/PULL_REQUEST_TEMPLATE.md | 12 + .github/check_tag.sh | 14 + .github/deploy_common.sh | 19 ++ .../schemas/me.chuwy/pg-test/jsonschema/1-0-0 | 22 ++ .../schemas/me.chuwy/pg-test/jsonschema/1-0-1 | 24 ++ .../schemas/me.chuwy/pg-test/jsonschema/1-0-2 | 25 ++ .github/server.conf | 16 + .github/start_environment.sh | 42 +++ .github/workflows/snyk.yml | 20 ++ .github/workflows/test.yml | 79 +++++ .gitignore | 2 + .scalafix.conf | 17 + .scalafmt.conf | 22 ++ .travis.yml | 31 ++ CHANGELOG | 3 + CONTRIBUTING.md | 85 +++++ LICENSE | 201 ++++++++++++ README.md | 52 ++++ build.sbt | 69 +++++ config/config.json | 25 ++ config/resolver.json | 34 ++ .../postgresql_config/jsonschema/3-0-0 | 218 +++++++++++++ .../snowplow/postgres/api/DB.scala | 114 +++++++ .../snowplow/postgres/api/SchemaState.scala | 107 +++++++ .../snowplow/postgres/api/State.scala | 85 +++++ .../snowplow/postgres/api/TableState.scala | 20 ++ .../snowplow/postgres/config/DBConfig.scala | 47 +++ .../postgres/logging/Slf4jLogHandler.scala | 59 ++++ .../snowplow/postgres/package.scala | 30 ++ .../snowplow/postgres/resources.scala | 85 +++++ .../snowplow/postgres/shredding/Entity.scala | 32 ++ .../postgres/shredding/Shredded.scala | 20 ++ .../snowplow/postgres/shredding/Type.scala | 240 +++++++++++++++ .../snowplow/postgres/shredding/Value.scala | 56 ++++ .../snowplow/postgres/shredding/schema.scala | 77 +++++ .../postgres/shredding/transform.scala | 291 ++++++++++++++++++ .../postgres/storage/CommentIssue.scala | 38 +++ .../snowplow/postgres/storage/ddl.scala | 105 +++++++ .../postgres/storage/definitions.scala | 214 +++++++++++++ .../snowplow/postgres/storage/query.scala | 59 ++++ .../snowplow/postgres/storage/sql.scala | 125 ++++++++ .../snowplow/postgres/storage/utils.scala | 42 +++ .../postgres/streaming/UnorderedPipe.scala | 52 ++++ .../snowplow/postgres/streaming/data.scala | 48 +++ .../snowplow/postgres/streaming/package.scala | 27 ++ .../snowplow/postgres/streaming/sink.scala | 117 +++++++ .../src/test/resources/logback-test.xml | 19 ++ .../snowplow/postgres/Database.scala | 93 ++++++ .../postgres/api/SchemaStateSpec.scala | 83 +++++ .../snowplow/postgres/api/StateSpec.scala | 142 +++++++++ .../snowplow/postgres/queryspec.scala | 59 ++++ .../postgres/streaming/sinkspec.scala | 124 ++++++++ modules/loader/src/main/resources/logback.xml | 28 ++ .../snowplow/postgres/config/Cli.scala | 120 ++++++++ .../postgres/config/LoaderConfig.scala | 125 ++++++++ .../snowplow/postgres/loader/Main.scala | 61 ++++ .../snowplow/postgres/streaming/source.scala | 131 ++++++++ .../config/CliSpec.scala | 54 ++++ project/BuildSettings.scala | 96 ++++++ project/Dependencies.scala | 80 +++++ project/build.properties | 1 + project/plugins.sbt | 10 + 63 files changed, 4371 insertions(+) create mode 100644 .github/ISSUE_TEMPLATE.md create mode 100644 .github/PULL_REQUEST_TEMPLATE.md create mode 100755 .github/check_tag.sh create mode 100755 .github/deploy_common.sh create mode 100644 .github/schemas/me.chuwy/pg-test/jsonschema/1-0-0 create mode 100644 .github/schemas/me.chuwy/pg-test/jsonschema/1-0-1 create mode 100644 .github/schemas/me.chuwy/pg-test/jsonschema/1-0-2 create mode 100644 .github/server.conf create mode 100755 .github/start_environment.sh create mode 100644 .github/workflows/snyk.yml create mode 100644 .github/workflows/test.yml create 
mode 100644 .gitignore create mode 100644 .scalafix.conf create mode 100644 .scalafmt.conf create mode 100644 .travis.yml create mode 100644 CHANGELOG create mode 100644 CONTRIBUTING.md create mode 100644 LICENSE create mode 100644 README.md create mode 100644 build.sbt create mode 100644 config/config.json create mode 100644 config/resolver.json create mode 100644 modules/common/src/main/resources/iglu-client-embedded/schemas/com.snowplowanalytics.snowplow.storage/postgresql_config/jsonschema/3-0-0 create mode 100644 modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/api/DB.scala create mode 100644 modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/api/SchemaState.scala create mode 100644 modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/api/State.scala create mode 100644 modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/api/TableState.scala create mode 100644 modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/config/DBConfig.scala create mode 100644 modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/logging/Slf4jLogHandler.scala create mode 100644 modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/package.scala create mode 100644 modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/resources.scala create mode 100644 modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/shredding/Entity.scala create mode 100644 modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/shredding/Shredded.scala create mode 100644 modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/shredding/Type.scala create mode 100644 modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/shredding/Value.scala create mode 100644 modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/shredding/schema.scala create mode 100644 modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/shredding/transform.scala create mode 100644 modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/storage/CommentIssue.scala create mode 100644 modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/storage/ddl.scala create mode 100644 modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/storage/definitions.scala create mode 100644 modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/storage/query.scala create mode 100644 modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/storage/sql.scala create mode 100644 modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/storage/utils.scala create mode 100644 modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/streaming/UnorderedPipe.scala create mode 100644 modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/streaming/data.scala create mode 100644 modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/streaming/package.scala create mode 100644 modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/streaming/sink.scala create mode 100644 modules/common/src/test/resources/logback-test.xml create mode 100644 modules/common/src/test/scala/com/snowplowanalytics/snowplow/postgres/Database.scala create mode 100644 modules/common/src/test/scala/com/snowplowanalytics/snowplow/postgres/api/SchemaStateSpec.scala create mode 100644 
modules/common/src/test/scala/com/snowplowanalytics/snowplow/postgres/api/StateSpec.scala create mode 100644 modules/common/src/test/scala/com/snowplowanalytics/snowplow/postgres/queryspec.scala create mode 100644 modules/common/src/test/scala/com/snowplowanalytics/snowplow/postgres/streaming/sinkspec.scala create mode 100644 modules/loader/src/main/resources/logback.xml create mode 100644 modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/config/Cli.scala create mode 100644 modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/config/LoaderConfig.scala create mode 100644 modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/loader/Main.scala create mode 100644 modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/streaming/source.scala create mode 100644 modules/loader/src/test/scala/com.snowplowanalytics.snowplow.postgres/config/CliSpec.scala create mode 100644 project/BuildSettings.scala create mode 100644 project/Dependencies.scala create mode 100644 project/build.properties create mode 100644 project/plugins.sbt diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md new file mode 100644 index 0000000..218d3f5 --- /dev/null +++ b/.github/ISSUE_TEMPLATE.md @@ -0,0 +1,23 @@ + + +**Version**: + +**Expected behavior**: + +**Actual behavior**: + +**Steps to reproduce**: + + + 1. + 2. + diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 0000000..a6f8f34 --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,12 @@ + + diff --git a/.github/check_tag.sh b/.github/check_tag.sh new file mode 100755 index 0000000..a6146a0 --- /dev/null +++ b/.github/check_tag.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +set -e + +tag=$1 + +project_version=$(sbt version -Dsbt.log.noformat=true | perl -ne 'print "$1\n" if /info.*(\d+\.\d+\.\d+[^\r\n]*)/' | tail -n 1 | tr -d '\n') + +if [[ "${tag}" = "${project_version}" ]]; then + echo "Tag version (${tag}) matches project version (${project_version}). Deploying!" +else + echo "Tag version (${tag}) doesn't match version in scala project (${project_version}). Aborting!" 
+ exit 1 +fi \ No newline at end of file diff --git a/.github/deploy_common.sh b/.github/deploy_common.sh new file mode 100755 index 0000000..3ab7b20 --- /dev/null +++ b/.github/deploy_common.sh @@ -0,0 +1,19 @@ +#!/bin/sh + +set -e + +tag=$1 + +mkdir ~/.bintray/ +FILE=$HOME/.bintray/.credentials +cat <<EOF >$FILE +realm = Bintray API Realm +host = api.bintray.com +user = $BINTRAY_SNOWPLOW_MAVEN_USER +password = $BINTRAY_SNOWPLOW_MAVEN_API_KEY +EOF + +sbt "project common" +publish +echo "Snowplow Postgres: published to Bintray Maven" +sbt "project common" +bintraySyncMavenCentral +echo "Snowplow Postgres: synced to Maven Central" diff --git a/.github/schemas/me.chuwy/pg-test/jsonschema/1-0-0 b/.github/schemas/me.chuwy/pg-test/jsonschema/1-0-0 new file mode 100644 index 0000000..a918c2b --- /dev/null +++ b/.github/schemas/me.chuwy/pg-test/jsonschema/1-0-0 @@ -0,0 +1,22 @@ +{ + "self": { + "vendor": "me.chuwy", + "name": "pg-test", + "format": "jsonschema", + "version": "1-0-0" + }, + "properties": { + "requiredString": { "type": "string" }, + "requiredUnion": { "type": ["string", "boolean"] }, + "nested": { + "properties": { + "a": { "type": "number" }, + "b": {} + }, + "required": ["a"] + }, + "someArray": { "type": "array" }, + "id": { "type": "string", "format": "uuid" } + }, + "required": ["requiredString", "requiredUnion"] +} diff --git a/.github/schemas/me.chuwy/pg-test/jsonschema/1-0-1 b/.github/schemas/me.chuwy/pg-test/jsonschema/1-0-1 new file mode 100644 index 0000000..f54d021 --- /dev/null +++ b/.github/schemas/me.chuwy/pg-test/jsonschema/1-0-1 @@ -0,0 +1,24 @@ +{ + "self": { + "vendor": "me.chuwy", + "name": "pg-test", + "format": "jsonschema", + "version": "1-0-1" + }, + "properties": { + "requiredString": { "type": "string" }, + "requiredUnion": { "type": ["string", "boolean"] }, + "nested": { + "properties": { + "a": { "type": "number" }, + "b": {}, + "c": { "type": ["integer", "null"] } + }, + "required": ["a"] + }, + "someArray": { "type": "array" }, + "id": { "type": "string", "format": "uuid" }, + "someDate": { "type": "string", "format": "date-time" } + }, + "required": ["requiredString", "requiredUnion"] +} diff --git a/.github/schemas/me.chuwy/pg-test/jsonschema/1-0-2 b/.github/schemas/me.chuwy/pg-test/jsonschema/1-0-2 new file mode 100644 index 0000000..a2511ef --- /dev/null +++ b/.github/schemas/me.chuwy/pg-test/jsonschema/1-0-2 @@ -0,0 +1,25 @@ +{ + "self": { + "vendor": "me.chuwy", + "name": "pg-test", + "format": "jsonschema", + "version": "1-0-2" + }, + "properties": { + "requiredString": { "type": "string" }, + "requiredUnion": { "type": ["string", "boolean"] }, + "nested": { + "properties": { + "a": { "type": "number" }, + "b": {}, + "c": { "type": ["integer", "null"] } + }, + "required": ["a"] + }, + "someArray": { "type": "array" }, + "id": { "type": "string", "format": "uuid" }, + "someDate": { "type": "string", "format": "date-time" }, + "bigInt": { "type": "integer", "maximum": 100000000000000 } + }, + "required": ["requiredString", "requiredUnion"] +} diff --git a/.github/server.conf b/.github/server.conf new file mode 100644 index 0000000..0f6caf3 --- /dev/null +++ b/.github/server.conf @@ -0,0 +1,16 @@ +# Dummy Iglu Server configuration to assist in testing + +repo-server { + interface = "0.0.0.0" + port = 8080 + idleTimeout = 5 + threadPool = { + type = "global" + } +} + +database { + type = "dummy" +} + +debug = true \ No newline at end of file diff --git a/.github/start_environment.sh b/.github/start_environment.sh new file mode 100755 index
0000000..672604d --- /dev/null +++ b/.github/start_environment.sh @@ -0,0 +1,42 @@ +#!/bin/sh + +set -e + +if [ -z ${GITHUB_WORKSPACE+x} ]; then + echo "GITHUB_WORKSPACE is unset"; + exit 1 +fi + +IGLUCTL_ZIP="igluctl_0.7.2_rc1.zip" +IGLUCTL_URI="http://dl.bintray.com/snowplow/snowplow-generic/$IGLUCTL_ZIP" +IGLUCENTRAL_PATH="$GITHUB_WORKSPACE/iglu-central" +SCHEMAS_PATH="$IGLUCENTRAL_PATH/schemas/" +TEST_SCHEMAS="$GITHUB_WORKSPACE/.github/schemas/" +POSTGRES_PASSWORD=mysecretpassword + +git clone https://github.com/snowplow/iglu-central.git $IGLUCENTRAL_PATH + +docker run \ + -p 8080:8080 \ + -v $GITHUB_WORKSPACE/.github:/iglu \ + --rm -d \ + snowplow-docker-registry.bintray.io/snowplow/iglu-server:0.6.1 \ + --config /iglu/server.conf + +echo "Waiting for Iglu Server..." +sleep 5 + +wget $IGLUCTL_URI +unzip -j $IGLUCTL_ZIP + +./igluctl static push \ + $SCHEMAS_PATH \ + http://localhost:8080/ \ + 48b267d7-cd2b-4f22-bae4-0f002008b5ad \ + --public + +./igluctl static push \ + $TEST_SCHEMAS \ + http://localhost:8080/ \ + 48b267d7-cd2b-4f22-bae4-0f002008b5ad \ + --public diff --git a/.github/workflows/snyk.yml b/.github/workflows/snyk.yml new file mode 100644 index 0000000..356e980 --- /dev/null +++ b/.github/workflows/snyk.yml @@ -0,0 +1,20 @@ +name: Snyk + +on: + push: + branches: [ master ] + +jobs: + security: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v2 + + - name: Run Snyk to check for vulnerabilities + uses: snyk/actions/scala@master + with: + command: monitor + args: --project-name=snowplow-postgres-loader + env: + SNYK_TOKEN: ${{ secrets.SNYK_TOKEN }} diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..e704f4e --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,79 @@ +name: Test + +on: [push, pull_request] + +jobs: + test: + runs-on: ubuntu-latest + + services: + postgres: + image: postgres + ports: + - 5432:5432 + env: + # See src/test/scala/com/snowplowanalytics/snowplow/postgres/loader/Database.scala + POSTGRES_USER: postgres + POSTGRES_PASSWORD: mysecretpassword + POSTGRES_DB: snowplow + POSTGRES_PORT: 5432 + options: --health-cmd pg_isready --health-interval 10s --health-timeout 5s --health-retries 5 + + steps: + - uses: actions/checkout@v2 + - name: Set up JDK 1.8 + uses: actions/setup-java@v1 + with: + java-version: 1.8 + - name: Prepare test environment + run: $GITHUB_WORKSPACE/.github/start_environment.sh + - name: Run tests + run: sbt clean coverage test + - name: Aggregate coverage data + if: ${{ always() }} + run: sbt coverageAggregate + - name: Submit coveralls data + if: ${{ always() }} + run: sbt coverageReport coveralls + env: + COVERALLS_REPO_TOKEN: ${{ secrets.COVERALLS_REPO_TOKEN }} + + deploy: + needs: test + if: startsWith(github.ref, 'refs/tags/') + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Set up JDK 1.8 + uses: actions/setup-java@v1 + with: + java-version: 1.8 + - name: Compare SBT version with git tag + run: .github/check_tag.sh ${GITHUB_REF##*/} + - name: Docker login + run: docker login -u $DOCKER_USERNAME -p $DOCKER_PASSWORD + env: + DOCKER_USERNAME: ${{ secrets.DOCKER_USERNAME }} + DOCKER_PASSWORD: ${{ secrets.DOCKER_PASSWORD }} + - name: Build and publish Docker image + run: sbt "project loader" docker:publish + + deploy_common: + needs: test + if: startsWith(github.ref, 'refs/tags/') + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Set up JDK 1.8 + uses: actions/setup-java@v1 + with: + java-version: 1.8 + - name: Compare 
SBT version with git tag + run: .github/check_tag.sh ${GITHUB_REF##*/} + - name: Deploy SCE on Bintray Maven and Maven Central + run: .github/deploy_common.sh + env: + SONA_USER: snowplow + SONA_PASS: ${{ secrets.SONA_PASS }} + BINTRAY_SNOWPLOW_MAVEN_USER: ${{ secrets.BINTRAY_SNOWPLOW_MAVEN_USER }} + BINTRAY_SNOWPLOW_MAVEN_API_KEY: ${{ secrets.BINTRAY_SNOWPLOW_MAVEN_API_KEY }} diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..555feb4 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +target/ +project/target/ diff --git a/.scalafix.conf b/.scalafix.conf new file mode 100644 index 0000000..c6e4e6d --- /dev/null +++ b/.scalafix.conf @@ -0,0 +1,17 @@ +rules = [ + DisableSyntax +] +DisableSyntax.noUniversalEquality = true +DisableSyntax.noVars = true +DisableSyntax.noNulls = true +DisableSyntax.noReturns = true +DisableSyntax.noWhileLoops = true +DisableSyntax.noAsInstanceOf = true +DisableSyntax.noIsInstanceOf = true +DisableSyntax.noXml = true +DisableSyntax.noDefaultArgs = true +DisableSyntax.noFinalVal = true +DisableSyntax.noFinalize = true +DisableSyntax.noValPatterns = true +DisableSyntax.noUniversalEquality = true +// DisableSyntax.noThrows = true diff --git a/.scalafmt.conf b/.scalafmt.conf new file mode 100644 index 0000000..2c944fb --- /dev/null +++ b/.scalafmt.conf @@ -0,0 +1,22 @@ +version = 2.5.2 +style = default +maxColumn = 140 +optIn.breakChainOnFirstMethodDot = false +assumeStandardLibraryStripMargin = true +align = none +align.openParenCallSite = true +align.openParenDefnSite = true +danglingParentheses = true +verticalMultiline.newlineAfterOpenParen = true +newlines.afterCurlyLambda = preserve +continuationIndent.defnSite = 2 +rewrite.rules = [ + AsciiSortImports, + AvoidInfix, + PreferCurlyFors, + RedundantBraces, + RedundantParens, + SortModifiers +] +project.git = true +includeNoParensInSelectChains = true diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..711e2b0 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,31 @@ +language: scala +dist: trusty +scala: + - 2.13.2 +cache: + directories: + - $HOME/.ivy2 + - $HOME/.coursier + - $HOME/.sbt +jdk: + - oraclejdk8 +services: + - docker +script: + - sbt test +before_install: + - bash ./.travis/checkTag.sh $TRAVIS_TAG + - docker login -u $DOCKER_USERNAME -p $DOCKER_PASSWORD + - sbt publishLocal +deploy: + provider: script + script: ./.travis/deploy.sh $TRAVIS_TAG + skip_cleanup: true + on: + tags: true +env: + global: + # DOCKER_USERNAME + - secure: encrypted_username + # DOCKER_PASSWORD + - secure: encrypted_password diff --git a/CHANGELOG b/CHANGELOG new file mode 100644 index 0000000..4e67837 --- /dev/null +++ b/CHANGELOG @@ -0,0 +1,3 @@ +Version 0.1.0 (2020-10-05) +-------------------------- +Initial release diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..44a662e --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,85 @@ +# Contributor guide + +Snowplow Postgres Loader is maintained by the Engineering team at Snowplow Analytics and improved on by external contributors for which we are +extremely grateful. + +## Getting in touch + +### Community support requests + +First and foremost, please do not log an issue if you are asking for support, all of our community support requests go through +our Discourse forum: https://discourse.snowplowanalytics.com/. + +Posting your problem there ensures more people will see it and you should get support faster than creating a new issue on +GitHub. 
Please do create a new issue on GitHub if you think you've found a bug though! + +### Gitter + +If you want to discuss already created issues, potential bugs, new features you would like to work on or any kind of developer +chat, you can head over to our [Gitter room](https://gitter.im/snowplow/snowplow). + +## Roadmap visibility + +Being an open source company, transparency is very important to us, that's why we try to share as much as possible regarding +what we will be working on next so that you can: + +- see how your contributions fit into our roadmap +- help us design new features +- share your opinions on the technical direction of the Snowplow pipeline + +For insights into what we will be working on next, you can look at +[the RFC category in our Discourse](https://discourse.snowplowanalytics.com/c/roadmap/rfcs). + +## Issues + +### Creating an issue + +The project contains an issue template which should help guiding you through the process. However, please keep in mind +that support requests should go to our Discourse forum: https://discourse.snowplowanalytics.com/ and not GitHub issues. + +It's also a good idea to log an issue before starting to work on a pull request to discuss it with the maintainers. + +### Working on an issue + +If you see an issue you would like to work on, please let us know in the issue! That will help us in terms of scheduling and +not doubling the amount of work. + +If you don't know where to start contributing, you can look at +[the issues labeled `good first issue`](https://github.com/snowplow-incubator/snowplow-postgres-loader/labels/good%20first%20issue). + +## Pull requests + +These are a few guidelines to keep in mind when opening pull requests, there is a GitHub template that reiterates most of the +points described here. + +### Commit hygiene + +We keep a strict 1-to-1 correspondance between commits and issues, as such our commit messages are formatted in the following +fashion: + +`Commit description (close #1234)` + +for example: + +`Add Apache Kafka support (close #1234)` + +### Writing tests + +Whenever necessary, it's good practice to add the corresponding unit tests to whichever feature you are working on. + +### Feedback cycle + +Reviews should happen fairly quickly during weekdays. If you feel your pull request has been forgotten, please ping one +or more maintainers in the pull request. + +### Getting your pull request merged + +If your pull request is fairly chunky, there might be a non-trivial delay between the moment the pull request is approved and +the moment it gets merged. This is because your pull request will have been scheduled for a specific milestone which might or +might not be actively worked on by a maintainer at the moment. + +### Contributor license agreement + +We require outside contributors to sign a Contributor license agreement (or CLA) before we can merge their pull requests. +You can find more information on the topic in [the dedicated wiki page](https://github.com/snowplow/snowplow/wiki/CLA). +The @snowplowcla bot will guide you through the process. diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..14c847b --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. 
+ + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "{}" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2020 Snowplow Analytics Ltd. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/README.md b/README.md new file mode 100644 index 0000000..41e944e --- /dev/null +++ b/README.md @@ -0,0 +1,52 @@ +[![License][license-image]][license] +[![Coverage Status][coveralls-image]][coveralls] +[![Test][test-image]][test] +[![Docker][docker-image]][docker] + +# Snowplow Postgres Loader + +## Quickstart + +Assuming [Docker][docker] is installed: + +1. Add own [`config.json`][config] (specify connection and stream details) +2. Add own [`resolver.json`][resolver] (all schemas must be on [Iglu Server 0.6.0+][iglu-server]) +3. Run the Docker image: + +```bash +$ docker run --rm -v $PWD/config:/snowplow/config \ + snowplow/snowplow-postgres-loader:latest \ + --resolver /snowplow/config/resolver.json \ + --config /snowplow/config/config.json +``` + +## Copyright and License + +Snowplow Postgres Loader is copyright 2020 Snowplow Analytics Ltd. + +Licensed under the **[Apache License, Version 2.0][license]** (the "License"); +you may not use this software except in compliance with the License. + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +[config]: https://github.com/snowplow-incubator/snowplow-postgres-loader/blob/master/config/config.json +[resolver]: https://github.com/snowplow-incubator/snowplow-postgres-loader/blob/master/config/resolver.json + +[docker]: https://www.docker.com/ +[iglu-server]: https://github.com/snowplow-incubator/iglu-server + +[docker]: https://hub.docker.com/r/snowplow/snowplow-postgres-loader/tags +[docker-image]: https://img.shields.io/docker/v/snowplow/snowplow-postgres-loader/latest + +[test]: https://github.com/snowplow-incubator/snowplow-postgres-loader/actions?query=workflow%3ATest +[test-image]: https://github.com/snowplow-incubator/snowplow-postgres-loader/workflows/Test/badge.svg + +[license]: http://www.apache.org/licenses/LICENSE-2.0 +[license-image]: http://img.shields.io/badge/license-Apache--2-blue.svg?style=flat + +[coveralls]: https://coveralls.io/github/snowplow-incubator/snowplow-postgres-loader?branch=master +[coveralls-image]: https://coveralls.io/repos/github/snowplow-incubator/snowplow-postgres-loader/badge.svg?branch=master diff --git a/build.sbt b/build.sbt new file mode 100644 index 0000000..b71d747 --- /dev/null +++ b/build.sbt @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2020 Snowplow Analytics Ltd. All rights reserved. + * + * This program is licensed to you under the Apache License Version 2.0, + * and you may not use this file except in compliance with the Apache License Version 2.0. + * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the Apache License Version 2.0 is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. 
+ */ + +lazy val root = project.in(file(".")) + .settings(BuildSettings.projectSettings) + .aggregate(common, loader) + +lazy val common = project + .in(file("modules/common")) + .settings(name := "snowplow-postgres") + .enablePlugins(BuildInfoPlugin) + .settings(BuildSettings.projectSettings) + .settings(BuildSettings.scoverageSettings) + .settings(BuildSettings.mavenSettings) + .settings( + resolvers += Dependencies.SnowplowBintray, + libraryDependencies ++= Seq( + Dependencies.postgres, + Dependencies.catsEffect, + Dependencies.circe, + Dependencies.circeGeneric, + Dependencies.circeExtras, + Dependencies.circeParser, + Dependencies.circeLiteral, + Dependencies.doobie, + Dependencies.doobiePg, + Dependencies.doobiePgCirce, + Dependencies.doobieHikari, + Dependencies.log4s, + Dependencies.logback, + Dependencies.analyticsSdk, + Dependencies.badRows, + Dependencies.schemaDdl, + Dependencies.specs2, + Dependencies.specs2Check, + Dependencies.scalaCheck + ) + ) + +lazy val loader = project + .in(file("modules/loader")) + .settings(name := "snowplow-postgres-loader") + .settings(BuildSettings.projectSettings) + .settings(BuildSettings.dockerSettings) + .settings(BuildSettings.buildInfoSettings) + .settings(BuildSettings.addExampleConfToTestCp) + .settings( + libraryDependencies ++= Seq( + Dependencies.commons, + Dependencies.fs2Aws, + Dependencies.fs2PubSub, + Dependencies.decline, + Dependencies.specs2 + ) + ) + .dependsOn(common) + .enablePlugins(JavaAppPackaging, DockerPlugin, BuildInfoPlugin) + +addCompilerPlugin("com.olegpy" %% "better-monadic-for" % "0.3.1") diff --git a/config/config.json b/config/config.json new file mode 100644 index 0000000..4fe48c1 --- /dev/null +++ b/config/config.json @@ -0,0 +1,25 @@ +{ + "schema": "iglu:com.snowplowanalytics.snowplow.storage/postgresql_config/jsonschema/3-0-0", + "data": { + "name": "Acme Ltd. 
Snowplow Postgres", + "id": "5c5e4353-4eeb-43da-98f8-2de6dc7fa947", + "source": { + "kinesis": { + "appName": "acme-postgres-loader", + "streamName": "enriched-events", + "region": "eu-central-1", + "initialPosition": "TRIM_HORIZON" + } + }, + + "host": "localhost", + "port": 5432, + "database": "snowplow", + "username": "postgres", + "password": "mysecretpassword", + "schema": "atomic", + + "sslMode": "REQUIRE", + "purpose": "ENRICHED_EVENTS" + } +} diff --git a/config/resolver.json b/config/resolver.json new file mode 100644 index 0000000..e4a3ac3 --- /dev/null +++ b/config/resolver.json @@ -0,0 +1,34 @@ +{ + "schema": "iglu:com.snowplowanalytics.iglu/resolver-config/jsonschema/1-0-2", + "data": { + "cacheSize": 500, + "cacheTtl": 600, + "repositories": [ + { + "name": "Iglu Central", + "priority": 1, + "vendorPrefixes": [ + "com.snowplowanalytics" + ], + "connection": { + "http": { + "uri": "http://iglucentral.com" + } + } + }, + + { + "name": "Iglu Central - Mirror 01", + "priority": 1, + "vendorPrefixes": [ + "com.snowplowanalytics" + ], + "connection": { + "http": { + "uri": "http://mirror01.iglucentral.com" + } + } + } + ] + } +} diff --git a/modules/common/src/main/resources/iglu-client-embedded/schemas/com.snowplowanalytics.snowplow.storage/postgresql_config/jsonschema/3-0-0 b/modules/common/src/main/resources/iglu-client-embedded/schemas/com.snowplowanalytics.snowplow.storage/postgresql_config/jsonschema/3-0-0 new file mode 100644 index 0000000..5cb1e8c --- /dev/null +++ b/modules/common/src/main/resources/iglu-client-embedded/schemas/com.snowplowanalytics.snowplow.storage/postgresql_config/jsonschema/3-0-0 @@ -0,0 +1,218 @@ +{ + "$schema": "http://iglucentral.com/schemas/com.snowplowanalytics.self-desc/schema/jsonschema/1-0-0#", + "description": "Snowplow PostgreSQL storage configuration", + "self": { + "vendor": "com.snowplowanalytics.snowplow.storage", + "name": "postgresql_config", + "format": "jsonschema", + "version": "3-0-0" + }, + "type": "object", + "properties": { + "name": { + "description": "Human-readable storage target name, used only for logging", + "type": "string", + "maxLength": 255 + }, + "id": { + "description": "Machine-readable unique identificator", + "type": "string", + "format": "uuid" + }, + "source": { + "type": "object", + "oneOf": [ + { + "properties": { + "kinesis": { + "properties": { + "appName": { + "description": "Kinesis app name", + "type": "string" + }, + "streamName": { + "description": "Kinesis stream", + "type": "string" + }, + "region": { + "description": "AWS Region", + "type": "string" + }, + "initialPosition": { + "description": "Initial position in the Kinesis stream", + "oneOf": [ + { + "enum": ["TRIM_HORIZON", "LATEST"] + }, + { + "type": "object", + "properties": { + "AT_TIMESTAMP": { + "properties": { + "timestamp": { + "description": "Timestamp to load data from, e.g. 
2020-06-06T00:00:00Z", + "type": "string", + "format": "date-time" + } + }, + "additionalProperties": false, + "required": ["timestamp"] + } + }, + "additionalProperties": false, + "required": ["AT_TIMESTAMP"] + } + ] + } + }, + "required": ["appName", "streamName", "region", "initialPosition"] + } + } + } + ] + }, + "host": { + "description": "PostgreSQL host ('localhost' for enabled SSH Tunnel)", + "type": "string", + "anyOf": [ + { "format": "hostname" }, + { "format": "ipv4" }, + { "format": "ipv6" } + ] + }, + "database": { + "description": "PostgreSQL database name", + "type": "string", + "minLength": 1, + "maxLength": 64 + }, + "port": { + "description": "PostgreSQL database port", + "type": "integer", + "minimum": 1, + "maximum": 65535 + }, + "username": { + "description": "PostgreSQL user used to load data", + "type": "string", + "maxLength": 64 + }, + "password": { + "description": "PostgreSQL password, either plain-text or encrypted key for EC2 Parameter Storage", + "type": ["string", "object"], + "properties": { + "ec2ParameterStore": { + "description": "EC2 Parameter Storage configuration", + "type": "object", + "properties": { + "parameterName": { + "description": "EC2 Parameter with encrypted password", + "type": "string" + } + }, + "required": ["parameterName"] + } + }, + "required": ["ec2ParameterStore"] + }, + "schema": { + "description": "PostgreSQL database schema (e.g. 'atomic')", + "type": "string", + "maxLength": 64 + }, + "sshTunnel": { + "description": "Optional SSH Tunnel configuration", + "type": ["object", "null"], + "properties": { + "bastion": { + "description": "Bastion host configuration", + "type": "object", + "properties": { + "host": { + "description": "Bastion SSH host", + "type": "string", + "anyOf": [ + { "format": "hostname" }, + { "format": "ipv4" }, + { "format": "ipv6" } + ] + }, + "port": { + "description": "Bastion SSH port", + "type": "integer", + "minimum": 1, + "maximum": 65535 + }, + "user": { + "description": "SSH user", + "type": "string" + }, + "passphrase": { + "description": "Plain-text SSH user's passphrase", + "type": ["string", "null"], + "maxLength": 2048 + }, + "key": { + "description": "SSH-key stored in EC2 Parameter Storage", + "type": ["object", "null"], + "properties": { + "ec2ParameterStore": { + "type": "object", + "properties": { + "parameterName": { + "type": "string", + "maxLength": 2048 + } + }, + "required": ["parameterName"] + } + }, + "required": ["ec2ParameterStore"] + } + }, + "required": ["host", "port", "user", "passphrase", "key"] + }, + "destination": { + "description": "Database socket inside private network", + "type": "object", + "properties": { + "host": { + "description": "PostgreSQL host inside private network (root-level host should be changed to 'localhost')", + "type": "string", + "anyOf": [ + { "format": "hostname" }, + { "format": "ipv4" }, + { "format": "ipv6" } + ] + }, + "port": { + "description": "PostgreSQL port inside private network (root-level port should be changed to be indentical to 'localPort')", + "type": "integer", + "minimum": 1, + "maximum": 65535 + } + }, + "required": ["host", "port"] + }, + "localPort": { + "description": "Arbitrary port on node, running Loader (shoul be identical to root-level 'port')", + "type": "integer", + "minimum": 1, + "maximum": 65535 + } + }, + "required": ["bastion", "destination", "localPort"] + }, + "sslMode": { + "description": "JDBC sslMode", + "type": "string", + "enum": ["DISABLE", "REQUIRE", "VERIFY_CA", "VERIFY_FULL"] + }, + "purpose": { + 
"description": "Kind of data stored in this instance", + "enum": ["ENRICHED_EVENTS"] + } + }, + "additionalProperties": false, + "required": ["name", "id", "host", "database", "port", "username", "password", "schema", "sslMode", "purpose"] +} \ No newline at end of file diff --git a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/api/DB.scala b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/api/DB.scala new file mode 100644 index 0000000..40445a1 --- /dev/null +++ b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/api/DB.scala @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2020 Snowplow Analytics Ltd. All rights reserved. + * + * This program is licensed to you under the Apache License Version 2.0, + * and you may not use this file except in compliance with the Apache License Version 2.0. + * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the Apache License Version 2.0 is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. + */ +package com.snowplowanalytics.snowplow.postgres.api + +import cats.data.EitherT +import cats.implicits._ + +import cats.effect.{Bracket, Clock, Sync} + +import doobie.implicits._ +import doobie.util.transactor.Transactor + +import com.snowplowanalytics.iglu.core.SchemaKey + +import com.snowplowanalytics.iglu.client.Resolver + +import com.snowplowanalytics.iglu.schemaddl.migrations.SchemaList + +import com.snowplowanalytics.snowplow.postgres.shredding.{Entity, Shredded, schema} +import com.snowplowanalytics.snowplow.postgres.storage.ddl +import com.snowplowanalytics.snowplow.postgres.streaming.sink + +trait DB[F[_]] { + def insert(event: List[Entity]): F[Unit] + def alter(schemaKey: SchemaKey): F[Unit] + def create(schemaKey: SchemaKey, includeMeta: Boolean): F[Unit] + + def getSchemaList(schemaKey: SchemaKey): F[SchemaList] +} + +object DB { + + def apply[F[_]](implicit ev: DB[F]): DB[F] = ev + + def process[F[_]](shredded: Shredded, state: State[F])(implicit D: DB[F], B: Bracket[F, Throwable]): F[Unit] = { + val (includeMeta, entities) = shredded match { + case Shredded.ShreddedSnowplow(atomic, entities) => (true, atomic :: entities) + case Shredded.ShreddedSelfDescribing(entity) => (false, List(entity)) + } + val insert = D.insert(entities) + + // Mutate table and Loader's mutable variable. Only for locked state! 
+ def mutate(missing: Set[SchemaKey], outdated: Set[SchemaKey]): F[Unit] = + for { + _ <- missing.toList.traverse(key => D.create(key, includeMeta)) // Create missing tables if any + _ <- outdated.toList.traverse(D.alter) // Updated outdated tables if any + _ <- (missing ++ outdated).toList.traverse_ { entity => + for { // Update state with new schemas + list <- D.getSchemaList(entity) + _ <- state.put(list) + } yield () + } + } yield () + + state.checkAndRun(_.checkEvent(entities), insert, mutate) + } + + sealed trait StateCheck { + def missing: Set[SchemaKey] + def outdated: Set[SchemaKey] + + final def add(entity: SchemaKey, state: TableState): StateCheck = + state match { + case TableState.Match => this + case TableState.Missing => StateCheck.Block(missing + entity, outdated) + case TableState.Outdated => StateCheck.Block(missing, outdated + entity) + } + } + + object StateCheck { + case class Block(missing: Set[SchemaKey], outdated: Set[SchemaKey]) extends StateCheck + case object Ok extends StateCheck { + def missing: Set[SchemaKey] = Set.empty + def outdated: Set[SchemaKey] = Set.empty + } + } + + def interpreter[F[_]: Sync: Clock](resolver: Resolver[F], xa: Transactor[F], schemaName: String): DB[F] = + new DB[F] { + def insert(event: List[Entity]): F[Unit] = + event.traverse_(sink.insertStatement(schemaName, _)).transact(xa) + + def alter(schemaKey: SchemaKey): F[Unit] = { + val result = ddl.alterTable[F](resolver, schemaName, schemaKey) + rethrow(result.semiflatMap(_.transact(xa))) + } + + def create(schemaKey: SchemaKey, includeMeta: Boolean): F[Unit] = { + val result = ddl.createTable[F](resolver, schemaName, schemaKey, includeMeta) + rethrow(result.semiflatMap(_.transact(xa))) + } + + def getSchemaList(schemaKey: SchemaKey): F[SchemaList] = { + val result = schema.getSchemaList[F](resolver)(schemaKey.vendor, schemaKey.name, schemaKey.version.model) + rethrow(result) + } + + private def rethrow[A, E](f: EitherT[F, E, A]): F[A] = + f.value.flatMap { + case Right(result) => Sync[F].pure(result) + case Left(error) => Sync[F].raiseError(new RuntimeException(error.toString)) + } + } +} diff --git a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/api/SchemaState.scala b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/api/SchemaState.scala new file mode 100644 index 0000000..db917bf --- /dev/null +++ b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/api/SchemaState.scala @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2020 Snowplow Analytics Ltd. All rights reserved. + * + * This program is licensed to you under the Apache License Version 2.0, + * and you may not use this file except in compliance with the Apache License Version 2.0. + * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the Apache License Version 2.0 is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. 
+ */ +package com.snowplowanalytics.snowplow.postgres.api + +import cats.data.EitherT +import cats.implicits._ + +import cats.effect.concurrent.Ref +import cats.effect.{Clock, Sync} + +import com.snowplowanalytics.iglu.core.SchemaKey + +import com.snowplowanalytics.iglu.client.resolver.Resolver + +import com.snowplowanalytics.iglu.schemaddl.ModelGroup +import com.snowplowanalytics.iglu.schemaddl.migrations.SchemaList + +import com.snowplowanalytics.snowplow.badrows.FailureDetails.LoaderIgluError +import com.snowplowanalytics.snowplow.postgres.loader._ +import com.snowplowanalytics.snowplow.postgres.shredding + +/** + * State of the DB schema, where every `ModelGroup` (read "table") + * is associated with list of schemas. Each of these schemas is reflected + * in the structure of the table. If `SchemaKey` matches the `ModelGroup`, + * but not associated with it - the table is outdated. After table has been + * migrated to reflect the newest schema - state need to be updated up to + * that schema + */ +case class SchemaState(tables: Map[ModelGroup, SchemaList]) { + + /** + * Check if `SchemaKey` is known to the state + * @param entity `SchemaKey` taken from table comment + * @return one of three possible tables states + */ + private[postgres] def check(entity: SchemaKey): TableState = { + val Atomic = shredding.transform.Atomic + val group = (entity.vendor, entity.name, entity.version.model) + + group match { + case (Atomic.vendor, Atomic.name, Atomic.version.model) => + TableState.Match + case _ => + tables.get(group) match { + case Some(SchemaList.Full(schemas)) => + if (schemas.toList.map(_.self.schemaKey).contains(entity)) TableState.Match else TableState.Outdated + case Some(SchemaList.Single(schema)) => + if (schema.self.schemaKey === entity) TableState.Match else TableState.Outdated + case None => + TableState.Missing + } + } + } + + /** Check if any entities from an event are missing in current state */ + def checkEvent(entities: List[shredding.Entity]): DB.StateCheck = + entities.foldLeft(DB.StateCheck.Ok: DB.StateCheck)((acc, key) => acc.add(key.origin, check(key.origin))) + + /** Add a whole `SchemaList` to the state (replace if it exists) */ + def put(list: SchemaList): SchemaState = { + val entity = list.latest.schemaKey + val modelGroup = (entity.vendor, entity.name, entity.version.model) + SchemaState(tables ++ Map(modelGroup -> list)) + } +} + +object SchemaState { + + /** + * Initialize internal mutable state by traversing all table comments to get their latest version + * For every schema URI, the whole list will be fetched to keep ordering consistent + * All newer versions (present on registry, but not reflected on table) will be dropped + * + * @param resolver Iglu Resolver attached to Iglu Server + * @return a list of potential schema issues (not fatal errors, to be logged) and + * an actual mutable reference with the state + */ + def init[F[_]: Sync: Clock](keys: List[SchemaKey], resolver: Resolver[F]) = { + val initial = SchemaState(Map.empty) + val availableSchemas = keys.traverse { key => + EitherT(resolver.listSchemas(key.vendor, key.name, key.version.model)) + .leftMap(resolutionError => LoaderIgluError.IgluError(key, resolutionError)) + .flatMap(schemaKeyList => SchemaList.fromSchemaList(schemaKeyList, shredding.schema.fetch(resolver))) + .map { list => + list.until(key) match { + case Some(updatedList) => updatedList + case None => throw new IllegalStateException(s"SchemaList $list doesn't match vendor of ${key.toSchemaUri}") + } + } + } + + 
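+    // Fold every fetched SchemaList into the empty initial state, then expose the result as a Ref
+    // so that `State.checkAndRun` can update it concurrently.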
availableSchemas.map(list => list.foldLeft(initial)((acc, cur) => acc.put(cur))).flatMap { state => + EitherT.liftF[F, LoaderIgluError, Ref[F, SchemaState]](Ref.of[F, SchemaState](state)) + } + } +} diff --git a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/api/State.scala b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/api/State.scala new file mode 100644 index 0000000..a544bec --- /dev/null +++ b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/api/State.scala @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2020 Snowplow Analytics Ltd. All rights reserved. + * + * This program is licensed to you under the Apache License Version 2.0, + * and you may not use this file except in compliance with the Apache License Version 2.0. + * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the Apache License Version 2.0 is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. + */ +package com.snowplowanalytics.snowplow.postgres.api + +import cats.Monad +import cats.data.EitherT +import cats.implicits._ + +import cats.effect.concurrent.{MVar2, MVar, Ref} +import cats.effect.{Bracket, Clock, Concurrent} +import cats.effect.implicits._ + +import com.snowplowanalytics.iglu.core.SchemaKey + +import com.snowplowanalytics.iglu.client.resolver.Resolver + +import com.snowplowanalytics.iglu.schemaddl.migrations.SchemaList + +import com.snowplowanalytics.snowplow.badrows.FailureDetails.LoaderIgluError +import com.snowplowanalytics.snowplow.postgres.api.DB.StateCheck + +/** + * Mutable variable, protected by by lock. + * [[checkAndRun]] is the only function that should be able to mutate this structure + */ +final class State[F[_]](lock: MVar2[F, Unit], state: Ref[F, SchemaState]) { + + /** + * Primary state-handling and the only state-mutation function. + * + * Most of the time `stateCheck` returns `StateCheck.Ok`, meaning that data can be + * inserted without state or DB schema mutation and lock is not acquired, while + * `action` gets executed. 
+ * + * If new schemas are coming through and state and DB schema have to be changed + * it acquires a lock, preventing other threads from mutating data first, then checks + * if state is still outdated (in case other thread acquired the lock first) and + * performs `mutate` and `action`, releasing the lock afterwards + * If another thread already updated the state it just performs `action` + * + * @param stateCheck check if lock has to be acquired + * @param action primary IO - DB insert statement + * @param mutate IO that mutates the internal state and DB schema + */ + def checkAndRun(stateCheck: SchemaState => StateCheck, action: F[Unit], mutate: (Set[SchemaKey], Set[SchemaKey]) => F[Unit])(implicit + F: Bracket[F, Throwable] + ): F[Unit] = { + // Just insert OR mutate and insert + def check(update: (Set[SchemaKey], Set[SchemaKey]) => F[Unit]) = + state.get.map(stateCheck).flatMap { + case StateCheck.Ok => + Monad[F].unit + case StateCheck.Block(missingTables, outdatedTables) => + update(missingTables, outdatedTables) + } + + check((_, _) => withLock(check(mutate))) *> action + } + + /** Update [[SchemaState]] with new `SchemaList` */ + private[api] def put(schemaList: SchemaList): F[Unit] = + state.update(_.put(schemaList)) + + private def withLock[A](fa: F[A])(implicit F: Bracket[F, Throwable]): F[A] = + lock.take.bracket(_ => fa)(_ => lock.put(())) + +} + +object State { + def init[F[_]: Concurrent: Clock](keys: List[SchemaKey], resolver: Resolver[F]): EitherT[F, LoaderIgluError, State[F]] = + for { + lock <- EitherT.liftF[F, LoaderIgluError, MVar2[F, Unit]](MVar[F].of(())) + state <- SchemaState.init[F](keys, resolver) + } yield new State[F](lock, state) +} diff --git a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/api/TableState.scala b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/api/TableState.scala new file mode 100644 index 0000000..ad917cb --- /dev/null +++ b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/api/TableState.scala @@ -0,0 +1,20 @@ +/* + * Copyright (c) 2020 Snowplow Analytics Ltd. All rights reserved. + * + * This program is licensed to you under the Apache License Version 2.0, + * and you may not use this file except in compliance with the Apache License Version 2.0. + * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the Apache License Version 2.0 is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. + */ +package com.snowplowanalytics.snowplow.postgres.api + +sealed private[postgres] trait TableState extends Product with Serializable +private[postgres] object TableState { + case object Match extends TableState + case object Outdated extends TableState + case object Missing extends TableState +} diff --git a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/config/DBConfig.scala b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/config/DBConfig.scala new file mode 100644 index 0000000..1110f1c --- /dev/null +++ b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/config/DBConfig.scala @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2020 Snowplow Analytics Ltd. All rights reserved. 
+ * + * This program is licensed to you under the Apache License Version 2.0, + * and you may not use this file except in compliance with the Apache License Version 2.0. + * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the Apache License Version 2.0 is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. + */ +package com.snowplowanalytics.snowplow.postgres.config + +import com.zaxxer.hikari.HikariConfig +import DBConfig.JdbcUri + +case class DBConfig(host: String, + port: Int, + database: String, + username: String, + password: String, // TODO: can be EC2 store + sslMode: String, + schema: String +) { + def getJdbc: JdbcUri = + JdbcUri(host, port, database, sslMode.toLowerCase().replace('_', '-')) +} + +object DBConfig { + + case class JdbcUri(host: String, port: Int, database: String, sslMode: String) { + override def toString = + s"jdbc:postgresql://$host:$port/$database?sslmode=$sslMode" + } + + def hikariConfig(dbConfig: DBConfig) = { + val config = new HikariConfig() + config.setDriverClassName("org.postgresql.Driver") + config.setJdbcUrl(dbConfig.getJdbc.toString) + config.setUsername(dbConfig.username) + config.setPassword(dbConfig.password) + // TODO: DBConfig could take a MaxConnections field, and set `config.setMaximumPoolSize`. + config + } + +} diff --git a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/logging/Slf4jLogHandler.scala b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/logging/Slf4jLogHandler.scala new file mode 100644 index 0000000..2398145 --- /dev/null +++ b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/logging/Slf4jLogHandler.scala @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2020 Snowplow Analytics Ltd. All rights reserved. + * + * This program is licensed to you under the Apache License Version 2.0, + * and you may not use this file except in compliance with the Apache License Version 2.0. + * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the Apache License Version 2.0 is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. 
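[Editor's note] `getJdbc` above is the single place where connection settings become a JDBC URL: `sslMode` is lower-cased and underscores become dashes, so config-style values map onto Postgres `sslmode` names. A small illustration (all values are made up):

import com.snowplowanalytics.snowplow.postgres.config.DBConfig

// Illustrative values only; in the real loader they come from configuration.
val config = DBConfig(
  host = "localhost",
  port = 5432,
  database = "snowplow",
  username = "postgres",
  password = "secret",
  sslMode = "VERIFY_FULL", // rendered as "verify-full" in the JDBC URL
  schema = "atomic"
)

// getJdbc lower-cases sslMode and swaps '_' for '-', producing:
// jdbc:postgresql://localhost:5432/snowplow?sslmode=verify-full
val jdbcUrl: String = config.getJdbc.toString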
+ */ +package com.snowplowanalytics.snowplow.postgres.logging + +import doobie.util.log._ +import org.log4s.Logger + +/** Log doobie events using slf4j framework + * + * This is largely based on the jdk-based log handler supplied by doobie: https://github.com/tpolecat/doobie/blob/f04a7a3cab5aecb50be0d1ad10fbdae6b8db5ec2/modules/core/src/main/scala/doobie/util/log.scala#L57 + * It uses slf4j as the logging abstraction, so the end user can control the log output using their preferred log framework + */ +object Slf4jLogHandler { + def apply(logger: Logger): LogHandler = + LogHandler { + case Success(s, a, e1, e2) => + logger.debug(s"""Successful Statement Execution: + | + | ${s.linesIterator.dropWhile(_.trim.isEmpty).mkString("\n ")} + | + | arguments = [${a.mkString(", ")}] + | elapsed = ${e1.toMillis.toString} ms exec + ${e2.toMillis.toString} ms processing (${(e1 + e2).toMillis.toString} ms total) + """.stripMargin) + + case ProcessingFailure(s, a, e1, e2, t) => + logger.debug(s"""Failed Resultset Processing: + | + | ${s.linesIterator.dropWhile(_.trim.isEmpty).mkString("\n ")} + | + | arguments = [${a.mkString(", ")}] + | elapsed = ${e1.toMillis.toString} ms exec + ${e2.toMillis.toString} ms processing (failed) (${(e1 + e2) + .toMillis + .toString} ms total) + | failure = ${t.getMessage} + """.stripMargin) + logger.error(s"Failed Resultset Processing: ${t.getMessage}") + + case ExecFailure(s, a, e1, t) => + logger.debug(s"""Failed Statement Execution: + | + | ${s.linesIterator.dropWhile(_.trim.isEmpty).mkString("\n ")} + | + | arguments = [${a.mkString(", ")}] + | elapsed = ${e1.toMillis.toString} ms exec (failed) + | failure = ${t.getMessage} + """.stripMargin) + logger.error(s"Failed StatementExecution: ${t.getMessage}") + } +} diff --git a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/package.scala b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/package.scala new file mode 100644 index 0000000..9cfafe2 --- /dev/null +++ b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/package.scala @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2020 Snowplow Analytics Ltd. All rights reserved. + * + * This program is licensed to you under the Apache License Version 2.0, + * and you may not use this file except in compliance with the Apache License Version 2.0. + * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the Apache License Version 2.0 is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. 
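[Editor's note] Downstream modules pass this handler wherever doobie accepts a `LogHandler` (for example via `queryWithLogHandler`), so SQL logging ends up under the application's slf4j configuration: successful executions at DEBUG, failures also at ERROR. A minimal usage sketch; the query itself is hypothetical:

import doobie.{ConnectionIO, Fragment}
import doobie.implicits._
import org.log4s.getLogger

import com.snowplowanalytics.snowplow.postgres.logging.Slf4jLogHandler

object LoggingExample {
  val logHandler = Slf4jLogHandler(getLogger)

  // Illustrative query, shown only to demonstrate passing the handler to doobie.
  def countEvents(schema: String): ConnectionIO[Long] =
    (fr"SELECT count(*) FROM" ++ Fragment.const(s"$schema.events"))
      .queryWithLogHandler[Long](logHandler)
      .unique
}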
+ */ +package com.snowplowanalytics.snowplow.postgres + +import cats.Eq + +import com.snowplowanalytics.iglu.core.SchemaKey + +import com.snowplowanalytics.iglu.schemaddl.jsonschema.{JsonSchemaProperty, Pointer} +import com.snowplowanalytics.iglu.schemaddl.jsonschema.Pointer.{Cursor, SchemaProperty} +import com.snowplowanalytics.iglu.schemaddl.jsonschema.properties.CommonProperties.Type + +package object loader { + implicit val typeEq: Eq[Type] = Eq.fromUniversalEquals[Type] + implicit val schemaPropertyEq: Eq[SchemaProperty] = Eq.fromUniversalEquals[SchemaProperty] + implicit val jsonSchemaPropertyEq: Eq[JsonSchemaProperty] = Eq.fromUniversalEquals[JsonSchemaProperty] + implicit val cursorEq: Eq[Cursor] = Eq.fromUniversalEquals[Cursor] + implicit val pointerEq: Eq[Pointer] = Eq.fromUniversalEquals[Pointer] + implicit val schemaKeyEq: Eq[SchemaKey] = Eq.fromUniversalEquals[SchemaKey] +} diff --git a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/resources.scala b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/resources.scala new file mode 100644 index 0000000..979a8e4 --- /dev/null +++ b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/resources.scala @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2020 Snowplow Analytics Ltd. All rights reserved. + * + * This program is licensed to you under the Apache License Version 2.0, + * and you may not use this file except in compliance with the Apache License Version 2.0. + * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the Apache License Version 2.0 is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. 
+ */ +package com.snowplowanalytics.snowplow.postgres + +import cats.implicits._ + +import cats.effect.{Async, Blocker, Clock, Concurrent, ContextShift, Resource, Sync} + +import com.zaxxer.hikari.HikariConfig +import doobie.hikari._ +import doobie.implicits._ +import doobie.util.ExecutionContexts +import doobie.util.transactor.Transactor +import io.circe.Json +import org.log4s.getLogger + +import com.snowplowanalytics.iglu.client.Client + +import com.snowplowanalytics.snowplow.postgres.api.State +import com.snowplowanalytics.snowplow.postgres.config.DBConfig +import com.snowplowanalytics.snowplow.postgres.config.DBConfig.JdbcUri + +object resources { + + private lazy val logger = getLogger + + /** Initialise Blocking Thread Pool, Connection Pool, DB state and bad queue resources */ + def initialize[F[_]: Concurrent: Clock: ContextShift](postgres: DBConfig, iglu: Client[F, Json]) = + for { + blocker <- Blocker[F] + xa <- resources.getTransactor[F](DBConfig.hikariConfig(postgres), blocker) + state <- Resource.liftF(initializeState(postgres.schema, iglu, xa)) + } yield (blocker, xa, state) + + def initializeState[F[_]: Concurrent: Clock](schema: String, iglu: Client[F, Json], xa: HikariTransactor[F]): F[State[F]] = + for { + ci <- storage.query.getComments(schema).transact(xa).map(_.separate) + (issues, comments) = ci + _ <- issues.traverse_(issue => Sync[F].delay(logger.warn(issue.show))) + initState = State.init[F](comments, iglu.resolver).value.flatMap { + case Left(error) => + val exception = new RuntimeException(s"Couldn't initalise the state $error") + Sync[F].raiseError[State[F]](exception) + case Right(state) => + Sync[F].pure(state) + } + state <- initState + } yield state + + /** Get a HikariCP transactor */ + def getTransactor[F[_]: Async: ContextShift](config: HikariConfig, be: Blocker): Resource[F, HikariTransactor[F]] = { + val threadPoolSize = { + // This could be made configurable, but these are sensible defaults and unlikely to be critical for tuning throughput. + // Exceeding availableProcessors could lead to unnecessary context switching. + // Exceeding the connection pool size is unnecessary, because that is limit of the app's parallelism. + val maxPoolSize = if (config.getMaximumPoolSize > 0) config.getMaximumPoolSize else 10 + Math.min(maxPoolSize, Runtime.getRuntime.availableProcessors) + } + logger.debug(s"Using thread pool of size $threadPoolSize for Hikari transactor") + + for { + ce <- ExecutionContexts.fixedThreadPool[F](threadPoolSize) + xa <- HikariTransactor.fromHikariConfig[F](config, ce, be) + } yield xa + } + + /** Get default single-threaded transactor (use for tests only) */ + def getTransactorDefault[F[_]: Async: ContextShift](jdbcUri: JdbcUri, username: String, password: String): Transactor[F] = + Transactor.fromDriverManager[F]( + "org.postgresql.Driver", + jdbcUri.toString, + username, + password + ) +} diff --git a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/shredding/Entity.scala b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/shredding/Entity.scala new file mode 100644 index 0000000..5bca15b --- /dev/null +++ b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/shredding/Entity.scala @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2020 Snowplow Analytics Ltd. All rights reserved. + * + * This program is licensed to you under the Apache License Version 2.0, + * and you may not use this file except in compliance with the Apache License Version 2.0. 
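[Editor's note] A caller is expected to acquire everything through `initialize` and run the pipeline inside `use`, so the thread pools and connections are released on shutdown. A minimal sketch assuming cats-effect 2 style implicits; `dbConfig` and `igluClient` are assumed to be built from application configuration, and the body of `use` is a placeholder for the real stream:

import cats.effect.{Clock, ContextShift, IO}
import io.circe.Json

import com.snowplowanalytics.iglu.client.Client
import com.snowplowanalytics.snowplow.postgres.config.DBConfig
import com.snowplowanalytics.snowplow.postgres.resources

def program(dbConfig: DBConfig, igluClient: Client[IO, Json])
           (implicit cs: ContextShift[IO], clock: Clock[IO]): IO[Unit] =
  resources.initialize[IO](dbConfig, igluClient).use {
    case (blocker, transactor, state) =>
      IO.unit // placeholder for the streaming sink wired with these resources
  }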
+ * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the Apache License Version 2.0 is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. + */ +package com.snowplowanalytics.snowplow.postgres.shredding + +import com.snowplowanalytics.iglu.core.SchemaKey + +import Entity.Column + +/** Final shredded entity */ +case class Entity(tableName: String, origin: SchemaKey, columns: List[Column]) + +object Entity { + + /** + * Table cell with value and meta info + * @param name Postgres column name + * @param dataType Postgres data type + * @param value ready-to-be-inserted value + */ + case class Column(name: String, dataType: Type, value: Value) + +} diff --git a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/shredding/Shredded.scala b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/shredding/Shredded.scala new file mode 100644 index 0000000..3042bd9 --- /dev/null +++ b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/shredding/Shredded.scala @@ -0,0 +1,20 @@ +/* + * Copyright (c) 2020 Snowplow Analytics Ltd. All rights reserved. + * + * This program is licensed to you under the Apache License Version 2.0, + * and you may not use this file except in compliance with the Apache License Version 2.0. + * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the Apache License Version 2.0 is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. + */ +package com.snowplowanalytics.snowplow.postgres.shredding + +sealed trait Shredded + +object Shredded { + case class ShreddedSnowplow(event: Entity, entities: List[Entity]) extends Shredded + case class ShreddedSelfDescribing(entity: Entity) extends Shredded +} diff --git a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/shredding/Type.scala b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/shredding/Type.scala new file mode 100644 index 0000000..0f6afd7 --- /dev/null +++ b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/shredding/Type.scala @@ -0,0 +1,240 @@ +/* + * Copyright (c) 2020 Snowplow Analytics Ltd. All rights reserved. + * + * This program is licensed to you under the Apache License Version 2.0, + * and you may not use this file except in compliance with the Apache License Version 2.0. + * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the Apache License Version 2.0 is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. 
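[Editor's note] An `Entity` is nothing more than a target table name plus the typed cells of one row, and `Shredded` marks whether it came from a full Snowplow event or a standalone self-describing JSON. A hand-built example; the schema key, table name and values are invented for illustration:

import com.snowplowanalytics.iglu.core.{SchemaKey, SchemaVer}
import com.snowplowanalytics.snowplow.postgres.shredding.{Entity, Shredded, Type, Value}
import com.snowplowanalytics.snowplow.postgres.shredding.Entity.Column

// Purely illustrative: a shredded com.acme/link_click/1-0-0 entity.
val origin = SchemaKey("com.acme", "link_click", "jsonschema", SchemaVer.Full(1, 0, 0))

val entity = Entity(
  tableName = "com_acme_link_click_1",
  origin = origin,
  columns = List(
    Column("target_url", Type.Varchar(4096), Value.Varchar("https://example.com")),
    Column("element_id", Type.Varchar(255), Value.Varchar("signup-button"))
  )
)

// Standalone self-describing data (not a full Snowplow event) is wrapped as:
val shredded: Shredded = Shredded.ShreddedSelfDescribing(entity)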
+ */ +package com.snowplowanalytics.snowplow.postgres.shredding + +import cats.implicits._ + +import io.circe.Json + +import com.snowplowanalytics.iglu.schemaddl.jsonschema.Schema +import com.snowplowanalytics.iglu.schemaddl.jsonschema.properties.CommonProperties.{Type => SType} +import com.snowplowanalytics.iglu.schemaddl.jsonschema.properties.NumberProperty.{Maximum, MultipleOf} +import com.snowplowanalytics.iglu.schemaddl.jsonschema.properties.StringProperty.{Format, MaxLength, MinLength} + +import com.snowplowanalytics.snowplow.postgres.loader._ + +sealed trait Type { + def ddl: String = + this match { + case Type.Char(size) => s"CHAR($size)" + case Type.Varchar(size) => s"VARCHAR($size)" + case Type.Uuid => "UUID" + case Type.Timestamp => "TIMESTAMP" + case Type.Date => "DATE" + case Type.Integer => "INTEGER" + case Type.BigInt => "BIGINT" + case Type.Double => "DOUBLE PRECISION" + case Type.Bool => "BOOLEAN" + case Type.Jsonb => "JSONB" + } +} + +object Type { + + case class Char(size: Int) extends Type + case class Varchar(size: Int) extends Type + case object Uuid extends Type + case object Timestamp extends Type + case object Date extends Type + case object Integer extends Type + case object BigInt extends Type + case object Double extends Type + case object Bool extends Type + case object Jsonb extends Type + + type DataTypeSuggestion = (Schema, String) => Option[Type] + + /** Derive a Postgres type, given JSON Schema */ + def getDataType(properties: Schema, varcharSize: Int, columnName: String, suggestions: List[DataTypeSuggestion]): Type = + suggestions match { + case Nil => Type.Varchar(4096) // Generic + case suggestion :: tail => + suggestion(properties, columnName) match { + case Some(format) => format + case None => getDataType(properties, varcharSize, columnName, tail) + } + } + + // For complex enums Suggest VARCHAR with length of longest element + val complexEnumSuggestion: DataTypeSuggestion = (properties, _) => + properties.enum match { + case Some(enums) if isComplexEnum(enums.value) => + val longest = excludeNull(enums.value).map(_.noSpaces.length).maximumOption.getOrElse(16) + Some(Type.Varchar(longest)) + case _ => None + } + + val productSuggestion: DataTypeSuggestion = (properties, _) => + properties.`type` match { + case Some(t: SType.Union) if t.isUnion => + Some(Type.Jsonb) + case Some(t: SType) if t === (SType.Array: SType) => + Some(Type.Jsonb) + case Some(SType.Union(types)) if types.contains(SType.Array) => + Some(Type.Jsonb) + case _ => None + } + + val timestampSuggestion: DataTypeSuggestion = (properties, _) => + (properties.`type`, properties.format) match { + case (Some(types), Some(Format.DateTimeFormat)) if types.possiblyWithNull(SType.String) => + Some(Type.Timestamp) + case _ => None + } + + val dateSuggestion: DataTypeSuggestion = (properties, _) => + (properties.`type`, properties.format) match { + case (Some(types), Some(Format.DateFormat)) if types.possiblyWithNull(SType.String) => + Some(Type.Date) + case _ => None + } + + val arraySuggestion: DataTypeSuggestion = (properties, _) => + properties.`type` match { + case Some(types) if types.possiblyWithNull(SType.Array) => + Some(Type.Varchar(4096)) + case _ => None + } + + val numberSuggestion: DataTypeSuggestion = (properties, _) => + (properties.`type`, properties.multipleOf) match { + case (Some(types), Some(MultipleOf.NumberMultipleOf(m))) if types.possiblyWithNull(SType.Number) && m === BigDecimal(1, 2) => + Some(Type.Double) + case (Some(types), _) if types.possiblyWithNull(SType.Number) 
=> + Some(Type.Double) + case (Some(types: SType.Union), _) if (types.value - SType.Null) === Set(SType.Integer, SType.Number) => + Some(Type.Double) + case _ => + None + } + + // TODO: add more sizes + val integerSuggestion: DataTypeSuggestion = (properties, _) => { + (properties.`type`, properties.maximum, properties.enum, properties.multipleOf) match { + case (Some(types), Some(maximum), _, _) if types.possiblyWithNull(SType.Integer) => + if (isBigInt(maximum)) Type.BigInt.some + else Type.Integer.some + case (Some(types), None, _, _) if types.possiblyWithNull(SType.Integer) => + Type.BigInt.some + // Contains only enum + case (types, _, Some(_), _) if types.isEmpty || types.get.possiblyWithNull(SType.Integer) => + Type.Integer.some + case (Some(types), _, _, _) if types.possiblyWithNull(SType.Integer) => + Type.Integer.some + case (_, _, _, Some(MultipleOf.IntegerMultipleOf(_))) => + Type.Integer.some + case _ => None + } + } + + val charSuggestion: DataTypeSuggestion = (properties, _) => { + (properties.`type`, properties.minLength, properties.maxLength) match { + case (Some(types), Some(MinLength(min)), Some(MaxLength(max))) if min === max && types.possiblyWithNull(SType.String) => + Some(Type.Char(min.toInt)) + case _ => None + } + } + + val booleanSuggestion: DataTypeSuggestion = (properties, _) => { + properties.`type` match { + case Some(types) if types.possiblyWithNull(SType.Boolean) => Some(Type.Bool) + case _ => None + } + } + + val uuidSuggestion: DataTypeSuggestion = (properties, _) => { + (properties.`type`, properties.format) match { + case (Some(types), Some(Format.UuidFormat)) if types.possiblyWithNull(SType.String) => + Some(Type.Uuid) + case _ => None + } + } + + val varcharSuggestion: DataTypeSuggestion = (properties, _) => { + (properties.`type`, properties.maxLength, properties.enum, properties.format) match { + case (Some(types), Some(maxLength), _, _) if types.possiblyWithNull(SType.String) => + Some(Type.Varchar(maxLength.value.toInt)) + case (_, _, Some(enum), _) => + enum.value.map(jsonLength).maximumOption match { + case Some(maxLength) if enum.value.lengthCompare(1) === 0 => + Some(Type.Varchar(maxLength)) + case Some(maxLength) => + Some(Type.Varchar(maxLength)) + case None => None + } + case _ => None + } + } + + val dataTypeSuggestions: List[DataTypeSuggestion] = List( + complexEnumSuggestion, + productSuggestion, + timestampSuggestion, + dateSuggestion, + arraySuggestion, + integerSuggestion, + numberSuggestion, + booleanSuggestion, + charSuggestion, + uuidSuggestion, + varcharSuggestion + ) + + private def jsonLength(json: Json): Int = + json.fold(0, b => b.toString.length, _ => json.noSpaces.length, _.length, _ => json.noSpaces.length, _ => json.noSpaces.length) + + /** + * Get set of types or enum as string excluding null + * + * @param types comma-separated types + * @return set of strings + */ + private def excludeNull(types: List[Json]): List[Json] = + types.filterNot(_.isNull) + + /** + * Check enum contains some different types + * (string and number or number and boolean) + */ + private def isComplexEnum(enum: List[Json]) = { + // Predicates + def isNumeric(s: Json) = s.isNumber + def isNonNumeric(s: Json) = !isNumeric(s) + def isBoolean(s: Json) = s.isBoolean + + val nonNullEnum = excludeNull(enum) + somePredicates(nonNullEnum, List(isNumeric _, isNonNumeric _, isBoolean _), 2) + } + + def isBigInt(long: Maximum): Boolean = + long match { + case Maximum.IntegerMaximum(bigInt) => bigInt > 2147483647L + case _ => false + } + + /** + * Check at 
least some `quantity` of `predicates` are true on `instances` + * + * @param instances list of instances to check on + * @param predicates list of predicates to check + * @param quantity required quantity + */ + private def somePredicates(instances: List[Json], predicates: List[Json => Boolean], quantity: Int): Boolean = + if (quantity === 0) true + else + predicates match { + case Nil => false + case h :: tail if instances.exists(h) => somePredicates(instances, tail, quantity - 1) + case _ :: tail => somePredicates(instances, tail, quantity) + } +} diff --git a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/shredding/Value.scala b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/shredding/Value.scala new file mode 100644 index 0000000..50c7962 --- /dev/null +++ b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/shredding/Value.scala @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2020 Snowplow Analytics Ltd. All rights reserved. + * + * This program is licensed to you under the Apache License Version 2.0, + * and you may not use this file except in compliance with the Apache License Version 2.0. + * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the Apache License Version 2.0 is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. + */ +package com.snowplowanalytics.snowplow.postgres.shredding + +import java.sql.{Timestamp => JTimestamp} +import java.util.UUID +import java.time.Instant + +import io.circe.Json + +import doobie.syntax.string._ +import doobie.implicits.javasql._ +import doobie.postgres.implicits._ +import doobie.postgres.circe.jsonb.implicits._ +import doobie.util.fragment.Fragment + +sealed trait Value { + def fragment: Fragment = + this match { + case Value.Uuid(value) => fr"$value" + case Value.Char(value) => fr"$value" + case Value.Varchar(value) => fr"$value" + case Value.Timestamp(value) => fr"$value" + case Value.Integer(value) => fr"$value" + case Value.BigInt(value) => fr"$value" + case Value.Double(value) => fr"$value" + case Value.Bool(value) => fr"$value" + case Value.Jsonb(value) => fr"$value" + } +} + +object Value { + case class Uuid(value: UUID) extends Value + case class Char(value: String) extends Value + case class Varchar(value: String) extends Value + case class Timestamp(value: JTimestamp) extends Value + case class Integer(value: Int) extends Value + case class BigInt(value: Long) extends Value + case class Double(value: scala.Double) extends Value + case class Bool(value: Boolean) extends Value + case class Jsonb(value: Json) extends Value + + object Timestamp { + def apply(instant: Instant): Timestamp = Timestamp(JTimestamp.from(instant)) + } +} diff --git a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/shredding/schema.scala b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/shredding/schema.scala new file mode 100644 index 0000000..601dfae --- /dev/null +++ b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/shredding/schema.scala @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2020 Snowplow Analytics Ltd. All rights reserved. 
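[Editor's note] `getDataType` walks `dataTypeSuggestions` in order and falls back to a generic `VARCHAR(4096)` when nothing fires; a date-time string property, for instance, should come out as `TIMESTAMP`. A small sketch, assuming circe's parser module is available to build the schema JSON (the property and column name are examples only):

import io.circe.parser.parse

import com.snowplowanalytics.iglu.schemaddl.jsonschema.Schema
import com.snowplowanalytics.iglu.schemaddl.jsonschema.circe.implicits._

import com.snowplowanalytics.snowplow.postgres.shredding.Type

// Example property: {"type": "string", "format": "date-time"}
val property: Option[Schema] =
  parse("""{"type": "string", "format": "date-time"}""")
    .toOption
    .flatMap(Schema.parse)

// timestampSuggestion matches before the VARCHAR fallback, so the expected
// result is Type.Timestamp.
val derived: Option[Type] =
  property.map(Type.getDataType(_, 4096, "occurred_at", Type.dataTypeSuggestions))
// derived == Some(Type.Timestamp)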
+ * + * This program is licensed to you under the Apache License Version 2.0, + * and you may not use this file except in compliance with the Apache License Version 2.0. + * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the Apache License Version 2.0 is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. + */ +package com.snowplowanalytics.snowplow.postgres.shredding + +import cats.Monad +import cats.data.EitherT + +import cats.effect.Clock + +import io.circe.Json + +import com.snowplowanalytics.iglu.client.{ClientError, Resolver} +import com.snowplowanalytics.iglu.client.resolver.registries.RegistryLookup +import com.snowplowanalytics.iglu.core.{SchemaCriterion, SchemaKey, SchemaList, SchemaMap, SelfDescribingSchema} + +import com.snowplowanalytics.iglu.schemaddl.{IgluSchema, Properties} +import com.snowplowanalytics.iglu.schemaddl.jsonschema.Schema +import com.snowplowanalytics.iglu.schemaddl.jsonschema.properties.CommonProperties.{Type => SType} +import com.snowplowanalytics.iglu.schemaddl.jsonschema.circe.implicits._ +import com.snowplowanalytics.iglu.schemaddl.migrations.{FlatSchema, SchemaList => DdlSchemaList} + +import com.snowplowanalytics.snowplow.badrows.FailureDetails + +/** Generic schema functionality, related to JSON schema (Iglu) transformations */ +object schema { + + def fetch[F[_]: Monad: RegistryLookup: Clock]( + resolver: Resolver[F] + )(key: SchemaKey): EitherT[F, FailureDetails.LoaderIgluError, IgluSchema] = + for { + json <- EitherT(resolver.lookupSchema(key)).leftMap(error => + FailureDetails.LoaderIgluError.IgluError(key, error): FailureDetails.LoaderIgluError + ) + schema <- EitherT.fromEither[F](Schema.parse(json).toRight(buildFailure(json, key))) + } yield SelfDescribingSchema(SchemaMap(key), schema) + + def buildFailure(json: Json, key: SchemaKey): FailureDetails.LoaderIgluError = + FailureDetails + .LoaderIgluError + .InvalidSchema(key, s"JSON ${json.noSpaces} cannot be parsed as JSON Schema"): FailureDetails.LoaderIgluError + + def getSchemaList[F[_]: Monad: RegistryLookup: Clock]( + resolver: Resolver[F] + )(vendor: String, name: String, model: Int): EitherT[F, FailureDetails.LoaderIgluError, DdlSchemaList] = { + + val criterion = SchemaCriterion(vendor, name, "jsonschema", Some(model), None, None) + val schemaList = resolver.listSchemas(vendor, name, model) + for { + schemaList <- EitherT[F, ClientError.ResolutionError, SchemaList](schemaList).leftMap(error => + FailureDetails.LoaderIgluError.SchemaListNotFound(criterion, error) + ) + ordered <- DdlSchemaList.fromSchemaList(schemaList, fetch(resolver)) + } yield ordered + } + + def getOrdered[F[_]: Monad: RegistryLookup: Clock]( + resolver: Resolver[F] + )(vendor: String, name: String, model: Int): EitherT[F, FailureDetails.LoaderIgluError, Properties] = + getSchemaList[F](resolver)(vendor, name, model).map(FlatSchema.extractProperties) + + def canBeNull(schema: Schema): Boolean = + schema.enum.exists(_.value.exists(_.isNull)) || (schema.`type` match { + case Some(SType.Union(types)) => types.contains(SType.Null) + case Some(t) => t == SType.Null + case None => false + }) +} diff --git a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/shredding/transform.scala 
b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/shredding/transform.scala new file mode 100644 index 0000000..b09c71e --- /dev/null +++ b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/shredding/transform.scala @@ -0,0 +1,291 @@ +/* + * Copyright (c) 2020 Snowplow Analytics Ltd. All rights reserved. + * + * This program is licensed to you under the Apache License Version 2.0, + * and you may not use this file except in compliance with the Apache License Version 2.0. + * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the Apache License Version 2.0 is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. + */ +package com.snowplowanalytics.snowplow.postgres.shredding + +import java.time.Instant +import java.time.format.DateTimeParseException +import java.util.UUID +import java.sql.Timestamp + +import cats.data.{EitherNel, EitherT, NonEmptyList} +import cats.implicits._ + +import cats.effect.{Clock, Sync} + +import io.circe.{ACursor, Json, JsonNumber} + +import com.snowplowanalytics.iglu.core._ + +import com.snowplowanalytics.iglu.client.Client + +import com.snowplowanalytics.iglu.schemaddl.jsonschema.Pointer.SchemaPointer +import com.snowplowanalytics.iglu.schemaddl.{Properties, StringUtils} +import com.snowplowanalytics.iglu.schemaddl.jsonschema.{Pointer, Schema} +import com.snowplowanalytics.iglu.schemaddl.migrations.FlatSchema + +import com.snowplowanalytics.snowplow.analytics.scalasdk.Event +import com.snowplowanalytics.snowplow.badrows.{BadRow, Failure, FailureDetails, Payload, Processor} +import Entity.Column +import Shredded.{ShreddedSelfDescribing, ShreddedSnowplow} + +object transform { + val Atomic = SchemaKey("com.snowplowanalytics.snowplow", "atomic", "jsonschema", SchemaVer.Full(1, 0, 0)) + + /** Transform the whole `Event` (canonical and JSONs) into list of independent entities ready to be inserted */ + def shredEvent[F[_]: Sync: Clock](client: Client[F, Json], processor: Processor, event: Event): EitherT[F, BadRow, ShreddedSnowplow] = { + val entities = event.contexts.data ++ event.derived_contexts.data ++ event.unstruct_event.data.toList + val wholeEvent = entities.parTraverse(shredJson(client)).value.map { shreddedOrError => + (shreddedOrError, shredAtomic(Map())(event)).mapN { (shreddedEntities, atomic) => + ShreddedSnowplow(atomic, shreddedEntities.map(_.entity).map(addMetadata(event.event_id, event.collector_tstamp))) + } + } + EitherT(wholeEvent).leftMap[BadRow](buildBadRow(processor, event)) + } + + def addMetadata(eventId: UUID, tstamp: Instant)(entity: Entity): Entity = { + val metaColumns = List( + Column("schema_vendor", Type.Varchar(128), Value.Varchar(entity.origin.vendor)), + Column("schema_name", Type.Varchar(128), Value.Varchar(entity.origin.name)), + Column("schema_format", Type.Varchar(16), Value.Varchar(entity.origin.format)), + Column("schema_version", Type.Varchar(8), Value.Varchar(entity.origin.version.asString)), + Column("root_id", Type.Uuid, Value.Uuid(eventId)), + Column("root_tstamp", Type.Timestamp, Value.Timestamp(tstamp)) + ) + + entity.copy(columns = metaColumns ++ entity.columns) + } + + /** Remove all properties which are roots for other properties, + * Otherwise table will have 
structure of [nested, nested.a, nested.b], + * where we need just [nested.a, nested.b] + */ + def removeRoots(props: Properties): Properties = { + val pointers = props.map(_._1).toSet + props.filterNot { + case (pointer, _) => + pointer.value.isEmpty || { + val problem = pointers.exists(p => pointer.isParentOf(p) && p != pointer) + problem + } + } + } + + /** Transform JSON into [[Entity]] */ + def shredJson[F[_]: Sync: Clock]( + client: Client[F, Json] + )(data: SelfDescribingData[Json]): EitherT[F, NonEmptyList[FailureDetails.LoaderIgluError], ShreddedSelfDescribing] = { + val key = data.schema + schema.getOrdered(client.resolver)(key.vendor, key.name, key.version.model).leftMap(error => NonEmptyList.of(error)).subflatMap { + properties => + val shredded = getNameTypeVal(properties)(data.data).parTraverse { + case (columnName, pgType, value) => + cast(value, pgType).toEitherNel.map { value => + value.map(v => Entity.Column(columnName, pgType, v)) + } + } + + shredded + .leftMap { errors => + errors.map { error => + FailureDetails.LoaderIgluError.WrongType(data.schema, Json.Null, error) // TODO + } + } + .map { cols => + val columns = cols.collect { case Some(c) => c } + val tableName = data.schema match { + case Atomic => "events" + case other => StringUtils.getTableName(SchemaMap(other)) + } + ShreddedSelfDescribing(Entity(tableName, data.schema, columns)) + } + } + } + + /** Transform only canonical part of `Event` (128 non-JSON fields) into `ShreddedEntity` */ + def shredAtomic(lengths: Map[String, Int])(event: Event): EitherNel[FailureDetails.LoaderIgluError, Entity] = { + def tranformDate(col: String)(s: String): Either[FailureDetails.LoaderIgluError, Entity.Column] = + Either + .catchOnly[DateTimeParseException](Instant.parse(s)) + .map(parsed => Entity.Column(col, Type.Timestamp, Value.Timestamp(parsed))) + .leftMap(_ => FailureDetails.LoaderIgluError.WrongType(Atomic, Json.fromString(s), "date-time")) + + def transformUuid(col: String)(s: String): Either[FailureDetails.LoaderIgluError, Entity.Column] = + Either + .catchOnly[IllegalArgumentException](UUID.fromString(s)) + .map(parsed => Entity.Column(col, Type.Uuid, Value.Uuid(parsed))) + .leftMap(_ => FailureDetails.LoaderIgluError.WrongType(Atomic, Json.fromString(s), "uuid")) + + def transformBool(col: String)(b: Boolean): Entity.Column = + if (b) Entity.Column(col, Type.Bool, Value.Bool(true)) + else Entity.Column(col, Type.Bool, Value.Bool(false)) + + def truncate(col: String)(value: String): Entity.Column = + lengths.get(col) match { + case Some(len) => + Entity.Column(col, Type.Varchar(len), Value.Varchar(value.take(len))) + case None => + Entity.Column(col, Type.Varchar(1024), Value.Varchar(value.take(1024))) + } + + def transformNumber(col: String)(num: JsonNumber): Entity.Column = + num.toInt match { + case Some(int) => Entity.Column(col, Type.Integer, Value.Integer(int)) + case None => Entity.Column(col, Type.Double, Value.Double(num.toDouble)) + } + + def castError(expected: String)(value: Json) = + FailureDetails.LoaderIgluError.WrongType(Atomic, value, expected).asLeft[Option[Entity.Column]] + + val data = event.ordered.parTraverse { + case ("contexts" | "derived_contexts" | "unstruct_event", _) => + none.asRight.toEitherNel + case (key @ ("event_id" | "domain_sessionid"), Some(value)) => + val error = castError("uuid") _ + value.fold( + none.asRight.toEitherNel, + b => error(Json.fromBoolean(b)).toEitherNel, + n => error(Json.fromJsonNumber(n)).toEitherNel, + s => transformUuid(key)(s).map(_.some).toEitherNel, + a 
=> error(Json.arr(a: _*)).toEitherNel, + o => error(Json.fromJsonObject(o)).toEitherNel + ) + case (key, Some(value)) if key.endsWith("_tstamp") => + val error = castError("date-time") _ + value.fold( + none.asRight.toEitherNel, + b => error(Json.fromBoolean(b)).toEitherNel, + n => error(Json.fromJsonNumber(n)).toEitherNel, + s => tranformDate(key)(s).map(_.some).toEitherNel, + a => error(Json.arr(a: _*)).toEitherNel, + o => error(Json.fromJsonObject(o)).toEitherNel + ) + case (key, Some(value)) => + value.fold( + none.asRight.toEitherNel, + b => transformBool(key)(b).some.asRight.toEitherNel, + n => transformNumber(key)(n).some.asRight.toEitherNel, + s => truncate(key)(s).some.asRight.toEitherNel, + _ => none.asRight.toEitherNel, + _ => none.asRight.toEitherNel + ) + case (_, None) => none.asRight.toEitherNel + } + data.map(_.unite).map(columns => Entity("events", Atomic, columns)) + } + + def cast(json: Option[Json], dataType: Type): Either[String, Option[Value]] = { + val error = s"Invalid type ${dataType.ddl} for value $json".asLeft[Option[Value]] + json match { + case Some(j) => + dataType match { + case Type.Uuid => + j.asString match { + case Some(s) => Value.Uuid(UUID.fromString(s)).some.asRight // TODO + case None => error + } + case Type.Varchar(_) => + val result = j.asString match { + case Some(s) => s + case None => j.noSpaces + } + Value.Varchar(result).some.asRight[String] + case Type.Bool => + j.asBoolean match { + case Some(b) => Value.Bool(b).some.asRight + case None => error + } + case Type.Char(len) => + j.asString match { + case Some(s) if s.length === len => Value.Char(s).some.asRight + case Some(_) => error + case None => error + } + case Type.Integer => + j.asNumber.flatMap(_.toInt) match { + case Some(int) => Value.Integer(int).some.asRight + case None => error + } + case Type.BigInt => + j.asNumber.flatMap(_.toLong) match { + case Some(long) => Value.BigInt(long).some.asRight + case None => error + } + case Type.Double => + j.asNumber.map(_.toDouble) match { + case Some(int) => Value.Double(int).some.asRight + case None => error + } + case Type.Jsonb => + Value.Jsonb(j).some.asRight + case Type.Date => + error // TODO + case Type.Timestamp => + j.asString match { + case Some(s) => + Either.catchOnly[DateTimeParseException](Instant.parse(s)).leftMap(_.getMessage).map { instant => + Value.Timestamp(Timestamp.from(instant)).some + } + case None => error + } + } + case None => none.asRight + } + } + + def getPath(pointer: Pointer.JsonPointer, json: Json): Option[Json] = { + def go(cursor: List[Pointer.Cursor], data: ACursor): Option[Json] = + cursor match { + case Nil => + data.focus + case Pointer.Cursor.DownField(field) :: t => + go(t, data.downField(field)) + case Pointer.Cursor.At(i) :: t => + go(t, data.downN(i)) + case Pointer.Cursor.DownProperty(_) :: _ => + throw new IllegalStateException(s"Iglu Schema DDL tried to use invalid pointer ${pointer.show} for payload ${json.noSpaces}") + } + + go(pointer.get, json.hcursor) + } + + /** + * Transform Schema properties into information that can be transformed into DDL columns + * It's very important to implement it and [[getNameTypeVal]] using same logic as + * former is an implementation for DDL, while latter is implementation for data shredding + * @return list of JSON Pointer, column name, inferred DB type, nullability + */ + def getNameType(properties: Properties): List[(SchemaPointer, String, Type, Boolean)] = + removeRoots(properties).map { + case (pointer, s: Schema) => + val columnName: String = 
FlatSchema.getName(pointer) + val pgType = Type.getDataType(s, 4096, columnName, Type.dataTypeSuggestions) + (pointer, columnName, pgType, schema.canBeNull(s)) + } + + /** + * Extract JSON Paths from an actual JSON data + * It's very important to implement [[getNameType]] and this function using same logic as + * former is an implementation for DDL, while latter is implementation for data shredding + * @return list column name, inferred DB type, value + */ + def getNameTypeVal(properties: Properties)(data: Json) = + getNameType(properties).map { + case (pointer, columnName, dataType, _) => + val value = getPath(pointer.forData, data) + (columnName, dataType, value) + } + + private def buildBadRow(processor: Processor, event: Event)(errors: NonEmptyList[FailureDetails.LoaderIgluError]) = + BadRow.LoaderIgluError(processor, Failure.LoaderIgluErrors(errors), Payload.LoaderPayload(event)) + +} diff --git a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/storage/CommentIssue.scala b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/storage/CommentIssue.scala new file mode 100644 index 0000000..1705d87 --- /dev/null +++ b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/storage/CommentIssue.scala @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2020 Snowplow Analytics Ltd. All rights reserved. + * + * This program is licensed to you under the Apache License Version 2.0, + * and you may not use this file except in compliance with the Apache License Version 2.0. + * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the Apache License Version 2.0 is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. + */ +package com.snowplowanalytics.snowplow.postgres.storage + +import cats.{Eq, Show} + +import com.snowplowanalytics.iglu.core.ParseError + +/** Error with table comment, preventing from `PgState` initialisation */ +sealed trait CommentIssue extends Product with Serializable + +object CommentIssue { + + /** Table missing a comment */ + case class Missing(table: String) extends CommentIssue + + /** Comment is not an Iglu URI */ + case class Invalid(table: String, comment: String, error: ParseError) extends CommentIssue + + implicit val commentIssueShow: Show[CommentIssue] = Show.show { + case Missing(table) => + s"Iglu comment is missing in table $table; The table will be ignored" + case Invalid(table, comment, error) => + s"Comment on table $table ($comment) is not valid Iglu URI (${error.code})" + } + + implicit val commentIssueEq: Eq[CommentIssue] = Eq.fromUniversalEquals[CommentIssue] +} diff --git a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/storage/ddl.scala b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/storage/ddl.scala new file mode 100644 index 0000000..e7718f0 --- /dev/null +++ b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/storage/ddl.scala @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2020 Snowplow Analytics Ltd. All rights reserved. + * + * This program is licensed to you under the Apache License Version 2.0, + * and you may not use this file except in compliance with the Apache License Version 2.0. 
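[Editor's note] `cast` above is the last step of shredding a single cell: it either produces a typed `Value`, keeps an absent value as `None`, or reports a type mismatch as a `Left` message. A few representative inputs, with results following from the pattern match above:

import io.circe.Json

import com.snowplowanalytics.snowplow.postgres.shredding.{Type, Value, transform}

// Matching type: a JSON number cast to INTEGER becomes Value.Integer.
val ok = transform.cast(Some(Json.fromInt(42)), Type.Integer)
// ok == Right(Some(Value.Integer(42)))

// Missing value: absent JSON is allowed and maps to no value at all.
val absent = transform.cast(None, Type.Bool)
// absent == Right(None)

// Mismatched type: a string offered for a BOOLEAN column yields an error message.
val bad = transform.cast(Some(Json.fromString("yes")), Type.Bool)
// bad.isLeft == true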
+ * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the Apache License Version 2.0 is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. + */ +package com.snowplowanalytics.snowplow.postgres.storage + +import cats.data.EitherT +import cats.implicits._ + +import cats.effect.{Clock, Sync} + +import doobie.ConnectionIO +import doobie.implicits._ +import doobie.util.fragment.Fragment + +import com.snowplowanalytics.iglu.core.{SchemaCriterion, SchemaKey, SchemaMap} + +import com.snowplowanalytics.iglu.client.Resolver + +import com.snowplowanalytics.iglu.schemaddl.StringUtils +import com.snowplowanalytics.iglu.schemaddl.migrations.{SchemaList => DdlSchemaList} + +import com.snowplowanalytics.snowplow.badrows.FailureDetails +import com.snowplowanalytics.snowplow.postgres.shredding.schema.fetch +import com.snowplowanalytics.snowplow.postgres.shredding.transform.Atomic +import com.snowplowanalytics.snowplow.postgres.streaming.IgluErrors +import com.snowplowanalytics.snowplow.postgres.streaming.sink.Insert + +object ddl { + + /** Function that can produce DDL, based on `DdlSchemaList` */ + type Generator = DdlSchemaList => Fragment + + def createTable[F[_]: Sync: Clock](resolver: Resolver[F], + schema: String, + entity: SchemaKey, + meta: Boolean + ): EitherT[F, IgluErrors, Insert] = { + val generator: Generator = schemaList => sql.createTable(schema, entity, schemaList, meta) + manage(resolver, schema, entity, generator) + } + + // TODO: tables need to be updated in transaction, because situation where one node tries to mutate it after its state + // been update are completely predictable + def alterTable[F[_]: Sync: Clock](resolver: Resolver[F], schema: String, entity: SchemaKey): EitherT[F, IgluErrors, Insert] = { + val generator: Generator = schemaList => sql.migrateTable(schema, entity, schemaList) + manage(resolver, schema, entity, generator) + } + + def createEventsTable(schema: String): ConnectionIO[Unit] = + definitions.atomicSql(schema).update().run.void + + /** + * Perform some DB management: create or mutate the table according to current + * schema state (where state is all known versions on the iglu registry) + * First, check the current state of the schema on registry and validate it, + * Then, create an actual update action using `generator` and comment on table + * with latest schema from schema list retrieved from the registry + * At last, update internal mutable state. 
+ * + * Note that it doesn't actually perform a DB action (no `Transactor`) + * + * @param resolver Iglu Resolver tied to Iglu Server (it needs schema list endpoint) + * @param schema database schema + * @param entity an actual shredded entity that we manage tables for + * @param generator a function generating SQL from `DdlSchemaList` + * @return an action that is either failure because of Iglu subsystem + * or doobie IO + */ + def manage[F[_]: Sync: Clock](resolver: Resolver[F], + schema: String, + origin: SchemaKey, + generator: Generator + ): EitherT[F, IgluErrors, Insert] = { + val group = (origin.vendor, origin.name, origin.version.model) + val criterion = SchemaCriterion(origin.vendor, origin.name, "jsonschema", origin.version.model) + val (vendor, name, model) = group + + EitherT(resolver.listSchemas(vendor, name, model)) + .leftMap(error => IgluErrors.of(FailureDetails.LoaderIgluError.SchemaListNotFound(criterion, error))) + .flatMap { list => + + DdlSchemaList.fromSchemaList(list, fetch[F](resolver)).leftMap(IgluErrors.of).map { list => + val statement = generator(list) + val tableName = getTableName(origin) + statement.update().run.void *> + sql.commentTable(schema, tableName, list.latest) + } + } + } + + def getTableName(schemaKey: SchemaKey): String = + schemaKey match { + case Atomic => "events" + case other => StringUtils.getTableName(SchemaMap(other)) + } +} diff --git a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/storage/definitions.scala b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/storage/definitions.scala new file mode 100644 index 0000000..0783dd7 --- /dev/null +++ b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/storage/definitions.scala @@ -0,0 +1,214 @@ +/* + * Copyright (c) 2020 Snowplow Analytics Ltd. All rights reserved. + * + * This program is licensed to you under the Apache License Version 2.0, + * and you may not use this file except in compliance with the Apache License Version 2.0. + * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the Apache License Version 2.0 is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. 
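[Editor's note] `manage` only builds the database action; executing it still requires a transactor. A sketch of how a caller might run `createTable`, assuming `Insert` is the `ConnectionIO[Unit]` alias from `streaming.sink`, and that `resolver` and `xa` come from the wiring in resources.scala (the schema key is invented):

import cats.effect.{Clock, IO}
import doobie.implicits._
import doobie.util.transactor.Transactor

import com.snowplowanalytics.iglu.client.resolver.Resolver
import com.snowplowanalytics.iglu.core.{SchemaKey, SchemaVer}

import com.snowplowanalytics.snowplow.postgres.storage.ddl

// Sketch: create the table for a hypothetical com.acme/link_click/1-0-0 entity.
def createLinkClick(resolver: Resolver[IO], xa: Transactor[IO])
                   (implicit clock: Clock[IO]): IO[Unit] = {
  val key = SchemaKey("com.acme", "link_click", "jsonschema", SchemaVer.Full(1, 0, 0))
  ddl.createTable[IO](resolver, "atomic", key, meta = true).value.flatMap {
    case Right(insert) => insert.transact(xa) // runs CREATE TABLE, then COMMENT ON TABLE
    case Left(errors)  => IO.raiseError(new RuntimeException(errors.toString))
  }
}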
+ */ +package com.snowplowanalytics.snowplow.postgres.storage + +import doobie.Fragment +import doobie.implicits._ + +import com.snowplowanalytics.snowplow.postgres.shredding.Type + +object definitions { + + /** Columns prepended to every shredded type table */ + val metaColumns: List[(String, Type, Boolean)] = List( + ("schema_vendor", Type.Varchar(128), true), + ("schema_name", Type.Varchar(128), true), + ("schema_format", Type.Varchar(128), true), + ("schema_version", Type.Varchar(128), true), + ("root_id", Type.Uuid, true), + ("root_tstamp", Type.Timestamp, true) + ) + + val atomicColumns: List[(String, Type, Boolean)] = List( + // App + ("app_id", Type.Varchar(255), false), + ("platform", Type.Varchar(255), false), + // Date/time + ("etl_tstamp", Type.Timestamp, false), + ("collector_tstamp", Type.Timestamp, true), + ("dvce_created_tstamp", Type.Timestamp, false), + // Date/time + ("event", Type.Varchar(128), false), + ("event_id", Type.Uuid, true), + ("txn_id", Type.Integer, false), + // Versioning + ("name_tracker", Type.Varchar(128), false), + ("v_tracker", Type.Varchar(100), false), + ("v_collector", Type.Varchar(100), true), + ("v_etl", Type.Varchar(100), true), + // User and visit + ("user_id", Type.Varchar(255), false), + ("user_ipaddress", Type.Varchar(45), false), + ("user_fingerprint", Type.Varchar(50), false), + ("domain_userid", Type.Varchar(36), false), + ("domain_sessionidx", Type.Integer, false), + ("network_userid", Type.Varchar(38), false), + // Location + ("geo_country", Type.Char(2), false), + ("geo_region", Type.Char(3), false), + ("geo_city", Type.Varchar(75), false), + ("geo_zipcode", Type.Varchar(15), false), + ("geo_latitude", Type.Double, false), + ("geo_longitude", Type.Double, false), + ("geo_region_name", Type.Varchar(100), false), + // IP lookups + ("ip_isp", Type.Varchar(100), false), + ("ip_organization", Type.Varchar(100), false), + ("ip_domain", Type.Varchar(100), false), + ("ip_netspeed", Type.Varchar(100), false), + // Page + ("page_url", Type.Varchar(4096), false), + ("page_title", Type.Varchar(2000), false), + ("page_referrer", Type.Varchar(4096), false), + // Page URL components + ("page_urlscheme", Type.Varchar(16), false), + ("page_urlhost", Type.Varchar(255), false), + ("page_urlport", Type.Integer, false), + ("page_urlpath", Type.Varchar(3000), false), + ("page_urlquery", Type.Varchar(6000), false), + ("page_urlfragment", Type.Varchar(3000), false), + // Referrer URL components + ("refr_urlscheme", Type.Varchar(16), false), + ("refr_urlhost", Type.Varchar(255), false), + ("refr_urlport", Type.Integer, false), + ("refr_urlpath", Type.Varchar(6000), false), + ("refr_urlquery", Type.Varchar(6000), false), + ("refr_urlfragment", Type.Varchar(3000), false), + // Referrer details + ("refr_medium", Type.Varchar(25), false), + ("refr_source", Type.Varchar(50), false), + ("refr_term", Type.Varchar(255), false), + // Marketing + ("mkt_medium", Type.Varchar(255), false), + ("mkt_source", Type.Varchar(255), false), + ("mkt_term", Type.Varchar(255), false), + ("mkt_content", Type.Varchar(500), false), + ("mkt_campaign", Type.Varchar(255), false), + // Custom structured event + ("se_category", Type.Varchar(1000), false), + ("se_action", Type.Varchar(1000), false), + ("se_label", Type.Varchar(1000), false), + ("se_property", Type.Varchar(1000), false), + ("se_value", Type.Double, false), + // Ecommerce + ("tr_orderid", Type.Varchar(255), false), + ("tr_affiliation", Type.Varchar(255), false), + ("tr_total", Type.Double, false), + ("tr_tax", Type.Double, 
false), + ("tr_shipping", Type.Double, false), + ("tr_city", Type.Varchar(255), false), + ("tr_state", Type.Varchar(255), false), + ("tr_country", Type.Varchar(255), false), + ("ti_orderid", Type.Varchar(255), false), + ("ti_sku", Type.Varchar(255), false), + ("ti_name", Type.Varchar(255), false), + ("ti_category", Type.Varchar(255), false), + ("ti_price", Type.Double, false), + ("ti_quantity", Type.Integer, false), + // Page ping + ("pp_xoffset_min", Type.Integer, false), + ("pp_xoffset_max", Type.Integer, false), + ("pp_yoffset_min", Type.Integer, false), + ("pp_yoffset_max", Type.Integer, false), + // User Agent + ("useragent", Type.Varchar(1000), false), + // Browser + ("br_name", Type.Varchar(50), false), + ("br_family", Type.Varchar(50), false), + ("br_version", Type.Varchar(50), false), + ("br_type", Type.Varchar(50), false), + ("br_renderengine", Type.Varchar(50), false), + ("br_lang", Type.Varchar(255), false), + ("br_features_pdf", Type.Bool, false), + ("br_features_flash", Type.Bool, false), + ("br_features_java", Type.Bool, false), + ("br_features_director", Type.Bool, false), + ("br_features_quicktime", Type.Bool, false), + ("br_features_realplayer", Type.Bool, false), + ("br_features_windowsmedia", Type.Bool, false), + ("br_features_gears", Type.Bool, false), + ("br_features_silverlight", Type.Bool, false), + ("br_cookies", Type.Bool, false), + ("br_colordepth", Type.Varchar(12), false), + ("br_viewwidth", Type.Integer, false), + ("br_viewheight", Type.Integer, false), + // Operating System + ("os_name", Type.Varchar(50), false), + ("os_family", Type.Varchar(50), false), + ("os_manufacturer", Type.Varchar(50), false), + ("os_timezone", Type.Varchar(50), false), + // Device/Hardware + ("dvce_type", Type.Varchar(50), false), + ("dvce_ismobile", Type.Bool, false), + ("dvce_screenwidth", Type.Integer, false), + ("dvce_screenheight", Type.Integer, false), + // Document + ("doc_charset", Type.Varchar(128), false), + ("doc_width", Type.Integer, false), + ("doc_height", Type.Integer, false), + // Currency + ("tr_currency", Type.Char(3), false), + ("tr_total_base", Type.Double, false), + ("tr_tax_base", Type.Double, false), + ("tr_shipping_base", Type.Double, false), + ("ti_currency", Type.Char(3), false), + ("ti_price_base", Type.Double, false), + ("base_currency", Type.Char(3), false), + // Geolocation + ("geo_timezone", Type.Varchar(64), false), + // Click ID + ("mkt_clickid", Type.Varchar(128), false), + ("mkt_network", Type.Varchar(64), false), + // ETL tags + ("etl_tags", Type.Varchar(500), false), + // Time event was sent + ("dvce_sent_tstamp", Type.Timestamp, false), + // Referer + ("refr_domain_userid", Type.Varchar(36), false), + ("refr_dvce_tstamp", Type.Timestamp, false), + // Session ID + ("domain_sessionid", Type.Uuid, false), + // Derived Type.Timestamp + ("derived_tstamp", Type.Timestamp, false), + // Event schema + ("event_vendor", Type.Varchar(1000), false), + ("event_name", Type.Varchar(1000), false), + ("event_format", Type.Varchar(128), false), + ("event_version", Type.Varchar(128), false), + // Event fingerprint + ("event_fingerprint", Type.Varchar(128), false), + // True Type.Timestamp + ("true_tstamp", Type.Timestamp, false) + ) + + def atomicSql(schema: String) = { + val columns = atomicColumns + .map { + case (n, t, true) => Fragment.const(s"$n ${t.ddl} NOT NULL") + case (n, t, false) => Fragment.const(s"$n ${t.ddl}") + } + .foldLeft(Fragment.empty) { (acc, cur) => + val separator = if (acc == Fragment.empty) Fragment.const("\n") else Fragment.const(",\n") + 
acc ++ separator ++ cur + } + + val schemaFr = Fragment.const0(schema) + + fr"""CREATE TABLE $schemaFr.events ($columns) WITH (OIDS=FALSE)""" + } + + def columnToString(columnName: String, dataType: Type, nullable: Boolean) = { + val notNull = if (nullable) "NULL" else "NOT NULL" + s""""$columnName" ${dataType.ddl} $notNull""" + } +} diff --git a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/storage/query.scala b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/storage/query.scala new file mode 100644 index 0000000..00a925f --- /dev/null +++ b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/storage/query.scala @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2020 Snowplow Analytics Ltd. All rights reserved. + * + * This program is licensed to you under the Apache License Version 2.0, + * and you may not use this file except in compliance with the Apache License Version 2.0. + * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the Apache License Version 2.0 is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. + */ +package com.snowplowanalytics.snowplow.postgres.storage + +import cats.syntax.traverse._ +import cats.syntax.either._ +import cats.instances.list._ + +import doobie.ConnectionIO +import doobie.implicits._ +import org.log4s.getLogger + +import com.snowplowanalytics.iglu.core.SchemaKey +import com.snowplowanalytics.snowplow.postgres.logging.Slf4jLogHandler + +/** Functions to query the storage for state and metadata */ +object query { + + private lazy val logger = Slf4jLogHandler(getLogger) + + def tableExists(schema: String, name: String): ConnectionIO[Boolean] = + fr"SELECT EXISTS (SELECT FROM information_schema.tables WHERE table_name = $name AND table_schema = $schema);" + .queryWithLogHandler[Boolean](logger) + .unique + + def listTables(schema: String): ConnectionIO[List[String]] = + fr"SELECT tablename FROM pg_catalog.pg_tables WHERE schemaname = $schema".query[String].to[List] + + def getComment(schema: String)(tableName: String): ConnectionIO[Either[CommentIssue, SchemaKey]] = + (fr"""SELECT obj_description(oid) FROM pg_class WHERE relkind = 'r' AND relnamespace = ( + SELECT oid + FROM pg_catalog.pg_namespace + WHERE nspname = $schema + ) AND relname = $tableName""") + .queryWithLogHandler[Option[String]](logger) // It can be NULL, thus query[String].option will fail + .unique + .map { + case Some(comment) => + SchemaKey.fromUri(comment) match { + case Right(key) => key.asRight + case Left(error) => CommentIssue.Invalid(tableName, comment, error).asLeft + } + case None => + CommentIssue.Missing(tableName).asLeft + } + + def getComments(schema: String): ConnectionIO[List[Either[CommentIssue, SchemaKey]]] = + listTables(schema).flatMap(_.traverse(getComment(schema))) +} diff --git a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/storage/sql.scala b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/storage/sql.scala new file mode 100644 index 0000000..c96a671 --- /dev/null +++ b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/storage/sql.scala @@ -0,0 +1,125 @@ +/* + * Copyright (c) 2020 Snowplow Analytics Ltd. All rights reserved. 
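[Editor's note] The table comments written by `ddl` are the source of truth that `getComments` reads back on start-up to rebuild the schema state (see resources.initializeState). A small sketch that prints that information directly, assuming an existing transactor and the default "atomic" schema:

import cats.implicits._
import cats.effect.IO
import doobie.implicits._
import doobie.util.transactor.Transactor

import com.snowplowanalytics.snowplow.postgres.storage.query

// List every table of the "atomic" schema together with the SchemaKey recovered
// from its comment, or the CommentIssue explaining why the table is ignored.
def inspectComments(xa: Transactor[IO]): IO[Unit] =
  query.getComments("atomic").transact(xa).flatMap { results =>
    results.traverse_ {
      case Right(key)  => IO(println(s"managed: ${key.toSchemaUri}"))
      case Left(issue) => IO(println(s"ignored: ${issue.show}"))
    }
  }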
+ * + * This program is licensed to you under the Apache License Version 2.0, + * and you may not use this file except in compliance with the Apache License Version 2.0. + * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the Apache License Version 2.0 is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. + */ +package com.snowplowanalytics.snowplow.postgres.storage + +import cats.syntax.functor._ + +import doobie.Fragment +import doobie.free.connection.ConnectionIO +import doobie.implicits._ +import org.log4s.getLogger + +import com.snowplowanalytics.iglu.core.{SchemaKey, SchemaMap} + +import com.snowplowanalytics.iglu.schemaddl.StringUtils +import com.snowplowanalytics.iglu.schemaddl.StringUtils.getTableName +import com.snowplowanalytics.iglu.schemaddl.jsonschema.{Pointer, Schema} +import com.snowplowanalytics.iglu.schemaddl.migrations.{FlatSchema, Migration, SchemaList} + +import com.snowplowanalytics.snowplow.postgres.shredding.transform.Atomic +import com.snowplowanalytics.snowplow.postgres.shredding.{Type, schema, transform} +import com.snowplowanalytics.snowplow.postgres.logging.Slf4jLogHandler + +object sql { + + private lazy val logger = Slf4jLogHandler(getLogger) + + val DefaultVarcharSize = 4096 + + /** + * Generate the `CREATE TABLE` DDL statement + * @param schema database schema + * @param entity shredded entity + * @param schemaList state of the schema, i.e. the ordered list of its known versions + * @param meta whether meta columns should be prepended + * @return pure SQL expression with `CREATE TABLE` statement + */ + def createTable(schema: String, entity: SchemaKey, schemaList: SchemaList, meta: Boolean): Fragment = { + val subschemas = FlatSchema.extractProperties(schemaList) + + // Columns derived from schema (no metadata) + val entityColumns = transform.getNameType(subschemas).map { + case (_, columnName, dataType, nullability) => + definitions.columnToString(columnName, dataType, nullability) + } + + val tableName = entity match { + case Atomic => "events" + case other => StringUtils.getTableName(SchemaMap(other)) + } + + val columns = (if (meta) definitions.metaColumns.map((definitions.columnToString _).tupled) else Nil) ++ entityColumns + val table = s"$schema.$tableName" + + Fragment.const(s"CREATE TABLE $table (\n${columns.mkString(",\n")}\n)") + } + + def commentTable(schema: String, tableName: String, schemaKey: SchemaMap): ConnectionIO[Unit] = { + val uri = schemaKey.schemaKey.toSchemaUri + val table = s"$schema.$tableName" + Fragment.const(s"COMMENT ON TABLE $table IS '$uri'").update(logger).run.void + } + + def migrateTable(schema: String, entity: SchemaKey, schemaList: SchemaList) = + schemaList match { + case s: SchemaList.Full => + val migrationList = s.extractSegments.map(Migration.fromSegment) + migrationList.find(_.from == entity.version) match { + case Some(migration) => + val schemaMap = SchemaMap(migration.vendor, migration.name, "jsonschema", migration.to) + val tableName = getTableName(schemaMap) // e.g. 
com_acme_event_1 + val tableNameFull = s"$schema.$tableName" + + if (migration.diff.added.nonEmpty) { + val columns = migration.diff.added.map { + case (pointer, schema) => + buildColumn(DefaultVarcharSize, (pointer, schema)) + } + + // Concatenate the new columns as fragments; interpolating a Fragment into a string would render its toString, not SQL + val columnFragments = columns.foldLeft(Fragment.empty) { (acc, cur) => + val separator = if (acc == Fragment.empty) Fragment.const(" ADD COLUMN ") else Fragment.const(", ADD COLUMN ") + acc ++ separator ++ cur.toFragment + } + + Fragment.const0(s"ALTER TABLE $tableNameFull") ++ columnFragments + } else Fragment.empty + case None => + Fragment.empty // TODO: This should be a warning + } + case _: SchemaList.Single => + Fragment.empty // TODO: This should be a warning + } + + /** + * Generate a single column definition for a new property, used to build `ALTER TABLE` statements + * + * @param varcharSize default size for VARCHAR + * @param pair pair of property name and its Schema properties like + * length, maximum, etc + * @return column representation that can be rendered into a DDL fragment + */ + def buildColumn(varcharSize: Int, pair: (Pointer.SchemaPointer, Schema)): Column = + pair match { + case (pointer, properties) => + val columnName = FlatSchema.getName(pointer) + val dataType = Type.getDataType(properties, varcharSize, columnName, Type.dataTypeSuggestions) + Column(columnName, dataType, schema.canBeNull(properties)) + } + + case class Column(name: String, dataType: Type, nullable: Boolean) { + + /** "column_name VARCHAR(128) NOT NULL" */ + def toFragment: Fragment = + Fragment.const0(s"$name ${dataType.ddl} ${if (nullable) "NULL" else "NOT NULL"}") + } +} diff --git a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/storage/utils.scala b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/storage/utils.scala new file mode 100644 index 0000000..16d50b4 --- /dev/null +++ b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/storage/utils.scala @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2020 Snowplow Analytics Ltd. All rights reserved. + * + * This program is licensed to you under the Apache License Version 2.0, + * and you may not use this file except in compliance with the Apache License Version 2.0. + * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the Apache License Version 2.0 is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. 
+ */ +package com.snowplowanalytics.snowplow.postgres.storage + +import cats.Monad +import cats.implicits._ + +import cats.effect.Sync +import org.log4s.getLogger + +import doobie.ConnectionIO +import doobie.implicits._ +import doobie.util.transactor.Transactor + +import query.tableExists + +object utils { + + private lazy val logger = getLogger + + def prepareEventsTable(schema: String): ConnectionIO[Boolean] = { + val create = ddl.createEventsTable(schema).as(false) + val exists = Monad[ConnectionIO].pure(true) + Monad[ConnectionIO].ifM(tableExists(schema, "events"))(exists, create) + } + + def prepare[F[_]: Sync](schema: String, xa: Transactor[F]): F[Unit] = + prepareEventsTable(schema).transact(xa).flatMap { + case true => Sync[F].delay(logger.info(s"$schema.events table already exists")) + case false => Sync[F].delay(logger.info(s"$schema.events table created")) + } +} diff --git a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/streaming/UnorderedPipe.scala b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/streaming/UnorderedPipe.scala new file mode 100644 index 0000000..72af2f6 --- /dev/null +++ b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/streaming/UnorderedPipe.scala @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2020 Snowplow Analytics Ltd. All rights reserved. + * + * This program is licensed to you under the Apache License Version 2.0, + * and you may not use this file except in compliance with the Apache License Version 2.0. + * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the Apache License Version 2.0 is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. + */ +package com.snowplowanalytics.snowplow.postgres.streaming + +import fs2.Pipe +import cats.effect.Concurrent +import doobie.hikari.HikariTransactor + +/** Evaluates effects, possibly concurrently, and emits the results downstream in any order + */ +trait UnorderedPipe[F[_]] { + def apply[A, B](f: A => F[B]): Pipe[F, A, B] +} + +object UnorderedPipe { + + /** An UnorderedPipe in which results are emitted in the same order as the inputs + * + * Use this UnorderedPipe when a `Concurrent[F]` is not available + */ + def sequential[F[_]]: UnorderedPipe[F] = + new UnorderedPipe[F] { + override def apply[A, B](f: A => F[B]): Pipe[F, A, B] = + _.evalMap(f) + } + + /** An UnorderedPipe that evaluates effects in parallel. + */ + def concurrent[F[_]: Concurrent](maxConcurrent: Int): UnorderedPipe[F] = + new UnorderedPipe[F] { + override def apply[A, B](f: A => F[B]): Pipe[F, A, B] = + _.parEvalMapUnordered(maxConcurrent)(f) + } + + /** A concurrent UnorderedPipe whose parallelism matches the size of the transactor's underlying connection pool. 
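+ * + * A hypothetical usage sketch (not part of this file; `xa`, `entities` and the target schema name are assumed, and `sink.insertStatement` comes from this package): + * {{{ + * val insertPipe = UnorderedPipe.forTransactor(xa) + * entities.through(insertPipe(row => sink.insertStatement("atomic", row).transact(xa))) + * }}} + * Parallelism is capped at the connection pool size, so concurrent inserts cannot exhaust the pool.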
+ * + * Use this UnorderedPipe whenever the effect requires a database connection + */ + def forTransactor[F[_]: Concurrent](xa: HikariTransactor[F]): UnorderedPipe[F] = + concurrent(xa.kernel.getMaximumPoolSize) + +} diff --git a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/streaming/data.scala b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/streaming/data.scala new file mode 100644 index 0000000..8b7e0cf --- /dev/null +++ b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/streaming/data.scala @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2020 Snowplow Analytics Ltd. All rights reserved. + * + * This program is licensed to you under the Apache License Version 2.0, + * and you may not use this file except in compliance with the Apache License Version 2.0. + * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the Apache License Version 2.0 is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. + */ +package com.snowplowanalytics.snowplow.postgres.streaming + +import com.snowplowanalytics.iglu.core.SelfDescribingData + +import io.circe.Json + +import com.snowplowanalytics.snowplow.analytics.scalasdk.Event +import com.snowplowanalytics.snowplow.badrows.BadRow + +object data { + + /** Kind of data flowing through the Loader */ + sealed trait Data extends Product with Serializable { + def snowplow: Boolean = + this match { + case _: Data.Snowplow => true + case _: Data.SelfDescribing => false + } + } + + object Data { + case class Snowplow(data: Event) extends Data + case class SelfDescribing(data: SelfDescribingData[Json]) extends Data + } + + /** Data that for some reasons cannot be inserted into DB */ + sealed trait BadData extends Throwable with Product with Serializable + object BadData { + + /** Typical Snowplow bad row (Loader Iglu Error etc) */ + case class BadEnriched(data: BadRow) extends BadData + + /** Non-enriched error */ + case class BadJson(payload: String, error: String) extends BadData + } +} diff --git a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/streaming/package.scala b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/streaming/package.scala new file mode 100644 index 0000000..9503f2e --- /dev/null +++ b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/streaming/package.scala @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2020 Snowplow Analytics Ltd. All rights reserved. + * + * This program is licensed to you under the Apache License Version 2.0, + * and you may not use this file except in compliance with the Apache License Version 2.0. + * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the Apache License Version 2.0 is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. 
+ */ +package com.snowplowanalytics.snowplow.postgres + +import cats.data.NonEmptyList + +import com.snowplowanalytics.snowplow.badrows.FailureDetails + +package object streaming { + + type IgluErrors = NonEmptyList[FailureDetails.LoaderIgluError] + + object IgluErrors { + def of(error: FailureDetails.LoaderIgluError): NonEmptyList[FailureDetails.LoaderIgluError] = + NonEmptyList.of(error) + } +} diff --git a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/streaming/sink.scala b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/streaming/sink.scala new file mode 100644 index 0000000..c55848c --- /dev/null +++ b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/streaming/sink.scala @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2020 Snowplow Analytics Ltd. All rights reserved. + * + * This program is licensed to you under the Apache License Version 2.0, + * and you may not use this file except in compliance with the Apache License Version 2.0. + * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the Apache License Version 2.0 is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. + */ +package com.snowplowanalytics.snowplow.postgres.streaming + +import cats.data.EitherT +import cats.implicits._ + +import cats.effect.{Clock, ContextShift, Sync} + +import fs2.Pipe + +import doobie._ +import doobie.implicits._ + +import io.circe.Json +import org.log4s.getLogger + +import com.snowplowanalytics.iglu.core.circe.implicits._ + +import com.snowplowanalytics.iglu.client.Client + +import com.snowplowanalytics.snowplow.badrows.{BadRow, Payload, Processor} +import com.snowplowanalytics.snowplow.postgres.api.{DB, State} +import com.snowplowanalytics.snowplow.postgres.shredding.{Entity, transform} +import com.snowplowanalytics.snowplow.postgres.streaming.data.{BadData, Data} +import com.snowplowanalytics.snowplow.postgres.logging.Slf4jLogHandler + +object sink { + + private lazy val logger = getLogger + private lazy val logHandler = Slf4jLogHandler(logger) + + type Insert = ConnectionIO[Unit] + + /** + * Sink good events into Postgres. During sinking, payloads go through all transformation steps + * and are checked against the state of the DB itself. + * Events that could not be transformed (due to Iglu errors or DB unavailability) are emitted from + * the pipe + * @param unorderedPipe pipe which may optimise throughput by processing events concurrently + * @param state mutable Loader state + * @param client Iglu Client + * @param processor The actor processing these events + */ + def goodSink[F[_]: Sync: Clock: DB](unorderedPipe: UnorderedPipe[F], + state: State[F], + client: Client[F, Json], + processor: Processor + ): Pipe[F, Data, BadData] = + unorderedPipe(sinkPayload(state, client, processor)).andThen { + _.collect { + case Left(badData) => badData + } + } + + /** Sink bad data coming directly into the `Pipe` */ + def badSink[F[_]: Sync: ContextShift]: Pipe[F, BadData, Unit] = + _.evalMap { + case BadData.BadEnriched(row) => Sync[F].delay(logger.warn(row.compact)) + case BadData.BadJson(payload, error) => Sync[F].delay(logger.warn(s"Cannot parse $payload. 
$error")) + } + + /** Implementation for [[goodSink]] */ + def sinkPayload[F[_]: Sync: Clock: DB](state: State[F], client: Client[F, Json], processor: Processor)( + payload: Data + ): F[Either[BadData, Unit]] = { + val result = for { + entities <- payload match { + case Data.Snowplow(event) => + transform.shredEvent[F](client, processor, event).leftMap(bad => BadData.BadEnriched(bad)) + case Data.SelfDescribing(json) => + transform.shredJson(client)(json).leftMap(errors => BadData.BadJson(json.normalize.noSpaces, errors.toString)) + } + insert <- EitherT(DB.process(entities, state).attempt).leftMap { + case error => + payload match { + case Data.Snowplow(event) => + val badRow = BadRow.LoaderRuntimeError(processor, error.getMessage, Payload.LoaderPayload(event)) + BadData.BadEnriched(badRow) + case Data.SelfDescribing(json) => + BadData.BadJson(json.normalize.noSpaces, s"Cannot insert: ${error.getMessage}") + } + } + } yield insert + + result.value + } + + /** + * Build an `INSERT` action for a single entity + * Multiple inserts later can be combined into a transaction + */ + def insertStatement(schema: String, row: Entity): Insert = { + val length = row.columns.length + + val columns = Fragment.const0(row.columns.map(c => s"""\"${c.name}\"""").mkString(",")) + + val table = Fragment.const0(s"$schema.${row.tableName}") + val values = row.columns.zipWithIndex.foldLeft(fr0"") { + case (acc, (cur, i)) if i < length - 1 => acc ++ cur.value.fragment ++ fr0"," + case (acc, (cur, _)) => acc ++ cur.value.fragment + } + + fr"""INSERT INTO $table ($columns) VALUES ($values)""".update(logHandler).run.void + } + +} diff --git a/modules/common/src/test/resources/logback-test.xml b/modules/common/src/test/resources/logback-test.xml new file mode 100644 index 0000000..e47ca01 --- /dev/null +++ b/modules/common/src/test/resources/logback-test.xml @@ -0,0 +1,19 @@ + + + + true + + [%thread] %highlight(%-5level) %cyan(%logger{30}) - %msg %n + + + + + + + + + diff --git a/modules/common/src/test/scala/com/snowplowanalytics/snowplow/postgres/Database.scala b/modules/common/src/test/scala/com/snowplowanalytics/snowplow/postgres/Database.scala new file mode 100644 index 0000000..13a79ef --- /dev/null +++ b/modules/common/src/test/scala/com/snowplowanalytics/snowplow/postgres/Database.scala @@ -0,0 +1,93 @@ +package com.snowplowanalytics.snowplow.postgres + +import java.net.URI +import java.util.UUID + +import cats.data.EitherT +import cats.implicits._ + +import cats.effect.{Clock, ContextShift, IO} + +import org.specs2.mutable.Specification +import org.specs2.specification.BeforeAfterEach +import doobie._ +import doobie.implicits._ +import doobie.postgres.implicits._ + +import io.circe.Json + +import com.snowplowanalytics.iglu.client.Client +import com.snowplowanalytics.iglu.client.resolver.Resolver +import com.snowplowanalytics.iglu.client.resolver.registries.Registry +import com.snowplowanalytics.iglu.client.resolver.registries.Registry.{Config, Http, HttpConnection} +import com.snowplowanalytics.iglu.client.validator.CirceValidator + +import com.snowplowanalytics.snowplow.badrows.FailureDetails +import com.snowplowanalytics.snowplow.postgres.config.DBConfig.JdbcUri + +trait Database extends Specification with BeforeAfterEach { + import Database._ + + implicit val ioClock: Clock[IO] = Clock.create[IO] + + def before = + (dropAll *> storage.utils.prepare[IO](Schema, xa)).unsafeRunSync() + + def after = + dropAll.unsafeRunSync() + + sequential + +} + +object Database { + + val Schema = "public" + + implicit val 
CS: ContextShift[IO] = IO.contextShift(concurrent.ExecutionContext.global) + + val jdbcUri = JdbcUri("localhost", 5432, "snowplow", "allow") + val registry = Http(Config("localhost registry", 1, Nil), HttpConnection(URI.create("http://localhost:8080/api/"), None)) + val igluClient = Client[IO, Json](Resolver(List(Registry.IgluCentral, registry), None), CirceValidator) + val xa: Transactor[IO] = resources.getTransactorDefault[IO](jdbcUri, "postgres", "mysecretpassword") + + case class ColumnInfo(columnName: String, + columnDefault: Option[String], + isNullable: Boolean, + dataType: String, + characterMaximumLength: Option[Int] + ) + + def query: IO[List[UUID]] = + fr"SELECT event_id FROM events".query[UUID].to[List].transact(xa) + + def count(table: String): IO[Int] = + (fr"SELECT count(*) FROM " ++ Fragment.const(table)).query[Int].unique.transact(xa) + + def describeTable(tableName: String) = + sql"""SELECT column_name::VARCHAR, + column_default::VARCHAR, + is_nullable::BOOLEAN, + data_type::VARCHAR, + character_maximum_length::INTEGER + FROM information_schema.columns + WHERE table_name = $tableName""" + .query[(String, Option[String], Boolean, String, Option[Int])] + .map(ColumnInfo.tupled) + .to[List] + .transact(xa) + + def dropAll: IO[Unit] = { + val schemaFr = Fragment.const(Schema) + List( + fr"DROP SCHEMA $schemaFr CASCADE;", + fr"CREATE SCHEMA $schemaFr;", + fr"GRANT ALL ON SCHEMA public TO postgres;", + fr"GRANT ALL ON SCHEMA public TO $schemaFr;" + ).map(_.update.run).traverse_(_.transact(xa).void) + } + + implicit class ActionOps[A](io: IO[A]) { + def action = EitherT.liftF[IO, FailureDetails.LoaderIgluError, A](io) + } +} diff --git a/modules/common/src/test/scala/com/snowplowanalytics/snowplow/postgres/api/SchemaStateSpec.scala b/modules/common/src/test/scala/com/snowplowanalytics/snowplow/postgres/api/SchemaStateSpec.scala new file mode 100644 index 0000000..50b9ba2 --- /dev/null +++ b/modules/common/src/test/scala/com/snowplowanalytics/snowplow/postgres/api/SchemaStateSpec.scala @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2020 Snowplow Analytics Ltd. All rights reserved. + * + * This program is licensed to you under the Apache License Version 2.0, + * and you may not use this file except in compliance with the Apache License Version 2.0. + * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the Apache License Version 2.0 is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. 
+ */ +package com.snowplowanalytics.snowplow.postgres.api + +import cats.data.EitherT + +import cats.effect.IO + +import com.snowplowanalytics.iglu.core.{SchemaKey, SelfDescribingSchema, SchemaVer, SchemaMap, SchemaList => CoreSchemaList} + +import com.snowplowanalytics.iglu.schemaddl.IgluSchema +import com.snowplowanalytics.iglu.schemaddl.jsonschema.Schema +import com.snowplowanalytics.iglu.schemaddl.migrations.SchemaList + +import com.snowplowanalytics.snowplow.postgres.Database + +class SchemaStateSpec extends Database { + "init" should { + "initialize an empty state if no tables exist" >> { + val state = SchemaState.init(List(), Database.igluClient.resolver) + val result = state.semiflatMap(_.get).value.unsafeRunSync() + val expected = SchemaState(Map()) + result must beRight(expected) + } + } + + "check" should { + "confirm table exists with a same key as in state" >> { + val key = SchemaKey("com.acme", "event", "jsonschema", SchemaVer.Full(1, 0, 0)) + val schemaList = SchemaStateSpec.buildSchemaList(List(key)) + + val init = Map(("com.acme", "event", 1) -> schemaList) + val state = SchemaState(init) + state.check(key) must beEqualTo(TableState.Match) + } + + "claim table is outdated for 1-0-1 key if only 1-0-0 is known" >> { + val key = SchemaKey("com.acme", "event", "jsonschema", SchemaVer.Full(1, 0, 0)) + val schemaList = SchemaStateSpec.buildSchemaList(List(key)) + + val init = Map(("com.acme", "event", 1) -> schemaList) + val state = SchemaState(init) + state.check(SchemaKey("com.acme", "event", "jsonschema", SchemaVer.Full(1, 0, 1))) must beEqualTo(TableState.Outdated) + } + + "claim table is missing for bumped model" >> { + val key = SchemaKey("com.acme", "event", "jsonschema", SchemaVer.Full(1, 0, 0)) + val schemaList = SchemaStateSpec.buildSchemaList(List(key)) + + val init = Map(("com.acme", "event", 1) -> schemaList) + val state = SchemaState(init) + state.check(SchemaKey("com.acme", "event", "jsonschema", SchemaVer.Full(2, 0, 0))) must beEqualTo(TableState.Missing) + } + + "always assume events table exists" >> { + val atomic = SchemaKey("com.snowplowanalytics.snowplow", "atomic", "jsonschema", SchemaVer.Full(1, 0, 0)) + val state = SchemaState(Map()) + state.check(atomic) must beEqualTo(TableState.Match) + } + } +} + +object SchemaStateSpec { + + val fetch: SchemaKey => EitherT[IO, String, IgluSchema] = + key => EitherT.pure[IO, String](SelfDescribingSchema(SchemaMap(key), Schema.empty)) + + /** Bypass the `SchemaList` construction boilerplate */ + def buildSchemaList(keys: List[SchemaKey]): SchemaList = { + val coreSchemaList = CoreSchemaList.parseUnsafe(keys) + SchemaList.fromSchemaList(coreSchemaList, fetch).value.unsafeRunSync().getOrElse(throw new IllegalStateException) + } +} diff --git a/modules/common/src/test/scala/com/snowplowanalytics/snowplow/postgres/api/StateSpec.scala b/modules/common/src/test/scala/com/snowplowanalytics/snowplow/postgres/api/StateSpec.scala new file mode 100644 index 0000000..b07c6ee --- /dev/null +++ b/modules/common/src/test/scala/com/snowplowanalytics/snowplow/postgres/api/StateSpec.scala @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2020 Snowplow Analytics Ltd. All rights reserved. + * + * This program is licensed to you under the Apache License Version 2.0, + * and you may not use this file except in compliance with the Apache License Version 2.0. + * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. 
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the Apache License Version 2.0 is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. + */ +package com.snowplowanalytics.snowplow.postgres.api + +import java.util.concurrent.TimeUnit + +import concurrent.duration._ + +import cats.implicits._ + +import cats.effect.concurrent.Ref +import cats.effect.{Clock, IO, Timer} + +import com.snowplowanalytics.iglu.core.{SchemaKey, SchemaVer} + +import org.specs2.ScalaCheck +import org.specs2.mutable.Specification +import com.snowplowanalytics.snowplow.postgres.Database.{CS, igluClient} +import com.snowplowanalytics.snowplow.postgres.api.DB.StateCheck +import com.snowplowanalytics.snowplow.postgres.api.StateSpec._ + +import org.scalacheck.{Gen, Prop} +import org.specs2.scalacheck.Parameters + +class StateSpec extends Specification with ScalaCheck { + "checkAndRun" should { + "execute `mutate` when StateCheck is Block" >> { + val key = SchemaKey("com.acme", "missing_table", "jsonschema", SchemaVer.Full(1, 0, 0)) + val alwaysEmpty: SchemaState => StateCheck = + _ => StateCheck.Block(Set(key), Set.empty) + + val result = for { + state <- initState + db <- Ref.of[IO, List[Int]](List.empty) + keys <- Ref.of[IO, Set[SchemaKey]](Set.empty) + _ <- state.checkAndRun(alwaysEmpty, IO.sleep(100.millis) *> db.update(s => 1 :: s), (m, _) => keys.update(_ ++ m)) + res <- (db.get, keys.get).tupled + } yield res + + result.unsafeRunSync() must beEqualTo((List(1), Set(key))) + } + + "not execute `mutate` when StateCheck is Ok" >> { + val alwaysOk: SchemaState => StateCheck = + _ => StateCheck.Ok + + val result = for { + state <- initState + db <- Ref.of[IO, List[Int]](List.empty) + keys <- Ref.of[IO, Set[SchemaKey]](Set.empty) + _ <- state.checkAndRun(alwaysOk, IO.sleep(100.millis) *> db.update(s => 1 :: s), (m, _) => keys.update(_ ++ m)) + res <- (db.get, keys.get).tupled + } yield res + + result.unsafeRunSync() must beEqualTo((List(1), Set())) + } + + "execute locked calls one after another" >> { + val key = SchemaKey("com.acme", "missing_table", "jsonschema", SchemaVer.Full(1, 0, 0)) + val alwaysEmpty: SchemaState => StateCheck = + _ => StateCheck.Block(Set(key), Set.empty) + + Prop + .forAll(durationsGen) { durations => + val checks = for { + state <- initState + db <- Ref.of[IO, Int](0) + keys <- Ref.of[IO, Set[SchemaKey]](Set.empty) + _ <- durations.parTraverse_(d => state.checkAndRun(alwaysEmpty, db.update(_ + 1), (m, _) => IO.sleep(d) *> keys.update(_ ++ m))) + res <- (db.get, keys.get).tupled + } yield res + val result = measure(checks) + + result.unsafeRunSync() must beLike { + case ((counter, keys), time) => + val totalDelays = durations.foldMap(_.toMillis) + val allBlocking = time must beBetween(totalDelays, totalDelays * 2) + allBlocking.and(counter must beEqualTo(durations.length)).and(keys must beEqualTo(Set(key))) + } + } + .setParameters(Parameters(minTestsOk = 5, maxSize = 10)) + } + + "execute non-locked calls in parallel" >> { // Potentially flaky test + val alwaysEmpty: SchemaState => StateCheck = + _ => StateCheck.Ok + + Prop + .forAll(durationsGen) { durations => + val checks = for { + state <- initState + db <- Ref.of[IO, Int](0) + keys <- Ref.of[IO, Set[SchemaKey]](Set.empty) + _ <- durations.parTraverse_(d => state.checkAndRun(alwaysEmpty, IO.sleep(d) *> db.update(_ + 1), (m, 
_) => keys.update(_ ++ m))) + res <- (db.get, keys.get).tupled + } yield res + val result = measure(checks) + + result.unsafeRunSync() must beLike { + case ((counter, keys), time) => + val maxDelay = durations.fold(5.millis)((a, b) => a.max(b)).toMillis + val nonBlocking = time must lessThan(maxDelay * 2) + nonBlocking.and(counter must beEqualTo(durations.length)).and(keys must beEqualTo(Set())) + } + } + .setParameters(Parameters(minTestsOk = 5, maxSize = 10)) + } + } +} + +object StateSpec { + implicit val C: Clock[IO] = Clock.create[IO] + implicit val T: Timer[IO] = IO.timer(concurrent.ExecutionContext.global) + + val initState = State + .init[IO](List.empty, igluClient.resolver) + .value + .flatMap(_.fold(_ => IO.raiseError[State[IO]](new IllegalStateException("Cannot start a test")), IO.pure)) + + val durationsGen = for { + size <- Gen.chooseNum(2, 20) + delay = Gen.chooseNum(5, 300) + durations <- Gen.listOfN(size, delay) + } yield durations.map(_.millis) + + def measure[A](action: IO[A]): IO[(A, Long)] = + for { + start <- C.realTime(TimeUnit.MILLISECONDS) + a <- action + end <- C.realTime(TimeUnit.MILLISECONDS) + } yield (a, end - start) +} diff --git a/modules/common/src/test/scala/com/snowplowanalytics/snowplow/postgres/queryspec.scala b/modules/common/src/test/scala/com/snowplowanalytics/snowplow/postgres/queryspec.scala new file mode 100644 index 0000000..3ea809d --- /dev/null +++ b/modules/common/src/test/scala/com/snowplowanalytics/snowplow/postgres/queryspec.scala @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2020 Snowplow Analytics Ltd. All rights reserved. + * + * This program is licensed to you under the Apache License Version 2.0, + * and you may not use this file except in compliance with the Apache License Version 2.0. + * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the Apache License Version 2.0 is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. 
+ */ +package com.snowplowanalytics.snowplow.postgres + +import doobie.implicits._ + +import com.snowplowanalytics.snowplow.postgres.storage.query + +class queryspec extends Database { + "listTables" should { + "return single events table (after prepare executed)" >> { + val expected = List("events") + val result = query.listTables("public").transact(Database.xa).unsafeRunSync() + + result must beEqualTo(expected) + } + + "return no tables (prepare executed only for 'public')" >> { + val expected = List() + val result = query.listTables("empty").transact(Database.xa).unsafeRunSync() + + result must beEqualTo(expected) + } + } + + "tableExists" should { + "return false if table does not exist" >> { + val expected = false + val result = query.tableExists("empty", "non-existent").transact(Database.xa).unsafeRunSync() + + result must beEqualTo(expected) + } + + "return true if table exists (created by Database.before)" >> { + val expected = true + val result = query.tableExists(Database.Schema, "events").transact(Database.xa).unsafeRunSync() + + result must beEqualTo(expected) + } + } + + "getComments" should { + "not fail if schema does not exist" >> { + val expected = List() + val result = query.getComments("empty").transact(Database.xa).unsafeRunSync() + result must beEqualTo(expected) + } + } +} diff --git a/modules/common/src/test/scala/com/snowplowanalytics/snowplow/postgres/streaming/sinkspec.scala b/modules/common/src/test/scala/com/snowplowanalytics/snowplow/postgres/streaming/sinkspec.scala new file mode 100644 index 0000000..44db0f3 --- /dev/null +++ b/modules/common/src/test/scala/com/snowplowanalytics/snowplow/postgres/streaming/sinkspec.scala @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2020 Snowplow Analytics Ltd. All rights reserved. + * + * This program is licensed to you under the Apache License Version 2.0, + * and you may not use this file except in compliance with the Apache License Version 2.0. + * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the Apache License Version 2.0 is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. 
+ */ +package com.snowplowanalytics.snowplow.postgres.streaming + +import java.util.UUID + +import cats.effect.IO + +import fs2.Stream + +import io.circe.Json +import io.circe.literal._ + +import com.snowplowanalytics.iglu.core.SelfDescribingData +import com.snowplowanalytics.iglu.core.circe.implicits._ + +import com.snowplowanalytics.snowplow.analytics.scalasdk.Event + +import com.snowplowanalytics.snowplow.badrows.Processor +import com.snowplowanalytics.snowplow.postgres.Database +import com.snowplowanalytics.snowplow.postgres.api.{DB, State} +import com.snowplowanalytics.snowplow.postgres.streaming.data.Data + +class sinkspec extends Database { + import Database._ + + val processor = Processor("pgloader", "test") + val unorderedPipe = UnorderedPipe.concurrent[IO](5) + + "goodSink" should { + "sink a single good event" >> { + val line = + "snowplowweb\tweb\t2018-12-18 15:07:17.970\t2016-03-29 07:28:18.611\t2016-03-29 07:28:18.634\tpage_view\t11cdec7b-4cbd-4aa4-a4c9-3874ab9663d4\t\tsnplow6\tjs-2.6.0\tssc-0.6.0-kinesis\tspark-1.16.0-common-0.35.0\t34df2c48bc170c87befb441732a94196\t372d1f2983860eefd262b58e6592dfbc\t80546dc70f4a91f1283c4b6247e31bcf\t26e6412a2421eb923d9d40258ca9ca69\t1\t3a12e8b8e3e91a4d092b833d583c7e30\tDK\t82\tOdder\t8300\t42.0001\t42.003\tCentral Jutland\tTDC Danmark\tTDC Danmark\t\t\thttp://snowplowanalytics.com/documentation/recipes/catalog-analytics/market-basket-analysis-identifying-products-that-sell-well-together.html\tMarket basket analysis - identifying products and content that go well together – Snowplow\thttp://snowplowanalytics.com/analytics/catalog-analytics/market-basket-analysis-identifying-products-that-sell-well-together.html\thttp\tsnowplowanalytics.com\t80\t/documentation/recipes/catalog-analytics/market-basket-analysis-identifying-products-that-sell-well-together.html\t\t\thttp\tsnowplowanalytics.com\t80\t/analytics/catalog-analytics/market-basket-analysis-identifying-products-that-sell-well-together.html\t\t\tinternal\t\t\t\t\t\t\t\t{\"schema\":\"iglu:com.snowplowanalytics.snowplow/contexts/jsonschema/1-0-0\",\"data\":[{\"schema\":\"iglu:com.snowplowanalytics.snowplow/web_page/jsonschema/1-0-0\",\"data\":{\"id\":\"05862d26-0dde-4d7a-a494-fc9aae283d23\"}},{\"schema\":\"iglu:org.schema/WebPage/jsonschema/1-0-0\",\"data\":{\"genre\":\"documentation\",\"inLanguage\":\"en-US\"}},{\"schema\":\"iglu:org.w3/PerformanceTiming/jsonschema/1-0-0\",\"data\":{\"navigationStart\":1459236496534,\"unloadEventStart\":1459236496838,\"unloadEventEnd\":1459236496838,\"redirectStart\":0,\"redirectEnd\":0,\"fetchStart\":1459236496534,\"domainLookupStart\":1459236496534,\"domainLookupEnd\":1459236496534,\"connectStart\":1459236496534,\"connectEnd\":1459236496534,\"secureConnectionStart\":0,\"requestStart\":1459236496580,\"responseStart\":1459236496834,\"responseEnd\":1459236496844,\"domLoading\":1459236496853,\"domInteractive\":1459236497780,\"domContentLoadedEventStart\":1459236497780,\"domContentLoadedEventEnd\":1459236498038,\"domComplete\":0,\"loadEventStart\":0,\"loadEventEnd\":0,\"chromeFirstPaint\":1459236498203}}]}\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tMozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36\tChrome 49\tChrome\t49.0.2623.87\tBrowser\tWEBKIT\ten-US\t1\t1\t0\t0\t0\t0\t0\t0\t0\t1\t24\t1920\t1075\tWindows 7\tWindows\tMicrosoft Corporation\tEurope/Berlin\tComputer\t0\t1920\t1200\tUTF-8\t1903\t11214\t\t\t\t\t\t\t\tEurope/Copenhagen\t\t\t\t2016-03-29 
07:28:18.636\t\t\t{\"schema\":\"iglu:com.snowplowanalytics.snowplow/contexts/jsonschema/1-0-1\",\"data\":[{\"schema\":\"iglu:com.snowplowanalytics.snowplow/ua_parser_context/jsonschema/1-0-0\",\"data\":{\"useragentFamily\":\"Chrome\",\"useragentMajor\":\"49\",\"useragentMinor\":\"0\",\"useragentPatch\":\"2623\",\"useragentVersion\":\"Chrome 49.0.2623\",\"osFamily\":\"Windows\",\"osMajor\":\"7\",\"osMinor\":null,\"osPatch\":null,\"osPatchMinor\":null,\"osVersion\":\"Windows 7\",\"deviceFamily\":\"Other\"}}]}\t88c23330-ac4d-4c82-8a18-aa83c1e0c163\t2016-03-29 07:28:18.609\tcom.snowplowanalytics.snowplow\tpage_view\tjsonschema\t1-0-0\tcab5ba164038f31d8e10befc4eb199df\t" + val event = Event.parse(line).getOrElse(throw new RuntimeException("Event is invalid")) + val stream = Stream.emit[IO, Data](Data.Snowplow(event)) + + implicit val D = DB.interpreter[IO](igluClient.resolver, xa, Schema) + + val action = for { + state <- State.init[IO](List(), igluClient.resolver) + _ <- stream.through(sink.goodSink(unorderedPipe, state, igluClient, processor)).compile.drain.action + eventIds <- query.action + uaParserCtxs <- count("com_snowplowanalytics_snowplow_ua_parser_context_1").action + } yield (eventIds, uaParserCtxs) + + val result = action.value.unsafeRunSync() + val ExpectedEventId = UUID.fromString("11cdec7b-4cbd-4aa4-a4c9-3874ab9663d4") + result must beRight.like { + case (List(ExpectedEventId), 1) => ok + case (ids, ctxs) => ko(s"Unexpected result. Event ids: $ids; Contexts: $ctxs") + } + } + + "sink a single self-describing JSON" >> { + val row = json"""{"schema":"iglu:com.getvero/bounced/jsonschema/1-0-0","data":{"bounce_type":"one"}}""" + val json = SelfDescribingData.parse(row).getOrElse(throw new RuntimeException("Invalid SelfDescribingData")) + val stream = Stream.emit[IO, Data](Data.SelfDescribing(json)) + + implicit val D = DB.interpreter[IO](igluClient.resolver, xa, Schema) + + val action = for { + state <- State.init[IO](List(), igluClient.resolver) + _ <- stream.through(sink.goodSink(unorderedPipe, state, igluClient, processor)).compile.drain.action + eventIds <- query.action + rows <- count("com_getvero_bounced_1").action + } yield (eventIds, rows) + + val result = action.value.unsafeRunSync() + result must beRight.like { + case (Nil, 1) => ok + case (ids, ctxs) => ko(s"Unexpected result. 
Event ids: ${ids.mkString(", ")}; Contexts: $ctxs") + } + } + + "sink a several self-describing JSONs with migrations" >> { + val rows = List( + json"""{"schema":"iglu:me.chuwy/pg-test/jsonschema/1-0-0","data":{"requiredString":"one","requiredUnion":false,"nested":{"a": 1}}}""", + json"""{"schema":"iglu:me.chuwy/pg-test/jsonschema/1-0-1","data":{"requiredString":"two", "requiredUnion": false, "nested": {"a": 2}, "someArray": [2,"two",{}]}}""", + json"""{"schema":"iglu:me.chuwy/pg-test/jsonschema/1-0-2","data":{"requiredString":"three","requiredUnion":"three","nested":{"a": 3},"bigInt": 3}}""" + ).map(SelfDescribingData.parse[Json]) + .map(_.getOrElse(throw new RuntimeException("Invalid SelfDescribingData"))) + .map(Data.SelfDescribing.apply) + + val stream = Stream.emits[IO, Data](rows) + + val ExpectedColumnInfo = List( + ColumnInfo("required_string", None, false, "character varying", Some(4096)), + ColumnInfo("required_union", None, false, "jsonb", None), + ColumnInfo("id", None, true, "uuid", None), + ColumnInfo("nested.a", None, true, "double precision", None), + ColumnInfo("nested.b", None, true, "character varying", Some(4096)), + ColumnInfo("some_array", None, true, "jsonb", None), + ColumnInfo("nested.c", None, true, "bigint", None), + ColumnInfo("some_date", None, true, "timestamp without time zone", None), + ColumnInfo("big_int", None, true, "bigint", None) + ) + + implicit val D = DB.interpreter[IO](igluClient.resolver, xa, Schema) + + val action = for { + state <- State.init[IO](List(), igluClient.resolver) + _ <- stream.through(sink.goodSink(unorderedPipe, state, igluClient, processor)).compile.drain.action + rows <- count("me_chuwy_pg_test_1").action + table <- describeTable("me_chuwy_pg_test_1").action + } yield (rows, table) + + val result = action.value.unsafeRunSync() + result must beRight.like { + case (3, ExpectedColumnInfo) => ok + case (ctxs, cols) => ko(s"Unexpected result. Number of rows: $ctxs; Columns ${cols}") + } + } + } +} diff --git a/modules/loader/src/main/resources/logback.xml b/modules/loader/src/main/resources/logback.xml new file mode 100644 index 0000000..9dd924c --- /dev/null +++ b/modules/loader/src/main/resources/logback.xml @@ -0,0 +1,28 @@ + + + + true + + [%thread] %highlight(%-5level) %cyan(%logger{30}) - %msg %n + + + + + + + + + + + + + + + + + + diff --git a/modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/config/Cli.scala b/modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/config/Cli.scala new file mode 100644 index 0000000..30a653f --- /dev/null +++ b/modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/config/Cli.scala @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2020 Snowplow Analytics Ltd. All rights reserved. + * + * This program is licensed to you under the Apache License Version 2.0, + * and you may not use this file except in compliance with the Apache License Version 2.0. + * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the Apache License Version 2.0 is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. 
+ */ +package com.snowplowanalytics.snowplow.postgres.config + +import java.nio.file.{Files, InvalidPathException, Path, Paths} +import java.util.Base64 + +import cats.data.{EitherT, ValidatedNel} +import cats.implicits._ + +import cats.effect.{Clock, Sync} + +import io.circe.Json +import io.circe.syntax._ +import io.circe.parser.{parse => jsonParse} + +import com.snowplowanalytics.iglu.core.SelfDescribingData +import com.snowplowanalytics.iglu.core.circe.implicits._ + +import com.snowplowanalytics.iglu.client.Client + +import com.snowplowanalytics.snowplow.badrows.Processor + +import com.monovore.decline._ + +import com.snowplowanalytics.snowplow.postgres.generated.BuildInfo + +case class Cli[F[_]](config: LoaderConfig, iglu: Client[F, Json]) + +object Cli { + + val processor = Processor(BuildInfo.name, BuildInfo.version) + + /** Parse list of arguments, validate against schema and initialize */ + def parse[F[_]: Sync: Clock](args: List[String]): EitherT[F, String, Cli[F]] = + command.parse(args) match { + case Left(help) => EitherT.leftT[F, Cli[F]](help.show) + case Right(rawConfig) => fromRawConfig(rawConfig) + } + + private def fromRawConfig[F[_]: Sync: Clock](rawConfig: RawConfig): EitherT[F, String, Cli[F]] = + for { + resolverJson <- PathOrJson.load(rawConfig.resolver) + igluClient <- Client.parseDefault[F](resolverJson).leftMap(_.show) + configJson <- PathOrJson.load(rawConfig.config) + configData <- + SelfDescribingData + .parse(configJson) + .leftMap(e => s"Configuration JSON is not self-describing, ${e.message(configJson.noSpaces)}") + .toEitherT[F] + _ <- igluClient.check(configData).leftMap(e => s"Iglu validation failed with following error\n: ${e.asJson.spaces2}") + appConfig <- configData.data.as[LoaderConfig].toEitherT[F].leftMap(e => s"Error while decoding configuration JSON, ${e.show}") + } yield Cli(appConfig, igluClient) + + /** Config files for Loader can be passed either as FS path + * or as base64-encoded JSON (if `--base64` is provided) */ + type PathOrJson = Either[Path, Json] + + object PathOrJson { + def parse(string: String, encoded: Boolean): ValidatedNel[String, PathOrJson] = { + val result = + if (encoded) + Either + .catchOnly[IllegalArgumentException](new String(Base64.getDecoder.decode(string))) + .leftMap(_.getMessage) + .flatMap(s => jsonParse(s).leftMap(_.show)) + .map(_.asRight) + else Either.catchOnly[InvalidPathException](Paths.get(string).asLeft).leftMap(_.getMessage) + result.leftMap(error => s"Cannot parse as ${if (encoded) "base64-encoded JSON" else "FS path"}: $error").toValidatedNel + } + + def load[F[_]: Sync](value: PathOrJson): EitherT[F, String, Json] = + value match { + case Right(json) => + EitherT.rightT[F, String](json) + case Left(path) => + Either + .catchNonFatal(new String(Files.readAllBytes(path))) + .leftMap(e => s"Cannot read the file path: $e") + .flatMap(s => jsonParse(s).leftMap(_.show)) + .toEitherT[F] + } + } + + val resolver = Opts.option[String]( + long = "resolver", + help = "Iglu Resolver JSON config, FS path or base64-encoded" + ) + + val config = Opts.option[String]( + long = "config", + help = "Self-describing JSON configuration" + ) + + val base64 = Opts + .flag( + long = "base64", + help = "Configuration passed as Base64-encoded string, not as file path" + ) + .orFalse + + /** Temporary, pure config */ + private case class RawConfig(config: PathOrJson, resolver: PathOrJson) + + private val command: Command[RawConfig] = + Command[(String, String, Boolean)](BuildInfo.name, BuildInfo.version)((config, resolver, 
base64).tupled).mapValidated { + case (cfg, res, enc) => + (PathOrJson.parse(cfg, enc), PathOrJson.parse(res, enc)).mapN(RawConfig.apply) + } +} diff --git a/modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/config/LoaderConfig.scala b/modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/config/LoaderConfig.scala new file mode 100644 index 0000000..0bfdd89 --- /dev/null +++ b/modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/config/LoaderConfig.scala @@ -0,0 +1,125 @@ +/* + * Copyright (c) 2020 Snowplow Analytics Ltd. All rights reserved. + * + * This program is licensed to you under the Apache License Version 2.0, + * and you may not use this file except in compliance with the Apache License Version 2.0. + * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the Apache License Version 2.0 is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. + */ +package com.snowplowanalytics.snowplow.postgres.config + +import java.util.{Date, UUID} +import java.time.Instant + +import scala.jdk.CollectionConverters._ + +import cats.syntax.either._ + +import io.circe.Decoder +import io.circe.generic.semiauto.deriveDecoder +import io.circe.generic.extras.Configuration +import io.circe.generic.extras.semiauto.deriveConfiguredDecoder + +import LoaderConfig.{Purpose, Source} + +import software.amazon.awssdk.regions.Region +import software.amazon.kinesis.common.InitialPositionInStream + +case class LoaderConfig(name: String, + id: UUID, + source: Source, + host: String, + port: Int, + database: String, + username: String, + password: String, // TODO: can be EC2 store + sslMode: String, + schema: String, + purpose: Purpose +) { + def getDBConfig: DBConfig = + DBConfig(host, port, database, username, password, sslMode, schema) +} + +object LoaderConfig { + + implicit val awsRegionDecoder: Decoder[Region] = + Decoder.decodeString.emap { s => + val allRegions = Region.regions().asScala.toSet.map((r: Region) => r.id()) + if (allRegions.contains(s)) Region.of(s).asRight + else s"Region $s is unknown, choose from [${allRegions.mkString(", ")}]".asLeft + } + + sealed trait InitPosition { + + /** Turn it into fs2-aws-compatible structure */ + def unwrap: Either[InitialPositionInStream, Date] = + this match { + case InitPosition.Latest => InitialPositionInStream.LATEST.asLeft + case InitPosition.TrimHorizon => InitialPositionInStream.TRIM_HORIZON.asLeft + case InitPosition.AtTimestamp(date) => Date.from(date).asRight + } + } + object InitPosition { + case object Latest extends InitPosition + case object TrimHorizon extends InitPosition + case class AtTimestamp(timestamp: Instant) extends InitPosition + + implicit val ioCirceInitPositionDecoder: Decoder[InitPosition] = + Decoder.decodeJson.emap { json => + json.asString match { + case Some("TRIM_HORIZON") => TrimHorizon.asRight + case Some("LATEST") => Latest.asRight + case Some(other) => + s"Initial position $other is unknown. Choose from LATEST and TRIM_HORIZON. 
AT_TIMESTAMP must provide the timestamp".asLeft + case None => + val result = for { + root <- json.asObject.map(_.toMap) + atTimestamp <- root.get("AT_TIMESTAMP") + atTimestampObj <- atTimestamp.asObject.map(_.toMap) + timestampStr <- atTimestampObj.get("timestamp") + timestamp <- timestampStr.as[Instant].toOption + } yield AtTimestamp(timestamp) + result match { + case Some(atTimestamp) => atTimestamp.asRight + case None => + "Initial position can be either LATEST or TRIM_HORIZON string or AT_TIMESTAMP object (e.g. 2020-06-03T00:00:00Z)".asLeft + } + } + } + } + + sealed trait Purpose extends Product with Serializable + object Purpose { + case object Enriched extends Purpose + case object SelfDescribing extends Purpose + + implicit def ioCirceConfigPurposeDecoder: Decoder[Purpose] = + Decoder.decodeString.emap { + case "ENRICHED_EVENTS" => Enriched.asRight + case "JSON" => SelfDescribing.asRight + case other => s"$other is not supported purpose, choose from ENRICHED_EVENTS and JSON".asLeft + } + } + + sealed trait Source extends Product with Serializable + object Source { + + case class Kinesis(appName: String, streamName: String, region: Region, initialPosition: InitPosition) extends Source + case class PubSub(projectId: String, subscriptionId: String) extends Source + + implicit val config: Configuration = + Configuration.default.withSnakeCaseConstructorNames + + implicit def ioCirceConfigSourceDecoder: Decoder[Source] = + deriveConfiguredDecoder[Source] + } + + implicit def ioCirceConfigDecoder: Decoder[LoaderConfig] = + deriveDecoder[LoaderConfig] + +} diff --git a/modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/loader/Main.scala b/modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/loader/Main.scala new file mode 100644 index 0000000..1af8ec8 --- /dev/null +++ b/modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/loader/Main.scala @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2020 Snowplow Analytics Ltd. All rights reserved. + * + * This program is licensed to you under the Apache License Version 2.0, + * and you may not use this file except in compliance with the Apache License Version 2.0. + * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the Apache License Version 2.0 is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. 
+ */ +package com.snowplowanalytics.snowplow.postgres.loader + +import cats.effect.{ExitCode, IO, IOApp} + +import org.log4s.getLogger + +import com.snowplowanalytics.snowplow.badrows.Processor +import com.snowplowanalytics.snowplow.postgres.api.DB +import com.snowplowanalytics.snowplow.postgres.config.Cli +import com.snowplowanalytics.snowplow.postgres.config.LoaderConfig.Purpose +import com.snowplowanalytics.snowplow.postgres.generated.BuildInfo +import com.snowplowanalytics.snowplow.postgres.resources +import com.snowplowanalytics.snowplow.postgres.storage.utils +import com.snowplowanalytics.snowplow.postgres.streaming.{UnorderedPipe, sink, source} + +object Main extends IOApp { + + lazy val logger = getLogger + + val processor = Processor(BuildInfo.name, BuildInfo.version) + + def run(args: List[String]): IO[ExitCode] = + Cli.parse[IO](args).value.flatMap { + case Right(Cli(loaderConfig, iglu)) => + resources.initialize[IO](loaderConfig.getDBConfig, iglu).use { + case (blocker, xa, state) => + source.getSource[IO](blocker, loaderConfig.purpose, loaderConfig.source) match { + case Right(dataStream) => + implicit val db: DB[IO] = DB.interpreter[IO](iglu.resolver, xa, loaderConfig.schema) + for { + _ <- loaderConfig.purpose match { + case Purpose.Enriched => utils.prepare[IO](loaderConfig.schema, xa) + case Purpose.SelfDescribing => IO.unit + } + badSink = sink.badSink[IO] + goodSink = sink.goodSink[IO](UnorderedPipe.forTransactor(xa), state, iglu, processor).andThen(_.through(badSink)) + s = dataStream.observeEither(badSink, goodSink) + + _ <- s.compile.drain + } yield ExitCode.Success + case Left(error) => + IO.delay(logger.error(s"Source initialization error\n${error.getMessage}")).as(ExitCode.Error) + } + } + + case Left(error) => + IO.delay(logger.error(s"Configuration initialization failure\n$error")).as(ExitCode.Error) + } +} diff --git a/modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/streaming/source.scala b/modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/streaming/source.scala new file mode 100644 index 0000000..eded666 --- /dev/null +++ b/modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/streaming/source.scala @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2020 Snowplow Analytics Ltd. All rights reserved. + * + * This program is licensed to you under the Apache License Version 2.0, + * and you may not use this file except in compliance with the Apache License Version 2.0. + * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the Apache License Version 2.0 is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. 
+ */ +package com.snowplowanalytics.snowplow.postgres.streaming + +import java.util.Base64 +import java.nio.charset.StandardCharsets + +import cats.implicits._ + +import cats.effect.{Blocker, ConcurrentEffect, ContextShift, Sync} + +import fs2.aws.kinesis.{CommittableRecord, KinesisConsumerSettings} +import fs2.aws.kinesis.consumer.readFromKinesisStream + +import com.permutive.pubsub.consumer.grpc.{PubsubGoogleConsumer, PubsubGoogleConsumerConfig} +import io.circe.Json +import io.circe.parser.{parse => parseCirce} +import org.log4s.getLogger + +import com.snowplowanalytics.iglu.core.SelfDescribingData +import com.snowplowanalytics.iglu.core.circe.implicits._ + +import com.snowplowanalytics.snowplow.analytics.scalasdk.Event +import com.snowplowanalytics.snowplow.analytics.scalasdk.ParsingError.NotTSV +import com.snowplowanalytics.snowplow.badrows.{BadRow, Payload} +import com.snowplowanalytics.snowplow.postgres.config.{Cli, LoaderConfig} +import com.snowplowanalytics.snowplow.postgres.config.LoaderConfig.{Purpose, Source} +import com.snowplowanalytics.snowplow.postgres.streaming.data.{BadData, Data} + +import com.google.pubsub.v1.PubsubMessage +import com.permutive.pubsub.consumer.Model.{ProjectId, Subscription} +import com.permutive.pubsub.consumer.decoder.MessageDecoder + +object source { + + private lazy val logger = getLogger + + /** + * Acquire a stream of parsed payloads + * + * @param blocker thread pool for pulling events (used only in PubSub) + * @param purpose kind of data, enriched or plain JSONs + * @param config source configuration + * @return either error or stream of parsed payloads + */ + def getSource[F[_]: ConcurrentEffect: ContextShift](blocker: Blocker, purpose: Purpose, config: Source) = + config match { + case LoaderConfig.Source.Kinesis(appName, streamName, region, position) => + KinesisConsumerSettings.apply(streamName, appName, region, initialPositionInStream = position.unwrap) match { + case Right(settings) => + readFromKinesisStream[F](settings).evalMap(record => record.checkpoint.as(parseRecord(purpose, record))).asRight + case Left(error) => + error.asLeft + } + case LoaderConfig.Source.PubSub(projectId, subscriptionId) => + implicit val decoder: MessageDecoder[Either[BadData, Data]] = pubsubDataDecoder(purpose) + val project = ProjectId(projectId) + val subscription = Subscription(subscriptionId) + val pubsubConfig = PubsubGoogleConsumerConfig[F](onFailedTerminate = pubsubOnFailedTerminate[F]) + PubsubGoogleConsumer + .subscribeAndAck[F, Either[BadData, Data]](blocker, project, subscription, pubsubErrorHandler[F], pubsubConfig) + .asRight + } + + /** + * Parse Kinesis record into a valid Loader's record, either enriched event or self-describing JSON, + * depending on purpose of the Loader + */ + def parseRecord(kind: Purpose, record: CommittableRecord): Either[BadData, Data] = { + val string = + try StandardCharsets.UTF_8.decode(record.record.data()).toString.asRight[BadData] + catch { + case _: IllegalArgumentException => + val payload = StandardCharsets.UTF_8.decode(Base64.getEncoder.encode(record.record.data())).toString + kind match { + case Purpose.Enriched => + val badRow = BadRow.LoaderParsingError(Cli.processor, NotTSV, Payload.RawPayload(payload)) + BadData.BadEnriched(badRow).asLeft + case Purpose.SelfDescribing => + BadData.BadJson(payload, "Cannot deserialize self-describing JSON from Kinesis record").asLeft + } + } + + string.flatMap { payload => + kind match { + case Purpose.Enriched => + parseEventString(payload).map(Data.Snowplow.apply) + case 
+          parseJson(payload).map(Data.SelfDescribing.apply)
+      }
+    }
+  }
+
+  def parseEventString(s: String): Either[BadData, Event] =
+    Event.parse(s).toEither.leftMap { error =>
+      val badRow = BadRow.LoaderParsingError(Cli.processor, error, Payload.RawPayload(s))
+      BadData.BadEnriched(badRow)
+    }
+
+  def parseJson(s: String): Either[BadData, SelfDescribingData[Json]] =
+    parseCirce(s)
+      .leftMap(_.show)
+      .flatMap(json => SelfDescribingData.parse[Json](json).leftMap(_.message(json.noSpaces)))
+      .leftMap(error => BadData.BadJson(s, error))
+
+  def pubsubDataDecoder(purpose: Purpose): MessageDecoder[Either[BadData, Data]] =
+    purpose match {
+      case Purpose.Enriched =>
+        (message: Array[Byte]) => parseEventString(new String(message)).map(Data.Snowplow.apply).asRight
+      case Purpose.SelfDescribing =>
+        (message: Array[Byte]) => parseJson(new String(message)).map(Data.SelfDescribing.apply).asRight
+    }
+
+  def pubsubErrorHandler[F[_]: Sync](message: PubsubMessage, error: Throwable, ack: F[Unit], nack: F[Unit]): F[Unit] = {
+    val _ = (error, nack)
+    Sync[F].delay(logger.warn(s"Couldn't handle ${message.getData.toStringUtf8}")) *> ack
+  }
+
+  def pubsubOnFailedTerminate[F[_]: Sync](error: Throwable): F[Unit] =
+    Sync[F].delay(logger.warn(s"Cannot terminate pubsub consumer properly\n${error.getMessage}"))
+}
diff --git a/modules/loader/src/test/scala/com.snowplowanalytics.snowplow.postgres/config/CliSpec.scala b/modules/loader/src/test/scala/com.snowplowanalytics.snowplow.postgres/config/CliSpec.scala
new file mode 100644
index 0000000..3e233bf
--- /dev/null
+++ b/modules/loader/src/test/scala/com.snowplowanalytics.snowplow.postgres/config/CliSpec.scala
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2020 Snowplow Analytics Ltd. All rights reserved.
+ *
+ * This program is licensed to you under the Apache License Version 2.0,
+ * and you may not use this file except in compliance with the Apache License Version 2.0.
+ * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the Apache License Version 2.0 is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under.
+ */
+package com.snowplowanalytics.snowplow.postgres.config
+
+import java.nio.file.Paths
+import java.util.UUID
+
+import cats.effect.{Clock, IO}
+
+import com.snowplowanalytics.snowplow.postgres.config.LoaderConfig.{InitPosition, Purpose, Source}
+
+import org.specs2.mutable.Specification
+import software.amazon.awssdk.regions.Region
+
+class CliSpec extends Specification {
+  implicit val ioClock: Clock[IO] = Clock.create[IO]
+
+  "Cli.parse" should {
+    "accept example config" >> {
+      val config = Paths.get(getClass.getResource("/config.json").toURI)
+      val resolver = Paths.get(getClass.getResource("/resolver.json").toURI)
+      val argv = List("--config", config.toString, "--resolver", resolver.toString)
+
+      val expected = LoaderConfig(
+        "Acme Ltd. Snowplow Postgres",
+        UUID.fromString("5c5e4353-4eeb-43da-98f8-2de6dc7fa947"),
+        Source.Kinesis("acme-postgres-loader", "enriched-events", Region.EU_CENTRAL_1, InitPosition.TrimHorizon),
+        "localhost",
+        5432,
+        "snowplow",
+        "postgres",
+        "mysecretpassword",
+        "REQUIRE",
+        "atomic",
+        Purpose.Enriched
+      )
+      val result = Cli.parse[IO](argv).value.unsafeRunSync()
+      result must beRight.like {
+        case Cli(config, _) => config must beEqualTo(expected)
+      }
+    }
+  }
+
+}
diff --git a/project/BuildSettings.scala b/project/BuildSettings.scala
new file mode 100644
index 0000000..e049ad9
--- /dev/null
+++ b/project/BuildSettings.scala
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2020 Snowplow Analytics Ltd. All rights reserved.
+ *
+ * This program is licensed to you under the Apache License Version 2.0,
+ * and you may not use this file except in compliance with the Apache License Version 2.0.
+ * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the Apache License Version 2.0 is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under.
+ */
+
+// sbt
+import sbt._
+import Keys._
+
+import sbtbuildinfo.BuildInfoKey
+import sbtbuildinfo.BuildInfoKeys._
+
+import bintray.BintrayPlugin._
+import bintray.BintrayKeys._
+
+import com.typesafe.sbt.SbtNativePackager.autoImport._
+import com.typesafe.sbt.packager.linux.LinuxPlugin.autoImport._
+import com.typesafe.sbt.packager.docker.DockerPlugin.autoImport._
+
+import scoverage.ScoverageKeys._
+
+object BuildSettings {
+  val scala212 = "2.12.11"
+  val scala213 = "2.13.3"
+
+  lazy val projectSettings = Seq(
+    organization := "com.snowplowanalytics",
+    version := "0.1.0",
+    scalaVersion := scala213,
+    crossScalaVersions := Seq(scala212, scala213),
+    description := "Loading Snowplow enriched data into PostgreSQL in real-time",
+    licenses += ("Apache-2.0", url("http://www.apache.org/licenses/LICENSE-2.0.html")),
+    parallelExecution in Test := false
+  )
+
+  lazy val buildInfoSettings = Seq(
+    buildInfoKeys := Seq[BuildInfoKey](name, version),
+    buildInfoPackage := "com.snowplowanalytics.snowplow.postgres.generated"
+  )
+
+  /** Docker image settings */
+  lazy val dockerSettings = Seq(
+    maintainer in Docker := "Snowplow Analytics Ltd. <support@snowplowanalytics.com>",
", + dockerBaseImage := "snowplow-docker-registry.bintray.io/snowplow/base-debian:0.2.1", + daemonUser in Docker := "snowplow", + dockerUpdateLatest := true, + dockerRepository := Some("snowplow"), + + daemonUserUid in Docker := None, + defaultLinuxInstallLocation in Docker := "/home/snowplow", // must be home directory of daemonUser + ) + + lazy val mavenSettings = bintraySettings ++ Seq( + publishMavenStyle := true, + publishArtifact := true, + publishArtifact in Test := false, + bintrayOrganization := Some("snowplow"), + bintrayRepository := "snowplow-maven", + pomIncludeRepository := { _ => false }, + homepage := Some(url("http://snowplowanalytics.com")), + scmInfo := Some(ScmInfo(url("https://github.com/snowplow-incubator/snowplow-postgres-loader"), + "scm:git@github.com:snowplow-incubator/snowplow-postgres-loader.git")), + pomExtra := ( + + + Snowplow Analytics Ltd + support@snowplowanalytics.com + Snowplow Analytics Ltd + http://snowplowanalytics.com + + ) + ) + + lazy val scoverageSettings = Seq( + coverageMinimum := 50, + coverageFailOnMinimum := false, + coverageExcludedPackages := "^target/.*", + (test in Test) := { + (coverageReport dependsOn (test in Test)).value + } + ) + + lazy val addExampleConfToTestCp = Seq( + unmanagedClasspath in Test += { + baseDirectory.value.getParentFile.getParentFile / "config" + } + ) +} diff --git a/project/Dependencies.scala b/project/Dependencies.scala new file mode 100644 index 0000000..2a371a4 --- /dev/null +++ b/project/Dependencies.scala @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2020 Snowplow Analytics Ltd. All rights reserved. + * + * This program is licensed to you under the Apache License Version 2.0, + * and you may not use this file except in compliance with the Apache License Version 2.0. + * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the Apache License Version 2.0 is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. 
+ */
+
+import sbt._
+
+object Dependencies {
+
+  lazy val SnowplowBintray = "Snowplow Bintray" at "https://snowplow.bintray.com/snowplow-maven"
+
+  object V {
+    // Java
+    val postgres = "42.2.14"
+    val commons = "1.13"
+    val logback = "1.2.3"
+
+    // Scala third-party
+    val decline = "1.2.0"
+    val catsEffect = "2.2.0"
+    val circe = "0.13.0"
+    val fs2Aws = "3.0.2"
+    val fs2PubSub = "0.16.1"
+    val doobie = "0.9.2"
+    val fs2 = "2.4.4"
+    val log4s = "1.8.2"
+
+    val analyticsSdk = "2.0.1"
+    val badRows = "2.1.0"
+    val schemaDdl = "0.11.0"
+
+    // Testing
+    val specs2 = "4.9.4"
+    val scalaCheck = "1.14.3"
+  }
+
+  // Java
+  val logback = "ch.qos.logback" % "logback-classic" % V.logback
+
+  // Snyk warnings
+  val postgres = "org.postgresql" % "postgresql" % V.postgres
+  val commons = "commons-codec" % "commons-codec" % V.commons
+
+  // Scala third-party
+  val decline = "com.monovore" %% "decline" % V.decline
+  val catsEffect = "org.typelevel" %% "cats-effect" % V.catsEffect
+  val fs2 = "co.fs2" %% "fs2-core" % V.fs2
+  val fs2Io = "co.fs2" %% "fs2-io" % V.fs2
+  val circe = "io.circe" %% "circe-core" % V.circe
+  val circeGeneric = "io.circe" %% "circe-generic" % V.circe
+  val circeExtras = "io.circe" %% "circe-generic-extras" % V.circe
+  val circeParser = "io.circe" %% "circe-parser" % V.circe
+  val circeLiteral = "io.circe" %% "circe-literal" % V.circe
+  val fs2Aws = "io.laserdisc" %% "fs2-aws" % V.fs2Aws
+  val fs2PubSub = "com.permutive" %% "fs2-google-pubsub-grpc" % V.fs2PubSub
+  val doobie = "org.tpolecat" %% "doobie-core" % V.doobie
+  val doobiePg = "org.tpolecat" %% "doobie-postgres" % V.doobie
+  val doobiePgCirce = "org.tpolecat" %% "doobie-postgres-circe" % V.doobie
+  val doobieHikari = "org.tpolecat" %% "doobie-hikari" % V.doobie
+  val log4s = "org.log4s" %% "log4s" % V.log4s
+
+  // Scala first-party
+  val analyticsSdk = "com.snowplowanalytics" %% "snowplow-scala-analytics-sdk" % V.analyticsSdk
+  val badRows = "com.snowplowanalytics" %% "snowplow-badrows" % V.badRows
+  val schemaDdl = "com.snowplowanalytics" %% "schema-ddl" % V.schemaDdl
+
+  // Testing
+  val specs2 = "org.specs2" %% "specs2-core" % V.specs2 % Test
+  val specs2Check = "org.specs2" %% "specs2-scalacheck" % V.specs2 % Test
+  val scalaCheck = "org.scalacheck" %% "scalacheck" % V.scalaCheck % Test
+
+}
diff --git a/project/build.properties b/project/build.properties
new file mode 100644
index 0000000..0837f7a
--- /dev/null
+++ b/project/build.properties
@@ -0,0 +1 @@
+sbt.version=1.3.13
diff --git a/project/plugins.sbt b/project/plugins.sbt
new file mode 100644
index 0000000..13801db
--- /dev/null
+++ b/project/plugins.sbt
@@ -0,0 +1,10 @@
+logLevel := Level.Warn
+
+addSbtPlugin("org.scalameta" % "sbt-scalafmt" % "2.4.0")
+addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.6.1")
+addSbtPlugin("com.typesafe.sbt" % "sbt-native-packager" % "1.7.3")
+addSbtPlugin("io.github.davidgregory084" % "sbt-tpolecat" % "0.1.13")
+addSbtPlugin("ch.epfl.scala" % "sbt-scalafix" % "0.9.15")
+addSbtPlugin("com.eed3si9n" % "sbt-buildinfo" % "0.9.0")
+addSbtPlugin("org.scoverage" % "sbt-coveralls" % "1.2.7")
+addSbtPlugin("org.foundweekends" % "sbt-bintray" % "0.5.6")
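For reference, a minimal sketch (not part of the patch) of how the parsing helpers defined in source.scala above behave in isolation. It assumes the loader module and its dependencies are on the classpath; the ParseExample object and the sample payloads are illustrative assumptions, not taken from the repository.

import com.snowplowanalytics.snowplow.postgres.streaming.source

// Hypothetical demo object, not part of the patch
object ParseExample {
  def main(args: Array[String]): Unit = {
    // A self-describing JSON payload, as the loader expects when it runs
    // with Purpose.SelfDescribing (sample data only)
    val json =
      """{"schema":"iglu:me.chuwy/pg-test/jsonschema/1-0-0","data":{"name":"example"}}"""

    // parseJson returns Right(SelfDescribingData(...)) for a well-formed envelope
    // and Left(BadData.BadJson(...)) with an explanation otherwise
    println(source.parseJson(json))
    println(source.parseJson("not a self-describing payload"))

    // parseEventString expects an enriched-event TSV line produced by the Snowplow
    // pipeline; anything else becomes BadData.BadEnriched wrapping a
    // BadRow.LoaderParsingError built with Cli.processor
    println(source.parseEventString("definitely not a TSV event"))
  }
}

Note that parseJson only checks the self-describing envelope (schema URI plus data); validation against the Iglu schema happens later in the pipeline, which is why no Iglu resolver is needed in this sketch.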