From 9021c4fa2c5d0903bf1420daeb88e2b98dcd2711 Mon Sep 17 00:00:00 2001 From: Ian Streeter Date: Mon, 10 Aug 2020 14:14:40 +0100 Subject: [PATCH 1/8] Configure JDBC connection with sslmode (closes #3) --- .../snowplow/postgres/config/LoaderConfig.scala | 6 +++--- .../com/snowplowanalytics/snowplow/postgres/Database.scala | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/config/LoaderConfig.scala b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/config/LoaderConfig.scala index 1505d51..e3a06d4 100644 --- a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/config/LoaderConfig.scala +++ b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/config/LoaderConfig.scala @@ -41,7 +41,7 @@ case class LoaderConfig(name: String, schema: String, purpose: Purpose) { def getJdbc: JdbcUri = - JdbcUri(host, port, database) + JdbcUri(host, port, database, sslMode.toLowerCase().replace('_', '-')) } object LoaderConfig { @@ -120,9 +120,9 @@ object LoaderConfig { deriveConfiguredDecoder[Source] } - case class JdbcUri(host: String, port: Int, database: String) { + case class JdbcUri(host: String, port: Int, database: String, sslMode: String) { override def toString = - s"jdbc:postgresql://$host:$port/$database" + s"jdbc:postgresql://$host:$port/$database?sslmode=$sslMode" } implicit def ioCirceConfigDecoder: Decoder[LoaderConfig] = diff --git a/modules/common/src/test/scala/com/snowplowanalytics/snowplow/postgres/Database.scala b/modules/common/src/test/scala/com/snowplowanalytics/snowplow/postgres/Database.scala index ce2720c..974d7f1 100644 --- a/modules/common/src/test/scala/com/snowplowanalytics/snowplow/postgres/Database.scala +++ b/modules/common/src/test/scala/com/snowplowanalytics/snowplow/postgres/Database.scala @@ -47,7 +47,7 @@ object Database { val logger: LogHandler = LogHandler.nop implicit val CS: ContextShift[IO] = IO.contextShift(concurrent.ExecutionContext.global) - val jdbcUri = JdbcUri("localhost", 5432, "snowplow") + val jdbcUri = JdbcUri("localhost", 5432, "snowplow", "allow") val registry = Http(Config("localhost registry", 1, Nil), HttpConnection(URI.create("http://localhost:8080/api/"), None)) val igluClient = Client[IO, Json](Resolver(List(Registry.IgluCentral, registry), None), CirceValidator) val xa: Transactor[IO] = resources.getTransactorDefault[IO](jdbcUri, "postgres", "mysecretpassword") From 24864d652b188f1190fefcb8c77673de5ceb8c6b Mon Sep 17 00:00:00 2001 From: Ian Streeter Date: Mon, 10 Aug 2020 15:33:06 +0100 Subject: [PATCH 2/8] Cross compile for scala 2.12 and 2.13 --- .../snowplow/postgres/streaming/source.scala | 3 +-- project/BuildSettings.scala | 6 +++++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/streaming/source.scala b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/streaming/source.scala index 220fb19..84421d2 100644 --- a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/streaming/source.scala +++ b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/streaming/source.scala @@ -140,8 +140,7 @@ object source { } def pubsubErrorHandler[F[_]: Sync](message: PubsubMessage, error: Throwable, ack: F[Unit], nack: F[Unit]): F[Unit] = { - val _ = error - val _ = nack + val _ = (error, nack) Sync[F].delay(println(s"Couldn't handle ${message.getData.toStringUtf8}")) *> ack } diff --git 
a/project/BuildSettings.scala b/project/BuildSettings.scala index b2c8a46..b522a85 100644 --- a/project/BuildSettings.scala +++ b/project/BuildSettings.scala @@ -28,10 +28,14 @@ import com.typesafe.sbt.packager.docker.DockerPlugin.autoImport._ import scoverage.ScoverageKeys._ object BuildSettings { + val scala212 = "2.12.11" + val scala213 = "2.13.3" + lazy val projectSettings = Seq( organization := "com.snowplowanalytics", version := "0.1.0-rc3", - scalaVersion := "2.13.3", + scalaVersion := scala213, + crossScalaVersions := Seq(scala212, scala213), description := "Loading Snowplow enriched data into PostgreSQL in real-time", licenses += ("Apache-2.0", url("http://www.apache.org/licenses/LICENSE-2.0.html")), parallelExecution in Test := false From f913cedfa4b1ebc201ba0b94a8117e08a1c4a2e0 Mon Sep 17 00:00:00 2001 From: Ian Streeter Date: Tue, 11 Aug 2020 10:24:26 +0100 Subject: [PATCH 3/8] Split sink- and source-specific code into different modules --- build.sbt | 17 ++- .../postgres/config/Base64Encoded.scala | 45 -------- .../postgres/config/LoaderConfig.scala | 74 +----------- .../snowplow/postgres/resources.scala | 2 +- .../postgres/shredding/transform.scala | 11 +- .../snowplow/postgres/streaming/data.scala | 46 ++++++++ .../snowplow/postgres/streaming/sink.scala | 16 +-- .../postgres/streaming/sinkspec.scala | 13 ++- .../snowplow/postgres/config/AppConfig.scala | 109 ++++++++++++++++++ .../snowplow/postgres/config/Cli.scala | 6 +- .../snowplow/postgres/loader/Main.scala | 21 ++-- .../snowplow/postgres/streaming/source.scala | 33 +----- .../config/CliSpec.scala | 5 +- project/BuildSettings.scala | 2 +- 14 files changed, 218 insertions(+), 182 deletions(-) delete mode 100644 modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/config/Base64Encoded.scala create mode 100644 modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/streaming/data.scala create mode 100644 modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/config/AppConfig.scala rename modules/{common => loader}/src/main/scala/com/snowplowanalytics/snowplow/postgres/config/Cli.scala (94%) rename modules/{common => loader}/src/main/scala/com/snowplowanalytics/snowplow/postgres/streaming/source.scala (82%) rename modules/{common/src/test/scala/com/snowplowanalytics/snowplow/postgres => loader/src/test/scala/com.snowplowanalytics.snowplow.postgres}/config/CliSpec.scala (93%) diff --git a/build.sbt b/build.sbt index 4da4082..0eb5dd3 100644 --- a/build.sbt +++ b/build.sbt @@ -20,18 +20,14 @@ lazy val common = project .settings(name := "snowplow-postgres") .enablePlugins(BuildInfoPlugin) .settings(BuildSettings.projectSettings) - .settings(BuildSettings.buildInfoSettings) .settings(BuildSettings.scoverageSettings) - .settings(BuildSettings.addExampleConfToTestCp) .settings(BuildSettings.mavenSettings) .settings( resolvers += Dependencies.SnowplowBintray, libraryDependencies ++= Seq( Dependencies.logger, Dependencies.postgres, - Dependencies.commons, Dependencies.catsEffect, - Dependencies.decline, Dependencies.circe, Dependencies.circeGeneric, Dependencies.circeExtras, @@ -41,8 +37,6 @@ lazy val common = project Dependencies.doobiePg, Dependencies.doobiePgCirce, Dependencies.doobieHikari, - Dependencies.fs2Aws, - Dependencies.fs2PubSub, Dependencies.analyticsSdk, Dependencies.badRows, Dependencies.schemaDdl, @@ -57,6 +51,17 @@ lazy val loader = project .settings(name := "snowplow-postgres-loader") .settings(BuildSettings.projectSettings) .settings(BuildSettings.dockerSettings) + 
.settings(BuildSettings.buildInfoSettings) + .settings(BuildSettings.addExampleConfToTestCp) + .settings( + libraryDependencies ++= Seq( + Dependencies.commons, + Dependencies.fs2Aws, + Dependencies.fs2PubSub, + Dependencies.decline, + Dependencies.specs2 + ) + ) .dependsOn(common) .enablePlugins(JavaAppPackaging, DockerPlugin, BuildInfoPlugin) diff --git a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/config/Base64Encoded.scala b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/config/Base64Encoded.scala deleted file mode 100644 index c56eec6..0000000 --- a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/config/Base64Encoded.scala +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (c) 2020 Snowplow Analytics Ltd. All rights reserved. - * - * This program is licensed to you under the Apache License Version 2.0, - * and you may not use this file except in compliance with the Apache License Version 2.0. - * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the Apache License Version 2.0 is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. - */ -package com.snowplowanalytics.snowplow.postgres.config - -import java.util.Base64 - -import cats.syntax.either._ -import cats.data.ValidatedNel - -import io.circe.Json -import io.circe.parser.{ parse => jsonParse } - -import com.monovore.decline.Argument - -/** Base64-encoded JSON */ -case class Base64Encoded(json: Json) extends AnyVal - -object Base64Encoded { - def parse(string: String): Either[String, Base64Encoded] = - Either - .catchOnly[IllegalArgumentException](Base64.getDecoder.decode(string)) - .map(bytes => new String(bytes)) - .flatMap(str => jsonParse(str)) - .leftMap(e => s"Cannot parse ${string} as Base64-encoded JSON: ${e.getMessage}") - .map(json => Base64Encoded(json)) - - implicit def base64EncodedDeclineArg: Argument[Base64Encoded] = - new Argument[Base64Encoded] { - def read(string: String): ValidatedNel[String, Base64Encoded] = - Base64Encoded.parse(string).toValidatedNel - - def defaultMetavar: String = "base64" - } -} - diff --git a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/config/LoaderConfig.scala b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/config/LoaderConfig.scala index e3a06d4..0925d84 100644 --- a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/config/LoaderConfig.scala +++ b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/config/LoaderConfig.scala @@ -12,27 +12,13 @@ */ package com.snowplowanalytics.snowplow.postgres.config -import java.util.{UUID, Date} -import java.time.Instant - -import scala.jdk.CollectionConverters._ - import cats.syntax.either._ import io.circe.Decoder +import LoaderConfig.{JdbcUri, Purpose} import io.circe.generic.semiauto.deriveDecoder -import io.circe.generic.extras.Configuration -import io.circe.generic.extras.semiauto.deriveConfiguredDecoder - -import LoaderConfig.{JdbcUri, Source, Purpose} -import software.amazon.awssdk.regions.Region -import software.amazon.kinesis.common.InitialPositionInStream - -case class LoaderConfig(name: String, - id: UUID, - source: Source, - host: String, +case class 
LoaderConfig(host: String, port: Int, database: String, username: String, @@ -64,62 +50,6 @@ object LoaderConfig { } } - implicit val awsRegionDecoder: Decoder[Region] = - Decoder.decodeString.emap { s => - val allRegions = Region.regions().asScala.toSet.map((r: Region) => r.id()) - if (allRegions.contains(s)) Region.of(s).asRight - else s"Region $s is unknown, choose from [${allRegions.mkString(", ")}]".asLeft - } - - sealed trait InitPosition { - /** Turn it into fs2-aws-compatible structure */ - def unwrap: Either[InitialPositionInStream, Date] = this match { - case InitPosition.Latest => InitialPositionInStream.LATEST.asLeft - case InitPosition.TrimHorizon => InitialPositionInStream.TRIM_HORIZON.asLeft - case InitPosition.AtTimestamp(date) => Date.from(date).asRight - } - } - object InitPosition { - case object Latest extends InitPosition - case object TrimHorizon extends InitPosition - case class AtTimestamp(timestamp: Instant) extends InitPosition - - implicit val ioCirceInitPositionDecoder: Decoder[InitPosition] = - Decoder.decodeJson.emap { json => - json.asString match { - case Some("TRIM_HORIZON") => TrimHorizon.asRight - case Some("LATEST") => Latest.asRight - case Some(other) => - s"Initial position $other is unknown. Choose from LATEST and TRIM_HORIZEON. AT_TIMESTAMP must provide the timestamp".asLeft - case None => - val result = for { - root <- json.asObject.map(_.toMap) - atTimestamp <- root.get("AT_TIMESTAMP") - atTimestampObj <- atTimestamp.asObject.map(_.toMap) - timestampStr <- atTimestampObj.get("timestamp") - timestamp <- timestampStr.as[Instant].toOption - } yield AtTimestamp(timestamp) - result match { - case Some(atTimestamp) => atTimestamp.asRight - case None => "Initial position can be either LATEST or TRIM_HORIZON string or AT_TIMESTAMP object (e.g. 
2020-06-03T00:00:00Z)".asLeft - } - } - } - } - - sealed trait Source extends Product with Serializable - object Source { - - case class Kinesis(appName: String, streamName: String, region: Region, initialPosition: InitPosition) extends Source - case class PubSub(projectId: String, subscriptionId: String) extends Source - - implicit val config: Configuration = - Configuration.default.withSnakeCaseConstructorNames - - implicit def ioCirceConfigSourceDecoder: Decoder[Source] = - deriveConfiguredDecoder[Source] - } - case class JdbcUri(host: String, port: Int, database: String, sslMode: String) { override def toString = s"jdbc:postgresql://$host:$port/$database?sslmode=$sslMode" diff --git a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/resources.scala b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/resources.scala index 9c78fe0..daa3cd1 100644 --- a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/resources.scala +++ b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/resources.scala @@ -31,7 +31,7 @@ import com.snowplowanalytics.iglu.client.Client import com.snowplowanalytics.snowplow.postgres.api.State import com.snowplowanalytics.snowplow.postgres.config.LoaderConfig import com.snowplowanalytics.snowplow.postgres.config.LoaderConfig.JdbcUri -import com.snowplowanalytics.snowplow.postgres.streaming.source.BadData +import com.snowplowanalytics.snowplow.postgres.streaming.data.BadData object resources { diff --git a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/shredding/transform.scala b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/shredding/transform.scala index ea4948c..dc6767e 100644 --- a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/shredding/transform.scala +++ b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/shredding/transform.scala @@ -34,15 +34,14 @@ import com.snowplowanalytics.iglu.schemaddl.jsonschema.{Pointer, Schema} import com.snowplowanalytics.iglu.schemaddl.migrations.FlatSchema import com.snowplowanalytics.snowplow.analytics.scalasdk.Event -import com.snowplowanalytics.snowplow.badrows.{FailureDetails, BadRow, Failure, Payload} +import com.snowplowanalytics.snowplow.badrows.{FailureDetails, BadRow, Failure, Payload, Processor} import Entity.Column -import com.snowplowanalytics.snowplow.postgres.config.Cli object transform { val Atomic = SchemaKey("com.snowplowanalytics.snowplow", "atomic", "jsonschema", SchemaVer.Full(1,0,0)) /** Transform the whole `Event` (canonical and JSONs) into list of independent entities ready to be inserted */ - def shredEvent[F[_]: Sync: Clock](client: Client[F, Json], event: Event): EitherT[F, BadRow, List[Entity]] = { + def shredEvent[F[_]: Sync: Clock](client: Client[F, Json], processor: Processor, event: Event): EitherT[F, BadRow, List[Entity]] = { val entities = event.contexts.data ++ event.derived_contexts.data ++ event.unstruct_event.data.toList val wholeEvent = entities .parTraverse(shredJson(client)) @@ -52,7 +51,7 @@ object transform { (shreddedEntities, atomic) => atomic :: shreddedEntities.map(addMetadata(event.event_id, event.collector_tstamp)) } } - EitherT(wholeEvent).leftMap[BadRow](buildBadRow(event)) + EitherT(wholeEvent).leftMap[BadRow](buildBadRow(processor, event)) } def addMetadata(eventId: UUID, tstamp: Instant)(entity: Entity): Entity = { @@ -282,7 +281,7 @@ object transform { (columnName, dataType, value) } - private def buildBadRow(event: 
Event)(errors: NonEmptyList[FailureDetails.LoaderIgluError]) = - BadRow.LoaderIgluError(Cli.processor, Failure.LoaderIgluErrors(errors), Payload.LoaderPayload(event)) + private def buildBadRow(processor: Processor, event: Event)(errors: NonEmptyList[FailureDetails.LoaderIgluError]) = + BadRow.LoaderIgluError(processor, Failure.LoaderIgluErrors(errors), Payload.LoaderPayload(event)) } diff --git a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/streaming/data.scala b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/streaming/data.scala new file mode 100644 index 0000000..42ccbe5 --- /dev/null +++ b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/streaming/data.scala @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2020 Snowplow Analytics Ltd. All rights reserved. + * + * This program is licensed to you under the Apache License Version 2.0, + * and you may not use this file except in compliance with the Apache License Version 2.0. + * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the Apache License Version 2.0 is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. + */ +package com.snowplowanalytics.snowplow.postgres.streaming + +import com.snowplowanalytics.iglu.core.SelfDescribingData + +import io.circe.Json + +import com.snowplowanalytics.snowplow.analytics.scalasdk.Event +import com.snowplowanalytics.snowplow.badrows.BadRow + +object data { + + /** Kind of data flowing through the Loader */ + sealed trait Data extends Product with Serializable { + def snowplow: Boolean = this match { + case _: Data.Snowplow => true + case _: Data.SelfDescribing => false + } + } + + object Data { + case class Snowplow(data: Event) extends Data + case class SelfDescribing(data: SelfDescribingData[Json]) extends Data + } + + /** Data that for some reasons cannot be inserted into DB */ + sealed trait BadData extends Throwable with Product with Serializable + object BadData { + /** Typical Snowplow bad row (Loader Iglu Error etc) */ + case class BadEnriched(data: BadRow) extends BadData + /** Non-enriched error */ + case class BadJson(payload: String, error: String) extends BadData + } +} + diff --git a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/streaming/sink.scala b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/streaming/sink.scala index 0038a4b..eee18a7 100644 --- a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/streaming/sink.scala +++ b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/streaming/sink.scala @@ -28,11 +28,10 @@ import com.snowplowanalytics.iglu.core.circe.implicits._ import com.snowplowanalytics.iglu.client.Client -import com.snowplowanalytics.snowplow.badrows.{BadRow, Payload} +import com.snowplowanalytics.snowplow.badrows.{BadRow, Payload, Processor} import com.snowplowanalytics.snowplow.postgres.api.{State, DB} -import com.snowplowanalytics.snowplow.postgres.config.Cli.processor import com.snowplowanalytics.snowplow.postgres.shredding.{Entity, transform} -import com.snowplowanalytics.snowplow.postgres.streaming.source.{Data, BadData} +import com.snowplowanalytics.snowplow.postgres.streaming.data.{Data, BadData} object 
sink { @@ -46,11 +45,13 @@ object sink { * @param state mutable Loader state * @param badQueue queue where all unsucessful actions can unload its results * @param client Iglu Client + * @param processor The actor processing these events */ def goodSink[F[_]: Concurrent: Clock: DB](state: State[F], badQueue: Queue[F, BadData], - client: Client[F, Json]): Pipe[F, Data, Unit] = - _.parEvalMapUnordered(32)(sinkPayload(state, badQueue, client)) + client: Client[F, Json], + processor: Processor): Pipe[F, Data, Unit] = + _.parEvalMapUnordered(32)(sinkPayload(state, badQueue, client, processor)) /** Sink bad data coming directly into the `Pipe` and data coming from `badQueue` */ def badSink[F[_]: Concurrent](badQueue: Queue[F, BadData]): Pipe[F, BadData, Unit] = @@ -62,12 +63,13 @@ object sink { /** Implementation for [[goodSink]] */ def sinkPayload[F[_]: Sync: Clock: DB](state: State[F], badQueue: Queue[F, BadData], - client: Client[F, Json])(payload: Data): F[Unit] = { + client: Client[F, Json], + processor: Processor)(payload: Data): F[Unit] = { val result = for { entities <- payload match { case Data.Snowplow(event) => transform - .shredEvent[F](client, event) + .shredEvent[F](client, processor, event) .leftMap(bad => BadData.BadEnriched(bad)) case Data.SelfDescribing(json) => transform diff --git a/modules/common/src/test/scala/com/snowplowanalytics/snowplow/postgres/streaming/sinkspec.scala b/modules/common/src/test/scala/com/snowplowanalytics/snowplow/postgres/streaming/sinkspec.scala index 9d1ac93..6d22c5e 100644 --- a/modules/common/src/test/scala/com/snowplowanalytics/snowplow/postgres/streaming/sinkspec.scala +++ b/modules/common/src/test/scala/com/snowplowanalytics/snowplow/postgres/streaming/sinkspec.scala @@ -27,13 +27,18 @@ import com.snowplowanalytics.iglu.core.circe.implicits._ import com.snowplowanalytics.snowplow.analytics.scalasdk.Event +import com.snowplowanalytics.snowplow.badrows.Processor import com.snowplowanalytics.snowplow.postgres.Database import com.snowplowanalytics.snowplow.postgres.api.{State, DB} -import com.snowplowanalytics.snowplow.postgres.streaming.source.{Data, BadData} +import com.snowplowanalytics.snowplow.postgres.streaming.data.{Data, BadData} + class sinkspec extends Database { import Database._ + val processor = Processor("pgloader", "test") + + "goodSink" should { "sink a single good event" >> { val line = "snowplowweb\tweb\t2018-12-18 15:07:17.970\t2016-03-29 07:28:18.611\t2016-03-29 07:28:18.634\tpage_view\t11cdec7b-4cbd-4aa4-a4c9-3874ab9663d4\t\tsnplow6\tjs-2.6.0\tssc-0.6.0-kinesis\tspark-1.16.0-common-0.35.0\t34df2c48bc170c87befb441732a94196\t372d1f2983860eefd262b58e6592dfbc\t80546dc70f4a91f1283c4b6247e31bcf\t26e6412a2421eb923d9d40258ca9ca69\t1\t3a12e8b8e3e91a4d092b833d583c7e30\tDK\t82\tOdder\t8300\t42.0001\t42.003\tCentral Jutland\tTDC Danmark\tTDC Danmark\t\t\thttp://snowplowanalytics.com/documentation/recipes/catalog-analytics/market-basket-analysis-identifying-products-that-sell-well-together.html\tMarket basket analysis - identifying products and content that go well together – 
Snowplow\thttp://snowplowanalytics.com/analytics/catalog-analytics/market-basket-analysis-identifying-products-that-sell-well-together.html\thttp\tsnowplowanalytics.com\t80\t/documentation/recipes/catalog-analytics/market-basket-analysis-identifying-products-that-sell-well-together.html\t\t\thttp\tsnowplowanalytics.com\t80\t/analytics/catalog-analytics/market-basket-analysis-identifying-products-that-sell-well-together.html\t\t\tinternal\t\t\t\t\t\t\t\t{\"schema\":\"iglu:com.snowplowanalytics.snowplow/contexts/jsonschema/1-0-0\",\"data\":[{\"schema\":\"iglu:com.snowplowanalytics.snowplow/web_page/jsonschema/1-0-0\",\"data\":{\"id\":\"05862d26-0dde-4d7a-a494-fc9aae283d23\"}},{\"schema\":\"iglu:org.schema/WebPage/jsonschema/1-0-0\",\"data\":{\"genre\":\"documentation\",\"inLanguage\":\"en-US\"}},{\"schema\":\"iglu:org.w3/PerformanceTiming/jsonschema/1-0-0\",\"data\":{\"navigationStart\":1459236496534,\"unloadEventStart\":1459236496838,\"unloadEventEnd\":1459236496838,\"redirectStart\":0,\"redirectEnd\":0,\"fetchStart\":1459236496534,\"domainLookupStart\":1459236496534,\"domainLookupEnd\":1459236496534,\"connectStart\":1459236496534,\"connectEnd\":1459236496534,\"secureConnectionStart\":0,\"requestStart\":1459236496580,\"responseStart\":1459236496834,\"responseEnd\":1459236496844,\"domLoading\":1459236496853,\"domInteractive\":1459236497780,\"domContentLoadedEventStart\":1459236497780,\"domContentLoadedEventEnd\":1459236498038,\"domComplete\":0,\"loadEventStart\":0,\"loadEventEnd\":0,\"chromeFirstPaint\":1459236498203}}]}\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tMozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36\tChrome 49\tChrome\t49.0.2623.87\tBrowser\tWEBKIT\ten-US\t1\t1\t0\t0\t0\t0\t0\t0\t0\t1\t24\t1920\t1075\tWindows 7\tWindows\tMicrosoft Corporation\tEurope/Berlin\tComputer\t0\t1920\t1200\tUTF-8\t1903\t11214\t\t\t\t\t\t\t\tEurope/Copenhagen\t\t\t\t2016-03-29 07:28:18.636\t\t\t{\"schema\":\"iglu:com.snowplowanalytics.snowplow/contexts/jsonschema/1-0-1\",\"data\":[{\"schema\":\"iglu:com.snowplowanalytics.snowplow/ua_parser_context/jsonschema/1-0-0\",\"data\":{\"useragentFamily\":\"Chrome\",\"useragentMajor\":\"49\",\"useragentMinor\":\"0\",\"useragentPatch\":\"2623\",\"useragentVersion\":\"Chrome 49.0.2623\",\"osFamily\":\"Windows\",\"osMajor\":\"7\",\"osMinor\":null,\"osPatch\":null,\"osPatchMinor\":null,\"osVersion\":\"Windows 7\",\"deviceFamily\":\"Other\"}}]}\t88c23330-ac4d-4c82-8a18-aa83c1e0c163\t2016-03-29 07:28:18.609\tcom.snowplowanalytics.snowplow\tpage_view\tjsonschema\t1-0-0\tcab5ba164038f31d8e10befc4eb199df\t" @@ -45,7 +50,7 @@ class sinkspec extends Database { val action = for { state <- State.init[IO](List(), igluClient.resolver) queue <- Queue.bounded[IO, BadData](1).action - _ <- stream.through(sink.goodSink(state, queue, igluClient)).compile.drain.action + _ <- stream.through(sink.goodSink(state, queue, igluClient, processor)).compile.drain.action eventIds <- query.action uaParserCtxs <- count("com_snowplowanalytics_snowplow_ua_parser_context_1").action } yield (eventIds, uaParserCtxs) @@ -68,7 +73,7 @@ class sinkspec extends Database { val action = for { state <- State.init[IO](List(), igluClient.resolver) queue <- Queue.bounded[IO, BadData](1).action - _ <- stream.through(sink.goodSink(state, queue, igluClient)).compile.drain.action + _ <- stream.through(sink.goodSink(state, queue, igluClient, processor)).compile.drain.action eventIds <- query.action rows <- count("com_getvero_bounced_1").action } yield 
(eventIds, rows) @@ -106,7 +111,7 @@ class sinkspec extends Database { val action = for { state <- State.init[IO](List(), igluClient.resolver) queue <- Queue.bounded[IO, BadData](1).action - _ <- stream.through(sink.goodSink(state, queue, igluClient)).compile.drain.action + _ <- stream.through(sink.goodSink(state, queue, igluClient, processor)).compile.drain.action rows <- count("me_chuwy_pg_test_1").action table <- describeTable("me_chuwy_pg_test_1").action } yield (rows, table) diff --git a/modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/config/AppConfig.scala b/modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/config/AppConfig.scala new file mode 100644 index 0000000..ce62fd1 --- /dev/null +++ b/modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/config/AppConfig.scala @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2020 Snowplow Analytics Ltd. All rights reserved. + * + * This program is licensed to you under the Apache License Version 2.0, + * and you may not use this file except in compliance with the Apache License Version 2.0. + * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the Apache License Version 2.0 is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. + */ +package com.snowplowanalytics.snowplow.postgres.config + +import java.util.{UUID, Date} +import java.time.Instant + +import scala.jdk.CollectionConverters._ + +import cats.syntax.either._ + +import io.circe.Decoder +import io.circe.generic.semiauto.deriveDecoder +import io.circe.generic.extras.Configuration +import io.circe.generic.extras.semiauto.deriveConfiguredDecoder + +import AppConfig.Source +import LoaderConfig.Purpose + +import software.amazon.awssdk.regions.Region +import software.amazon.kinesis.common.InitialPositionInStream + +case class AppConfig(name: String, + id: UUID, + source: Source, + host: String, + port: Int, + database: String, + username: String, + password: String, // TODO: can be EC2 store + sslMode: String, + schema: String, + purpose: Purpose) { + def getLoaderConfig: LoaderConfig = + LoaderConfig(host, port, database, username, password, sslMode, schema, purpose) +} + +object AppConfig { + + implicit val awsRegionDecoder: Decoder[Region] = + Decoder.decodeString.emap { s => + val allRegions = Region.regions().asScala.toSet.map((r: Region) => r.id()) + if (allRegions.contains(s)) Region.of(s).asRight + else s"Region $s is unknown, choose from [${allRegions.mkString(", ")}]".asLeft + } + + sealed trait InitPosition { + /** Turn it into fs2-aws-compatible structure */ + def unwrap: Either[InitialPositionInStream, Date] = this match { + case InitPosition.Latest => InitialPositionInStream.LATEST.asLeft + case InitPosition.TrimHorizon => InitialPositionInStream.TRIM_HORIZON.asLeft + case InitPosition.AtTimestamp(date) => Date.from(date).asRight + } + } + object InitPosition { + case object Latest extends InitPosition + case object TrimHorizon extends InitPosition + case class AtTimestamp(timestamp: Instant) extends InitPosition + + implicit val ioCirceInitPositionDecoder: Decoder[InitPosition] = + Decoder.decodeJson.emap { json => + json.asString match { + case Some("TRIM_HORIZON") => TrimHorizon.asRight + case 
Some("LATEST") => Latest.asRight + case Some(other) => + s"Initial position $other is unknown. Choose from LATEST and TRIM_HORIZEON. AT_TIMESTAMP must provide the timestamp".asLeft + case None => + val result = for { + root <- json.asObject.map(_.toMap) + atTimestamp <- root.get("AT_TIMESTAMP") + atTimestampObj <- atTimestamp.asObject.map(_.toMap) + timestampStr <- atTimestampObj.get("timestamp") + timestamp <- timestampStr.as[Instant].toOption + } yield AtTimestamp(timestamp) + result match { + case Some(atTimestamp) => atTimestamp.asRight + case None => "Initial position can be either LATEST or TRIM_HORIZON string or AT_TIMESTAMP object (e.g. 2020-06-03T00:00:00Z)".asLeft + } + } + } + } + + sealed trait Source extends Product with Serializable + object Source { + + case class Kinesis(appName: String, streamName: String, region: Region, initialPosition: InitPosition) extends Source + case class PubSub(projectId: String, subscriptionId: String) extends Source + + implicit val config: Configuration = + Configuration.default.withSnakeCaseConstructorNames + + implicit def ioCirceConfigSourceDecoder: Decoder[Source] = + deriveConfiguredDecoder[Source] + } + + implicit def ioCirceConfigDecoder: Decoder[AppConfig] = + deriveDecoder[AppConfig] + +} diff --git a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/config/Cli.scala b/modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/config/Cli.scala similarity index 94% rename from modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/config/Cli.scala rename to modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/config/Cli.scala index 1e8e6f7..50f0b8a 100644 --- a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/config/Cli.scala +++ b/modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/config/Cli.scala @@ -35,7 +35,7 @@ import com.monovore.decline._ import com.snowplowanalytics.snowplow.postgres.generated.BuildInfo -case class Cli[F[_]](postgres: LoaderConfig, iglu: Client[F, Json], debug: Boolean) +case class Cli[F[_]](config: AppConfig, iglu: Client[F, Json], debug: Boolean) object Cli { @@ -55,8 +55,8 @@ object Cli { configJson <- PathOrJson.load(rawConfig.config) configData <- SelfDescribingData.parse(configJson).leftMap(e => s"Configuration JSON is not self-describing, ${e.message(configJson.noSpaces)}").toEitherT[F] _ <- igluClient.check(configData).leftMap(e => s"Iglu validation failed with following error\n: ${e.asJson.spaces2}") - pgConfig <- configData.data.as[LoaderConfig].toEitherT[F].leftMap(e => s"Error while decoding configuration JSON, ${e.show}") - } yield Cli(pgConfig, igluClient, rawConfig.debug) + appConfig <- configData.data.as[AppConfig].toEitherT[F].leftMap(e => s"Error while decoding configuration JSON, ${e.show}") + } yield Cli(appConfig, igluClient, rawConfig.debug) } /** Config files for Loader can be passed either as FS path diff --git a/modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/loader/Main.scala b/modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/loader/Main.scala index f075a69..94609e0 100644 --- a/modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/loader/Main.scala +++ b/modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/loader/Main.scala @@ -16,30 +16,35 @@ import cats.effect.{ExitCode, IO, IOApp} import doobie.util.log.LogHandler +import com.snowplowanalytics.snowplow.badrows.Processor import 
com.snowplowanalytics.snowplow.postgres.api.DB import com.snowplowanalytics.snowplow.postgres.config.Cli import com.snowplowanalytics.snowplow.postgres.config.LoaderConfig.Purpose +import com.snowplowanalytics.snowplow.postgres.generated.BuildInfo import com.snowplowanalytics.snowplow.postgres.resources import com.snowplowanalytics.snowplow.postgres.storage.utils import com.snowplowanalytics.snowplow.postgres.streaming.{sink, source} object Main extends IOApp { + + val processor = Processor(BuildInfo.name, BuildInfo.version) + def run(args: List[String]): IO[ExitCode] = Cli.parse[IO](args).value.flatMap { - case Right(Cli(postgres, iglu, debug)) => + case Right(Cli(appConfig, iglu, debug)) => val logger = if (debug) LogHandler.jdkLogHandler else LogHandler.nop - resources.initialize[IO](postgres, logger, iglu).use { + resources.initialize[IO](appConfig.getLoaderConfig, logger, iglu).use { case (blocker, xa, state, badQueue) => - source.getSource[IO](blocker, postgres.purpose, postgres.source) match { + source.getSource[IO](blocker, appConfig.purpose, appConfig.source) match { case Right(dataStream) => - val meta = postgres.purpose.snowplow - implicit val db: DB[IO] = DB.interpreter[IO](iglu.resolver, xa, logger, postgres.schema, meta) + val meta = appConfig.purpose.snowplow + implicit val db: DB[IO] = DB.interpreter[IO](iglu.resolver, xa, logger, appConfig.schema, meta) for { - _ <- postgres.purpose match { - case Purpose.Enriched => utils.prepare[IO](postgres.schema, xa, logger) + _ <- appConfig.purpose match { + case Purpose.Enriched => utils.prepare[IO](appConfig.schema, xa, logger) case Purpose.SelfDescribing => IO.unit } - goodSink = sink.goodSink[IO](state, badQueue, iglu) + goodSink = sink.goodSink[IO](state, badQueue, iglu, processor) badSink = sink.badSink[IO](badQueue) s = dataStream.observeEither(badSink, goodSink) diff --git a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/streaming/source.scala b/modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/streaming/source.scala similarity index 82% rename from modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/streaming/source.scala rename to modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/streaming/source.scala index 84421d2..a8fd2e9 100644 --- a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/streaming/source.scala +++ b/modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/streaming/source.scala @@ -32,8 +32,9 @@ import com.snowplowanalytics.iglu.core.circe.implicits._ import com.snowplowanalytics.snowplow.analytics.scalasdk.Event import com.snowplowanalytics.snowplow.analytics.scalasdk.ParsingError.NotTSV import com.snowplowanalytics.snowplow.badrows.{BadRow, Payload} -import com.snowplowanalytics.snowplow.postgres.config.{LoaderConfig, Cli} +import com.snowplowanalytics.snowplow.postgres.config.{AppConfig, Cli} import com.snowplowanalytics.snowplow.postgres.config.LoaderConfig.Purpose +import com.snowplowanalytics.snowplow.postgres.streaming.data.{BadData, Data} import com.google.pubsub.v1.PubsubMessage import com.permutive.pubsub.consumer.Model.{Subscription, ProjectId} @@ -50,17 +51,17 @@ object source { * @return either error or stream of parsed payloads */ def getSource[F[_]: ConcurrentEffect: ContextShift](blocker: Blocker, - purpose: LoaderConfig.Purpose, - config: LoaderConfig.Source) = + purpose: Purpose, + config: AppConfig.Source) = config match { - case LoaderConfig.Source.Kinesis(appName, streamName, 
region, position) => + case AppConfig.Source.Kinesis(appName, streamName, region, position) => KinesisConsumerSettings.apply(streamName, appName, region, initialPositionInStream = position.unwrap) match { case Right(settings) => readFromKinesisStream[F](settings).evalMap(record => record.checkpoint.as(parseRecord(purpose, record))).asRight case Left(error) => error.asLeft } - case LoaderConfig.Source.PubSub(projectId, subscriptionId) => + case AppConfig.Source.PubSub(projectId, subscriptionId) => implicit val decoder: MessageDecoder[Either[BadData, Data]] = pubsubDataDecoder(purpose) val project = ProjectId(projectId) val subscription = Subscription(subscriptionId) @@ -109,28 +110,6 @@ object source { .flatMap(json => SelfDescribingData.parse[Json](json).leftMap(_.message(json.noSpaces))) .leftMap(error => BadData.BadJson(s, error)) - /** Kind of data flowing through the Loader */ - sealed trait Data extends Product with Serializable { - def snowplow: Boolean = this match { - case _: Data.Snowplow => true - case _: Data.SelfDescribing => false - } - } - - object Data { - case class Snowplow(data: Event) extends Data - case class SelfDescribing(data: SelfDescribingData[Json]) extends Data - } - - /** Data that for some reasons cannot be inserted into DB */ - sealed trait BadData extends Throwable with Product with Serializable - object BadData { - /** Typical Snowplow bad row (Loader Iglu Error etc) */ - case class BadEnriched(data: BadRow) extends BadData - /** Non-enriched error */ - case class BadJson(payload: String, error: String) extends BadData - } - def pubsubDataDecoder(purpose: Purpose): MessageDecoder[Either[BadData, Data]] = purpose match { case Purpose.Enriched => diff --git a/modules/common/src/test/scala/com/snowplowanalytics/snowplow/postgres/config/CliSpec.scala b/modules/loader/src/test/scala/com.snowplowanalytics.snowplow.postgres/config/CliSpec.scala similarity index 93% rename from modules/common/src/test/scala/com/snowplowanalytics/snowplow/postgres/config/CliSpec.scala rename to modules/loader/src/test/scala/com.snowplowanalytics.snowplow.postgres/config/CliSpec.scala index b20cdd6..439b532 100644 --- a/modules/common/src/test/scala/com/snowplowanalytics/snowplow/postgres/config/CliSpec.scala +++ b/modules/loader/src/test/scala/com.snowplowanalytics.snowplow.postgres/config/CliSpec.scala @@ -17,7 +17,8 @@ import java.util.UUID import cats.effect.{IO, Clock} -import com.snowplowanalytics.snowplow.postgres.config.LoaderConfig.{Source, InitPosition, Purpose} +import com.snowplowanalytics.snowplow.postgres.config.LoaderConfig.Purpose +import com.snowplowanalytics.snowplow.postgres.config.AppConfig.{Source, InitPosition} import org.specs2.mutable.Specification import software.amazon.awssdk.regions.Region @@ -31,7 +32,7 @@ class CliSpec extends Specification { val resolver = Paths.get(getClass.getResource("/resolver.json").toURI) val argv = List("--config", config.toString, "--resolver", resolver.toString) - val expected = LoaderConfig( + val expected = AppConfig( "Acme Ltd. 
Snowplow Postgres", UUID.fromString("5c5e4353-4eeb-43da-98f8-2de6dc7fa947"), Source.Kinesis("acme-postgres-loader", "enriched-events", Region.EU_CENTRAL_1, InitPosition.TrimHorizon), diff --git a/project/BuildSettings.scala b/project/BuildSettings.scala index b522a85..cd0ea85 100644 --- a/project/BuildSettings.scala +++ b/project/BuildSettings.scala @@ -33,7 +33,7 @@ object BuildSettings { lazy val projectSettings = Seq( organization := "com.snowplowanalytics", - version := "0.1.0-rc3", + version := "0.1.0-rc4", scalaVersion := scala213, crossScalaVersions := Seq(scala212, scala213), description := "Loading Snowplow enriched data into PostgreSQL in real-time", From 5313d95f3242a7e38ff81a7ff1f6b9e9edd9bc7d Mon Sep 17 00:00:00 2001 From: Ian Streeter Date: Tue, 11 Aug 2020 16:57:37 +0100 Subject: [PATCH 4/8] Replaces the queue with pure pipes --- .../snowplow/postgres/resources.scala | 6 +-- .../snowplow/postgres/streaming/sink.scala | 43 ++++++++----------- .../postgres/streaming/sinkspec.scala | 12 ++---- .../snowplow/postgres/loader/Main.scala | 8 ++-- 4 files changed, 27 insertions(+), 42 deletions(-) diff --git a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/resources.scala b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/resources.scala index daa3cd1..9a831c9 100644 --- a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/resources.scala +++ b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/resources.scala @@ -22,8 +22,6 @@ import doobie.util.ExecutionContexts import doobie.util.log.LogHandler import doobie.util.transactor.Transactor -import fs2.concurrent.Queue - import io.circe.Json import com.snowplowanalytics.iglu.client.Client @@ -31,7 +29,6 @@ import com.snowplowanalytics.iglu.client.Client import com.snowplowanalytics.snowplow.postgres.api.State import com.snowplowanalytics.snowplow.postgres.config.LoaderConfig import com.snowplowanalytics.snowplow.postgres.config.LoaderConfig.JdbcUri -import com.snowplowanalytics.snowplow.postgres.streaming.data.BadData object resources { @@ -41,7 +38,6 @@ object resources { iglu: Client[F, Json]) = for { blocker <- Blocker[F] - badQueue <- Resource.liftF(Queue.bounded[F, BadData](128)) xa <- resources.getTransactor[F](postgres.getJdbc, postgres.username, postgres.password, blocker) keysF = for { ci <- storage.query.getComments(postgres.schema, logger).transact(xa).map(_.separate) @@ -57,7 +53,7 @@ object resources { Sync[F].pure(state) } state <- Resource.liftF(initState) - } yield (blocker, xa, state, badQueue) + } yield (blocker, xa, state) /** Get a HikariCP transactor */ def getTransactor[F[_]: Async: ContextShift](jdbcUri: JdbcUri, user: String, password: String, be: Blocker): Resource[F, HikariTransactor[F]] = diff --git a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/streaming/sink.scala b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/streaming/sink.scala index eee18a7..4c6f0cd 100644 --- a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/streaming/sink.scala +++ b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/streaming/sink.scala @@ -12,12 +12,12 @@ */ package com.snowplowanalytics.snowplow.postgres.streaming +import cats.data.EitherT import cats.implicits._ import cats.effect.{Sync, Clock, Concurrent} import fs2.Pipe -import fs2.concurrent.Queue import doobie._ import doobie.implicits._ @@ -40,31 +40,31 @@ object sink { /** * Sink good events into Postgres. 
During sinking, payloads go through all transformation steps * and checking the state of the DB itself. - * Events that could not be transformed (due Iglu errors or DB unavailability) are sent back - * to `badQueue` + * Events that could not be transformed (due Iglu errors or DB unavailability) are emitted from + * the pipe * @param state mutable Loader state - * @param badQueue queue where all unsucessful actions can unload its results * @param client Iglu Client * @param processor The actor processing these events */ def goodSink[F[_]: Concurrent: Clock: DB](state: State[F], - badQueue: Queue[F, BadData], client: Client[F, Json], - processor: Processor): Pipe[F, Data, Unit] = - _.parEvalMapUnordered(32)(sinkPayload(state, badQueue, client, processor)) - - /** Sink bad data coming directly into the `Pipe` and data coming from `badQueue` */ - def badSink[F[_]: Concurrent](badQueue: Queue[F, BadData]): Pipe[F, BadData, Unit] = - _.merge(badQueue.dequeue).evalMap { + processor: Processor): Pipe[F, Data, BadData] = + _.parEvalMapUnordered(32)(sinkPayload(state, client, processor)) + .collect { + case Left(badData) => badData + } + + /** Sink bad data coming directly into the `Pipe` */ + def badSink[F[_]: Concurrent]: Pipe[F, BadData, Unit] = + _.evalMap { case BadData.BadEnriched(row) => Sync[F].delay(println(row.compact)) case BadData.BadJson(payload, error) => Sync[F].delay(println(s"Cannot parse $payload. $error")) } /** Implementation for [[goodSink]] */ def sinkPayload[F[_]: Sync: Clock: DB](state: State[F], - badQueue: Queue[F, BadData], client: Client[F, Json], - processor: Processor)(payload: Data): F[Unit] = { + processor: Processor)(payload: Data): F[Either[BadData, Unit]] = { val result = for { entities <- payload match { case Data.Snowplow(event) => @@ -77,25 +77,18 @@ object sink { .map(entity => List(entity)) .leftMap(errors => BadData.BadJson(json.normalize.noSpaces, errors.toString)) } - insert = DB.process(entities, state).attempt.flatMap { - case Right(_) => Sync[F].unit - case Left(error) => payload match { + insert <- EitherT(DB.process(entities, state).attempt).leftMap { + case error => payload match { case Data.Snowplow(event) => val badRow = BadRow.LoaderRuntimeError(processor, error.getMessage, Payload.LoaderPayload(event)) - val pgBadRow = BadData.BadEnriched(badRow) - badQueue.enqueue1(pgBadRow) + BadData.BadEnriched(badRow) case Data.SelfDescribing(json) => - val pgBadRow = BadData.BadJson(json.normalize.noSpaces, s"Cannot insert: ${error.getMessage}") - badQueue.enqueue1(pgBadRow) - + BadData.BadJson(json.normalize.noSpaces, s"Cannot insert: ${error.getMessage}") } } } yield insert - result.value.flatMap { - case Right(action) => action - case Left(error) => badQueue.enqueue1(error) - } + result.value } /** diff --git a/modules/common/src/test/scala/com/snowplowanalytics/snowplow/postgres/streaming/sinkspec.scala b/modules/common/src/test/scala/com/snowplowanalytics/snowplow/postgres/streaming/sinkspec.scala index 6d22c5e..ef6f70b 100644 --- a/modules/common/src/test/scala/com/snowplowanalytics/snowplow/postgres/streaming/sinkspec.scala +++ b/modules/common/src/test/scala/com/snowplowanalytics/snowplow/postgres/streaming/sinkspec.scala @@ -17,7 +17,6 @@ import java.util.UUID import cats.effect.IO import fs2.Stream -import fs2.concurrent.Queue import io.circe.Json import io.circe.literal._ @@ -30,7 +29,7 @@ import com.snowplowanalytics.snowplow.analytics.scalasdk.Event import com.snowplowanalytics.snowplow.badrows.Processor import 
com.snowplowanalytics.snowplow.postgres.Database import com.snowplowanalytics.snowplow.postgres.api.{State, DB} -import com.snowplowanalytics.snowplow.postgres.streaming.data.{Data, BadData} +import com.snowplowanalytics.snowplow.postgres.streaming.data.Data class sinkspec extends Database { @@ -49,8 +48,7 @@ class sinkspec extends Database { val action = for { state <- State.init[IO](List(), igluClient.resolver) - queue <- Queue.bounded[IO, BadData](1).action - _ <- stream.through(sink.goodSink(state, queue, igluClient, processor)).compile.drain.action + _ <- stream.through(sink.goodSink(state, igluClient, processor)).compile.drain.action eventIds <- query.action uaParserCtxs <- count("com_snowplowanalytics_snowplow_ua_parser_context_1").action } yield (eventIds, uaParserCtxs) @@ -72,8 +70,7 @@ class sinkspec extends Database { val action = for { state <- State.init[IO](List(), igluClient.resolver) - queue <- Queue.bounded[IO, BadData](1).action - _ <- stream.through(sink.goodSink(state, queue, igluClient, processor)).compile.drain.action + _ <- stream.through(sink.goodSink(state, igluClient, processor)).compile.drain.action eventIds <- query.action rows <- count("com_getvero_bounced_1").action } yield (eventIds, rows) @@ -110,8 +107,7 @@ class sinkspec extends Database { val action = for { state <- State.init[IO](List(), igluClient.resolver) - queue <- Queue.bounded[IO, BadData](1).action - _ <- stream.through(sink.goodSink(state, queue, igluClient, processor)).compile.drain.action + _ <- stream.through(sink.goodSink(state, igluClient, processor)).compile.drain.action rows <- count("me_chuwy_pg_test_1").action table <- describeTable("me_chuwy_pg_test_1").action } yield (rows, table) diff --git a/modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/loader/Main.scala b/modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/loader/Main.scala index 94609e0..578cf1d 100644 --- a/modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/loader/Main.scala +++ b/modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/loader/Main.scala @@ -34,7 +34,7 @@ object Main extends IOApp { case Right(Cli(appConfig, iglu, debug)) => val logger = if (debug) LogHandler.jdkLogHandler else LogHandler.nop resources.initialize[IO](appConfig.getLoaderConfig, logger, iglu).use { - case (blocker, xa, state, badQueue) => + case (blocker, xa, state) => source.getSource[IO](blocker, appConfig.purpose, appConfig.source) match { case Right(dataStream) => val meta = appConfig.purpose.snowplow @@ -44,9 +44,9 @@ object Main extends IOApp { case Purpose.Enriched => utils.prepare[IO](appConfig.schema, xa, logger) case Purpose.SelfDescribing => IO.unit } - goodSink = sink.goodSink[IO](state, badQueue, iglu, processor) - badSink = sink.badSink[IO](badQueue) - s = dataStream.observeEither(badSink, goodSink) + goodSink = sink.goodSink[IO](state, iglu, processor) + badSink = sink.badSink[IO] + s = dataStream.observeEither(badSink, goodSink.andThen(_.through(badSink))) _ <- s.compile.drain } yield ExitCode.Success From 1a0650629974841fbbeff5d1ac660bc36a4a8c85 Mon Sep 17 00:00:00 2001 From: Ian Streeter Date: Wed, 12 Aug 2020 08:28:53 +0100 Subject: [PATCH 5/8] Adjust separation of DB and non-DB config classes --- .../{LoaderConfig.scala => DBConfig.scala} | 33 +++---------------- .../snowplow/postgres/resources.scala | 6 ++-- .../snowplow/postgres/Database.scala | 2 +- .../snowplow/postgres/config/Cli.scala | 4 +-- .../{AppConfig.scala => LoaderConfig.scala} | 33 
++++++++++++++----- .../snowplow/postgres/loader/Main.scala | 14 ++++---- .../snowplow/postgres/streaming/source.scala | 8 ++--- .../config/CliSpec.scala | 5 ++- 8 files changed, 48 insertions(+), 57 deletions(-) rename modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/config/{LoaderConfig.scala => DBConfig.scala} (56%) rename modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/config/{AppConfig.scala => LoaderConfig.scala} (81%) diff --git a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/config/LoaderConfig.scala b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/config/DBConfig.scala similarity index 56% rename from modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/config/LoaderConfig.scala rename to modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/config/DBConfig.scala index 0925d84..9a7769e 100644 --- a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/config/LoaderConfig.scala +++ b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/config/DBConfig.scala @@ -12,50 +12,25 @@ */ package com.snowplowanalytics.snowplow.postgres.config -import cats.syntax.either._ +import DBConfig.JdbcUri -import io.circe.Decoder -import LoaderConfig.{JdbcUri, Purpose} -import io.circe.generic.semiauto.deriveDecoder - -case class LoaderConfig(host: String, +case class DBConfig(host: String, port: Int, database: String, username: String, password: String, // TODO: can be EC2 store sslMode: String, - schema: String, - purpose: Purpose) { + schema: String) { def getJdbc: JdbcUri = JdbcUri(host, port, database, sslMode.toLowerCase().replace('_', '-')) } -object LoaderConfig { +object DBConfig { - sealed trait Purpose extends Product with Serializable { - def snowplow: Boolean = this match { - case Purpose.Enriched => true - case Purpose.SelfDescribing => false - } - } - object Purpose { - case object Enriched extends Purpose - case object SelfDescribing extends Purpose - - implicit def ioCirceConfigPurposeDecoder: Decoder[Purpose] = - Decoder.decodeString.emap { - case "ENRICHED_EVENTS" => Enriched.asRight - case "JSON" => SelfDescribing.asRight - case other => s"$other is not supported purpose, choose from ENRICHED_EVENTS and JSON".asLeft - } - } case class JdbcUri(host: String, port: Int, database: String, sslMode: String) { override def toString = s"jdbc:postgresql://$host:$port/$database?sslmode=$sslMode" } - implicit def ioCirceConfigDecoder: Decoder[LoaderConfig] = - deriveDecoder[LoaderConfig] - } diff --git a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/resources.scala b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/resources.scala index 9a831c9..87bc01a 100644 --- a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/resources.scala +++ b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/resources.scala @@ -27,13 +27,13 @@ import io.circe.Json import com.snowplowanalytics.iglu.client.Client import com.snowplowanalytics.snowplow.postgres.api.State -import com.snowplowanalytics.snowplow.postgres.config.LoaderConfig -import com.snowplowanalytics.snowplow.postgres.config.LoaderConfig.JdbcUri +import com.snowplowanalytics.snowplow.postgres.config.DBConfig +import com.snowplowanalytics.snowplow.postgres.config.DBConfig.JdbcUri object resources { /** Initialise Blocking Thread Pool, Connection Pool, DB state and bad queue resources */ - def initialize[F[_]: 
Concurrent: Clock: ContextShift](postgres: LoaderConfig, + def initialize[F[_]: Concurrent: Clock: ContextShift](postgres: DBConfig, logger: LogHandler, iglu: Client[F, Json]) = for { diff --git a/modules/common/src/test/scala/com/snowplowanalytics/snowplow/postgres/Database.scala b/modules/common/src/test/scala/com/snowplowanalytics/snowplow/postgres/Database.scala index 974d7f1..7be64a6 100644 --- a/modules/common/src/test/scala/com/snowplowanalytics/snowplow/postgres/Database.scala +++ b/modules/common/src/test/scala/com/snowplowanalytics/snowplow/postgres/Database.scala @@ -23,7 +23,7 @@ import com.snowplowanalytics.iglu.client.resolver.registries.Registry.{HttpConne import com.snowplowanalytics.iglu.client.validator.CirceValidator import com.snowplowanalytics.snowplow.badrows.FailureDetails -import com.snowplowanalytics.snowplow.postgres.config.LoaderConfig.JdbcUri +import com.snowplowanalytics.snowplow.postgres.config.DBConfig.JdbcUri trait Database extends Specification with BeforeAfterEach { import Database._ diff --git a/modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/config/Cli.scala b/modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/config/Cli.scala index 50f0b8a..0528434 100644 --- a/modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/config/Cli.scala +++ b/modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/config/Cli.scala @@ -35,7 +35,7 @@ import com.monovore.decline._ import com.snowplowanalytics.snowplow.postgres.generated.BuildInfo -case class Cli[F[_]](config: AppConfig, iglu: Client[F, Json], debug: Boolean) +case class Cli[F[_]](config: LoaderConfig, iglu: Client[F, Json], debug: Boolean) object Cli { @@ -55,7 +55,7 @@ object Cli { configJson <- PathOrJson.load(rawConfig.config) configData <- SelfDescribingData.parse(configJson).leftMap(e => s"Configuration JSON is not self-describing, ${e.message(configJson.noSpaces)}").toEitherT[F] _ <- igluClient.check(configData).leftMap(e => s"Iglu validation failed with following error\n: ${e.asJson.spaces2}") - appConfig <- configData.data.as[AppConfig].toEitherT[F].leftMap(e => s"Error while decoding configuration JSON, ${e.show}") + appConfig <- configData.data.as[LoaderConfig].toEitherT[F].leftMap(e => s"Error while decoding configuration JSON, ${e.show}") } yield Cli(appConfig, igluClient, rawConfig.debug) } diff --git a/modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/config/AppConfig.scala b/modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/config/LoaderConfig.scala similarity index 81% rename from modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/config/AppConfig.scala rename to modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/config/LoaderConfig.scala index ce62fd1..4397817 100644 --- a/modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/config/AppConfig.scala +++ b/modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/config/LoaderConfig.scala @@ -24,13 +24,12 @@ import io.circe.generic.semiauto.deriveDecoder import io.circe.generic.extras.Configuration import io.circe.generic.extras.semiauto.deriveConfiguredDecoder -import AppConfig.Source -import LoaderConfig.Purpose +import LoaderConfig.{Purpose, Source} import software.amazon.awssdk.regions.Region import software.amazon.kinesis.common.InitialPositionInStream -case class AppConfig(name: String, +case class LoaderConfig(name: String, id: UUID, source: Source, host: String, @@ 
-41,11 +40,11 @@ case class AppConfig(name: String, sslMode: String, schema: String, purpose: Purpose) { - def getLoaderConfig: LoaderConfig = - LoaderConfig(host, port, database, username, password, sslMode, schema, purpose) + def getDBConfig: DBConfig = + DBConfig(host, port, database, username, password, sslMode, schema) } -object AppConfig { +object LoaderConfig { implicit val awsRegionDecoder: Decoder[Region] = Decoder.decodeString.emap { s => @@ -90,6 +89,24 @@ object AppConfig { } } + sealed trait Purpose extends Product with Serializable { + def snowplow: Boolean = this match { + case Purpose.Enriched => true + case Purpose.SelfDescribing => false + } + } + object Purpose { + case object Enriched extends Purpose + case object SelfDescribing extends Purpose + + implicit def ioCirceConfigPurposeDecoder: Decoder[Purpose] = + Decoder.decodeString.emap { + case "ENRICHED_EVENTS" => Enriched.asRight + case "JSON" => SelfDescribing.asRight + case other => s"$other is not supported purpose, choose from ENRICHED_EVENTS and JSON".asLeft + } + } + sealed trait Source extends Product with Serializable object Source { @@ -103,7 +120,7 @@ object AppConfig { deriveConfiguredDecoder[Source] } - implicit def ioCirceConfigDecoder: Decoder[AppConfig] = - deriveDecoder[AppConfig] + implicit def ioCirceConfigDecoder: Decoder[LoaderConfig] = + deriveDecoder[LoaderConfig] } diff --git a/modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/loader/Main.scala b/modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/loader/Main.scala index 578cf1d..75da1e3 100644 --- a/modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/loader/Main.scala +++ b/modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/loader/Main.scala @@ -31,17 +31,17 @@ object Main extends IOApp { def run(args: List[String]): IO[ExitCode] = Cli.parse[IO](args).value.flatMap { - case Right(Cli(appConfig, iglu, debug)) => + case Right(Cli(loaderConfig, iglu, debug)) => val logger = if (debug) LogHandler.jdkLogHandler else LogHandler.nop - resources.initialize[IO](appConfig.getLoaderConfig, logger, iglu).use { + resources.initialize[IO](loaderConfig.getDBConfig, logger, iglu).use { case (blocker, xa, state) => - source.getSource[IO](blocker, appConfig.purpose, appConfig.source) match { + source.getSource[IO](blocker, loaderConfig.purpose, loaderConfig.source) match { case Right(dataStream) => - val meta = appConfig.purpose.snowplow - implicit val db: DB[IO] = DB.interpreter[IO](iglu.resolver, xa, logger, appConfig.schema, meta) + val meta = loaderConfig.purpose.snowplow + implicit val db: DB[IO] = DB.interpreter[IO](iglu.resolver, xa, logger, loaderConfig.schema, meta) for { - _ <- appConfig.purpose match { - case Purpose.Enriched => utils.prepare[IO](appConfig.schema, xa, logger) + _ <- loaderConfig.purpose match { + case Purpose.Enriched => utils.prepare[IO](loaderConfig.schema, xa, logger) case Purpose.SelfDescribing => IO.unit } goodSink = sink.goodSink[IO](state, iglu, processor) diff --git a/modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/streaming/source.scala b/modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/streaming/source.scala index a8fd2e9..28e8e4b 100644 --- a/modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/streaming/source.scala +++ b/modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/streaming/source.scala @@ -32,7 +32,7 @@ import com.snowplowanalytics.iglu.core.circe.implicits._ 
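// --- Illustrative sketch, not part of the patch above ---
// The Purpose decoder introduced in LoaderConfig accepts only the two literal
// strings used in the configuration; anything else fails decoding. This
// snippet assumes the implicit Decoder[Purpose] from the companion object is
// in scope (it is, via LoaderConfig.Purpose).
import io.circe.Json
import com.snowplowanalytics.snowplow.postgres.config.LoaderConfig.Purpose

val enriched = Json.fromString("ENRICHED_EVENTS").as[Purpose] // Right(Purpose.Enriched)
val selfDesc = Json.fromString("JSON").as[Purpose]            // Right(Purpose.SelfDescribing)
val rejected = Json.fromString("CSV").as[Purpose]             // Left(DecodingFailure(...))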
import com.snowplowanalytics.snowplow.analytics.scalasdk.Event import com.snowplowanalytics.snowplow.analytics.scalasdk.ParsingError.NotTSV import com.snowplowanalytics.snowplow.badrows.{BadRow, Payload} -import com.snowplowanalytics.snowplow.postgres.config.{AppConfig, Cli} +import com.snowplowanalytics.snowplow.postgres.config.{LoaderConfig, Cli} import com.snowplowanalytics.snowplow.postgres.config.LoaderConfig.Purpose import com.snowplowanalytics.snowplow.postgres.streaming.data.{BadData, Data} @@ -52,16 +52,16 @@ object source { */ def getSource[F[_]: ConcurrentEffect: ContextShift](blocker: Blocker, purpose: Purpose, - config: AppConfig.Source) = + config: LoaderConfig.Source) = config match { - case AppConfig.Source.Kinesis(appName, streamName, region, position) => + case LoaderConfig.Source.Kinesis(appName, streamName, region, position) => KinesisConsumerSettings.apply(streamName, appName, region, initialPositionInStream = position.unwrap) match { case Right(settings) => readFromKinesisStream[F](settings).evalMap(record => record.checkpoint.as(parseRecord(purpose, record))).asRight case Left(error) => error.asLeft } - case AppConfig.Source.PubSub(projectId, subscriptionId) => + case LoaderConfig.Source.PubSub(projectId, subscriptionId) => implicit val decoder: MessageDecoder[Either[BadData, Data]] = pubsubDataDecoder(purpose) val project = ProjectId(projectId) val subscription = Subscription(subscriptionId) diff --git a/modules/loader/src/test/scala/com.snowplowanalytics.snowplow.postgres/config/CliSpec.scala b/modules/loader/src/test/scala/com.snowplowanalytics.snowplow.postgres/config/CliSpec.scala index 439b532..1aefd45 100644 --- a/modules/loader/src/test/scala/com.snowplowanalytics.snowplow.postgres/config/CliSpec.scala +++ b/modules/loader/src/test/scala/com.snowplowanalytics.snowplow.postgres/config/CliSpec.scala @@ -17,8 +17,7 @@ import java.util.UUID import cats.effect.{IO, Clock} -import com.snowplowanalytics.snowplow.postgres.config.LoaderConfig.Purpose -import com.snowplowanalytics.snowplow.postgres.config.AppConfig.{Source, InitPosition} +import com.snowplowanalytics.snowplow.postgres.config.LoaderConfig.{Purpose, Source, InitPosition} import org.specs2.mutable.Specification import software.amazon.awssdk.regions.Region @@ -32,7 +31,7 @@ class CliSpec extends Specification { val resolver = Paths.get(getClass.getResource("/resolver.json").toURI) val argv = List("--config", config.toString, "--resolver", resolver.toString) - val expected = AppConfig( + val expected = LoaderConfig( "Acme Ltd. 
Snowplow Postgres", UUID.fromString("5c5e4353-4eeb-43da-98f8-2de6dc7fa947"), Source.Kinesis("acme-postgres-loader", "enriched-events", Region.EU_CENTRAL_1, InitPosition.TrimHorizon), From 00c1e1251e14227b57cd0b4b1982a3d1cd87bb5e Mon Sep 17 00:00:00 2001 From: Ian Streeter Date: Thu, 13 Aug 2020 15:53:26 +0100 Subject: [PATCH 6/8] Shredded events carry whether they include meta columns --- .../snowplow/postgres/api/DB.scala | 23 +++++++++++-------- .../postgres/shredding/Shredded.scala | 20 ++++++++++++++++ .../postgres/shredding/transform.scala | 9 ++++---- .../snowplow/postgres/streaming/sink.scala | 1 - .../postgres/streaming/sinkspec.scala | 6 ++--- .../postgres/config/LoaderConfig.scala | 7 +----- .../snowplow/postgres/loader/Main.scala | 3 +-- 7 files changed, 43 insertions(+), 26 deletions(-) create mode 100644 modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/shredding/Shredded.scala diff --git a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/api/DB.scala b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/api/DB.scala index 69a2ff2..9969070 100644 --- a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/api/DB.scala +++ b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/api/DB.scala @@ -27,14 +27,14 @@ import com.snowplowanalytics.iglu.client.Resolver import com.snowplowanalytics.iglu.schemaddl.migrations.SchemaList -import com.snowplowanalytics.snowplow.postgres.shredding.{Entity, schema} +import com.snowplowanalytics.snowplow.postgres.shredding.{Entity, Shredded, schema} import com.snowplowanalytics.snowplow.postgres.storage.ddl import com.snowplowanalytics.snowplow.postgres.streaming.sink trait DB[F[_]] { def insert(event: List[Entity]): F[Unit] def alter(schemaKey: SchemaKey): F[Unit] - def create(schemaKey: SchemaKey): F[Unit] + def create(schemaKey: SchemaKey, includeMeta: Boolean): F[Unit] def getSchemaList(schemaKey: SchemaKey): F[SchemaList] } @@ -43,14 +43,18 @@ object DB { def apply[F[_]](implicit ev: DB[F]): DB[F] = ev - def process[F[_]](event: List[Entity], state: State[F]) + def process[F[_]](shredded: Shredded, state: State[F]) (implicit D: DB[F], B: Bracket[F, Throwable]): F[Unit] = { - val insert = D.insert(event) + val (includeMeta, entities) = shredded match { + case Shredded.ShreddedSnowplow(atomic, entities) => (true, atomic :: entities) + case Shredded.ShreddedSelfDescribing(entity) => (false, List(entity)) + } + val insert = D.insert(entities) // Mutate table and Loader's mutable variable. Only for locked state! 
def mutate(missing: Set[SchemaKey], outdated: Set[SchemaKey]): F[Unit] = for { - _ <- missing.toList.traverse(D.create) // Create missing tables if any + _ <- missing.toList.traverse(key => D.create(key, includeMeta)) // Create missing tables if any _ <- outdated.toList.traverse(D.alter) // Updated outdated tables if any _ <- (missing ++ outdated).toList.traverse_ { entity => for { // Update state with new schemas @@ -60,7 +64,7 @@ object DB { } } yield () - state.checkAndRun(_.checkEvent(event), insert, mutate) + state.checkAndRun(_.checkEvent(entities), insert, mutate) } @@ -87,8 +91,7 @@ object DB { def interpreter[F[_]: Sync: Clock](resolver: Resolver[F], xa: Transactor[F], logger: LogHandler, - schemaName: String, - meta: Boolean): DB[F] = new DB[F] { + schemaName: String): DB[F] = new DB[F] { def insert(event: List[Entity]): F[Unit] = event.traverse_(sink.insertStatement(logger, schemaName, _)).transact(xa) @@ -97,8 +100,8 @@ object DB { rethrow(result.semiflatMap(_.transact(xa))) } - def create(schemaKey: SchemaKey): F[Unit] = { - val result = ddl.createTable[F](resolver, logger, schemaName, schemaKey, meta) + def create(schemaKey: SchemaKey, includeMeta: Boolean): F[Unit] = { + val result = ddl.createTable[F](resolver, logger, schemaName, schemaKey, includeMeta) rethrow(result.semiflatMap(_.transact(xa))) } diff --git a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/shredding/Shredded.scala b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/shredding/Shredded.scala new file mode 100644 index 0000000..3042bd9 --- /dev/null +++ b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/shredding/Shredded.scala @@ -0,0 +1,20 @@ +/* + * Copyright (c) 2020 Snowplow Analytics Ltd. All rights reserved. + * + * This program is licensed to you under the Apache License Version 2.0, + * and you may not use this file except in compliance with the Apache License Version 2.0. + * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the Apache License Version 2.0 is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. 
+ */ +package com.snowplowanalytics.snowplow.postgres.shredding + +sealed trait Shredded + +object Shredded { + case class ShreddedSnowplow(event: Entity, entities: List[Entity]) extends Shredded + case class ShreddedSelfDescribing(entity: Entity) extends Shredded +} diff --git a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/shredding/transform.scala b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/shredding/transform.scala index dc6767e..a7f1062 100644 --- a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/shredding/transform.scala +++ b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/shredding/transform.scala @@ -36,19 +36,20 @@ import com.snowplowanalytics.iglu.schemaddl.migrations.FlatSchema import com.snowplowanalytics.snowplow.analytics.scalasdk.Event import com.snowplowanalytics.snowplow.badrows.{FailureDetails, BadRow, Failure, Payload, Processor} import Entity.Column +import Shredded.{ShreddedSelfDescribing, ShreddedSnowplow} object transform { val Atomic = SchemaKey("com.snowplowanalytics.snowplow", "atomic", "jsonschema", SchemaVer.Full(1,0,0)) /** Transform the whole `Event` (canonical and JSONs) into list of independent entities ready to be inserted */ - def shredEvent[F[_]: Sync: Clock](client: Client[F, Json], processor: Processor, event: Event): EitherT[F, BadRow, List[Entity]] = { + def shredEvent[F[_]: Sync: Clock](client: Client[F, Json], processor: Processor, event: Event): EitherT[F, BadRow, ShreddedSnowplow] = { val entities = event.contexts.data ++ event.derived_contexts.data ++ event.unstruct_event.data.toList val wholeEvent = entities .parTraverse(shredJson(client)) .value .map { shreddedOrError => (shreddedOrError, shredAtomic(Map())(event)).mapN { - (shreddedEntities, atomic) => atomic :: shreddedEntities.map(addMetadata(event.event_id, event.collector_tstamp)) + (shreddedEntities, atomic) => ShreddedSnowplow(atomic, shreddedEntities.map(_.entity).map(addMetadata(event.event_id, event.collector_tstamp))) } } EitherT(wholeEvent).leftMap[BadRow](buildBadRow(processor, event)) @@ -83,7 +84,7 @@ object transform { /** Transform JSON into [[Entity]] */ def shredJson[F[_]: Sync: Clock](client: Client[F, Json]) - (data: SelfDescribingData[Json]): EitherT[F, NonEmptyList[FailureDetails.LoaderIgluError], Entity] = { + (data: SelfDescribingData[Json]): EitherT[F, NonEmptyList[FailureDetails.LoaderIgluError], ShreddedSelfDescribing] = { val key = data.schema schema.getOrdered(client.resolver)(key.vendor, key.name, key.version.model) .leftMap { error => NonEmptyList.of(error) } @@ -105,7 +106,7 @@ object transform { case Atomic => "events" case other => StringUtils.getTableName(SchemaMap(other)) } - Entity(tableName, data.schema, columns) + ShreddedSelfDescribing(Entity(tableName, data.schema, columns)) } } } diff --git a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/streaming/sink.scala b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/streaming/sink.scala index 4c6f0cd..ae504bb 100644 --- a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/streaming/sink.scala +++ b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/streaming/sink.scala @@ -74,7 +74,6 @@ object sink { case Data.SelfDescribing(json) => transform .shredJson(client)(json) - .map(entity => List(entity)) .leftMap(errors => BadData.BadJson(json.normalize.noSpaces, errors.toString)) } insert <- EitherT(DB.process(entities, 
state).attempt).leftMap { diff --git a/modules/common/src/test/scala/com/snowplowanalytics/snowplow/postgres/streaming/sinkspec.scala b/modules/common/src/test/scala/com/snowplowanalytics/snowplow/postgres/streaming/sinkspec.scala index ef6f70b..d8c56f9 100644 --- a/modules/common/src/test/scala/com/snowplowanalytics/snowplow/postgres/streaming/sinkspec.scala +++ b/modules/common/src/test/scala/com/snowplowanalytics/snowplow/postgres/streaming/sinkspec.scala @@ -44,7 +44,7 @@ class sinkspec extends Database { val event = Event.parse(line).getOrElse(throw new RuntimeException("Event is invalid")) val stream = Stream.emit[IO, Data](Data.Snowplow(event)) - implicit val D = DB.interpreter[IO](igluClient.resolver, xa, logger, Schema, true) + implicit val D = DB.interpreter[IO](igluClient.resolver, xa, logger, Schema) val action = for { state <- State.init[IO](List(), igluClient.resolver) @@ -66,7 +66,7 @@ class sinkspec extends Database { val json = SelfDescribingData.parse(row).getOrElse(throw new RuntimeException("Invalid SelfDescribingData")) val stream = Stream.emit[IO, Data](Data.SelfDescribing(json)) - implicit val D = DB.interpreter[IO](igluClient.resolver, xa, logger, Schema, false) + implicit val D = DB.interpreter[IO](igluClient.resolver, xa, logger, Schema) val action = for { state <- State.init[IO](List(), igluClient.resolver) @@ -103,7 +103,7 @@ class sinkspec extends Database { ColumnInfo("big_int", None, true, "bigint", None), ) - implicit val D = DB.interpreter[IO](igluClient.resolver, xa, logger, Schema, false) + implicit val D = DB.interpreter[IO](igluClient.resolver, xa, logger, Schema) val action = for { state <- State.init[IO](List(), igluClient.resolver) diff --git a/modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/config/LoaderConfig.scala b/modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/config/LoaderConfig.scala index 4397817..3b4d290 100644 --- a/modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/config/LoaderConfig.scala +++ b/modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/config/LoaderConfig.scala @@ -89,12 +89,7 @@ object LoaderConfig { } } - sealed trait Purpose extends Product with Serializable { - def snowplow: Boolean = this match { - case Purpose.Enriched => true - case Purpose.SelfDescribing => false - } - } + sealed trait Purpose extends Product with Serializable object Purpose { case object Enriched extends Purpose case object SelfDescribing extends Purpose diff --git a/modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/loader/Main.scala b/modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/loader/Main.scala index 75da1e3..bc5ad34 100644 --- a/modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/loader/Main.scala +++ b/modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/loader/Main.scala @@ -37,8 +37,7 @@ object Main extends IOApp { case (blocker, xa, state) => source.getSource[IO](blocker, loaderConfig.purpose, loaderConfig.source) match { case Right(dataStream) => - val meta = loaderConfig.purpose.snowplow - implicit val db: DB[IO] = DB.interpreter[IO](iglu.resolver, xa, logger, loaderConfig.schema, meta) + implicit val db: DB[IO] = DB.interpreter[IO](iglu.resolver, xa, logger, loaderConfig.schema) for { _ <- loaderConfig.purpose match { case Purpose.Enriched => utils.prepare[IO](loaderConfig.schema, xa, logger) From d6f595e0b924005c01bd1eabb2279a360852ef41 Mon Sep 17 00:00:00 2001 From: Ian Streeter 
Date: Thu, 13 Aug 2020 17:36:52 +0100 Subject: [PATCH 7/8] Helper function for initializing db state --- .../snowplow/postgres/resources.scala | 28 ++++++++++++------- .../snowplow/postgres/streaming/sink.scala | 3 +- 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/resources.scala b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/resources.scala index 87bc01a..c90759f 100644 --- a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/resources.scala +++ b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/resources.scala @@ -32,6 +32,8 @@ import com.snowplowanalytics.snowplow.postgres.config.DBConfig.JdbcUri object resources { + val FixedThreadPoolSize: Int = 32 + /** Initialise Blocking Thread Pool, Connection Pool, DB state and bad queue resources */ def initialize[F[_]: Concurrent: Clock: ContextShift](postgres: DBConfig, logger: LogHandler, @@ -39,26 +41,32 @@ object resources { for { blocker <- Blocker[F] xa <- resources.getTransactor[F](postgres.getJdbc, postgres.username, postgres.password, blocker) - keysF = for { - ci <- storage.query.getComments(postgres.schema, logger).transact(xa).map(_.separate) - (issues, comments) = ci - _ <- issues.traverse_(issue => Sync[F].delay(println(issue))) - } yield comments - keys <- Resource.liftF(keysF) - initState = State.init[F](keys, iglu.resolver).value.flatMap { + state <- Resource.liftF(initializeState(postgres, logger, iglu, xa)) + } yield (blocker, xa, state) + + def initializeState[F[_]: Concurrent: Clock](postgres: DBConfig, + logger: LogHandler, + iglu: Client[F, Json], + xa: Transactor[F]): F[State[F]] = { + for { + ci <- storage.query.getComments(postgres.schema, logger).transact(xa).map(_.separate) + (issues, comments) = ci + _ <- issues.traverse_(issue => Sync[F].delay(println(issue))) + initState = State.init[F](comments, iglu.resolver).value.flatMap { case Left(error) => val exception = new RuntimeException(s"Couldn't initalise the state $error") Sync[F].raiseError[State[F]](exception) case Right(state) => Sync[F].pure(state) } - state <- Resource.liftF(initState) - } yield (blocker, xa, state) + state <- initState + } yield state + } /** Get a HikariCP transactor */ def getTransactor[F[_]: Async: ContextShift](jdbcUri: JdbcUri, user: String, password: String, be: Blocker): Resource[F, HikariTransactor[F]] = for { - ce <- ExecutionContexts.fixedThreadPool[F](32) + ce <- ExecutionContexts.fixedThreadPool[F](FixedThreadPoolSize) xa <- HikariTransactor.newHikariTransactor[F]("org.postgresql.Driver", jdbcUri.toString, user, password, ce, be) } yield xa diff --git a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/streaming/sink.scala b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/streaming/sink.scala index ae504bb..5195ace 100644 --- a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/streaming/sink.scala +++ b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/streaming/sink.scala @@ -30,6 +30,7 @@ import com.snowplowanalytics.iglu.client.Client import com.snowplowanalytics.snowplow.badrows.{BadRow, Payload, Processor} import com.snowplowanalytics.snowplow.postgres.api.{State, DB} +import com.snowplowanalytics.snowplow.postgres.resources.FixedThreadPoolSize import com.snowplowanalytics.snowplow.postgres.shredding.{Entity, transform} import com.snowplowanalytics.snowplow.postgres.streaming.data.{Data, BadData} 
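// --- Illustrative sketch, not part of the patch ---
// The next hunk reuses FixedThreadPoolSize so the sink's fs2 parallelism
// matches the size of the doobie thread pool instead of a separate hard-coded
// 32. A minimal standalone example of parEvalMapUnordered bounded by a shared
// constant (names below are made up for illustration):
import cats.effect.{ContextShift, IO}
import fs2.Stream
import scala.concurrent.ExecutionContext

implicit val cs: ContextShift[IO] = IO.contextShift(ExecutionContext.global)

val parallelism = 32 // stands in for resources.FixedThreadPoolSize
val doubled: Stream[IO, Int] =
  Stream.range(0, 100).covary[IO].parEvalMapUnordered(parallelism)(i => IO.pure(i * 2))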
@@ -49,7 +50,7 @@ object sink { def goodSink[F[_]: Concurrent: Clock: DB](state: State[F], client: Client[F, Json], processor: Processor): Pipe[F, Data, BadData] = - _.parEvalMapUnordered(32)(sinkPayload(state, client, processor)) + _.parEvalMapUnordered(FixedThreadPoolSize)(sinkPayload(state, client, processor)) .collect { case Left(badData) => badData } From 8bd586310492dee5e12bb41a54e73b97594e6320 Mon Sep 17 00:00:00 2001 From: Ian Streeter Date: Mon, 24 Aug 2020 09:03:49 +0100 Subject: [PATCH 8/8] Change to parallelism default settings - Sets the fs2 parallelism to match the db connection pool - Sets the doobie thread pool equal to the number of available processors --- .../snowplow/postgres/api/DB.scala | 74 +++-- .../snowplow/postgres/api/SchemaState.scala | 80 +++--- .../snowplow/postgres/api/State.scala | 45 ++- .../snowplow/postgres/api/TableState.scala | 2 +- .../snowplow/postgres/config/DBConfig.scala | 25 +- .../snowplow/postgres/package.scala | 4 +- .../snowplow/postgres/resources.scala | 42 +-- .../snowplow/postgres/shredding/Entity.scala | 11 +- .../snowplow/postgres/shredding/Type.scala | 80 +++--- .../snowplow/postgres/shredding/Value.scala | 24 +- .../snowplow/postgres/shredding/schema.scala | 37 ++- .../postgres/shredding/transform.scala | 139 ++++----- .../postgres/storage/CommentIssue.scala | 3 +- .../snowplow/postgres/storage/ddl.scala | 64 +++-- .../postgres/storage/definitions.scala | 268 +++++++++--------- .../snowplow/postgres/storage/query.scala | 4 +- .../snowplow/postgres/storage/sql.scala | 45 ++- .../snowplow/postgres/storage/utils.scala | 2 +- .../postgres/streaming/UnorderedPipe.scala | 52 ++++ .../snowplow/postgres/streaming/data.scala | 12 +- .../snowplow/postgres/streaming/sink.scala | 85 +++--- .../snowplow/postgres/Database.scala | 7 +- .../postgres/api/SchemaStateSpec.scala | 17 +- .../snowplow/postgres/api/StateSpec.scala | 99 ++++--- .../postgres/streaming/sinkspec.scala | 42 +-- .../snowplow/postgres/config/Cli.scala | 62 ++-- .../postgres/config/LoaderConfig.scala | 44 +-- .../snowplow/postgres/loader/Main.scala | 10 +- .../snowplow/postgres/streaming/source.scala | 60 ++-- .../config/CliSpec.scala | 6 +- 30 files changed, 769 insertions(+), 676 deletions(-) create mode 100644 modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/streaming/UnorderedPipe.scala diff --git a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/api/DB.scala b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/api/DB.scala index 9969070..964d25f 100644 --- a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/api/DB.scala +++ b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/api/DB.scala @@ -15,7 +15,7 @@ package com.snowplowanalytics.snowplow.postgres.api import cats.data.EitherT import cats.implicits._ -import cats.effect.{Bracket, Sync, Clock} +import cats.effect.{Bracket, Clock, Sync} import doobie.implicits._ import doobie.util.log.LogHandler @@ -43,23 +43,22 @@ object DB { def apply[F[_]](implicit ev: DB[F]): DB[F] = ev - def process[F[_]](shredded: Shredded, state: State[F]) - (implicit D: DB[F], B: Bracket[F, Throwable]): F[Unit] = { + def process[F[_]](shredded: Shredded, state: State[F])(implicit D: DB[F], B: Bracket[F, Throwable]): F[Unit] = { val (includeMeta, entities) = shredded match { case Shredded.ShreddedSnowplow(atomic, entities) => (true, atomic :: entities) - case Shredded.ShreddedSelfDescribing(entity) => (false, List(entity)) + case 
Shredded.ShreddedSelfDescribing(entity) => (false, List(entity)) } val insert = D.insert(entities) // Mutate table and Loader's mutable variable. Only for locked state! def mutate(missing: Set[SchemaKey], outdated: Set[SchemaKey]): F[Unit] = for { - _ <- missing.toList.traverse(key => D.create(key, includeMeta)) // Create missing tables if any - _ <- outdated.toList.traverse(D.alter) // Updated outdated tables if any + _ <- missing.toList.traverse(key => D.create(key, includeMeta)) // Create missing tables if any + _ <- outdated.toList.traverse(D.alter) // Updated outdated tables if any _ <- (missing ++ outdated).toList.traverse_ { entity => - for { // Update state with new schemas + for { // Update state with new schemas list <- D.getSchemaList(entity) - _ <- state.put(list) + _ <- state.put(list) } yield () } } yield () @@ -67,17 +66,16 @@ object DB { state.checkAndRun(_.checkEvent(entities), insert, mutate) } - - sealed trait StateCheck { def missing: Set[SchemaKey] def outdated: Set[SchemaKey] - final def add(entity: SchemaKey, state: TableState): StateCheck = state match { - case TableState.Match => this - case TableState.Missing => StateCheck.Block(missing + entity, outdated) - case TableState.Outdated => StateCheck.Block(missing, outdated + entity) - } + final def add(entity: SchemaKey, state: TableState): StateCheck = + state match { + case TableState.Match => this + case TableState.Missing => StateCheck.Block(missing + entity, outdated) + case TableState.Outdated => StateCheck.Block(missing, outdated + entity) + } } object StateCheck { @@ -88,32 +86,30 @@ object DB { } } - def interpreter[F[_]: Sync: Clock](resolver: Resolver[F], - xa: Transactor[F], - logger: LogHandler, - schemaName: String): DB[F] = new DB[F] { - def insert(event: List[Entity]): F[Unit] = - event.traverse_(sink.insertStatement(logger, schemaName, _)).transact(xa) + def interpreter[F[_]: Sync: Clock](resolver: Resolver[F], xa: Transactor[F], logger: LogHandler, schemaName: String): DB[F] = + new DB[F] { + def insert(event: List[Entity]): F[Unit] = + event.traverse_(sink.insertStatement(logger, schemaName, _)).transact(xa) - def alter(schemaKey: SchemaKey): F[Unit] = { - val result = ddl.alterTable[F](resolver, logger, schemaName, schemaKey) - rethrow(result.semiflatMap(_.transact(xa))) - } - - def create(schemaKey: SchemaKey, includeMeta: Boolean): F[Unit] = { - val result = ddl.createTable[F](resolver, logger, schemaName, schemaKey, includeMeta) - rethrow(result.semiflatMap(_.transact(xa))) - } + def alter(schemaKey: SchemaKey): F[Unit] = { + val result = ddl.alterTable[F](resolver, logger, schemaName, schemaKey) + rethrow(result.semiflatMap(_.transact(xa))) + } - def getSchemaList(schemaKey: SchemaKey): F[SchemaList] = { - val result = schema.getSchemaList[F](resolver)(schemaKey.vendor, schemaKey.name, schemaKey.version.model) - rethrow(result) - } + def create(schemaKey: SchemaKey, includeMeta: Boolean): F[Unit] = { + val result = ddl.createTable[F](resolver, logger, schemaName, schemaKey, includeMeta) + rethrow(result.semiflatMap(_.transact(xa))) + } - private def rethrow[A, E](f: EitherT[F, E, A]): F[A] = - f.value.flatMap { - case Right(result) => Sync[F].pure(result) - case Left(error) => Sync[F].raiseError(new RuntimeException(error.toString)) + def getSchemaList(schemaKey: SchemaKey): F[SchemaList] = { + val result = schema.getSchemaList[F](resolver)(schemaKey.vendor, schemaKey.name, schemaKey.version.model) + rethrow(result) } - } + + private def rethrow[A, E](f: EitherT[F, E, A]): F[A] = + 
f.value.flatMap { + case Right(result) => Sync[F].pure(result) + case Left(error) => Sync[F].raiseError(new RuntimeException(error.toString)) + } + } } diff --git a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/api/SchemaState.scala b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/api/SchemaState.scala index 32b2fd6..db917bf 100644 --- a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/api/SchemaState.scala +++ b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/api/SchemaState.scala @@ -16,7 +16,7 @@ import cats.data.EitherT import cats.implicits._ import cats.effect.concurrent.Ref -import cats.effect.{Sync, Clock} +import cats.effect.{Clock, Sync} import com.snowplowanalytics.iglu.core.SchemaKey @@ -30,19 +30,20 @@ import com.snowplowanalytics.snowplow.postgres.loader._ import com.snowplowanalytics.snowplow.postgres.shredding /** - * State of the DB schema, where every `ModelGroup` (read "table") - * is associated with list of schemas. Each of these schemas is reflected - * in the structure of the table. If `SchemaKey` matches the `ModelGroup`, - * but not associated with it - the table is outdated. After table has been - * migrated to reflect the newest schema - state need to be updated up to - * that schema - */ + * State of the DB schema, where every `ModelGroup` (read "table") + * is associated with list of schemas. Each of these schemas is reflected + * in the structure of the table. If `SchemaKey` matches the `ModelGroup`, + * but not associated with it - the table is outdated. After table has been + * migrated to reflect the newest schema - state need to be updated up to + * that schema + */ case class SchemaState(tables: Map[ModelGroup, SchemaList]) { + /** - * Check if `SchemaKey` is known to the state - * @param entity `SchemaKey` taken from table comment - * @return one of three possible tables states - */ + * Check if `SchemaKey` is known to the state + * @param entity `SchemaKey` taken from table comment + * @return one of three possible tables states + */ private[postgres] def check(entity: SchemaKey): TableState = { val Atomic = shredding.transform.Atomic val group = (entity.vendor, entity.name, entity.version.model) @@ -50,20 +51,21 @@ case class SchemaState(tables: Map[ModelGroup, SchemaList]) { group match { case (Atomic.vendor, Atomic.name, Atomic.version.model) => TableState.Match - case _ => tables.get(group) match { - case Some(SchemaList.Full(schemas)) => - if (schemas.toList.map(_.self.schemaKey).contains(entity)) TableState.Match else TableState.Outdated - case Some(SchemaList.Single(schema)) => - if (schema.self.schemaKey === entity) TableState.Match else TableState.Outdated - case None => - TableState.Missing - } + case _ => + tables.get(group) match { + case Some(SchemaList.Full(schemas)) => + if (schemas.toList.map(_.self.schemaKey).contains(entity)) TableState.Match else TableState.Outdated + case Some(SchemaList.Single(schema)) => + if (schema.self.schemaKey === entity) TableState.Match else TableState.Outdated + case None => + TableState.Missing + } } } /** Check if any entities from an event are missing in current state */ def checkEvent(entities: List[shredding.Entity]): DB.StateCheck = - entities.foldLeft(DB.StateCheck.Ok: DB.StateCheck) { (acc, key) => acc.add(key.origin, check(key.origin)) } + entities.foldLeft(DB.StateCheck.Ok: DB.StateCheck)((acc, key) => acc.add(key.origin, check(key.origin))) /** Add a whole `SchemaList` to the state (replace if it exists) */ 
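// --- Illustrative sketch, not part of the patch ---
// How checkEvent above accumulates a StateCheck: a Match leaves the
// accumulator as it is, while Missing/Outdated upgrade it to a Block carrying
// the schema keys that need DDL. The snippet assumes it lives inside the
// com.snowplowanalytics.snowplow.postgres package (TableState is
// package-private) and that StateCheck.Ok carries empty key sets.
import com.snowplowanalytics.iglu.core.{SchemaKey, SchemaVer}
import com.snowplowanalytics.snowplow.postgres.api.{DB, TableState}

val knownKey   = SchemaKey("com.acme", "seen_before", "jsonschema", SchemaVer.Full(1, 0, 0)) // hypothetical
val missingKey = SchemaKey("com.acme", "never_seen", "jsonschema", SchemaVer.Full(1, 0, 0))  // hypothetical

val result: DB.StateCheck =
  List(knownKey -> TableState.Match, missingKey -> TableState.Missing)
    .foldLeft(DB.StateCheck.Ok: DB.StateCheck) { case (acc, (key, state)) => acc.add(key, state) }
// result is a StateCheck.Block whose `missing` set contains missingKey, so
// checkAndRun will take the lock and CREATE that table before inserting.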
def put(list: SchemaList): SchemaState = { @@ -74,30 +76,32 @@ case class SchemaState(tables: Map[ModelGroup, SchemaList]) { } object SchemaState { + /** - * Initialize internal mutable state by traversing all table comments to get their latest version - * For every schema URI, the whole list will be fetched to keep ordering consistent - * All newer versions (present on registry, but not reflected on table) will be dropped - * + * Initialize internal mutable state by traversing all table comments to get their latest version + * For every schema URI, the whole list will be fetched to keep ordering consistent + * All newer versions (present on registry, but not reflected on table) will be dropped + * * @param resolver Iglu Resolver attached to Iglu Server - * @return a list of potential schema issues (not fatal errors, to be logged) and - * an actual mutable reference with the state - */ + * @return a list of potential schema issues (not fatal errors, to be logged) and + * an actual mutable reference with the state + */ def init[F[_]: Sync: Clock](keys: List[SchemaKey], resolver: Resolver[F]) = { val initial = SchemaState(Map.empty) val availableSchemas = keys.traverse { key => EitherT(resolver.listSchemas(key.vendor, key.name, key.version.model)) - .leftMap { resolutionError => LoaderIgluError.IgluError(key, resolutionError) } - .flatMap { schemaKeyList => SchemaList.fromSchemaList(schemaKeyList, shredding.schema.fetch(resolver)) } - .map { list => list.until(key) match { - case Some(updatedList) => updatedList - case None => throw new IllegalStateException(s"SchemaList $list doesn't match vendor of ${key.toSchemaUri}") - } } + .leftMap(resolutionError => LoaderIgluError.IgluError(key, resolutionError)) + .flatMap(schemaKeyList => SchemaList.fromSchemaList(schemaKeyList, shredding.schema.fetch(resolver))) + .map { list => + list.until(key) match { + case Some(updatedList) => updatedList + case None => throw new IllegalStateException(s"SchemaList $list doesn't match vendor of ${key.toSchemaUri}") + } + } } - availableSchemas - .map { list => list.foldLeft(initial) { (acc, cur) => acc.put(cur) } } - .flatMap { state => EitherT.liftF[F, LoaderIgluError, Ref[F, SchemaState]](Ref.of[F, SchemaState](state)) } + availableSchemas.map(list => list.foldLeft(initial)((acc, cur) => acc.put(cur))).flatMap { state => + EitherT.liftF[F, LoaderIgluError, Ref[F, SchemaState]](Ref.of[F, SchemaState](state)) + } } } - diff --git a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/api/State.scala b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/api/State.scala index 0dad3d2..4423ee0 100644 --- a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/api/State.scala +++ b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/api/State.scala @@ -16,7 +16,7 @@ import cats.Monad import cats.data.EitherT import cats.implicits._ -import cats.effect.concurrent.{Ref, MVar} +import cats.effect.concurrent.{MVar, Ref} import cats.effect.{Bracket, Clock, Concurrent} import cats.effect.implicits._ @@ -30,31 +30,31 @@ import com.snowplowanalytics.snowplow.badrows.FailureDetails.LoaderIgluError import com.snowplowanalytics.snowplow.postgres.api.DB.StateCheck /** - * Mutable variable, protected by by lock. - * [[checkAndRun]] is the only function that should be able to mutate this structure - */ + * Mutable variable, protected by by lock. 
+ * [[checkAndRun]] is the only function that should be able to mutate this structure + */ final class State[F[_]](lock: MVar[F, Unit], state: Ref[F, SchemaState]) { + /** - * Primary state-handling and the only state-mutation function. - * + * Primary state-handling and the only state-mutation function. + * * Most of the time `stateCheck` returns `StateCheck.Ok`, meaning that data can be - * inserted without state or DB schema mutation and lock is not acquired, while - * `action` gets executed. - * + * inserted without state or DB schema mutation and lock is not acquired, while + * `action` gets executed. + * * If new schemas are coming through and state and DB schema have to be changed - * it acquires a lock, preventing other threads from mutating data first, then checks - * if state is still outdated (in case other thread acquired the lock first) and - * performs `mutate` and `action`, releasing the lock afterwards - * If another thread already updated the state it just performs `action` - * + * it acquires a lock, preventing other threads from mutating data first, then checks + * if state is still outdated (in case other thread acquired the lock first) and + * performs `mutate` and `action`, releasing the lock afterwards + * If another thread already updated the state it just performs `action` + * * @param stateCheck check if lock has to be acquired - * @param action primary IO - DB insert statement - * @param mutate IO that mutates the internal state and DB schema - */ - def checkAndRun(stateCheck: SchemaState => StateCheck, - action: F[Unit], - mutate: (Set[SchemaKey], Set[SchemaKey]) => F[Unit]) - (implicit F: Bracket[F, Throwable]): F[Unit] = { + * @param action primary IO - DB insert statement + * @param mutate IO that mutates the internal state and DB schema + */ + def checkAndRun(stateCheck: SchemaState => StateCheck, action: F[Unit], mutate: (Set[SchemaKey], Set[SchemaKey]) => F[Unit])(implicit + F: Bracket[F, Throwable] + ): F[Unit] = { // Just insert OR mutate and insert def check(update: (Set[SchemaKey], Set[SchemaKey]) => F[Unit]) = state.get.map(stateCheck).flatMap { @@ -64,7 +64,7 @@ final class State[F[_]](lock: MVar[F, Unit], state: Ref[F, SchemaState]) { update(missingTables, outdatedTables) } - check { (_, _) => withLock(check(mutate)) } *> action + check((_, _) => withLock(check(mutate))) *> action } /** Update [[SchemaState]] with new `SchemaList` */ @@ -83,4 +83,3 @@ object State { state <- SchemaState.init[F](keys, resolver) } yield new State[F](lock, state) } - diff --git a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/api/TableState.scala b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/api/TableState.scala index 65ffe30..ad917cb 100644 --- a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/api/TableState.scala +++ b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/api/TableState.scala @@ -12,7 +12,7 @@ */ package com.snowplowanalytics.snowplow.postgres.api -private[postgres] sealed trait TableState extends Product with Serializable +sealed private[postgres] trait TableState extends Product with Serializable private[postgres] object TableState { case object Match extends TableState case object Outdated extends TableState diff --git a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/config/DBConfig.scala b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/config/DBConfig.scala index 9a7769e..1110f1c 100644 --- 
a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/config/DBConfig.scala +++ b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/config/DBConfig.scala @@ -12,25 +12,36 @@ */ package com.snowplowanalytics.snowplow.postgres.config +import com.zaxxer.hikari.HikariConfig import DBConfig.JdbcUri case class DBConfig(host: String, - port: Int, - database: String, - username: String, - password: String, // TODO: can be EC2 store - sslMode: String, - schema: String) { + port: Int, + database: String, + username: String, + password: String, // TODO: can be EC2 store + sslMode: String, + schema: String +) { def getJdbc: JdbcUri = JdbcUri(host, port, database, sslMode.toLowerCase().replace('_', '-')) } object DBConfig { - case class JdbcUri(host: String, port: Int, database: String, sslMode: String) { override def toString = s"jdbc:postgresql://$host:$port/$database?sslmode=$sslMode" } + def hikariConfig(dbConfig: DBConfig) = { + val config = new HikariConfig() + config.setDriverClassName("org.postgresql.Driver") + config.setJdbcUrl(dbConfig.getJdbc.toString) + config.setUsername(dbConfig.username) + config.setPassword(dbConfig.password) + // TODO: DBConfig could take a MaxConnections field, and set `config.setMaximumPoolSize`. + config + } + } diff --git a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/package.scala b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/package.scala index 147d919..9cfafe2 100644 --- a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/package.scala +++ b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/package.scala @@ -16,8 +16,8 @@ import cats.Eq import com.snowplowanalytics.iglu.core.SchemaKey -import com.snowplowanalytics.iglu.schemaddl.jsonschema.{Pointer, JsonSchemaProperty} -import com.snowplowanalytics.iglu.schemaddl.jsonschema.Pointer.{SchemaProperty, Cursor} +import com.snowplowanalytics.iglu.schemaddl.jsonschema.{JsonSchemaProperty, Pointer} +import com.snowplowanalytics.iglu.schemaddl.jsonschema.Pointer.{Cursor, SchemaProperty} import com.snowplowanalytics.iglu.schemaddl.jsonschema.properties.CommonProperties.Type package object loader { diff --git a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/resources.scala b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/resources.scala index c90759f..ad349ca 100644 --- a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/resources.scala +++ b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/resources.scala @@ -14,8 +14,9 @@ package com.snowplowanalytics.snowplow.postgres import cats.implicits._ -import cats.effect.{ContextShift, Async, Blocker, Clock, Resource, Concurrent, Sync} +import cats.effect.{Async, Blocker, Clock, Concurrent, ContextShift, Resource, Sync} +import com.zaxxer.hikari.HikariConfig import doobie.hikari._ import doobie.implicits._ import doobie.util.ExecutionContexts @@ -32,24 +33,21 @@ import com.snowplowanalytics.snowplow.postgres.config.DBConfig.JdbcUri object resources { - val FixedThreadPoolSize: Int = 32 - /** Initialise Blocking Thread Pool, Connection Pool, DB state and bad queue resources */ - def initialize[F[_]: Concurrent: Clock: ContextShift](postgres: DBConfig, - logger: LogHandler, - iglu: Client[F, Json]) = + def initialize[F[_]: Concurrent: Clock: ContextShift](postgres: DBConfig, logger: LogHandler, iglu: Client[F, Json]) = for { blocker <- Blocker[F] - xa <- 
resources.getTransactor[F](postgres.getJdbc, postgres.username, postgres.password, blocker) - state <- Resource.liftF(initializeState(postgres, logger, iglu, xa)) + xa <- resources.getTransactor[F](DBConfig.hikariConfig(postgres), blocker) + state <- Resource.liftF(initializeState(postgres.schema, logger, iglu, xa)) } yield (blocker, xa, state) - def initializeState[F[_]: Concurrent: Clock](postgres: DBConfig, - logger: LogHandler, - iglu: Client[F, Json], - xa: Transactor[F]): F[State[F]] = { + def initializeState[F[_]: Concurrent: Clock](schema: String, + logger: LogHandler, + iglu: Client[F, Json], + xa: HikariTransactor[F] + ): F[State[F]] = for { - ci <- storage.query.getComments(postgres.schema, logger).transact(xa).map(_.separate) + ci <- storage.query.getComments(schema, logger).transact(xa).map(_.separate) (issues, comments) = ci _ <- issues.traverse_(issue => Sync[F].delay(println(issue))) initState = State.init[F](comments, iglu.resolver).value.flatMap { @@ -61,18 +59,26 @@ object resources { } state <- initState } yield state - } /** Get a HikariCP transactor */ - def getTransactor[F[_]: Async: ContextShift](jdbcUri: JdbcUri, user: String, password: String, be: Blocker): Resource[F, HikariTransactor[F]] = + def getTransactor[F[_]: Async: ContextShift](config: HikariConfig, be: Blocker): Resource[F, HikariTransactor[F]] = { + val threadPoolSize = + // This could be made configurable, but these are sensible defaults and unlikely to be critical for tuning throughput. + // Exceeding availableProcessors could lead to unnecessary context switching. + // Exceeding the connection pool size is unnecessary, because that is limit of the app's parallelism. + Math.min(config.getMaximumPoolSize, Runtime.getRuntime.availableProcessors) for { - ce <- ExecutionContexts.fixedThreadPool[F](FixedThreadPoolSize) - xa <- HikariTransactor.newHikariTransactor[F]("org.postgresql.Driver", jdbcUri.toString, user, password, ce, be) + ce <- ExecutionContexts.fixedThreadPool[F](threadPoolSize) + xa <- HikariTransactor.fromHikariConfig[F](config, ce, be) } yield xa + } /** Get default single-threaded transactor (use for tests only) */ def getTransactorDefault[F[_]: Async: ContextShift](jdbcUri: JdbcUri, username: String, password: String): Transactor[F] = Transactor.fromDriverManager[F]( - "org.postgresql.Driver", jdbcUri.toString, username, password + "org.postgresql.Driver", + jdbcUri.toString, + username, + password ) } diff --git a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/shredding/Entity.scala b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/shredding/Entity.scala index d7d1675..5bca15b 100644 --- a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/shredding/Entity.scala +++ b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/shredding/Entity.scala @@ -20,12 +20,13 @@ import Entity.Column case class Entity(tableName: String, origin: SchemaKey, columns: List[Column]) object Entity { + /** - * Table cell with value and meta info - * @param name Postgres column name - * @param dataType Postgres data type - * @param value ready-to-be-inserted value - */ + * Table cell with value and meta info + * @param name Postgres column name + * @param dataType Postgres data type + * @param value ready-to-be-inserted value + */ case class Column(name: String, dataType: Type, value: Value) } diff --git a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/shredding/Type.scala 
b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/shredding/Type.scala index 48772ea..0f6afd7 100644 --- a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/shredding/Type.scala +++ b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/shredding/Type.scala @@ -18,24 +18,25 @@ import io.circe.Json import com.snowplowanalytics.iglu.schemaddl.jsonschema.Schema import com.snowplowanalytics.iglu.schemaddl.jsonschema.properties.CommonProperties.{Type => SType} -import com.snowplowanalytics.iglu.schemaddl.jsonschema.properties.NumberProperty.{MultipleOf, Maximum} +import com.snowplowanalytics.iglu.schemaddl.jsonschema.properties.NumberProperty.{Maximum, MultipleOf} import com.snowplowanalytics.iglu.schemaddl.jsonschema.properties.StringProperty.{Format, MaxLength, MinLength} import com.snowplowanalytics.snowplow.postgres.loader._ sealed trait Type { - def ddl: String = this match { - case Type.Char(size) => s"CHAR($size)" - case Type.Varchar(size) => s"VARCHAR($size)" - case Type.Uuid => "UUID" - case Type.Timestamp => "TIMESTAMP" - case Type.Date => "DATE" - case Type.Integer => "INTEGER" - case Type.BigInt => "BIGINT" - case Type.Double => "DOUBLE PRECISION" - case Type.Bool => "BOOLEAN" - case Type.Jsonb => "JSONB" - } + def ddl: String = + this match { + case Type.Char(size) => s"CHAR($size)" + case Type.Varchar(size) => s"VARCHAR($size)" + case Type.Uuid => "UUID" + case Type.Timestamp => "TIMESTAMP" + case Type.Date => "DATE" + case Type.Integer => "INTEGER" + case Type.BigInt => "BIGINT" + case Type.Double => "DOUBLE PRECISION" + case Type.Bool => "BOOLEAN" + case Type.Jsonb => "JSONB" + } } object Type { @@ -54,16 +55,14 @@ object Type { type DataTypeSuggestion = (Schema, String) => Option[Type] /** Derive a Postgres type, given JSON Schema */ - def getDataType(properties: Schema, - varcharSize: Int, - columnName: String, - suggestions: List[DataTypeSuggestion]): Type = + def getDataType(properties: Schema, varcharSize: Int, columnName: String, suggestions: List[DataTypeSuggestion]): Type = suggestions match { case Nil => Type.Varchar(4096) // Generic - case suggestion :: tail => suggestion(properties, columnName) match { - case Some(format) => format - case None => getDataType(properties, varcharSize, columnName, tail) - } + case suggestion :: tail => + suggestion(properties, columnName) match { + case Some(format) => format + case None => getDataType(properties, varcharSize, columnName, tail) + } } // For complex enums Suggest VARCHAR with length of longest element @@ -140,8 +139,7 @@ object Type { val charSuggestion: DataTypeSuggestion = (properties, _) => { (properties.`type`, properties.minLength, properties.maxLength) match { - case (Some(types), Some(MinLength(min)), Some(MaxLength(max))) - if min === max && types.possiblyWithNull(SType.String) => + case (Some(types), Some(MinLength(min)), Some(MaxLength(max))) if min === max && types.possiblyWithNull(SType.String) => Some(Type.Char(min.toInt)) case _ => None } @@ -150,7 +148,7 @@ object Type { val booleanSuggestion: DataTypeSuggestion = (properties, _) => { properties.`type` match { case Some(types) if types.possiblyWithNull(SType.Boolean) => Some(Type.Bool) - case _ => None + case _ => None } } @@ -163,10 +161,10 @@ object Type { } val varcharSuggestion: DataTypeSuggestion = (properties, _) => { - (properties.`type`, properties.maxLength, properties.enum, properties.format) match { - case (Some(types), Some(maxLength), _, _) if types.possiblyWithNull(SType.String) => + 
(properties.`type`, properties.maxLength, properties.enum, properties.format) match { + case (Some(types), Some(maxLength), _, _) if types.possiblyWithNull(SType.String) => Some(Type.Varchar(maxLength.value.toInt)) - case (_, _, Some(enum), _) => + case (_, _, Some(enum), _) => enum.value.map(jsonLength).maximumOption match { case Some(maxLength) if enum.value.lengthCompare(1) === 0 => Some(Type.Varchar(maxLength)) @@ -178,7 +176,6 @@ object Type { } } - val dataTypeSuggestions: List[DataTypeSuggestion] = List( complexEnumSuggestion, productSuggestion, @@ -197,18 +194,18 @@ object Type { json.fold(0, b => b.toString.length, _ => json.noSpaces.length, _.length, _ => json.noSpaces.length, _ => json.noSpaces.length) /** - * Get set of types or enum as string excluding null - * + * Get set of types or enum as string excluding null + * * @param types comma-separated types - * @return set of strings - */ + * @return set of strings + */ private def excludeNull(types: List[Json]): List[Json] = types.filterNot(_.isNull) /** - * Check enum contains some different types - * (string and number or number and boolean) - */ + * Check enum contains some different types + * (string and number or number and boolean) + */ private def isComplexEnum(enum: List[Json]) = { // Predicates def isNumeric(s: Json) = s.isNumber @@ -222,16 +219,16 @@ object Type { def isBigInt(long: Maximum): Boolean = long match { case Maximum.IntegerMaximum(bigInt) => bigInt > 2147483647L - case _ => false + case _ => false } /** - * Check at least some `quantity` of `predicates` are true on `instances` - * + * Check at least some `quantity` of `predicates` are true on `instances` + * * @param instances list of instances to check on - * @param predicates list of predicates to check - * @param quantity required quantity - */ + * @param predicates list of predicates to check + * @param quantity required quantity + */ private def somePredicates(instances: List[Json], predicates: List[Json => Boolean], quantity: Int): Boolean = if (quantity === 0) true else @@ -241,4 +238,3 @@ object Type { case _ :: tail => somePredicates(instances, tail, quantity) } } - diff --git a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/shredding/Value.scala b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/shredding/Value.scala index ebb6a67..50c7962 100644 --- a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/shredding/Value.scala +++ b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/shredding/Value.scala @@ -25,17 +25,18 @@ import doobie.postgres.circe.jsonb.implicits._ import doobie.util.fragment.Fragment sealed trait Value { - def fragment: Fragment = this match { - case Value.Uuid(value) => fr"$value" - case Value.Char(value) => fr"$value" - case Value.Varchar(value) => fr"$value" - case Value.Timestamp(value) => fr"$value" - case Value.Integer(value) => fr"$value" - case Value.BigInt(value) => fr"$value" - case Value.Double(value) => fr"$value" - case Value.Bool(value) => fr"$value" - case Value.Jsonb(value) => fr"$value" - } + def fragment: Fragment = + this match { + case Value.Uuid(value) => fr"$value" + case Value.Char(value) => fr"$value" + case Value.Varchar(value) => fr"$value" + case Value.Timestamp(value) => fr"$value" + case Value.Integer(value) => fr"$value" + case Value.BigInt(value) => fr"$value" + case Value.Double(value) => fr"$value" + case Value.Bool(value) => fr"$value" + case Value.Jsonb(value) => fr"$value" + } } object Value { @@ -53,4 +54,3 @@ 
object Value { def apply(instant: Instant): Timestamp = Timestamp(JTimestamp.from(instant)) } } - diff --git a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/shredding/schema.scala b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/shredding/schema.scala index 6e63bdf..601dfae 100644 --- a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/shredding/schema.scala +++ b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/shredding/schema.scala @@ -21,12 +21,11 @@ import io.circe.Json import com.snowplowanalytics.iglu.client.{ClientError, Resolver} import com.snowplowanalytics.iglu.client.resolver.registries.RegistryLookup -import com.snowplowanalytics.iglu.core.{SchemaList, SchemaCriterion, SelfDescribingSchema, SchemaKey, SchemaMap} - +import com.snowplowanalytics.iglu.core.{SchemaCriterion, SchemaKey, SchemaList, SchemaMap, SelfDescribingSchema} import com.snowplowanalytics.iglu.schemaddl.{IgluSchema, Properties} import com.snowplowanalytics.iglu.schemaddl.jsonschema.Schema -import com.snowplowanalytics.iglu.schemaddl.jsonschema.properties.CommonProperties.{ Type => SType } +import com.snowplowanalytics.iglu.schemaddl.jsonschema.properties.CommonProperties.{Type => SType} import com.snowplowanalytics.iglu.schemaddl.jsonschema.circe.implicits._ import com.snowplowanalytics.iglu.schemaddl.migrations.{FlatSchema, SchemaList => DdlSchemaList} @@ -35,36 +34,44 @@ import com.snowplowanalytics.snowplow.badrows.FailureDetails /** Generic schema functionality, related to JSON schema (Iglu) transformations */ object schema { - def fetch[F[_]: Monad: RegistryLookup: Clock](resolver: Resolver[F]) - (key: SchemaKey): EitherT[F, FailureDetails.LoaderIgluError, IgluSchema] = + def fetch[F[_]: Monad: RegistryLookup: Clock]( + resolver: Resolver[F] + )(key: SchemaKey): EitherT[F, FailureDetails.LoaderIgluError, IgluSchema] = for { - json <- EitherT(resolver.lookupSchema(key)).leftMap(error => FailureDetails.LoaderIgluError.IgluError(key, error): FailureDetails.LoaderIgluError) + json <- EitherT(resolver.lookupSchema(key)).leftMap(error => + FailureDetails.LoaderIgluError.IgluError(key, error): FailureDetails.LoaderIgluError + ) schema <- EitherT.fromEither[F](Schema.parse(json).toRight(buildFailure(json, key))) } yield SelfDescribingSchema(SchemaMap(key), schema) def buildFailure(json: Json, key: SchemaKey): FailureDetails.LoaderIgluError = - FailureDetails.LoaderIgluError.InvalidSchema(key, s"JSON ${json.noSpaces} cannot be parsed as JSON Schema"): FailureDetails.LoaderIgluError - + FailureDetails + .LoaderIgluError + .InvalidSchema(key, s"JSON ${json.noSpaces} cannot be parsed as JSON Schema"): FailureDetails.LoaderIgluError - def getSchemaList[F[_]: Monad: RegistryLookup: Clock](resolver: Resolver[F]) - (vendor: String, name: String, model: Int): EitherT[F, FailureDetails.LoaderIgluError, DdlSchemaList] = { + def getSchemaList[F[_]: Monad: RegistryLookup: Clock]( + resolver: Resolver[F] + )(vendor: String, name: String, model: Int): EitherT[F, FailureDetails.LoaderIgluError, DdlSchemaList] = { val criterion = SchemaCriterion(vendor, name, "jsonschema", Some(model), None, None) val schemaList = resolver.listSchemas(vendor, name, model) for { - schemaList <- EitherT[F, ClientError.ResolutionError, SchemaList](schemaList).leftMap(error => FailureDetails.LoaderIgluError.SchemaListNotFound(criterion, error)) + schemaList <- EitherT[F, ClientError.ResolutionError, SchemaList](schemaList).leftMap(error => + 
FailureDetails.LoaderIgluError.SchemaListNotFound(criterion, error) + ) ordered <- DdlSchemaList.fromSchemaList(schemaList, fetch(resolver)) } yield ordered } - def getOrdered[F[_]: Monad: RegistryLookup: Clock](resolver: Resolver[F]) - (vendor: String, name: String, model: Int): EitherT[F, FailureDetails.LoaderIgluError, Properties] = + def getOrdered[F[_]: Monad: RegistryLookup: Clock]( + resolver: Resolver[F] + )(vendor: String, name: String, model: Int): EitherT[F, FailureDetails.LoaderIgluError, Properties] = getSchemaList[F](resolver)(vendor, name, model).map(FlatSchema.extractProperties) def canBeNull(schema: Schema): Boolean = schema.enum.exists(_.value.exists(_.isNull)) || (schema.`type` match { case Some(SType.Union(types)) => types.contains(SType.Null) - case Some(t) => t == SType.Null - case None => false + case Some(t) => t == SType.Null + case None => false }) } diff --git a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/shredding/transform.scala b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/shredding/transform.scala index a7f1062..b09c71e 100644 --- a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/shredding/transform.scala +++ b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/shredding/transform.scala @@ -17,12 +17,12 @@ import java.time.format.DateTimeParseException import java.util.UUID import java.sql.Timestamp -import cats.data.{EitherT, EitherNel, NonEmptyList} +import cats.data.{EitherNel, EitherT, NonEmptyList} import cats.implicits._ -import cats.effect.{Sync, Clock} +import cats.effect.{Clock, Sync} -import io.circe.{JsonNumber, Json, ACursor} +import io.circe.{ACursor, Json, JsonNumber} import com.snowplowanalytics.iglu.core._ @@ -34,24 +34,21 @@ import com.snowplowanalytics.iglu.schemaddl.jsonschema.{Pointer, Schema} import com.snowplowanalytics.iglu.schemaddl.migrations.FlatSchema import com.snowplowanalytics.snowplow.analytics.scalasdk.Event -import com.snowplowanalytics.snowplow.badrows.{FailureDetails, BadRow, Failure, Payload, Processor} +import com.snowplowanalytics.snowplow.badrows.{BadRow, Failure, FailureDetails, Payload, Processor} import Entity.Column import Shredded.{ShreddedSelfDescribing, ShreddedSnowplow} object transform { - val Atomic = SchemaKey("com.snowplowanalytics.snowplow", "atomic", "jsonschema", SchemaVer.Full(1,0,0)) + val Atomic = SchemaKey("com.snowplowanalytics.snowplow", "atomic", "jsonschema", SchemaVer.Full(1, 0, 0)) /** Transform the whole `Event` (canonical and JSONs) into list of independent entities ready to be inserted */ def shredEvent[F[_]: Sync: Clock](client: Client[F, Json], processor: Processor, event: Event): EitherT[F, BadRow, ShreddedSnowplow] = { val entities = event.contexts.data ++ event.derived_contexts.data ++ event.unstruct_event.data.toList - val wholeEvent = entities - .parTraverse(shredJson(client)) - .value - .map { shreddedOrError => - (shreddedOrError, shredAtomic(Map())(event)).mapN { - (shreddedEntities, atomic) => ShreddedSnowplow(atomic, shreddedEntities.map(_.entity).map(addMetadata(event.event_id, event.collector_tstamp))) - } + val wholeEvent = entities.parTraverse(shredJson(client)).value.map { shreddedOrError => + (shreddedOrError, shredAtomic(Map())(event)).mapN { (shreddedEntities, atomic) => + ShreddedSnowplow(atomic, shreddedEntities.map(_.entity).map(addMetadata(event.event_id, event.collector_tstamp))) } + } EitherT(wholeEvent).leftMap[BadRow](buildBadRow(processor, event)) } @@ -61,54 +58,57 @@ 
object transform { Column("schema_name", Type.Varchar(128), Value.Varchar(entity.origin.name)), Column("schema_format", Type.Varchar(16), Value.Varchar(entity.origin.format)), Column("schema_version", Type.Varchar(8), Value.Varchar(entity.origin.version.asString)), - Column("root_id", Type.Uuid, Value.Uuid(eventId)), - Column("root_tstamp", Type.Timestamp, Value.Timestamp(tstamp)), + Column("root_id", Type.Uuid, Value.Uuid(eventId)), + Column("root_tstamp", Type.Timestamp, Value.Timestamp(tstamp)) ) entity.copy(columns = metaColumns ++ entity.columns) } /** Remove all properties which are roots for other properties, - * Otherwise table will have structure of [nested, nested.a, nested.b], - * where we need just [nested.a, nested.b] - */ + * Otherwise table will have structure of [nested, nested.a, nested.b], + * where we need just [nested.a, nested.b] + */ def removeRoots(props: Properties): Properties = { val pointers = props.map(_._1).toSet - props.filterNot { case (pointer, _) => - pointer.value.isEmpty || { - val problem = pointers.exists { p => pointer.isParentOf(p) && p != pointer } - problem - } + props.filterNot { + case (pointer, _) => + pointer.value.isEmpty || { + val problem = pointers.exists(p => pointer.isParentOf(p) && p != pointer) + problem + } } } /** Transform JSON into [[Entity]] */ - def shredJson[F[_]: Sync: Clock](client: Client[F, Json]) - (data: SelfDescribingData[Json]): EitherT[F, NonEmptyList[FailureDetails.LoaderIgluError], ShreddedSelfDescribing] = { + def shredJson[F[_]: Sync: Clock]( + client: Client[F, Json] + )(data: SelfDescribingData[Json]): EitherT[F, NonEmptyList[FailureDetails.LoaderIgluError], ShreddedSelfDescribing] = { val key = data.schema - schema.getOrdered(client.resolver)(key.vendor, key.name, key.version.model) - .leftMap { error => NonEmptyList.of(error) } - .subflatMap { properties => - val shredded = getNameTypeVal(properties)(data.data) - .parTraverse { case (columnName, pgType, value) => + schema.getOrdered(client.resolver)(key.vendor, key.name, key.version.model).leftMap(error => NonEmptyList.of(error)).subflatMap { + properties => + val shredded = getNameTypeVal(properties)(data.data).parTraverse { + case (columnName, pgType, value) => cast(value, pgType).toEitherNel.map { value => - value.map { v => Entity.Column(columnName, pgType, v) } + value.map(v => Entity.Column(columnName, pgType, v)) } - } + } shredded - .leftMap { errors => errors.map { error => - FailureDetails.LoaderIgluError.WrongType(data.schema, Json.Null, error) // TODO - } } + .leftMap { errors => + errors.map { error => + FailureDetails.LoaderIgluError.WrongType(data.schema, Json.Null, error) // TODO + } + } .map { cols => val columns = cols.collect { case Some(c) => c } val tableName = data.schema match { case Atomic => "events" - case other => StringUtils.getTableName(SchemaMap(other)) + case other => StringUtils.getTableName(SchemaMap(other)) } ShreddedSelfDescribing(Entity(tableName, data.schema, columns)) } - } + } } /** Transform only canonical part of `Event` (128 non-JSON fields) into `ShreddedEntity` */ @@ -116,13 +116,14 @@ object transform { def tranformDate(col: String)(s: String): Either[FailureDetails.LoaderIgluError, Entity.Column] = Either .catchOnly[DateTimeParseException](Instant.parse(s)) - .map { parsed => Entity.Column(col, Type.Timestamp, Value.Timestamp(parsed)) } - .leftMap { _ => FailureDetails.LoaderIgluError.WrongType(Atomic, Json.fromString(s), "date-time") } + .map(parsed => Entity.Column(col, Type.Timestamp, Value.Timestamp(parsed))) + 
.leftMap(_ => FailureDetails.LoaderIgluError.WrongType(Atomic, Json.fromString(s), "date-time")) def transformUuid(col: String)(s: String): Either[FailureDetails.LoaderIgluError, Entity.Column] = - Either.catchOnly[IllegalArgumentException](UUID.fromString(s)) - .map { parsed => Entity.Column(col, Type.Uuid, Value.Uuid(parsed)) } - .leftMap { _ => FailureDetails.LoaderIgluError.WrongType(Atomic, Json.fromString(s), "uuid") } + Either + .catchOnly[IllegalArgumentException](UUID.fromString(s)) + .map(parsed => Entity.Column(col, Type.Uuid, Value.Uuid(parsed))) + .leftMap(_ => FailureDetails.LoaderIgluError.WrongType(Atomic, Json.fromString(s), "uuid")) def transformBool(col: String)(b: Boolean): Entity.Column = if (b) Entity.Column(col, Type.Bool, Value.Bool(true)) @@ -139,7 +140,7 @@ object transform { def transformNumber(col: String)(num: JsonNumber): Entity.Column = num.toInt match { case Some(int) => Entity.Column(col, Type.Integer, Value.Integer(int)) - case None => Entity.Column(col, Type.Double, Value.Double(num.toDouble)) + case None => Entity.Column(col, Type.Double, Value.Double(num.toDouble)) } def castError(expected: String)(value: Json) = @@ -179,7 +180,7 @@ object transform { ) case (_, None) => none.asRight.toEitherNel } - data.map(_.unite).map { columns => Entity("events", Atomic, columns) } + data.map(_.unite).map(columns => Entity("events", Atomic, columns)) } def cast(json: Option[Json], dataType: Type): Either[String, Option[Value]] = { @@ -190,39 +191,39 @@ object transform { case Type.Uuid => j.asString match { case Some(s) => Value.Uuid(UUID.fromString(s)).some.asRight // TODO - case None => error + case None => error } case Type.Varchar(_) => val result = j.asString match { case Some(s) => s - case None => j.noSpaces + case None => j.noSpaces } Value.Varchar(result).some.asRight[String] case Type.Bool => j.asBoolean match { case Some(b) => Value.Bool(b).some.asRight - case None => error + case None => error } case Type.Char(len) => j.asString match { case Some(s) if s.length === len => Value.Char(s).some.asRight - case Some(_) => error - case None => error + case Some(_) => error + case None => error } case Type.Integer => j.asNumber.flatMap(_.toInt) match { case Some(int) => Value.Integer(int).some.asRight - case None => error + case None => error } case Type.BigInt => j.asNumber.flatMap(_.toLong) match { case Some(long) => Value.BigInt(long).some.asRight - case None => error + case None => error } case Type.Double => j.asNumber.map(_.toDouble) match { case Some(int) => Value.Double(int).some.asRight - case None => error + case None => error } case Type.Jsonb => Value.Jsonb(j).some.asRight @@ -258,28 +259,30 @@ object transform { } /** - * Transform Schema properties into information that can be transformed into DDL columns - * It's very important to implement it and [[getNameTypeVal]] using same logic as - * former is an implementation for DDL, while latter is implementation for data shredding - * @return list of JSON Pointer, column name, inferred DB type, nullability - */ + * Transform Schema properties into information that can be transformed into DDL columns + * It's very important to implement it and [[getNameTypeVal]] using same logic as + * former is an implementation for DDL, while latter is implementation for data shredding + * @return list of JSON Pointer, column name, inferred DB type, nullability + */ def getNameType(properties: Properties): List[(SchemaPointer, String, Type, Boolean)] = - removeRoots(properties).map { case (pointer, s: Schema) => - val 
columnName: String = FlatSchema.getName(pointer) - val pgType = Type.getDataType(s, 4096, columnName, Type.dataTypeSuggestions) - (pointer, columnName, pgType, schema.canBeNull(s)) + removeRoots(properties).map { + case (pointer, s: Schema) => + val columnName: String = FlatSchema.getName(pointer) + val pgType = Type.getDataType(s, 4096, columnName, Type.dataTypeSuggestions) + (pointer, columnName, pgType, schema.canBeNull(s)) } /** - * Extract JSON Paths from an actual JSON data - * It's very important to implement [[getNameType]] and this function using same logic as - * former is an implementation for DDL, while latter is implementation for data shredding - * @return list column name, inferred DB type, value - */ + * Extract JSON Paths from an actual JSON data + * It's very important to implement [[getNameType]] and this function using same logic as + * former is an implementation for DDL, while latter is implementation for data shredding + * @return list column name, inferred DB type, value + */ def getNameTypeVal(properties: Properties)(data: Json) = - getNameType(properties).map { case (pointer, columnName, dataType, _) => - val value = getPath(pointer.forData, data) - (columnName, dataType, value) + getNameType(properties).map { + case (pointer, columnName, dataType, _) => + val value = getPath(pointer.forData, data) + (columnName, dataType, value) } private def buildBadRow(processor: Processor, event: Event)(errors: NonEmptyList[FailureDetails.LoaderIgluError]) = diff --git a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/storage/CommentIssue.scala b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/storage/CommentIssue.scala index 001f1d5..1705d87 100644 --- a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/storage/CommentIssue.scala +++ b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/storage/CommentIssue.scala @@ -20,8 +20,10 @@ import com.snowplowanalytics.iglu.core.ParseError sealed trait CommentIssue extends Product with Serializable object CommentIssue { + /** Table missing a comment */ case class Missing(table: String) extends CommentIssue + /** Comment is not an Iglu URI */ case class Invalid(table: String, comment: String, error: ParseError) extends CommentIssue @@ -34,4 +36,3 @@ object CommentIssue { implicit val commentIssueEq: Eq[CommentIssue] = Eq.fromUniversalEquals[CommentIssue] } - diff --git a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/storage/ddl.scala b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/storage/ddl.scala index fb3ebda..66ee18e 100644 --- a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/storage/ddl.scala +++ b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/storage/ddl.scala @@ -15,13 +15,13 @@ package com.snowplowanalytics.snowplow.postgres.storage import cats.data.EitherT import cats.implicits._ -import cats.effect.{Sync, Clock} +import cats.effect.{Clock, Sync} import doobie.{ConnectionIO, LogHandler} import doobie.implicits._ import doobie.util.fragment.Fragment -import com.snowplowanalytics.iglu.core.{SchemaKey, SchemaCriterion, SchemaMap} +import com.snowplowanalytics.iglu.core.{SchemaCriterion, SchemaKey, SchemaMap} import com.snowplowanalytics.iglu.client.Resolver @@ -34,7 +34,6 @@ import com.snowplowanalytics.snowplow.postgres.shredding.transform.Atomic import com.snowplowanalytics.snowplow.postgres.streaming.IgluErrors import 
com.snowplowanalytics.snowplow.postgres.streaming.sink.Insert - object ddl { /** Function that can produce DDL, based on `DdlSchemaList` */ @@ -44,14 +43,19 @@ object ddl { logger: LogHandler, schema: String, entity: SchemaKey, - meta: Boolean): EitherT[F, IgluErrors, Insert] = { + meta: Boolean + ): EitherT[F, IgluErrors, Insert] = { val generator: Generator = schemaList => sql.createTable(schema, entity, schemaList, meta) manage(resolver, logger, schema, entity, generator) } // TODO: tables need to be updated in transaction, because situation where one node tries to mutate it after its state // been update are completely predictable - def alterTable[F[_]: Sync: Clock](resolver: Resolver[F], logger: LogHandler, schema: String, entity: SchemaKey): EitherT[F, IgluErrors, Insert] = { + def alterTable[F[_]: Sync: Clock](resolver: Resolver[F], + logger: LogHandler, + schema: String, + entity: SchemaKey + ): EitherT[F, IgluErrors, Insert] = { val generator: Generator = schemaList => sql.migrateTable(schema, entity, schemaList) manage(resolver, logger, schema, entity, generator) } @@ -60,28 +64,29 @@ object ddl { definitions.atomicSql(schema).update(logger).run.void /** - * Perform some DB management: create or mutate the table according to current - * schema state (where state is all known versions on the iglu registry) - * First, check the current state of the schema on registry and validate it, - * Then, create an actual update action using `generator` and comment on table - * with latest schema from schema list retrieved from the registry - * At last, update internal mutable state. - * + * Perform some DB management: create or mutate the table according to current + * schema state (where state is all known versions on the iglu registry) + * First, check the current state of the schema on registry and validate it, + * Then, create an actual update action using `generator` and comment on table + * with latest schema from schema list retrieved from the registry + * At last, update internal mutable state. 
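[Editorial aside, not part of the patch: the comment being reformatted here describes a purely declarative flow, so a minimal, hypothetical sketch of driving it via the `alterTable` helper defined above could look like the following. The `Resolver` value, the registry contents and the schema key are assumptions; nothing touches Postgres until the returned `Insert` is transacted elsewhere.]

    import cats.effect.{Clock, IO}
    import doobie.LogHandler
    import com.snowplowanalytics.iglu.client.resolver.Resolver
    import com.snowplowanalytics.iglu.core.{SchemaKey, SchemaVer}
    import com.snowplowanalytics.snowplow.postgres.storage.ddl

    implicit val clock: Clock[IO] = Clock.create[IO]   // Sync[IO] is available out of the box

    val resolver: Resolver[IO] = ???                   // assumed: a resolver pointing at an Iglu Server
    val key = SchemaKey("com.acme", "event", "jsonschema", SchemaVer.Full(1, 0, 1))

    // Builds the ALTER TABLE plus COMMENT ON TABLE action for com_acme_event_1,
    // or an Iglu error if the schema list cannot be fetched and validated.
    val migration = ddl.alterTable[IO](resolver, LogHandler.nop, "atomic", key)
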
+ * * Note that it doesn't actually perform a DB action (no `Transactor`) - * + * * @param resolver Iglu Resolver tied to Iglu Server (it needs schema list endpoint) - * @param logger doobie logger - * @param schema database schema - * @param entity an actual shredded entity that we manage tables for - * @param generator a function generating SQL from `DdlSchemaList` - * @return an action that is either failure because of Iglu subsystem - * or doobie IO - */ + * @param logger doobie logger + * @param schema database schema + * @param entity an actual shredded entity that we manage tables for + * @param generator a function generating SQL from `DdlSchemaList` + * @return an action that is either failure because of Iglu subsystem + * or doobie IO + */ def manage[F[_]: Sync: Clock](resolver: Resolver[F], logger: LogHandler, schema: String, origin: SchemaKey, - generator: Generator): EitherT[F, IgluErrors, Insert] = { + generator: Generator + ): EitherT[F, IgluErrors, Insert] = { val group = (origin.vendor, origin.name, origin.version.model) val criterion = SchemaCriterion(origin.vendor, origin.name, "jsonschema", origin.version.model) val (vendor, name, model) = group @@ -90,21 +95,18 @@ object ddl { .leftMap(error => IgluErrors.of(FailureDetails.LoaderIgluError.SchemaListNotFound(criterion, error))) .flatMap { list => - DdlSchemaList - .fromSchemaList(list, fetch[F](resolver)) - .leftMap(IgluErrors.of) - .map { list => - val statement = generator(list) - val tableName = getTableName(origin) - statement.update(logger).run.void *> - sql.commentTable(logger, schema, tableName, list.latest) - } + DdlSchemaList.fromSchemaList(list, fetch[F](resolver)).leftMap(IgluErrors.of).map { list => + val statement = generator(list) + val tableName = getTableName(origin) + statement.update(logger).run.void *> + sql.commentTable(logger, schema, tableName, list.latest) + } } } def getTableName(schemaKey: SchemaKey): String = schemaKey match { case Atomic => "events" - case other => StringUtils.getTableName(SchemaMap(other)) + case other => StringUtils.getTableName(SchemaMap(other)) } } diff --git a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/storage/definitions.scala b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/storage/definitions.scala index 8aacdfd..0783dd7 100644 --- a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/storage/definitions.scala +++ b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/storage/definitions.scala @@ -20,185 +20,187 @@ import com.snowplowanalytics.snowplow.postgres.shredding.Type object definitions { /** Columns prepended to every shredded type table */ - val metaColumns: List[(String, Type, Boolean)] = List( + val metaColumns: List[(String, Type, Boolean)] = List( ("schema_vendor", Type.Varchar(128), true), ("schema_name", Type.Varchar(128), true), ("schema_format", Type.Varchar(128), true), ("schema_version", Type.Varchar(128), true), - ("root_id", Type.Uuid, true), - ("root_tstamp", Type.Timestamp, true), + ("root_id", Type.Uuid, true), + ("root_tstamp", Type.Timestamp, true) ) val atomicColumns: List[(String, Type, Boolean)] = List( // App - ("app_id", Type.Varchar(255), false), - ("platform", Type.Varchar(255), false), + ("app_id", Type.Varchar(255), false), + ("platform", Type.Varchar(255), false), // Date/time - ("etl_tstamp", Type.Timestamp, false), - ("collector_tstamp", Type.Timestamp, true), - ("dvce_created_tstamp", Type.Timestamp, false), + ("etl_tstamp", Type.Timestamp, 
false), + ("collector_tstamp", Type.Timestamp, true), + ("dvce_created_tstamp", Type.Timestamp, false), // Date/time - ("event", Type.Varchar(128), false), - ("event_id", Type.Uuid, true), - ("txn_id", Type.Integer, false), + ("event", Type.Varchar(128), false), + ("event_id", Type.Uuid, true), + ("txn_id", Type.Integer, false), // Versioning - ("name_tracker", Type.Varchar(128), false), - ("v_tracker", Type.Varchar(100), false), - ("v_collector", Type.Varchar(100), true), - ("v_etl", Type.Varchar(100), true), + ("name_tracker", Type.Varchar(128), false), + ("v_tracker", Type.Varchar(100), false), + ("v_collector", Type.Varchar(100), true), + ("v_etl", Type.Varchar(100), true), // User and visit - ("user_id", Type.Varchar(255), false), - ("user_ipaddress", Type.Varchar(45), false), - ("user_fingerprint", Type.Varchar(50), false), - ("domain_userid", Type.Varchar(36), false), - ("domain_sessionidx", Type.Integer, false), - ("network_userid", Type.Varchar(38), false), + ("user_id", Type.Varchar(255), false), + ("user_ipaddress", Type.Varchar(45), false), + ("user_fingerprint", Type.Varchar(50), false), + ("domain_userid", Type.Varchar(36), false), + ("domain_sessionidx", Type.Integer, false), + ("network_userid", Type.Varchar(38), false), // Location - ("geo_country", Type.Char(2), false), - ("geo_region", Type.Char(3), false), - ("geo_city", Type.Varchar(75), false), - ("geo_zipcode", Type.Varchar(15), false), - ("geo_latitude", Type.Double, false), - ("geo_longitude", Type.Double, false), - ("geo_region_name", Type.Varchar(100), false), + ("geo_country", Type.Char(2), false), + ("geo_region", Type.Char(3), false), + ("geo_city", Type.Varchar(75), false), + ("geo_zipcode", Type.Varchar(15), false), + ("geo_latitude", Type.Double, false), + ("geo_longitude", Type.Double, false), + ("geo_region_name", Type.Varchar(100), false), // IP lookups - ("ip_isp", Type.Varchar(100), false), - ("ip_organization", Type.Varchar(100), false), - ("ip_domain", Type.Varchar(100), false), - ("ip_netspeed", Type.Varchar(100), false), + ("ip_isp", Type.Varchar(100), false), + ("ip_organization", Type.Varchar(100), false), + ("ip_domain", Type.Varchar(100), false), + ("ip_netspeed", Type.Varchar(100), false), // Page - ("page_url", Type.Varchar(4096), false), - ("page_title", Type.Varchar(2000), false), - ("page_referrer", Type.Varchar(4096), false), + ("page_url", Type.Varchar(4096), false), + ("page_title", Type.Varchar(2000), false), + ("page_referrer", Type.Varchar(4096), false), // Page URL components - ("page_urlscheme", Type.Varchar(16), false), - ("page_urlhost", Type.Varchar(255), false), - ("page_urlport", Type.Integer, false), - ("page_urlpath", Type.Varchar(3000), false), - ("page_urlquery", Type.Varchar(6000), false), - ("page_urlfragment", Type.Varchar(3000), false), + ("page_urlscheme", Type.Varchar(16), false), + ("page_urlhost", Type.Varchar(255), false), + ("page_urlport", Type.Integer, false), + ("page_urlpath", Type.Varchar(3000), false), + ("page_urlquery", Type.Varchar(6000), false), + ("page_urlfragment", Type.Varchar(3000), false), // Referrer URL components - ("refr_urlscheme", Type.Varchar(16), false), - ("refr_urlhost", Type.Varchar(255), false), - ("refr_urlport", Type.Integer, false), - ("refr_urlpath", Type.Varchar(6000), false), - ("refr_urlquery", Type.Varchar(6000), false), - ("refr_urlfragment", Type.Varchar(3000), false), + ("refr_urlscheme", Type.Varchar(16), false), + ("refr_urlhost", Type.Varchar(255), false), + ("refr_urlport", Type.Integer, false), + ("refr_urlpath", 
Type.Varchar(6000), false), + ("refr_urlquery", Type.Varchar(6000), false), + ("refr_urlfragment", Type.Varchar(3000), false), // Referrer details - ("refr_medium", Type.Varchar(25), false), - ("refr_source", Type.Varchar(50), false), - ("refr_term", Type.Varchar(255), false), + ("refr_medium", Type.Varchar(25), false), + ("refr_source", Type.Varchar(50), false), + ("refr_term", Type.Varchar(255), false), // Marketing - ("mkt_medium", Type.Varchar(255), false), - ("mkt_source", Type.Varchar(255), false), - ("mkt_term", Type.Varchar(255), false), - ("mkt_content", Type.Varchar(500), false), - ("mkt_campaign", Type.Varchar(255), false), + ("mkt_medium", Type.Varchar(255), false), + ("mkt_source", Type.Varchar(255), false), + ("mkt_term", Type.Varchar(255), false), + ("mkt_content", Type.Varchar(500), false), + ("mkt_campaign", Type.Varchar(255), false), // Custom structured event - ("se_category", Type.Varchar(1000), false), - ("se_action", Type.Varchar(1000), false), - ("se_label", Type.Varchar(1000), false), - ("se_property", Type.Varchar(1000), false), - ("se_value", Type.Double, false), + ("se_category", Type.Varchar(1000), false), + ("se_action", Type.Varchar(1000), false), + ("se_label", Type.Varchar(1000), false), + ("se_property", Type.Varchar(1000), false), + ("se_value", Type.Double, false), // Ecommerce - ("tr_orderid", Type.Varchar(255), false), - ("tr_affiliation", Type.Varchar(255), false), - ("tr_total", Type.Double, false), - ("tr_tax", Type.Double, false), - ("tr_shipping", Type.Double, false), - ("tr_city", Type.Varchar(255), false), - ("tr_state", Type.Varchar(255), false), - ("tr_country", Type.Varchar(255), false), - ("ti_orderid", Type.Varchar(255), false), - ("ti_sku", Type.Varchar(255), false), - ("ti_name", Type.Varchar(255), false), - ("ti_category", Type.Varchar(255), false), - ("ti_price", Type.Double, false), - ("ti_quantity", Type.Integer, false), + ("tr_orderid", Type.Varchar(255), false), + ("tr_affiliation", Type.Varchar(255), false), + ("tr_total", Type.Double, false), + ("tr_tax", Type.Double, false), + ("tr_shipping", Type.Double, false), + ("tr_city", Type.Varchar(255), false), + ("tr_state", Type.Varchar(255), false), + ("tr_country", Type.Varchar(255), false), + ("ti_orderid", Type.Varchar(255), false), + ("ti_sku", Type.Varchar(255), false), + ("ti_name", Type.Varchar(255), false), + ("ti_category", Type.Varchar(255), false), + ("ti_price", Type.Double, false), + ("ti_quantity", Type.Integer, false), // Page ping - ("pp_xoffset_min", Type.Integer, false), - ("pp_xoffset_max", Type.Integer, false), - ("pp_yoffset_min", Type.Integer, false), - ("pp_yoffset_max", Type.Integer, false), + ("pp_xoffset_min", Type.Integer, false), + ("pp_xoffset_max", Type.Integer, false), + ("pp_yoffset_min", Type.Integer, false), + ("pp_yoffset_max", Type.Integer, false), // User Agent - ("useragent", Type.Varchar(1000), false), + ("useragent", Type.Varchar(1000), false), // Browser - ("br_name", Type.Varchar(50), false), - ("br_family", Type.Varchar(50), false), - ("br_version", Type.Varchar(50), false), - ("br_type", Type.Varchar(50), false), - ("br_renderengine", Type.Varchar(50), false), - ("br_lang", Type.Varchar(255), false), - ("br_features_pdf", Type.Bool, false), - ("br_features_flash", Type.Bool, false), - ("br_features_java", Type.Bool, false), + ("br_name", Type.Varchar(50), false), + ("br_family", Type.Varchar(50), false), + ("br_version", Type.Varchar(50), false), + ("br_type", Type.Varchar(50), false), + ("br_renderengine", Type.Varchar(50), false), + 
("br_lang", Type.Varchar(255), false), + ("br_features_pdf", Type.Bool, false), + ("br_features_flash", Type.Bool, false), + ("br_features_java", Type.Bool, false), ("br_features_director", Type.Bool, false), ("br_features_quicktime", Type.Bool, false), ("br_features_realplayer", Type.Bool, false), ("br_features_windowsmedia", Type.Bool, false), - ("br_features_gears", Type.Bool, false), + ("br_features_gears", Type.Bool, false), ("br_features_silverlight", Type.Bool, false), - ("br_cookies", Type.Bool, false), - ("br_colordepth", Type.Varchar(12), false), - ("br_viewwidth", Type.Integer, false), - ("br_viewheight", Type.Integer, false), + ("br_cookies", Type.Bool, false), + ("br_colordepth", Type.Varchar(12), false), + ("br_viewwidth", Type.Integer, false), + ("br_viewheight", Type.Integer, false), // Operating System - ("os_name", Type.Varchar(50), false), - ("os_family", Type.Varchar(50), false), - ("os_manufacturer", Type.Varchar(50), false), - ("os_timezone", Type.Varchar(50), false), + ("os_name", Type.Varchar(50), false), + ("os_family", Type.Varchar(50), false), + ("os_manufacturer", Type.Varchar(50), false), + ("os_timezone", Type.Varchar(50), false), // Device/Hardware - ("dvce_type", Type.Varchar(50), false), - ("dvce_ismobile", Type.Bool, false), - ("dvce_screenwidth", Type.Integer, false), - ("dvce_screenheight", Type.Integer, false), + ("dvce_type", Type.Varchar(50), false), + ("dvce_ismobile", Type.Bool, false), + ("dvce_screenwidth", Type.Integer, false), + ("dvce_screenheight", Type.Integer, false), // Document - ("doc_charset", Type.Varchar(128), false), - ("doc_width", Type.Integer, false), - ("doc_height", Type.Integer, false), + ("doc_charset", Type.Varchar(128), false), + ("doc_width", Type.Integer, false), + ("doc_height", Type.Integer, false), // Currency - ("tr_currency", Type.Char(3), false), - ("tr_total_base", Type.Double, false), - ("tr_tax_base", Type.Double, false), - ("tr_shipping_base", Type.Double, false), - ("ti_currency", Type.Char(3), false), - ("ti_price_base", Type.Double, false), - ("base_currency", Type.Char(3), false), + ("tr_currency", Type.Char(3), false), + ("tr_total_base", Type.Double, false), + ("tr_tax_base", Type.Double, false), + ("tr_shipping_base", Type.Double, false), + ("ti_currency", Type.Char(3), false), + ("ti_price_base", Type.Double, false), + ("base_currency", Type.Char(3), false), // Geolocation - ("geo_timezone", Type.Varchar(64), false), + ("geo_timezone", Type.Varchar(64), false), // Click ID - ("mkt_clickid", Type.Varchar(128), false), - ("mkt_network", Type.Varchar(64), false), + ("mkt_clickid", Type.Varchar(128), false), + ("mkt_network", Type.Varchar(64), false), // ETL tags - ("etl_tags", Type.Varchar(500), false), + ("etl_tags", Type.Varchar(500), false), // Time event was sent - ("dvce_sent_tstamp", Type.Timestamp, false), + ("dvce_sent_tstamp", Type.Timestamp, false), // Referer - ("refr_domain_userid", Type.Varchar(36), false), - ("refr_dvce_tstamp", Type.Timestamp, false), + ("refr_domain_userid", Type.Varchar(36), false), + ("refr_dvce_tstamp", Type.Timestamp, false), // Session ID - ("domain_sessionid", Type.Uuid, false), + ("domain_sessionid", Type.Uuid, false), // Derived Type.Timestamp - ("derived_tstamp", Type.Timestamp, false), + ("derived_tstamp", Type.Timestamp, false), // Event schema - ("event_vendor", Type.Varchar(1000), false), - ("event_name", Type.Varchar(1000), false), - ("event_format", Type.Varchar(128), false), - ("event_version", Type.Varchar(128), false), + ("event_vendor", Type.Varchar(1000), 
false), + ("event_name", Type.Varchar(1000), false), + ("event_format", Type.Varchar(128), false), + ("event_version", Type.Varchar(128), false), // Event fingerprint - ("event_fingerprint", Type.Varchar(128), false), + ("event_fingerprint", Type.Varchar(128), false), // True Type.Timestamp - ("true_tstamp", Type.Timestamp, false) + ("true_tstamp", Type.Timestamp, false) ) def atomicSql(schema: String) = { - val columns = atomicColumns.map { - case (n, t, true) => Fragment.const(s"$n ${t.ddl} NOT NULL") - case (n, t, false) => Fragment.const(s"$n ${t.ddl}") - }.foldLeft(Fragment.empty) { (acc, cur) => - val separator = if (acc == Fragment.empty) Fragment.const("\n") else Fragment.const(",\n") - acc ++ separator ++ cur - } + val columns = atomicColumns + .map { + case (n, t, true) => Fragment.const(s"$n ${t.ddl} NOT NULL") + case (n, t, false) => Fragment.const(s"$n ${t.ddl}") + } + .foldLeft(Fragment.empty) { (acc, cur) => + val separator = if (acc == Fragment.empty) Fragment.const("\n") else Fragment.const(",\n") + acc ++ separator ++ cur + } val schemaFr = Fragment.const0(schema) diff --git a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/storage/query.scala b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/storage/query.scala index 58a8c8c..74cbe65 100644 --- a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/storage/query.scala +++ b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/storage/query.scala @@ -39,12 +39,12 @@ object query { FROM pg_catalog.pg_namespace WHERE nspname = $schema ) AND relname = $tableName""") - .queryWithLogHandler[Option[String]](logger) // It can be NULL, thus query[String].option will fail + .queryWithLogHandler[Option[String]](logger) // It can be NULL, thus query[String].option will fail .unique .map { case Some(comment) => SchemaKey.fromUri(comment) match { - case Right(key) => key.asRight + case Right(key) => key.asRight case Left(error) => CommentIssue.Invalid(tableName, comment, error).asLeft } case None => diff --git a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/storage/sql.scala b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/storage/sql.scala index edc6098..d2175fa 100644 --- a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/storage/sql.scala +++ b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/storage/sql.scala @@ -27,20 +27,20 @@ import com.snowplowanalytics.iglu.schemaddl.jsonschema.{Pointer, Schema} import com.snowplowanalytics.iglu.schemaddl.migrations.{FlatSchema, Migration, SchemaList} import com.snowplowanalytics.snowplow.postgres.shredding.transform.Atomic -import com.snowplowanalytics.snowplow.postgres.shredding.{Type, transform, schema} +import com.snowplowanalytics.snowplow.postgres.shredding.{Type, schema, transform} object sql { val DefaultVarcharSize = 4096 /** - * Generate the `CREATE TABLE` DDL statement - * @param schema database schema - * @param entity shredded entity - * @param schemaList state of the - * @param meta whether meta columns should be prepended - * @return pure SQL expression with `CREATE TABLE` statement - */ + * Generate the `CREATE TABLE` DDL statement + * @param schema database schema + * @param entity shredded entity + * @param schemaList state of the + * @param meta whether meta columns should be prepended + * @return pure SQL expression with `CREATE TABLE` statement + */ def createTable(schema: String, entity: SchemaKey, 
schemaList: SchemaList, meta: Boolean): Fragment = { val subschemas = FlatSchema.extractProperties(schemaList) @@ -52,7 +52,7 @@ object sql { val tableName = entity match { case Atomic => "events" - case other => StringUtils.getTableName(SchemaMap(other)) + case other => StringUtils.getTableName(SchemaMap(other)) } val columns = (if (meta) definitions.metaColumns.map((definitions.columnToString _).tupled) else Nil) ++ entityColumns @@ -64,21 +64,17 @@ object sql { def commentTable(logger: LogHandler, schema: String, tableName: String, schemaKey: SchemaMap): ConnectionIO[Unit] = { val uri = schemaKey.schemaKey.toSchemaUri val table = s"$schema.$tableName" - Fragment.const(s"COMMENT ON TABLE $table IS '$uri'") - .update(logger) - .run - .void + Fragment.const(s"COMMENT ON TABLE $table IS '$uri'").update(logger).run.void } - def migrateTable(schema: String, entity: SchemaKey, schemaList: SchemaList) = schemaList match { case s: SchemaList.Full => val migrationList = s.extractSegments.map(Migration.fromSegment) migrationList.find(_.from == entity.version) match { case Some(migration) => - val schemaMap = SchemaMap(migration.vendor, migration.name, "jsonschema", migration.to) - val tableName = getTableName(schemaMap) // e.g. com_acme_event_1 + val schemaMap = SchemaMap(migration.vendor, migration.name, "jsonschema", migration.to) + val tableName = getTableName(schemaMap) // e.g. com_acme_event_1 val tableNameFull = s"$schema.$tableName" if (migration.diff.added.nonEmpty) { @@ -95,20 +91,20 @@ object sql { Fragment.const0(s"""ALTER TABLE $tableNameFull $columnFragments""") } else Fragment.empty case None => - Fragment.empty // TODO: This should be a warning + Fragment.empty // TODO: This should be a warning } case _: SchemaList.Single => - Fragment.empty // TODO: This should be a warning + Fragment.empty // TODO: This should be a warning } /** - * Generate single ALTER TABLE statement for some new property - * + * Generate single ALTER TABLE statement for some new property + * * @param varcharSize default size for VARCHAR - * @param pair pair of property name and its Schema properties like - * length, maximum, etc - * @return DDL statement altering single column in table - */ + * @param pair pair of property name and its Schema properties like + * length, maximum, etc + * @return DDL statement altering single column in table + */ def buildColumn(varcharSize: Int, pair: (Pointer.SchemaPointer, Schema)): Column = pair match { case (pointer, properties) => @@ -118,6 +114,7 @@ object sql { } case class Column(name: String, dataType: Type, nullable: Boolean) { + /** "column_name VARCHAR(128) NOT NULL" */ def toFragment: Fragment = Fragment.const0(s"$name ${dataType.ddl} ${if (nullable) "NULL" else "NOT NULL"}") diff --git a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/storage/utils.scala b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/storage/utils.scala index a09ced3..b4b5194 100644 --- a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/storage/utils.scala +++ b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/storage/utils.scala @@ -34,7 +34,7 @@ object utils { def prepare[F[_]: Sync](schema: String, xa: Transactor[F], logger: LogHandler): F[Unit] = prepareEventsTable(schema, logger).transact(xa).flatMap { - case true => Sync[F].delay(println(s"$schema.events table already exists")) + case true => Sync[F].delay(println(s"$schema.events table already exists")) case false => 
Sync[F].delay(println(s"$schema.events table created")) } } diff --git a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/streaming/UnorderedPipe.scala b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/streaming/UnorderedPipe.scala new file mode 100644 index 0000000..72af2f6 --- /dev/null +++ b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/streaming/UnorderedPipe.scala @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2020 Snowplow Analytics Ltd. All rights reserved. + * + * This program is licensed to you under the Apache License Version 2.0, + * and you may not use this file except in compliance with the Apache License Version 2.0. + * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the Apache License Version 2.0 is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. + */ +package com.snowplowanalytics.snowplow.postgres.streaming + +import fs2.Pipe +import cats.effect.Concurrent +import doobie.hikari.HikariTransactor + +/** Evaluates effects, possibly concurrently, and emits the results downstream in any order + */ +trait UnorderedPipe[F[_]] { + def apply[A, B](f: A => F[B]): Pipe[F, A, B] +} + +object UnorderedPipe { + + /** An UnorderedPipe in which results are emitted in the same order as the inputs + * + * Use this UnorderedPipe when a `Concurrent[F]` is not available + */ + def sequential[F[_]]: UnorderedPipe[F] = + new UnorderedPipe[F] { + override def apply[A, B](f: A => F[B]): Pipe[F, A, B] = + _.evalMap(f) + } + + /** An UnorderedPipe that evaluates effects in parallel. + */ + def concurrent[F[_]: Concurrent](maxConcurrent: Int): UnorderedPipe[F] = + new UnorderedPipe[F] { + override def apply[A, B](f: A => F[B]): Pipe[F, A, B] = + _.parEvalMapUnordered(maxConcurrent)(f) + } + + /** A concurrent UnorderedPipe whose parallelism matches the size of the transactor's underlying connection pool. 
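[Editorial aside, not part of the patch: the constructors in this new file trade result ordering for parallelism. A small, hypothetical sketch of choosing between them follows; the `HikariTransactor` used by `forTransactor` (defined just below) is assumed to be provided elsewhere.]

    import scala.concurrent.ExecutionContext
    import cats.effect.{ContextShift, IO}
    import com.snowplowanalytics.snowplow.postgres.streaming.UnorderedPipe

    implicit val cs: ContextShift[IO] = IO.contextShift(ExecutionContext.global) // gives Concurrent[IO]

    val ordered: UnorderedPipe[IO] = UnorderedPipe.sequential[IO]    // no Concurrent needed; keeps input order
    val bounded: UnorderedPipe[IO] = UnorderedPipe.concurrent[IO](5) // at most 5 effects in flight, any output order
    // UnorderedPipe.forTransactor(xa), defined just below, matches parallelism to the JDBC pool size
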
+ * + * Use this UnorderedPipe whenever the effect requires a database connection + */ + def forTransactor[F[_]: Concurrent](xa: HikariTransactor[F]): UnorderedPipe[F] = + concurrent(xa.kernel.getMaximumPoolSize) + +} diff --git a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/streaming/data.scala b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/streaming/data.scala index 42ccbe5..8b7e0cf 100644 --- a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/streaming/data.scala +++ b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/streaming/data.scala @@ -23,10 +23,11 @@ object data { /** Kind of data flowing through the Loader */ sealed trait Data extends Product with Serializable { - def snowplow: Boolean = this match { - case _: Data.Snowplow => true - case _: Data.SelfDescribing => false - } + def snowplow: Boolean = + this match { + case _: Data.Snowplow => true + case _: Data.SelfDescribing => false + } } object Data { @@ -37,10 +38,11 @@ object data { /** Data that for some reasons cannot be inserted into DB */ sealed trait BadData extends Throwable with Product with Serializable object BadData { + /** Typical Snowplow bad row (Loader Iglu Error etc) */ case class BadEnriched(data: BadRow) extends BadData + /** Non-enriched error */ case class BadJson(payload: String, error: String) extends BadData } } - diff --git a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/streaming/sink.scala b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/streaming/sink.scala index 5195ace..0bde4d5 100644 --- a/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/streaming/sink.scala +++ b/modules/common/src/main/scala/com/snowplowanalytics/snowplow/postgres/streaming/sink.scala @@ -15,7 +15,7 @@ package com.snowplowanalytics.snowplow.postgres.streaming import cats.data.EitherT import cats.implicits._ -import cats.effect.{Sync, Clock, Concurrent} +import cats.effect.{Blocker, Clock, ContextShift, Sync} import fs2.Pipe @@ -29,62 +29,62 @@ import com.snowplowanalytics.iglu.core.circe.implicits._ import com.snowplowanalytics.iglu.client.Client import com.snowplowanalytics.snowplow.badrows.{BadRow, Payload, Processor} -import com.snowplowanalytics.snowplow.postgres.api.{State, DB} -import com.snowplowanalytics.snowplow.postgres.resources.FixedThreadPoolSize +import com.snowplowanalytics.snowplow.postgres.api.{DB, State} import com.snowplowanalytics.snowplow.postgres.shredding.{Entity, transform} -import com.snowplowanalytics.snowplow.postgres.streaming.data.{Data, BadData} +import com.snowplowanalytics.snowplow.postgres.streaming.data.{BadData, Data} object sink { type Insert = ConnectionIO[Unit] /** - * Sink good events into Postgres. During sinking, payloads go through all transformation steps - * and checking the state of the DB itself. - * Events that could not be transformed (due Iglu errors or DB unavailability) are emitted from - * the pipe - * @param state mutable Loader state - * @param client Iglu Client - * @param processor The actor processing these events - */ - def goodSink[F[_]: Concurrent: Clock: DB](state: State[F], - client: Client[F, Json], - processor: Processor): Pipe[F, Data, BadData] = - _.parEvalMapUnordered(FixedThreadPoolSize)(sinkPayload(state, client, processor)) - .collect { - case Left(badData) => badData - } + * Sink good events into Postgres. 
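[Editorial aside, not part of the patch: a hypothetical end-to-end wiring of the reworked goodSink/badSink shown below. The `xa`, `state`, `igluClient` and `blocker` values, and an implicit `DB[IO]` instance, are assumed to be supplied by the surrounding loader application.]

    import cats.effect.IO
    import fs2.Stream
    import com.snowplowanalytics.snowplow.badrows.Processor
    import com.snowplowanalytics.snowplow.postgres.streaming.{sink, UnorderedPipe}
    import com.snowplowanalytics.snowplow.postgres.streaming.data.Data

    val processor = Processor("pgloader", "test")

    // Good events are shredded and inserted with pool-sized parallelism;
    // anything that fails is emitted as BadData and printed on the blocking pool.
    def load(input: Stream[IO, Data]): Stream[IO, Unit] =
      input
        .through(sink.goodSink[IO](UnorderedPipe.forTransactor[IO](xa), state, igluClient, processor))
        .through(sink.badSink[IO](blocker))
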
During sinking, payloads go through all transformation steps + * and checking the state of the DB itself. + * Events that could not be transformed (due Iglu errors or DB unavailability) are emitted from + * the pipe + * @param unorderdPipe pipe which might optimise by processing events concurrently + * @param state mutable Loader state + * @param client Iglu Client + * @param processor The actor processing these events + */ + def goodSink[F[_]: Sync: Clock: DB](unorderedPipe: UnorderedPipe[F], + state: State[F], + client: Client[F, Json], + processor: Processor + ): Pipe[F, Data, BadData] = + unorderedPipe(sinkPayload(state, client, processor)).andThen { + _.collect { + case Left(badData) => badData + } + } /** Sink bad data coming directly into the `Pipe` */ - def badSink[F[_]: Concurrent]: Pipe[F, BadData, Unit] = + def badSink[F[_]: Sync: ContextShift](blocker: Blocker): Pipe[F, BadData, Unit] = _.evalMap { - case BadData.BadEnriched(row) => Sync[F].delay(println(row.compact)) - case BadData.BadJson(payload, error) => Sync[F].delay(println(s"Cannot parse $payload. $error")) + case BadData.BadEnriched(row) => blocker.delay[F, Unit](println(row.compact)) + case BadData.BadJson(payload, error) => blocker.delay[F, Unit](println(s"Cannot parse $payload. $error")) } /** Implementation for [[goodSink]] */ - def sinkPayload[F[_]: Sync: Clock: DB](state: State[F], - client: Client[F, Json], - processor: Processor)(payload: Data): F[Either[BadData, Unit]] = { + def sinkPayload[F[_]: Sync: Clock: DB](state: State[F], client: Client[F, Json], processor: Processor)( + payload: Data + ): F[Either[BadData, Unit]] = { val result = for { entities <- payload match { case Data.Snowplow(event) => - transform - .shredEvent[F](client, processor, event) - .leftMap(bad => BadData.BadEnriched(bad)) + transform.shredEvent[F](client, processor, event).leftMap(bad => BadData.BadEnriched(bad)) case Data.SelfDescribing(json) => - transform - .shredJson(client)(json) - .leftMap(errors => BadData.BadJson(json.normalize.noSpaces, errors.toString)) + transform.shredJson(client)(json).leftMap(errors => BadData.BadJson(json.normalize.noSpaces, errors.toString)) } insert <- EitherT(DB.process(entities, state).attempt).leftMap { - case error => payload match { - case Data.Snowplow(event) => - val badRow = BadRow.LoaderRuntimeError(processor, error.getMessage, Payload.LoaderPayload(event)) - BadData.BadEnriched(badRow) - case Data.SelfDescribing(json) => - BadData.BadJson(json.normalize.noSpaces, s"Cannot insert: ${error.getMessage}") - } + case error => + payload match { + case Data.Snowplow(event) => + val badRow = BadRow.LoaderRuntimeError(processor, error.getMessage, Payload.LoaderPayload(event)) + BadData.BadEnriched(badRow) + case Data.SelfDescribing(json) => + BadData.BadJson(json.normalize.noSpaces, s"Cannot insert: ${error.getMessage}") + } } } yield insert @@ -92,9 +92,9 @@ object sink { } /** - * Build an `INSERT` action for a single entity - * Multiple inserts later can be combined into a transaction - */ + * Build an `INSERT` action for a single entity + * Multiple inserts later can be combined into a transaction + */ def insertStatement(logger: LogHandler, schema: String, row: Entity): Insert = { val length = row.columns.length @@ -103,11 +103,10 @@ object sink { val table = Fragment.const0(s"$schema.${row.tableName}") val values = row.columns.zipWithIndex.foldLeft(fr0"") { case (acc, (cur, i)) if i < length - 1 => acc ++ cur.value.fragment ++ fr0"," - case (acc, (cur, _)) => acc ++ cur.value.fragment + case 
(acc, (cur, _)) => acc ++ cur.value.fragment } fr"""INSERT INTO $table ($columns) VALUES ($values)""".update(logger).run.void } - } diff --git a/modules/common/src/test/scala/com/snowplowanalytics/snowplow/postgres/Database.scala b/modules/common/src/test/scala/com/snowplowanalytics/snowplow/postgres/Database.scala index 7be64a6..dab8a7d 100644 --- a/modules/common/src/test/scala/com/snowplowanalytics/snowplow/postgres/Database.scala +++ b/modules/common/src/test/scala/com/snowplowanalytics/snowplow/postgres/Database.scala @@ -6,7 +6,7 @@ import java.util.UUID import cats.data.EitherT import cats.implicits._ -import cats.effect.{ContextShift, IO, Clock} +import cats.effect.{Clock, ContextShift, IO} import org.specs2.mutable.Specification import org.specs2.specification.BeforeAfterEach @@ -19,7 +19,7 @@ import io.circe.Json import com.snowplowanalytics.iglu.client.Client import com.snowplowanalytics.iglu.client.resolver.Resolver import com.snowplowanalytics.iglu.client.resolver.registries.Registry -import com.snowplowanalytics.iglu.client.resolver.registries.Registry.{HttpConnection, Config, Http} +import com.snowplowanalytics.iglu.client.resolver.registries.Registry.{Config, Http, HttpConnection} import com.snowplowanalytics.iglu.client.validator.CirceValidator import com.snowplowanalytics.snowplow.badrows.FailureDetails @@ -56,7 +56,8 @@ object Database { columnDefault: Option[String], isNullable: Boolean, dataType: String, - characterMaximumLength: Option[Int]) + characterMaximumLength: Option[Int] + ) def query: IO[List[UUID]] = fr"SELECT event_id FROM events".query[UUID].to[List].transact(xa) diff --git a/modules/common/src/test/scala/com/snowplowanalytics/snowplow/postgres/api/SchemaStateSpec.scala b/modules/common/src/test/scala/com/snowplowanalytics/snowplow/postgres/api/SchemaStateSpec.scala index c45f1f2..50b9ba2 100644 --- a/modules/common/src/test/scala/com/snowplowanalytics/snowplow/postgres/api/SchemaStateSpec.scala +++ b/modules/common/src/test/scala/com/snowplowanalytics/snowplow/postgres/api/SchemaStateSpec.scala @@ -28,10 +28,7 @@ class SchemaStateSpec extends Database { "init" should { "initialize an empty state if no tables exist" >> { val state = SchemaState.init(List(), Database.igluClient.resolver) - val result = state - .semiflatMap(_.get) - .value - .unsafeRunSync() + val result = state.semiflatMap(_.get).value.unsafeRunSync() val expected = SchemaState(Map()) result must beRight(expected) } @@ -39,7 +36,7 @@ class SchemaStateSpec extends Database { "check" should { "confirm table exists with a same key as in state" >> { - val key = SchemaKey("com.acme", "event", "jsonschema", SchemaVer.Full(1,0,0)) + val key = SchemaKey("com.acme", "event", "jsonschema", SchemaVer.Full(1, 0, 0)) val schemaList = SchemaStateSpec.buildSchemaList(List(key)) val init = Map(("com.acme", "event", 1) -> schemaList) @@ -48,25 +45,25 @@ class SchemaStateSpec extends Database { } "claim table is outdated for 1-0-1 key if only 1-0-0 is known" >> { - val key = SchemaKey("com.acme", "event", "jsonschema", SchemaVer.Full(1,0,0)) + val key = SchemaKey("com.acme", "event", "jsonschema", SchemaVer.Full(1, 0, 0)) val schemaList = SchemaStateSpec.buildSchemaList(List(key)) val init = Map(("com.acme", "event", 1) -> schemaList) val state = SchemaState(init) - state.check(SchemaKey("com.acme", "event", "jsonschema", SchemaVer.Full(1,0,1))) must beEqualTo(TableState.Outdated) + state.check(SchemaKey("com.acme", "event", "jsonschema", SchemaVer.Full(1, 0, 1))) must beEqualTo(TableState.Outdated) } "claim 
table is missing for bumped model" >> { - val key = SchemaKey("com.acme", "event", "jsonschema", SchemaVer.Full(1,0,0)) + val key = SchemaKey("com.acme", "event", "jsonschema", SchemaVer.Full(1, 0, 0)) val schemaList = SchemaStateSpec.buildSchemaList(List(key)) val init = Map(("com.acme", "event", 1) -> schemaList) val state = SchemaState(init) - state.check(SchemaKey("com.acme", "event", "jsonschema", SchemaVer.Full(2,0,0))) must beEqualTo(TableState.Missing) + state.check(SchemaKey("com.acme", "event", "jsonschema", SchemaVer.Full(2, 0, 0))) must beEqualTo(TableState.Missing) } "always assume events table exists" >> { - val atomic = SchemaKey("com.snowplowanalytics.snowplow", "atomic", "jsonschema", SchemaVer.Full(1,0,0)) + val atomic = SchemaKey("com.snowplowanalytics.snowplow", "atomic", "jsonschema", SchemaVer.Full(1, 0, 0)) val state = SchemaState(Map()) state.check(atomic) must beEqualTo(TableState.Match) } diff --git a/modules/common/src/test/scala/com/snowplowanalytics/snowplow/postgres/api/StateSpec.scala b/modules/common/src/test/scala/com/snowplowanalytics/snowplow/postgres/api/StateSpec.scala index 1086710..b07c6ee 100644 --- a/modules/common/src/test/scala/com/snowplowanalytics/snowplow/postgres/api/StateSpec.scala +++ b/modules/common/src/test/scala/com/snowplowanalytics/snowplow/postgres/api/StateSpec.scala @@ -19,32 +19,32 @@ import concurrent.duration._ import cats.implicits._ import cats.effect.concurrent.Ref -import cats.effect.{IO, Timer, Clock} +import cats.effect.{Clock, IO, Timer} import com.snowplowanalytics.iglu.core.{SchemaKey, SchemaVer} import org.specs2.ScalaCheck import org.specs2.mutable.Specification -import com.snowplowanalytics.snowplow.postgres.Database.{igluClient, CS} +import com.snowplowanalytics.snowplow.postgres.Database.{CS, igluClient} import com.snowplowanalytics.snowplow.postgres.api.DB.StateCheck import com.snowplowanalytics.snowplow.postgres.api.StateSpec._ -import org.scalacheck.{Prop, Gen} +import org.scalacheck.{Gen, Prop} import org.specs2.scalacheck.Parameters class StateSpec extends Specification with ScalaCheck { "checkAndRun" should { "execute `mutate` when StateCheck is Block" >> { - val key = SchemaKey("com.acme", "missing_table", "jsonschema", SchemaVer.Full(1,0,0)) + val key = SchemaKey("com.acme", "missing_table", "jsonschema", SchemaVer.Full(1, 0, 0)) val alwaysEmpty: SchemaState => StateCheck = _ => StateCheck.Block(Set(key), Set.empty) val result = for { state <- initState - db <- Ref.of[IO, List[Int]](List.empty) - keys <- Ref.of[IO, Set[SchemaKey]](Set.empty) - _ <- state.checkAndRun(alwaysEmpty, IO.sleep(100.millis) *> db.update(s => 1 :: s), (m, _) => keys.update(_ ++ m)) - res <- (db.get, keys.get).tupled + db <- Ref.of[IO, List[Int]](List.empty) + keys <- Ref.of[IO, Set[SchemaKey]](Set.empty) + _ <- state.checkAndRun(alwaysEmpty, IO.sleep(100.millis) *> db.update(s => 1 :: s), (m, _) => keys.update(_ ++ m)) + res <- (db.get, keys.get).tupled } yield res result.unsafeRunSync() must beEqualTo((List(1), Set(key))) @@ -56,60 +56,64 @@ class StateSpec extends Specification with ScalaCheck { val result = for { state <- initState - db <- Ref.of[IO, List[Int]](List.empty) - keys <- Ref.of[IO, Set[SchemaKey]](Set.empty) - _ <- state.checkAndRun(alwaysOk, IO.sleep(100.millis) *> db.update(s => 1 :: s), (m, _) => keys.update(_ ++ m)) - res <- (db.get, keys.get).tupled + db <- Ref.of[IO, List[Int]](List.empty) + keys <- Ref.of[IO, Set[SchemaKey]](Set.empty) + _ <- state.checkAndRun(alwaysOk, IO.sleep(100.millis) *> db.update(s => 1 
:: s), (m, _) => keys.update(_ ++ m)) + res <- (db.get, keys.get).tupled } yield res result.unsafeRunSync() must beEqualTo((List(1), Set())) } "execute locked calls one after another" >> { - val key = SchemaKey("com.acme", "missing_table", "jsonschema", SchemaVer.Full(1,0,0)) + val key = SchemaKey("com.acme", "missing_table", "jsonschema", SchemaVer.Full(1, 0, 0)) val alwaysEmpty: SchemaState => StateCheck = _ => StateCheck.Block(Set(key), Set.empty) - Prop.forAll(durationsGen) { durations => - val checks = for { - state <- initState - db <- Ref.of[IO, Int](0) - keys <- Ref.of[IO, Set[SchemaKey]](Set.empty) - _ <- durations.parTraverse_(d => state.checkAndRun(alwaysEmpty, db.update(_ + 1), (m, _) => IO.sleep(d) *> keys.update(_ ++ m))) - res <- (db.get, keys.get).tupled - } yield res - val result = measure(checks) - - result.unsafeRunSync() must beLike { - case ((counter, keys), time) => - val totalDelays = durations.foldMap(_.toMillis) - val allBlocking = time must beBetween(totalDelays, totalDelays * 2) - allBlocking and (counter must beEqualTo(durations.length)) and (keys must beEqualTo(Set(key))) + Prop + .forAll(durationsGen) { durations => + val checks = for { + state <- initState + db <- Ref.of[IO, Int](0) + keys <- Ref.of[IO, Set[SchemaKey]](Set.empty) + _ <- durations.parTraverse_(d => state.checkAndRun(alwaysEmpty, db.update(_ + 1), (m, _) => IO.sleep(d) *> keys.update(_ ++ m))) + res <- (db.get, keys.get).tupled + } yield res + val result = measure(checks) + + result.unsafeRunSync() must beLike { + case ((counter, keys), time) => + val totalDelays = durations.foldMap(_.toMillis) + val allBlocking = time must beBetween(totalDelays, totalDelays * 2) + allBlocking.and(counter must beEqualTo(durations.length)).and(keys must beEqualTo(Set(key))) + } } - }.setParameters(Parameters(minTestsOk = 5, maxSize = 10)) + .setParameters(Parameters(minTestsOk = 5, maxSize = 10)) } - "execute non-locked calls in parallel" >> { // Potentially flaky test + "execute non-locked calls in parallel" >> { // Potentially flaky test val alwaysEmpty: SchemaState => StateCheck = _ => StateCheck.Ok - Prop.forAll(durationsGen) { durations => - val checks = for { - state <- initState - db <- Ref.of[IO, Int](0) - keys <- Ref.of[IO, Set[SchemaKey]](Set.empty) - _ <- durations.parTraverse_(d => state.checkAndRun(alwaysEmpty, IO.sleep(d) *> db.update(_ + 1), (m, _) => keys.update(_ ++ m))) - res <- (db.get, keys.get).tupled - } yield res - val result = measure(checks) - - result.unsafeRunSync() must beLike { - case ((counter, keys), time) => - val maxDelay = durations.fold(5.millis)((a, b) => a.max(b)).toMillis - val nonBlocking = time must lessThan(maxDelay * 2) - nonBlocking and (counter must beEqualTo(durations.length)) and (keys must beEqualTo(Set())) + Prop + .forAll(durationsGen) { durations => + val checks = for { + state <- initState + db <- Ref.of[IO, Int](0) + keys <- Ref.of[IO, Set[SchemaKey]](Set.empty) + _ <- durations.parTraverse_(d => state.checkAndRun(alwaysEmpty, IO.sleep(d) *> db.update(_ + 1), (m, _) => keys.update(_ ++ m))) + res <- (db.get, keys.get).tupled + } yield res + val result = measure(checks) + + result.unsafeRunSync() must beLike { + case ((counter, keys), time) => + val maxDelay = durations.fold(5.millis)((a, b) => a.max(b)).toMillis + val nonBlocking = time must lessThan(maxDelay * 2) + nonBlocking.and(counter must beEqualTo(durations.length)).and(keys must beEqualTo(Set())) + } } - }.setParameters(Parameters(minTestsOk = 5, maxSize = 10)) + .setParameters(Parameters(minTestsOk = 
5, maxSize = 10)) } } } @@ -118,7 +122,8 @@ object StateSpec { implicit val C: Clock[IO] = Clock.create[IO] implicit val T: Timer[IO] = IO.timer(concurrent.ExecutionContext.global) - val initState = State.init[IO](List.empty, igluClient.resolver) + val initState = State + .init[IO](List.empty, igluClient.resolver) .value .flatMap(_.fold(_ => IO.raiseError[State[IO]](new IllegalStateException("Cannot start a test")), IO.pure)) diff --git a/modules/common/src/test/scala/com/snowplowanalytics/snowplow/postgres/streaming/sinkspec.scala b/modules/common/src/test/scala/com/snowplowanalytics/snowplow/postgres/streaming/sinkspec.scala index d8c56f9..5d79bab 100644 --- a/modules/common/src/test/scala/com/snowplowanalytics/snowplow/postgres/streaming/sinkspec.scala +++ b/modules/common/src/test/scala/com/snowplowanalytics/snowplow/postgres/streaming/sinkspec.scala @@ -28,19 +28,19 @@ import com.snowplowanalytics.snowplow.analytics.scalasdk.Event import com.snowplowanalytics.snowplow.badrows.Processor import com.snowplowanalytics.snowplow.postgres.Database -import com.snowplowanalytics.snowplow.postgres.api.{State, DB} +import com.snowplowanalytics.snowplow.postgres.api.{DB, State} import com.snowplowanalytics.snowplow.postgres.streaming.data.Data - class sinkspec extends Database { import Database._ val processor = Processor("pgloader", "test") - + val unorderedPipe = UnorderedPipe.concurrent[IO](5) "goodSink" should { "sink a single good event" >> { - val line = "snowplowweb\tweb\t2018-12-18 15:07:17.970\t2016-03-29 07:28:18.611\t2016-03-29 07:28:18.634\tpage_view\t11cdec7b-4cbd-4aa4-a4c9-3874ab9663d4\t\tsnplow6\tjs-2.6.0\tssc-0.6.0-kinesis\tspark-1.16.0-common-0.35.0\t34df2c48bc170c87befb441732a94196\t372d1f2983860eefd262b58e6592dfbc\t80546dc70f4a91f1283c4b6247e31bcf\t26e6412a2421eb923d9d40258ca9ca69\t1\t3a12e8b8e3e91a4d092b833d583c7e30\tDK\t82\tOdder\t8300\t42.0001\t42.003\tCentral Jutland\tTDC Danmark\tTDC Danmark\t\t\thttp://snowplowanalytics.com/documentation/recipes/catalog-analytics/market-basket-analysis-identifying-products-that-sell-well-together.html\tMarket basket analysis - identifying products and content that go well together – 
Snowplow\thttp://snowplowanalytics.com/analytics/catalog-analytics/market-basket-analysis-identifying-products-that-sell-well-together.html\thttp\tsnowplowanalytics.com\t80\t/documentation/recipes/catalog-analytics/market-basket-analysis-identifying-products-that-sell-well-together.html\t\t\thttp\tsnowplowanalytics.com\t80\t/analytics/catalog-analytics/market-basket-analysis-identifying-products-that-sell-well-together.html\t\t\tinternal\t\t\t\t\t\t\t\t{\"schema\":\"iglu:com.snowplowanalytics.snowplow/contexts/jsonschema/1-0-0\",\"data\":[{\"schema\":\"iglu:com.snowplowanalytics.snowplow/web_page/jsonschema/1-0-0\",\"data\":{\"id\":\"05862d26-0dde-4d7a-a494-fc9aae283d23\"}},{\"schema\":\"iglu:org.schema/WebPage/jsonschema/1-0-0\",\"data\":{\"genre\":\"documentation\",\"inLanguage\":\"en-US\"}},{\"schema\":\"iglu:org.w3/PerformanceTiming/jsonschema/1-0-0\",\"data\":{\"navigationStart\":1459236496534,\"unloadEventStart\":1459236496838,\"unloadEventEnd\":1459236496838,\"redirectStart\":0,\"redirectEnd\":0,\"fetchStart\":1459236496534,\"domainLookupStart\":1459236496534,\"domainLookupEnd\":1459236496534,\"connectStart\":1459236496534,\"connectEnd\":1459236496534,\"secureConnectionStart\":0,\"requestStart\":1459236496580,\"responseStart\":1459236496834,\"responseEnd\":1459236496844,\"domLoading\":1459236496853,\"domInteractive\":1459236497780,\"domContentLoadedEventStart\":1459236497780,\"domContentLoadedEventEnd\":1459236498038,\"domComplete\":0,\"loadEventStart\":0,\"loadEventEnd\":0,\"chromeFirstPaint\":1459236498203}}]}\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tMozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36\tChrome 49\tChrome\t49.0.2623.87\tBrowser\tWEBKIT\ten-US\t1\t1\t0\t0\t0\t0\t0\t0\t0\t1\t24\t1920\t1075\tWindows 7\tWindows\tMicrosoft Corporation\tEurope/Berlin\tComputer\t0\t1920\t1200\tUTF-8\t1903\t11214\t\t\t\t\t\t\t\tEurope/Copenhagen\t\t\t\t2016-03-29 07:28:18.636\t\t\t{\"schema\":\"iglu:com.snowplowanalytics.snowplow/contexts/jsonschema/1-0-1\",\"data\":[{\"schema\":\"iglu:com.snowplowanalytics.snowplow/ua_parser_context/jsonschema/1-0-0\",\"data\":{\"useragentFamily\":\"Chrome\",\"useragentMajor\":\"49\",\"useragentMinor\":\"0\",\"useragentPatch\":\"2623\",\"useragentVersion\":\"Chrome 49.0.2623\",\"osFamily\":\"Windows\",\"osMajor\":\"7\",\"osMinor\":null,\"osPatch\":null,\"osPatchMinor\":null,\"osVersion\":\"Windows 7\",\"deviceFamily\":\"Other\"}}]}\t88c23330-ac4d-4c82-8a18-aa83c1e0c163\t2016-03-29 07:28:18.609\tcom.snowplowanalytics.snowplow\tpage_view\tjsonschema\t1-0-0\tcab5ba164038f31d8e10befc4eb199df\t" + val line = + "snowplowweb\tweb\t2018-12-18 15:07:17.970\t2016-03-29 07:28:18.611\t2016-03-29 07:28:18.634\tpage_view\t11cdec7b-4cbd-4aa4-a4c9-3874ab9663d4\t\tsnplow6\tjs-2.6.0\tssc-0.6.0-kinesis\tspark-1.16.0-common-0.35.0\t34df2c48bc170c87befb441732a94196\t372d1f2983860eefd262b58e6592dfbc\t80546dc70f4a91f1283c4b6247e31bcf\t26e6412a2421eb923d9d40258ca9ca69\t1\t3a12e8b8e3e91a4d092b833d583c7e30\tDK\t82\tOdder\t8300\t42.0001\t42.003\tCentral Jutland\tTDC Danmark\tTDC Danmark\t\t\thttp://snowplowanalytics.com/documentation/recipes/catalog-analytics/market-basket-analysis-identifying-products-that-sell-well-together.html\tMarket basket analysis - identifying products and content that go well together – 
Snowplow\thttp://snowplowanalytics.com/analytics/catalog-analytics/market-basket-analysis-identifying-products-that-sell-well-together.html\thttp\tsnowplowanalytics.com\t80\t/documentation/recipes/catalog-analytics/market-basket-analysis-identifying-products-that-sell-well-together.html\t\t\thttp\tsnowplowanalytics.com\t80\t/analytics/catalog-analytics/market-basket-analysis-identifying-products-that-sell-well-together.html\t\t\tinternal\t\t\t\t\t\t\t\t{\"schema\":\"iglu:com.snowplowanalytics.snowplow/contexts/jsonschema/1-0-0\",\"data\":[{\"schema\":\"iglu:com.snowplowanalytics.snowplow/web_page/jsonschema/1-0-0\",\"data\":{\"id\":\"05862d26-0dde-4d7a-a494-fc9aae283d23\"}},{\"schema\":\"iglu:org.schema/WebPage/jsonschema/1-0-0\",\"data\":{\"genre\":\"documentation\",\"inLanguage\":\"en-US\"}},{\"schema\":\"iglu:org.w3/PerformanceTiming/jsonschema/1-0-0\",\"data\":{\"navigationStart\":1459236496534,\"unloadEventStart\":1459236496838,\"unloadEventEnd\":1459236496838,\"redirectStart\":0,\"redirectEnd\":0,\"fetchStart\":1459236496534,\"domainLookupStart\":1459236496534,\"domainLookupEnd\":1459236496534,\"connectStart\":1459236496534,\"connectEnd\":1459236496534,\"secureConnectionStart\":0,\"requestStart\":1459236496580,\"responseStart\":1459236496834,\"responseEnd\":1459236496844,\"domLoading\":1459236496853,\"domInteractive\":1459236497780,\"domContentLoadedEventStart\":1459236497780,\"domContentLoadedEventEnd\":1459236498038,\"domComplete\":0,\"loadEventStart\":0,\"loadEventEnd\":0,\"chromeFirstPaint\":1459236498203}}]}\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\tMozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36\tChrome 49\tChrome\t49.0.2623.87\tBrowser\tWEBKIT\ten-US\t1\t1\t0\t0\t0\t0\t0\t0\t0\t1\t24\t1920\t1075\tWindows 7\tWindows\tMicrosoft Corporation\tEurope/Berlin\tComputer\t0\t1920\t1200\tUTF-8\t1903\t11214\t\t\t\t\t\t\t\tEurope/Copenhagen\t\t\t\t2016-03-29 07:28:18.636\t\t\t{\"schema\":\"iglu:com.snowplowanalytics.snowplow/contexts/jsonschema/1-0-1\",\"data\":[{\"schema\":\"iglu:com.snowplowanalytics.snowplow/ua_parser_context/jsonschema/1-0-0\",\"data\":{\"useragentFamily\":\"Chrome\",\"useragentMajor\":\"49\",\"useragentMinor\":\"0\",\"useragentPatch\":\"2623\",\"useragentVersion\":\"Chrome 49.0.2623\",\"osFamily\":\"Windows\",\"osMajor\":\"7\",\"osMinor\":null,\"osPatch\":null,\"osPatchMinor\":null,\"osVersion\":\"Windows 7\",\"deviceFamily\":\"Other\"}}]}\t88c23330-ac4d-4c82-8a18-aa83c1e0c163\t2016-03-29 07:28:18.609\tcom.snowplowanalytics.snowplow\tpage_view\tjsonschema\t1-0-0\tcab5ba164038f31d8e10befc4eb199df\t" val event = Event.parse(line).getOrElse(throw new RuntimeException("Event is invalid")) val stream = Stream.emit[IO, Data](Data.Snowplow(event)) @@ -48,7 +48,7 @@ class sinkspec extends Database { val action = for { state <- State.init[IO](List(), igluClient.resolver) - _ <- stream.through(sink.goodSink(state, igluClient, processor)).compile.drain.action + _ <- stream.through(sink.goodSink(unorderedPipe, state, igluClient, processor)).compile.drain.action eventIds <- query.action uaParserCtxs <- count("com_snowplowanalytics_snowplow_ua_parser_context_1").action } yield (eventIds, uaParserCtxs) @@ -57,7 +57,7 @@ class sinkspec extends Database { val ExpectedEventId = UUID.fromString("11cdec7b-4cbd-4aa4-a4c9-3874ab9663d4") result must beRight.like { case (List(ExpectedEventId), 1) => ok - case (ids, ctxs) => ko(s"Unexpected result. Event ids: $ids; Contexts: $ctxs") + case (ids, ctxs) => ko(s"Unexpected result. 
Event ids: $ids; Contexts: $ctxs") } } @@ -70,14 +70,14 @@ class sinkspec extends Database { val action = for { state <- State.init[IO](List(), igluClient.resolver) - _ <- stream.through(sink.goodSink(state, igluClient, processor)).compile.drain.action + _ <- stream.through(sink.goodSink(unorderedPipe, state, igluClient, processor)).compile.drain.action eventIds <- query.action rows <- count("com_getvero_bounced_1").action } yield (eventIds, rows) val result = action.value.unsafeRunSync() result must beRight.like { - case (Nil, 1) => ok + case (Nil, 1) => ok case (ids, ctxs) => ko(s"Unexpected result. Event ids: ${ids.mkString(", ")}; Contexts: $ctxs") } } @@ -87,27 +87,29 @@ class sinkspec extends Database { json"""{"schema":"iglu:me.chuwy/pg-test/jsonschema/1-0-0","data":{"requiredString":"one","requiredUnion":false,"nested":{"a": 1}}}""", json"""{"schema":"iglu:me.chuwy/pg-test/jsonschema/1-0-1","data":{"requiredString":"two", "requiredUnion": false, "nested": {"a": 2}, "someArray": [2,"two",{}]}}""", json"""{"schema":"iglu:me.chuwy/pg-test/jsonschema/1-0-2","data":{"requiredString":"three","requiredUnion":"three","nested":{"a": 3},"bigInt": 3}}""" - ).map(SelfDescribingData.parse[Json]).map(_.getOrElse(throw new RuntimeException("Invalid SelfDescribingData"))).map(Data.SelfDescribing.apply) + ).map(SelfDescribingData.parse[Json]) + .map(_.getOrElse(throw new RuntimeException("Invalid SelfDescribingData"))) + .map(Data.SelfDescribing.apply) val stream = Stream.emits[IO, Data](rows) val ExpectedColumnInfo = List( - ColumnInfo("required_string", None, false, "character varying", Some(4096)), - ColumnInfo("required_union", None, false, "jsonb", None), - ColumnInfo("id", None, true, "uuid", None), - ColumnInfo("nested.a", None, true, "double precision", None), - ColumnInfo("nested.b", None, true, "character varying", Some(4096)), - ColumnInfo("some_array", None, true, "jsonb", None), - ColumnInfo("nested.c", None, true, "bigint", None), - ColumnInfo("some_date", None, true, "timestamp without time zone", None), - ColumnInfo("big_int", None, true, "bigint", None), + ColumnInfo("required_string", None, false, "character varying", Some(4096)), + ColumnInfo("required_union", None, false, "jsonb", None), + ColumnInfo("id", None, true, "uuid", None), + ColumnInfo("nested.a", None, true, "double precision", None), + ColumnInfo("nested.b", None, true, "character varying", Some(4096)), + ColumnInfo("some_array", None, true, "jsonb", None), + ColumnInfo("nested.c", None, true, "bigint", None), + ColumnInfo("some_date", None, true, "timestamp without time zone", None), + ColumnInfo("big_int", None, true, "bigint", None) ) implicit val D = DB.interpreter[IO](igluClient.resolver, xa, logger, Schema) val action = for { state <- State.init[IO](List(), igluClient.resolver) - _ <- stream.through(sink.goodSink(state, igluClient, processor)).compile.drain.action + _ <- stream.through(sink.goodSink(unorderedPipe, state, igluClient, processor)).compile.drain.action rows <- count("me_chuwy_pg_test_1").action table <- describeTable("me_chuwy_pg_test_1").action } yield (rows, table) @@ -115,7 +117,7 @@ class sinkspec extends Database { val result = action.value.unsafeRunSync() result must beRight.like { case (3, ExpectedColumnInfo) => ok - case (ctxs, cols) => ko(s"Unexpected result. Number of rows: $ctxs; Columns ${cols}") + case (ctxs, cols) => ko(s"Unexpected result. 
Number of rows: $ctxs; Columns ${cols}") } } } diff --git a/modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/config/Cli.scala b/modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/config/Cli.scala index 0528434..bf7f91a 100644 --- a/modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/config/Cli.scala +++ b/modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/config/Cli.scala @@ -12,13 +12,13 @@ */ package com.snowplowanalytics.snowplow.postgres.config -import java.nio.file.{InvalidPathException, Files, Path, Paths} +import java.nio.file.{Files, InvalidPathException, Path, Paths} import java.util.Base64 import cats.data.{EitherT, ValidatedNel} import cats.implicits._ -import cats.effect.{Sync, Clock} +import cats.effect.{Clock, Sync} import io.circe.Json import io.circe.syntax._ @@ -44,37 +44,39 @@ object Cli { /** Parse list of arguments, validate against schema and initialize */ def parse[F[_]: Sync: Clock](args: List[String]): EitherT[F, String, Cli[F]] = command.parse(args) match { - case Left(help) => EitherT.leftT[F, Cli[F]](help.show) + case Left(help) => EitherT.leftT[F, Cli[F]](help.show) case Right(rawConfig) => fromRawConfig(rawConfig) } - private def fromRawConfig[F[_]: Sync: Clock](rawConfig: RawConfig): EitherT[F, String, Cli[F]] = { + private def fromRawConfig[F[_]: Sync: Clock](rawConfig: RawConfig): EitherT[F, String, Cli[F]] = for { resolverJson <- PathOrJson.load(rawConfig.resolver) igluClient <- Client.parseDefault[F](resolverJson).leftMap(_.show) configJson <- PathOrJson.load(rawConfig.config) - configData <- SelfDescribingData.parse(configJson).leftMap(e => s"Configuration JSON is not self-describing, ${e.message(configJson.noSpaces)}").toEitherT[F] + configData <- + SelfDescribingData + .parse(configJson) + .leftMap(e => s"Configuration JSON is not self-describing, ${e.message(configJson.noSpaces)}") + .toEitherT[F] _ <- igluClient.check(configData).leftMap(e => s"Iglu validation failed with following error\n: ${e.asJson.spaces2}") appConfig <- configData.data.as[LoaderConfig].toEitherT[F].leftMap(e => s"Error while decoding configuration JSON, ${e.show}") } yield Cli(appConfig, igluClient, rawConfig.debug) - } /** Config files for Loader can be passed either as FS path - * or as base64-encoded JSON (if `--base64` is provided) */ + * or as base64-encoded JSON (if `--base64` is provided) */ type PathOrJson = Either[Path, Json] object PathOrJson { def parse(string: String, encoded: Boolean): ValidatedNel[String, PathOrJson] = { - val result = if (encoded) - Either - .catchOnly[IllegalArgumentException](new String(Base64.getDecoder.decode(string))) - .leftMap(_.getMessage) - .flatMap(s => jsonParse(s).leftMap(_.show)) - .map(_.asRight) - else Either.catchOnly[InvalidPathException](Paths.get(string).asLeft).leftMap(_.getMessage) - result - .leftMap(error => s"Cannot parse as ${if (encoded) "base64-encoded JSON" else "FS path"}: $error") - .toValidatedNel + val result = + if (encoded) + Either + .catchOnly[IllegalArgumentException](new String(Base64.getDecoder.decode(string))) + .leftMap(_.getMessage) + .flatMap(s => jsonParse(s).leftMap(_.show)) + .map(_.asRight) + else Either.catchOnly[InvalidPathException](Paths.get(string).asLeft).leftMap(_.getMessage) + result.leftMap(error => s"Cannot parse as ${if (encoded) "base64-encoded JSON" else "FS path"}: $error").toValidatedNel } def load[F[_]: Sync](value: PathOrJson): EitherT[F, String, Json] = @@ -100,22 +102,26 @@ object Cli { help = 
"Self-describing JSON configuration" ) - val base64 = Opts.flag( - long = "base64", - help = "Configuration passed as Base64-encoded string, not as file path" - ).orFalse + val base64 = Opts + .flag( + long = "base64", + help = "Configuration passed as Base64-encoded string, not as file path" + ) + .orFalse - val debug = Opts.flag( - long = "debug", - help = "Show verbose SQL logging" - ).orFalse + val debug = Opts + .flag( + long = "debug", + help = "Show verbose SQL logging" + ) + .orFalse /** Temporary, pure config */ private case class RawConfig(config: PathOrJson, resolver: PathOrJson, debug: Boolean) private val command: Command[RawConfig] = - Command[(String, String, Boolean, Boolean)](BuildInfo.name, BuildInfo.version)((config, resolver, base64, debug).tupled) - .mapValidated { case (cfg, res, enc, deb) => + Command[(String, String, Boolean, Boolean)](BuildInfo.name, BuildInfo.version)((config, resolver, base64, debug).tupled).mapValidated { + case (cfg, res, enc, deb) => (PathOrJson.parse(cfg, enc), PathOrJson.parse(res, enc), deb.validNel).mapN(RawConfig.apply) - } + } } diff --git a/modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/config/LoaderConfig.scala b/modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/config/LoaderConfig.scala index 3b4d290..0bfdd89 100644 --- a/modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/config/LoaderConfig.scala +++ b/modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/config/LoaderConfig.scala @@ -12,7 +12,7 @@ */ package com.snowplowanalytics.snowplow.postgres.config -import java.util.{UUID, Date} +import java.util.{Date, UUID} import java.time.Instant import scala.jdk.CollectionConverters._ @@ -30,16 +30,17 @@ import software.amazon.awssdk.regions.Region import software.amazon.kinesis.common.InitialPositionInStream case class LoaderConfig(name: String, - id: UUID, - source: Source, - host: String, - port: Int, - database: String, - username: String, - password: String, // TODO: can be EC2 store - sslMode: String, - schema: String, - purpose: Purpose) { + id: UUID, + source: Source, + host: String, + port: Int, + database: String, + username: String, + password: String, // TODO: can be EC2 store + sslMode: String, + schema: String, + purpose: Purpose +) { def getDBConfig: DBConfig = DBConfig(host, port, database, username, password, sslMode, schema) } @@ -54,12 +55,14 @@ object LoaderConfig { } sealed trait InitPosition { + /** Turn it into fs2-aws-compatible structure */ - def unwrap: Either[InitialPositionInStream, Date] = this match { - case InitPosition.Latest => InitialPositionInStream.LATEST.asLeft - case InitPosition.TrimHorizon => InitialPositionInStream.TRIM_HORIZON.asLeft - case InitPosition.AtTimestamp(date) => Date.from(date).asRight - } + def unwrap: Either[InitialPositionInStream, Date] = + this match { + case InitPosition.Latest => InitialPositionInStream.LATEST.asLeft + case InitPosition.TrimHorizon => InitialPositionInStream.TRIM_HORIZON.asLeft + case InitPosition.AtTimestamp(date) => Date.from(date).asRight + } } object InitPosition { case object Latest extends InitPosition @@ -70,7 +73,7 @@ object LoaderConfig { Decoder.decodeJson.emap { json => json.asString match { case Some("TRIM_HORIZON") => TrimHorizon.asRight - case Some("LATEST") => Latest.asRight + case Some("LATEST") => Latest.asRight case Some(other) => s"Initial position $other is unknown. Choose from LATEST and TRIM_HORIZEON. 
AT_TIMESTAMP must provide the timestamp".asLeft case None => @@ -83,7 +86,8 @@ object LoaderConfig { } yield AtTimestamp(timestamp) result match { case Some(atTimestamp) => atTimestamp.asRight - case None => "Initial position can be either LATEST or TRIM_HORIZON string or AT_TIMESTAMP object (e.g. 2020-06-03T00:00:00Z)".asLeft + case None => + "Initial position can be either LATEST or TRIM_HORIZON string or AT_TIMESTAMP object (e.g. 2020-06-03T00:00:00Z)".asLeft } } } @@ -97,8 +101,8 @@ object LoaderConfig { implicit def ioCirceConfigPurposeDecoder: Decoder[Purpose] = Decoder.decodeString.emap { case "ENRICHED_EVENTS" => Enriched.asRight - case "JSON" => SelfDescribing.asRight - case other => s"$other is not supported purpose, choose from ENRICHED_EVENTS and JSON".asLeft + case "JSON" => SelfDescribing.asRight + case other => s"$other is not supported purpose, choose from ENRICHED_EVENTS and JSON".asLeft } } diff --git a/modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/loader/Main.scala b/modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/loader/Main.scala index bc5ad34..5b980ab 100644 --- a/modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/loader/Main.scala +++ b/modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/loader/Main.scala @@ -23,7 +23,7 @@ import com.snowplowanalytics.snowplow.postgres.config.LoaderConfig.Purpose import com.snowplowanalytics.snowplow.postgres.generated.BuildInfo import com.snowplowanalytics.snowplow.postgres.resources import com.snowplowanalytics.snowplow.postgres.storage.utils -import com.snowplowanalytics.snowplow.postgres.streaming.{sink, source} +import com.snowplowanalytics.snowplow.postgres.streaming.{UnorderedPipe, sink, source} object Main extends IOApp { @@ -40,12 +40,12 @@ object Main extends IOApp { implicit val db: DB[IO] = DB.interpreter[IO](iglu.resolver, xa, logger, loaderConfig.schema) for { _ <- loaderConfig.purpose match { - case Purpose.Enriched => utils.prepare[IO](loaderConfig.schema, xa, logger) + case Purpose.Enriched => utils.prepare[IO](loaderConfig.schema, xa, logger) case Purpose.SelfDescribing => IO.unit } - goodSink = sink.goodSink[IO](state, iglu, processor) - badSink = sink.badSink[IO] - s = dataStream.observeEither(badSink, goodSink.andThen(_.through(badSink))) + badSink = sink.badSink[IO](blocker) + goodSink = sink.goodSink[IO](UnorderedPipe.forTransactor(xa), state, iglu, processor).andThen(_.through(badSink)) + s = dataStream.observeEither(badSink, goodSink) _ <- s.compile.drain } yield ExitCode.Success diff --git a/modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/streaming/source.scala b/modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/streaming/source.scala index 28e8e4b..370f8ba 100644 --- a/modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/streaming/source.scala +++ b/modules/loader/src/main/scala/com/snowplowanalytics/snowplow/postgres/streaming/source.scala @@ -17,7 +17,7 @@ import java.nio.charset.StandardCharsets import cats.implicits._ -import cats.effect.{ContextShift, Sync, ConcurrentEffect, Blocker} +import cats.effect.{Blocker, ConcurrentEffect, ContextShift, Sync} import fs2.aws.kinesis.{CommittableRecord, KinesisConsumerSettings} import fs2.aws.kinesis.consumer.readFromKinesisStream @@ -32,27 +32,25 @@ import com.snowplowanalytics.iglu.core.circe.implicits._ import com.snowplowanalytics.snowplow.analytics.scalasdk.Event import 
com.snowplowanalytics.snowplow.analytics.scalasdk.ParsingError.NotTSV import com.snowplowanalytics.snowplow.badrows.{BadRow, Payload} -import com.snowplowanalytics.snowplow.postgres.config.{LoaderConfig, Cli} -import com.snowplowanalytics.snowplow.postgres.config.LoaderConfig.Purpose +import com.snowplowanalytics.snowplow.postgres.config.{Cli, LoaderConfig} +import com.snowplowanalytics.snowplow.postgres.config.LoaderConfig.{Purpose, Source} import com.snowplowanalytics.snowplow.postgres.streaming.data.{BadData, Data} import com.google.pubsub.v1.PubsubMessage -import com.permutive.pubsub.consumer.Model.{Subscription, ProjectId} +import com.permutive.pubsub.consumer.Model.{ProjectId, Subscription} import com.permutive.pubsub.consumer.decoder.MessageDecoder object source { /** - * Acquire a stream of parsed payloads - * + * Acquire a stream of parsed payloads + * * @param blocker thread pool for pulling events (used only in PubSub) - * @param purpose kind of data, enriched or plain JSONs - * @param config source configuration - * @return either error or stream of parsed payloads - */ - def getSource[F[_]: ConcurrentEffect: ContextShift](blocker: Blocker, - purpose: Purpose, - config: LoaderConfig.Source) = + * @param purpose kind of data, enriched or plain JSONs + * @param config source configuration + * @return either error or stream of parsed payloads + */ + def getSource[F[_]: ConcurrentEffect: ContextShift](blocker: Blocker, purpose: Purpose, config: Source) = config match { case LoaderConfig.Source.Kinesis(appName, streamName, region, position) => KinesisConsumerSettings.apply(streamName, appName, region, initialPositionInStream = position.unwrap) match { @@ -66,27 +64,29 @@ object source { val project = ProjectId(projectId) val subscription = Subscription(subscriptionId) val pubsubConfig = PubsubGoogleConsumerConfig[F](onFailedTerminate = pubsubOnFailedTerminate[F]) - PubsubGoogleConsumer.subscribeAndAck[F, Either[BadData, Data]](blocker, project, subscription, pubsubErrorHandler[F], pubsubConfig).asRight + PubsubGoogleConsumer + .subscribeAndAck[F, Either[BadData, Data]](blocker, project, subscription, pubsubErrorHandler[F], pubsubConfig) + .asRight } /** - * Parse Kinesis record into a valid Loader's record, either enriched event or self-describing JSON, - * depending on purpose of the Loader - */ + * Parse Kinesis record into a valid Loader's record, either enriched event or self-describing JSON, + * depending on purpose of the Loader + */ def parseRecord(kind: Purpose, record: CommittableRecord): Either[BadData, Data] = { - val string = try { - StandardCharsets.UTF_8.decode(record.record.data()).toString.asRight[BadData] - } catch { - case _: IllegalArgumentException => - val payload = StandardCharsets.UTF_8.decode(Base64.getEncoder.encode(record.record.data())).toString - kind match { - case Purpose.Enriched => - val badRow = BadRow.LoaderParsingError(Cli.processor, NotTSV, Payload.RawPayload(payload)) - BadData.BadEnriched(badRow).asLeft - case Purpose.SelfDescribing => - BadData.BadJson(payload, "Cannot deserialize self-describing JSON from Kinesis record").asLeft - } - } + val string = + try StandardCharsets.UTF_8.decode(record.record.data()).toString.asRight[BadData] + catch { + case _: IllegalArgumentException => + val payload = StandardCharsets.UTF_8.decode(Base64.getEncoder.encode(record.record.data())).toString + kind match { + case Purpose.Enriched => + val badRow = BadRow.LoaderParsingError(Cli.processor, NotTSV, Payload.RawPayload(payload)) + 
BadData.BadEnriched(badRow).asLeft + case Purpose.SelfDescribing => + BadData.BadJson(payload, "Cannot deserialize self-describing JSON from Kinesis record").asLeft + } + } string.flatMap { payload => kind match { diff --git a/modules/loader/src/test/scala/com.snowplowanalytics.snowplow.postgres/config/CliSpec.scala b/modules/loader/src/test/scala/com.snowplowanalytics.snowplow.postgres/config/CliSpec.scala index 1aefd45..e1eaedc 100644 --- a/modules/loader/src/test/scala/com.snowplowanalytics.snowplow.postgres/config/CliSpec.scala +++ b/modules/loader/src/test/scala/com.snowplowanalytics.snowplow.postgres/config/CliSpec.scala @@ -15,9 +15,9 @@ package com.snowplowanalytics.snowplow.postgres.config import java.nio.file.Paths import java.util.UUID -import cats.effect.{IO, Clock} +import cats.effect.{Clock, IO} -import com.snowplowanalytics.snowplow.postgres.config.LoaderConfig.{Purpose, Source, InitPosition} +import com.snowplowanalytics.snowplow.postgres.config.LoaderConfig.{InitPosition, Purpose, Source} import org.specs2.mutable.Specification import software.amazon.awssdk.regions.Region @@ -47,7 +47,7 @@ class CliSpec extends Specification { val result = Cli.parse[IO](argv).value.unsafeRunSync() result must beRight.like { case Cli(config, _, false) => config must beEqualTo(expected) - case Cli(_, _, true) => ko("Unexpected debug flag") + case Cli(_, _, true) => ko("Unexpected debug flag") } } }