From 7f1cc5e98d90b025653c28d356e3eef71fce43fd Mon Sep 17 00:00:00 2001 From: miike Date: Fri, 25 Aug 2017 08:55:32 +1000 Subject: [PATCH] Schema DDL: add support for ZSTD encoding (close #237) --- .../redshift/ColumnAttribute.scala | 2 ++ .../redshift/generators/DdlGenerator.scala | 22 +++++++++---------- .../generators/EncodeSuggestions.scala | 6 ++++- .../generators/DdlGeneratorSpec.scala | 6 ++--- .../generators/MigrationGeneratorSpec.scala | 6 ++--- 5 files changed, 24 insertions(+), 18 deletions(-) diff --git a/0-common/schema-ddl/src/main/scala/com.snowplowanalytics/iglu.schemaddl/redshift/ColumnAttribute.scala b/0-common/schema-ddl/src/main/scala/com.snowplowanalytics/iglu.schemaddl/redshift/ColumnAttribute.scala index 3fa3f3d3..b916f597 100644 --- a/0-common/schema-ddl/src/main/scala/com.snowplowanalytics/iglu.schemaddl/redshift/ColumnAttribute.scala +++ b/0-common/schema-ddl/src/main/scala/com.snowplowanalytics/iglu.schemaddl/redshift/ColumnAttribute.scala @@ -69,3 +69,5 @@ case object RunLengthEncoding extends CompressionEncodingValue { def toDdl = "RU case object Text255Encoding extends CompressionEncodingValue { def toDdl = "TEXT255" } case object Text32KEncoding extends CompressionEncodingValue { def toDdl = "TEXT32K" } + +case object ZstdEncoding extends CompressionEncodingValue { def toDdl = "ZSTD"} diff --git a/0-common/schema-ddl/src/main/scala/com.snowplowanalytics/iglu.schemaddl/redshift/generators/DdlGenerator.scala b/0-common/schema-ddl/src/main/scala/com.snowplowanalytics/iglu.schemaddl/redshift/generators/DdlGenerator.scala index 7fa786af..585f5e98 100644 --- a/0-common/schema-ddl/src/main/scala/com.snowplowanalytics/iglu.schemaddl/redshift/generators/DdlGenerator.scala +++ b/0-common/schema-ddl/src/main/scala/com.snowplowanalytics/iglu.schemaddl/redshift/generators/DdlGenerator.scala @@ -85,19 +85,19 @@ object DdlGenerator { // Columns with data taken from self-describing schema private[redshift] val selfDescSchemaColumns = List( - 
Column("schema_vendor", RedshiftVarchar(128), Set(CompressionEncoding(RunLengthEncoding)), Set(Nullability(NotNull))), - Column("schema_name", RedshiftVarchar(128), Set(CompressionEncoding(RunLengthEncoding)), Set(Nullability(NotNull))), - Column("schema_format", RedshiftVarchar(128), Set(CompressionEncoding(RunLengthEncoding)), Set(Nullability(NotNull))), - Column("schema_version", RedshiftVarchar(128), Set(CompressionEncoding(RunLengthEncoding)), Set(Nullability(NotNull))) + Column("schema_vendor", RedshiftVarchar(128), Set(CompressionEncoding(ZstdEncoding)), Set(Nullability(NotNull))), + Column("schema_name", RedshiftVarchar(128), Set(CompressionEncoding(ZstdEncoding)), Set(Nullability(NotNull))), + Column("schema_format", RedshiftVarchar(128), Set(CompressionEncoding(ZstdEncoding)), Set(Nullability(NotNull))), + Column("schema_version", RedshiftVarchar(128), Set(CompressionEncoding(ZstdEncoding)), Set(Nullability(NotNull))) ) // Snowplow-specific columns private[redshift] val parentageColumns = List( Column("root_id", RedshiftChar(36), Set(CompressionEncoding(RawEncoding)), Set(Nullability(NotNull))), - Column("root_tstamp", RedshiftTimestamp, Set(CompressionEncoding(LzoEncoding)), Set(Nullability(NotNull))), - Column("ref_root", RedshiftVarchar(255), Set(CompressionEncoding(RunLengthEncoding)), Set(Nullability(NotNull))), - Column("ref_tree", RedshiftVarchar(1500), Set(CompressionEncoding(RunLengthEncoding)), Set(Nullability(NotNull))), - Column("ref_parent", RedshiftVarchar(255), Set(CompressionEncoding(RunLengthEncoding)), Set(Nullability(NotNull))) + Column("root_tstamp", RedshiftTimestamp, Set(CompressionEncoding(ZstdEncoding)), Set(Nullability(NotNull))), + Column("ref_root", RedshiftVarchar(255), Set(CompressionEncoding(ZstdEncoding)), Set(Nullability(NotNull))), + Column("ref_tree", RedshiftVarchar(1500), Set(CompressionEncoding(ZstdEncoding)), Set(Nullability(NotNull))), + Column("ref_parent", RedshiftVarchar(255), 
Set(CompressionEncoding(ZstdEncoding)), Set(Nullability(NotNull))) ) @@ -196,7 +196,7 @@ object DdlGenerator { ) // List of compression encoding suggestions - val encodingSuggestions: List[EncodingSuggestion] = List(lzoSuggestion) + val encodingSuggestions: List[EncodingSuggestion] = List(lzoSuggestion, zstdSuggestion) /** @@ -231,7 +231,7 @@ object DdlGenerator { * Takes each suggestion out of ``compressionEncodingSuggestions`` and * decide whether current properties satisfy it, then return the compression * encoding. - * If nothing suggested LZO Encoding returned as default + * If no suggestion applies, ZSTD encoding is returned as the default * * @param properties is a string we need to recognize * @param dataType redshift data type for current column @@ -247,7 +247,7 @@ object DdlGenerator { : CompressionEncoding = { suggestions match { - case Nil => CompressionEncoding(LzoEncoding) // LZO is default for user-generated + case Nil => CompressionEncoding(ZstdEncoding) // ZSTD is default for user-generated case suggestion :: tail => suggestion(properties, dataType, columnName) match { case Some(encoding) => CompressionEncoding(encoding) case None => getEncoding(properties, dataType, columnName, tail) diff --git a/0-common/schema-ddl/src/main/scala/com.snowplowanalytics/iglu.schemaddl/redshift/generators/EncodeSuggestions.scala b/0-common/schema-ddl/src/main/scala/com.snowplowanalytics/iglu.schemaddl/redshift/generators/EncodeSuggestions.scala index 55f226ec..d19892ec 100644 --- a/0-common/schema-ddl/src/main/scala/com.snowplowanalytics/iglu.schemaddl/redshift/generators/EncodeSuggestions.scala +++ b/0-common/schema-ddl/src/main/scala/com.snowplowanalytics/iglu.schemaddl/redshift/generators/EncodeSuggestions.scala @@ -29,5 +29,9 @@ object EncodeSuggestions { case _ => None } - + val zstdSuggestion: EncodingSuggestion = (properties, dataType, columnName) => + dataType match { + case RedshiftVarchar(_) => Some(ZstdEncoding) + case _ => None + } } diff --git 
a/0-common/schema-ddl/src/test/scala/com/snowplowanalytics/iglu/schemaddl/redshift/generators/DdlGeneratorSpec.scala b/0-common/schema-ddl/src/test/scala/com/snowplowanalytics/iglu/schemaddl/redshift/generators/DdlGeneratorSpec.scala index 04e2c31f..2db50910 100644 --- a/0-common/schema-ddl/src/test/scala/com/snowplowanalytics/iglu/schemaddl/redshift/generators/DdlGeneratorSpec.scala +++ b/0-common/schema-ddl/src/test/scala/com/snowplowanalytics/iglu/schemaddl/redshift/generators/DdlGeneratorSpec.scala @@ -46,8 +46,8 @@ class DdlGeneratorSpec extends Specification { def is = s2""" DdlGenerator.selfDescSchemaColumns ++ DdlGenerator.parentageColumns ++ List( - Column("foo",RedshiftVarchar(30),Set(CompressionEncoding(LzoEncoding)),Set(Nullability(NotNull))), - Column("bar",RedshiftVarchar(5),Set(CompressionEncoding(LzoEncoding)),Set()) + Column("foo",RedshiftVarchar(30),Set(CompressionEncoding(ZstdEncoding)),Set(Nullability(NotNull))), + Column("bar",RedshiftVarchar(5),Set(CompressionEncoding(ZstdEncoding)),Set()) ), Set(ForeignKeyTable(NonEmptyList("root_id"),RefTable("atomic.events",Some("event_id")))), Set(Diststyle(Key), DistKeyTable("root_id"),SortKeyTable(None,NonEmptyList("root_tstamp"))) @@ -74,7 +74,7 @@ class DdlGeneratorSpec extends Specification { def is = s2""" DdlGenerator.parentageColumns ++ List( Column("foo",RedshiftBoolean,Set(CompressionEncoding(RunLengthEncoding)),Set(Nullability(NotNull))), - Column("bar",RedshiftVarchar(5),Set(CompressionEncoding(LzoEncoding)),Set()), + Column("bar",RedshiftVarchar(5),Set(CompressionEncoding(ZstdEncoding)),Set()), Column("baz",RedshiftBoolean,Set(CompressionEncoding(RunLengthEncoding)),Set()) ), Set(ForeignKeyTable(NonEmptyList("root_id"),RefTable("atomic.events",Some("event_id")))), diff --git a/0-common/schema-ddl/src/test/scala/com/snowplowanalytics/iglu/schemaddl/redshift/generators/MigrationGeneratorSpec.scala 
b/0-common/schema-ddl/src/test/scala/com/snowplowanalytics/iglu/schemaddl/redshift/generators/MigrationGeneratorSpec.scala index a578bba9..c9659f0e 100644 --- a/0-common/schema-ddl/src/test/scala/com/snowplowanalytics/iglu/schemaddl/redshift/generators/MigrationGeneratorSpec.scala +++ b/0-common/schema-ddl/src/test/scala/com/snowplowanalytics/iglu/schemaddl/redshift/generators/MigrationGeneratorSpec.scala @@ -51,7 +51,7 @@ class MigrationGeneratorSpec extends Specification { def is = s2""" |BEGIN TRANSACTION; | | ALTER TABLE atomic.com_acme_launch_missles_1 - | ADD COLUMN "status" VARCHAR(4096) ENCODE LZO; + | ADD COLUMN "status" VARCHAR(4096) ENCODE ZSTD; | | COMMENT ON TABLE atomic.com_acme_launch_missles_1 IS 'iglu:com.acme/launch_missles/jsonschema/1-0-1'; | @@ -108,9 +108,9 @@ class MigrationGeneratorSpec extends Specification { def is = s2""" |BEGIN TRANSACTION; | | ALTER TABLE atomic.com_acme_launch_missles_1 - | ADD COLUMN "status" VARCHAR(4096) ENCODE LZO; + | ADD COLUMN "status" VARCHAR(4096) ENCODE ZSTD; | ALTER TABLE atomic.com_acme_launch_missles_1 - | ADD COLUMN "launch_time" TIMESTAMP ENCODE LZO; + | ADD COLUMN "launch_time" TIMESTAMP ENCODE ZSTD; | ALTER TABLE atomic.com_acme_launch_missles_1 | ADD COLUMN "latitude" DOUBLE PRECISION ENCODE RAW; | ALTER TABLE atomic.com_acme_launch_missles_1