Merge 6242489 into 8b259d8
yangzai committed Jul 13, 2021
2 parents 8b259d8 + 6242489 commit d1c4998
Showing 4 changed files with 261 additions and 0 deletions.
1 change: 1 addition & 0 deletions build.sbt
@@ -62,6 +62,7 @@ lazy val joda = module(project) in file("modules/joda")
lazy val magnolia = module(project) in file("modules/magnolia") dependsOn `generic-base`
lazy val `scala-xml` = module(project) in file("modules/scala-xml")
lazy val scalaz = module(project) in file("modules/scalaz")
lazy val spark = module(project) in file("modules/spark")
lazy val squants = module(project) in file("modules/squants")
lazy val sttp = module(project) in file("modules/sttp")
lazy val yaml = module(project) in file("modules/yaml")
161 changes: 161 additions & 0 deletions modules/spark/README.md
@@ -0,0 +1,161 @@
# Spark module for PureConfig

Adds support for selected [Spark](http://spark.apache.org/) classes to PureConfig.

## Add pureconfig-spark to your project

In addition to [core PureConfig](https://github.com/pureconfig/pureconfig), you'll need:

```scala
libraryDependencies += "com.github.pureconfig" %% "pureconfig-spark" % "0.16.0"
```

Also, `pureconfig-spark` depends on `spark-sql` with `provided` scope, since Spark libraries are generally provided at runtime; you will need to supply your own Spark dependency.
This module has been tested on Spark 3, but it should also work with Spark 2.4, since the basic data type APIs should be the same.
Please note that only Scala 2.12 is supported, across all Spark versions.
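
For example, an application build might declare both dependencies like this (a sketch; the Spark version shown is only an illustration, so match whatever your cluster provides):

```scala
libraryDependencies ++= Seq(
  "com.github.pureconfig" %% "pureconfig-spark" % "0.16.0",
  // Spark is "provided": needed to compile, but supplied by the cluster at runtime.
  "org.apache.spark"      %% "spark-sql"        % "3.1.2" % "provided"
)
```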

To use the Spark module you need to import:
```scala
import pureconfig.module.spark._
```

## Supported classes

* `org.apache.spark.sql.types.DataType`
* `org.apache.spark.sql.types.StructType`
* `org.apache.spark.sql.types.Metadata`
* `org.apache.spark.sql.types.StructField` (derivable)
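
As a quick illustration of the `DataType` support, a single Spark type can be read from the same kind of type strings used in the schema examples below. This is a hypothetical sketch: the `ColumnConfig` case class is made up for this example.

```scala
import org.apache.spark.sql.types.DataType
import pureconfig._
import pureconfig.generic.auto._
import pureconfig.module.spark.sql._

// Hypothetical config shape: a single column described by a name and a Spark type.
// `data-type` maps to `dataType` via PureConfig's default kebab-case naming.
case class ColumnConfig(name: String, dataType: DataType)

val columnRes = ConfigSource.string("""{ name: salary, data-type: "decimal(10,2)" }""").load[ColumnConfig]
// Expected to be Right(ColumnConfig("salary", DecimalType(10,2))), assuming the reader
// accepts the same type strings as the examples below.
```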

## Example

### Custom HOCON schema to/from Spark schema
Set up custom schema case classes and converters between the custom schema and the Spark schema.
```scala
import org.apache.spark.sql.types._
import pureconfig._
import pureconfig.generic.auto._
import pureconfig.module.spark.sql._

case class MySchema(name: String, fields: List[StructField], someOtherSetting: Option[String])

def mySchemaToSparkSchema(schema: MySchema): StructType =
  StructType(schema.fields)

def sparkSchemaToMySchema(name: String, schema: StructType): MySchema =
  MySchema(name, schema.fields.toList, None)
```

Convert the custom schema to a Spark schema and back. The resulting string schema should match the original source.
```scala
val mySchemaRes = ConfigSource.string(
  """name: Employee,
    |fields: [
    | { name: name, data-type: string }, #types are case-insensitive and some types have variations/truncations
    | { name: age, data-type: integer }, #also note that `nullable` and `metadata` are optional fields with Spark defaults
    | { name: salary, data-type: "decimal(6,2)" },
    | { name: address, data-type: "line1 string, line2 string" } #outer `struct` is optional
    |]
    |""".stripMargin).load[MySchema]
// mySchemaRes: ConfigReader.Result[MySchema] = Right(
//   MySchema(
//     "Employee",
//     List(
//       StructField("name", StringType, true, {}),
//       StructField("age", IntegerType, true, {}),
//       StructField("salary", DecimalType(6, 2), true, {}),
//       StructField(
//         "address",
//         StructType(
//           StructField("line1", StringType, true, {}),
//           StructField("line2", StringType, true, {})
//         ),
//         true,
//         {}
//       )
//     ),
//     None
//   )
// )

val sparkSchemaRes = mySchemaRes.map(mySchemaToSparkSchema)
// sparkSchemaRes: Either[error.ConfigReaderFailures, StructType] = Right(
//   StructType(
//     StructField("name", StringType, true, {}),
//     StructField("age", IntegerType, true, {}),
//     StructField("salary", DecimalType(6, 2), true, {}),
//     StructField(
//       "address",
//       StructType(
//         StructField("line1", StringType, true, {}),
//         StructField("line2", StringType, true, {})
//       ),
//       true,
//       {}
//     )
//   )
// )

val mySchemaRes2 =
  for {
    mySchema <- mySchemaRes
    sparkSchema <- sparkSchemaRes
  } yield sparkSchemaToMySchema(mySchema.name, sparkSchema)
// mySchemaRes2: Either[error.ConfigReaderFailures, MySchema] = Right(
//   MySchema(
//     "Employee",
//     List(
//       StructField("name", StringType, true, {}),
//       StructField("age", IntegerType, true, {}),
//       StructField("salary", DecimalType(6, 2), true, {}),
//       StructField(
//         "address",
//         StructType(
//           StructField("line1", StringType, true, {}),
//           StructField("line2", StringType, true, {})
//         ),
//         true,
//         {}
//       )
//     ),
//     None
//   )
// )

val stringSchemaRes = mySchemaRes2.map(ConfigWriter[MySchema].to)
// stringSchemaRes: Either[error.ConfigReaderFailures, com.typesafe.config.ConfigValue] = Right(
//   SimpleConfigObject({"fields":[{"data-type":"STRING","metadata":"{}","name":"name","nullable":true},{"data-type":"INT","metadata":"{}","name":"age","nullable":true},{"data-type":"DECIMAL(6,2)","metadata":"{}","name":"salary","nullable":true},{"data-type":"STRUCT<`line1`: STRING, `line2`: STRING>","metadata":"{}","name":"address","nullable":true}],"name":"Employee"})
// )
```
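
If you need the schema back as plain text (for example, to write it to a file), the written `ConfigValue` can be rendered with the underlying Typesafe Config API. A small sketch:

```scala
import com.typesafe.config.ConfigRenderOptions

// Render the ConfigValue produced above as formatted JSON-style text.
val renderedRes = stringSchemaRes.map(_.render(ConfigRenderOptions.concise().setFormatted(true)))
```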

### Full schema encoded as HOCON String field to/from Spark schema
You can also read Spark schemas directly as `StructType` instead of narrowing `DataType` yourself.
```scala
case class Config(schema: StructType)
val configRes = ConfigSource.string(
  """
    |schema = "a int, b string, c struct<c1:int,c2:double>"
    |""".stripMargin).load[Config]
// configRes: ConfigReader.Result[Config] = Right(
//   Config(
//     StructType(
//       StructField("a", IntegerType, true, {}),
//       StructField("b", StringType, true, {}),
//       StructField(
//         "c",
//         StructType(
//           StructField("c1", IntegerType, true, {}),
//           StructField("c2", DoubleType, true, {})
//         ),
//         true,
//         {}
//       )
//     )
//   )
// )

val stringSchemaRes2 = configRes.map(ConfigWriter[Config].to)
// stringSchemaRes2: Either[error.ConfigReaderFailures, com.typesafe.config.ConfigValue] = Right(
//   SimpleConfigObject({"schema":"STRUCT<`a`: INT, `b`: STRING, `c`: STRUCT<`c1`: INT, `c2`: DOUBLE>>"})
// )
```
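
Once loaded, the `StructType` can be used anywhere Spark expects a schema. A hypothetical end-to-end sketch, assuming a local `SparkSession` and a `people.csv` file (neither is part of this module):

```scala
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .master("local[*]")
  .appName("pureconfig-spark-example")
  .getOrCreate()

// Fail fast on configuration errors, then hand the schema to a DataFrame reader.
val df = configRes.fold(
  failures => sys.error(failures.prettyPrint()),
  config => spark.read.schema(config.schema).csv("people.csv")
)
```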
17 changes: 17 additions & 0 deletions modules/spark/build.sbt
@@ -0,0 +1,17 @@
import Dependencies.Version._

name := "pureconfig-spark"

crossScalaVersions := Seq(scala212) //Spark does not support Scala 2.13 yet

libraryDependencies ++= Seq("org.apache.spark" %% "spark-sql" % "3.1.2" % "provided")
mdocLibraryDependencies ++= Seq("org.apache.spark" %% "spark-sql" % "3.1.2")

osgiSettings

OsgiKeys.exportPackage := Seq("pureconfig.module.spark.*")
OsgiKeys.privatePackage := Seq()
OsgiKeys.importPackage := Seq(
s"""scala.*;version="[${scalaBinaryVersion.value}.0,${scalaBinaryVersion.value}.50)"""",
"*"
)
82 changes: 82 additions & 0 deletions modules/spark/docs/README.md
@@ -0,0 +1,82 @@
# Spark module for PureConfig

Adds support for selected [Spark](http://spark.apache.org/) classes to PureConfig.

## Add pureconfig-spark to your project

In addition to [core PureConfig](https://github.com/pureconfig/pureconfig), you'll need:

```scala
libraryDependencies += "com.github.pureconfig" %% "pureconfig-spark" % "@VERSION@"
```

Also, `pureconfig-spark` depends on `spark-sql` with `provided` scope, since Spark libraries are generally provided at runtime; you will need to supply your own Spark dependency.
This module has been tested on Spark 3, but it should also work with Spark 2.4, since the basic data type APIs should be the same.
Please note that only Scala 2.12 is supported, across all Spark versions.

To use the Spark module you need to import:
```scala
import pureconfig.module.spark._
```

## Supported classes

* `org.apache.spark.sql.types.DataType`
* `org.apache.spark.sql.types.StructType`
* `org.apache.spark.sql.types.Metadata`
* `org.apache.spark.sql.types.StructField` (derivable)

## Example

### Custom HOCON schema to/from Spark schema
Set up custom schema case classes and converters between the custom schema and the Spark schema.
```scala mdoc:silent
import org.apache.spark.sql.types._
import pureconfig._
import pureconfig.generic.auto._
import pureconfig.module.spark.sql._

case class MySchema(name: String, fields: List[StructField], someOtherSetting: Option[String])

def mySchemaToSparkSchema(schema: MySchema): StructType =
  StructType(schema.fields)

def sparkSchemaToMySchema(name: String, schema: StructType): MySchema =
  MySchema(name, schema.fields.toList, None)
```

Convert the custom schema to a Spark schema and back. The resulting string schema should match the original source.
```scala mdoc
val mySchemaRes = ConfigSource.string(
  """name: Employee,
    |fields: [
    | { name: name, data-type: string }, #types are case-insensitive and some types have variations/truncations
    | { name: age, data-type: integer }, #also note that `nullable` and `metadata` are optional fields with Spark defaults
    | { name: salary, data-type: "decimal(6,2)" },
    | { name: address, data-type: "line1 string, line2 string" } #outer `struct` is optional
    |]
    |""".stripMargin).load[MySchema]

val sparkSchemaRes = mySchemaRes.map(mySchemaToSparkSchema)

val mySchemaRes2 =
  for {
    mySchema <- mySchemaRes
    sparkSchema <- sparkSchemaRes
  } yield sparkSchemaToMySchema(mySchema.name, sparkSchema)

val stringSchemaRes = mySchemaRes2.map(ConfigWriter[MySchema].to)
```

### Full schema encoded as HOCON String field to/from Spark schema
You can also read Spark schemas directly as `StructType` instead of narrowing `DataType` yourself.
```scala mdoc
case class Config(schema: StructType)
val configRes = ConfigSource.string(
  """
    |schema = "a int, b string, c struct<c1:int,c2:double>"
    |""".stripMargin).load[Config]

val stringSchemaRes2 = configRes.map(ConfigWriter[Config].to)
```
