Skip to content
Merged

V1 3 #57

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
323 changes: 214 additions & 109 deletions README.md

Large diffs are not rendered by default.

33 changes: 22 additions & 11 deletions build.sbt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
ThisBuild / organization := "com.phasmidsoftware"
ThisBuild / version := "1.2.6"
ThisBuild / version := "1.3.0"
ThisBuild / scalaVersion := "2.13.17"
ThisBuild / scalacOptions ++= Seq("-encoding", "UTF-8", "-unchecked", "-deprecation")
ThisBuild / scalacOptions ++= Seq("-java-output-version", "17")
Expand Down Expand Up @@ -54,6 +54,26 @@ lazy val cats = project.dependsOn(core).settings(
)
)

lazy val parquet = project.dependsOn(core).settings(
name := "tableparser-parquet",
libraryDependencies ++= Seq(
"org.apache.parquet" % "parquet-column" % "1.15.2",
"org.apache.parquet" % "parquet-hadoop" % "1.15.2",
"org.apache.hadoop" % "hadoop-common" % "3.4.1" % "provided",
"org.apache.hadoop" % "hadoop-mapreduce-client-core" % "3.4.1" % Test,
"org.scalatest" %% "scalatest" % scalaTestVersion % Test
)
)

lazy val spark = project.dependsOn(core).settings(
name := "tableparser-spark",
libraryDependencies ++= Seq(
"org.apache.spark" %% "spark-sql" % sparkVersion % "provided",
"org.slf4j" % "slf4j-simple" % "2.0.17" % Test,
"org.scalatest" %% "scalatest" % scalaTestVersion % Test
)
)

lazy val zio = project.dependsOn(core).settings(
name := "tableparser-zio",
libraryDependencies ++= Seq(
Expand All @@ -68,17 +88,8 @@ lazy val zio = project.dependsOn(core).settings(
)
)

lazy val spark = project.dependsOn(core).settings(
name := "tableparser-spark",
libraryDependencies ++= Seq(
"org.apache.spark" %% "spark-sql" % sparkVersion % "provided",
"org.slf4j" % "slf4j-simple" % "2.0.17" % Test,
"org.scalatest" %% "scalatest" % scalaTestVersion % Test
)
)

lazy val root = (project in file("."))
.aggregate(core, cats, zio, spark)
.aggregate(core, cats, parquet, spark, zio)
.settings(
name := "TableParser",
publish / skip := true
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ object TableCrypt {
* @tparam A the cipher algorithm (for which there must be evidence of HexEncryption[A]).
* @param csvAttributes implicit value of CsvAttributes.
*/
@deprecated("Use writeCSVFileEncrypted(Path) instead", "1.3.0")
def writeCSVFileEncrypted[A: HexEncryption, Row](table: Table[Row])(file: File)(implicit renderer: CsvRenderer[Row], generator: CsvGenerator[Row], hasKey: HasKey[Row], csvAttributes: CsvAttributes): Unit =
CsvTableEncryptedFileRenderer[Row, A](file).render(table)
}
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import scala.util.Try

/**
* This example of table parsing is based on the Kaggle data set:
* [[https://www.kaggle.com/datasets/marshuu/crimes-in-uk-2023?select=2023-01-metropolitan-street.csv]]
* [[https://www.kaggle.com/datasets/marshuu/crimes-in-uk-2023?select=2023-01-metropolitan-street.csv Metropolitan Crime Data]]
*
* The file under resources is an edited version of the Metropolitan Crime Statistics 2023-01 (only the first 5,000 rows)
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -73,5 +73,9 @@ object ColumnHelper {
/**
* Precede each upper case letter (or digit) with _.
*/
val camelToSnakeCaseColumnNameMapper: String => String = _.replaceAll("([A-Z\\d])", "_$1")
val camelToSnakeCaseColumnNameMapper: String => String =
_.replaceAll("([A-Z\\d])", "_$1")

val camelToSnakeCaseColumnNameMapperLower: String => String =
camelToSnakeCaseColumnNameMapper andThen (_.toLowerCase)
}
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ package com.phasmidsoftware.tableparser.core.parse

import java.io.File
import java.net.URL
import java.nio.file.Path
import org.joda.time.LocalDate
import scala.annotation.implicitNotFound
import scala.util.parsing.combinator.JavaTokenParsers
Expand Down Expand Up @@ -290,6 +291,23 @@ object Parseable {

implicit object ParseableURL extends ParseableURL

/**
* A trait representing a path that can be parsed from a String.
*
* This type class extends `Parseable[java.nio.file.Path]`, enabling the parsing of `String` inputs
* into `java.nio.file.Path` objects. It provides a default implementation of the `parse`
* method, leveraging utility methods such as `lift` and `parseAndRecover` to handle parsing
* and recovery in case of errors.
*/
trait ParseablePath extends Parseable[Path] {
def parse(s: String, optModifier: Option[String]): Try[Path] =
parseAndRecover(s)(lift(java.nio.file.Paths.get(_)))(
w => s"ParseablePath: cannot interpret '$w' as a Path"
)
}

implicit object ParseablePath extends ParseablePath

/**
* Parser of File.
*/
Expand Down Expand Up @@ -452,6 +470,8 @@ object ParseableOption {

implicit object ParseableOptionFile extends ParseableOption[File]

implicit object ParseableOptionPath extends ParseableOption[Path]

}

/**
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
package com.phasmidsoftware.tableparser.core.parse

import com.phasmidsoftware.tableparser.core.parse.TableParser.includeAll
import com.phasmidsoftware.tableparser.core.table.Header
import scala.util.Try

/**
* Minimal trait expressing the table-building contract, independent
* of how the input source is read.
*
* TableParser extends this for string-based sources.
* ParquetTableParser extends this directly for Parquet sources.
* Future source types (JDBC ResultSet, JSON, etc.) can do the same.
*
* @tparam Table the table type to be built.
*/
trait TableBuilder[Table] {

/**
* The row type.
*/
type Row

/**
* Method to construct a Table based on the given iterator of rows and the given header.
*
* @param rows an iterator of Row objects representing the data rows.
* @param header a Header object representing the table's column headers.
* @return the constructed Table based on the input rows and header.
*/
protected def builder(rows: Iterator[Row], header: Header): Table

/**
* If true, individual row failures are logged but do not
* cause the overall parse to fail.
*/
protected val forgiving: Boolean = false

/**
* Predicate to filter rows. Defaults to including all rows.
*/
protected val predicate: Try[Row] => Boolean = includeAll
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

package com.phasmidsoftware.tableparser.core.parse

import com.phasmidsoftware.tableparser.core.parse.TableParser.includeAll
import com.phasmidsoftware.tableparser.core.table._
import com.phasmidsoftware.tableparser.core.util.{Joinable, TeeIterator, TryUsing}
import org.slf4j.{Logger, LoggerFactory}
Expand All @@ -19,24 +18,18 @@ import scala.util.{Failure, Random, Success, Try}
* @tparam Table the Table type.
*/
@implicitNotFound(msg = "Cannot find an implicit instance of TableParser[${Table}]. Typically, you should define an instance of StringTableParser or StringsTableParser.")
trait TableParser[Table] {

/**
* The row type.
*/
type Row

trait TableParser[Table] extends TableBuilder[Table] {
/**
* The input type, typically `String` or `Strings`.
*/
type Input

/**
* This variable determines if there is a programmed, i.e., fixed, header for the parser.
* If its value is None, it signifies that we must look to the first line(s) of data
* for an appropriate header.
* Method to define a row parser.
*
* @return a RowParser[Row, Input].
*/
protected val maybeHeader: Option[Header] = None
val rowParser: RowParser[Row, Input]

/**
* This indicates the number of header rows which must be read from the input.
Expand All @@ -48,20 +41,11 @@ trait TableParser[Table] {
val headerRowsToRead: Int = 1

/**
* Method to construct a Table based on the provided rows and header.
*
* @param rows an iterator of Row objects representing the data rows.
* @param header a Header object representing the table's column headers.
* @return the constructed Table based on the input rows and header.
*/
protected def builder(rows: Iterator[Row], header: Header): Table

/**
* Method to determine how errors are handled.
*
* @return true if individual errors are logged but do not cause parsing to fail.
* This variable determines if there is a programmed, i.e., fixed, header for the parser.
* If its value is None, it signifies that we must look to the first line(s) of data
* for an appropriate header.
*/
protected val forgiving: Boolean = false
protected val maybeHeader: Option[Header] = None

/**
* Value to determine whether it is acceptable to have a quoted string span more than one line.
Expand All @@ -71,20 +55,7 @@ trait TableParser[Table] {
protected val multiline: Boolean = false

/**
* Function to determine whether or not a row should be included in the table.
* Typically used for random sampling.
*/
protected val predicate: Try[Row] => Boolean = includeAll

/**
* Method to define a row parser.
*
* @return a RowParser[Row, Input].
*/
val rowParser: RowParser[Row, Input]

/**
* Method to parse a table based on a sequence of Inputs.
* Method to parse a table based on an iterator of Inputs.
*
* @param xs the sequence of Inputs, one for each row
* @param n the number of rows to drop (length of the header).
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import com.phasmidsoftware.tableparser.core.parse.{StringList, Strings}
import com.phasmidsoftware.tableparser.core.table._
import com.phasmidsoftware.tableparser.core.write.Writable
import java.io.{File, FileWriter}
import java.nio.file.Path
import org.joda.time.LocalDate
import scala.reflect.ClassTag
import scala.util.Try
Expand Down Expand Up @@ -167,8 +168,14 @@ case class CsvTableStringRenderer[T]()(implicit z1: CsvRenderer[T], z2: CsvGener
*
* TODO merge this with CsvTableEncryptedFileRenderer to avoid duplicate code.
*
* @param file the file to which the table will be written.
* @param path the path to which the table will be written.
* @param csvAttributes implicit instance of CsvAttributes.
 * @tparam T the type of object to be rendered, must provide evidence of CsvRenderer[T] and CsvGenerator[T].
*/
case class CsvTableFileRenderer[T: CsvRenderer : CsvGenerator](file: File)(implicit csvAttributes: CsvAttributes) extends CsvTableRenderer[T, FileWriter]()(implicitly[CsvRenderer[T]], implicitly[CsvGenerator[T]], Writable.fileWritable(file))
case class CsvTableFileRenderer[T: CsvRenderer : CsvGenerator](path: Path)(implicit csvAttributes: CsvAttributes) extends CsvTableRenderer[T, FileWriter]()(implicitly[CsvRenderer[T]], implicitly[CsvGenerator[T]], Writable.fileWritable(path.toFile))

object CsvTableFileRenderer {
@deprecated("Use apply(path: Path) instead.", "1.3.0")
def apply[T: CsvRenderer : CsvGenerator](file: File)(implicit csvAttributes: CsvAttributes): CsvTableFileRenderer[T] =
CsvTableFileRenderer(file.toPath)(implicitly[CsvRenderer[T]], implicitly[CsvGenerator[T]], csvAttributes)
}
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,7 @@ object Statistics {
* resource file path before running the application.
*/
object Main extends App {
// doMain(FP.resource[Crime]("2023-01-metropolitan-street.csv"))
doMain(FP.resource[Crime]("2023-01-metropolitan-street-sample.csv"))

/**
Expand Down
Loading