Skip to content
This repository has been archived by the owner on Nov 16, 2023. It is now read-only.

Allow key value metadata to be set after writing rows #399

Merged
merged 1 commit into from
Nov 7, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions writer.go
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,29 @@ func (w *Writer) ReadRowsFrom(rows RowReader) (written int64, err error) {
// The returned value will be nil if no schema has yet been configured on w.
func (w *Writer) Schema() *Schema { return w.schema }

// SetKeyValueMetadata records a key/value pair in the metadata of the Parquet
// file being written by w.
//
// Each key is stored at most once: when key is already present its value is
// overwritten in place, otherwise a new entry is appended. The parquet format
// itself does not require keys to be unique, but this writer optimizes for the
// most common use case, where applications associate a single value with each
// key. As a consequence, this may create incompatibilities with other parquet
// libraries, and some key/value pairs may be lost when opening parquet files
// that were written with repeated keys. This decision can be revisited if it
// ever becomes a blocker.
func (w *Writer) SetKeyValueMetadata(key, value string) {
	// Look for an existing entry first so that keys stay unique.
	for i := range w.writer.metadata {
		if w.writer.metadata[i].Key != key {
			continue
		}
		w.writer.metadata[i].Value = value
		return
	}

	// The key was not seen before: add it at the end of the list.
	entry := format.KeyValue{Key: key, Value: value}
	w.writer.metadata = append(w.writer.metadata, entry)
}

type writer struct {
buffer *bufio.Writer
writer offsetTrackingWriter
Expand Down
13 changes: 13 additions & 0 deletions writer_go18.go
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,19 @@ func (w *GenericWriter[T]) WriteRowGroup(rowGroup RowGroup) (int64, error) {
return w.base.WriteRowGroup(rowGroup)
}

// SetKeyValueMetadata sets a key/value pair in the Parquet file metadata by
// forwarding the call to w.base.
//
// Keys are assumed to be unique; if the same key is set multiple times, the
// last value is retained. While the parquet format does not require unique
// keys, this design decision was made to optimize for the most common use case
// where applications leverage this extension mechanism to associate single
// values to keys. This may create incompatibilities with other parquet
// libraries, or may cause some key/value pairs to be lost when opening parquet
// files written with repeated keys. We can revisit this decision if it ever
// becomes a blocker.
func (w *GenericWriter[T]) SetKeyValueMetadata(key, value string) {
w.base.SetKeyValueMetadata(key, value)
}

// ReadRowsFrom forwards rows to w.base.ReadRowsFrom and returns that call's
// results unchanged.
func (w *GenericWriter[T]) ReadRowsFrom(rows RowReader) (int64, error) {
return w.base.ReadRowsFrom(rows)
}
Expand Down
39 changes: 39 additions & 0 deletions writer_go18_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -352,3 +352,42 @@ func TestIssue375(t *testing.T) {
t.Errorf("wrong number of row groups in parquet file: want=10 got=%d", len(rowGroups))
}
}

// TestGenericSetKeyValueMetadata checks that a key/value pair set on a generic
// writer after rows were written, but before Close, can be looked up in the
// resulting parquet file.
func TestGenericSetKeyValueMetadata(t *testing.T) {
	const (
		testKey   = "test-key"
		testValue = "test-value"
	)

	type Row struct{ FirstName, LastName string }

	var output bytes.Buffer
	writer := parquet.NewGenericWriter[Row](&output, parquet.MaxRowsPerRowGroup(10))

	// Write a row before touching the metadata so the pair is set on a writer
	// that has already received data.
	if _, err := writer.Write([]Row{{FirstName: "First", LastName: "Last"}}); err != nil {
		t.Fatal(err)
	}

	writer.SetKeyValueMetadata(testKey, testValue)

	if err := writer.Close(); err != nil {
		t.Fatal(err)
	}

	// Re-open what was written and read the metadata back.
	data := output.Bytes()
	f, err := parquet.OpenFile(bytes.NewReader(data), int64(len(data)))
	if err != nil {
		t.Fatal(err)
	}

	if value, ok := f.Lookup(testKey); !ok {
		t.Fatalf("key/value metadata should have included %q", testKey)
	} else if value != testValue {
		t.Errorf("expected %q, got %q", testValue, value)
	}
}
85 changes: 85 additions & 0 deletions writer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -711,3 +711,88 @@ func TestWriterMaxRowsPerRowGroup(t *testing.T) {
t.Errorf("wrong number of row groups in parquet file: want=10 got=%d", len(rowGroups))
}
}

// TestSetKeyValueMetadata checks that a key/value pair set on a Writer after a
// row was written, but before Close, can be looked up in the resulting parquet
// file.
func TestSetKeyValueMetadata(t *testing.T) {
	const (
		testKey   = "test-key"
		testValue = "test-value"
	)

	type testStruct struct {
		A string `parquet:"a,dict"`
	}

	b := new(bytes.Buffer)
	w := parquet.NewWriter(b, parquet.SchemaOf(new(testStruct)))

	// Write a row before touching the metadata so the pair is set on a writer
	// that has already received data.
	if err := w.Write(&testStruct{A: "test"}); err != nil {
		t.Fatal(err)
	}

	w.SetKeyValueMetadata(testKey, testValue)

	if err := w.Close(); err != nil {
		t.Fatal(err)
	}

	// Re-open what was written and read the metadata back.
	data := b.Bytes()
	f, err := parquet.OpenFile(bytes.NewReader(data), int64(len(data)))
	if err != nil {
		t.Fatal(err)
	}

	if value, ok := f.Lookup(testKey); !ok {
		t.Fatalf("key/value metadata should have included %q", testKey)
	} else if value != testValue {
		t.Errorf("expected %q, got %q", testValue, value)
	}
}

// TestSetKeyValueMetadataOverwritesExisting checks that SetKeyValueMetadata
// replaces the value of a key that was already configured on the writer through
// the KeyValueMetadata option, so the file reports the most recent value.
func TestSetKeyValueMetadataOverwritesExisting(t *testing.T) {
	const (
		testKey   = "test-key"
		testValue = "test-value"
	)

	type testStruct struct {
		A string `parquet:"a,dict"`
	}

	b := new(bytes.Buffer)
	w := parquet.NewWriter(
		b,
		parquet.SchemaOf(new(testStruct)),
		// Seed the writer with a different value for the same key.
		parquet.KeyValueMetadata(testKey, "original-value"),
	)

	if err := w.Write(&testStruct{A: "test"}); err != nil {
		t.Fatal(err)
	}

	// Setting the key again must win over the value given at construction.
	w.SetKeyValueMetadata(testKey, testValue)

	if err := w.Close(); err != nil {
		t.Fatal(err)
	}

	// Re-open what was written and read the metadata back.
	data := b.Bytes()
	f, err := parquet.OpenFile(bytes.NewReader(data), int64(len(data)))
	if err != nil {
		t.Fatal(err)
	}

	if value, ok := f.Lookup(testKey); !ok {
		t.Fatalf("key/value metadata should have included %q", testKey)
	} else if value != testValue {
		t.Errorf("expected %q, got %q", testValue, value)
	}
}