Skip to content
This repository has been archived by the owner on Nov 16, 2023. It is now read-only.

Commit

Permalink
Allow key value metadata to be set after writing rows
Browse files Browse the repository at this point in the history
  • Loading branch information
tschaub committed Nov 3, 2022
1 parent ef6062d commit 0be014a
Show file tree
Hide file tree
Showing 2 changed files with 108 additions and 0 deletions.
23 changes: 23 additions & 0 deletions writer.go
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,29 @@ func (w *Writer) ReadRowsFrom(rows RowReader) (written int64, err error) {
// The returned value will be nil if no schema has yet been configured on w.
func (w *Writer) Schema() *Schema { return w.schema }

// KeyValueMetadata sets a key/value pair in the Parquet file metadata.
//
// Keys are assumed to be unique, if the same key is repeated multiple times the
// last value is retained. While the parquet format does not require unique keys,
// this design decision was made to optimize for the most common use case where
// applications leverage this extension mechanism to associate single values to
// keys. This may create incompatibilities with other parquet libraries, or may
// cause some key/value pairs to be lost when open parquet files written with
// repeated keys. We can revisit this decision if it ever becomes a blocker.
func (w *Writer) KeyValueMetadata(key, value string) {
for i, kv := range w.writer.metadata {
if kv.Key == key {
kv.Value = value
w.writer.metadata[i] = kv
return
}
}
w.writer.metadata = append(w.writer.metadata, format.KeyValue{
Key: key,
Value: value,
})
}

type writer struct {
buffer *bufio.Writer
writer offsetTrackingWriter
Expand Down
85 changes: 85 additions & 0 deletions writer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -711,3 +711,88 @@ func TestWriterMaxRowsPerRowGroup(t *testing.T) {
t.Errorf("wrong number of row groups in parquet file: want=10 got=%d", len(rowGroups))
}
}

func TestKeyValueMetadata(t *testing.T) {
testKey := "test-key"
testValue := "test-value"

type testStruct struct {
A string `parquet:"a,dict"`
}

schema := parquet.SchemaOf(&testStruct{})

b := bytes.NewBuffer(nil)
w := parquet.NewWriter(
b,
schema,
)

err := w.Write(&testStruct{A: "test"})
if err != nil {
t.Fatal(err)
}

w.KeyValueMetadata(testKey, testValue)

err = w.Close()
if err != nil {
t.Fatal(err)
}

f, err := parquet.OpenFile(bytes.NewReader(b.Bytes()), int64(b.Len()))
if err != nil {
t.Fatal(err)
}

value, ok := f.Lookup(testKey)
if !ok {
t.Fatalf("key/value metadata should have included %q", testKey)
}
if value != testValue {
t.Errorf("expected %q, got %q", testValue, value)
}
}

func TestKeyValueMetadataOverwritesExisting(t *testing.T) {
testKey := "test-key"
testValue := "test-value"

type testStruct struct {
A string `parquet:"a,dict"`
}

schema := parquet.SchemaOf(&testStruct{})

b := bytes.NewBuffer(nil)
w := parquet.NewWriter(
b,
schema,
parquet.KeyValueMetadata(testKey, "original-value"),
)

err := w.Write(&testStruct{A: "test"})
if err != nil {
t.Fatal(err)
}

w.KeyValueMetadata(testKey, testValue)

err = w.Close()
if err != nil {
t.Fatal(err)
}

f, err := parquet.OpenFile(bytes.NewReader(b.Bytes()), int64(b.Len()))
if err != nil {
t.Fatal(err)
}

value, ok := f.Lookup(testKey)
if !ok {
t.Fatalf("key/value metadata should have included %q", testKey)
}
if value != testValue {
t.Errorf("expected %q, got %q", testValue, value)
}
}

0 comments on commit 0be014a

Please sign in to comment.