diff --git a/writer.go b/writer.go index d9fbf1839..ab03b011e 100644 --- a/writer.go +++ b/writer.go @@ -210,6 +210,30 @@ func (w *Writer) ReadRowsFrom(rows RowReader) (written int64, err error) { // The returned value will be nil if no schema has yet been configured on w. func (w *Writer) Schema() *Schema { return w.schema } +// KeyValueMetadata sets a key/value pair in the Parquet file metadata. +// to add to the metadata of parquet files. +// +// Keys are assumed to be unique, if the same key is repeated multiple times the +// last value is retained. While the parquet format does not require unique keys, +// this design decision was made to optimize for the most common use case where +// applications leverage this extension mechanism to associate single values to +// keys. This may create incompatibilities with other parquet libraries, or may +// cause some key/value pairs to be lost when open parquet files written with +// repeated keys. We can revisit this decision if it ever becomes a blocker. +func (w *Writer) KeyValueMetadata(key, value string) { + for i, kv := range w.writer.metadata { + if kv.Key == key { + kv.Value = value + w.writer.metadata[i] = kv + return + } + } + w.writer.metadata = append(w.writer.metadata, format.KeyValue{ + Key: key, + Value: value, + }) +} + type writer struct { buffer *bufio.Writer writer offsetTrackingWriter diff --git a/writer_test.go b/writer_test.go index 4ebe7ea54..6cfc6200c 100644 --- a/writer_test.go +++ b/writer_test.go @@ -711,3 +711,88 @@ func TestWriterMaxRowsPerRowGroup(t *testing.T) { t.Errorf("wrong number of row groups in parquet file: want=10 got=%d", len(rowGroups)) } } + +func TestKeyValueMetadata(t *testing.T) { + testKey := "test-key" + testValue := "test-value" + + type testStruct struct { + A string `parquet:"a,dict"` + } + + schema := parquet.SchemaOf(&testStruct{}) + + b := bytes.NewBuffer(nil) + w := parquet.NewWriter( + b, + schema, + ) + + err := w.Write(&testStruct{A: "test"}) + if err != nil { + t.Fatal(err) + } + + w.KeyValueMetadata(testKey, testValue) + + err = w.Close() + if err != nil { + t.Fatal(err) + } + + f, err := parquet.OpenFile(bytes.NewReader(b.Bytes()), int64(b.Len())) + if err != nil { + t.Fatal(err) + } + + value, ok := f.Lookup(testKey) + if !ok { + t.Fatalf("key/value metadata should have included %q", testKey) + } + if value != testValue { + t.Errorf("expected %q, got %q", testValue, value) + } +} + +func TestKeyValueMetadataOverwritesExisting(t *testing.T) { + testKey := "test-key" + testValue := "test-value" + + type testStruct struct { + A string `parquet:"a,dict"` + } + + schema := parquet.SchemaOf(&testStruct{}) + + b := bytes.NewBuffer(nil) + w := parquet.NewWriter( + b, + schema, + parquet.KeyValueMetadata(testKey, "original-value"), + ) + + err := w.Write(&testStruct{A: "test"}) + if err != nil { + t.Fatal(err) + } + + w.KeyValueMetadata(testKey, testValue) + + err = w.Close() + if err != nil { + t.Fatal(err) + } + + f, err := parquet.OpenFile(bytes.NewReader(b.Bytes()), int64(b.Len())) + if err != nil { + t.Fatal(err) + } + + value, ok := f.Lookup(testKey) + if !ok { + t.Fatalf("key/value metadata should have included %q", testKey) + } + if value != testValue { + t.Errorf("expected %q, got %q", testValue, value) + } +}