diff --git a/writer.go b/writer.go index d9fbf183..00ac1dec 100644 --- a/writer.go +++ b/writer.go @@ -210,6 +210,29 @@ func (w *Writer) ReadRowsFrom(rows RowReader) (written int64, err error) { // The returned value will be nil if no schema has yet been configured on w. func (w *Writer) Schema() *Schema { return w.schema } +// SetKeyValueMetadata sets a key/value pair in the Parquet file metadata. +// +// Keys are assumed to be unique; if the same key is repeated multiple times, the +// last value is retained. While the parquet format does not require unique keys, +// this design decision was made to optimize for the most common use case where +// applications leverage this extension mechanism to associate single values with +// keys. This may create incompatibilities with other parquet libraries, or may +// cause some key/value pairs to be lost when opening parquet files written with +// repeated keys. We can revisit this decision if it ever becomes a blocker. +func (w *Writer) SetKeyValueMetadata(key, value string) { + for i, kv := range w.writer.metadata { + if kv.Key == key { + kv.Value = value + w.writer.metadata[i] = kv + return + } + } + w.writer.metadata = append(w.writer.metadata, format.KeyValue{ + Key: key, + Value: value, + }) +} + type writer struct { buffer *bufio.Writer writer offsetTrackingWriter diff --git a/writer_go18.go b/writer_go18.go index 1145894e..2219cf20 100644 --- a/writer_go18.go +++ b/writer_go18.go @@ -176,6 +176,19 @@ func (w *GenericWriter[T]) WriteRowGroup(rowGroup RowGroup) (int64, error) { return w.base.WriteRowGroup(rowGroup) } +// SetKeyValueMetadata sets a key/value pair in the Parquet file metadata. +// +// Keys are assumed to be unique; if the same key is repeated multiple times, the +// last value is retained. 
While the parquet format does not require unique keys, +// this design decision was made to optimize for the most common use case where +// applications leverage this extension mechanism to associate single values with +// keys. This may create incompatibilities with other parquet libraries, or may +// cause some key/value pairs to be lost when opening parquet files written with +// repeated keys. We can revisit this decision if it ever becomes a blocker. +func (w *GenericWriter[T]) SetKeyValueMetadata(key, value string) { + w.base.SetKeyValueMetadata(key, value) +} + func (w *GenericWriter[T]) ReadRowsFrom(rows RowReader) (int64, error) { return w.base.ReadRowsFrom(rows) } diff --git a/writer_go18_test.go b/writer_go18_test.go index 66f1b537..3233369e 100644 --- a/writer_go18_test.go +++ b/writer_go18_test.go @@ -352,3 +352,42 @@ func TestIssue375(t *testing.T) { t.Errorf("wrong number of row groups in parquet file: want=10 got=%d", len(rowGroups)) } } + +func TestGenericSetKeyValueMetadata(t *testing.T) { + testKey := "test-key" + testValue := "test-value" + + type Row struct{ FirstName, LastName string } + + output := new(bytes.Buffer) + writer := parquet.NewGenericWriter[Row](output, parquet.MaxRowsPerRowGroup(10)) + + rows := []Row{ + {FirstName: "First", LastName: "Last"}, + } + + _, err := writer.Write(rows) + if err != nil { + t.Fatal(err) + } + + writer.SetKeyValueMetadata(testKey, testValue) + + err = writer.Close() + if err != nil { + t.Fatal(err) + } + + f, err := parquet.OpenFile(bytes.NewReader(output.Bytes()), int64(output.Len())) + if err != nil { + t.Fatal(err) + } + + value, ok := f.Lookup(testKey) + if !ok { + t.Fatalf("key/value metadata should have included %q", testKey) + } + if value != testValue { + t.Errorf("expected %q, got %q", testValue, value) + } +} diff --git a/writer_test.go b/writer_test.go index 4ebe7ea5..25449cb5 100644 --- a/writer_test.go +++ b/writer_test.go @@ -711,3 +711,88 @@ func TestWriterMaxRowsPerRowGroup(t *testing.T) { 
t.Errorf("wrong number of row groups in parquet file: want=10 got=%d", len(rowGroups)) } } + +func TestSetKeyValueMetadata(t *testing.T) { + testKey := "test-key" + testValue := "test-value" + + type testStruct struct { + A string `parquet:"a,dict"` + } + + schema := parquet.SchemaOf(&testStruct{}) + + b := bytes.NewBuffer(nil) + w := parquet.NewWriter( + b, + schema, + ) + + err := w.Write(&testStruct{A: "test"}) + if err != nil { + t.Fatal(err) + } + + w.SetKeyValueMetadata(testKey, testValue) + + err = w.Close() + if err != nil { + t.Fatal(err) + } + + f, err := parquet.OpenFile(bytes.NewReader(b.Bytes()), int64(b.Len())) + if err != nil { + t.Fatal(err) + } + + value, ok := f.Lookup(testKey) + if !ok { + t.Fatalf("key/value metadata should have included %q", testKey) + } + if value != testValue { + t.Errorf("expected %q, got %q", testValue, value) + } +} + +func TestSetKeyValueMetadataOverwritesExisting(t *testing.T) { + testKey := "test-key" + testValue := "test-value" + + type testStruct struct { + A string `parquet:"a,dict"` + } + + schema := parquet.SchemaOf(&testStruct{}) + + b := bytes.NewBuffer(nil) + w := parquet.NewWriter( + b, + schema, + parquet.KeyValueMetadata(testKey, "original-value"), + ) + + err := w.Write(&testStruct{A: "test"}) + if err != nil { + t.Fatal(err) + } + + w.SetKeyValueMetadata(testKey, testValue) + + err = w.Close() + if err != nil { + t.Fatal(err) + } + + f, err := parquet.OpenFile(bytes.NewReader(b.Bytes()), int64(b.Len())) + if err != nil { + t.Fatal(err) + } + + value, ok := f.Lookup(testKey) + if !ok { + t.Fatalf("key/value metadata should have included %q", testKey) + } + if value != testValue { + t.Errorf("expected %q, got %q", testValue, value) + } +}