Skip to content
This repository has been archived by the owner on Aug 13, 2019. It is now read-only.

sort symbols in order of frequency rather than lexicographically #280

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions block.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ import (
type IndexWriter interface {
// AddSymbols registers all string symbols that are encountered in series
// and other indices.
AddSymbols(sym map[string]struct{}) error
AddSymbols(sym map[string]int) error

// AddSeries populates the index writer with a series and its offsets
// of chunks that the index can reference.
Expand All @@ -63,7 +63,7 @@ type IndexWriter interface {
type IndexReader interface {
// Symbols returns a set of string symbols that may occur in series' labels
// and indices.
Symbols() (map[string]struct{}, error)
Symbols() (map[string]int, error)

// LabelValues returns the possible label values.
LabelValues(names ...string) (index.StringTuples, error)
Expand Down Expand Up @@ -405,7 +405,7 @@ type blockIndexReader struct {
b *Block
}

func (r blockIndexReader) Symbols() (map[string]struct{}, error) {
func (r blockIndexReader) Symbols() (map[string]int, error) {
s, err := r.ir.Symbols()
return s, errors.Wrapf(err, "block: %s", r.b.Meta().ULID)
}
Expand Down
4 changes: 2 additions & 2 deletions compact.go
Original file line number Diff line number Diff line change
Expand Up @@ -590,7 +590,7 @@ func (c *LeveledCompactor) populateBlock(blocks []BlockReader, meta *BlockMeta,

var (
set ChunkSeriesSet
allSymbols = make(map[string]struct{}, 1<<16)
allSymbols = make(map[string]int, 1<<16)
closers = []io.Closer{}
)
defer func() { closeAll(closers...) }()
Expand Down Expand Up @@ -619,7 +619,7 @@ func (c *LeveledCompactor) populateBlock(blocks []BlockReader, meta *BlockMeta,
return errors.Wrap(err, "read symbols")
}
for s := range symbols {
allSymbols[s] = struct{}{}
allSymbols[s] = symbols[s]
}

all, err := indexr.Postings(index.AllPostingsKey())
Expand Down
22 changes: 11 additions & 11 deletions head.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ type Head struct {
series *stripeSeries

symMtx sync.RWMutex
symbols map[string]struct{}
symbols map[string]int
values map[string]stringset // label names to possible values

postings *index.MemPostings // postings lists for terms
Expand Down Expand Up @@ -229,7 +229,7 @@ func NewHead(r prometheus.Registerer, l log.Logger, wal *wal.WAL, chunkRange int
maxTime: math.MinInt64,
series: newStripeSeries(),
values: map[string]stringset{},
symbols: map[string]struct{}{},
symbols: make(map[string]int),
postings: index.NewUnorderedMemPostings(),
tombstones: newMemTombstones(),
}
Expand Down Expand Up @@ -897,12 +897,12 @@ func (h *Head) gc() {
h.postings.Delete(deleted)

// Rebuild symbols and label value indices from what is left in the postings terms.
symbols := make(map[string]struct{})
symbols := make(map[string]int)
values := make(map[string]stringset, len(h.values))

if err := h.postings.Iter(func(t labels.Label, _ index.Postings) error {
symbols[t.Name] = struct{}{}
symbols[t.Value] = struct{}{}
symbols[t.Name]++
symbols[t.Value]++

ss, ok := values[t.Name]
if !ok {
Expand Down Expand Up @@ -1046,14 +1046,14 @@ func (h *headIndexReader) Close() error {
return nil
}

func (h *headIndexReader) Symbols() (map[string]struct{}, error) {
func (h *headIndexReader) Symbols() (map[string]int, error) {
h.head.symMtx.RLock()
defer h.head.symMtx.RUnlock()

res := make(map[string]struct{}, len(h.head.symbols))
res := make(map[string]int, len(h.head.symbols))

for s := range h.head.symbols {
res[s] = struct{}{}
for s, num := range h.head.symbols {
res[s] = num
}
return res, nil
}
Expand Down Expand Up @@ -1202,8 +1202,8 @@ func (h *Head) getOrCreateWithID(id, hash uint64, lset labels.Labels) (*memSerie
}
valset.set(l.Value)

h.symbols[l.Name] = struct{}{}
h.symbols[l.Value] = struct{}{}
h.symbols[l.Name]++
h.symbols[l.Value]++
}

return s, true
Expand Down
12 changes: 6 additions & 6 deletions head_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -212,12 +212,12 @@ func TestHead_Truncate(t *testing.T) {
testutil.Assert(t, postingsB2 == nil, "")
testutil.Assert(t, postingsC1 == nil, "")

testutil.Equals(t, map[string]struct{}{
"": {}, // from 'all' postings list
"a": {},
"b": {},
"1": {},
"2": {},
testutil.Equals(t, map[string]int{
"": 2, // from 'all' postings list
"a": 2,
"b": 1,
"1": 2,
"2": 1,
}, h.symbols)

testutil.Equals(t, map[string]stringset{
Expand Down
40 changes: 29 additions & 11 deletions index/index.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,17 @@ func (s indexWriterSeriesSlice) Less(i, j int) bool {
return labels.Compare(s[i].labels, s[j].labels) < 0
}

// symbolFrequencyPair couples a symbol string with the frequency count
// recorded for it.
type symbolFrequencyPair struct {
	symbol    string // the symbol text
	frequency int    // the count associated with the symbol
}

// symbolFrequencylist is a slice of symbol/frequency pairs that offers
// index-based helpers for ordering its elements by frequency.
type symbolFrequencylist []symbolFrequencyPair

// Len reports how many pairs the list holds.
func (l symbolFrequencylist) Len() int {
	return len(l)
}

// Swap exchanges the pairs stored at positions i and j.
func (l symbolFrequencylist) Swap(i, j int) {
	tmp := l[i]
	l[i] = l[j]
	l[j] = tmp
}

// Greater reports whether the pair at position i has a strictly higher
// frequency than the pair at position j. Symbols are not compared.
func (l symbolFrequencylist) Greater(i, j int) bool {
	return l[j].frequency < l[i].frequency
}

type indexWriterStage uint8

const (
Expand Down Expand Up @@ -368,17 +379,24 @@ func (w *Writer) AddSeries(ref uint64, lset labels.Labels, chunks ...chunks.Meta
return nil
}

func (w *Writer) AddSymbols(sym map[string]struct{}) error {
func (w *Writer) AddSymbols(sym map[string]int) error {
if err := w.ensureStage(idxStageSymbols); err != nil {
return err
}
// Generate the list of symbols, sorted by descending frequency, that we will store as the reference table.
symbols := make([]string, 0, len(sym))
symbols := make(symbolFrequencylist, 0, len(sym))

for s := range sym {
symbols = append(symbols, s)
for k, v := range sym {
symbols = append(symbols, symbolFrequencyPair{k, v})
}
sort.Strings(symbols)
sort.Slice(symbols, func(i, j int) bool {
// The input is a map, whose iteration order is random, so break
// frequency ties by symbol to keep the written order deterministic.
if symbols[i].frequency == symbols[j].frequency {
return symbols[i].symbol > symbols[j].symbol
}
return symbols.Greater(i, j)
})

w.buf1.reset()
w.buf2.reset()
Expand All @@ -388,8 +406,8 @@ func (w *Writer) AddSymbols(sym map[string]struct{}) error {
w.symbols = make(map[string]uint32, len(symbols))

for index, s := range symbols {
w.symbols[s] = uint32(index)
w.buf2.putUvarintStr(s)
w.symbols[s.symbol] = uint32(index)
w.buf2.putUvarintStr(s.symbol)
}

w.buf1.putBE32int(w.buf2.len())
Expand Down Expand Up @@ -812,14 +830,14 @@ func (r *Reader) lookupSymbol(o uint32) (string, error) {
}

// Symbols returns a set of symbols that exist within the index.
func (r *Reader) Symbols() (map[string]struct{}, error) {
res := make(map[string]struct{}, len(r.symbolsV1)+len(r.symbolsV2))
func (r *Reader) Symbols() (map[string]int, error) {
res := make(map[string]int, len(r.symbolsV1)+len(r.symbolsV2))

for _, s := range r.symbolsV1 {
res[s] = struct{}{}
res[s] = 0
}
for _, s := range r.symbolsV2 {
res[s] = struct{}{}
res[s] = 0
}
return res, nil
}
Expand Down
62 changes: 52 additions & 10 deletions index/index_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -189,13 +189,13 @@ func TestIndexRW_Postings(t *testing.T) {
labels.FromStrings("a", "1", "b", "4"),
}

err = iw.AddSymbols(map[string]struct{}{
"a": {},
"b": {},
"1": {},
"2": {},
"3": {},
"4": {},
err = iw.AddSymbols(map[string]int{
"a": 1,
"b": 2,
"1": 1,
"2": 4,
"3": 5,
"4": 3,
})
testutil.Ok(t, err)

Expand Down Expand Up @@ -232,6 +232,48 @@ func TestIndexRW_Postings(t *testing.T) {
testutil.Ok(t, ir.Close())
}

// TestIndexRW_SymbolsOrder writes a symbol table through the index writer and
// checks the order in which the symbols are read back from the file: symbols
// with a higher frequency come first, and symbols with equal frequency are
// expected in descending symbol order ("c" before "a").
func TestIndexRW_SymbolsOrder(t *testing.T) {
	dir, err := ioutil.TempDir("", "test_index_order")
	testutil.Ok(t, err)
	defer os.RemoveAll(dir)

	fn := filepath.Join(dir, "index")

	iw, err := NewWriter(fn)
	testutil.Ok(t, err)

	err = iw.AddSymbols(map[string]int{
		"a": 1,
		"b": 2,
		"c": 1,
		"2": 4,
		"3": 5,
		"4": 3,
	})

	testutil.Ok(t, err)
	testutil.Ok(t, iw.Close())

	// Frequencies 5, 4, 3, 2, then the two symbols tied at 1.
	exp := []string{"3", "2", "4", "b", "c", "a"}

	ir, err := NewFileReader(fn)
	testutil.Ok(t, err)

	toc, err := NewTOCFromByteSlice(ir.b)
	testutil.Ok(t, err)

	// Decode the symbol table directly from the raw bytes at the TOC offset.
	ir.symbolsV2, ir.symbolsV1, err = ReadSymbols(ir.b, ir.version, int(toc.Symbols))
	testutil.Ok(t, err)

	// Expected value goes first, matching the other testutil.Equals calls, so
	// that a failure labels the expected and actual values correctly.
	testutil.Equals(t, len(exp), len(ir.symbolsV2))

	for i := range ir.symbolsV2 {
		testutil.Equals(t, exp[i], ir.symbolsV2[i])
	}

	testutil.Ok(t, ir.Close())
}

func TestPersistence_index_e2e(t *testing.T) {
dir, err := ioutil.TempDir("", "test_persistence_e2e")
testutil.Ok(t, err)
Expand All @@ -243,11 +285,11 @@ func TestPersistence_index_e2e(t *testing.T) {
// Sort labels as the index writer expects series in sorted order.
sort.Sort(labels.Slice(lbls))

symbols := map[string]struct{}{}
symbols := make(map[string]int)
for _, lset := range lbls {
for _, l := range lset {
symbols[l.Name] = struct{}{}
symbols[l.Value] = struct{}{}
symbols[l.Name] = 0
symbols[l.Value] = 0
}
}

Expand Down
2 changes: 1 addition & 1 deletion mocks_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ type mockIndexWriter struct {
series []seriesSamples
}

func (mockIndexWriter) AddSymbols(sym map[string]struct{}) error { return nil }
func (mockIndexWriter) AddSymbols(sym map[string]int) error { return nil }
func (m *mockIndexWriter) AddSeries(ref uint64, l labels.Labels, chunks ...chunks.Meta) error {
i := -1
for j, s := range m.series {
Expand Down
10 changes: 5 additions & 5 deletions querier_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1293,20 +1293,20 @@ type mockIndex struct {
series map[uint64]series
labelIndex map[string][]string
postings map[labels.Label][]uint64
symbols map[string]struct{}
symbols map[string]int
}

func newMockIndex() mockIndex {
ix := mockIndex{
series: make(map[uint64]series),
labelIndex: make(map[string][]string),
postings: make(map[labels.Label][]uint64),
symbols: make(map[string]struct{}),
symbols: make(map[string]int),
}
return ix
}

func (m mockIndex) Symbols() (map[string]struct{}, error) {
func (m mockIndex) Symbols() (map[string]int, error) {
return m.symbols, nil
}

Expand All @@ -1315,8 +1315,8 @@ func (m mockIndex) AddSeries(ref uint64, l labels.Labels, chunks ...chunks.Meta)
return errors.Errorf("series with reference %d already added", ref)
}
for _, lbl := range l {
m.symbols[lbl.Name] = struct{}{}
m.symbols[lbl.Value] = struct{}{}
m.symbols[lbl.Name] = 0
m.symbols[lbl.Value] = 0
}

s := series{l: l}
Expand Down