Skip to content

Commit

Permalink
Improve siva index generation in ReadWriter
Browse files Browse the repository at this point in the history
When using a ReadWriter the index in regenerated each time Index is
called. These are the steps to generate one usable index:

* Create new index merging index from file and current changes
* Remove duplicates
* Sort index by position

Also, to find a file in the index it is walked until a match is found.
This needs to be done each time a file has to be opened.

For small number of files this is OK but when a repo has a lot or
references the time spent here can be a lot.

Now there's a new index type called OrderedIndex that stores the
IndexEntries in lexicographic order. This allows to do binary searches
for faster file location and also makes possible update the index
instead of regenerating it each time.

Signed-off-by: Javi Fontan <jfontan@gmail.com>
  • Loading branch information
jfontan committed Oct 16, 2018
1 parent a31824b commit fd4c675
Show file tree
Hide file tree
Showing 4 changed files with 186 additions and 5 deletions.
117 changes: 114 additions & 3 deletions index.go
Original file line number Diff line number Diff line change
Expand Up @@ -161,13 +161,25 @@ func (i *Index) WriteTo(w io.Writer) error {
return nil
}

// Len implements sort.Interface.
func (s Index) Len() int { return len(s) }
// Swap implements sort.Interface.
func (s Index) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
// Len reports the number of entries in the index. It is part of
// sort.Interface.
func (s Index) Len() int {
	return len(s)
}

// Swap exchanges the entries at positions i and j. It is part of
// sort.Interface.
func (s Index) Swap(i, j int) {
	s[j], s[i] = s[i], s[j]
}

// Less reports whether the entry at i has a smaller absStart than the entry
// at j, so sorting an Index orders it by absolute start offset. It is part of
// sort.Interface.
func (s Index) Less(i, j int) bool {
	return s[j].absStart > s[i].absStart
}

// Filter returns a filtered copy of the index: duplicates are removed keeping
// the latest version of each file, deleted files are dropped, and the result
// is sorted with sort.Sort (that is, by absStart, see Less).
func (i *Index) Filter() Index {
	filtered := i.filter()

	// filter itself does not sort; ordering is applied here.
	sort.Sort(filtered)
	return filtered
}

func (i *Index) filter() Index {
var f Index

seen := make(map[string]bool)
Expand All @@ -186,7 +198,6 @@ func (i *Index) Filter() Index {
f = append(f, e)
}

sort.Sort(f)
return f
}

Expand Down Expand Up @@ -235,6 +246,106 @@ func (i Index) Glob(pattern string) ([]*IndexEntry, error) {
return matches, nil
}

// OrderedIndex is a specialized index kept in lexicographic order by entry
// name. It has methods to add or delete IndexEntries while maintaining that
// order, and it also has a faster Find method.
type OrderedIndex Index

// Pos returns the position of the first entry whose Name is greater than or
// equal to path. That is the position of path itself when it is present, and
// otherwise the position where it would have to be inserted to keep the index
// ordered. The result is len(o) when path sorts after every entry, and 0 for
// an empty index. The index must already be ordered by Name.
func (o OrderedIndex) Pos(path string) int {
	// sort.Search over zero elements yields 0, so an empty index needs no
	// special handling.
	return sort.Search(len(o), func(n int) bool {
		return o[n].Name >= path
	})
}

// Update applies e to the index and returns the result: an entry carrying
// FlagDeleted removes the entry with the same Name, any other entry is added
// (or replaces the existing one). A nil e returns the index unchanged.
func (o OrderedIndex) Update(e *IndexEntry) OrderedIndex {
	switch {
	case e == nil:
		return o
	case e.Flags&FlagDeleted != 0:
		return o.Delete(e.Name)
	default:
		return o.Add(e)
	}
}

// Add returns the index with e placed at its lexicographic position. If an
// entry with the same Name already exists it is replaced in place; otherwise
// e is inserted and the following entries are shifted one slot to the right.
// A nil e returns the index unchanged.
//
// The receiver's backing array may be modified, so callers must keep using
// the returned slice rather than the receiver.
func (o OrderedIndex) Add(e *IndexEntry) OrderedIndex {
	if e == nil {
		return o
	}

	pos := o.Pos(e.Name)
	if pos < len(o) && o[pos].Name == e.Name {
		o[pos] = e
		return o
	}

	// Grow by one slot, shift the tail right and drop e into the gap. This
	// covers the empty index and the append-at-end case as well, and avoids
	// building a temporary slice for every insertion.
	o = append(o, nil)
	copy(o[pos+1:], o[pos:])
	o[pos] = e

	return o
}

// Delete returns the index with the entry named path removed. When no entry
// has that name (including when the index is empty) the index is returned
// unchanged.
//
// Removal shifts the following entries down inside the receiver's backing
// array, so callers must keep using the returned slice rather than the
// receiver.
func (o OrderedIndex) Delete(path string) OrderedIndex {
	pos := o.Pos(path)

	// Pos returns len(o) when path sorts after every entry. That case is a
	// miss too; without this check o[pos+1:] below would panic with a slice
	// bounds out of range error.
	if pos >= len(o) || o[pos].Name != path {
		return o
	}

	return append(o[:pos], o[pos+1:]...)
}

// Find returns the IndexEntry whose Name equals path, or nil when there is
// none. It locates the entry through Pos instead of walking the whole index,
// which makes it faster than Index.Find.
func (o OrderedIndex) Find(path string) *IndexEntry {
	idx := o.Pos(path)

	// idx is only a candidate slot: it may be past the end or hold a
	// different name when path is not in the index.
	if idx < 0 || idx >= len(o) || o[idx].Name != path {
		return nil
	}

	return o[idx]
}

// Sort reorders the index in place so that entries are in lexicographic
// order of their Name.
func (o OrderedIndex) Sort() {
	sort.Sort(o)
}

// Len reports the number of entries in the index. It is part of
// sort.Interface.
func (o OrderedIndex) Len() int {
	return len(o)
}

// Swap exchanges the entries at positions i and j. It is part of
// sort.Interface.
func (o OrderedIndex) Swap(i, j int) {
	o[j], o[i] = o[i], o[j]
}

// Less reports whether the Name of entry i sorts before the Name of entry j.
// It is part of sort.Interface.
func (o OrderedIndex) Less(i, j int) bool {
	return o[j].Name > o[i].Name
}

type IndexEntry struct {
Header
Start uint64
Expand Down
7 changes: 6 additions & 1 deletion readwriter.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,17 @@ func NewReaderWriter(rw io.ReadWriteSeeker) (*ReadWriter, error) {
}

w := newWriter(rw)
w.oIndex = OrderedIndex(i.filter())
w.oIndex.Sort()

getIndexFunc := func() (Index, error) {
for _, e := range w.index {
e.absStart = uint64(end) + e.Start
}
return append(i, w.index...), nil

return Index(w.oIndex), nil
}

r := newReaderWithIndex(rw, getIndexFunc)
return &ReadWriter{r, w}, nil
}
64 changes: 63 additions & 1 deletion readwriter_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,15 @@ func (s *ReadWriterSuite) testWriteRead(c *C, f *os.File, iter int) {

index, err := rw.Index()
c.Assert(err, IsNil)
c.Assert(len(index), Equals, iters*iter+i+1)

// index after the first iteration will contain the total amount
// of files
num := i + 1
if iter > 0 {
num = iters
}

c.Assert(len(index), Equals, num)

e := index.Find(curName)
c.Assert(e, NotNil)
Expand Down Expand Up @@ -172,3 +180,57 @@ func (_ dummyReadWriterSeeker) Write(p []byte) (n int, err error) {
func (_ dummyReadWriterSeeker) Seek(offset int64, whence int) (n int64, err error) {
return
}

// TestDelete writes a sequence of regular and FlagDeleted headers through a
// ReadWriter and, after each flush, checks that the index returned by Index
// holds exactly the expected names in the expected order.
func (s *ReadWriterSuite) TestDelete(c *C) {
	const payload = "data"

	tmpFile, err := os.Create(filepath.Join(s.tmpDir, c.TestName()))
	c.Assert(err, IsNil)
	c.Assert(tmpFile, NotNil)

	rw, err := siva.NewReaderWriter(tmpFile)
	c.Assert(err, IsNil)

	// Each step writes one header (optionally flagged as deleted) and lists
	// the names the index must contain afterwards.
	steps := []struct {
		name     string
		deleted  bool
		expected []string
	}{
		{"one", false, []string{"one"}},
		{"two", false, []string{"one", "two"}},
		{"three", false, []string{"one", "three", "two"}},
		{"two", true, []string{"one", "three"}},
		{"two", false, []string{"one", "three", "two"}},
		{"four", true, []string{"one", "three", "two"}},
		{"three", true, []string{"one", "two"}},
	}

	for _, step := range steps {
		header := &siva.Header{Name: step.name}
		if step.deleted {
			header.Flags = siva.FlagDeleted
		}

		err := rw.WriteHeader(header)
		c.Assert(err, IsNil)

		written, err := rw.Write([]byte(payload))
		c.Assert(err, IsNil)
		c.Assert(written, Equals, len(payload))

		err = rw.Flush()
		c.Assert(err, IsNil)

		index, err := rw.Index()
		c.Assert(err, IsNil)

		c.Assert(len(index), Equals, len(step.expected))
		for pos := range step.expected {
			c.Assert(index[pos].Name, Equals, step.expected[pos])
		}
	}
}
3 changes: 3 additions & 0 deletions writer.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ type Writer interface {
type writer struct {
w *hashedWriter
index Index
oIndex OrderedIndex
current *IndexEntry
position uint64
closed bool
Expand Down Expand Up @@ -49,6 +50,8 @@ func (w *writer) WriteHeader(h *Header) error {
}

w.index = append(w.index, w.current)
w.oIndex = w.oIndex.Update(w.current)

return nil
}

Expand Down

0 comments on commit fd4c675

Please sign in to comment.