Skip to content

Commit

Permalink
perf(dsfs): don't calculate commit descriptions if title and message …
Browse files Browse the repository at this point in the history
…are set

diffing can be expensive, especially if we're going to throw away the results.
Add a set of byte-level checks before dropping down to diffing to ensure we
in fact have something to diff.
  • Loading branch information
b5 committed Nov 24, 2020
1 parent ad77962 commit f5ec420
Show file tree
Hide file tree
Showing 7 changed files with 139 additions and 32 deletions.
55 changes: 44 additions & 11 deletions api/testdata/expect/TestDatasetGet.test_ds.json
Expand Up @@ -5,9 +5,9 @@
"id": "QmZePf5LeXow3RW5U1AgEiNbW46YnRGhZ7HPvm1UmPFPwt"
},
"message": "created dataset from data.csv",
"path": "/mem/QmdG6Jyf8CqSbvux4eCYQNHh8ftLavjY2WbFDt1yDKGpLs",
"path": "/mem/QmdgEuXpqcUGyJ8pUCycPxBYJMpiv3xvhya6AEtxHnAzYX",
"qri": "cm:0",
"signature": "nvYDzyR4DWNk1XXGDP7imuw8WfDOgoTQfjfcKD1+u5zmuRjfgIGHb6V9xAk+awQUZ5h16wjNcDnUJ/5VtjVodsOcDuKXMJVRZ8ZD7Mh4RxaB7XPG1ZqHqfcCrF4ehlsizNt6kIShTLlJHqPbnVHlt1Z6bt501GFeU0Ddw+T9R5b6Onw8ouBxEJyjvkGhX3Bf1MZNB8CavQt+lRYoLa0i2fv/iqi6OD+5WY+Wb5KKH58RWQu57uj83ZpWqMNmlwHIDvO1wNNjwJAr29LV2IEbrKC5Q6qj63KLi3akaXUp32pEbOGOingds+xecabDT7SETI4wkYwxFSgaY0eipj3z6g==",
"signature": "aOrQREnFmMBkHv0Oai0UQ4xr3RWg5ukFWwxJIEY1rxr3o8cmdM2D8ptMMz0hLm/piev3FI1BtnYWcRB8CjO/Icxdo0q+v8rGsTcY71+2nPJPjmO+cJr+x5sa/tlSpeQSwBBQd1zNGXzjMlP/u5xge/fWTkInUfvGHX2zXGlNHGWz2tBBe6hviM/JjcXv/HhnKCWfMOn+dDZFyXXwKlj2XdzbjLvvXIK79C/TizBQBukuHle73T6gFl7KH+tr6SxHdeeUtqH31I0J1usefwFj2+r7G1ebG4Ew1gBWWLCdoTUhXrKpQ13PlglNxyYKz7cv317ly+RjOTzjECgs9FHA+g==",
"timestamp": "2001-01-01T01:01:01.000000001Z",
"title": "created dataset from data.csv"
},
Expand Down Expand Up @@ -58,35 +58,68 @@
}
},
"stats": {
"path": "/mem/QmdQz7LYFDChz3SuWjKaDEkVRymmJN1XGXgpXr6rRbMnK6",
"path": "/mem/QmZjVHiq1brwXQRKMDcc2TuE8UHhQCyTa3WMWnrYXJRUUG",
"qri": "sa:0",
"stats": [
{
"count": 5,
"frequencies": {},
"frequencies": {
"chatham": 1,
"chicago": 1,
"new york": 1,
"raleigh": 1,
"toronto": 1
},
"maxLength": 8,
"minLength": 7,
"type": "string"
"type": "string",
"unique": 5
},
{
"count": 5,
"histogram": {
"bins": null,
"frequencies": []
"bins": [
35000,
250000,
300000,
8500000,
40000000,
40000001
],
"frequencies": [
1,
1,
1,
1,
1
]
},
"max": 40000000,
"mean": 49085000,
"mean": 9817000,
"median": 300000,
"min": 35000,
"type": "numeric"
},
{
"count": 5,
"histogram": {
"bins": null,
"frequencies": []
"bins": [
44.4,
50.65,
55.5,
65.25,
66.25
],
"frequencies": [
2,
1,
1,
1
]
},
"max": 65.25,
"mean": 260.2,
"mean": 52.04,
"median": 50.65,
"min": 44.4,
"type": "numeric"
},
Expand Down
3 changes: 3 additions & 0 deletions base/component/component.go
Expand Up @@ -4,10 +4,13 @@ import (
"fmt"
"time"

logger "github.com/ipfs/go-log"
"github.com/qri-io/dataset"
"github.com/qri-io/qfs"
)

var log = logger.Logger("component")

// Component represents one of two things, either a single component (meta, body), or a collection
// of components, such as an entire dataset or a directory of files that encode components.
type Component interface {
Expand Down
95 changes: 88 additions & 7 deletions base/dsfs/commit.go
Expand Up @@ -73,10 +73,15 @@ func commitFileAddFunc(privKey crypto.PrivKey) addWriteFileFunc {
if cff, ok := wfs.body.(*computeFieldsFile); ok {
updateScriptPaths(ds, added)

if err := generateCommitTitleAndMessage(ctx, cff.fs, cff.pk, cff.ds, cff.prev, cff.bodyAct, cff.sw.FileHint, cff.sw.ForceIfNoChanges); err != nil {
log.Debugf("generateCommitTitleAndMessage: %s", err)
return nil, err
if err := confirmByteChangesExist(cff.ds, cff.prev, added, wfs.body.FullPath(), cff.sw.ForceIfNoChanges); err != nil {
return nil, fmt.Errorf("error saving: %w", err)
}

if err := ensureCommitTitleAndMessage(ctx, cff.fs, cff.pk, cff.ds, cff.prev, cff.bodyAct, cff.sw.FileHint, cff.sw.ForceIfNoChanges); err != nil {
log.Debugf("ensureCommitTitleAndMessage: %s", err)
return nil, fmt.Errorf("error saving: %w", err)
}

}

replaceComponentsWithRefs(ds, added, wfs.body.FullPath())
Expand All @@ -95,13 +100,89 @@ func commitFileAddFunc(privKey crypto.PrivKey) addWriteFileFunc {
}
}

// generateCommitTileAndMessage creates the commit and title, message
func generateCommitTitleAndMessage(ctx context.Context, fs qfs.Filesystem, privKey crypto.PrivKey, ds, prev *dataset.Dataset, bodyAct BodyAction, fileHint string, forceIfNoChanges bool) error {
log.Debugf("generateCommitTitleAndMessage bodyAct=%s", bodyAct)
// confirmByteChangesExist returns an early error if no components paths
// differ from the previous flag & we're not forcing a commit.
// if we are forcing a commit, set commit title and message values, which
// triggers a fast-path in ensureCommitTitleAndMessage
//
// keep in mind: it is possible for byte-level changes to exist, but not cause
// any alterations to dataset values, (for example: removing non-sensitive
// whitespace)
func confirmByteChangesExist(ds, prev *dataset.Dataset, added map[string]string, bodyPathName string, force bool) error {
if force {
log.Debugf("forcing changes. skipping uniqueness checks")
// fast path: forced changes ignore all comparison
if ds.Commit.Title == "" {
ds.Commit.Title = "forced update"
}
if ds.Commit.Message == "" {
ds.Commit.Message = "forced update"
}
return nil
}

if prev == nil {
return nil
}

// Viz, Readme and Transform components are inlined in the dataset, so they
// don't have path values before the commit component is finalized.
// use field equality checks instead of path comparison
if !ds.Viz.ShallowCompare(prev.Viz) {
log.Debugf("byte changes exist. viz components are inequal")
return nil
}
if !ds.Readme.ShallowCompare(prev.Readme) {
log.Debugf("byte changes exist. readme components are inequal")
return nil
}
if !ds.Transform.ShallowCompare(prev.Transform) {
log.Debugf("byte changes exist. transform components are inequal")
return nil
}

// create path map for previous, ignoring dataset & commit components which
// don't yet have paths on the next version
prevRefs := prev.PathMap("dataset", "commit")

// create an empty dataset & populate it with path references to avoid
// altering the in-flight dataset
next := &dataset.Dataset{}
replaceComponentsWithRefs(next, added, bodyPathName)
nextRefs := next.PathMap()

for key, nextPath := range nextRefs {
if prevRefs[key] != nextPath {
log.Debugf("byte changes exist. %q components are inequal", key)
return nil
}
}
// need to check previous paths in case next version is dropping components
for key, prevPath := range prevRefs {
if nextRefs[key] != prevPath {
log.Debugf("byte changes exist. %q components are inequal", key)
return nil
}
}

return fmt.Errorf("no changes")
}

// ensureCommitTitleAndMessage creates the commit and title, message, skipping
// if both title and message are set. If no values are provided a commit
// description is generated by examining changes between the two versions
func ensureCommitTitleAndMessage(ctx context.Context, fs qfs.Filesystem, privKey crypto.PrivKey, ds, prev *dataset.Dataset, bodyAct BodyAction, fileHint string, forceIfNoChanges bool) error {
if ds.Commit.Title != "" && ds.Commit.Message != "" {
log.Debugf("commit meta & title are set. skipping commit description calculation")
return nil
}

// fast path when commit and title are set
log.Debugf("ensureCommitTitleAndMessage bodyAct=%s", bodyAct)
shortTitle, longMessage, err := generateCommitDescriptions(ctx, fs, ds, prev, bodyAct, forceIfNoChanges)
if err != nil {
log.Debugf("generateCommitDescriptions err: %s", err)
return fmt.Errorf("error saving: %w", err)
return err
}

if shortTitle == defaultCreatedDescription && fileHint != "" {
Expand Down
8 changes: 1 addition & 7 deletions base/dsfs/dataset.go
Expand Up @@ -2,7 +2,6 @@ package dsfs

import (
"context"
"encoding/json"
"fmt"
"io"
"sync"
Expand Down Expand Up @@ -149,6 +148,7 @@ func CreateDataset(
if pk == nil {
return "", fmt.Errorf("private key is required to create a dataset")
}

if err := DerefDataset(ctx, source, ds); err != nil {
log.Debugf("dereferencing dataset components: %s", err)
return "", err
Expand Down Expand Up @@ -341,12 +341,6 @@ func emptyFile(fullPath string) qfs.File {
return qfs.NewMemfileBytes(fullPath, []byte{})
}

func jsonWriteHook(filename string, data json.Marshaler) qfs.WriteHook {
return func(ctx context.Context, f qfs.File, pathMap map[string]string) (io.Reader, error) {
return JSONFile(filename, data)
}
}

func updateScriptPaths(ds *dataset.Dataset, added map[string]string) {
for filepath, addr := range added {
switch filepath {
Expand Down
2 changes: 0 additions & 2 deletions base/save_test.go
Expand Up @@ -142,7 +142,6 @@ func TestCreateDataset(t *testing.T) {
ds.Name = dsName
ds.Meta.Title = "an update"
ds.PreviousPath = createdDs.Path
ds.Structure = createdDs.Structure
ds.SetBodyFile(qfs.NewMemfileBytes("/body.json", []byte("[]")))

createdDs, err = CreateDataset(ctx, r, r.Filesystem().DefaultWriteFS(), ds, createdDs, SaveSwitches{Pin: true, ShouldRender: true})
Expand All @@ -161,7 +160,6 @@ func TestCreateDataset(t *testing.T) {
// reliance on CreateDataset needing the ds.Name field.
ds.Name = dsName
ds.PreviousPath = createdDs.Path
ds.Structure = createdDs.Structure
ds.SetBodyFile(qfs.NewMemfileBytes("/body.json", []byte("[]")))

if createdDs, err = CreateDataset(ctx, r, r.Filesystem().DefaultWriteFS(), ds, createdDs, SaveSwitches{Pin: true, ShouldRender: true}); err == nil {
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Expand Up @@ -41,7 +41,7 @@ require (
github.com/olekukonko/tablewriter v0.0.4
github.com/pkg/errors v0.9.1
github.com/qri-io/dag v0.2.2-0.20201110155527-8fad5beb70f5
github.com/qri-io/dataset v0.2.1-0.20201120145927-556268c0b08c
github.com/qri-io/dataset v0.2.1-0.20201124144731-82162a0f76e6
github.com/qri-io/deepdiff v0.2.1-0.20200807143746-d02d9f531f5b
github.com/qri-io/doggos v0.1.0
github.com/qri-io/ioes v0.1.1
Expand Down
6 changes: 2 additions & 4 deletions go.sum
Expand Up @@ -1126,8 +1126,8 @@ github.com/qri-io/compare v0.1.0 h1:A/MRx3uEnJ/iMjfJY1VOqH9CYs9zFSEYaFVeXuGfmis=
github.com/qri-io/compare v0.1.0/go.mod h1:i/tVuDGRXVxhuZ8ZUieF23u6rQ6wLGJl7KKWpoMRaTE=
github.com/qri-io/dag v0.2.2-0.20201110155527-8fad5beb70f5 h1:xeMaT6fLTvdrFOOP2N2+x68EL/uZrm4XC0zs1vjLiXo=
github.com/qri-io/dag v0.2.2-0.20201110155527-8fad5beb70f5/go.mod h1:1AwOy3yhcZTAXzaF4wGSdnrp87u3PBOrsWXUjOtQCXo=
github.com/qri-io/dataset v0.2.1-0.20201120145927-556268c0b08c h1:AC+6NiYlhbR3YmM/hTvcU1bmn1UKyDAy6DlA40KTv0Y=
github.com/qri-io/dataset v0.2.1-0.20201120145927-556268c0b08c/go.mod h1:HtwGskdCECbOON0iVQHEEm6fykwDqharlqabc1ssj3Y=
github.com/qri-io/dataset v0.2.1-0.20201124144731-82162a0f76e6 h1:a9CYZQ+DCzwqg8BgEN5oKboBoxueaYf0EKPnXeR/Mhk=
github.com/qri-io/dataset v0.2.1-0.20201124144731-82162a0f76e6/go.mod h1:HtwGskdCECbOON0iVQHEEm6fykwDqharlqabc1ssj3Y=
github.com/qri-io/deepdiff v0.2.1-0.20200807143746-d02d9f531f5b h1:T8qEIv+qLi5mVWvSS329wJ+HbN7cfMwCWjRVzh/+upo=
github.com/qri-io/deepdiff v0.2.1-0.20200807143746-d02d9f531f5b/go.mod h1:NrL/b7YvexgpGb4HEO3Rlx5RrMLDfxuKDf/XDAq5ac0=
github.com/qri-io/doggos v0.1.0 h1:B7Hn9ssRGDAonMhJ4UwDtPDmG9GtvLR8f7VFec7Rs7M=
Expand All @@ -1141,8 +1141,6 @@ github.com/qri-io/jsonpointer v0.1.1/go.mod h1:DnJPaYgiKu56EuDp8TU5wFLdZIcAnb/uH
github.com/qri-io/jsonschema v0.2.0 h1:is8lirh3HYwTkC0e+4jL/vWEHwzPLojnl4FWkUoeEPU=
github.com/qri-io/jsonschema v0.2.0/go.mod h1:g7DPkiOsK1xv6T/Ao5scXRkd+yTFygcANPBaaqW+VrI=
github.com/qri-io/qfs v0.1.1-0.20200623175200-23690284798a/go.mod h1:NP4Va1b0/V4nbyPdX3FmZ4cJTPatn0d5FZF5GBxcIJA=
github.com/qri-io/qfs v0.5.0 h1:B6nhA9ndyttTsemo7NikigE9GcbUFZQ+oKfMqj4+Qdw=
github.com/qri-io/qfs v0.5.0/go.mod h1:NP4Va1b0/V4nbyPdX3FmZ4cJTPatn0d5FZF5GBxcIJA=
github.com/qri-io/qfs v0.5.1-0.20201119141805-fb13393b2d1f h1:Oj2N/HRGMcs2NDZWl1K5gFsTifrxQVVE0KyFyUGgn8M=
github.com/qri-io/qfs v0.5.1-0.20201119141805-fb13393b2d1f/go.mod h1:SU+DUq8+BfHNod1SXzmD8FrNLgPt42aKyQuO3fnFEQI=
github.com/qri-io/starlib v0.4.2 h1:ZGzmzT9fOqdluezcwhAZAbTn/v6kMg1tC6ALVjQPhpQ=
Expand Down

0 comments on commit f5ec420

Please sign in to comment.