Skip to content

Commit

Permalink
feat(dataset.Structure): add ErrCount field to structure
Browse files Browse the repository at this point in the history
dataset.Structure now has a field to store the number of validation
errors in a given dataset. dsfs.CreateDataset now fills this out
at time of dataset creation automatically.

closes #47
  • Loading branch information
b5 committed Jan 24, 2018
1 parent e962d42 commit fbcee73
Show file tree
Hide file tree
Showing 7 changed files with 29 additions and 18 deletions.
4 changes: 4 additions & 0 deletions dsfs/dataset.go
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,10 @@ func prepareDataset(store cafs.Filestore, ds *dataset.Dataset, df cafs.File, pri
}
ds.Structure.Length = len(data)

// set error count
validationErrors := ds.Structure.Schema.ValidateBytes(data)
ds.Structure.ErrCount = len(validationErrors)

// TODO - add a dsio.RowCount function that avoids actually arranging data into rows
rr, err := dsio.NewValueReader(ds.Structure, memfs.NewMemfileBytes("data", data))
if err != nil {
Expand Down
4 changes: 2 additions & 2 deletions dsfs/dataset_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -140,8 +140,8 @@ func TestCreateDataset(t *testing.T) {
}{
{"testdata/bad/invalid_reference.json", "testdata/cities.csv", "", "", 0, "error loading dataset commit: error loading commit file: datastore: key not found"},
{"testdata/bad/invalid.json", "testdata/cities.csv", "", "", 0, "commit is required"},
{"testdata/cities.json", "testdata/cities.csv", "cities.csv", "/map/QmYsnYBrS6mcga94M792uhaBRvNZSyE9y96p9koxxnPmPo", 6, ""},
{"testdata/complete.json", "testdata/complete.csv", "complete.csv", "/map/QmRsQAKAqPLv5hiCvWKdhHQxPZxzXMc8obbULg8E8zWNqS", 13, ""},
{"testdata/cities.json", "testdata/cities.csv", "cities.csv", "/map/QmQAHgLH7biAnD3wChpfyBaz1HNUkUwzotttm6TE15smkG", 6, ""},
{"testdata/complete.json", "testdata/complete.csv", "complete.csv", "/map/QmQ2CuZ8dbKqjyaKvoQwynXgqnxPKTywojNVJ2Jpj2yb6c", 13, ""},
}

for i, c := range cases {
Expand Down
4 changes: 2 additions & 2 deletions dsfs/transform_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ func TestSaveTransform(t *testing.T) {
return
}

hash := "/map/QmNyE9y5GJUTK6q72AyXAt8KdbL3iR9koxVHgv5rFmUyha"
hash := "/map/QmbaGqpbMmdJwRS4spApiwtE4JcwqHpR7YhvbpgUx1fqXF"
if hash != key.String() {
t.Errorf("key mismatch: %s != %s", hash, key.String())
return
Expand Down Expand Up @@ -113,7 +113,7 @@ func TestSaveAbstractTransform(t *testing.T) {
return
}

hash := "/map/QmQzv5jSobdHSGTsBJnu8hGAdNx7LNCb6cczCuVCfJBvZx"
hash := "/map/QmXcMVfHAjhiahiXKEvAb9AvDahv4T7cG4FoVgkefScKEa"
if hash != key.String() {
t.Errorf("key mismatch: %s != %s", hash, key.String())
return
Expand Down
18 changes: 11 additions & 7 deletions structure.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ import (
type Structure struct {
// private storage for reference to this object
path datastore.Key
// Checksum is a base58-encoded multihash checksum of the data
// Checksum is a base58-encoded multihash checksum of the entire data
// file this structure points to. This is different from IPFS
// hashes, which are calculated after breaking the file into blocks
Checksum string `json:"checksum,omitempty"`
Expand All @@ -28,6 +28,9 @@ type Structure struct {
// Encoding specifies the character encoding.
// Assume utf-8 if not specified.
Encoding string `json:"encoding,omitempty"`
// ErrCount is the number of errors returned by validating data
// against this schema. Required.
ErrCount int `json:"errCount"`
// Entries is the number of top-level entries in the dataset. With tabular data
// this is the same as the number of rows.
// Required when structure is concrete, and must match underlying dataset.
Expand Down Expand Up @@ -97,6 +100,7 @@ type _structure struct {
Compression compression.Type `json:"compression,omitempty"`
Encoding string `json:"encoding,omitempty"`
Entries int `json:"entries,omitempty"`
ErrCount int `json:"errCount"`
Format DataFormat `json:"format"`
FormatConfig map[string]interface{} `json:"formatConfig,omitempty"`
Length int `json:"length,omitempty"`
Expand Down Expand Up @@ -125,6 +129,7 @@ func (s Structure) MarshalJSON() (data []byte, err error) {
Compression: s.Compression,
Encoding: s.Encoding,
Entries: s.Entries,
ErrCount: s.ErrCount,
Format: s.Format,
FormatConfig: opt,
Length: s.Length,
Expand Down Expand Up @@ -162,6 +167,7 @@ func (s *Structure) UnmarshalJSON(data []byte) (err error) {
Compression: _s.Compression,
Encoding: _s.Encoding,
Entries: _s.Entries,
ErrCount: _s.ErrCount,
Format: _s.Format,
FormatConfig: fmtCfg,
Length: _s.Length,
Expand All @@ -177,6 +183,7 @@ func (s *Structure) IsEmpty() bool {
s.Compression == compression.None &&
s.Encoding == "" &&
s.Entries == 0 &&
s.ErrCount == 0 &&
s.Format == UnknownDataFormat &&
s.FormatConfig == nil &&
s.Length == 0 &&
Expand Down Expand Up @@ -206,6 +213,9 @@ func (s *Structure) Assign(structures ...*Structure) {
if st.Entries != 0 {
s.Entries = st.Entries
}
if st.ErrCount != 0 {
s.ErrCount = st.ErrCount
}
if st.Format != UnknownDataFormat {
s.Format = st.Format
}
Expand Down Expand Up @@ -283,9 +293,3 @@ func base26(d int) (s string) {
}
return s
}

// func SchemaFieldNames(rs *jsonschema.RootSchema) (fn []string) {
// if itemsch, ok := rs.Validators["items"].(*jsonschema.Schema); ok {
// itemsch.Validators["items"]
// }
// }
9 changes: 5 additions & 4 deletions structure_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ func TestStrucureHash(t *testing.T) {
hash string
err error
}{
{&Structure{Qri: KindStructure, Format: CSVDataFormat}, "QmXKrm8qWRuY5HeU12Y6Ld83L9SGCxWfi4BW87a9yGpwfj", nil},
{&Structure{Qri: KindStructure, Format: CSVDataFormat}, "QmUqNTfVuJamhRfXLC1QUZ8RLaGhUaTY31ChX4GbtamW2o", nil},
}

for i, c := range cases {
Expand Down Expand Up @@ -71,6 +71,7 @@ func TestStructureIsEmpty(t *testing.T) {
{&Structure{Compression: compression.Tar}},
{&Structure{Encoding: "a"}},
{&Structure{Entries: 1}},
{&Structure{ErrCount: 1}},
{&Structure{Format: CSVDataFormat}},
{&Structure{FormatConfig: &CSVOptions{}}},
{&Structure{Length: 1}},
Expand Down Expand Up @@ -183,9 +184,9 @@ func TestStructureMarshalJSON(t *testing.T) {
out []byte
err error
}{
{&Structure{Format: CSVDataFormat}, []byte(`{"format":"csv","qri":"st:0"}`), nil},
{&Structure{Format: CSVDataFormat, Qri: KindStructure}, []byte(`{"format":"csv","qri":"st:0"}`), nil},
{AirportCodesStructure, []byte(`{"format":"csv","formatConfig":{"headerRow":true},"qri":"st:0","schema":{"items":{"items":[{"title":"ident","type":"string"},{"title":"type","type":"string"},{"title":"name","type":"string"},{"title":"latitude_deg","type":"string"},{"title":"longitude_deg","type":"string"},{"title":"elevation_ft","type":"string"},{"title":"continent","type":"string"},{"title":"iso_country","type":"string"},{"title":"iso_region","type":"string"},{"title":"municipality","type":"string"},{"title":"gps_code","type":"string"},{"title":"iata_code","type":"string"},{"title":"local_code","type":"string"}],"type":"array"},"type":"array"}}`), nil},
{&Structure{Format: CSVDataFormat}, []byte(`{"errCount":0,"format":"csv","qri":"st:0"}`), nil},
{&Structure{Format: CSVDataFormat, Qri: KindStructure}, []byte(`{"errCount":0,"format":"csv","qri":"st:0"}`), nil},
{AirportCodesStructure, []byte(`{"errCount":5,"format":"csv","formatConfig":{"headerRow":true},"qri":"st:0","schema":{"items":{"items":[{"title":"ident","type":"string"},{"title":"type","type":"string"},{"title":"name","type":"string"},{"title":"latitude_deg","type":"string"},{"title":"longitude_deg","type":"string"},{"title":"elevation_ft","type":"string"},{"title":"continent","type":"string"},{"title":"iso_country","type":"string"},{"title":"iso_region","type":"string"},{"title":"municipality","type":"string"},{"title":"gps_code","type":"string"},{"title":"iata_code","type":"string"},{"title":"local_code","type":"string"}],"type":"array"},"type":"array"}}`), nil},
}

for i, c := range cases {
Expand Down
1 change: 1 addition & 0 deletions testdata/structures/airport-codes.json
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
{
"errCount": 5,
"format": "csv",
"format_options": {
"header_row": true
Expand Down
7 changes: 4 additions & 3 deletions testdata_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,12 @@ var AirportCodesAbstract = &Dataset{
Structure: AirportCodesStructureAbstract,
}

const AirportCodesJSON = `{"commit":{"qri":"cm:0","timestamp":"0001-01-01T00:00:00Z","title":"initial commit"},"meta":{"citations":[{"name":"Our Airports","url":"http://ourairports.com/data/"}],"homePath":"http://www.ourairports.com/","license":"PDDL-1.0","qri":"md:0","title":"Airport Codes"},"qri":"ds:0","structure":{"format":"csv","formatConfig":{"headerRow":true},"qri":"st:0","schema":{"items":{"items":[{"title":"ident","type":"string"},{"title":"type","type":"string"},{"title":"name","type":"string"},{"title":"latitude_deg","type":"string"},{"title":"longitude_deg","type":"string"},{"title":"elevation_ft","type":"string"},{"title":"continent","type":"string"},{"title":"iso_country","type":"string"},{"title":"iso_region","type":"string"},{"title":"municipality","type":"string"},{"title":"gps_code","type":"string"},{"title":"iata_code","type":"string"},{"title":"local_code","type":"string"}],"type":"array"},"type":"array"}}}`
const AirportCodesJSON = `{"commit":{"qri":"cm:0","timestamp":"0001-01-01T00:00:00Z","title":"initial commit"},"meta":{"citations":[{"name":"Our Airports","url":"http://ourairports.com/data/"}],"homePath":"http://www.ourairports.com/","license":"PDDL-1.0","qri":"md:0","title":"Airport Codes"},"qri":"ds:0","structure":{"errCount":5,"format":"csv","formatConfig":{"headerRow":true},"qri":"st:0","schema":{"items":{"items":[{"title":"ident","type":"string"},{"title":"type","type":"string"},{"title":"name","type":"string"},{"title":"latitude_deg","type":"string"},{"title":"longitude_deg","type":"string"},{"title":"elevation_ft","type":"string"},{"title":"continent","type":"string"},{"title":"iso_country","type":"string"},{"title":"iso_region","type":"string"},{"title":"municipality","type":"string"},{"title":"gps_code","type":"string"},{"title":"iata_code","type":"string"},{"title":"local_code","type":"string"}],"type":"array"},"type":"array"}}}`

var AirportCodesStructure = &Structure{
Format: CSVDataFormat,
Qri: KindStructure,
ErrCount: 5,
Format: CSVDataFormat,
Qri: KindStructure,
FormatConfig: &CSVOptions{
HeaderRow: true,
},
Expand Down

0 comments on commit fbcee73

Please sign in to comment.