Skip to content

Commit

Permalink
Implement a Wikidata identifier for Siegfried
Browse files Browse the repository at this point in the history
Wikidata contains a host of file format information. Wikidata records
provide information about format names, identifiers (PUID, LoC FDD),
file-extensions, MIMEtypes, and format identification patterns. This
means it might be quite handy as an extension to Siegfried's
capabilities.

This is the first cut of that work. We are able to harvest
information from the Wikidata Query Service, create an identifier,
and consume the information in the identifier to match file formats
using format-identification patterns taken directly from the
service.

We attempt to return new information not otherwise present in other
identifiers yet, for example, signature provenance. Provenance is
recorded in Wikidata as well as the date a format-identification
pattern was added. We try and replay this to the user to enrich
what is available to them.

This work is graciously supported by Yale University Library
(Euan Cochran, Kat Thornton) and Richard Lehane. It has been fun
trying to put this out there for folk.
  • Loading branch information
ross-spencer committed Sep 13, 2020
1 parent 8a95ae4 commit dfb579b
Show file tree
Hide file tree
Showing 41 changed files with 112,196 additions and 73 deletions.
106,636 changes: 106,636 additions & 0 deletions cmd/roy/data/wikidata/wikidata-test-definitions

Large diffs are not rendered by default.

2,195 changes: 2,195 additions & 0 deletions cmd/roy/data/wikidata/wikidata-test-definitions-small

Large diffs are not rendered by default.

62 changes: 62 additions & 0 deletions cmd/roy/harvest_wikidata.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
// Copyright 2020 Ross Spencer, Richard Lehane. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.

package main

import (
"fmt"
"io/ioutil"
"log"
"os"

"github.com/richardlehane/siegfried/pkg/config"
"github.com/ross-spencer/spargo/pkg/spargo"
)

// harvestWikidata queries the configured Wikidata query service with
// the configured SPARQL query and saves the human-readable form of the
// response to the configured Wikidata definitions path. The Wikidata
// home directory is created first if it does not already exist.
//
// Errors from creating the directory or writing the file are returned
// wrapped, so callers can still inspect the underlying cause with
// errors.Is or errors.As.
func harvestWikidata() error {
	log.Printf(
		"Roy (Wikidata): Harvesting Wikidata definitions: lang '%s'",
		config.WikidataLang(),
	)
	if err := os.MkdirAll(config.WikidataHome(), os.ModePerm); err != nil {
		return fmt.Errorf(
			"Roy (Wikidata): Error harvesting Wikidata definitions: '%w'",
			err,
		)
	}
	log.Printf(
		"Roy (Wikidata): Harvesting definitions from: '%s'",
		config.WikidataEndpoint(),
	)
	sparqlMe := spargo.SPARQLClient{}
	sparqlMe.ClientInit(config.WikidataEndpoint(), config.WikidataSPARQL())
	sparqlMe.SetUserAgent(config.UserAgent())
	// NOTE(review): the query result is written to disk without any
	// visible check that the request succeeded; confirm how spargo
	// reports a failed query.
	res := sparqlMe.SPARQLGo()
	path := config.WikidataDefinitionsPath()
	if err := ioutil.WriteFile(path, []byte(res.Human), config.WikidataFileMode()); err != nil {
		return fmt.Errorf(
			"Roy (Wikidata): Error harvesting Wikidata: '%w'",
			err,
		)
	}
	log.Printf(
		"Roy (Wikidata): Harvesting Wikidata definitions '%s' complete",
		path,
	)
	return nil
}
183 changes: 115 additions & 68 deletions cmd/roy/roy.go

Large diffs are not rendered by default.

15 changes: 15 additions & 0 deletions cmd/roy/roy_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
"github.com/richardlehane/siegfried/pkg/mimeinfo"
"github.com/richardlehane/siegfried/pkg/pronom"
"github.com/richardlehane/siegfried/pkg/sets"
wd "github.com/richardlehane/siegfried/pkg/wikidata"
)

var testhome = flag.String("home", "data", "override the default home directory")
Expand Down Expand Up @@ -66,6 +67,20 @@ func TestFreedesktop(t *testing.T) {
}
}

// TestWikidata builds a Wikidata identifier from the
// "wikidata-test-definitions-small" definitions file under the test
// home directory and checks that it can be added to a new Siegfried.
func TestWikidata(t *testing.T) {
	sf := siegfried.New()
	config.SetHome(*testhome)
	config.SetWikidataDefinitions("wikidata-test-definitions-small")
	identifier, err := wd.New(config.SetWikidataNamespace())
	if err != nil {
		t.Fatal(err)
	}
	if err := sf.Add(identifier); err != nil {
		t.Fatal(err)
	}
}

func TestPronomTikaLoc(t *testing.T) {
s := siegfried.New()
config.SetHome(*testhome)
Expand Down
6 changes: 6 additions & 0 deletions cmd/sf/sf.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ var (
name = flag.String("name", "", "provide a filename when scanning a stream e.g. sf -name myfile.txt -")
conff = flag.String("conf", "", "set the configuration file")
setconff = flag.Bool("setconf", false, "record flags used with this command in configuration file")
sourceinline = flag.Bool("sourceinline", false, "display provenance in-line (basis field) when it is available for an identifier, e.g. Wikidata")
)

var (
Expand Down Expand Up @@ -377,6 +378,11 @@ func main() {
serveFpr(config.Fpr(), s)
return
}
// present source in the basis field within the Wikidata identifier
// instead of its own field.
if *sourceinline {
config.SetWikidataSourceFieldOff()
}
// check -multi
if *multi > maxMulti || *multi < 1 || (*archive && *multi > 1) {
log.Println("[WARN] -multi must be > 0 and =< 1024. If -z, -multi must be 1. Resetting -multi to 1")
Expand Down
4 changes: 4 additions & 0 deletions cmd/sf/sf_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,10 @@ func TestSuite(t *testing.T) {
return nil
}
suite := filepath.Join(*testdata, "skeleton-suite")
_, err = os.Stat(suite)
if err != nil {
t.Fatal(err)
}
err = filepath.Walk(suite, wf)
if err != nil {
t.Fatal(err)
Expand Down
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file not shown.
1 change: 1 addition & 0 deletions cmd/sf/testdata/wikidata/wd/Q10287816.gz
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
�
Binary file added cmd/sf/testdata/wikidata/wd/Q28205479.info
Binary file not shown.
2 changes: 1 addition & 1 deletion cmd/sf/update.go
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ func getHttp(url string) ([]byte, error) {
return nil, err
}
_, timeout, transport := config.UpdateOptions()
req.Header.Add("User-Agent", "siegfried/siegbot (+https://github.com/richardlehane/siegfried)")
req.Header.Add("User-Agent", config.UserAgent())
req.Header.Add("Cache-Control", "no-cache")
timer := time.AfterFunc(timeout, func() {
transport.CancelRequest(req)
Expand Down
126 changes: 126 additions & 0 deletions cmd/sf/wikidata_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
package main

import (
"flag"
"fmt"

"os"

"path/filepath"
"strings"
"testing"

"github.com/richardlehane/siegfried"
"github.com/richardlehane/siegfried/pkg/config"
"github.com/richardlehane/siegfried/pkg/wikidata"
)

// Path components associated with the Roy command folder.
const (
	wikidataTestDefinitions    = "wikidata-test-definitions"
	wikidataDefinitionsBaseDir = "definitionsBaseDir"
)

// royTestData is the relative path from this folder to Roy's data
// folder.
var royTestData = filepath.Join("..", "roy", "data")

// Path components within the Siegfried command folder.
const (
	siegfriedTestData       = "testdata"
	wikidataTestData        = "wikidata"
	wikidataPRONOMSkeletons = "pro"
	wikidataCustomSkeletons = "wd"
)

// wikidataDefinitions is a command-line flag, defaulting to Roy's data
// folder, whose value is handed to config.SetHome by the test setup.
var wikidataDefinitions = flag.String(
	wikidataDefinitionsBaseDir,
	royTestData,
	"Creates an flag var that is compatible with the config functions...",
)

// wdSiegfried holds the Siegfried instance that setupWikidata assigns
// and that TestWikidataBasic uses to identify the sample files.
var wdSiegfried *siegfried.Siegfried

// setupWikidata assigns a new Siegfried to the package-level
// wdSiegfried and adds a Wikidata identifier to it. The identifier is
// configured from the Roy test definitions with the no-PRONOM setter
// called.
//
// opts is only inspected for nil: calling setupWikidata without options
// once wdSiegfried has already been set is reported as an error. The
// options themselves are not forwarded anywhere.
//
// It returns any error from creating the identifier or from adding it
// to Siegfried.
func setupWikidata(opts ...config.Option) error {
	if opts == nil && wdSiegfried != nil {
		return fmt.Errorf(
			"Wikidata setup options are not properly configured",
		)
	}
	wdSiegfried = siegfried.New()
	config.SetHome(*wikidataDefinitions)
	// NOTE(review): in cmd/roy/roy_test.go the value returned by
	// config.SetWikidataNamespace() is passed to the identifier's New
	// function; confirm that calling it as a bare statement here has
	// the intended effect.
	config.SetWikidataNamespace()
	config.SetWikidataDefinitions(wikidataTestDefinitions)
	config.SetWikidataNoPRONOM()
	identifier, err := wikidata.New()
	if err != nil {
		return err
	}
	// Add reports failure through its return value; pass it on rather
	// than leaving the caller with a Siegfried that has no identifier.
	return wdSiegfried.Add(identifier)
}

// identificationTests describes a single row of the table-driven
// identification tests.
type identificationTests struct {
	// fname is the sample's path below the Wikidata test data folder;
	// qid is the identifier the sample is expected to match.
	fname, qid string
	// extMatch and byteMatch state whether the basis must mention an
	// extension match or a byte match respectively; error states
	// whether an error from identification is tolerated.
	extMatch, byteMatch, error bool
}

// skeletonSamples lists the skeleton files exercised by the basic
// Wikidata test, each with the QID and the kinds of match expected.
var skeletonSamples = []identificationTests{
	{
		fname:     filepath.Join(wikidataPRONOMSkeletons, "fmt-11-signature-id-58.png"),
		qid:       "Q178051",
		extMatch:  true,
		byteMatch: true,
		error:     false,
	},
	{
		fname:     filepath.Join(wikidataPRONOMSkeletons, "fmt-279-signature-id-295.flac"),
		qid:       "Q27881556",
		extMatch:  true,
		byteMatch: true,
		error:     false,
	},
	{
		fname:     filepath.Join(wikidataCustomSkeletons, "Q10287816.gz"),
		qid:       "Q10287816",
		extMatch:  true,
		byteMatch: true,
		error:     false,
	},
	{
		fname:     filepath.Join(wikidataCustomSkeletons, "Q28205479.info"),
		qid:       "Q28205479",
		extMatch:  true,
		byteMatch: true,
		error:     false,
	},
}

// Rudimentary markers that help us determine the method of
// identification from a result's basis. "container name" could be added
// here for when we want to validate PRONOM alongside Wikidata.
const (
	extensionMatch = "extension match"
	byteMatch      = "byte match"
)

// TestWikidataBasic performs some rudimentary tests using simple
// skeleton files and the Wikidata identifier set up by setupWikidata.
//
// For every entry in skeletonSamples it identifies the file and checks
// that at most one match is returned, that the match carries the
// expected QID, and that the basis mentions an extension and/or byte
// match where the entry asks for one.
func TestWikidataBasic(t *testing.T) {
	// Nothing below can work without a configured identifier, so stop
	// immediately rather than dereferencing an unset wdSiegfried.
	if err := setupWikidata(); err != nil {
		t.Fatal(err)
	}
	for _, test := range skeletonSamples {
		path := filepath.Join(siegfriedTestData, wikidataTestData, test.fname)
		file, err := os.Open(path)
		if err != nil {
			t.Errorf("failed to open %v, got: %v", path, err)
			continue
		}
		res, err := wdSiegfried.Identify(file, path, "")
		// Close straight away instead of deferring inside the loop, so
		// that files are not all held open until the test returns. The
		// file was only read, so the close error carries no information.
		file.Close()
		if err != nil && !test.error {
			t.Fatal(err)
		}
		if len(res) == 0 {
			// An expected error may come without any result; only a
			// clean run with no matches is a failure.
			if err == nil {
				t.Errorf("No matches returned for: '%s'", path)
			}
			continue
		}
		if len(res) > 1 {
			t.Errorf("Match length greater than one: '%d'", len(res))
		}
		values := res[0].Values()
		// Index 1 is read as the identifier and index 5 as the basis.
		if len(values) < 6 {
			t.Errorf("Too few values returned for '%s': '%d'", path, len(values))
			continue
		}
		id, basis := values[1], values[5]
		if id != test.qid {
			t.Errorf(
				"QID match different than anticipated: '%s' expected '%s'",
				id,
				test.qid,
			)
		}
		if test.extMatch && !strings.Contains(basis, extensionMatch) {
			t.Errorf("Extension match not returned by identifier: %s", basis)
		}
		if test.byteMatch && !strings.Contains(basis, byteMatch) {
			t.Errorf("Byte match not returned by identifier: %s", basis)
		}
	}
}
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ require (
github.com/richardlehane/mscfb v1.0.3
github.com/richardlehane/webarchive v1.0.0
github.com/richardlehane/xmldetect v1.0.2
github.com/ross-spencer/spargo v0.0.0-20200323024642-38971d4365a7
golang.org/x/image v0.0.0-20200119044424-58c23975cae1
golang.org/x/sys v0.0.0-20200212091648-12a6c2dcc1e4
golang.org/x/text v0.3.2 // indirect
Expand Down
3 changes: 3 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@ github.com/richardlehane/webarchive v1.0.0 h1:RnmjrTFzNoenSypXrIFW0dvw8Isytv5z0N
github.com/richardlehane/webarchive v1.0.0/go.mod h1:R+J0ocBfFb0A3vzwXZwrYoPe69PJLEejLzFL5ceDroM=
github.com/richardlehane/xmldetect v1.0.2 h1:/3ooFuJwtgpMMe14/7m8a/JIvECMx6SpsPcDRiNyR8o=
github.com/richardlehane/xmldetect v1.0.2/go.mod h1:Zp1lhTLRJa2p2QKA4jOruVQYc0NFQDO0YUz3k/k6JcE=
github.com/ross-spencer/spargo v0.0.0-20200323024642-38971d4365a7 h1:G50l+RXrUyL5DE+Mj1+OOJgOR+hq8Ghf/ozx3FFcffQ=
github.com/ross-spencer/spargo v0.0.0-20200323024642-38971d4365a7/go.mod h1:5mytCwysAzmwG9GJTFD7GR8+ZrhStjTOe3krU9Rlm8c=
golang.org/dl v0.0.0-20200611200201-72429b14455f/go.mod h1:IUMfjQLJQd4UTqG1Z90tenwKoCX93Gn3MAQJMOSBsDQ=
golang.org/x/image v0.0.0-20200119044424-58c23975cae1 h1:5h3ngYt7+vXCDZCup/HkCQgW5XwmSvR/nA2JmJ0RErg=
golang.org/x/image v0.0.0-20200119044424-58c23975cae1/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0=
golang.org/x/sys v0.0.0-20200212091648-12a6c2dcc1e4 h1:sfkvUWPNGwSV+8/fNqctR5lS2AqCSqYwXdrjCxp/dXo=
Expand Down
6 changes: 5 additions & 1 deletion internal/identifier/parseable.go
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,11 @@ func inspect(p Parseable, ids ...string) (string, error) {
lines := make([]string, 0, 10)
info, ok := p.Infos()[id]
if ok {
lines = append(lines, strings.ToUpper(info.String()+" ("+id+")"))
if config.GetWikidataNamespace() != "" {
lines = append(lines, strings.ToUpper(info.String()+"QID: ("+id+")"))
} else {
lines = append(lines, strings.ToUpper(info.String()))
}
if has(gids, id) {
lines = append(lines, "globs: "+strings.Join(get(gids, gs, id), ", "))
}
Expand Down
17 changes: 14 additions & 3 deletions pkg/config/identifier.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,16 +45,19 @@ var identifier = struct {
}

// GETTERS
const emptyNamespace = ""

// Name returns the name of the identifier.
func Name() string {
switch {
case identifier.name != "":
case identifier.name != emptyNamespace:
return identifier.name
case mimeinfo.mi != "":
case mimeinfo.mi != emptyNamespace:
return mimeinfo.name
case loc.fdd != "":
case loc.fdd != emptyNamespace:
return loc.name
case GetWikidataNamespace() != emptyNamespace:
return GetWikidataNamespace()
default:
return pronom.name
}
Expand All @@ -79,6 +82,14 @@ func Details(extra ...string) string {
extra = append(extra, ContainerBase())
}
}
} else if wikidata.definitions != "" {
str = wikidata.definitions
if !wikidata.nopronom {
extra = append(extra, DroidBase())
if !identifier.noContainer {
extra = append(extra, ContainerBase())
}
}
} else {
str = DroidBase()
if !identifier.noContainer {
Expand Down
Loading

0 comments on commit dfb579b

Please sign in to comment.