Skip to content

Commit dfb579b

Browse files
committed
Implement a Wikidata identifier for Siegfried
Wikidata contains a wealth of file format information. Wikidata records provide information about format names, identifiers (PUID, LoC FDD), file extensions, MIME types, and format identification patterns. This means it might be quite handy as an extension to Siegfried's capabilities. This is the first cut of that work. We are able to harvest information from the Wikidata Query Service, create an identifier, and consume the information in the identifier to match file formats using format identification patterns taken directly from the service. We attempt to return new information not yet present in other identifiers, for example, signature provenance. Provenance is recorded in Wikidata, as is the date a format identification pattern was added. We try to replay this to the user to enrich what is available to them. This work is graciously supported by Yale University Library (Euan Cochran, Kat Thornton) and Richard Lehane. It has been fun trying to put this out there for folk.
1 parent 8a95ae4 commit dfb579b

41 files changed

Lines changed: 112196 additions & 73 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

cmd/roy/data/wikidata/wikidata-test-definitions

Lines changed: 106636 additions & 0 deletions
Large diffs are not rendered by default.

cmd/roy/data/wikidata/wikidata-test-definitions-small

Lines changed: 2195 additions & 0 deletions
Large diffs are not rendered by default.

cmd/roy/harvest_wikidata.go

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
// Copyright 2020 Ross Spencer, Richard Lehane. All rights reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
12+
// implied. See the License for the specific language governing
13+
// permissions and limitations under the License.
14+
15+
package main
16+
17+
import (
18+
"fmt"
19+
"io/ioutil"
20+
"log"
21+
"os"
22+
23+
"github.com/richardlehane/siegfried/pkg/config"
24+
"github.com/ross-spencer/spargo/pkg/spargo"
25+
)
26+
27+
// harvestWikidata will connect to the configured Wikidata query service
28+
// and save the results of the configured query to disk.
29+
func harvestWikidata() error {
30+
log.Printf(
31+
"Roy (Wikidata): Harvesting Wikidata definitions: lang '%s'",
32+
config.WikidataLang(),
33+
)
34+
err := os.MkdirAll(config.WikidataHome(), os.ModePerm)
35+
if err != nil {
36+
return fmt.Errorf(
37+
"Roy (Wikidata): Error harvesting Wikidata definitions: '%s'",
38+
err,
39+
)
40+
}
41+
log.Printf(
42+
"Roy (Wikidata): Harvesting definitions from: '%s'",
43+
config.WikidataEndpoint(),
44+
)
45+
sparqlMe := spargo.SPARQLClient{}
46+
sparqlMe.ClientInit(config.WikidataEndpoint(), config.WikidataSPARQL())
47+
sparqlMe.SetUserAgent(config.UserAgent())
48+
res := sparqlMe.SPARQLGo()
49+
path := config.WikidataDefinitionsPath()
50+
err = ioutil.WriteFile(path, []byte(res.Human), config.WikidataFileMode())
51+
if err != nil {
52+
return fmt.Errorf(
53+
"Roy (Wikidata): Error harvesting Wikidata: '%s'",
54+
err,
55+
)
56+
}
57+
log.Printf(
58+
"Roy (Wikidata): Harvesting Wikidata definitions '%s' complete",
59+
path,
60+
)
61+
return nil
62+
}

cmd/roy/roy.go

Lines changed: 115 additions & 68 deletions
Large diffs are not rendered by default.

cmd/roy/roy_test.go

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import (
1010
"github.com/richardlehane/siegfried/pkg/mimeinfo"
1111
"github.com/richardlehane/siegfried/pkg/pronom"
1212
"github.com/richardlehane/siegfried/pkg/sets"
13+
wd "github.com/richardlehane/siegfried/pkg/wikidata"
1314
)
1415

1516
var testhome = flag.String("home", "data", "override the default home directory")
@@ -66,6 +67,20 @@ func TestFreedesktop(t *testing.T) {
6667
}
6768
}
6869

70+
func TestWikidata(t *testing.T) {
71+
s := siegfried.New()
72+
config.SetHome(*testhome)
73+
config.SetWikidataDefinitions("wikidata-test-definitions-small")
74+
m, err := wd.New(config.SetWikidataNamespace())
75+
if err != nil {
76+
t.Fatal(err)
77+
}
78+
err = s.Add(m)
79+
if err != nil {
80+
t.Fatal(err)
81+
}
82+
}
83+
6984
func TestPronomTikaLoc(t *testing.T) {
7085
s := siegfried.New()
7186
config.SetHome(*testhome)

cmd/sf/sf.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ var (
6464
name = flag.String("name", "", "provide a filename when scanning a stream e.g. sf -name myfile.txt -")
6565
conff = flag.String("conf", "", "set the configuration file")
6666
setconff = flag.Bool("setconf", false, "record flags used with this command in configuration file")
67+
sourceinline = flag.Bool("sourceinline", false, "display provenance in-line (basis field) when it is available for an identifier, e.g. Wikidata")
6768
)
6869

6970
var (
@@ -377,6 +378,11 @@ func main() {
377378
serveFpr(config.Fpr(), s)
378379
return
379380
}
381+
// present source in the basis field within the Wikidata identifier
382+
// instead of its own field.
383+
if *sourceinline {
384+
config.SetWikidataSourceFieldOff()
385+
}
380386
// check -multi
381387
if *multi > maxMulti || *multi < 1 || (*archive && *multi > 1) {
382388
log.Println("[WARN] -multi must be > 0 and =< 1024. If -z, -multi must be 1. Resetting -multi to 1")

cmd/sf/sf_test.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,10 @@ func TestSuite(t *testing.T) {
126126
return nil
127127
}
128128
suite := filepath.Join(*testdata, "skeleton-suite")
129+
_, err = os.Stat(suite)
130+
if err != nil {
131+
t.Fatal(err)
132+
}
129133
err = filepath.Walk(suite, wf)
130134
if err != nil {
131135
t.Fatal(err)
28 Bytes
Loading
8 Bytes
Binary file not shown.
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
�

0 commit comments

Comments
 (0)