Skip to content

Commit

Permalink
internetarchive: add support for Metadata
Browse files Browse the repository at this point in the history
  • Loading branch information
Lesmiscore committed Jul 8, 2022
1 parent b4d847c commit 42dfadf
Show file tree
Hide file tree
Showing 3 changed files with 208 additions and 17 deletions.
196 changes: 180 additions & 16 deletions backend/internetarchive/internetarchive.go
Expand Up @@ -38,6 +38,84 @@ func init() {
Name: "internetarchive",
Description: "Internet Archive",
NewFs: NewFs,

MetadataInfo: &fs.MetadataInfo{
System: map[string]fs.MetadataHelp{
"name": {
Help: "Full file path, without the bucket part",
Type: "filename",
Example: "backend/internetarchive/internetarchive.go",
},
"source": {
Help: "The source of the file",
Type: "string",
Example: "original",
},
"mtime": {
Help: "Time of last modification, managed by Rclone",
Type: "RFC 3339",
Example: "2006-01-02T15:04:05.999999999Z",
},
"size": {
Help: "File size in bytes",
Type: "decimal number",
Example: "123456",
},
"md5": {
Help: "MD5 hash calculated by Internet Archive",
Type: "string",
Example: "01234567012345670123456701234567",
},
"crc32": {
Help: "CRC32 calculated by Internet Archive",
Type: "string",
Example: "01234567",
},
"sha1": {
Help: "SHA1 hash calculated by Internet Archive",
Type: "string",
Example: "0123456701234567012345670123456701234567",
},
"format": {
Help: "Name of format identified by Internet Archive",
Type: "string",
Example: "Comma-Separated Values",
},
"old_version": {
Help: "Whether the file was replaced and moved by keep-old-version flag",
Type: "boolean",
Example: "true",
},
"viruscheck": {
Help: "The last time viruscheck process was run for the file (?)",
Type: "unixtime",
Example: "1654191352",
},

"rclone-ia-mtime": {
Help: "Time of last modification, managed by Internet Archive",
Type: "RFC 3339",
Example: "2006-01-02T15:04:05.999999999Z",
},
"rclone-mtime": {
Help: "Time of last modification, managed by Rclone",
Type: "RFC 3339",
Example: "2006-01-02T15:04:05.999999999Z",
},
"rclone-update-track": {
Help: "Random value used by Rclone for tracking changes inside Internet Archive",
Type: "string",
Example: "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
},
},
Help: `Metadata fields provided by Internet Archive.
If there are multiple values for a key, only the first one is returned.
This is a limitation of Rclone, that supports one value per one key.
Owner is able to add custom keys. Metadata feature grabs all the keys including them.
`,
},

Options: []fs.Option{{
Name: "access_key_id",
Help: "IAS3 Access Key.\n\nLeave blank for anonymous access.\nYou can find one here: https://archive.org/account/s3.php",
Expand Down Expand Up @@ -90,6 +168,14 @@ Only enable if you need to be guaranteed to be reflected after write operations.
// maximum size of an item. this is constant across all items
const iaItemMaxSize int64 = 1099511627776

// roMetadataKey is the set of file metadata keys that Internet Archive
// manages itself and that therefore cannot be written by rclone.
// Only membership matters; the values are always nil.
var roMetadataKey = map[string]interface{}{
	// "mtime" is deliberately absent: it is a documented exception
	"name":        nil,
	"source":      nil,
	"size":        nil,
	"md5":         nil,
	"crc32":       nil,
	"sha1":        nil,
	"format":      nil,
	"old_version": nil,
	"viruscheck":  nil,
}

// Options defines the configuration for this backend
type Options struct {
AccessKeyID string `config:"access_key_id"`
Expand Down Expand Up @@ -122,6 +208,7 @@ type Object struct {
md5 string // md5 hash of the file presented by the server
sha1 string // sha1 hash of the file presented by the server
crc32 string // crc32 of the file presented by the server
rawData json.RawMessage
}

// IAFile represents a subset of object in MetadataResponse.Files
Expand All @@ -135,6 +222,8 @@ type IAFile struct {
Md5 string `json:"md5"`
Crc32 string `json:"crc32"`
Sha1 string `json:"sha1"`

rawData json.RawMessage
}

// MetadataResponse represents a subset of the JSON object returned by (frontend)/metadata/
Expand All @@ -143,6 +232,12 @@ type MetadataResponse struct {
ItemSize int64 `json:"item_size"`
}

// MetadataResponseRaw is the form of MetadataResponse to deal with metadata.
//
// It mirrors MetadataResponse, but leaves every element of "files" undecoded
// so that the complete JSON of each file entry stays available.
type MetadataResponseRaw struct {
// Files holds one undecoded JSON value per entry of the "files" array.
Files []json.RawMessage `json:"files"`
// ItemSize is decoded from the "item_size" field.
ItemSize int64 `json:"item_size"`
}

// ModMetadataResponse represents response for amending metadata
type ModMetadataResponse struct {
// https://archive.org/services/docs/api/md-write.html#example
Expand Down Expand Up @@ -226,7 +321,10 @@ func NewFs(ctx context.Context, name, root string, m configmap.Mapper) (fs.Fs, e
}
f.setRoot(root)
f.features = (&fs.Features{
BucketBased: true,
BucketBased: true,
ReadMetadata: true,
WriteMetadata: true,
UserMetadata: true,
}).Fill(ctx, f)

f.srv = rest.NewClient(fshttp.NewClient(ctx))
Expand Down Expand Up @@ -307,18 +405,17 @@ func (o *Object) SetModTime(ctx context.Context, t time.Time) (err error) {
}

// https://archive.org/services/docs/api/md-write.html
var patch = []interface{}{
// the following code might be useful for modifying metadata of an uploaded file
patch := []map[string]string{
// we should drop it first to clear all rclone-provided mtimes
struct {
Op string `json:"op"`
Path string `json:"path"`
}{"remove", "/rclone-mtime"},
struct {
Op string `json:"op"`
Path string `json:"path"`
Value string `json:"value"`
}{"add", "/rclone-mtime", t.Format(time.RFC3339Nano)},
}
{
"op": "remove",
"path": "/rclone-mtime",
}, {
"op": "add",
"path": "/rclone-mtime",
"value": t.Format(time.RFC3339Nano),
}}
res, err := json.Marshal(patch)
if err != nil {
return err
Expand Down Expand Up @@ -685,6 +782,23 @@ func (o *Object) Update(ctx context.Context, in io.Reader, src fs.ObjectInfo, op
headers["Content-Length"] = fmt.Sprintf("%d", size)
headers["x-archive-size-hint"] = fmt.Sprintf("%d", size)
}
var mdata fs.Metadata
mdata, err = fs.GetMetadataOptions(ctx, src, options)
if err == nil && mdata != nil {
for mk, mv := range mdata {
mk = strings.ToLower(mk)
if strings.HasPrefix(mk, "rclone-") {
fs.LogPrintf(fs.LogLevelWarning, o, "reserved metadata key %s is about to set", mk)
} else if _, ok := roMetadataKey[mk]; ok {
fs.LogPrintf(fs.LogLevelWarning, o, "setting or modifying read-only key %s is requested, skipping", mk)
continue
} else if mk == "mtime" {
// redirect to make it work
mk = "rclone-mtime"
}
headers[fmt.Sprintf("x-amz-filemeta-%s", mk)] = mv
}
}

// read the md5sum if available
var md5sumHex string
Expand Down Expand Up @@ -762,6 +876,34 @@ func (o *Object) String() string {
return o.remote
}

// Metadata returns all file metadata provided by Internet Archive.
//
// The metadata is taken from the raw JSON stored on the object. It returns
// (nil, nil) when no raw JSON is available, and a nil map together with the
// decoding error when the raw JSON is not a JSON object.
//
// Each value is decoded with listOrString and only its first item is kept;
// entries that fail to decode or yield no items are skipped. The server's
// own "mtime" value, if any, is exposed as "rclone-ia-mtime", and "mtime" is
// always set to the object's modification time in RFC 3339 format.
func (o *Object) Metadata(ctx context.Context) (m fs.Metadata, err error) {
	if o.rawData == nil {
		return nil, nil
	}
	raw := make(map[string]json.RawMessage)
	if err = json.Unmarshal(o.rawData, &raw); err != nil {
		// fatal: json parsing failed
		return nil, err
	}
	// Allocate the result up front: if every entry is skipped below, the
	// unconditional "mtime" assignment would otherwise write to a nil map
	// and panic. One extra slot is reserved for "rclone-ia-mtime".
	m = make(fs.Metadata, len(raw)+1)
	for k, v := range raw {
		items, parseErr := listOrString(v)
		if parseErr != nil || len(items) == 0 {
			// skip: an entry failed to parse
			continue
		}
		m[k] = items[0]
	}
	// move the old mtime to another key
	if v, ok := m["mtime"]; ok {
		m["rclone-ia-mtime"] = v
	}
	// overwrite with a correct mtime
	m["mtime"] = o.modTime.Format(time.RFC3339Nano)
	return m, nil
}

func (f *Fs) shouldRetry(resp *http.Response, err error) (bool, error) {
if resp != nil {
for _, e := range retryErrorCodes {
Expand All @@ -788,20 +930,23 @@ func (o *Object) split() (bucket, bucketPath string) {
return o.fs.split(o.remote)
}

// requestMetadata fetches the metadata of the item named bucket from
// (frontend)/metadata/:item/.
//
// The request is issued through f.pacer and retried as decided by
// f.shouldRetry. The response is decoded into a MetadataResponseRaw and then
// converted with unraw.
//
// NOTE(review): this listing appears to interleave the pre-change and
// post-change lines of the commit (two signatures, two CallJSON calls, two
// returns) - confirm which lines are live before editing.
func (f *Fs) requestMetadata(ctx context.Context, bucket string) (result MetadataResponse, err error) {
func (f *Fs) requestMetadata(ctx context.Context, bucket string) (result *MetadataResponse, err error) {
var resp *http.Response
// make a GET request to (frontend)/metadata/:item/
opts := rest.Opts{
Method: "GET",
Path: path.Join("/metadata/", bucket),
}

// decode into the raw form first; it is converted by unraw below
var temp MetadataResponseRaw
err = f.pacer.Call(func() (bool, error) {
resp, err = f.front.CallJSON(ctx, &opts, nil, &result)
resp, err = f.front.CallJSON(ctx, &opts, nil, &temp)
return f.shouldRetry(resp, err)
})

return result, err
if err != nil {
return
}
return temp.unraw()
}

// list up all files/directories without any filters
Expand Down Expand Up @@ -998,6 +1143,7 @@ func makeValidObject(f *Fs, remote string, file IAFile, mtime time.Time, size in
md5: file.Md5,
crc32: file.Crc32,
sha1: file.Sha1,
rawData: file.rawData,
}
}

Expand Down Expand Up @@ -1045,6 +1191,23 @@ func (file IAFile) parseMtime() (mtime time.Time) {
return mtime
}

// unraw converts the raw response into a MetadataResponse.
//
// Every entry of mrr.Files is decoded into an IAFile and the undecoded JSON
// of that entry is kept in its rawData field. Decoding stops at the first
// entry that fails, in which case nil and the decoding error are returned.
func (mrr *MetadataResponseRaw) unraw() (_ *MetadataResponse, err error) {
	var decoded []IAFile
	for i := range mrr.Files {
		entry := IAFile{}
		if err = json.Unmarshal(mrr.Files[i], &entry); err != nil {
			return nil, err
		}
		// keep the original JSON alongside the decoded fields
		entry.rawData = mrr.Files[i]
		decoded = append(decoded, entry)
	}
	converted := &MetadataResponse{ItemSize: mrr.ItemSize}
	converted.Files = decoded
	return converted, nil
}

func compareSize(a, b int64) bool {
if a < 0 || b < 0 {
// we won't compare if any of them is not known
Expand Down Expand Up @@ -1106,4 +1269,5 @@ var (
_ fs.PublicLinker = &Fs{}
_ fs.Abouter = &Fs{}
_ fs.Object = &Object{}
_ fs.Metadataer = &Object{}
)
27 changes: 27 additions & 0 deletions docs/content/internetarchive.md
Expand Up @@ -38,6 +38,33 @@ You can optionally wait for the server's processing to finish, by setting non-ze
By making it wait, rclone can do normal file comparison.
Make sure to set a large enough value (e.g. `30m0s` for smaller files) as it can take a long time depending on server's queue.

## About metadata
This backend supports setting, updating and reading metadata of each file.
The metadata will appear as file metadata on Internet Archive.
However, some fields are reserved by both Internet Archive and rclone.

The following are reserved by Internet Archive:
- `name`
- `source`
- `size`
- `md5`
- `crc32`
- `sha1`
- `format`
- `old_version`
- `viruscheck`

Attempts to set values for these keys are ignored with a warning.
The only exception is `mtime`: setting it behaves identically to setting ModTime.

rclone reserves all the keys starting with `rclone-`. Setting a value for any of these keys produces a warning, but the value is still set as requested.

If there are multiple values for a key, only the first one is returned.
This is a limitation of rclone, which supports only one value per key.
Multiple values can appear after a server-side copy.

Reading metadata also returns custom keys (those that are neither standard nor reserved).

## Configuration

Here is an example of making an internetarchive configuration.
Expand Down
2 changes: 1 addition & 1 deletion docs/content/overview.md
Expand Up @@ -33,7 +33,7 @@ Here is an overview of the major features of each cloud storage system.
| HiDrive | HiDrive ¹² | R/W | No | No | - | - |
| HTTP | - | R | No | No | R | - |
| Hubic | MD5 | R/W | No | No | R/W | - |
| Internet Archive | MD5, SHA1, CRC32 | R/W ¹¹ | No | No | - | - |
| Internet Archive | MD5, SHA1, CRC32 | R/W ¹¹ | No | No | - | RWU |
| Jottacloud | MD5 | R/W | Yes | No | R | - |
| Koofr | MD5 | - | Yes | No | - | - |
| Mail.ru Cloud | Mailru ⁶ | R/W | Yes | No | - | - |
Expand Down

0 comments on commit 42dfadf

Please sign in to comment.