/
dedupe.go
329 lines (304 loc) · 9.2 KB
/
dedupe.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
// dedupe - gets rid of identical files remotes which can have duplicate file names (drive, mega)
package operations
import (
"context"
"fmt"
"log"
"path"
"sort"
"strings"
"github.com/pkg/errors"
"github.com/rclone/rclone/fs"
"github.com/rclone/rclone/fs/config"
"github.com/rclone/rclone/fs/hash"
"github.com/rclone/rclone/fs/walk"
"github.com/spf13/pflag"
)
// dedupeRename renames the objs slice to different names
func dedupeRename(ctx context.Context, f fs.Fs, remote string, objs []fs.Object) {
doMove := f.Features().Move
if doMove == nil {
log.Fatalf("Fs %v doesn't support Move", f)
}
ext := path.Ext(remote)
base := remote[:len(remote)-len(ext)]
outer:
for i, o := range objs {
suffix := 1
newName := fmt.Sprintf("%s-%d%s", base, i+suffix, ext)
_, err := f.NewObject(ctx, newName)
for ; err != fs.ErrorObjectNotFound; suffix++ {
if err != nil {
fs.CountError(err)
fs.Errorf(o, "Failed to check for existing object: %v", err)
continue outer
}
if suffix > 100 {
fs.Errorf(o, "Could not find an available new name")
continue outer
}
newName = fmt.Sprintf("%s-%d%s", base, i+suffix, ext)
_, err = f.NewObject(ctx, newName)
}
if !fs.Config.DryRun {
newObj, err := doMove(ctx, o, newName)
if err != nil {
fs.CountError(err)
fs.Errorf(o, "Failed to rename: %v", err)
continue
}
fs.Infof(newObj, "renamed from: %v", o)
} else {
fs.Logf(remote, "Not renaming to %q as --dry-run", newName)
}
}
}
// dedupeDeleteAllButOne deletes all but the one in keep
func dedupeDeleteAllButOne(ctx context.Context, keep int, remote string, objs []fs.Object) {
count := 0
for i, o := range objs {
if i == keep {
continue
}
err := DeleteFile(ctx, o)
if err == nil {
count++
}
}
if count > 0 {
fs.Logf(remote, "Deleted %d extra copies", count)
}
}
// dedupeDeleteIdentical deletes all but one of identical (by hash) copies
func dedupeDeleteIdentical(ctx context.Context, ht hash.Type, remote string, objs []fs.Object) (remainingObjs []fs.Object) {
// See how many of these duplicates are identical
byHash := make(map[string][]fs.Object, len(objs))
for _, o := range objs {
md5sum, err := o.Hash(ctx, ht)
if err != nil || md5sum == "" {
remainingObjs = append(remainingObjs, o)
} else {
byHash[md5sum] = append(byHash[md5sum], o)
}
}
// Delete identical duplicates, filling remainingObjs with the ones remaining
for md5sum, hashObjs := range byHash {
remainingObjs = append(remainingObjs, hashObjs[0])
if len(hashObjs) > 1 {
fs.Logf(remote, "Deleting %d/%d identical duplicates (%v %q)", len(hashObjs)-1, len(hashObjs), ht, md5sum)
for _, o := range hashObjs[1:] {
err := DeleteFile(ctx, o)
if err != nil {
remainingObjs = append(remainingObjs, o)
}
}
}
}
return remainingObjs
}
// dedupeInteractive interactively dedupes the slice of objects
func dedupeInteractive(ctx context.Context, f fs.Fs, ht hash.Type, remote string, objs []fs.Object) {
fmt.Printf("%s: %d duplicates remain\n", remote, len(objs))
for i, o := range objs {
md5sum, err := o.Hash(ctx, ht)
if err != nil {
md5sum = err.Error()
}
fmt.Printf(" %d: %12d bytes, %s, %v %32s\n", i+1, o.Size(), o.ModTime(ctx).Local().Format("2006-01-02 15:04:05.000000000"), ht, md5sum)
}
switch config.Command([]string{"sSkip and do nothing", "kKeep just one (choose which in next step)", "rRename all to be different (by changing file.jpg to file-1.jpg)"}) {
case 's':
case 'k':
keep := config.ChooseNumber("Enter the number of the file to keep", 1, len(objs))
dedupeDeleteAllButOne(ctx, keep-1, remote, objs)
case 'r':
dedupeRename(ctx, f, remote, objs)
}
}
type objectsSortedByModTime []fs.Object
func (objs objectsSortedByModTime) Len() int { return len(objs) }
func (objs objectsSortedByModTime) Swap(i, j int) { objs[i], objs[j] = objs[j], objs[i] }
func (objs objectsSortedByModTime) Less(i, j int) bool {
return objs[i].ModTime(context.TODO()).Before(objs[j].ModTime(context.TODO()))
}
// DeduplicateMode is how the dedupe command chooses what to do
type DeduplicateMode int
// Deduplicate modes
const (
DeduplicateInteractive DeduplicateMode = iota // interactively ask the user
DeduplicateSkip // skip all conflicts
DeduplicateFirst // choose the first object
DeduplicateNewest // choose the newest object
DeduplicateOldest // choose the oldest object
DeduplicateRename // rename the objects
DeduplicateLargest // choose the largest object
)
func (x DeduplicateMode) String() string {
switch x {
case DeduplicateInteractive:
return "interactive"
case DeduplicateSkip:
return "skip"
case DeduplicateFirst:
return "first"
case DeduplicateNewest:
return "newest"
case DeduplicateOldest:
return "oldest"
case DeduplicateRename:
return "rename"
case DeduplicateLargest:
return "largest"
}
return "unknown"
}
// Set a DeduplicateMode from a string
func (x *DeduplicateMode) Set(s string) error {
switch strings.ToLower(s) {
case "interactive":
*x = DeduplicateInteractive
case "skip":
*x = DeduplicateSkip
case "first":
*x = DeduplicateFirst
case "newest":
*x = DeduplicateNewest
case "oldest":
*x = DeduplicateOldest
case "rename":
*x = DeduplicateRename
case "largest":
*x = DeduplicateLargest
default:
return errors.Errorf("Unknown mode for dedupe %q.", s)
}
return nil
}
// Type of the value
func (x *DeduplicateMode) Type() string {
return "string"
}
// Check it satisfies the interface
var _ pflag.Value = (*DeduplicateMode)(nil)
// dedupeFindDuplicateDirs scans f for duplicate directories
func dedupeFindDuplicateDirs(ctx context.Context, f fs.Fs) ([][]fs.Directory, error) {
dirs := map[string][]fs.Directory{}
err := walk.ListR(ctx, f, "", true, fs.Config.MaxDepth, walk.ListDirs, func(entries fs.DirEntries) error {
entries.ForDir(func(d fs.Directory) {
dirs[d.Remote()] = append(dirs[d.Remote()], d)
})
return nil
})
if err != nil {
return nil, errors.Wrap(err, "find duplicate dirs")
}
duplicateDirs := [][]fs.Directory{}
for _, ds := range dirs {
if len(ds) > 1 {
duplicateDirs = append(duplicateDirs, ds)
}
}
return duplicateDirs, nil
}
// dedupeMergeDuplicateDirs merges all the duplicate directories found
func dedupeMergeDuplicateDirs(ctx context.Context, f fs.Fs, duplicateDirs [][]fs.Directory) error {
mergeDirs := f.Features().MergeDirs
if mergeDirs == nil {
return errors.Errorf("%v: can't merge directories", f)
}
dirCacheFlush := f.Features().DirCacheFlush
if dirCacheFlush == nil {
return errors.Errorf("%v: can't flush dir cache", f)
}
for _, dirs := range duplicateDirs {
if !fs.Config.DryRun {
fs.Infof(dirs[0], "Merging contents of duplicate directories")
err := mergeDirs(ctx, dirs)
if err != nil {
return errors.Wrap(err, "merge duplicate dirs")
}
} else {
fs.Infof(dirs[0], "NOT Merging contents of duplicate directories as --dry-run")
}
}
dirCacheFlush()
return nil
}
// Deduplicate interactively finds duplicate files and offers to
// delete all but one or rename them to be different. Only useful with
// Google Drive which can have duplicate file names.
func Deduplicate(ctx context.Context, f fs.Fs, mode DeduplicateMode) error {
fs.Infof(f, "Looking for duplicates using %v mode.", mode)
// Find duplicate directories first and fix them - repeat
// until all fixed
for {
duplicateDirs, err := dedupeFindDuplicateDirs(ctx, f)
if err != nil {
return err
}
if len(duplicateDirs) == 0 {
break
}
err = dedupeMergeDuplicateDirs(ctx, f, duplicateDirs)
if err != nil {
return err
}
if fs.Config.DryRun {
break
}
}
// find a hash to use
ht := f.Hashes().GetOne()
// Now find duplicate files
files := map[string][]fs.Object{}
err := walk.ListR(ctx, f, "", true, fs.Config.MaxDepth, walk.ListObjects, func(entries fs.DirEntries) error {
entries.ForObject(func(o fs.Object) {
remote := o.Remote()
files[remote] = append(files[remote], o)
})
return nil
})
if err != nil {
return err
}
for remote, objs := range files {
if len(objs) > 1 {
fs.Logf(remote, "Found %d duplicates - deleting identical copies", len(objs))
objs = dedupeDeleteIdentical(ctx, ht, remote, objs)
if len(objs) <= 1 {
fs.Logf(remote, "All duplicates removed")
continue
}
switch mode {
case DeduplicateInteractive:
dedupeInteractive(ctx, f, ht, remote, objs)
case DeduplicateFirst:
dedupeDeleteAllButOne(ctx, 0, remote, objs)
case DeduplicateNewest:
sort.Sort(objectsSortedByModTime(objs)) // sort oldest first
dedupeDeleteAllButOne(ctx, len(objs)-1, remote, objs)
case DeduplicateOldest:
sort.Sort(objectsSortedByModTime(objs)) // sort oldest first
dedupeDeleteAllButOne(ctx, 0, remote, objs)
case DeduplicateRename:
dedupeRename(ctx, f, remote, objs)
case DeduplicateLargest:
largest, largestIndex := int64(-1), -1
for i, obj := range objs {
size := obj.Size()
if size > largest {
largest, largestIndex = size, i
}
}
if largestIndex > -1 {
dedupeDeleteAllButOne(ctx, largestIndex, remote, objs)
}
case DeduplicateSkip:
// skip
default:
//skip
}
}
}
return nil
}