Skip to content

Commit

Permalink
Add folder rename detection (stashapp#3817)
Browse files Browse the repository at this point in the history
  • Loading branch information
WithoutPants committed Jul 11, 2023
1 parent 5c38836 commit 93b41fb
Show file tree
Hide file tree
Showing 4 changed files with 287 additions and 13 deletions.
2 changes: 2 additions & 0 deletions pkg/file/file.go
Original file line number Diff line number Diff line change
Expand Up @@ -154,10 +154,12 @@ type Getter interface {
FindByFingerprint(ctx context.Context, fp Fingerprint) ([]File, error)
FindByZipFileID(ctx context.Context, zipFileID ID) ([]File, error)
FindAllInPaths(ctx context.Context, p []string, limit, offset int) ([]File, error)
FindByFileInfo(ctx context.Context, info fs.FileInfo, size int64) ([]File, error)
}

// Counter provides methods to count Files.
type Counter interface {
// CountAllInPaths returns a file count for the given paths p.
// NOTE(review): presumably counts the files located under any of the paths - confirm against the implementation.
CountAllInPaths(ctx context.Context, p []string) (int, error)
// CountByFolderID returns the number of files whose parent folder is folderID.
CountByFolderID(ctx context.Context, folderID FolderID) (int, error)
}

// Creator provides methods to create Files.
Expand Down
195 changes: 195 additions & 0 deletions pkg/file/folder_rename_detect.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,195 @@
package file

import (
"context"
"errors"
"fmt"
"io/fs"

"github.com/stashapp/stash/pkg/logger"
)

// folderRenameCandidate tracks an existing folder that a newly found folder
// may have been renamed or moved from, together with its match counts.
type folderRenameCandidate struct {
// folder is the existing folder record that may have been moved
folder *Folder
// found is the number of matching files seen so far in the new location
found int
// files is the total number of files recorded for the folder
files int
}

// folderRenameDetector accumulates the state used to work out which existing
// folder, if any, a newly found folder was renamed from.
type folderRenameDetector struct {
// candidates maps a folder id to its rename candidate entry, which holds the
// folder along with its matched and total file counts
candidates map[FolderID]folderRenameCandidate
// rejects is the set of folder ids that have been ruled out as candidates:
// the folder was found to still exist, or it has no files recorded
rejects map[FolderID]struct{}
}

// isReject reports whether the folder with the given id has already been
// ruled out as a rename candidate.
func (d *folderRenameDetector) isReject(id FolderID) bool {
	if _, rejected := d.rejects[id]; rejected {
		return true
	}

	return false
}

// getCandidate returns a pointer to a copy of the candidate entry stored for
// the given folder id, or nil if there is no entry. Changes made through the
// returned pointer are not reflected in the map until the value is stored
// again.
func (d *folderRenameDetector) getCandidate(id FolderID) *folderRenameCandidate {
	if entry, known := d.candidates[id]; known {
		return &entry
	}

	return nil
}

// setCandidate stores c in the candidate map, keyed by the id of its folder.
// Any existing entry for that folder is replaced.
func (d *folderRenameDetector) setCandidate(c folderRenameCandidate) {
	key := c.folder.ID
	d.candidates[key] = c
}

// reject records the folder with the given id as ruled out, so that it is
// not considered as a rename candidate again.
func (d *folderRenameDetector) reject(id FolderID) {
	var marker struct{}
	d.rejects[id] = marker
}

// bestCandidate returns the folder that is the best candidate for a rename,
// or nil if no candidate qualifies.
//
// A candidate qualifies only if at least half of its original files were
// found in the new location. Among the qualifying candidates, the one with
// the largest number of found files is chosen. If several candidates have
// the same number of found files, which of them is returned depends on map
// iteration order.
func (d *folderRenameDetector) bestCandidate() *Folder {
	var (
		best      *Folder
		bestFound int
		haveBest  bool
	)

	for _, c := range d.candidates {
		// ignore folders that have less than 50% of their original files.
		// Compare 2*found with files instead of found with files/2: integer
		// division truncates, so files/2 would let 1 of 3 or 2 of 5 matches
		// through even though they are below half.
		if c.found*2 < c.files {
			continue
		}

		// prefer the folder with the most matched files
		if !haveBest || c.found > bestFound {
			best = c.folder
			bestFound = c.found
			haveBest = true
		}
	}

	return best
}

// detectFolderMove tries to find an existing folder record that the folder
// at file.Path was moved or renamed from.
//
// It walks the entries directly inside file.Path (sub-directories are not
// descended into) and, for every accepted file, looks up stored files with
// the same basename, size and modification time. The parent folders of
// those stored files become candidates, provided that their recorded path
// no longer exists in the filesystem. The best candidate, as chosen by
// folderRenameDetector.bestCandidate, is returned; nil is returned when
// there is none.
func (s *scanJob) detectFolderMove(ctx context.Context, file scanFile) (*Folder, error) {
	detector := folderRenameDetector{
		candidates: make(map[FolderID]folderRenameCandidate),
		rejects:    make(map[FolderID]struct{}),
	}

	// resolveCandidate returns the candidate entry for the folder id. An
	// entry that is already tracked is returned as is; otherwise the folder
	// is checked and a fresh entry with no matches is built. A nil result
	// without an error means the folder is not a candidate.
	resolveCandidate := func(id FolderID) (*folderRenameCandidate, error) {
		if known := detector.getCandidate(id); known != nil {
			return known, nil
		}

		oldFolder, err := s.Repository.FolderStore.Find(ctx, id)
		if err != nil {
			return nil, fmt.Errorf("getting parent folder %d: %w", id, err)
		}

		if oldFolder == nil {
			// shouldn't happen, but just in case
			return nil, nil
		}

		// the old folder must be missing from the filesystem
		_, statErr := file.fs.Lstat(oldFolder.Path)
		switch {
		case statErr == nil:
			// old folder still exists, so it was not moved
			detector.reject(id)
			return nil, nil
		case !errors.Is(statErr, fs.ErrNotExist):
			return nil, fmt.Errorf("checking for parent folder %q: %w", oldFolder.Path, statErr)
		}

		// old folder is missing; find out how many files it had
		total, err := s.Repository.Store.CountByFolderID(ctx, id)
		if err != nil {
			return nil, fmt.Errorf("counting files in folder %d: %w", id, err)
		}

		if total == 0 {
			// nothing to match against, not a candidate
			detector.reject(id)
			return nil, nil
		}

		return &folderRenameCandidate{
			folder: oldFolder,
			found:  0,
			files:  total,
		}, nil
	}

	visit := func(path string, d fs.DirEntry, walkErr error) error {
		if walkErr != nil {
			// don't let errors prevent scanning
			logger.Errorf("error scanning %s: %v", path, walkErr)
			return nil
		}

		switch {
		case path == file.Path:
			// the root itself is not of interest
			return nil
		case d.IsDir():
			// only direct children are considered
			return fs.SkipDir
		}

		info, err := d.Info()
		if err != nil {
			return fmt.Errorf("reading info for %q: %w", path, err)
		}

		if !s.acceptEntry(ctx, path, info) {
			return nil
		}

		size, err := getFileSize(file.fs, path, info)
		if err != nil {
			return fmt.Errorf("getting file size for %q: %w", path, err)
		}

		// look for stored files with the same basename, size and mod time
		matches, err := s.Repository.Store.FindByFileInfo(ctx, info, size)
		if err != nil {
			return fmt.Errorf("checking for existing file %q: %w", path, err)
		}

		for _, match := range matches {
			base := match.Base()

			// files inside zip files don't count
			if base.ZipFileID != nil {
				continue
			}

			id := base.ParentFolderID

			// folder was already ruled out
			if detector.isReject(id) {
				continue
			}

			candidate, err := resolveCandidate(id)
			if err != nil {
				return err
			}

			if candidate == nil {
				continue
			}

			// record the match and store the updated entry
			candidate.found++
			detector.setCandidate(*candidate)
		}

		return nil
	}

	if err := symWalk(file.fs, file.Path, visit); err != nil {
		return nil, fmt.Errorf("walking filesystem for folder rename detection: %w", err)
	}

	return detector.bestCandidate(), nil
}
76 changes: 63 additions & 13 deletions pkg/file/scan.go
Original file line number Diff line number Diff line change
Expand Up @@ -215,19 +215,6 @@ func (s *scanJob) queueFileFunc(ctx context.Context, f FS, zipFile *scanFile) fs
return fmt.Errorf("reading info for %q: %w", path, err)
}

var size int64

// #2196/#3042 - replace size with target size if file is a symlink
if info.Mode()&os.ModeSymlink == os.ModeSymlink {
targetInfo, err := f.Stat(path)
if err != nil {
return fmt.Errorf("reading info for symlink %q: %w", path, err)
}
size = targetInfo.Size()
} else {
size = info.Size()
}

if !s.acceptEntry(ctx, path, info) {
if info.IsDir() {
return fs.SkipDir
Expand All @@ -236,6 +223,11 @@ func (s *scanJob) queueFileFunc(ctx context.Context, f FS, zipFile *scanFile) fs
return nil
}

size, err := getFileSize(f, path, info)
if err != nil {
return err
}

ff := scanFile{
BaseFile: &BaseFile{
DirEntry: DirEntry{
Expand Down Expand Up @@ -294,6 +286,19 @@ func (s *scanJob) queueFileFunc(ctx context.Context, f FS, zipFile *scanFile) fs
}
}

// getFileSize returns the size to record for the file at path. For a regular
// entry this is info.Size(). For a symlink, the link is resolved with f.Stat
// and the size of the target is returned instead (#2196/#3042).
func getFileSize(f FS, path string, info fs.FileInfo) (int64, error) {
	isSymlink := info.Mode()&os.ModeSymlink != 0
	if !isSymlink {
		return info.Size(), nil
	}

	target, err := f.Stat(path)
	if err != nil {
		return 0, fmt.Errorf("reading info for symlink %q: %w", path, err)
	}

	return target.Size(), nil
}

func (s *scanJob) acceptEntry(ctx context.Context, path string, info fs.FileInfo) bool {
// always accept if there's no filters
accept := len(s.options.ScanFilters) == 0
Expand Down Expand Up @@ -485,6 +490,15 @@ func (s *scanJob) handleFolder(ctx context.Context, file scanFile) error {
}

func (s *scanJob) onNewFolder(ctx context.Context, file scanFile) (*Folder, error) {
renamed, err := s.handleFolderRename(ctx, file)
if err != nil {
return nil, err
}

if renamed != nil {
return renamed, nil
}

now := time.Now()

toCreate := &Folder{
Expand Down Expand Up @@ -522,6 +536,42 @@ func (s *scanJob) onNewFolder(ctx context.Context, file scanFile) (*Folder, erro
return toCreate, nil
}

// handleFolderRename checks whether the folder described by file is an
// existing folder that was moved or renamed. If so, the existing folder
// record is given the new path and parent folder id, saved through the
// folder store, and returned. A nil folder with a nil error means that no
// rename was detected. Folders inside zip files are never treated as renames.
func (s *scanJob) handleFolderRename(ctx context.Context, file scanFile) (*Folder, error) {
	if file.ZipFileID != nil {
		// folders in zip files are ignored
		return nil, nil
	}

	// look for an existing folder that this one was moved from
	moved, err := s.detectFolderMove(ctx, file)
	switch {
	case err != nil:
		return nil, fmt.Errorf("detecting folder move: %w", err)
	case moved == nil:
		return nil, nil
	}

	// point the existing folder record at the new location
	logger.Infof("%s moved to %s. Updating path...", moved.Path, file.Path)
	moved.Path = file.Path

	// the parent may have changed as well, so look it up from the new path
	newParentID, err := s.getFolderID(ctx, filepath.Dir(file.Path))
	if err != nil {
		return nil, fmt.Errorf("getting parent folder for %q: %w", file.Path, err)
	}
	moved.ParentFolderID = newParentID

	err = s.Repository.FolderStore.Update(ctx, moved)
	if err != nil {
		return nil, fmt.Errorf("updating folder for rename %q: %w", moved.Path, err)
	}

	return moved, nil
}

func (s *scanJob) onExistingFolder(ctx context.Context, f scanFile, existing *Folder) (*Folder, error) {
update := false

Expand Down
27 changes: 27 additions & 0 deletions pkg/sqlite/file.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,10 @@ import (
"database/sql"
"errors"
"fmt"
"io/fs"
"path/filepath"
"strings"
"time"

"github.com/doug-martin/goqu/v9"
"github.com/doug-martin/goqu/v9/exp"
Expand Down Expand Up @@ -713,6 +715,31 @@ func (qb *FileStore) FindByZipFileID(ctx context.Context, zipFileID file.ID) ([]
return qb.getMany(ctx, q)
}

// FindByFileInfo returns the files whose basename, size and modification
// time match the given values. The basename and modification time are taken
// from info, with the modification time compared in its RFC3339 form; the
// size is passed separately by the caller.
func (qb *FileStore) FindByFileInfo(ctx context.Context, info fs.FileInfo, size int64) ([]file.File, error) {
	formattedModTime := info.ModTime().Format(time.RFC3339)

	files := qb.table()
	sameName := files.Col("basename").Eq(info.Name())
	sameSize := files.Col("size").Eq(size)
	sameModTime := files.Col("mod_time").Eq(formattedModTime)

	query := qb.selectDataset().Prepared(true).Where(sameName, sameSize, sameModTime)

	return qb.getMany(ctx, query)
}

// CountByFolderID returns the number of files whose parent folder is the
// folder with the given id.
func (qb *FileStore) CountByFolderID(ctx context.Context, folderID file.FolderID) (int, error) {
	inFolder := qb.table().Col("parent_folder_id").Eq(folderID)
	query := qb.countDataset().Prepared(true).Where(inFolder)

	return count(ctx, query)
}

func (qb *FileStore) IsPrimary(ctx context.Context, fileID file.ID) (bool, error) {
joinTables := []exp.IdentifierExpression{
scenesFilesJoinTable,
Expand Down

0 comments on commit 93b41fb

Please sign in to comment.