From fd73c7a3269192a69af18e6b43fec4967d14ba6d Mon Sep 17 00:00:00 2001 From: Ruben Vermeersch Date: Fri, 22 May 2015 15:23:27 +0200 Subject: [PATCH] Make it work --- catalog_test.go | 31 +++++++ dupefinder.go | 198 ++++++++++++++++++++++++++++++++++++++++--- fixtures/a/c/bla.txt | 1 + generate_test.go | 22 ++++- init_test.go | 16 ++++ 5 files changed, 255 insertions(+), 13 deletions(-) create mode 100644 catalog_test.go create mode 100644 fixtures/a/c/bla.txt create mode 100644 init_test.go diff --git a/catalog_test.go b/catalog_test.go new file mode 100644 index 0000000..45f0c6f --- /dev/null +++ b/catalog_test.go @@ -0,0 +1,31 @@ +package dupefinder + +import ( + "strings" + "testing" +) + +func TestCatalogParse(t *testing.T) { + in := ` +# A comment and an empty line: + +00e3261a6e0d79c329445acd540fb2b07187a0dcf6017065c8814010283ac67f test +5891b5b522d5df086d0ff0b110fbd9d21bb4fc7163af34d08286a2e846f6be03 a/b.txt +` + + reader := strings.NewReader(in) + + entries, err := ParseCatalogReader(reader) + if err != nil { + t.Error(err) + } + if len(entries) != 2 { + t.Errorf("Unexpected number of entries: %d", len(entries)) + } + if entries["00e3261a6e0d79c329445acd540fb2b07187a0dcf6017065c8814010283ac67f"] != "test" { + t.Error("Bad entry") + } + if entries["5891b5b522d5df086d0ff0b110fbd9d21bb4fc7163af34d08286a2e846f6be03"] != "a/b.txt" { + t.Error("Bad entry") + } +} diff --git a/dupefinder.go b/dupefinder.go index f1b1fb1..d8aff8d 100644 --- a/dupefinder.go +++ b/dupefinder.go @@ -1,26 +1,71 @@ package dupefinder import ( + "bufio" + "crypto/sha256" "fmt" + "io" "io/ioutil" "os" + "path" + "strings" ) +type FileHash struct { + Hash string + Filename string +} + +const header = `# This is a dupefinder catalog +# +# See https://github.com/rubenv/dupefinder for more info + +` + func Generate(catalog string, folders ...string) error { err := validateFolders(folders...) if err != nil { return err } - errs := make(chan error) - - go walkAllFolders(errs, folders...) + out, err := os.Create(catalog) + if err != nil { + return err + } + defer out.Close() - err = <-errs + _, err = out.WriteString(header) if err != nil { return err } + errs := make(chan error) + filenames := make(chan string, 100) + entries := make(chan FileHash, 100) + + go walkAllFolders(errs, filenames, folders...) + go hashFiles(errs, filenames, entries) + + for { + entry, ok := <-entries + if !ok { + break + } + + _, err := out.WriteString(fmt.Sprintf("%s %s\n", entry.Hash, entry.Filename)) + if err != nil { + return err + } + } + + select { + case err := <-errs: + if err != nil { + return err + } + default: + } + return nil } @@ -30,6 +75,55 @@ func Detect(catalog string, echo, rm bool, folders ...string) error { return err } + catalogEntries, err := ParseCatalog(catalog) + if err != nil { + return err + } + + errs := make(chan error) + filenames := make(chan string, 100) + entries := make(chan FileHash, 100) + + go walkAllFolders(errs, filenames, folders...) + go hashFiles(errs, filenames, entries) + + deleted := int64(0) + for { + entry, ok := <-entries + if !ok { + break + } + + if orig, ok := catalogEntries[entry.Hash]; ok { + fi, err := os.Stat(entry.Filename) + if err != nil { + return err + } + + deleted += fi.Size() + + if echo { + fmt.Printf("Would delete %s (matches %s)\n", entry.Filename, orig) + } else { + fmt.Printf("Deleting %s (matches %s)\n", entry.Filename, orig) + err := os.Remove(entry.Filename) + if err != nil { + return err + } + } + } + } + + fmt.Printf("Size saved: %d bytes\n", deleted) + + select { + case err := <-errs: + if err != nil { + return err + } + default: + } + return nil } @@ -62,11 +156,11 @@ func isFolder(filename string) (bool, error) { return fi.IsDir(), nil } -func walkAllFolders(errs chan error, folders ...string) { - defer close(errs) +func walkAllFolders(errs chan error, filenames chan string, folders ...string) { + defer close(filenames) for _, f := range folders { - err := walkFolder(f) + err := walkFolder(f, filenames) if err != nil { errs <- err return @@ -74,16 +168,100 @@ func walkAllFolders(errs chan error, folders ...string) { } } -func walkFolder(filename string) error { +func walkFolder(filename string, out chan string) error { fi, err := ioutil.ReadDir(filename) if err != nil { return err } - fmt.Println(filename) for _, child := range fi { - fmt.Println(child) + fullname := path.Join(filename, child.Name()) + if child.IsDir() { + err := walkFolder(fullname, out) + if err != nil { + return err + } + } else if child.Mode().IsRegular() { + out <- fullname + } } return nil } + +func hashFiles(errs chan error, filenames chan string, entries chan FileHash) { + defer close(entries) + + for { + filename, ok := <-filenames + if !ok { + return + } + + hash, err := hashFile(filename) + if err != nil { + errs <- err + return + } + + entries <- FileHash{ + Hash: hash, + Filename: filename, + } + } +} + +func hashFile(filename string) (string, error) { + file, err := os.Open(filename) + if err != nil { + return "", err + } + defer file.Close() + + hash := sha256.New() + if _, err := io.Copy(hash, file); err != nil { + return "", err + } + + return fmt.Sprintf("%x", hash.Sum([]byte{})), nil +} + +func ParseCatalog(filename string) (map[string]string, error) { + file, err := os.Open(filename) + if err != nil { + return nil, err + } + defer file.Close() + + return ParseCatalogReader(file) +} + +func ParseCatalogReader(reader io.Reader) (map[string]string, error) { + result := map[string]string{} + + bufreader := bufio.NewReader(reader) + + done := false + for !done { + line, err := bufreader.ReadString('\n') + if err == io.EOF { + done = true + } else if err != nil { + return nil, err + } + + line = strings.TrimSpace(line) + if line == "" || line[0] == '#' { + continue + } + + parts := strings.SplitN(line, " ", 2) + if len(parts) != 2 { + return nil, fmt.Errorf("Malformed line: %#v", line) + } + + result[parts[0]] = parts[1] + } + + return result, nil +} diff --git a/fixtures/a/c/bla.txt b/fixtures/a/c/bla.txt new file mode 100644 index 0000000..a7f8d9e --- /dev/null +++ b/fixtures/a/c/bla.txt @@ -0,0 +1 @@ +bla diff --git a/generate_test.go b/generate_test.go index bffaff4..e72cfbc 100644 --- a/generate_test.go +++ b/generate_test.go @@ -1,6 +1,9 @@ package dupefinder -import "testing" +import ( + "os" + "testing" +) func TestGenerateFolders(t *testing.T) { err := Generate("catalog.txt", "invalid") @@ -10,10 +13,23 @@ func TestGenerateFolders(t *testing.T) { } func TestGenerateFinds(t *testing.T) { - err := Generate("catalog.txt", "fixtures/a") + catalog := tempFilename(t) + defer os.Remove(catalog) + + err := Generate(catalog, "fixtures/a") if err != nil { t.Error(err) } - // TODO: Read catalog, see if files are there and check checksums + entries, err := ParseCatalog(catalog) + if err != nil { + t.Error(err) + } + if len(entries) != 2 { + t.Errorf("Unexpected number of entries: %d", len(entries)) + } + + if entries["00e3261a6e0d79c329445acd540fb2b07187a0dcf6017065c8814010283ac67f"] != "fixtures/a/c/bla.txt" { + t.Error("Bad entry") + } } diff --git a/init_test.go b/init_test.go new file mode 100644 index 0000000..94537c6 --- /dev/null +++ b/init_test.go @@ -0,0 +1,16 @@ +package dupefinder + +import ( + "io/ioutil" + "testing" +) + +func tempFilename(t *testing.T) string { + f, err := ioutil.TempFile("", "dupefinder") + if err != nil { + t.Fatal(err) + } + f.Close() + + return f.Name() +}