sreedevk · sreedevk · Jan 18, 2023 · Jan 18, 2023 · Jan 18, 2023 · Jan 18, 2023
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -10,13 +10,13 @@ authors = ["Sreedev Kodichath <sreedevpadmakumar@gmail.com>", "Valentin Bersier
 
 [dependencies]
 anyhow = "1.0.68"
+bytesize = "1.1.0"
 chrono = "0.4.23"
 clap = { version = "4.0.32", features = ["derive"] }
 colored = "2.0.0"
 dashmap = { version = "5.4.0", features = ["rayon"] }
 fxhash = "0.2.1"
 glob = "0.3.0"
-humansize = "2.1.2"
 indicatif = { version = "0.17.2", features = ["rayon", "tokio"] }
 itertools = "0.10.5"
 memmap2 = "0.5.8"

diff --git a/README.md b/README.md
@@ -14,11 +14,12 @@ NOTE: This project is still being developed. At the moment, as shown in the scre
 Usage: deduplicator [OPTIONS]
 
 Options:
-  -t, --types <TYPES>  Filetypes to deduplicate (default = all)
-      --dir <DIR>      Run Deduplicator on dir different from pwd
-  -i, --interactive    Delete files interactively
-  -h, --help           Print help information
-  -V, --version        Print version information
+  -t, --types <TYPES>      Filetypes to deduplicate (default = all)
+      --dir <DIR>          Run Deduplicator on dir different from pwd
+  -i, --interactive        Delete files interactively
+  -m, --minsize <MINSIZE>  Minimum filesize of duplicates to scan (e.g., 100B/1K/2M/3G/4T). [default = 0]
+  -h, --help               Print help information
+  -V, --version            Print version information
 ```
 
 <h2 align="center">Installation</h2>

diff --git a/src/filters.rs b/src/filters.rs
@@ -0,0 +1,12 @@
+use crate::file_manager::File;
+use crate::params::Params;
+
+/// Reports whether `file` satisfies the minimum-size option carried by `app_opts`.
+/// A file is accepted when no minimum is available or when its own size is unknown.
+pub fn is_file_gt_minsize(app_opts: &Params, file: &File) -> bool {
+    match (app_opts.get_minsize(), file.size) {
+        // Both values are known: sizes equal to the minimum are kept as well.
+        (Some(min_bytes), Some(file_bytes)) => file_bytes >= min_bytes,
+        _ => true,
+    }
+}
diff --git a/src/main.rs b/src/main.rs
@@ -3,6 +3,7 @@ mod file_manager;
 mod output;
 mod params;
 mod scanner;
+mod filters;
 
 use anyhow::Result;
 use app::App;

diff --git a/src/output.rs b/src/output.rs
@@ -5,7 +5,6 @@ use chrono::offset::Utc;
 use chrono::DateTime;
 use colored::Colorize;
 use dashmap::DashMap;
-use humansize::{format_size, DECIMAL};
 use itertools::Itertools;
 use prettytable::{format, row, Table};
 use std::io::Write;
@@ -30,10 +29,8 @@ fn format_path(path: &str, opts: &Params) -> Result<String> {
     Ok(format!("...{:<32}", display_range))
 }
 
-fn file_size(path: &String) -> Result<String> {
-    let mdata = fs::metadata(path)?;
-    let formatted_size = format!("{:>12}", format_size(mdata.len(), DECIMAL));
-    Ok(formatted_size)
+fn file_size(file: &File) -> Result<String> {
+    Ok(file.size.map(|size| format!("{:>12}", bytesize::ByteSize::b(size))).unwrap_or_default())
 }
 
 fn modified_time(path: &String) -> Result<String> {
@@ -119,6 +116,15 @@ fn process_group_action(duplicates: &Vec<File>, dup_index: usize, dup_size: usiz
 
 pub fn interactive(duplicates: DashMap<String, Vec<File>>, opts: &Params) {
     print_meta_info();
+
+    if duplicates.is_empty() {
+        println!(
+            "\n{}",
+            "No duplicates found matching your search criteria.".green()
+        );
+        return;
+    }
+
     duplicates
         .clone()
         .into_iter()
@@ -131,7 +137,7 @@ pub fn interactive(duplicates: DashMap<String, Vec<File>>, opts: &Params) {
                 itable.add_row(row![
                     index,
                     format_path(&file.path, opts).unwrap_or_default().blue(),
-                    file_size(&file.path).unwrap_or_default().red(),
+                    file_size(&file).unwrap_or_default().red(),
                     modified_time(&file.path).unwrap_or_default().yellow()
                 ]);
             });
@@ -143,6 +149,14 @@ pub fn interactive(duplicates: DashMap<String, Vec<File>>, opts: &Params) {
 pub fn print(duplicates: DashMap<String, Vec<File>>, opts: &Params) {
     print_meta_info();
 
+    if duplicates.is_empty() {
+        println!(
+            "\n{}",
+            "No duplicates found matching your search criteria.".green()
+        );
+        return;
+    }
+
     let mut output_table = Table::new();
     output_table.set_titles(row!["hash", "duplicates"]);
     duplicates.into_iter().for_each(|(hash, group)| {
@@ -151,7 +165,7 @@ pub fn print(duplicates: DashMap<String, Vec<File>>, opts: &Params) {
         group.iter().for_each(|file| {
             inner_table.add_row(row![
                 format_path(&file.path, opts).unwrap_or_default().blue(),
-                file_size(&file.path).unwrap_or_default().red(),
+                file_size(&file).unwrap_or_default().red(),
                 modified_time(&file.path).unwrap_or_default().yellow()
             ]);
         });

diff --git a/src/params.rs b/src/params.rs
@@ -14,9 +14,24 @@ pub struct Params {
     /// Delete files interactively
     #[arg(long, short)]
     pub interactive: bool,
+    /// Minimum filesize of duplicates to scan (e.g., 100B/1K/2M/3G/4T). [default = 0]
+    #[arg(long, short)]
+    pub minsize: Option<String>,
 }
 
 impl Params {
+    /// Returns the `--minsize` threshold as a byte count, or `None` when no usable value exists.
+    ///
+    /// The raw option string is parsed as a `bytesize::ByteSize`. A missing option and a
+    /// string that fails to parse both produce `None`.
+    pub fn get_minsize(&self) -> Option<u64> {
+        // `?` returns early with `None` when the option was not supplied.
+        let raw = self.minsize.as_deref()?;
+        raw.parse::<bytesize::ByteSize>()
+            .ok()
+            .map(|parsed| parsed.0)
+    }
+
     pub fn get_directory(&self) -> Result<String> {
         let dir_pathbuf: PathBuf = self
             .dir

diff --git a/src/scanner.rs b/src/scanner.rs
@@ -1,3 +1,4 @@
+use crate::{file_manager::File, filters, params::Params};
 use anyhow::Result;
 use dashmap::DashMap;
 use fxhash::hash64 as hasher;
@@ -8,8 +9,6 @@ use rayon::prelude::*;
 use std::hash::Hasher;
 use std::{fs, path::PathBuf};
 
-use crate::{file_manager::File, params::Params};
-
 #[derive(Clone, Copy)]
 enum IndexCritera {
     Size,
@@ -28,12 +27,7 @@ pub fn duplicates(app_opts: &Params) -> Result<DashMap<String, Vec<File>>> {
         .collect::<Vec<File>>();
 
     if sizewize_duplicate_files.len() > 1 {
-        let size_wise_duplicate_paths = sizewize_duplicate_files
-            .into_par_iter()
-            .map(|file| file.path)
-            .collect::<Vec<String>>();
-
-        let hash_index_store = index_files(size_wise_duplicate_paths, IndexCritera::Hash)?;
+        let hash_index_store = index_files(sizewize_duplicate_files, IndexCritera::Hash)?;
         let duplicate_files = hash_index_store
             .into_par_iter()
             .filter(|(_, files)| files.len() > 1)
@@ -45,9 +39,9 @@ pub fn duplicates(app_opts: &Params) -> Result<DashMap<String, Vec<File>>> {
     }
 }
 
-fn scan(app_opts: &Params) -> Result<Vec<String>> {
+fn scan(app_opts: &Params) -> Result<Vec<File>> {
     let glob_patterns: Vec<PathBuf> = app_opts.get_glob_patterns();
-    let files: Vec<String> = glob_patterns
+    let files: Vec<File> = glob_patterns
         .par_iter()
         .progress_with_style(ProgressStyle::with_template(
             "{spinner:.green} [scanning files] [{wide_bar:.cyan/blue}] {pos}/{len} files",
@@ -63,42 +57,39 @@ fn scan(app_opts: &Params) -> Result<Vec<String>> {
                 })
                 .collect::<Vec<String>>()
         })
+        .map(|file_path| File {
+            path: file_path.clone(),
+            hash: None,
+            size: Some(fs::metadata(file_path).unwrap().len()),
+        })
+        .filter(|file| filters::is_file_gt_minsize(app_opts, file))
         .collect();
 
     Ok(files)
 }
 
-fn process_file_size_index(fpath: String) -> Result<File> {
-    Ok(File {
-        path: fpath.clone(),
-        size: Some(fs::metadata(fpath)?.len()),
-        hash: None,
-    })
-}
-
-fn process_file_hash_index(fpath: String) -> Result<File> {
+fn process_file_hash_index(file: &File) -> Result<File> {
     Ok(File {
-        path: fpath.clone(),
-        size: None,
-        hash: Some(hash_file(&fpath).unwrap_or_default()),
+        path: file.path.clone(),
+        size: file.size,
+        hash: Some(hash_file(&file.path).unwrap_or_default()),
     })
 }
 
 fn process_file_index(
-    fpath: String,
+    file: File,
     store: &DashMap<String, Vec<File>>,
     index_criteria: IndexCritera,
 ) {
     match index_criteria {
         IndexCritera::Size => {
-            let processed_file = process_file_size_index(fpath).unwrap();
             store
-                .entry(processed_file.size.unwrap_or_default().to_string())
-                .and_modify(|fileset| fileset.push(processed_file.clone()))
-                .or_insert_with(|| vec![processed_file]);
+                .entry(file.size.unwrap_or_default().to_string())
+                .and_modify(|fileset| fileset.push(file.clone()))
+                .or_insert_with(|| vec![file]);
         }
         IndexCritera::Hash => {
-            let processed_file = process_file_hash_index(fpath).unwrap();
+            let processed_file = process_file_hash_index(&file).unwrap();
             let indexhash = processed_file.clone().hash.unwrap_or_default();
 
             store
@@ -110,7 +101,7 @@ fn process_file_index(
 }
 
 fn index_files(
-    files: Vec<String>,
+    files: Vec<File>,
     index_criteria: IndexCritera,
 ) -> Result<DashMap<String, Vec<File>>> {
     let store: DashMap<String, Vec<File>> = DashMap::new();
@@ -124,7 +115,7 @@ fn index_files(
     Ok(store)
 }
 
-pub fn incremental_hashing(filepath: &str) -> Result<String> {
+fn incremental_hashing(filepath: &str) -> Result<String> {
     let file = fs::File::open(filepath)?;
     let fmap = unsafe { Mmap::map(&file)? };
     let mut inchasher = fxhash::FxHasher::default();
@@ -135,12 +126,12 @@ pub fn incremental_hashing(filepath: &str) -> Result<String> {
     Ok(format!("{}", inchasher.finish()))
 }
 
-pub fn standard_hashing(filepath: &str) -> Result<String> {
+fn standard_hashing(filepath: &str) -> Result<String> {
     let file = fs::read(filepath)?;
     Ok(hasher(&*file).to_string())
 }
 
-pub fn hash_file(filepath: &str) -> Result<String> {
+fn hash_file(filepath: &str) -> Result<String> {
     let filemeta = fs::metadata(filepath)?;
 
     // NOTE: USE INCREMENTAL HASHING ONLY FOR FILES > 100MB