Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Min Size Filter Feature #26

Merged
merged 4 commits into from
Jan 18, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 7 additions & 16 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,13 @@ authors = ["Sreedev Kodichath <sreedevpadmakumar@gmail.com>", "Valentin Bersier

[dependencies]
anyhow = "1.0.68"
bytesize = "1.1.0"
chrono = "0.4.23"
clap = { version = "4.0.32", features = ["derive"] }
colored = "2.0.0"
dashmap = { version = "5.4.0", features = ["rayon"] }
fxhash = "0.2.1"
glob = "0.3.0"
humansize = "2.1.2"
indicatif = { version = "0.17.2", features = ["rayon", "tokio"] }
itertools = "0.10.5"
memmap2 = "0.5.8"
Expand Down
11 changes: 6 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,12 @@ NOTE: This project is still being developed. At the moment, as shown in the scre
Usage: deduplicator [OPTIONS]

Options:
-t, --types <TYPES> Filetypes to deduplicate (default = all)
--dir <DIR> Run Deduplicator on dir different from pwd
-i, --interactive Delete files interactively
-h, --help Print help information
-V, --version Print version information
-t, --types <TYPES> Filetypes to deduplicate (default = all)
--dir <DIR> Run Deduplicator on dir different from pwd
-i, --interactive Delete files interactively
-m, --minsize <MINSIZE> Minimum filesize of duplicates to scan (e.g., 100B/1K/2M/3G/4T). [default = 0]
-h, --help Print help information
-V, --version Print version information
```

<h2 align="center">Installation</h2>
Expand Down
12 changes: 12 additions & 0 deletions src/filters.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
use crate::file_manager::File;
use crate::params::Params;

/// Decides whether `file` passes the minimum-size filter configured in `app_opts`.
///
/// Returns `false` only when both a minimum size (from `app_opts.get_minsize()`)
/// and the file's own size are known and the file is smaller than that minimum.
/// When either value is `None`, the file is kept.
pub fn is_file_gt_minsize(app_opts: &Params, file: &File) -> bool {
    match (app_opts.get_minsize(), file.size) {
        // Both values known: keep the file if it is at least as large as the minimum.
        (Some(min_bytes), Some(file_bytes)) => file_bytes >= min_bytes,
        // No minimum configured, or the file size is unknown: do not filter.
        _ => true,
    }
}
1 change: 1 addition & 0 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ mod file_manager;
mod output;
mod params;
mod scanner;
mod filters;

use anyhow::Result;
use app::App;
Expand Down
28 changes: 21 additions & 7 deletions src/output.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ use chrono::offset::Utc;
use chrono::DateTime;
use colored::Colorize;
use dashmap::DashMap;
use humansize::{format_size, DECIMAL};
use itertools::Itertools;
use prettytable::{format, row, Table};
use std::io::Write;
Expand All @@ -30,10 +29,8 @@ fn format_path(path: &str, opts: &Params) -> Result<String> {
Ok(format!("...{:<32}", display_range))
}

fn file_size(path: &String) -> Result<String> {
let mdata = fs::metadata(path)?;
let formatted_size = format!("{:>12}", format_size(mdata.len(), DECIMAL));
Ok(formatted_size)
fn file_size(file: &File) -> Result<String> {
Ok(format!("{:>12}", bytesize::ByteSize::b(file.size.unwrap())))
}

fn modified_time(path: &String) -> Result<String> {
Expand Down Expand Up @@ -119,6 +116,15 @@ fn process_group_action(duplicates: &Vec<File>, dup_index: usize, dup_size: usiz

pub fn interactive(duplicates: DashMap<String, Vec<File>>, opts: &Params) {
print_meta_info();

if duplicates.is_empty() {
println!(
"\n{}",
"No duplicates found matching your search criteria.".green()
);
return;
}

duplicates
.clone()
.into_iter()
Expand All @@ -131,7 +137,7 @@ pub fn interactive(duplicates: DashMap<String, Vec<File>>, opts: &Params) {
itable.add_row(row![
index,
format_path(&file.path, opts).unwrap_or_default().blue(),
file_size(&file.path).unwrap_or_default().red(),
file_size(&file).unwrap_or_default().red(),
modified_time(&file.path).unwrap_or_default().yellow()
]);
});
Expand All @@ -143,6 +149,14 @@ pub fn interactive(duplicates: DashMap<String, Vec<File>>, opts: &Params) {
pub fn print(duplicates: DashMap<String, Vec<File>>, opts: &Params) {
print_meta_info();

if duplicates.is_empty() {
println!(
"\n{}",
"No duplicates found matching your search criteria.".green()
);
return;
}

let mut output_table = Table::new();
output_table.set_titles(row!["hash", "duplicates"]);
duplicates.into_iter().for_each(|(hash, group)| {
Expand All @@ -151,7 +165,7 @@ pub fn print(duplicates: DashMap<String, Vec<File>>, opts: &Params) {
group.iter().for_each(|file| {
inner_table.add_row(row![
format_path(&file.path, opts).unwrap_or_default().blue(),
file_size(&file.path).unwrap_or_default().red(),
file_size(&file).unwrap_or_default().red(),
modified_time(&file.path).unwrap_or_default().yellow()
]);
});
Expand Down
15 changes: 15 additions & 0 deletions src/params.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,24 @@ pub struct Params {
/// Delete files interactively
#[arg(long, short)]
pub interactive: bool,
/// Minimum filesize of duplicates to scan (e.g., 100B/1K/2M/3G/4T). [default = 0]
#[arg(long, short)]
pub minsize: Option<String>,
}

impl Params {
/// Returns the `minsize` option as a `u64`, if one was supplied and is parseable.
///
/// The raw option string is parsed as a `bytesize::ByteSize` and the inner
/// `u64` of the parsed value is returned. Yields `None` when the option is
/// absent or when parsing fails (the parse error is discarded).
pub fn get_minsize(&self) -> Option<u64> {
    let raw = self.minsize.as_ref()?;
    raw.parse::<bytesize::ByteSize>()
        .ok()
        .map(|parsed| parsed.0)
}

pub fn get_directory(&self) -> Result<String> {
let dir_pathbuf: PathBuf = self
.dir
Expand Down
55 changes: 23 additions & 32 deletions src/scanner.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
use crate::{file_manager::File, filters, params::Params};
use anyhow::Result;
use dashmap::DashMap;
use fxhash::hash64 as hasher;
Expand All @@ -8,8 +9,6 @@ use rayon::prelude::*;
use std::hash::Hasher;
use std::{fs, path::PathBuf};

use crate::{file_manager::File, params::Params};

#[derive(Clone, Copy)]
enum IndexCritera {
Size,
Expand All @@ -28,12 +27,7 @@ pub fn duplicates(app_opts: &Params) -> Result<DashMap<String, Vec<File>>> {
.collect::<Vec<File>>();

if sizewize_duplicate_files.len() > 1 {
let size_wise_duplicate_paths = sizewize_duplicate_files
.into_par_iter()
.map(|file| file.path)
.collect::<Vec<String>>();

let hash_index_store = index_files(size_wise_duplicate_paths, IndexCritera::Hash)?;
let hash_index_store = index_files(sizewize_duplicate_files, IndexCritera::Hash)?;
let duplicate_files = hash_index_store
.into_par_iter()
.filter(|(_, files)| files.len() > 1)
Expand All @@ -45,9 +39,9 @@ pub fn duplicates(app_opts: &Params) -> Result<DashMap<String, Vec<File>>> {
}
}

fn scan(app_opts: &Params) -> Result<Vec<String>> {
fn scan(app_opts: &Params) -> Result<Vec<File>> {
let glob_patterns: Vec<PathBuf> = app_opts.get_glob_patterns();
let files: Vec<String> = glob_patterns
let files: Vec<File> = glob_patterns
.par_iter()
.progress_with_style(ProgressStyle::with_template(
"{spinner:.green} [scanning files] [{wide_bar:.cyan/blue}] {pos}/{len} files",
Expand All @@ -63,42 +57,39 @@ fn scan(app_opts: &Params) -> Result<Vec<String>> {
})
.collect::<Vec<String>>()
})
.map(|file_path| File {
path: file_path.clone(),
hash: None,
size: Some(fs::metadata(file_path).unwrap().len()),
})
.filter(|file| filters::is_file_gt_minsize(app_opts, file))
.collect();

Ok(files)
}

fn process_file_size_index(fpath: String) -> Result<File> {
Ok(File {
path: fpath.clone(),
size: Some(fs::metadata(fpath)?.len()),
hash: None,
})
}

fn process_file_hash_index(fpath: String) -> Result<File> {
fn process_file_hash_index(file: &File) -> Result<File> {
Ok(File {
path: fpath.clone(),
size: None,
hash: Some(hash_file(&fpath).unwrap_or_default()),
path: file.path.clone(),
size: file.size,
hash: Some(hash_file(&file.path).unwrap_or_default()),
})
}

fn process_file_index(
fpath: String,
file: File,
store: &DashMap<String, Vec<File>>,
index_criteria: IndexCritera,
) {
match index_criteria {
IndexCritera::Size => {
let processed_file = process_file_size_index(fpath).unwrap();
store
.entry(processed_file.size.unwrap_or_default().to_string())
.and_modify(|fileset| fileset.push(processed_file.clone()))
.or_insert_with(|| vec![processed_file]);
.entry(file.size.unwrap_or_default().to_string())
.and_modify(|fileset| fileset.push(file.clone()))
.or_insert_with(|| vec![file]);
}
IndexCritera::Hash => {
let processed_file = process_file_hash_index(fpath).unwrap();
let processed_file = process_file_hash_index(&file).unwrap();
let indexhash = processed_file.clone().hash.unwrap_or_default();

store
Expand All @@ -110,7 +101,7 @@ fn process_file_index(
}

fn index_files(
files: Vec<String>,
files: Vec<File>,
index_criteria: IndexCritera,
) -> Result<DashMap<String, Vec<File>>> {
let store: DashMap<String, Vec<File>> = DashMap::new();
Expand All @@ -124,7 +115,7 @@ fn index_files(
Ok(store)
}

pub fn incremental_hashing(filepath: &str) -> Result<String> {
fn incremental_hashing(filepath: &str) -> Result<String> {
let file = fs::File::open(filepath)?;
let fmap = unsafe { Mmap::map(&file)? };
let mut inchasher = fxhash::FxHasher::default();
Expand All @@ -135,12 +126,12 @@ pub fn incremental_hashing(filepath: &str) -> Result<String> {
Ok(format!("{}", inchasher.finish()))
}

pub fn standard_hashing(filepath: &str) -> Result<String> {
fn standard_hashing(filepath: &str) -> Result<String> {
let file = fs::read(filepath)?;
Ok(hasher(&*file).to_string())
}

pub fn hash_file(filepath: &str) -> Result<String> {
fn hash_file(filepath: &str) -> Result<String> {
let filemeta = fs::metadata(filepath)?;

// NOTE: USE INCREMENTAL HASHING ONLY FOR FILES > 100MB
Expand Down