-
Notifications
You must be signed in to change notification settings - Fork 850
/
find_filegroups.py
120 lines (105 loc) · 3.68 KB
/
find_filegroups.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# Sebastian Raschka 2014-2022
# mlxtend Machine Learning Library Extensions
#
# A function for collecting file-group names from local directories.
# Author: Sebastian Raschka <sebastianraschka.com>
#
# License: BSD 3 clause
import os
import re
from . import find_files
def find_filegroups(
    paths,
    substring="",
    extensions=None,
    validity_check=True,
    ignore_invisible=True,
    rstrip="",
    ignore_substring=None,
):
    """Find and collect files from different directories in a python dictionary.

    Parameters
    ----------
    paths : `list`
        Paths of the directories to be searched. Dictionary keys are built
        from the first directory. At least two paths are required.
    substring : `str` (default: '')
        Substring that all files have to contain to be considered.
    extensions : `list` (default: None)
        `None` or `list` of allowed file extensions for each path.
        If provided, the number of extensions must match the number of
        `paths`. Each entry is forwarded to `find_files` as `check_ext` and
        is additionally compared against the result of `os.path.splitext`,
        which includes the leading period (e.g., ".txt").
    validity_check : `bool` (default: True)
        If `True`, checks if all dictionary values
        have the same number of file paths and raises a `ValueError`
        if they do not.
    ignore_invisible : `bool` (default: True)
        If `True`, ignores invisible files
        (i.e., files starting with a period).
    rstrip : `str` (default: '')
        If provided, this suffix is removed (once, and taken literally)
        from the right side of the file base names after splitting off
        the extension.
        Useful to trim different filenames to a common stem.
        E.g., "abc_d.txt" and "abc_d_.csv" would share
        the stem "abc_d" if rstrip is set to "_".
    ignore_substring : `str` (default: None)
        Ignores files that contain the specified substring.

    Returns
    ----------
    groups : `dict`
        Dictionary of files paths. Keys are the file names
        found in the first directory listed
        in `paths` (without file extension). Each value is a list that
        starts with the file from the first directory, followed by the
        matching files from the remaining directories.

    Raises
    ----------
    AssertionError
        If fewer than two `paths` are given, or if `extensions` is provided
        and its length differs from the number of `paths`.
    ValueError
        If `validity_check` is `True` and the groups do not all contain
        the same number of files.

    Examples
    -----------
    For usage examples, please see
    http://rasbt.github.io/mlxtend/user_guide/file_io/find_filegroups/

    """
    n = len(paths)

    # Raised explicitly (instead of via `assert`) so the checks survive
    # `python -O` while callers still see the same exception type.
    if n < 2:
        raise AssertionError("`paths` must contain at least two directories.")
    if extensions:
        # must have same number of paths and extensions
        if len(extensions) != n:
            raise AssertionError(
                "The number of `extensions` must match the number of `paths`."
            )
    else:
        extensions = [""] * n

    def _stem(filepath):
        # Return (group key, extension) for a file path. The suffix is
        # matched literally; splicing it into a regular expression would
        # let characters such as "d", "1" or "." change its meaning, and an
        # empty suffix must leave the name untouched.
        name, ext = os.path.splitext(os.path.basename(filepath))
        if rstrip and name.endswith(rstrip):
            name = name[: -len(rstrip)]
        return name, ext

    # One file listing per directory, in the order the paths were given.
    listings = [
        find_files(
            path=path,
            substring=substring,
            check_ext=ext,
            ignore_invisible=ignore_invisible,
            ignore_substring=ignore_substring,
        )
        for path, ext in zip(paths, extensions)
    ]
    base, rest = listings[0], listings[1:]

    # The first directory defines the set of keys.
    groups = {}
    for f in base:
        groups[_stem(f)[0]] = [f]

    # Attach files from the remaining directories to the existing keys;
    # files whose stem does not occur in the first directory are skipped.
    for wanted_ext, files in zip(extensions[1:], rest):
        for f in files:
            key, ext = _stem(f)
            if key not in groups:
                continue
            if wanted_ext == "" or ext == wanted_ext:
                groups[key].append(f)

    if validity_check:
        sizes = {len(members) for members in groups.values()}
        if len(sizes) > 1:
            raise ValueError(
                "Warning, some keys have more/less values than"
                " others. Set validity_check=False"
                " to ignore this warning."
            )

    return groups