-
Notifications
You must be signed in to change notification settings - Fork 4
/
dnn_prepare.py
149 lines (140 loc) · 5.53 KB
/
dnn_prepare.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
"""
This script downloads available dataset(s) in MLDS, applies commonly used
transformations for training deep neural networks, and saves them to disk.
"""
import argparse
import mlds
def main(datasets):
    """
    Serves as the main entry point for downloading datasets and preparing
    them for use with deep neural networks. For every dataset, this calls
    the mlds.datasets.process function with: (1) the dataset itself, (2)
    UniformScaler for image-dataset transformations, and MinMaxScaler plus
    OneHotEncoder for other dataset transformations (where appropriate),
    (3) data cleaning for non-image datasets (via the Destupifier), (4) the
    feature groups each transformer applies to, (5) a filename equal to the
    dataset name, and (6) LabelEncoder as the label transformation (where
    appropriate).

    :param datasets: the dataset(s) to download and process
    :type datasets: tuple of mlds.downloader module objects
    """
    transforms = mlds.transformations

    def image_recipe(name):
        # image datasets share one recipe: scale pixels uniformly to
        # [0, 1], skip data cleaning, and keep the labels as-is
        return {
            "data_transforms": (transforms.UniformScaler,),
            "destupefy": False,
            "features": (("all",),),
            "filename": name,
            "label_transform": transforms.IdentityTransformer,
        }

    def tabular_recipe(name, label_transform, categorical=None):
        # tabular datasets min-max scale every feature, one-hot encode any
        # categorical columns, and clean deficiencies via the Destupifier
        data_transforms = (transforms.MinMaxScaler,)
        features = (("all",),)
        if categorical is not None:
            data_transforms += (transforms.OneHotEncoder,)
            features += (categorical,)
        return {
            "data_transforms": data_transforms,
            "destupefy": True,
            "features": features,
            "filename": name,
            "label_transform": label_transform,
        }

    # map each downloader module to its feature-transformation recipe
    transformations = {
        mlds.downloaders.cicmalmem2022: tabular_recipe(
            "cicmalmem2022", transforms.LabelEncoder
        ),
        mlds.downloaders.cifar10: image_recipe("cifar10"),
        mlds.downloaders.fashionmnist: image_recipe("fashionmnist"),
        mlds.downloaders.mnist: image_recipe("mnist"),
        mlds.downloaders.nslkdd: tabular_recipe(
            "nslkdd",
            transforms.LabelEncoder,
            ("protocol_type", "service", "flag"),
        ),
        mlds.downloaders.phishing: tabular_recipe(
            "phishing",
            transforms.IdentityTransformer,
            (
                "SubdomainLevelRT",
                "UrlLengthRT",
                "PctExtResourceUrlsRT",
                "AbnormalExtFormActionR",
                "ExtMetaScriptLinkRT",
                "PctExtNullSelfRedirectHyperlinksRT",
            ),
        ),
        mlds.downloaders.unswnb15: tabular_recipe(
            "unswnb15",
            transforms.LabelEncoder,
            ("proto", "service", "state"),
        ),
    }

    # download and process each requested dataset in turn
    for number, dataset in enumerate(datasets, start=1):
        # the final dotted component of the module path is the dataset name
        name = dataset.__name__.rpartition(".")[2]
        print(f"On dataset {number}/{len(datasets)}: {name}")
        mlds.datasets.process(dataset, **transformations[dataset])
    return None
if __name__ == "__main__":
    """
    This script downloads available dataset(s) in MLDS, applies commonly used
    transformations for deep neural networks, and saves them to disk.
    Specifically, this script: (1) parses command-line arguments, (2) downloads
    dataset(s), (3) applies label encoding to datasets with non-numeric (or
    non-zero-indexed) labels, scales image datasets uniformly to [0, 1] (via
    UniformScaler), while other datasets are scaled to [0, 1] per feature (via
    MinMaxScaler) categorical variables are one-hot encoded (via OneHotEncoder)
    and deficiencies are removed (via Destupifier), and (4) saves the
    transformed dataset(s) to disk as the name of the dataset (which can then
    be subsequently loaded and used in other scripts).
    """

    def resolve_dataset(name):
        """
        Resolves a dataset name into its mlds downloader module.

        Raises argparse.ArgumentTypeError (instead of letting AttributeError
        escape) so argparse prints a clean usage error for unknown names
        rather than an unhandled traceback.

        :param name: name of the dataset to resolve
        :type name: str
        :return: the downloader module for the dataset
        :rtype: module object
        """
        try:
            return getattr(mlds.downloaders, name)
        except AttributeError:
            raise argparse.ArgumentTypeError(
                f"invalid choice: {name!r}"
            ) from None

    # all downloader modules advertised by mlds serve as the default
    datasets = tuple(getattr(mlds.downloaders, d) for d in mlds.downloaders.__all__)
    parser = argparse.ArgumentParser(
        description="Download and transform dataset(s) for deep neural networks"
    )
    parser.add_argument(
        "-d",
        "--datasets",
        choices=datasets,
        default=datasets,
        help="Dataset(s) to download",
        # show readable dataset names instead of module reprs in usage text
        metavar=", ".join(mlds.downloaders.__all__),
        nargs="+",
        type=resolve_dataset,
    )
    args = parser.parse_args()
    main(datasets=tuple(args.datasets))
    raise SystemExit(0)