---
# Main dataset config file
# Avila dataset; data is read from the cleandata repo's offline_data copy.
avilatr:
  name: "Avila"
  # source: https://archive.ics.uci.edu/ml/datasets/Avila
  data_url: "https://raw.githubusercontent.com/simonharris/cleandata/master/offline_data/AvilaTR.csv"
# Blood Transfusion dataset, fetched directly from the UCI archive.
bloodtrans:
  name: "Blood Transfusion"
  data_url: "https://archive.ics.uci.edu/ml/machine-learning-databases/blood-transfusion/transfusion.data"
  # NOTE(review): keys look like pandas.read_csv keyword arguments — confirm
  # against the loader.
  import_args:
    header: 0
    names:
      - "Recency"
      - "Frequency"
      - "Monetary"
      - "Time"
      - "Donated"
  # Refers to one of the column names declared in import_args.names above.
  label_col: "Donated"
# Breast Tissue dataset; data is read from the cleandata repo's offline_data copy.
brtiss:
  name: "Breast Tissue"
  # source: https://archive.ics.uci.edu/ml/datasets/Breast+Tissue
  data_url: "https://raw.githubusercontent.com/simonharris/cleandata/master/offline_data/BreastTissue.csv"
  # NOTE(review): keys look like pandas.read_csv keyword arguments — confirm
  # against the loader.
  import_args:
    header: 0
    index_col: 0
  label_col: "Class"
# Ecoli dataset, fetched directly from the UCI archive.
ecoli:
  name: "Ecoli"
  data_url: "https://archive.ics.uci.edu/ml/machine-learning-databases/ecoli/ecoli.data"
  # NOTE(review): keys look like pandas.read_csv keyword arguments — confirm
  # against the loader.
  import_args:
    # Double-quoted YAML escape: the parsed value is the regex \s+
    sep: "\\s+"
  # NOTE(review): presumably columns removed before export — confirm.
  export_args:
    dropcols:
      - 0
  label_col: 8
# Fossil dataset; data is read from the cleandata repo's offline_data copy.
fossil:
  name: "Fossil"
  # source: by email from Anderson
  data_url: "https://raw.githubusercontent.com/simonharris/cleandata/master/offline_data/ChernoffFossil.csv"
# Glass dataset, fetched directly from the UCI archive.
glass:
  name: "Glass"
  data_url: "https://archive.ics.uci.edu/ml/machine-learning-databases/glass/glass.data"
  # NOTE(review): keys look like pandas.read_csv keyword arguments — confirm
  # against the loader.
  import_args:
    index_col: 0
  label_col: 10
# Haberman dataset, fetched directly from the UCI archive.
haberman:
  name: "Haberman"
  data_url: "https://archive.ics.uci.edu/ml/machine-learning-databases/haberman/haberman.data"
# HTRU2 dataset; data is read from the cleandata repo's offline_data copy.
htru2:
  name: "HTRU2"
  # source: https://archive.ics.uci.edu/ml/datasets/HTRU2
  data_url: "https://raw.githubusercontent.com/simonharris/cleandata/master/offline_data/HTRU2.csv"
# Iris dataset, fetched directly from the UCI archive.
iris:
  name: "Iris"
  data_url: "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
# Leaf dataset; data is read from the cleandata repo's offline_data copy.
leaf:
  name: "Leaf"
  # Nb. the supplied Readme.pdf claims 40 classes, whilst only 30 representing
  # simple leaves are included in the data
  # source: https://archive.ics.uci.edu/ml/datasets/Leaf
  data_url: "https://raw.githubusercontent.com/simonharris/cleandata/master/offline_data/Leaf.csv"
  label_col: 0
# Letter Recognition dataset, fetched directly from the UCI archive.
letterrec:
  name: "Letter Recognition"
  data_url: "https://archive.ics.uci.edu/ml/machine-learning-databases/letter-recognition/letter-recognition.data"
  label_col: 0
# Libras Movement dataset, fetched directly from the UCI archive.
libras:
  name: "Libras Movement"
  data_url: "https://archive.ics.uci.edu/ml/machine-learning-databases/libras/movement_libras.data"
# Disabled entry, kept for reference:
# lungcancer:
#   name: "Lung Cancer"
#   data_url: "https://archive.ics.uci.edu/ml/machine-learning-databases/lung-cancer/lung-cancer.data"
#   import_args:
#     na_values:
#       - "?"
#   label_col: 0
# Musk (Version 1) dataset; data is read from the cleandata repo's offline_data copy.
musk1:
  name: "Musk 1"
  # source: https://archive.ics.uci.edu/ml/datasets/Musk+%28Version+1%29
  data_url: "https://raw.githubusercontent.com/simonharris/cleandata/master/offline_data/Musk1.csv"
  # NOTE(review): keys look like pandas.read_csv keyword arguments — confirm
  # against the loader.
  import_args:
    index_col: 1
  label_col: 168
  # NOTE(review): presumably columns removed before export — confirm.
  export_args:
    dropcols:
      - 0
# Musk (Version 2) dataset; data is read from the cleandata repo's offline_data copy.
musk2:
  name: "Musk 2"
  # source: https://archive.ics.uci.edu/ml/datasets/Musk+%28Version+2%29
  data_url: "https://raw.githubusercontent.com/simonharris/cleandata/master/offline_data/Musk2.csv"
  # NOTE(review): keys look like pandas.read_csv keyword arguments — confirm
  # against the loader.
  import_args:
    index_col: 1
  label_col: 168
  # NOTE(review): presumably columns removed before export — confirm.
  export_args:
    dropcols:
      - 0
# We need labels, so use the training set of 3,823 instances
optdigits:
  name: "Optical Recognition"  # short for "Optical Recognition of Handwritten Digits"
  data_url: "https://archive.ics.uci.edu/ml/machine-learning-databases/optdigits/optdigits.tra"
  export_args:
    # These two are just all zeros
    dropcols:
      - 0
      - 39
# Page Blocks dataset; data is read from the cleandata repo's offline_data copy.
pageblocks:
  name: "Page Blocks"
  # source: https://archive.ics.uci.edu/ml/datasets/Page+Blocks+Classification
  data_url: "https://raw.githubusercontent.com/simonharris/cleandata/master/offline_data/PageBlocks.csv"
# Pen-Based Recognition dataset (the .tra file), fetched directly from the UCI archive.
pendigits:
  name: "Pen-Based Recognition"  # short for "Pen-Based Recognition of Handwritten Digits"
  data_url: "https://archive.ics.uci.edu/ml/machine-learning-databases/pendigits/pendigits.tra"
# Parkinsons dataset, fetched directly from the UCI archive.
parkinsons:
  name: "Parkinsons"
  data_url: "https://archive.ics.uci.edu/ml/machine-learning-databases/parkinsons/parkinsons.data"
  # NOTE(review): keys look like pandas.read_csv keyword arguments — confirm
  # against the loader.
  import_args:
    header: 0
    index_col: 0
  label_col: "status"
# Sonar dataset (sonar.all-data), fetched directly from the UCI archive.
sonar_all:
  name: "Sonar all"
  # Alternative (full) display name, currently disabled:
  # name: "Connectionist Bench (Sonar, Mines vs. Rocks)"
  data_url: "https://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.all-data"
# Spambase dataset, fetched directly from the UCI archive.
spambase:
  name: "Spambase"
  data_url: "https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data"
# Vehicle Silhouettes dataset; data is read from the cleandata repo's offline_data copy.
vehicle:
  name: "Vehicle Silhouettes"
  # Comes split into 9 files, for no obvious reason
  # source: https://archive.ics.uci.edu/ml/datasets/Statlog+%28Vehicle+Silhouettes%29
  data_url: "https://raw.githubusercontent.com/simonharris/cleandata/master/offline_data/Vehicle.csv"
  # NOTE(review): keys look like pandas.read_csv keyword arguments — confirm
  # against the loader.
  import_args:
    # Double-quoted YAML escape: the parsed value is the regex \s+
    sep: "\\s+"
# Vertebral Column dataset; data is read from the cleandata repo's offline_data copy.
vertebral:
  name: "Vertebral Column"
  # Use the 3 cluster set, as per Arbelaitz 2013
  # Source: https://archive.ics.uci.edu/ml/datasets/Vertebral+Column
  data_url: "https://raw.githubusercontent.com/simonharris/cleandata/master/offline_data/Vertebral3C.csv"
  # NOTE(review): keys look like pandas.read_csv keyword arguments — confirm
  # against the loader.
  import_args:
    # Double-quoted YAML escape: the parsed value is the regex \s+
    sep: "\\s+"
# Wisconsin breast cancer data, diagnostic variant (wdbc.data), from the UCI archive.
wbcd:  # diagnostic
  name: "Breast Cancer (Diag.)"
  data_url: "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data"
  # NOTE(review): keys look like pandas.read_csv keyword arguments — confirm
  # against the loader.
  import_args:
    index_col: 0
  label_col: 1
# Wisconsin breast cancer data, original variant, from the UCI archive.
wbco:  # original
  name: "Breast Cancer (Orig.)"
  data_url: "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data"
  # NOTE(review): keys look like pandas.read_csv keyword arguments — confirm
  # against the loader.
  import_args:
    # Quoted so the YAML parser reads a string rather than an indicator.
    na_values:
      - '?'
    index_col: 0
  label_col: 10
# Wine dataset, fetched directly from the UCI archive.
wine:
  name: "Wine"
  data_url: "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data"
  label_col: 0
# Wine Quality (red) dataset, fetched directly from the UCI archive.
wineq_red:
  name: "Wine Quality (Red)"
  data_url: "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
  # NOTE(review): keys look like pandas.read_csv keyword arguments — confirm
  # against the loader.
  import_args:
    header: 0
    sep: ";"
  label_col: "quality"
# Wine Quality (white) dataset, fetched directly from the UCI archive.
wineq_white:
  name: "Wine Quality (White)"
  data_url: "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv"
  # NOTE(review): keys look like pandas.read_csv keyword arguments — confirm
  # against the loader.
  import_args:
    header: 0
    sep: ";"
  label_col: "quality"
# Yeast dataset, fetched directly from the UCI archive.
yeast:
  name: "Yeast"
  data_url: "https://archive.ics.uci.edu/ml/machine-learning-databases/yeast/yeast.data"
  # NOTE(review): keys look like pandas.read_csv keyword arguments — confirm
  # against the loader.
  import_args:
    index_col: 0
    # Double-quoted YAML escape: the parsed value is the regex \s+
    sep: "\\s+"
  label_col: 9