-
Notifications
You must be signed in to change notification settings - Fork 2
/
types.go
276 lines (243 loc) · 9.02 KB
/
types.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
// Package tada (TAble DAta) enables test-driven data pipelines.
//
// tada combines concepts from pandas, spreadsheets, R, Apache Spark, and SQL.
// Its most common use cases are cleaning, aggregating, transforming, and analyzing data.
// Some notable features of tada:
//
// * flexible constructor that supports most primitive data types
//
// * seamlessly handles null data and type conversions
//
// * robust datetime support
//
// * advanced filtering, lookups and merging, grouping, sorting, and pivoting
//
// * multi-level labels and columns
//
// * complete test coverage
//
// * interoperable with existing pandas dataframes via Apache Arrow
//
// The key data types are Series, DataFrames, and groupings of each.
// A Series is analogous to one column of a spreadsheet, and a DataFrame is analogous to a whole spreadsheet.
// Printing either data type will render an ASCII table.
//
// Both Series and DataFrames have one or more "label levels".
// On printing, these appear as the leftmost columns in a table, and typically have values that help identify ("label") specific rows.
// They are analogous to the "index" concept in pandas.
//
// For more detail and implementation notes, see https://docs.google.com/document/d/18DvZzd6Tg6Bz0SX0fY2SrXOjE8d9xDhU6bDEnaIc_rM/
package tada
import (
"time"
)
// valueContainer is the package's internal storage unit: one container of data,
// used both for a single column of values and for a single label level.
type valueContainer struct {
	slice  interface{} // the underlying data; assumed to be a slice of a supported type
	isNull []bool      // per-row null status, positionally aligned with slice
	cache  []string    // NOTE(review): presumably a cached string rendering of slice — confirm at call sites
	name   string      // the container's name (column or label-level name)
}
// A Series is a single column of data with one or more levels of aligned labels.
type Series struct {
	values     *valueContainer   // the single column of values
	labels     []*valueContainer // one or more label levels, row-aligned with values
	sharedData bool              // NOTE(review): presumably true when values/labels alias another Series or DataFrame — confirm
	err        error             // error state carried on the Series instead of panicking; surfaced by methods that inspect it
}
// A SeriesIterator iterates over the rows in a Series.
type SeriesIterator struct {
	current int     // current row position within s
	s       *Series // the Series being iterated over
}
// A SeriesMutator is used to change Series values in place.
type SeriesMutator struct {
	series *Series // the Series to be mutated
}
// A DataFrame is one or more columns of data with one or more levels of aligned labels.
// A DataFrame is analogous to a spreadsheet.
type DataFrame struct {
	labels        []*valueContainer // one or more label levels, row-aligned with values
	values        []*valueContainer // the data columns, one container per column
	name          string            // the DataFrame's name
	err           error             // error state carried on the DataFrame instead of panicking
	colLevelNames []string          // names of the column levels (DataFrames support multi-level columns)
}
// A DataFrameIterator iterates over the rows in a DataFrame.
type DataFrameIterator struct {
	current int        // current row position within df
	df      *DataFrame // the DataFrame being iterated over
}
// A DataFrameMutator is used to change DataFrame values in place.
type DataFrameMutator struct {
	dataframe *DataFrame // the DataFrame to be mutated
}
// A GroupedSeries is a collection of row positions sharing the same group key.
// A GroupedSeries has a reference to an underlying Series, which is used for reduce operations.
type GroupedSeries struct {
	orderedKeys []string          // the group keys, in a stable order; parallel to rowIndices
	rowIndices  [][]int           // for each key, the row positions in series belonging to that group
	labels      []*valueContainer // label levels describing the groups
	series      *Series           // the underlying Series that reduce operations draw from
	aligned     bool              // NOTE(review): presumably true when results should align to the original rows rather than collapse per group — confirm
	err         error             // error state carried on the grouping instead of panicking
}
// GroupedSeriesIterator iterates over all Series in the group.
type GroupedSeriesIterator struct {
	current    int     // current group position within rowIndices
	rowIndices [][]int // for each group, the row positions in s belonging to that group
	s          *Series // the underlying Series from which each group's Series is drawn
}
// A GroupedDataFrame is a collection of row positions sharing the same group key.
// A GroupedDataFrame has a reference to an underlying DataFrame, which is used for reduce operations.
type GroupedDataFrame struct {
	orderedKeys []string          // the group keys, in a stable order; parallel to rowIndices
	rowIndices  [][]int           // for each key, the row positions in df belonging to that group
	labels      []*valueContainer // label levels describing the groups
	df          *DataFrame        // the underlying DataFrame that reduce operations draw from
	aligned     bool              // NOTE(review): presumably mirrors GroupedSeries.aligned — confirm
	err         error             // error state carried on the grouping instead of panicking
}
// GroupedDataFrameIterator iterates over all DataFrames in the group.
type GroupedDataFrameIterator struct {
	current    int        // current group position within rowIndices
	rowIndices [][]int    // for each group, the row positions in df belonging to that group
	df         *DataFrame // the underlying DataFrame from which each group's DataFrame is drawn
}
// Matrix is an interface which is compatible with gonum's mat.Matrix interface.
type Matrix interface {
	// Dims returns the dimensions of the matrix (rows, columns).
	Dims() (r, c int)
	// At returns the value at row i, column j.
	At(i, j int) float64
}
// floatValueContainer is a concretely-typed counterpart of valueContainer
// holding float64 values.
type floatValueContainer struct {
	slice  []float64
	isNull []bool // per-row null status, positionally aligned with slice
	index  []int  // NOTE(review): presumably each value's original row position (used to restore order, e.g. after sorting) — confirm
}
// stringValueContainer is a concretely-typed counterpart of valueContainer
// holding string values.
type stringValueContainer struct {
	slice  []string
	isNull []bool // per-row null status, positionally aligned with slice
	index  []int  // NOTE(review): presumably each value's original row position — confirm
}
// dateTimeValueContainer is a concretely-typed counterpart of valueContainer
// holding time.Time values.
type dateTimeValueContainer struct {
	slice  []time.Time
	isNull []bool // per-row null status, positionally aligned with slice
	index  []int  // NOTE(review): presumably each value's original row position — confirm
}
// A Sorter supplies details to the Sort() function.
// `Name` specifies the container (either label or column name) to sort.
// If `Descending` is true, values are sorted in descending order.
// `DType` specifies the data type to which values will be coerced before they are sorted (default: float64).
// Null values are always sorted to the bottom.
type Sorter struct {
	Name       string
	Descending bool
	DType      DType
}
// An Element is one {value, null status} pair in either a Series or DataFrame.
type Element struct {
	Val    interface{} // the value at this row
	IsNull bool        // whether the value at this row is null
}
// NullFiller fills every null row with a replacement value and changes the row's null status to not-null.
// If multiple fields are provided, resolves in the following order:
// 1) `FillForward` - fills with the last valid value,
// 2) `FillBackward` - fills with the next valid value,
// 3) `FillZero` - fills with the zero type of the slice,
// 4) `FillFloat` - coerces to float64 and fills with the value provided.
type NullFiller struct {
	FillForward  bool
	FillBackward bool
	FillZero     bool
	FillFloat    float64
}
// A FilterFn is an anonymous function supplied to a Filter or Where function.
// The function will be called on every value in the container and should
// return true for the rows to be selected.
type FilterFn func(value interface{}) bool
// An ApplyFn is an anonymous function supplied to an Apply function to convert one slice to another.
// The function input will be a slice, and it must return a slice of equal length (though the type may be different).
// isNull contains the null status of every row in the input slice.
// The null status of a row may be changed by setting that row's isNull element within the function body.
type ApplyFn func(slice interface{}, isNull []bool) (equalLengthSlice interface{})
// A ReduceFn is an anonymous function supplied to a Reduce function
// to reduce a slice of values to one value and one null status per group.
// isNull contains the null status of every value in the group.
type ReduceFn func(slice interface{}, isNull []bool) (value interface{}, null bool)
// DType is a DataType that may be used in Sort() or Cast().
type DType int

// The supported DType values and the Go types they correspond to.
const (
	// Float64 -> float64
	Float64 DType = iota
	// String -> string
	String
	// DateTime -> time.Time
	DateTime // always tz-aware
	// Time -> civil.Time
	Time
	// Date -> civil.Date
	Date
)
// A WriteOption configures a write function.
// Available write options: WriteOptionExcludeLabels, WriteOptionDelimiter.
type WriteOption func(*writeConfig)
// A writeConfig configures a write function.
// All write functions accept zero or more modifiers that alter the default write config, which is:
// include labels, and "," as the field delimiter.
type writeConfig struct {
	includeLabels bool // whether label levels are written alongside the values
	delimiter     rune // the field delimiter (default ',')
}
// A ReadOption configures a read function.
// Available read options: ReadOptionHeaders, ReadOptionLabels, ReadOptionDelimiter, and ReadOptionSwitchDims.
type ReadOption func(*readConfig)
// A readConfig configures a read function.
// All read functions accept zero or more modifiers that alter the default read config, which is:
// 1 header row, 0 label levels, "," as field delimiter, and rows as the major dimension of a nested slice.
type readConfig struct {
	numHeaderRows  int  // number of rows to treat as column headers (default 1)
	numLabelLevels int  // number of leading columns to treat as label levels (default 0)
	delimiter      rune // the field delimiter (default ',')
	majorDimIsCols bool // if true, the major dimension of a nested slice is columns rather than rows
}
// A JoinOption configures a lookup or merge function.
// Available join options: JoinOptionHow, JoinOptionLeftOn, JoinOptionRightOn.
type JoinOption func(*joinConfig)
// A joinConfig configures a lookup or merge function.
// All lookup/merge functions accept zero or more modifiers that alter the default join config, which is:
// left join, no specified join keys (so automatically uses shared label names as keys).
type joinConfig struct {
	how     string   // the join type (default "left")
	leftOn  []string // container names in the left table to join on; empty means use shared label names
	rightOn []string // container names in the right table to join on; empty means use shared label names
}
// Resampler supplies logic for the Resample() function.
// Only the first `By` field that is selected (i.e., not left nil) is used - any others are ignored
// (if `ByWeek` is selected, it may be modified by `StartOfWeek`).
// `ByYear` truncates the timestamp by year.
// `ByMonth` truncates the timestamp by month.
// `ByDay` truncates the timestamp by day.
// `ByWeek` returns the first day of the most recent week (starting on `StartOfWeek`) relative to timestamp.
// Otherwise, truncates the timestamp `ByDuration`.
// If `Location` is not provided, time.UTC is used as the default location.
type Resampler struct {
	ByYear      bool
	ByMonth     bool
	ByDay       bool
	ByWeek      bool
	StartOfWeek time.Weekday  // the first day of the week; applies only when ByWeek is true
	ByDuration  time.Duration // truncation duration; applies only when no By* bool is set
	Location    *time.Location
}
// Binner supplies logic for the Bin() function.
// If `AndLess` is true, a bin is added that ranges between negative infinity and the first bin value.
// If `AndMore` is true, a bin is added that ranges between the last bin value and positive infinity.
// If `Labels` is not nil, then category names correspond to labels, and the number of labels must be one less than the number of bin values.
// Otherwise, category names are auto-generated from the range of the bin intervals.
type Binner struct {
	AndLess bool
	AndMore bool
	Labels  []string
}
// A StructTransposer is a row-oriented representation of a DataFrame
// that can be randomly shuffled or transposed into a column-oriented struct representation of a DataFrame.
// It is useful for intuitive row-oriented testing.
// Each inner slice is one row; each position within an inner slice is one column.
type StructTransposer [][]interface{}