-
Notifications
You must be signed in to change notification settings - Fork 2
/
dataframe.go
2700 lines (2499 loc) · 93.3 KB
/
dataframe.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
package tada
import (
"bytes"
"encoding/csv"
"fmt"
"io"
"io/ioutil"
"math/rand"
"reflect"
"strings"
"unicode"
"github.com/ptiger10/tablediff"
"github.com/ptiger10/tablewriter"
)
// -- CONSTRUCTORS
// NewDataFrame creates a new DataFrame with slices (akin to column values) and optional labels.
// Slices must be comprised of supported slices, and each label must be a supported slice.
//
// If no labels are supplied, a default label level is inserted ([]int incrementing from 0).
// Columns are named sequentially (e.g., 0, 1, etc) by default. Default column names are displayed on printing.
// Label levels are named *n (e.g., *0, *1, etc) by default. Default label names are hidden on printing.
//
// Supported slice types: all variants of []float, []int, & []uint,
// []string, []bool, []time.Time, []interface{},
// and 2-dimensional variants of each (e.g., [][]string, [][]float64).
func NewDataFrame(slices []interface{}, labels ...interface{}) *DataFrame {
	if slices == nil && labels == nil {
		return dataFrameWithError(fmt.Errorf("constructing new DataFrame: slices and labels cannot both be nil"))
	}
	values := make([]*valueContainer, 0)
	var err error
	if slices != nil {
		// handle values
		values, err = makeValueContainersFromInterfaces(slices, false)
		if err != nil {
			return dataFrameWithError(fmt.Errorf("constructing new DataFrame: slices: %v", err))
		}
	}
	// handle labels
	retLabels, err := makeValueContainersFromInterfaces(labels, true)
	if err != nil {
		return dataFrameWithError(fmt.Errorf("constructing new DataFrame: labels: %v", err))
	}
	if len(retLabels) == 0 {
		// handle default labels case: derive row count from the first slice.
		// Guard against a non-nil but empty slices argument, which previously
		// panicked with an index-out-of-range on slices[0].
		if len(slices) == 0 {
			return dataFrameWithError(fmt.Errorf("constructing new DataFrame: slices and labels cannot both be empty"))
		}
		numRows := reflect.ValueOf(slices[0]).Len()
		defaultLabels := makeDefaultLabels(0, numRows, true)
		retLabels = append(retLabels, defaultLabels)
	}
	// ensure equal-lengthed slices: all labels and columns must share one length
	var requiredLength int
	if len(values) > 0 {
		requiredLength = values[0].len()
	} else {
		// handle null values case: no columns, so labels set the length
		requiredLength = retLabels[0].len()
	}
	err = ensureEqualLengths(retLabels, requiredLength)
	if err != nil {
		return dataFrameWithError(fmt.Errorf("constructing new DataFrame: labels: %v", err))
	}
	if len(values) > 0 {
		err = ensureEqualLengths(values, requiredLength)
		if err != nil {
			return dataFrameWithError(fmt.Errorf("constructing new DataFrame: columns: %v", err))
		}
	}
	return &DataFrame{values: values, labels: retLabels, colLevelNames: []string{"*0"}}
}
// MakeMultiLevelLabels expects labels to be a slice of slices.
// It returns the cartesian product of these slices: each value in level k is
// repeated once for every combination of values in the other levels, so that
// every row of the returned levels is a unique combination.
//
// For example, [["foo", "bar"], [1, 2, 3]]
// returns [["foo", "foo", "foo", "bar", "bar", "bar"], [1, 2, 3, 1, 2, 3]].
//
// Returns an error if any element of labels is not a slice.
func MakeMultiLevelLabels(labels []interface{}) ([]interface{}, error) {
	// validate that every input is a slice
	for k := range labels {
		if reflect.ValueOf(labels[k]).Kind() != reflect.Slice {
			return nil, fmt.Errorf("constructing multi level labels: position %d: must be slice", k)
		}
	}
	// total rows in the product = product of all level lengths
	numNewRows := 1
	for k := range labels {
		numNewRows *= reflect.ValueOf(labels[k]).Len()
	}
	ret := make([]interface{}, len(labels))
	// For level k, each value is repeated (product of lengths of later levels)
	// times, and the resulting block is tiled (product of lengths of earlier
	// levels) times. The prior implementation tiled every non-first level as a
	// whole block, which produced an incorrect product for 3+ levels.
	repeatEach := numNewRows
	for k := range labels {
		v := reflect.ValueOf(labels[k])
		newValues := reflect.MakeSlice(v.Type(), numNewRows, numNewRows)
		repeatEach /= v.Len()
		numBlocks := numNewRows / (repeatEach * v.Len())
		offset := 0
		for b := 0; b < numBlocks; b++ {
			for i := 0; i < v.Len(); i++ {
				for j := 0; j < repeatEach; j++ {
					newValues.Index(offset).Set(v.Index(i))
					offset++
				}
			}
		}
		ret[k] = newValues.Interface()
	}
	return ret, nil
}
// Copy returns a new DataFrame with identical values as the original but no shared objects
// (i.e., all internals are newly allocated).
func (df *DataFrame) Copy() *DataFrame {
	levelNames := append(make([]string, 0, len(df.colLevelNames)), df.colLevelNames...)
	return &DataFrame{
		values:        copyContainers(df.values),
		labels:        copyContainers(df.labels),
		err:           df.err,
		colLevelNames: levelNames,
		name:          df.name,
	}
}
// ConcatSeries merges multiple Series from left-to-right, one after the other, via left joins on shared keys.
// For advanced cases, use df.LookupAdvanced() + df.WithCol().
func ConcatSeries(series ...*Series) (*DataFrame, error) {
	var ret *DataFrame
	for i, s := range series {
		// the first Series seeds the result; each subsequent one is merged in
		if i == 0 {
			ret = s.DataFrame()
			continue
		}
		merged, err := ret.Merge(s.DataFrame())
		if err != nil {
			return nil, fmt.Errorf("concatenating Series: %v", err)
		}
		ret = merged
	}
	return ret, nil
}
// Cast coerces the underlying container values (column or label level) to
// []float64, []string, []time.Time (aka timezone-aware DateTime), []civil.Date, or []civil.Time
// and caches the []byte values of the container (if inexpensive).
// Use cast to improve performance when calling multiple operations on values.
//
// If any container name in containerAsType is not found, the DataFrame is
// reset with an error and casting stops.
func (df *DataFrame) Cast(containerAsType map[string]DType) {
	// NOTE(review): this append may share backing storage with df.labels when
	// capacity allows; containers are pointers, so casting mutates in place
	// either way — confirm against the pattern used elsewhere in this file.
	mergedLabelsAndCols := append(df.labels, df.values...)
	for name, dtype := range containerAsType {
		index, err := indexOfContainer(name, mergedLabelsAndCols)
		if err != nil {
			df.resetWithError(fmt.Errorf("type casting: %v", err))
			return
		}
		mergedLabelsAndCols[index].cast(dtype)
	}
	// removed: redundant bare `return` at end of void function (staticcheck S1023)
}
// -- READERS
// ReadOptionHeaders configures a read function to expect n rows to be column headers (default: 1).
func ReadOptionHeaders(n int) func(*readConfig) {
	return func(cfg *readConfig) {
		cfg.numHeaderRows = n
	}
}
// ReadOptionLabels configures a read function to expect the first n columns to be label levels (default: 0).
func ReadOptionLabels(n int) func(*readConfig) {
	return func(cfg *readConfig) {
		cfg.numLabelLevels = n
	}
}
// ReadOptionDelimiter configures a read function to use sep as a field delimiter for use in ReadCSV (default: ",").
func ReadOptionDelimiter(sep rune) func(*readConfig) {
	return func(cfg *readConfig) {
		cfg.delimiter = sep
	}
}
// ReadOptionSwitchDims configures a read function to expect columns to be the major dimension of csv data
// (default: expects rows to be the major dimension).
// For example, when reading this data:
//
// [["foo", "bar"], ["baz", "qux"]]
//
// default             ReadOptionSwitchDims()
// (major dimension: rows)  (major dimension: columns)
// foo bar                  foo baz
// baz qux                  bar qux
func ReadOptionSwitchDims() func(*readConfig) {
	return func(cfg *readConfig) {
		cfg.majorDimIsCols = true
	}
}
// ReadCSV reads csv records in r into a DataFrame (configured by options).
// Rows must be the major dimension of r.
// For advanced cases, use the standard csv library NewReader().ReadAll() + tada.ReadCSVFromRecords().
// Available options: ReadOptionHeaders, ReadOptionLabels, ReadOptionDelimiter.
//
// Default if no options are supplied:
// 1 header row; no labels; field delimiter is ","
//
// If no labels are supplied, a default label level is inserted ([]int incrementing from 0).
// If no headers are supplied, a default level of sequential column names (e.g., 0, 1, etc) is used. Default column names are displayed on printing.
// Label levels are named *i (e.g., *0, *1, etc) by default when first created. Default label names are hidden on printing.
func ReadCSV(r io.Reader, options ...ReadOption) (*DataFrame, error) {
	config := setReadConfig(options)
	// NOTE(review): ioutil.ReadAll is deprecated since Go 1.16 in favor of
	// io.ReadAll; kept here so the file-level io/ioutil import stays in use.
	b, err := ioutil.ReadAll(r)
	if err != nil {
		return nil, fmt.Errorf("reading csv: %v", err)
	}
	// first pass over the bytes: measure the row/column dimensions
	numRows, numCols, err := extractCSVDimensions(b, config.delimiter)
	if err != nil {
		return nil, fmt.Errorf("reading csv: %v", err)
	}
	retVals := makeStringMatrix(numCols, numRows)
	retNulls := makeBoolMatrix(numCols, numRows)
	// second pass: populate the pre-sized value and null matrices
	data := bytes.NewReader(b)
	err = readCSVBytes(data, retVals, retNulls, config.delimiter)
	if err != nil {
		return nil, fmt.Errorf("reading csv: %v", err)
	}
	return makeDataFrameFromMatrices(retVals, retNulls, config), nil
}
// ReadCSVFromRecords reads records into a DataFrame (configured by options).
// Often used with encoding/csv.NewReader().ReadAll()
// All columns will be read as []string.
// Available options: ReadOptionHeaders, ReadOptionLabels, ReadOptionSwitchDims.
//
// Default if no options are supplied:
// 1 header row; no labels; rows as major dimension
//
// If no labels are supplied, a default label level is inserted ([]int incrementing from 0).
// If no headers are supplied, a default level of sequential column names (e.g., 0, 1, etc) is used. Default column names are displayed on printing.
// Label levels are named *i (e.g., *0, *1, etc) by default when first created. Default label names are hidden on printing.
func ReadCSVFromRecords(records [][]string, options ...ReadOption) (ret *DataFrame, err error) {
	if len(records) == 0 {
		return nil, fmt.Errorf("reading csv from records: must have at least one record")
	}
	if len(records[0]) == 0 {
		return nil, fmt.Errorf("reading csv from records: first record cannot be empty")
	}
	config := setReadConfig(options)
	// dispatch on the major dimension of the input records
	switch {
	case config.majorDimIsCols:
		ret, err = readCSVByCols(records, config)
	default:
		ret, err = readCSVByRows(records, config)
	}
	if err != nil {
		return nil, fmt.Errorf("reading csv from records: %v", err)
	}
	return ret, nil
}
// ReadInterfaceRecords reads records into a DataFrame (configured by options).
// All columns will be read as []interface{}.
// Available options: ReadOptionHeaders, ReadOptionLabels, ReadOptionSwitchDims.
//
// Default if no options are supplied:
// 1 header row; no labels; rows as major dimension
//
// If no labels are supplied, a default label level is inserted ([]int incrementing from 0).
// If no headers are supplied, a default level of sequential column names (e.g., 0, 1, etc) is used. Default column names are displayed on printing.
// Label levels are named *i (e.g., *0, *1, etc) by default when first created. Default label names are hidden on printing.
func ReadInterfaceRecords(records [][]interface{}, options ...ReadOption) (ret *DataFrame, err error) {
	if len(records) == 0 {
		return nil, fmt.Errorf("reading records from [][]interface{}: must have at least one record")
	}
	if len(records[0]) == 0 {
		return nil, fmt.Errorf("reading records from [][]interface{}: first record cannot be empty")
	}
	config := setReadConfig(options)
	// normalize the input so that slices is column-major regardless of the
	// major dimension of records
	var slices []interface{}
	if !config.majorDimIsCols {
		slices, err = readNestedInterfaceByRows(records)
	} else {
		slices, err = readNestedInterfaceByCols(records)
	}
	if err != nil {
		return nil, fmt.Errorf("reading records from [][]interface{}: %v", err)
	}
	// the first numLabelLevels containers are labels; the rest are columns
	numCols := len(slices) - config.numLabelLevels
	labelNames := make([]string, config.numLabelLevels)
	colNames := make([]string, numCols)
	// iterate over all containers to get header names
	for j := 0; j < config.numLabelLevels; j++ {
		// write label headers, no offset; multi-row headers are joined into
		// one name with the level separator
		fields := make([]string, config.numHeaderRows)
		for i := range fields {
			fields[i] = fmt.Sprint(slices[j].([]interface{})[i])
		}
		labelNames[j] = strings.Join(fields, optionLevelSeparator)
		// remove label headers from input so only data rows remain
		slices[j] = slices[j].([]interface{})[config.numHeaderRows:]
	}
	for k := 0; k < numCols; k++ {
		// write col headers, offset for label cols
		offsetFromLabelCols := k + config.numLabelLevels
		fields := make([]string, config.numHeaderRows)
		for i := range fields {
			fields[i] = fmt.Sprint(slices[offsetFromLabelCols].([]interface{})[i])
		}
		colNames[k] = strings.Join(fields, optionLevelSeparator)
		// remove column headers from input
		slices[offsetFromLabelCols] = slices[offsetFromLabelCols].([]interface{})[config.numHeaderRows:]
	}
	// split the normalized containers into labels and values
	labels := slices[:config.numLabelLevels]
	slices = slices[config.numLabelLevels:]
	if len(labels) > 0 {
		ret = NewDataFrame(slices, labels...)
		if ret.err != nil {
			return nil, fmt.Errorf("reading records from [][]interface{}: %v", ret.err)
		}
		ret = ret.SetLabelNames(labelNames).SetColNames(colNames)
	} else {
		// no labels: NewDataFrame inserts a default label level
		ret = NewDataFrame(slices)
		if ret.err != nil {
			return nil, fmt.Errorf("reading records from [][]interface{}: %v", ret.err)
		}
		// only overwrite default column names if headers were actually read
		if config.numHeaderRows > 0 {
			ret = ret.SetColNames(colNames)
		}
	}
	return ret, nil
}
// ReadStruct reads the exported fields in strct into a DataFrame.
// strct must be a struct or pointer to a struct.
// If any exported field in strct is nil, returns an error.
//
// If a "tada" tag is present with the value "isNull", this field must be [][]bool with one equal-lengthed slice for each exported field.
// These values will set the null status for each of the resulting value containers in the DataFrame, from left-to-right.
// If a "tada" tag has any other value, the resulting value container will have the same name as the tag value.
// Otherwise, the value container will have the same name as the exported field.
func ReadStruct(strct interface{}, options ...ReadOption) (*DataFrame, error) {
	config := setReadConfig(options)
	// dereference a pointer so the reflection below always sees a struct value
	if reflect.TypeOf(strct).Kind() == reflect.Ptr {
		strct = reflect.ValueOf(strct).Elem().Interface()
	}
	if reflect.TypeOf(strct).Kind() != reflect.Struct {
		return nil, fmt.Errorf("reading struct: strct must be reflect.Kind struct, not %s",
			reflect.TypeOf(strct).Kind())
	}
	labels := make([]interface{}, 0)
	values := make([]interface{}, 0)
	labelNames := make([]string, 0)
	colNames := make([]string, 0)
	v := reflect.ValueOf(strct)
	var hasNullTag bool
	var nullField string
	nullTag := "isNull"
	// offset tracks skipped fields (unexported or null-tagged) so that
	// k+offset maps a struct field index to its resulting container position
	var offset int
	for k := 0; k < v.NumField(); k++ {
		field := reflect.TypeOf(strct).Field(k)
		// is unexported field? (first rune lowercase) skip it
		if unicode.IsLower([]rune(field.Name)[0]) {
			offset--
			continue
		}
		// has null tag? remember it for later; it carries null statuses, not data
		if field.Tag.Get("tada") == nullTag {
			offset--
			if field.Type.String() != "[][]bool" {
				return nil, fmt.Errorf("reading struct: field with tag %v must be type [][]bool, not %s",
					nullTag, field.Type.String())
			}
			hasNullTag = true
			nullField = field.Name
			continue
		}
		// is nil? (IsZero also rejects zero-valued non-nil fields — the error
		// message only mentions nil; NOTE(review): confirm intended behavior)
		if v.Field(k).IsZero() {
			return nil, fmt.Errorf("reading struct: field %s: strct cannot contain a nil exported field",
				field.Name)
		}
		container := k + offset
		var name string
		// check tada tag first, then default to exported name
		if name = field.Tag.Get("tada"); name == "" {
			name = field.Name
		}
		// write to label level if this container position falls within
		// the configured number of label levels
		if container < config.numLabelLevels {
			labelNames = append(labelNames, name)
			labels = append(labels, v.Field(k).Interface())
			// write to column
		} else {
			colNames = append(colNames, name)
			values = append(values, v.Field(k).Interface())
		}
	}
	df := NewDataFrame(values, labels...)
	if df.err != nil {
		return nil, fmt.Errorf("reading struct as schema: %v", df.err)
	}
	// not default labels? apply label names
	if config.numLabelLevels > 0 {
		df = df.SetLabelNames(labelNames)
	}
	df = df.SetColNames(colNames)
	if hasNullTag {
		var min int
		// default labels? do not change nulls on the auto-inserted label level
		if config.numLabelLevels == 0 {
			min = 1
		}
		containers := makeIntRange(min, df.NumLevels()+df.NumColumns())
		nullTable := v.FieldByName(nullField).Interface().([][]bool)
		if len(nullTable) > 0 {
			// apply each null slice to its container, left-to-right
			for incrementor, k := range containers {
				err := df.SetNulls(k, nullTable[incrementor])
				if err != nil {
					return nil, fmt.Errorf("reading struct: writing nulls: position %d: %v", incrementor, err)
				}
			}
		}
	}
	return df, nil
}
// ReadMatrix reads data satisfying the gonum Matrix interface into a DataFrame.
// Panics if any slices in the matrix are shorter than the first slice.
func ReadMatrix(mat Matrix) *DataFrame {
	numRows, numCols := mat.Dims()
	// build one float64 column per matrix column (major dimension: columns)
	data := make([]interface{}, numCols)
	for j := 0; j < numCols; j++ {
		col := make([]float64, numRows)
		for i := range col {
			col[i] = mat.At(i, j)
		}
		data[j] = col
	}
	return NewDataFrame(data)
}
// ReadStructSlice reads a slice of structs into a DataFrame with field names converted to column names,
// field values converted to column values, and default labels. The structs must all be of the same type.
//
// A default label level named *0 is inserted ([]int incrementing from 0). Default label names are hidden on printing.
func ReadStructSlice(slice interface{}) (*DataFrame, error) {
	values, err := readStruct(slice)
	if err != nil {
		return nil, fmt.Errorf("reading struct slice: %v", err)
	}
	numRows := reflect.ValueOf(slice).Len()
	return &DataFrame{
		values:        values,
		labels:        []*valueContainer{makeDefaultLabels(0, numRows, true)},
		colLevelNames: []string{"*0"},
	}, nil
}
// Series converts a single-columned DataFrame to a Series that shares the same underlying values and labels.
func (df *DataFrame) Series() *Series {
	if len(df.values) != 1 {
		return seriesWithError(fmt.Errorf("converting to Series: DataFrame must have a single column"))
	}
	// sharedData flags that values/labels are not copied
	s := &Series{
		values:     df.values[0],
		labels:     df.labels,
		sharedData: true,
	}
	return s
}
// EqualsCSV reads want (configured by wantOptions) into a dataframe,
// converts both df and want into [][]string records,
// and evaluates whether the stringified values match.
// If they do not match, returns a tablediff.Differences object that can be printed to isolate their differences.
//
// If includeLabels is true, then df's labels are included as columns.
func (df *DataFrame) EqualsCSV(includeLabels bool, want io.Reader, wantOptions ...ReadOption) (bool, *tablediff.Differences, error) {
	config := setReadConfig(wantOptions)
	wantFrame, err := ReadCSV(want, wantOptions...)
	if err != nil {
		return false, nil, fmt.Errorf("comparing csv: reading want: %v", err)
	}
	gotRecords := df.CSVRecords(writeOptionIncludeLabels(includeLabels))
	// if wantFrame was given default labels, exclude them from the comparison
	wantRecords := wantFrame.CSVRecords(writeOptionIncludeLabels(config.numLabelLevels > 0))
	diffs, eq := tablediff.Diff(gotRecords, wantRecords)
	return eq, diffs, nil
}
// -- WRITERS
// WriteOptionExcludeLabels excludes the label levels from the output.
func WriteOptionExcludeLabels() func(*writeConfig) {
	return func(cfg *writeConfig) {
		cfg.includeLabels = false
	}
}
// writeOptionIncludeLabels sets label inclusion explicitly; for internal use.
func writeOptionIncludeLabels(set bool) func(w *writeConfig) {
	return func(cfg *writeConfig) {
		cfg.includeLabels = set
	}
}
// WriteOptionDelimiter configures a write function to use sep as a field delimiter for use in write functions (default: ",").
func WriteOptionDelimiter(sep rune) func(*writeConfig) {
	return func(cfg *writeConfig) {
		cfg.delimiter = sep
	}
}
// InterfaceRecords writes a DataFrame to a [][]interface{} with columns as the major dimension.
// Null values are replaced with "(null)".
func (df *DataFrame) InterfaceRecords(options ...WriteOption) [][]interface{} {
	config := setWriteConfig(options)
	// labels are prepended as leading containers only when requested
	containers := df.values
	if config.includeLabels {
		containers = append(df.labels, df.values...)
	}
	ret := make([][]interface{}, len(containers))
	for i, container := range containers {
		ret[i] = container.interfaceSlice(true)
	}
	return ret
}
// CSVRecords writes a DataFrame to a [][]string with rows as the major dimension.
// Null values are replaced with "(null)".
// Returns nil if the DataFrame cannot be converted to csv rows.
func (df *DataFrame) CSVRecords(options ...WriteOption) [][]string {
	config := setWriteConfig(options)
	transposedStringValues, err := df.toCSVByRows(config.includeLabels)
	if err != nil {
		return nil
	}
	mergedLabelsAndCols := df.values
	if config.includeLabels {
		mergedLabelsAndCols = append(df.labels, df.values...)
	}
	// overwrite null values, skipping headers:
	// i indexes data rows (0-based, excluding the header rows), so the write
	// target is row i+numColLevels; the inner range over transposedStringValues[i]
	// is used only for its column count, which is the same for every row
	for i := range transposedStringValues[df.numColLevels():] {
		for k := range transposedStringValues[i] {
			// isNull[i] is the null status of data row i in container k
			if mergedLabelsAndCols[k].isNull[i] {
				transposedStringValues[i+df.numColLevels()][k] = optionsNullPrinter
			}
		}
	}
	return transposedStringValues
}
// Struct writes the values of the df containers into structPointer.
// Returns an error if df does not contain, from left-to-right, the same container names and types
// as the exported fields that appear, from top-to-bottom, in structPointer.
// Exported struct fields must be types that are supported by NewDataFrame().
// If a "tada" tag is present with the value "isNull", this field must be [][]bool.
// The null status of each value container in the DataFrame, from left-to-right, will be written into this field in equal-lengthed slices.
// If df contains additional containers beyond those in structPointer, those are ignored.
func (df *DataFrame) Struct(structPointer interface{}, options ...WriteOption) error {
	config := setWriteConfig(options)
	if reflect.TypeOf(structPointer).Kind() != reflect.Ptr {
		return fmt.Errorf("writing to struct: structPointer must be pointer to struct, not %s", reflect.TypeOf(structPointer).Kind())
	}
	if reflect.TypeOf(structPointer).Elem().Kind() != reflect.Struct {
		return fmt.Errorf("writing to struct: structPointer must be pointer to struct, not to %s", reflect.TypeOf(structPointer).Elem().Kind())
	}
	v := reflect.ValueOf(structPointer).Elem()
	// containers to copy from: labels first (if requested), then columns
	var mergedLabelsAndCols []*valueContainer
	if config.includeLabels {
		mergedLabelsAndCols = append(df.labels, df.values...)
	} else {
		mergedLabelsAndCols = df.values
	}
	// offset tracks skipped fields (unexported or null-tagged) so that
	// k+offset maps struct field index k to its source container position
	var offset int
	var hasNullTag bool
	var nullField string
	nullTag := "isNull"
	for k := 0; k < v.NumField(); k++ {
		field := reflect.TypeOf(structPointer).Elem().Field(k)
		// is unexported field? (first rune lowercase) skip it
		if unicode.IsLower([]rune(field.Name)[0]) {
			offset--
			continue
		}
		tag := field.Tag.Get("tada")
		// has null tag? remember it; null statuses are written after the loop
		if tag == nullTag {
			offset--
			if field.Type.String() != "[][]bool" {
				return fmt.Errorf("writing to struct: field with tag %v must be type [][]bool, not %s", nullTag, field.Type.String())
			}
			hasNullTag = true
			nullField = field.Name
			continue
		}
		container := k + offset
		// df does not have enough containers?
		if container >= len(mergedLabelsAndCols) {
			return fmt.Errorf("writing to struct: writing to exported field %s [%d]: insufficient number of containers [%d]",
				field.Name, container, len(mergedLabelsAndCols))
		}
		// use tag as name if it exists, else default to exported name
		name := tag
		if tag == "" {
			name = field.Name
		}
		// container name and slice type must both match the struct field
		if mergedLabelsAndCols[container].name != name {
			return fmt.Errorf("writing to struct: writing to exported field %s [%d]: container name does not match (%s != %s)",
				field.Name, container,
				mergedLabelsAndCols[container].name, name)
		}
		if mergedLabelsAndCols[container].dtype() != field.Type {
			return fmt.Errorf("writing to struct: writing to exported field %s [%d]: container %s has wrong type (%s != %s)",
				field.Name, container, mergedLabelsAndCols[container].name,
				mergedLabelsAndCols[container].dtype(), field.Type)
		}
		// NOTE: the underlying slice is assigned, not copied — the struct
		// field shares backing storage with the DataFrame container
		src := reflect.ValueOf(mergedLabelsAndCols[container].slice)
		dst := v.FieldByName(field.Name)
		dst.Set(src)
	}
	if hasNullTag {
		// copiedFields = exported, non-null-tag fields processed above
		copiedFields := v.NumField() + offset
		nullTable := make([][]bool, copiedFields)
		for k := 0; k < copiedFields; k++ {
			nullTable[k] = mergedLabelsAndCols[k].isNull
		}
		src := reflect.ValueOf(nullTable).Interface()
		dst := v.FieldByName(nullField)
		dst.Set(reflect.ValueOf(src))
	}
	return nil
}
// WriteCSV converts a DataFrame to a csv with rows as the major dimension,
// and writes the output to w.
// Null values are replaced with "(null)".
func (df *DataFrame) WriteCSV(w io.Writer, options ...WriteOption) error {
	config := setWriteConfig(options)
	ret := df.CSVRecords(writeOptionIncludeLabels(config.includeLabels))
	var b bytes.Buffer
	cw := csv.NewWriter(&b)
	cw.Comma = config.delimiter
	// WriteAll flushes and reports any write error; previously this error was
	// silently discarded
	if err := cw.WriteAll(ret); err != nil {
		return fmt.Errorf("writing csv: %v", err)
	}
	_, err := w.Write(b.Bytes())
	return err
}
// WriteMockCSV reads r (configured by options) and writes n mock rows to w,
// with column names and types inferred based on the data in src.
// Regardless of the major dimension of src, the major dimension of the output is rows.
// Available options: ReadOptionHeaders, ReadOptionLabels, ReadOptionSwitchDims.
//
// Default if no options are supplied:
// 1 header row, no labels, rows as major dimension
func WriteMockCSV(w io.Writer, n int, r io.Reader, options ...ReadOption) error {
	config := setReadConfig(options)
	// up to numSampleRows data rows are inspected to infer each column's type
	numSampleRows := 10
	inferredTypes := make([]map[string]int, 0)
	dtypes := []string{"float", "int", "string", "datetime", "time", "bool"}
	var headers [][]string
	var rowCount int
	data, err := ReadCSV(r, options...)
	if err != nil {
		return fmt.Errorf("writing mock csv: reading r: %v", err)
	}
	// data has default labels? exclude them
	src := data.CSVRecords(writeOptionIncludeLabels(config.numLabelLevels > 0))
	// rowCount is measured along the major dimension of src
	if !config.majorDimIsCols {
		rowCount = len(src)
	} else {
		rowCount = len(src[0])
	}
	// numSampleRows must not exceed total number of non-header rows in src
	maxRows := rowCount - config.numHeaderRows
	if maxRows < numSampleRows {
		numSampleRows = maxRows
	}
	// major dimension is rows?
	if !config.majorDimIsCols {
		// copy headers
		for i := 0; i < config.numHeaderRows; i++ {
			headers = append(headers, src[i])
		}
		// prepare one inferredTypes map per column, seeded with a zero count
		// for every candidate dtype
		for range src[0] {
			emptyMap := map[string]int{}
			for _, dtype := range dtypes {
				emptyMap[dtype] = 0
			}
			inferredTypes = append(inferredTypes, emptyMap)
		}
		// for each row, infer type column-by-column
		// offset data sample by header rows
		dataSample := src[config.numHeaderRows : numSampleRows+config.numHeaderRows]
		for i := range dataSample {
			for k := range dataSample[i] {
				value := dataSample[i][k]
				dtype := inferType(value)
				inferredTypes[k][dtype]++
			}
		}
		// major dimension is columns?
	} else {
		// prepare one inferredTypes map per column
		for range src {
			emptyMap := map[string]int{}
			for _, dtype := range dtypes {
				emptyMap[dtype] = 0
			}
			inferredTypes = append(inferredTypes, emptyMap)
		}
		// copy headers, transposing so the output headers are row-major
		headers = make([][]string, 0)
		for l := 0; l < config.numHeaderRows; l++ {
			headers = append(headers, make([]string, len(src)))
			for k := range src {
				// NB: major dimension of output is rows
				headers[l][k] = src[k][l]
			}
		}
		// for each column, infer type row-by-row
		for k := range src {
			// offset by header rows
			// infer type of only the sample rows
			dataSample := src[k][config.numHeaderRows : numSampleRows+config.numHeaderRows]
			for i := range dataSample {
				dtype := inferType(dataSample[i])
				inferredTypes[k][dtype]++
			}
		}
	}
	// major dimension of output is rows, for compatibility with csv.NewWriter
	mockCSV := mockCSVFromDTypes(inferredTypes, n)
	mockCSV = append(headers, mockCSV...)
	writer := csv.NewWriter(w)
	return writer.WriteAll(mockCSV)
}
// -- GETTERS
// String prints the DataFrame in table form, with the number of rows constrained by optionMaxRows,
// and the number of columns constrained by optionMaxColumns,
// which may be configured with PrintOptionMaxRows(n) and PrintOptionMaxColumns(n), respectively.
// By default, repeated values are merged together, but this behavior may be disabled with PrintOptionAutoMerge(false).
// By default, overly-wide non-header cells are truncated, but this behavior may be changed to wrapping with PrintOptionWrapLines(true).
func (df *DataFrame) String() string {
	// an errored DataFrame prints its error instead of a table
	if df.err != nil {
		return fmt.Sprintf("Error: %v", df.err)
	}
	var data [][]string
	if df.Len() <= optionMaxRows {
		data = df.CSVRecords()
	} else {
		// truncate rows: keep the first and last n rows with one "..." filler
		// row between them (bottomHalf drops its header rows)
		n := optionMaxRows / 2
		topHalf := df.Head(n).CSVRecords()
		bottomHalf := df.Tail(n).CSVRecords()[df.numColLevels():]
		filler := make([]string, df.NumLevels()+df.NumColumns())
		for k := range filler {
			filler[k] = "..."
		}
		data = append(
			append(topHalf, filler),
			bottomHalf...)
	}
	// do not print *0-type label names: blank out default names in the last
	// header row, where label names appear
	for j := 0; j < df.NumLevels(); j++ {
		row := df.numColLevels() - 1
		data[row][j] = suppressDefaultName(data[row][j])
	}
	// truncate columns: keep label columns plus the first and last n value
	// columns, separated by a "..." filler column
	if df.NumColumns() >= optionMaxColumns {
		n := (optionMaxColumns / 2)
		for i := range data {
			labels := data[i][:df.NumLevels()]
			leftHalf := data[i][df.NumLevels() : n+df.NumLevels()]
			filler := "..."
			rightHalf := data[i][df.NumLevels()+df.NumColumns()-n:]
			data[i] = append(
				append(
					labels,
					append(leftHalf, filler)...),
				rightHalf...)
		}
	}
	// truncate cells that exceed the configured max width, replacing the tail
	// with "..." (rune-aware, so multibyte characters are not split)
	if defaultMaxCellWidth() != optionMaxCellWidth {
		for i := range data {
			for k := range data[i] {
				if r := []rune(data[i][k]); len(r) > optionMaxCellWidth {
					data[i][k] = string(r[:optionMaxCellWidth-3]) + "..."
				}
			}
		}
	}
	// create table
	var buf bytes.Buffer
	table := tablewriter.NewTable(&buf)
	// configure table
	if optionMergeRepeats {
		table.MergeRepeats()
	}
	if !optionWrapLines {
		table.TruncateWideCells()
	}
	table.SetAlignment(tablewriter.AlignRight)
	table.SetLabelLevelCount(df.NumLevels())
	// write headers and rows
	for l := 0; l < df.numColLevels(); l++ {
		table.AppendHeaderRow(data[l])
	}
	table.AppendRows(data[df.numColLevels():])
	table.Render()
	ret := string(buf.Bytes())
	// append optional caption
	if df.name != "" {
		ret += fmt.Sprintf("name: %v\n", df.name)
	}
	return ret
}
// At returns the Element at the row and column index positions.
// If row or column is out of range (including negative values), returns nil.
func (df *DataFrame) At(row, column int) *Element {
	// previously a negative row or column panicked instead of returning nil,
	// contradicting the documented contract
	if row < 0 || row >= df.Len() {
		return nil
	}
	if column < 0 || column >= df.NumColumns() {
		return nil
	}
	v := reflect.ValueOf(df.values[column].slice)
	return &Element{
		Val:    v.Index(row).Interface(),
		IsNull: df.values[column].isNull[row],
	}
}
// Len returns the number of rows in each column of the DataFrame.
// A DataFrame with no columns has a length of 0.
func (df *DataFrame) Len() int {
	// guard against a column-less frame; indexing df.values[0] would panic
	if len(df.values) == 0 {
		return 0
	}
	return reflect.ValueOf(df.values[0].slice).Len()
}
// Err returns the most recent error attached to the DataFrame, if any.
// Returns nil if no error has occurred.
func (df *DataFrame) Err() error {
	return df.err
}
// HasType returns the index positions of all label and column containers
// whose underlying slice of values satisfies reflect.Type.String() == sliceType.
// Container index positions may then be supplied to df.SubsetLabels() or df.SubsetCols().
//
// For example, to search for datetime labels: labels, _ := df.HasType("[]time.Time")
//
// To search for float64 columns: _, cols := df.HasType("[]float64")
//
func (df *DataFrame) HasType(sliceType string) (labelIndex, columnIndex []int) {
	// the same type scan applies to both label and column containers
	matching := func(containers []*valueContainer) []int {
		var positions []int
		for i, container := range containers {
			if container.dtype().String() == sliceType {
				positions = append(positions, i)
			}
		}
		return positions
	}
	labelIndex = matching(df.labels)
	columnIndex = matching(df.values)
	return labelIndex, columnIndex
}
// numColLevels returns the number of column levels in the DataFrame,
// as tracked by the colLevelNames field.
func (df *DataFrame) numColLevels() int {
	return len(df.colLevelNames)
}
// NumColumns returns the number of columns in the DataFrame.
func (df *DataFrame) NumColumns() int {
	return len(df.values)
}
// NumLevels returns the number of label levels in the DataFrame.
func (df *DataFrame) NumLevels() int {
	return len(df.labels)
}
// listNames returns the name of each container, preserving order.
func listNames(columns []*valueContainer) []string {
	names := make([]string, 0, len(columns))
	for _, container := range columns {
		names = append(names, container.name)
	}
	return names
}
// listNamesAtLevel returns, for each container in order, the portion of its
// name at the supplied level. Returns an error if level >= numLevels.
func listNamesAtLevel(columns []*valueContainer, level int, numLevels int) ([]string, error) {
	if level >= numLevels {
		return nil, fmt.Errorf("level out of range: %d >= %d", level, numLevels)
	}
	names := make([]string, len(columns))
	for k, container := range columns {
		names[k] = splitNameIntoLevels(container.name)[level]
	}
	return names, nil
}
// ListColNames returns the name of all the columns in the DataFrame, in order.
// If df has multiple column levels, each column name is a single string with level values separated by "|" (may be changed with SetOptionDefaultSeparator).
// To return the names at a specific level, use ListColNamesAtLevel().
func (df *DataFrame) ListColNames() []string {
	return listNames(df.values)
}
// ListColNamesAtLevel returns the name of all the columns in the DataFrame,
// in order, at the supplied column level.
// If level is out of range, returns a nil slice.
func (df *DataFrame) ListColNamesAtLevel(level int) []string {
	// an out-of-range level surfaces as an error, which is converted to nil here
	names, err := listNamesAtLevel(df.values, level, df.numColLevels())
	if err != nil {
		return nil
	}
	return names
}
// ListLabelNames returns the name of all the label levels in the DataFrame, in order.
func (df *DataFrame) ListLabelNames() []string {
	return listNames(df.labels)
}
// HasLabels returns an error if the DataFrame does not contain all of the labelNames supplied.
// The error identifies the first missing label level.
func (df *DataFrame) HasLabels(labelNames ...string) error {
	for _, labelName := range labelNames {
		if _, err := indexOfContainer(labelName, df.labels); err != nil {
			return fmt.Errorf("verifying labels: %v", err)
		}
	}
	return nil
}
// HasCols returns an error if the DataFrame does not contain all of the colNames supplied.
// The error identifies the first missing column.
func (df *DataFrame) HasCols(colNames ...string) error {
	for _, colName := range colNames {
		if _, err := indexOfContainer(colName, df.values); err != nil {
			return fmt.Errorf("verifying columns: %v", err)
		}
	}
	return nil
}
// InPlace returns a DataFrameMutator, which contains most of the same methods as DataFrame
// but never returns a new DataFrame.
// If you want to save memory and improve performance and do not need to preserve the original DataFrame,
// consider using InPlace().
func (df *DataFrame) InPlace() *DataFrameMutator {
	// the mutator wraps the same underlying DataFrame; mutations apply to df directly
	return &DataFrameMutator{dataframe: df}
}
// Subset returns only the rows specified at the index positions, in the order specified.
//Returns a new DataFrame.
func (df *DataFrame) Subset(index []int) *DataFrame {
df = df.Copy()
err := df.InPlace().Subset(index)
if err != nil {
return dataFrameWithError(err)