-
Notifications
You must be signed in to change notification settings - Fork 18
/
dataset.clj
145 lines (126 loc) · 4.62 KB
/
dataset.clj
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
(ns tablecloth.api.dataset
(:refer-clojure :exclude [concat])
(:require [tech.v3.dataset :as ds]
[tech.v3.dataset.column :as col]
[tech.v3.protocols.dataset :as prot]
[tech.v3.dataset.print :as p]
[tech.v3.tensor :as tensor]
[tech.v3.dataset.tensor :as ds-tensor]
[tablecloth.api.utils :refer [iterable-sequence? grouped? mark-as-group map-inst?]]))
;;;;;;;;;;;;;;;;;;;;;
;; DATASET CREATION
;;;;;;;;;;;;;;;;;;;;;
(defn dataset?
  "Returns true when `ds` satisfies the `PColumnarDataset` protocol."
  [ds]
  (->> ds
       (satisfies? prot/PColumnarDataset)))
(defn empty-ds?
  "Returns true when `ds` has a row count of zero."
  [ds]
  (-> ds
      ds/row-count
      zero?))
(defn- fix-map-dataset
  "Returns an array-map with the keys of `map-ds` in which every value that is
  not an iterable sequence is replaced by that value repeated `n` times.
  `n` is the count of the first iterable-sequence value found in `map-ds`,
  or 1 when the map holds no such value."
  [map-ds]
  (let [target-len (if-let [sample (some #(when (iterable-sequence? %) %)
                                         (vals map-ds))]
                     (count sample)
                     1)
        ;; sequences pass through untouched; scalars are stretched to target-len
        expand (fn [v]
                 (if (iterable-sequence? v)
                   v
                   (repeat target-len v)))]
    (apply array-map
           (mapcat (fn [k v] [k (expand v)])
                   (keys map-ds)
                   (vals map-ds)))))
;; Classes of two-dimensional primitive arrays:
;; byte, short, int, long, float and double.
(def ^:private numerical-classes
  (into #{}
        (map #(Class/forName %))
        ["[[B" "[[S" "[[I" "[[J" "[[F" "[[D"]))
(defn- from-tensor
  "Converts `data` to a tensor and then to a dataset.
  When `layout` is `:as-columns` the tensor is transposed with axes `[1 0]`
  before conversion. When `column-names` is truthy, the resulting columns
  (addressed by index 0..n-1) are renamed to the given names."
  [data column-names layout]
  (let [oriented (cond-> (tensor/->tensor data)
                   (= layout :as-columns) (tensor/transpose [1 0]))
        result (ds-tensor/tensor->dataset oriented)]
    (cond-> result
      column-names (ds/rename-columns (zipmap (range (ds/column-count result))
                                              column-names)))))
;; Entry point for building a dataset from many input shapes.
;; The `cond` below is order-sensitive: the first matching clause wins.
(defn dataset
"Create `dataset`.
Dataset can be created from:
* single value
* map of values and/or sequences
* sequence of maps
* sequence of columns
* file or url"
;; no arguments: delegate to `ds/new-dataset` with nil
([] (ds/new-dataset nil))
;; data only: same as passing nil options
([data]
(dataset data nil))
;; `single-value-column-name` defaults to :$value and `layout` to :as-columns;
;; the whole option map is also kept as `options` and forwarded where noted.
([data {:keys [single-value-column-name column-names layout]
:or {single-value-column-name :$value layout :as-columns}
:as options}]
(cond
;; already a dataset: returned unchanged, options are not applied
(dataset? data) data
;; map: non-sequence values are expanded by `fix-map-dataset` first
(map-inst? data) (ds/->dataset (fix-map-dataset data) options)
;; sequence of two-element sequences whose first item is a keyword or string:
;; flattened into an array-map and dispatched again through this function
(and (iterable-sequence? data)
(every? iterable-sequence? data)
(every? #(and (= 2 (count %))
(or (keyword? (first %))
(string? (first %)))) data)) (dataset (apply array-map (mapcat identity data)) options)
;; sequence made only of columns
(and (iterable-sequence? data)
(every? col/is-column? data)) (ds/new-dataset options data)
;; 2D primitive array, or a sequence in which not every element is a map:
;; built through `from-tensor`.
;; NOTE(review): `options` is not forwarded here (the result re-enters the
;; one-argument arity and is returned as-is) - confirm this is intended.
(or (numerical-classes (class data))
(and (iterable-sequence? data)
(not-every? map? data))) (dataset (from-tensor data column-names layout))
;; non-seqable value: one-row dataset keyed by `single-value-column-name`
(not (seqable? data)) (ds/->dataset [{single-value-column-name data}] options)
;; everything else is handed to `ds/->dataset` untouched
:else (ds/->dataset data options))))
(defn shape
  "Returns the dimensions of `ds` as a two-element vector
  `[row-count column-count]`."
  [ds]
  ((juxt ds/row-count ds/column-count) ds))
(defn- columns-info
  "Builds a dataset from the metadata maps of the columns of `ds`.
  The result is named after `ds` with the suffix \" :column info\"."
  [ds]
  (let [column-metas (map meta (ds/columns ds))
        title (str (ds/dataset-name ds) " :column info")]
    (dataset column-metas {:dataset-name title})))
(defn info
  "Returns a dataset describing `ds`.

  `result-type` selects the report:
  * `:descriptive` (default) - the result of `ds/descriptive-stats`
  * `:columns` - a dataset built from the metadata of each column
  * any other value - basic info with `:name`, `:grouped?` and either
    `:groups` (when `ds` is marked as grouped) or `:rows` and `:columns`

  The basic info dataset is named after `ds` with the suffix \" :basic info\"."
  ([ds] (info ds :descriptive))
  ([ds result-type]
   (case result-type
     :descriptive (ds/descriptive-stats ds)
     :columns (columns-info ds)
     ;; local is named `is-grouped` so it does not shadow the `grouped?`
     ;; function referred from utils
     (let [is-grouped (boolean (:grouped? (meta ds)))
           nm (ds/dataset-name ds)
           inf {:name nm
                :grouped? is-grouped}]
       (dataset (if is-grouped
                  ;; the number of groups is the row count of the grouped
                  ;; dataset itself; counting the `inf` map was a bug
                  (assoc inf :groups (ds/row-count ds))
                  (assoc inf
                         :rows (ds/row-count ds)
                         :columns (ds/column-count ds)))
                {:dataset-name (str nm " :basic info")})))))
(defn columns
  "Returns the columns of `ds`.

  `result-type` controls the shape of the result:
  * `:as-map` - map from column name to column
  * `:as-double-arrays` - array holding one double array per column
  * `:as-seqs` (default) or any other value - the columns as returned by
    `ds/columns`"
  ([ds] (columns ds :as-seqs))
  ([ds result-type]
   (let [cols (ds/columns ds)]
     (case result-type
       :as-map (zipmap (ds/column-names ds) cols)
       :as-double-arrays (->> cols
                              (map double-array)
                              into-array)
       ;; :as-seqs and unknown result types both yield the raw columns
       cols))))
(defn rows
  "Returns the rows of `ds`.

  `result-type` controls the shape of the result:
  * `:as-maps` - result of `ds/mapseq-reader`
  * `:as-double-arrays` - array holding one double array per row
  * `:as-seqs` (default) or any other value - result of `ds/value-reader`"
  ([ds] (rows ds :as-seqs))
  ([ds result-type]
   (case result-type
     :as-maps (ds/mapseq-reader ds)
     :as-double-arrays (->> (ds/value-reader ds)
                            (map double-array)
                            into-array)
     ;; :as-seqs and unknown result types both yield the value reader
     (ds/value-reader ds))))
(defn print-dataset
  "Prints the string form of `ds` produced by `p/dataset->str`, followed by a
  newline. `options`, when given, is passed through to `p/dataset->str`."
  ([ds]
   (-> ds
       p/dataset->str
       println))
  ([ds options]
   (-> ds
       (p/dataset->str options)
       println)))
;;
(defn- do-concat
  "Applies `concat-fs` to `ds` and `datasets`. When `ds` and every one of
  `datasets` is grouped, the result gets a `:group-id` column numbered from 0
  to row-count - 1 and is marked as a group; otherwise the concatenated
  result is returned as is."
  [concat-fs ds & datasets]
  (let [combined (apply concat-fs ds datasets)
        all-grouped (and (grouped? ds)
                         (every? grouped? datasets))]
    (if-not all-grouped
      combined
      (mark-as-group
       (ds/add-or-update-column combined
                                :group-id
                                (range (ds/row-count combined)))))))
(defn concat
  "Joins `dataset` with `datasets` by running `ds/concat` through `do-concat`."
  [dataset & datasets]
  (apply do-concat ds/concat dataset datasets))
(defn concat-copying
  "Joins `dataset` with `datasets` by running `ds/concat-copying` through
  `do-concat`."
  [dataset & datasets]
  (apply do-concat ds/concat-copying dataset datasets))