/
join_separate.clj
107 lines (95 loc) · 5.66 KB
/
join_separate.clj
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
(ns tablecloth.api.join-separate
(:require [tech.v3.dataset :as ds]
[clojure.string :as str]
[tablecloth.api.utils :refer [iterable-sequence? column-names grouped? process-group-data]]
[tablecloth.api.columns :refer [select-columns drop-columns add-column]]))
(defn- process-join-columns
  "Adds `target-column` to `ds`, built by applying `join-function` to every row
  of the columns named in `col-names`, and optionally drops the source columns.

  - `ds`            - dataset to process
  - `target-column` - name of the column receiving the joined values
  - `join-function` - function called with each row yielded by `ds/value-reader`
                      over the selected columns
  - `col-names`     - names of the columns to join
  - `drop-columns?` - when truthy, the source columns are removed from the result

  When the selection is empty the target column is added with `nil` as data.
  The target column is never dropped, even when it shares its name with one of
  the source columns."
  [ds target-column join-function col-names drop-columns?]
  (let [cols (select-columns ds col-names)
        joined (when (seq cols)
                 (map join-function (ds/value-reader cols)))
        result (add-column ds target-column joined)]
    (if drop-columns?
      ;; Keep the freshly written target even when it reuses a source column name;
      ;; dropping every name in `col-names` would throw the joined values away.
      (drop-columns result (vec (remove #(= target-column %) col-names)))
      result)))
(defn join-columns
  "Joins the values of the selected columns, row by row, into `target-column`.

  `columns-selector` is resolved against `ds` with `column-names`.

  Options:
  - `:separator`     - (default `\"-\"`) used for string results. An iterable
                       sequence of separators is applied in order and cycled
                       when there are more values than separators; any other
                       value is handed to `clojure.string/join`.
  - `:missing-subst` - value substituted for `nil` cells before joining.
  - `:drop-columns?` - (default `true`) remove the source columns afterwards.
  - `:result-type`   - (default `:string`)
                       `:map` - map of column name to value,
                       `:seq` - sequence of the row values,
                       a function - called with the sequence of row values,
                       anything else - a string; values that are still `nil`
                       after substitution are skipped.
  - `:parallel?`     - forwarded to `process-group-data` for grouped datasets.

  Grouped datasets are processed group by group."
  ([ds target-column columns-selector] (join-columns ds target-column columns-selector nil))
  ([ds target-column columns-selector {:keys [separator missing-subst drop-columns? result-type parallel?]
                                       :or {separator "-" drop-columns? true result-type :string}}]
   (let [;; Only `nil` is a missing value. `(or v missing-subst)` would also
         ;; replace `false`, silently losing boolean data.
         missing-subst-fn #(map (fn [v] (if (nil? v) missing-subst v)) %)
         col-names (column-names ds columns-selector)
         join-function (comp (cond
                               (= :map result-type) #(zipmap col-names %)
                               (= :seq result-type) seq
                               (fn? result-type) result-type
                               :else (if (iterable-sequence? separator)
                                       ;; `:empty` is a placeholder paired with the first
                                       ;; value by `interleave` and removed by `rest`, so
                                       ;; the output starts with a value, not a separator.
                                       (let [sep (concat
                                                  (conj (seq separator) :empty)
                                                  (cycle separator))]
                                         (fn [row] (->> row
                                                        (remove nil?)
                                                        (interleave sep)
                                                        (rest)
                                                        (apply str))))
                                       (fn [row] (->> row
                                                      (remove nil?)
                                                      (str/join separator)))))
                             missing-subst-fn)]
     (if (grouped? ds)
       (process-group-data ds #(process-join-columns % target-column join-function col-names drop-columns?) parallel?)
       (process-join-columns ds target-column join-function col-names drop-columns?)))))
;;
(defn- separate-column->columns
  "Builds a dataset out of the pieces obtained by applying `separator-fn`
  to every value of `col` (the calls are made through `pmap`).

  - When `target-columns` is not an iterable sequence, the separated values
    are passed to `ds/->dataset` unchanged.
  - Otherwise the name at position `idx` of `target-columns` receives the
    `idx`-th piece of every separated value, passed through `replace-missing`.
    Positions whose name is `nil`/`false` are skipped."
  [col target-columns replace-missing separator-fn]
  (let [pieces (pmap separator-fn col)]
    (if (iterable-sequence? target-columns)
      (let [name->values (mapcat (fn [idx colname]
                                   ;; a falsy name means: ignore this position
                                   (when colname
                                     [colname (map #(replace-missing (nth % idx)) pieces)]))
                                 (range)
                                 target-columns)]
        ;; array-map keeps the columns in the order given by `target-columns`
        (ds/->dataset (apply array-map name->values)))
      ;; ds/column->dataset functionality
      (ds/->dataset pieces))))
(defn- prepare-missing-subst-fn
  "Builds a function that maps values recognised as missing to `nil` and
  returns every other value unchanged.

  `missing-subst` may be:
  - a set               - values contained in the set are missing
  - a function          - used as a predicate; a truthy result means missing
  - an iterable sequence - converted to a set of missing values
  - any other value     - only values equal to it are missing"
  [missing-subst]
  (let [member-of (fn [s] #(contains? s %))
        missing? (cond
                   ;; `contains?` rather than invoking the set: `(#{false} false)`
                   ;; returns `false`, so a listed `false` would never be detected.
                   (set? missing-subst) (member-of missing-subst)
                   (fn? missing-subst) missing-subst
                   (iterable-sequence? missing-subst) (member-of (set missing-subst))
                   :else (partial = missing-subst))]
    (fn [v] (if (missing? v) nil v))))
(defn- process-separate-columns
  "Separates `column` of `ds` into new columns and assembles the resulting dataset.

  - When `drop-column?` is `:all`, only the separated columns are returned.
  - Otherwise the separated columns are placed where the source column was:
    the columns preceding it, the source column itself (kept only when
    `drop-column?` is falsy), the separated columns, then the columns that
    followed the source column."
  [ds column target-columns replace-missing separator-fn drop-column?]
  (let [separated (separate-column->columns (ds column) target-columns replace-missing separator-fn)]
    (if (= :all drop-column?)
      separated
      (let [[names-before names-from-column] (split-with #(not= % column) (ds/column-names ds))
            leading (ds/->dataset (ds/select-columns ds names-before))
            ;; still starts with the source column; it is removed below
            trailing (ds/select-columns ds names-from-column)
            leading+source (if drop-column?
                             leading
                             (ds/add-column leading (ds column)))]
        (-> leading+source
            (ds/append-columns (seq (ds/columns separated)))
            (ds/append-columns (ds/columns (ds/drop-columns trailing [column]))))))))
(defn separate-column
  "Separates the values of `column` into several columns.

  - `target-columns` - sequence of names for the new columns; `nil`/`false`
                       entries skip the piece at that position. When it is not
                       an iterable sequence, the values produced by the
                       separator are handed to `ds/->dataset` as they are.
  - `separator`      - a string (compiled with `re-pattern` and used with
                       `clojure.string/split`), a regex pattern (matched with
                       `re-matches`; the captured groups become the pieces),
                       or any other value, which is used directly as the
                       function applied to each cell. For strings and patterns
                       the cell is converted with `str` and the pieces are
                       padded with empty strings.

  Options:
  - `:missing-subst` - (default `\"\"`) describes which pieces are turned into
                       `nil`: a set, a predicate function, a sequence of values
                       or a single value. A falsy value disables the replacement.
  - `:drop-column?`  - when not given: `:all` (return only the new columns) if
                       `target-columns` is not an iterable sequence, otherwise
                       `true` (replace the source column with the new ones).
                       A falsy value keeps the source column.
  - `:parallel?`     - forwarded to `process-group-data` for grouped datasets.

  Grouped datasets are processed group by group."
  ([ds column separator] (separate-column ds column nil separator))
  ([ds column target-columns separator] (separate-column ds column target-columns separator nil))
  ([ds column target-columns separator {:keys [missing-subst drop-column? parallel?]
                                        :or {missing-subst ""}}]
   (let [;; pad with blanks so that every requested position can be read
         pad-with-blanks (fn [pieces] (concat pieces (repeat "")))
         separator-fn (cond
                        (string? separator)
                        (let [pattern (re-pattern separator)]
                          (fn [value]
                            (pad-with-blanks (str/split (str value) pattern))))

                        (instance? java.util.regex.Pattern separator)
                        (fn [value]
                          ;; `rest` discards the whole match and keeps the groups
                          (pad-with-blanks (rest (re-matches separator (str value)))))

                        :else separator)
         replace-missing (if missing-subst
                           (prepare-missing-subst-fn missing-subst)
                           identity)
         drop-column? (cond
                        (some? drop-column?) drop-column?
                        (iterable-sequence? target-columns) true
                        :else :all)
         separate (fn [dataset]
                    (process-separate-columns dataset column target-columns replace-missing separator-fn drop-column?))]
     (if (grouped? ds)
       (process-group-data ds separate parallel?)
       (separate ds)))))