Permalink
Browse files

Implement summary function for DateTime columns

  • Loading branch information...
1 parent 6a24385 commit 25888047f9717d8142d13c99071e605aafc37cb0 @ray1729 committed Feb 18, 2012
@@ -6,7 +6,8 @@
org.clojure/clojure-contrib]]
[incanter/parallelcolt "0.9.4"
:exclusions [org.clojure/clojure
- org.clojure/clojure-contrib]]]
+ org.clojure/clojure-contrib]]
+ [clj-time "0.3.6"]]
:dev-dependencies [[lein-clojars "0.7.0"
:exclusions [org.clojure/clojure
org.clojure/clojure-contrib]]]
@@ -37,6 +37,7 @@
(cern.jet.stat.tdouble DoubleDescriptive
Probability)
(incanter Weibull))
+ (:require [clj-time.coerce :as ctime])
(:use [clojure.set :only [difference intersection union]])
(:use [incanter.core :only ($ abs plus minus div mult mmult to-list bind-columns
gamma pow sqrt diag trans regularized-beta ncol
@@ -2582,39 +2583,42 @@ Test for different variances between 2 samples
{:col col :min (reduce min (remove nil? ($ col ds))) :max (reduce max (remove nil? ($ col ds)))
:mean (mean (remove nil? ($ col ds))) :median (median (remove nil? ($ col ds))) :is-numeric true}))
-
(defn category-col-summarizer
  "Summarizes a category column. The returned map holds the five most frequent
  values of the column (each value mapped to its occurrence count) together
  with :col, :is-numeric false, and :count, the number of entries that are not
  accounted for by those five values."
  ([col ds]
    (let [value-counts (frequencies ($ col ds))
          ;; ascending sort followed by reverse, so the largest counts come first
          leaders (->> value-counts (sort-by val) reverse (take 5))
          total-entries (apply + (vals value-counts))
          leader-entries (apply + (map val leaders))]
      (into {:col col
             :count (- total-entries leader-entries)
             :is-numeric false}
            leaders))))
+(defn date-col-summarizer
+  "Builds the summary map for a column of DateTime values. Nil entries are
+  skipped, the remaining values are converted to longs with ctime/to-long, and
+  :min, :max, :mean and :median are converted back with ctime/from-long. The
+  column is reported with :is-numeric false."
+  ([col ds]
+    (let [instants (->> ($ col ds) (remove nil?) (map ctime/to-long))
+          ;; mean and median results are coerced with `long` before conversion
+          stat->date (fn [stat] (ctime/from-long (long (stat instants))))]
+      {:col col
+       :min (ctime/from-long (reduce min instants))
+       :max (ctime/from-long (reduce max instants))
+       :mean (stat->date mean)
+       :median (stat->date median)
+       :is-numeric false})))
(defn choose-singletype-col-summarizer
"Takes the type (a Class) of a column's values and returns the column summarizer function suited to that type. When no summarizer applies, returns a string naming the unsupported type instead of a function."
([col-type]
- (if (.isAssignableFrom java.lang.Number col-type)
- numeric-col-summarizer
- (if (or (.isAssignableFrom java.lang.String col-type) (.isAssignableFrom clojure.lang.Keyword col-type))
- category-col-summarizer
- ; FIXME Deal with date columns
- (str "Don't know how to summarize a column of type: " col-type)
- ))))
-
+ (cond
+ (.isAssignableFrom java.lang.Number col-type) numeric-col-summarizer
+ (.isAssignableFrom java.lang.String col-type) category-col-summarizer
+ (.isAssignableFrom clojure.lang.Keyword col-type) category-col-summarizer
+ (.isAssignableFrom org.joda.time.DateTime col-type) date-col-summarizer
+ :else (str "Don't know how to summarize a column of type: " col-type))))
(defn summarizer-fn
"Takes in a column (number or name) and a dataset. Counts the types of the column's values, ignoring nil, and picks a summarizer: a column with a single type is dispatched on that type, a column whose types are all numeric gets the numeric summarizer, and a column mixing exactly String and Keyword gets the category summarizer.
Returns a function to summarize the column if it is summarizable, and a string describing why the column can't be summarized otherwise."
([col ds]
- (let [type-counts (dissoc (count-col-types col ds) nil)]
- (if (= 1 (count type-counts))
- (choose-singletype-col-summarizer (nth (keys type-counts) 0))
- (if (every? #(.isAssignableFrom java.lang.Number %) (keys type-counts))
- numeric-col-summarizer
- (if (and (= 2 (count type-counts)) (contains? type-counts java.lang.String) (contains? type-counts clojure.lang.Keyword))
- category-col-summarizer
- (stat-summarizable type-counts)))))))
+ (let [type-counts (dissoc (count-col-types col ds) nil)]
+ (cond
+ (= 1 (count type-counts)) (choose-singletype-col-summarizer (nth (keys type-counts) 0))
+ (every? #(.isAssignableFrom java.lang.Number %) (keys type-counts)) numeric-col-summarizer
+ (and (= 2 (count type-counts)) (contains? type-counts java.lang.String) (contains? type-counts clojure.lang.Keyword)) category-col-summarizer
+ (every? #(.isAssignableFrom org.joda.time.DateTime %) (keys type-counts)) date-col-summarizer
+ :else (stat-summarizable type-counts)))))
+
(defn summarizable?
"Takes in a column name (or number) and a dataset. Returns true if the column can be summarized, and false otherwise"
@@ -21,7 +21,8 @@
(ns incanter.stats-tests
(:use clojure.test
- (incanter core stats)))
+ (incanter core stats))
+ (:require [clj-time.core :as ct]))
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; UNIT TESTS FOR incanter.stats.clj
@@ -78,6 +79,8 @@
(def summary-ds8 (to-dataset [["a"] ["b"] ["c"] ["d"] ["b"] ["e"] ["a"] ["b"] ["f"] ["a"] ["b"] ["e"]]))
(def summary-ds9 (to-dataset [["a" 1.2] [":b" 3] [:c 0.1] ["d" 8] ["b" 9] ["e" 7.21] ["a" 1E1] ["b" 6.0000] ["f" 1e-2] ["a" 3.0] ["b" 4] ["e" 5]]))
+(def summary-ds10 (to-dataset (map #(ct/date-time 2012 02 %) (range 1 10))))
+
(deftest mean-test
  ;; applies `mean` to each element of the transposed test matrix
  (is (= (->> test-mat trans (map mean))
         [108.0 130.0])))
@@ -280,7 +283,15 @@
(is (not (summarizable? 0 summary-ds5)))
(is (not (summarizable? 0 summary-ds6)))
(is (summarizable? 0 summary-ds7))
- )
+ (is (summarizable? 0 summary-ds10)))
+
+(deftest summarize-date-column
+  ;; Checks the summary of the single DateTime column in summary-ds10
+  ;; (2012-02-01 through 2012-02-09, one value per day).
+  ;;
+  ;; `is` accepts (is form) or (is form msg). Each expectation is therefore
+  ;; wrapped in `=`; a bare second argument would be taken as the failure
+  ;; message and the assertion would pass for any truthy value.
+  (let [s (first (summary summary-ds10))]
+    (is (= (ct/date-time 2012 2 1) (:min s)))
+    (is (= (ct/date-time 2012 2 9) (:max s)))
+    (is (= (ct/date-time 2012 2 5) (:mean s)))
+    (is (= (ct/date-time 2012 2 5) (:median s)))
+    ;; the summarizer emits the key :is-numeric (no trailing ?), set to false
+    (is (false? (:is-numeric s)))))
(deftest simple-p-value-test
(testing "Basic p-value testing"
View
@@ -28,7 +28,8 @@
[swingrepl "1.3.0"
:exclusions [org.clojure/clojure
org.clojure/clojure-contrib]]
- [jline "0.9.94"]]
+ [jline "0.9.94"]
+ [clj-time "0.3.6"]]
:dev-dependencies [[lein-clojars "0.7.0"
:exclusions [org.clojure/clojure
org.clojure/clojure-contrib]]]

0 comments on commit 2588804

Please sign in to comment.