Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,7 @@ Gemfile.lock
pkg/*
*.rbc
.idea
coverage
coverage
issues.rtf
dump.rdb
.gitignore
14 changes: 14 additions & 0 deletions README.mdown
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,20 @@ Thanks for signing up, dude! <% finished("signup_page_redesign") %>

You can find more examples, tutorials and guides on the [wiki](https://github.com/andrew/split/wiki).

## Statistical Validity

Split uses a z test (n>30) of the difference between your control and alternative conversion rates to calculate statistical significance.

This means that Split will tell you whether an alternative is better or worse than your control, but it will not distinguish between which alternative is the best in an experiment with multiple alternatives. To find that out, run a new experiment with one of the prior alternatives as the control.

Also, as per this [blog post](http://www.evanmiller.org/how-not-to-run-an-ab-test.html) on the pitfalls of A/B testing, it is highly recommended that you determine your requisite sample size for each branch before running the experiment. Otherwise, you'll have an increased rate of false positives (experiments which show a significant effect where really there is none).

[Here](http://www.evanmiller.org/ab-testing/sample-size.html) is a sample size calculator for your convenience.

Finally, two things should be noted about the dashboard:
* Split will only tell you if your experiment is 90%, 95%, or 99% significant. For lesser levels of significance, Split will simply show "insufficient confidence."
* If a branch has fewer than 30 participants or fewer than 5 conversions, Split will not calculate significance for it, as you have not yet gathered enough data.

## Extras

### Weighted alternatives
Expand Down
3 changes: 2 additions & 1 deletion lib/split.rb
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@
persistence
encapsulated_helper
trial
version].each do |f|
version
zscore].each do |f|
require "split/#{f}"
end

Expand Down
30 changes: 15 additions & 15 deletions lib/split/alternative.rb
Original file line number Diff line number Diff line change
@@ -1,9 +1,15 @@
require 'split/zscore'

# TODO - take out require and implement using file paths?

module Split
class Alternative
attr_accessor :name
attr_accessor :experiment_name
attr_accessor :weight

include Zscore

def initialize(name, experiment_name)
@experiment_name = experiment_name
if Hash === name
Expand Down Expand Up @@ -84,29 +90,23 @@ def experiment
end

def z_score(goal = nil)
# CTR_E = the CTR within the experiment split
# CTR_C = the CTR within the control split
# E = the number of impressions within the experiment split
# C = the number of impressions within the control split
# p_a = Pa = proportion of users who converted within the experiment split (conversion rate)
# p_c = Pc = proportion of users who converted within the control split (conversion rate)
# n_a = Na = the number of impressions within the experiment split
# n_c = Nc = the number of impressions within the control split

control = experiment.control

alternative = self

return 'N/A' if control.name == alternative.name

ctr_e = alternative.conversion_rate(goal)
ctr_c = control.conversion_rate(goal)


e = alternative.participant_count
c = control.participant_count

return 0 if ctr_c.zero?
p_a = alternative.conversion_rate(goal)
p_c = control.conversion_rate(goal)

standard_deviation = ((ctr_e / ctr_c**3) * ((e*ctr_e)+(c*ctr_c)-(ctr_c*ctr_e)*(c+e))/(c*e)) ** 0.5
n_a = alternative.participant_count
n_c = control.participant_count

z_score = ((ctr_e / ctr_c) - 1) / standard_deviation
z_score = Split::Zscore.calculate(p_a, n_a, p_c, n_c)
end

def save
Expand Down
15 changes: 7 additions & 8 deletions lib/split/dashboard/helpers.rb
Original file line number Diff line number Diff line change
Expand Up @@ -21,17 +21,16 @@ def confidence_level(z_score)

z = round(z_score.to_s.to_f, 3).abs

if z == 0.0
'No Change'
elsif z < 1.645
'no confidence'
elsif z < 1.96
'95% confidence'
elsif z < 2.57
if z >= 2.58
'99% confidence'
elsif z >= 1.96
'95% confidence'
elsif z >= 1.65
'90% confidence'
else
'99.9% confidence'
'Insufficient confidence'
end

end
end
end
56 changes: 56 additions & 0 deletions lib/split/zscore.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
module Split
  module Zscore

    # Kept for backward compatibility: classes that `include Zscore`
    # (e.g. Split::Alternative) also gain Math's instance methods via
    # this mixin, even though #calculate itself is a module method.
    include Math

    # Computes the z-score for the difference between two conversion
    # proportions (experiment vs. control) using a two-proportion z-test.
    #
    # p1 - conversion rate of the experiment split, in 0..1
    # n1 - number of participants (impressions) in the experiment split
    # p2 - conversion rate of the control split, in 0..1
    # n2 - number of participants (impressions) in the control split
    #
    # Returns a Float z-score when the sample is large enough, otherwise
    # a String explaining why significance cannot be computed yet.
    def self.calculate(p1, n1, p2, n2)
      p_1 = p1.to_f
      p_2 = p2.to_f

      n_1 = n1.to_f
      n_2 = n2.to_f

      # Validity guards for the normal approximation:
      # each branch needs at least 30 participants...
      return "Needs 30+ participants." if n_1 < 30 || n_2 < 30
      # ...and at least 5 conversions (the np > 5 rule of thumb).
      return "Needs 5+ conversions." if p_1 * n_1 < 5 || p_2 * n_2 < 5

      # Standard error of each proportion estimate: sqrt(p(1-p)/n)
      s_1 = Math.sqrt(p_1 * (1 - p_1) / n_1)
      s_2 = Math.sqrt(p_2 * (1 - p_2) / n_2)

      # Pooled standard error of the difference of the means:
      # sqrt(pi * (1 - pi) * (1/n1 + 1/n2)), where pi = (x1 + x2) / (n1 + n2)
      # and x = p * n is the conversion count of a branch.
      pi = (p_1 * n_1 + p_2 * n_2) / (n_1 + n_2)
      s_p = Math.sqrt(pi * (1 - pi) * (1 / n_1 + 1 / n_2))

      # Unpooled standard error of the difference: sqrt(s1**2 + s2**2)
      s_unp = Math.sqrt(s_1**2 + s_2**2)

      # Pool the variances only when the two standard errors are within
      # a factor of two of each other.
      pooled = s_1 / s_2 < 2 && s_2 / s_1 < 2

      se = pooled ? s_p : s_unp

      # z = difference of proportions over its standard error
      (p_1 - p_2) / se
    end
  end
end
43 changes: 39 additions & 4 deletions spec/alternative_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -211,10 +211,45 @@
end

describe 'z score' do
it 'should be zero when the control has no conversions' do
alternative2.z_score.should eql(0)
alternative2.z_score(goal1).should eql(0)
alternative2.z_score(goal2).should eql(0)

it "should return an error string when the control has 0 people" do
alternative2.z_score.should eql("Needs 30+ participants.")
alternative2.z_score(goal1).should eql("Needs 30+ participants.")
alternative2.z_score(goal2).should eql("Needs 30+ participants.")
end

it "should return an error string when the data is skewed or incomplete as per the np > 5 test" do
control = experiment.control
control.participant_count = 100
control.set_completed_count(50)

alternative2.participant_count = 50
alternative2.set_completed_count(1)

alternative2.z_score.should eql("Needs 5+ conversions.")
end

it "should return a float for a z_score given proper data" do
control = experiment.control
control.participant_count = 120
control.set_completed_count(20)

alternative2.participant_count = 100
alternative2.set_completed_count(25)

alternative2.z_score.should be_kind_of(Float)
alternative2.z_score.should_not eql(0)
end

it "should correctly calculate a z_score given proper data" do
control = experiment.control
control.participant_count = 126
control.set_completed_count(89)

alternative2.participant_count = 142
alternative2.set_completed_count(119)

alternative2.z_score.round(2).should eql(2.58)
end

it "should be N/A for the control" do
Expand Down
16 changes: 13 additions & 3 deletions spec/dashboard_helpers_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,22 @@
describe Split::DashboardHelpers do
describe 'confidence_level' do
it 'should handle very small numbers' do
confidence_level(Complex(2e-18, -0.03)).should eql('No Change')
confidence_level(Complex(2e-18, -0.03)).should eql('Insufficient confidence')
end

it "should consider a z-score of 1.645 < z < 1.96 as 95% confident" do
confidence_level(1.80).should eql('95% confidence')
it "should consider a z-score of 1.65 <= z < 1.96 as 90% confident" do
confidence_level(1.65).should eql('90% confidence')
confidence_level(1.80).should eql('90% confidence')
end

it "should consider a z-score of 1.96 <= z < 2.58 as 95% confident" do
confidence_level(1.96).should eql('95% confidence')
confidence_level(2.00).should eql('95% confidence')
end

it "should consider a z-score of z >= 2.58 as 95% confident" do
confidence_level(2.58).should eql('99% confidence')
confidence_level(3.00).should eql('99% confidence')
end
end
end