Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,7 @@ Gemfile.lock
pkg/*
*.rbc
.idea
coverage
coverage
issues.rtf
dump.rdb
.gitignore
14 changes: 14 additions & 0 deletions README.mdown
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,20 @@ Thanks for signing up, dude! <% finished("signup_page_redesign") %>

You can find more examples, tutorials and guides on the [wiki](https://github.com/andrew/split/wiki).

## Statistical Validity

Split uses a z test (n>30) of the difference between your control and alternative conversion rates to calculate statistical significance.

This means that Split will tell you whether an alternative is better or worse than your control, but it will not distinguish between which alternative is the best in an experiment with multiple alternatives. To find that out, run a new experiment with one of the prior alternatives as the control.

Also, as per this [blog post](http://www.evanmiller.org/how-not-to-run-an-ab-test.html) on the pitfalls of A/B testing, it is highly recommended that you determine your requisite sample size for each branch before running the experiment. Otherwise, you'll have an increased rate of false positives (experiments which show a significant effect where really there is none).

[Here](http://www.evanmiller.org/ab-testing/sample-size.html) is a sample size calculator for your convenience.

Finally, two things should be noted about the dashboard:
* Split will only tell you if your experiment is 90%, 95%, or 99% significant. For lesser levels of significance, Split will simply show "insufficient confidence."
* If a branch has fewer than 30 participants or fewer than 5 conversions, Split will not calculate significance for it, as you have not yet gathered enough data.

## Extras

### Weighted alternatives
Expand Down
3 changes: 2 additions & 1 deletion lib/split.rb
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@
persistence
encapsulated_helper
trial
version].each do |f|
version
zscore].each do |f|
require "split/#{f}"
end

Expand Down
30 changes: 15 additions & 15 deletions lib/split/alternative.rb
Original file line number Diff line number Diff line change
@@ -1,9 +1,15 @@
require 'split/zscore'

# TODO - take out require and implement using file paths?

module Split
class Alternative
attr_accessor :name
attr_accessor :experiment_name
attr_accessor :weight

include Zscore

def initialize(name, experiment_name)
@experiment_name = experiment_name
if Hash === name
Expand Down Expand Up @@ -84,29 +90,23 @@ def experiment
end

def z_score(goal = nil)
# CTR_E = the CTR within the experiment split
# CTR_C = the CTR within the control split
# E = the number of impressions within the experiment split
# C = the number of impressions within the control split
# p_a = Pa = proportion of users who converted within the experiment split (conversion rate)
# p_c = Pc = proportion of users who converted within the control split (conversion rate)
# n_a = Na = the number of impressions within the experiment split
# n_c = Nc = the number of impressions within the control split

control = experiment.control

alternative = self

return 'N/A' if control.name == alternative.name

ctr_e = alternative.conversion_rate(goal)
ctr_c = control.conversion_rate(goal)


e = alternative.participant_count
c = control.participant_count

return 0 if ctr_c.zero?
p_a = alternative.conversion_rate(goal)
p_c = control.conversion_rate(goal)

standard_deviation = ((ctr_e / ctr_c**3) * ((e*ctr_e)+(c*ctr_c)-(ctr_c*ctr_e)*(c+e))/(c*e)) ** 0.5
n_a = alternative.participant_count
n_c = control.participant_count

z_score = ((ctr_e / ctr_c) - 1) / standard_deviation
z_score = Split::Zscore.calculate(p_a, n_a, p_c, n_c)
end

def save
Expand Down
15 changes: 7 additions & 8 deletions lib/split/dashboard/helpers.rb
Original file line number Diff line number Diff line change
Expand Up @@ -21,17 +21,16 @@ def confidence_level(z_score)

z = round(z_score.to_s.to_f, 3).abs

if z == 0.0
'No Change'
elsif z < 1.645
'no confidence'
elsif z < 1.96
'95% confidence'
elsif z < 2.57
if z >= 2.58
'99% confidence'
elsif z >= 1.96
'95% confidence'
elsif z >= 1.65
'90% confidence'
else
'99.9% confidence'
'Insufficient confidence'
end

end
end
end
56 changes: 56 additions & 0 deletions lib/split/zscore.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
module Split
  module Zscore

    # Kept for backward compatibility: classes that `include Zscore`
    # (e.g. Split::Alternative) also gain Math's instance methods via
    # this mixin, even though #calculate itself is a module method.
    include Math

    # Computes the z-score for the difference between two conversion
    # proportions (experiment vs. control) using a two-proportion z-test.
    #
    # p1 - conversion rate of the experiment split, in 0..1
    # n1 - number of participants (impressions) in the experiment split
    # p2 - conversion rate of the control split, in 0..1
    # n2 - number of participants (impressions) in the control split
    #
    # Returns a Float z-score when the sample is large enough, otherwise
    # a String explaining why significance cannot be computed yet.
    def self.calculate(p1, n1, p2, n2)
      p_1 = p1.to_f
      p_2 = p2.to_f

      n_1 = n1.to_f
      n_2 = n2.to_f

      # Validity guards for the normal approximation:
      # each branch needs at least 30 participants...
      return "Needs 30+ participants." if n_1 < 30 || n_2 < 30
      # ...and at least 5 conversions (the np > 5 rule of thumb).
      return "Needs 5+ conversions." if p_1 * n_1 < 5 || p_2 * n_2 < 5

      # Standard error of each proportion estimate: sqrt(p(1-p)/n)
      s_1 = Math.sqrt(p_1 * (1 - p_1) / n_1)
      s_2 = Math.sqrt(p_2 * (1 - p_2) / n_2)

      # Pooled standard error of the difference of the means:
      # sqrt(pi * (1 - pi) * (1/n1 + 1/n2)), where pi = (x1 + x2) / (n1 + n2)
      # and x = p * n is the conversion count of a branch.
      pi = (p_1 * n_1 + p_2 * n_2) / (n_1 + n_2)
      s_p = Math.sqrt(pi * (1 - pi) * (1 / n_1 + 1 / n_2))

      # Unpooled standard error of the difference: sqrt(s1**2 + s2**2)
      s_unp = Math.sqrt(s_1**2 + s_2**2)

      # Pool the variances only when the two standard errors are within
      # a factor of two of each other.
      pooled = s_1 / s_2 < 2 && s_2 / s_1 < 2

      se = pooled ? s_p : s_unp

      # z = difference of proportions over its standard error
      (p_1 - p_2) / se
    end
  end
end
43 changes: 39 additions & 4 deletions spec/alternative_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -211,10 +211,45 @@
end

describe 'z score' do
it 'should be zero when the control has no conversions' do
alternative2.z_score.should eql(0)
alternative2.z_score(goal1).should eql(0)
alternative2.z_score(goal2).should eql(0)

it "should return an error string when the control has 0 people" do
alternative2.z_score.should eql("Needs 30+ participants.")
alternative2.z_score(goal1).should eql("Needs 30+ participants.")
alternative2.z_score(goal2).should eql("Needs 30+ participants.")
end

it "should return an error string when the data is skewed or incomplete as per the np > 5 test" do
control = experiment.control
control.participant_count = 100
control.set_completed_count(50)

alternative2.participant_count = 50
alternative2.set_completed_count(1)

alternative2.z_score.should eql("Needs 5+ conversions.")
end

it "should return a float for a z_score given proper data" do
control = experiment.control
control.participant_count = 120
control.set_completed_count(20)

alternative2.participant_count = 100
alternative2.set_completed_count(25)

alternative2.z_score.should be_kind_of(Float)
alternative2.z_score.should_not eql(0)
end

it "should correctly calculate a z_score given proper data" do
control = experiment.control
control.participant_count = 126
control.set_completed_count(89)

alternative2.participant_count = 142
alternative2.set_completed_count(119)

alternative2.z_score.round(2).should eql(2.58)
end

it "should be N/A for the control" do
Expand Down
16 changes: 13 additions & 3 deletions spec/dashboard_helpers_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,22 @@
describe Split::DashboardHelpers do
describe 'confidence_level' do
it 'should handle very small numbers' do
confidence_level(Complex(2e-18, -0.03)).should eql('No Change')
confidence_level(Complex(2e-18, -0.03)).should eql('Insufficient confidence')
end

it "should consider a z-score of 1.645 < z < 1.96 as 95% confident" do
confidence_level(1.80).should eql('95% confidence')
it "should consider a z-score of 1.65 <= z < 1.96 as 90% confident" do
confidence_level(1.65).should eql('90% confidence')
confidence_level(1.80).should eql('90% confidence')
end

it "should consider a z-score of 1.96 <= z < 2.58 as 95% confident" do
confidence_level(1.96).should eql('95% confidence')
confidence_level(2.00).should eql('95% confidence')
end

it "should consider a z-score of z >= 2.58 as 95% confident" do
confidence_level(2.58).should eql('99% confidence')
confidence_level(3.00).should eql('99% confidence')
end
end
end