In [91]:
import pandas as pd 
import numpy as np 


class sample_size_calculator():
    
    
    def __init__(
        self, 
        p1, 
        ratio = 1,
        volume = None,
        uplift = None,
        min_size = None,
        max_size = None
    ): 
        
        """
           :param float p1: The control rate.
           :param float uplift: The relative expected change against the control.
           :param float ratio: The ratio between variant and control sample size. 
           :type priority: integer or None
           :return: The expected number of observations the variant needs for statistical significance. 
        """
        
        self.volume = volume
        self.p1 = p1 
        self.p2 = None 
        self.ratio = ratio 
        self.uplift = uplift 
        self.min = min_size
        self.max = max_size 
        
    def get_sample_size(self):
        
        assert self.uplift is not None, 'Need to specify uplift'
    
        self.p2 = self.p1*(self.uplift+1)

        if self.uplift < 0:
            self.p2 = self.p1*(self.uplift-1)

        nom = ((1.96+0.84)**2)*(self.p1*(1-self.p1)+self.p2*(1-self.p2)*self.ratio)**0.5
        denom = (self.p1-self.p2)**2

        return int(nom/denom) 
    
    
    def simulate(self): 
        
        assert self.min is not None, 'Need to specify grid sizes'
        assert self.max is not None, 'Need to specify grid sizes'
    
        grid = np.arange(self.min, self.max, 0.005)

        df = pd.DataFrame(columns=['Uplift - %', 'Samples', 'Current Volume'])

        for i, u in enumerate(grid): 
            self.uplift = u
            df.at[i, 'Uplift - %'] = u*100
            df.at[i, 'Samples'] = self.get_sample_size()
            df.at[i, 'Current Volume'] = self.volume

        if self.volume is None: 
            return df.drop('Current Volume', axis=1)
        else: 
            df['Additional Units of Time To Significance'] = np.where(df['Samples']<df['Current Volume'], 'No Units', df['Samples']/df['Current Volume'])
            df['Expected Significance'] = np.where(df['Current Volume'] > df['Samples'], 'Yes', 'No')
            return df 
    
    
    
    

In [94]:
init = sample_size_calculator(p1 = 0.5, min_size= 0.03, max_size=0.2, volume = 5000)


init.simulate()


Unnamed: 0,Uplift - %,Samples,Current Volume,Additional Units of Time To Significance,Expected Significance
0,3.0,24633,5000,4.9266,No
1,3.5,18096,5000,3.6192,No
2,4.0,13853,5000,2.7706,No
3,4.5,10945,5000,2.189,No
4,5.0,8864,5000,1.7728,No
5,5.5,7324,5000,1.4648,No
6,6.0,6154,5000,1.2308,No
7,6.5,5242,5000,1.0484,No
8,7.0,4519,5000,No Units,Yes
9,7.5,3936,5000,No Units,Yes


In [40]:
for i in np.arange(0, 0.2, 0.005):
    print(i)

0.0
0.005
0.01
0.015
0.02
0.025
0.03
0.035
0.04
0.045
0.05
0.055
0.06
0.065
0.07
0.075
0.08
0.085
0.09
0.095
0.1
0.105
0.11
0.115
0.12
0.125
0.13
0.135
0.14
0.145
0.15
0.155
0.16
0.165
0.17
0.17500000000000002
0.18
0.185
0.19
0.195
