In [3]:
import math 

In [13]:
def calculate_sample_size(alpha, power, effect_size, n_binary, n_cat, n_cont):
    """
    Calculate the sample size needed for a given significance level, power, effect size, and number of binary, categorical and continuous predictors.
    Parameters:
        alpha (float): The significance level for the hypothesis test, typically 0.05.
        power (float): The desired statistical power for the hypothesis test, typically 0.8 or 0.9.
        effect_size (float): the expected effect size of the predictor, estimated in previous studies.
        n_binary (int): the number of binary predictors.
        n_cat (int): the number of categorical predictors.
        n_cont (int): the number of continuous predictors.
    Returns:
        n_total (int): the total sample size needed to achieve the desired performance in terms of accuracy.
    """
    
    # Calculate the Z-values for the desired significance level and power (Cumulative Distribution Function)
    z_alpha = abs(math.erf(-1 * alpha / math.sqrt(2))) ## abs to obtain positive value
    z_power = abs(math.erf(power / math.sqrt(2)) - math.erf(-1 * power / math.sqrt(2)))
    
    # Calculate the required sample size for a single predictor variable 
    n_base = math.ceil((z_alpha + z_power)**2 / effect_size**2) ## use ceil to round up value to nearest integer 
    
    # Adjust the sample size for the number of binary, categorical and continuous predictors 
    n_adjusted = n_base / (n_binary + n_cat + n_cont +1)
    
    ## Round up to nearest integer and return sample size 
    n_total = math.ceil(n_adjusted * (n_binary + n_cat + n_cont +1))
    
    return n_total 


## Set desired significance level and power
alpha = 0.05 
power = 0.8

## estimate effect size based on previous studies 
effect_size = 0.656853 ## feature importance 

## set number of binary, categorical and continuous predictors
n_binary = 10
n_cat = 4
n_cont = 0

## Adjust the effect size for the number of variables using Cohen's f-squared
f_squared = effect_size**2 / (1 - effect_size**2)
f_squared_adj = f_squared / (n_binary + n_cat + n_cont + f_squared)

## Calculate the required sample size using the adjusted effect size 
n_total = calculate_sample_size(alpha, power, math.sqrt(f_squared_adj), n_binary, n_cat, n_cont)
n_total

28