In [1]:
%matplotlib inline

import pandas as pd
import numpy as np

from pandas.api.types import CategoricalDtype

from collections import defaultdict, Counter

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

import datetime as dt
import matplotlib.dates as mdates

import scipy.stats as stats
import statsmodels.api as sm
import statsmodels.stats.api as sms
import statsmodels.formula.api as smf
import patsy

import dataStatsAnalysis_old as dsa
import dataStatsPlotting as dsp

dsp.SetParams()

In [2]:
def PowerMean(data, test_mean, alpha=0.05, alternative='two-sided', num_runs=100, num_resamples=100):
    """Estimate by simulation how often a bootstrap test of the mean rejects `test_mean`.

    Each run resamples `data` with replacement (simulating a fresh sample under the
    alternative hypothesis that the observed effect is real), builds a bootstrap
    sampling distribution of the mean from that resample, and checks whether
    `test_mean` lies in the rejection tail(s). The result is the fraction of runs
    that reject.

    Args:
        data (array-like): The observed sample.
        test_mean (float): The mean value being tested against.
        alpha (float): Significance level (default 0.05).
        alternative (str): 'two-sided', 'smaller', or 'larger'.
        num_runs (int): Number of simulated samples (default 100).
        num_resamples (int): Bootstrap resamples per run used to build the
            sampling distribution (default 100, the previously hard-coded value).

    Returns:
        float: Fraction of the `num_runs` runs in which the test was significant.

    Raises:
        ValueError: If `alternative` is not recognised or `num_runs` is not positive.
    """
    # Validate once, before any resampling work, so a bad argument fails fast
    # (and is reported even when no run would be executed).
    if alternative not in ('two-sided', 'smaller', 'larger'):
        raise ValueError("alternative has to be 'two-sided', 'smaller', or 'larger'")
    if num_runs < 1:
        raise ValueError("num_runs has to be a positive integer")

    a = np.array(data)

    # Number of runs whose p-value fell below the significance threshold
    count = 0

    for run in range(num_runs):
        # One resample stands in for drawing a new sample from the population
        run_data = np.random.choice(a, size=len(a), replace=True)

        # Bootstrap sampling distribution of the mean for this run
        mean_estimates = [np.random.choice(run_data, size=len(run_data), replace=True).mean()
                          for _ in range(num_resamples)]

        # dsa.DiscreteRv is defined outside this notebook; its cdf is taken to be
        # P(estimate <= value), so these are the right- and left-tail p-values.
        rv = dsa.DiscreteRv(mean_estimates)
        p_value_right = 1 - rv.cdf(test_mean)
        p_value_left = rv.cdf(test_mean)

        if alternative == 'two-sided':
            significant = (p_value_right < alpha/2) or (p_value_left < alpha/2)
        elif alternative == 'smaller':
            # Effect smaller than test_mean: few estimates lie above test_mean
            significant = p_value_right < alpha
        else:
            # 'larger': few estimates lie at or below test_mean
            significant = p_value_left < alpha

        if significant:
            count += 1

    return count / num_runs

In [3]:
class UnimplementedMethodException(Exception):
    """Signals that a placeholder method meant to be overridden by a subclass was called."""
    pass

In [4]:
# Create the PowerTest superclass
class PowerTest():
    """Template for simulation-based power tests.

    Subclasses must override `PrepareData` (convert `self.data` into whatever
    `BuildRv` needs) and `BuildRv` (return an rv with a `cdf` method built from
    one simulated sample). `Power` then reports the fraction of `num_runs`
    simulated runs in which `test_stat` lands in the rejection tail(s).

    Args:
        data: Raw input; its expected layout is defined by the subclass.
        test_stat (float): The value being tested against.
        alpha (float): Significance level (default 0.05).
        alternative (str): 'two-sided', 'smaller', or 'larger'.
        num_runs (int): Number of simulated runs (default 100).
    """

    def __init__(self, data, test_stat, alpha=0.05, alternative='two-sided', num_runs=100):
        self.data = data
        self.test_stat = test_stat
        self.alpha = alpha
        self.alternative = alternative
        self.num_runs = num_runs
        # Initialised here so _RunPvalueCount never meets a missing attribute
        self.pvalue_count = 0
        self.PrepareData()

    # Provide functionality to convert the data into format needed for use in BuildRv
    # Ex. Convert to array, split data into component groups, etc.
    # See child classes for examples
    def PrepareData(self):
        # The exception must be raised, not merely constructed, or a missing
        # override goes unnoticed.
        raise UnimplementedMethodException()

    # Provide functionality to create a sampling distribution and build an rv
    # This involves doing one resample to simulate pulling an additional sample from the population,
    # then building a sampling distribution for that sample,
    # and finally building an rv from the sampling distribution
    # See child classes for examples
    def BuildRv(self):
        raise UnimplementedMethodException()

    # Computes the pvalue of test stat from an rv,
    # and adds to pvalue_count if less than significance level
    def _RunPvalueCount(self):
        rv = self.BuildRv()
        p_value_right = 1 - rv.cdf(self.test_stat)
        p_value_left = rv.cdf(self.test_stat)

        # Case of a two-sided test
        if self.alternative == 'two-sided':
            if (p_value_right < self.alpha/2) or (p_value_left < self.alpha/2):
                self.pvalue_count += 1

        # Case of testing for an effect that is smaller than the test stat
        elif self.alternative == 'smaller':
            if p_value_right < self.alpha:
                self.pvalue_count += 1

        # Case of testing for an effect that is larger than the test stat
        elif self.alternative == 'larger':
            if p_value_left < self.alpha:
                self.pvalue_count += 1

        else:
            raise ValueError("alternative has to be 'two-sided', 'smaller', or 'larger'")

    # Method for computing power
    def Power(self):
        """Run the simulation and return the fraction of runs that were significant.

        Raises:
            ValueError: If `alternative` is not recognised or `num_runs` is not positive.
        """
        # Fail fast, before any resampling, on arguments that can never succeed
        if self.alternative not in ('two-sided', 'smaller', 'larger'):
            raise ValueError("alternative has to be 'two-sided', 'smaller', or 'larger'")
        if self.num_runs < 1:
            raise ValueError("num_runs has to be a positive integer")

        # Reset so that repeated calls to Power() do not accumulate counts
        self.pvalue_count = 0
        for _ in range(self.num_runs):
            self._RunPvalueCount()

        return self.pvalue_count / self.num_runs

In [5]:
class PTMean(PowerTest):
    """Power test for a single sample mean; `data` is one array-like sample."""

    def PrepareData(self):
        # Keep the sample as a NumPy array so it can be resampled directly
        self.a = np.array(self.data)

    def BuildRv(self):
        """Simulate one new sample and return an rv of its bootstrap means."""
        # One resample stands in for drawing a new sample from the population
        simulated_sample = np.random.choice(self.a, size=len(self.a), replace=True)

        # 100 bootstrap means form the sampling distribution for this run
        boot_means = []
        for _ in range(100):
            boot = np.random.choice(simulated_sample, size=len(simulated_sample), replace=True)
            boot_means.append(boot.mean())

        return dsa.DiscreteRv(boot_means)

In [6]:
# Toy sample for the one-sample mean checks: 100 integers drawn uniformly from -8..10
# (np.random.randint excludes the upper bound).
# NOTE(review): no random seed is set, so the powers reported below change on every run.
mean_data = np.random.randint(-8,11,size=100)

In [7]:
# Class-based power test of the sample mean against a test mean of 0
# (defaults: alpha=0.05, 100 runs).
powmean = PTMean(mean_data, 0, alternative='two-sided')

In [8]:
# Estimated power: the fraction of simulated runs in which 0 falls in a rejection tail.
powmean.Power()

0.3

In [9]:
# Function-based version on the same data, as a cross-check of the class result.
# Both are random estimates, so only approximate agreement is expected.
PowerMean(mean_data, 0, alternative='two-sided')

0.29

In [10]:
def PowerDiffMeans(a, b, test_diff_means, alpha = 0.05, alternative = 'two-sided', num_runs=100, num_resamples=100):
    """Estimate by simulation how often a bootstrap test rejects `test_diff_means`.

    Each run resamples `a` and `b` separately (not pooled) with replacement,
    simulating fresh samples under the alternative hypothesis that a difference
    exists. It then builds a bootstrap sampling distribution of
    mean(a) - mean(b) from those resamples and checks whether `test_diff_means`
    lies in the rejection tail(s). The result is the fraction of runs that reject.

    Args:
        a (array-like): First group.
        b (array-like): Second group.
        test_diff_means (float): The difference in means being tested against.
        alpha (float): Significance level (default 0.05).
        alternative (str): 'two-sided', 'smaller', or 'larger'.
        num_runs (int): Number of simulated sample pairs (default 100).
        num_resamples (int): Bootstrap resamples per run used to build the
            sampling distribution (default 100, the previously hard-coded value).

    Returns:
        float: Fraction of the `num_runs` runs in which the test was significant.

    Raises:
        ValueError: If `alternative` is not recognised or `num_runs` is not positive.
    """
    # Validate once, before any resampling work, so a bad argument fails fast
    # (and is reported even when no run would be executed).
    if alternative not in ('two-sided', 'smaller', 'larger'):
        raise ValueError("alternative has to be 'two-sided', 'smaller', or 'larger'")
    if num_runs < 1:
        raise ValueError("num_runs has to be a positive integer")

    a = np.array(a)
    b = np.array(b)

    # Number of runs whose p-value fell below the significance threshold
    count = 0

    for run in range(num_runs):
        # Resample each group on its own so the simulated difference is preserved
        sample1 = np.random.choice(a, size=len(a), replace=True)
        sample2 = np.random.choice(b, size=len(b), replace=True)

        # Bootstrap sampling distribution of the difference in means for this run
        diff_mean_results = []
        for _ in range(num_resamples):
            group1 = np.random.choice(sample1, size=len(sample1), replace=True)
            group2 = np.random.choice(sample2, size=len(sample2), replace=True)
            diff_mean_results.append(group1.mean() - group2.mean())

        # dsa.DiscreteRv is defined outside this notebook; its cdf is taken to be
        # P(estimate <= value), so these are the right- and left-tail p-values.
        rv = dsa.DiscreteRv(diff_mean_results)
        p_value_right = 1 - rv.cdf(test_diff_means)
        p_value_left = rv.cdf(test_diff_means)

        if alternative == 'two-sided':
            significant = (p_value_right < alpha/2) or (p_value_left < alpha/2)
        elif alternative == 'smaller':
            # Effect smaller than the test difference: few estimates lie above it
            significant = p_value_right < alpha
        else:
            # 'larger': few estimates lie at or below the test difference
            significant = p_value_left < alpha

        if significant:
            count += 1

    return count / num_runs

In [11]:
class PTDiffMeans(PowerTest):
    """Power test for a difference in means; `data` is a pair (a, b) of array-like groups."""

    def PrepareData(self):
        # Split the pair and keep each group as its own NumPy array
        first_group, second_group = self.data
        self.a = np.array(first_group)
        self.b = np.array(second_group)

    def BuildRv(self):
        """Simulate one new pair of samples and return an rv of bootstrap mean differences."""
        # Resample each group separately (not pooled) to simulate one new pair of samples
        run_a = np.random.choice(self.a, size=len(self.a), replace=True)
        run_b = np.random.choice(self.b, size=len(self.b), replace=True)

        # 100 bootstrap differences form the sampling distribution for this run.
        # The first-group resample is drawn before the second on every iteration.
        boot_diffs = [
            np.random.choice(run_a, size=len(run_a), replace=True).mean()
            - np.random.choice(run_b, size=len(run_b), replace=True).mean()
            for _ in range(100)
        ]

        return dsa.DiscreteRv(boot_diffs)

In [12]:
import first

# Load the three frames returned by first.MakeFrames(); `firsts` and `others` supply the two
# prglngth groups compared below, and `live` is used later for the correlation example.
# NOTE(review): presumably the NSFG live-birth data split into first babies vs. others — confirm against first.py.
live, firsts, others = first.MakeFrames()

In [13]:
# Two-group input in the (a, b) form that PTDiffMeans.PrepareData unpacks:
# the prglngth values of `firsts` and of `others`.
data = [firsts.prglngth.values, others.prglngth.values]

In [14]:
# Power test of the difference in means against a test difference of 0
# (defaults: alpha=0.05, two-sided, 100 runs).
diffmeans = PTDiffMeans(data, 0)

In [15]:
# Seems to be working; compare with the function-based PowerDiffMeans call in the next cell.
diffmeans.Power()

0.26

In [16]:
# Function-based equivalent on the same two groups, as a cross-check of the class result.
PowerDiffMeans(firsts.prglngth.values, others.prglngth.values, 0)

0.28

In [17]:
# Number of rows in the `firsts` group (a sample size, e.g. for the G*Power comparison).
len(firsts)

4413

In [18]:
# Group means and standard deviations of prglngth, i.e. the inputs for the G*Power comparison.
# G*Power gives a power of 0.41 for these inputs; the saved simulation runs above gave 0.26 and 0.28.
np.mean(firsts.prglngth.values), np.mean(others.prglngth.values), np.std(firsts.prglngth.values), np.std(others.prglngth.values)

(38.60095173351461, 38.52291446673706, 2.791585069824391, 2.6155761106844744)

In [19]:
class PTCorrelation(PowerTest):
    """Power test for a correlation, resampling (x, y) pairs to simulate the alternative hypothesis.

    `data` is a pair (x, y) of equal-length array-likes; `method` selects
    'pearson' or 'spearman'.
    """

    def __init__(self, data, test_stat, alpha=0.05, alternative='two-sided', num_runs=100, method='pearson'):
        PowerTest.__init__(self, data, test_stat, alpha=alpha, alternative=alternative, num_runs=num_runs)
        self.method = method

    def PrepareData(self):
        # Store x and y side by side so that resampling rows keeps the pairs together
        self.x, self.y = self.data
        self.df = pd.DataFrame({'x':self.x, 'y': self.y})

    def BuildRv(self):
        """Simulate one new paired sample and return an rv of its bootstrap correlations."""
        # One row-wise resample stands in for drawing a new paired sample
        simulated = self.df.sample(n=len(self.df), replace=True)

        # Pick the correlation routine once; both return the coefficient at index 0
        if self.method == 'pearson':
            corr_func = stats.pearsonr
        elif self.method == 'spearman':
            corr_func = stats.spearmanr
        else:
            raise Exception('Must enter either pearson or spearman as a string for method argument')

        # 100 bootstrap correlations form the sampling distribution for this run
        boot_corrs = []
        for _ in range(100):
            boot = simulated.sample(n=len(simulated), replace=True)
            boot_corrs.append(corr_func(boot.x, boot.y)[0])

        return dsa.DiscreteRv(boot_corrs)

In [20]:
# Drop rows missing either variable in one step so the (x, y) pairs stay aligned,
# then pack the two columns as the (x, y) tuple that PTCorrelation.PrepareData unpacks.
cleaned = live.dropna(subset=['agepreg', 'totalwgt_lb'])
data = cleaned.agepreg.values, cleaned.totalwgt_lb.values

In [21]:
# Power test of the correlation against a test value of 0 (defaults: Pearson, two-sided, alpha=0.05).
powcorr = PTCorrelation(data,0)

In [22]:
# Fraction of simulated runs in which 0 falls in a rejection tail of the bootstrap distribution of r.
powcorr.Power()

1.0

In [23]:
# Second correlation example: seaborn's 'penguins' example dataset.
penguins = sns.load_dataset('penguins')

In [24]:
# Preview the first rows; the output shows that some rows have missing measurements.
penguins.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE


In [25]:
# Drop incomplete rows jointly so each bill length stays paired with the depth from the same penguin.
# Calling .dropna() on each column separately only keeps the pairs aligned when the missing
# values happen to fall on exactly the same rows.
bill_pairs = penguins.dropna(subset=['bill_length_mm', 'bill_depth_mm'])
length = bill_pairs.bill_length_mm
depth = bill_pairs.bill_depth_mm

In [26]:
# Observed Pearson r between bill length and bill depth, with scipy's p-value, for reference.
stats.pearsonr(length, depth)

(-0.23505287035553274, 1.119662196137215e-05)

In [27]:
# (x, y) tuple for PTCorrelation. NOTE: this rebinds `data`, replacing the earlier input.
data = length.values, depth.values

In [28]:
# Power of the correlation test against 0 for the penguin bill measurements (default settings).
powcorr_penguins = PTCorrelation(data,0)
powcorr_penguins.Power()

1.0

In [29]:
# Sanity check that both bill measurements have the same number of non-missing values.
len(penguins.bill_length_mm.dropna()), len(penguins.bill_depth_mm.dropna())

(342, 342)

In [30]:
# Third example: seaborn's 'diamonds' dataset, used below for a weak correlation in a large sample.
df_test = sns.load_dataset('diamonds')
df_test.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [31]:
# Remove rows with any missing value. NOTE: this rebinds `df_test`, so the unfiltered frame is no longer available.
df_test = df_test.dropna()

In [32]:
# Observed Pearson r between depth and price, with scipy's p-value, for reference.
stats.pearsonr(df_test.depth, df_test.price)

(-0.01064740458414299, 0.013403249011863443)

In [33]:
# (x, y) tuple for PTCorrelation: diamond depth vs. price.
data = df_test.depth.values, df_test.price.values

In [34]:
# Might be working, not sure how to confirm.
# Comparison with the G*Power result (z-tests - correlations: two independent Pearson's rs) is not close (0.41).
# NOTE(review): that G*Power procedure appears to compare two correlations from independent samples,
# which may not be the matching test for a single correlation against 0 — confirm.
# Also strange that changing to the 'smaller' alternative didn't change the result of the calculation.
# NOTE(review): 'smaller' counts a run when 1 - cdf(0) < alpha, and 'two-sided' also counts it when
# that same right-tail value is < alpha/2, so with a negative observed r both settings are driven by
# the same tail and may give similar results — confirm.
testpowcorr = PTCorrelation(data,0, alternative='smaller')
testpowcorr.Power()

0.88

In [35]:
# Sample size of the diamonds example.
len(df_test.price.values)

53940

In [36]:
# Use a smaller data set to test the correlation power calculation:
# 50 random integers from 1..19 (np.random.randint excludes the upper bound).
# NOTE(review): unseeded, so re-running the cell produces different values.
a = np.random.randint(1,20,50)
a

array([ 8,  8, 11,  3,  3, 14, 10,  5, 13,  1, 19,  7, 11,  7,  7,  8,  8,
       10,  9,  4, 17, 18,  5, 11,  5,  9,  2, 15, 11, 19,  3,  7,  1,  5,
        6,  4,  6,  8,  2, 15, 17, 19,  5, 13, 14,  4, 19, 15,  9, 16])

In [37]:
# Second variable, generated independently of `a` by the same unseeded call.
b = np.random.randint(1,20,50)
b

array([17, 10,  8, 16,  6,  4,  1, 13,  7, 16, 16, 18, 16, 13, 14, 11,  9,
       12,  1, 19, 11,  8,  3,  5, 15,  7,  4, 15,  4,  8, 16, 14, 12, 15,
       12, 19, 13, 13, 16,  4,  4, 19,  1, 17, 17, 19,  7, 14,  6,  6])

In [38]:
# Observed Pearson r and scipy's p-value for the random pair.
stats.pearsonr(a,b)

(-0.19846434180924646, 0.1670798494325032)

In [39]:
# Power of the two-sided correlation test against 0 on the small random sample.
data2 = a,b
testpowcorr2 = PTCorrelation(data2, 0, alternative='two-sided')
testpowcorr2.Power()

0.26

In [40]:
# A test with the same data as in the "Hypothesis test..." notebook.
# The values are hard-coded so this example stays fixed between runs.
c = [12,  7,  2, 12, 11,  5, 15,  5,  6, 12,  5, 15, 10,  2,  8,  2,  5,
       16, 16,  2, 17,  7, 11, 13, 13, 13, 17,  4,  9, 15, 13,  5,  5, 19,
        3,  4, 11, 16,  4, 14,  7, 12, 14,  9,  8,  4,  2,  8,  8,  8]
c = np.array(c)

In [41]:
# Second hard-coded variable, paired element-wise with `c`.
d = [13,  3,  8, 18, 12,  8, 14, 19,  5,  2, 10, 17,  6, 12,  3,  2, 18,
        6, 11,  7, 12, 18, 16, 13, 13, 14, 19,  2, 14, 17,  3,  5, 14,  2,
       10, 14,  9, 11,  2,  5, 11, 18,  3,  4, 18,  1, 11,  4, 18,  3]
d= np.array(d)

In [42]:
# Yields the same result, so this doesn't appear to be a problem caused by using a class instead of a function.
data3 = c,d
testpowcorr3 = PTCorrelation(data3, 0, alternative='two-sided')
testpowcorr3.Power()

0.34

In [43]:
# A comparison of the two correlation hypothesis-test methods (resampling under H0 vs. under Ha) suggests they give quite different results.
# See the test near the bottom of the "Hypothesis test..." notebook.
# Next: try building a power test that uses the null-hypothesis (H0) method.

In [44]:
class PTCorrelationH0(PowerTest):
    """Correlation power test whose sampling distribution is built under the null hypothesis.

    The x values are permuted to break the (x, y) pairing, so every rv returned by
    `BuildRv` describes correlations expected when there is no association.
    `test_stat` should therefore be the observed correlation, not 0.

    NOTE(review): because each run only re-permutes the same data, the fraction
    returned by Power() mostly reflects whether the observed correlation lies in
    the tail of the permutation distribution. That would explain results near 0
    for non-significant correlations — confirm this is the intended quantity.

    Args:
        data: Pair (x, y) of equal-length array-likes.
        test_stat (float): Observed correlation to locate in the null distribution.
        alpha, alternative, num_runs: As for PowerTest.
        method (str): 'pearson' or 'spearman' (default 'pearson').
    """

    def __init__(self, data, test_stat, alpha=0.05, alternative='two-sided', num_runs=100, method='pearson'):
        PowerTest.__init__(self, data, test_stat, alpha, alternative, num_runs)
        self.method = method

    def PrepareData(self):
        self.x, self.y = self.data

    def BuildRv(self):
        """Return an rv of 100 correlations computed on permuted x against y.

        Raises:
            Exception: If `method` is neither 'pearson' nor 'spearman'.
        """
        # Select the routine up front. The 'spearman' branch previously called
        # stats.pearsonr, so both methods silently produced Pearson correlations.
        if self.method == 'pearson':
            corr_func = stats.pearsonr
        elif self.method == 'spearman':
            corr_func = stats.spearmanr
        else:
            raise Exception('Must enter either pearson or spearman as a string for method argument')

        # Create run data: shuffle x once for this run, leaving y unchanged
        run_x = np.random.permutation(self.x)
        run_y = self.y

        # Build rv: each further permutation gives one correlation under H0
        corrs = []
        for _ in range(100):
            x_perm = np.random.permutation(run_x)
            corrs.append(corr_func(x_perm, run_y)[0])

        return dsa.DiscreteRv(corrs)

In [45]:
# Need the correlation from the original data to use as the test statistic for the H0 version.
# Interesting: the p-value from this is close to the one from the Ha computation, not the H0 one.
corr_original = stats.pearsonr(c,d)
corr_original

(0.1684736086385471, 0.24218627009242552)

In [46]:
# Is it working? It gives almost zero power, sometimes 0.01.
# G*Power gives 0.13.
# corr_original[0] is the observed r, passed as the test statistic.
corr0 = PTCorrelationH0(data3, corr_original[0])
corr0.Power()

0.0

In [47]:
# Test original correlation hypothesis tests to see if there is a significant difference in pvalues

In [48]:
# seaborn's 'car_crashes' example dataset, used to compare the H0 and Ha resampling p-values.
car = sns.load_dataset('car_crashes')
car.head(3)

Unnamed: 0,total,speeding,alcohol,not_distracted,no_previous,ins_premium,ins_losses,abbrev
0,18.8,7.332,5.64,18.048,15.04,784.55,145.08,AL
1,18.1,7.421,4.525,16.29,17.014,1053.48,133.93,AK
2,18.6,6.51,5.208,15.624,17.856,899.47,110.35,AZ


In [53]:
# Reference values: observed Pearson r and scipy's p-value for no_previous vs. ins_premium.
stats.pearsonr(car.no_previous, car.ins_premium)

(-0.15689520004339752, 0.2715478689798989)

In [54]:
# Check the p-value obtained from null-hypothesis (H0) resampling.
# It is very close to the scipy p-value above.
# NOTE(review): the indexing presumes ResampleCorrelationH0 returns (actual_r, estimates) — confirm in dsa.
resultH0 = dsa.ResampleCorrelationH0(car.no_previous.values, car.ins_premium.values)
dsa.PValueFromEstimates(resultH0[1],resultH0[0])

0.27999999999999947

In [57]:
def ResampleCorrelation_Ha(x, y, iters=1000, method='pearson'):
    """Build a bootstrap sampling distribution of the correlation between x and y.

    The (x, y) pairs are resampled together with replacement, which simulates the
    alternative hypothesis that a correlation exists, and the correlation is
    recomputed on each resample.
    An rv made from the result gives the sampling-distribution mean, the standard
    deviation (standard error) and a confidence interval (rv.interval).
    rv.cdf(0) gives a one-sided p-value for the null hypothesis of no correlation.
    For a two-sided p-value, double the one-sided value if the distribution is
    symmetrical, or use the H0 version of this function.
    The built-in min and max show the most extreme simulated values.

    Args:
        x (array-like): Input variable 1
        y (array-like): Input variable 2
        iters (int): The number of simulations to run (Defaults to 1000)
        method (string): Select 'pearson' or 'spearman' method (default: 'pearson')

    Returns:
        actual_r: Correlation of the original, un-resampled data
        corrs (array): Sampling distribution of the correlation under the
            alternative hypothesis, obtained from resampling
    """
    # Choose the correlation routine first; an unknown method fails before any work is done
    if method == 'pearson':
        corr_func = stats.pearsonr
    elif method == 'spearman':
        corr_func = stats.spearmanr
    else:
        raise Exception('Must enter either pearson or spearman as a string for method argument')

    # Correlation of the data as given
    actual_r = corr_func(x, y)[0]

    # Keep x and y in one frame so that sampling rows keeps each pair intact
    pairs = pd.DataFrame({'x':x, 'y': y})
    n_pairs = len(pairs)

    def _resampled_corr():
        boot = pairs.sample(n=n_pairs, replace=True)
        return corr_func(boot.x, boot.y)[0]

    return actual_r, np.array([_resampled_corr() for _ in range(iters)])


In [58]:
# Check the p-value obtained from alternative-hypothesis (Ha) resampling.
# It is lower than, but still close to, the scipy p-value above.
# resultHa[1] is the array of resampled correlations; 0 is the null value being tested.
resultHa = ResampleCorrelation_Ha(car.no_previous, car.ins_premium)
dsa.PValueFromEstimates(resultHa[1], 0)

0.18299999999999939

In [59]:
# Power from the Ha-based class on the car data, testing the correlation against 0 (default settings).
data = car.no_previous, car.ins_premium
car_power = PTCorrelation(data,0)
car_power.Power()

0.2

In [60]:
# H0-based class on the same data. The test statistic -0.1568 is the observed Pearson r
# from stats.pearsonr(car.no_previous, car.ins_premium) above, typed in by hand and rounded.
data = car.no_previous, car.ins_premium
car_powerH0 = PTCorrelationH0(data,-0.1568)
car_powerH0.Power()

0.0

In [61]:
# Sample size of the car_crashes example (e.g. for the G*Power comparison).
len(car)

51

In [62]:
# G-power gives 0.19 for this

In [63]:
# See how my power diff means one compares with GPower result
# Did this above and very similar result