In [45]:
%matplotlib inline

import pandas as pd
import numpy as np

from pandas.api.types import CategoricalDtype

from collections import defaultdict, Counter

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

import datetime as dt
import matplotlib.dates as mdates

import scipy.stats as stats
import statsmodels.api as sm
import statsmodels.stats.api as sms
import statsmodels.formula.api as smf

import dataStatsAnalysis as dsa
import dataStatsPlotting as dsp

dsp.SetParams()

In [46]:
class PowerTest():
    '''Base class for estimating statistical power by simulation.
    
    Each run asks the child class for a (test_stat, rv) pair, derives left and
    right tail p-values from rv.cdf, and counts the run as a rejection when the
    p-value for the chosen alternative falls below the significance level.
    Power() returns the fraction of runs that rejected.
    
    Child classes must override PrepareData and ComputeRVandTestStat.
    
    Args:
        data: raw input; its expected layout is defined by the child's PrepareData.
        alpha: significance level (split in half per tail for 'two-sided').
        alternative: 'two-sided', 'right', or 'left'.
        num_runs: number of simulated runs used by Power().
    '''
    
    _ALTERNATIVES = ('two-sided', 'right', 'left')
    
    def __init__(self, data, alpha=0.05, alternative='two-sided', num_runs=100):
        self.data = data
        self.alpha = alpha
        self.alternative = alternative
        self.num_runs = num_runs
        # Runs at construction time, so child classes must not rely in PrepareData
        # on attributes they only set after calling PowerTest.__init__
        self.PrepareData()
    
    # Provide functionality to convert the data into format needed for use in BuildRv
    # Ex. Convert to array, split data into component groups, etc.
    # See child classes for examples
    def PrepareData(self):
        raise NotImplementedError('PrepareData must be implemented by a child class')
    
    # Provide functionality that creates the run data and then computes the run's test stat and rv
    # This involves doing one resample to simulate pulling an additional sample from the population,
    # then calculating the test_stat, building a sampling distribution, and computing the rv
    # Must return a (test_stat, rv) tuple where rv exposes a cdf method
    # See child classes for examples
    def ComputeRVandTestStat(self):
        raise NotImplementedError('ComputeRVandTestStat must be implemented by a child class')
    
    # Computes the pvalue of test stat from an rv,
    # and adds to pvalue_count if less than significance level
    def _RunPvalueCount(self):
        # Fail fast on a bad alternative instead of after a full resampling run
        if self.alternative not in self._ALTERNATIVES:
            raise ValueError("alternative has to be 'two-sided', 'right', or 'left'")
        
        test_stat, rv = self.ComputeRVandTestStat()
        
        p_value_left = rv.cdf(test_stat)
        p_value_right = 1 - p_value_left
        
        if self.alternative == 'two-sided':
            # Two-sided test: either tail may reject, each at alpha/2
            rejected = (p_value_right < self.alpha/2) or (p_value_left < self.alpha/2)
        elif self.alternative == 'right':
            # One-sided test using the right side of the distribution
            rejected = p_value_right < self.alpha
        else:
            # One-sided test using the left side of the distribution
            rejected = p_value_left < self.alpha
        
        if rejected:
            self.pvalue_count += 1
    
    # Method for computing power 
    def Power(self):
        '''Return the fraction of num_runs simulated runs that rejected.
        
        Raises:
            ValueError: if num_runs is less than 1 or alternative is not recognised.
        '''
        if self.num_runs < 1:
            raise ValueError('num_runs must be at least 1')
        
        self.pvalue_count = 0
        for _ in range(self.num_runs):
            self._RunPvalueCount()
            
        return self.pvalue_count / self.num_runs

In [47]:
class PTCorrelationH0(PowerTest):
    '''Correlation power test whose per-run distribution is built under the null.
    
    Each run resamples the (x, y) rows with replacement, computes the run's
    correlation as the test stat, and builds the run's distribution from
    correlations of permuted x against y.
    
    Args:
        data: pair of equal-length sequences (x, y).
        alpha, alternative, num_runs: see PowerTest.
        method: 'pearson' or 'spearman'.
    '''
    def __init__(self, data, alpha=0.05, alternative='two-sided', num_runs=100, method='pearson'):
        PowerTest.__init__(self, data, alpha, alternative, num_runs)
        # Set after PowerTest.__init__; PrepareData does not read it
        self.method = method
    
    def PrepareData(self):
        self.x, self.y = self.data
        self.x = np.array(self.x)
        self.y = np.array(self.y)
        # Keep x and y in one frame so rows are resampled as pairs
        self.df = pd.DataFrame({'x':self.x, 'y': self.y})
    
    def _CorrelationFunc(self):
        '''Return the scipy function for self.method; ValueError if unsupported.'''
        if self.method == 'pearson':
            return stats.pearsonr
        if self.method == 'spearman':
            return stats.spearmanr
        raise ValueError('Must enter either pearson or spearman as a string for method argument')
    
    def ComputeRVandTestStat(self):
        # Resolve the method first so a bad value fails before any resampling
        corr_func = self._CorrelationFunc()
        
        # Create run data
        run_data = self.df.sample(n=len(self.df), replace=True)
        run_x = run_data.x.values
        run_y = run_data.y.values
        
        # Compute test_stat for the run ([0] is the correlation coefficient)
        test_stat = corr_func(run_x, run_y)[0]
        
        # Build the run's distribution: permuting x breaks the x-y pairing
        corrs = [corr_func(np.random.permutation(run_x), run_y)[0] for _ in range(100)]
        
        # PowerTest only needs the returned object to expose cdf
        rv = dsa.DiscreteRv(corrs)
        
        return test_stat, rv

In [48]:
# Load seaborn's 'car_crashes' example dataset and preview the first three rows
car = sns.load_dataset('car_crashes')
car.head(3)

Unnamed: 0,total,speeding,alcohol,not_distracted,no_previous,ins_premium,ins_losses,abbrev
0,18.8,7.332,5.64,18.048,15.04,784.55,145.08,AL
1,18.1,7.421,4.525,16.29,17.014,1053.48,133.93,AK
2,18.6,6.51,5.208,15.624,17.856,899.47,110.35,AZ


In [49]:
# Reference value: correlation (and its p-value) between no_previous and ins_premium
stats.pearsonr(car.no_previous, car.ins_premium)

(-0.1568952000433975, 0.2715478689798987)

In [50]:
# I think it might be working, a lot of variability in results though (0.17~0.35)
# This variability is likely due to low sample size of just 51 for car data set
# Gpower gives 0.19 for result
# data is the (x, y) pair that PTCorrelationH0.PrepareData unpacks
data = car.no_previous, car.ins_premium
car_power = PTCorrelationH0(data, alternative = 'two-sided', method='pearson')
# Uses the class defaults: alpha=0.05 and num_runs=100
car_power.Power()

0.22

In [51]:
# Question: for the alternative hypothesis power test I think I do need a test stat variable in my class
# Need to be able to provide zero or another value for null test stat
# For null hypothesis power test I can calculate the test-stat from the run data
# Solved this problem

In [52]:
class PTCorrelationHa(PowerTest):
    '''Correlation power test whose per-run distribution is built by bootstrapping.
    
    Each run resamples the (x, y) rows with replacement, then builds the run's
    distribution from correlations of further paired resamples of that run data.
    The supplied test_stat (the null-hypothesis value) is compared against it.
    
    Args:
        data: pair of equal-length sequences (x, y).
        test_stat: null-hypothesis value of the correlation (eg. zero for no effect).
        alpha, alternative, num_runs: see PowerTest.
        method: 'pearson' or 'spearman'.
    '''
    def __init__(self, data, test_stat, alpha=0.05, alternative='two-sided', num_runs=100, method='pearson'):
        PowerTest.__init__(self, data, alpha, alternative, num_runs)
        self.method = method
        # Alternative hypothesis power tests require a test_stat be provided for null hypothesis (eg. zero for no effect)
        self.test_stat = test_stat 
    
    def PrepareData(self):
        self.x, self.y = self.data
        # Convert to plain arrays (as PTCorrelationH0 does) so pairing is positional;
        # passing two Series with different indexes would otherwise be index-aligned
        # by the DataFrame constructor and silently introduce NaNs
        self.x = np.array(self.x)
        self.y = np.array(self.y)
        self.df = pd.DataFrame({'x':self.x, 'y': self.y})
    
    def _CorrelationFunc(self):
        '''Return the scipy function for self.method; ValueError if unsupported.'''
        if self.method == 'pearson':
            return stats.pearsonr
        if self.method == 'spearman':
            return stats.spearmanr
        raise ValueError('Must enter either pearson or spearman as a string for method argument')
    
    def ComputeRVandTestStat(self):
        # Resolve the method first so a bad value fails before any resampling
        corr_func = self._CorrelationFunc()
        
        # Create run data
        run_data = self.df.sample(n=len(self.df), replace=True)
        
        corrs = []
        
        # Build rv from paired bootstrap resamples of the run data
        for _ in range(100):
            sample = run_data.sample(n=len(run_data), replace=True)
            corrs.append(corr_func(sample.x, sample.y)[0])
        
        test_stat = self.test_stat
        # PowerTest only needs the returned object to expose cdf
        rv = dsa.DiscreteRv(corrs)
        
        return test_stat, rv

In [53]:
# This one seems to be working too but giving a bit lower values than null version 0.11~0.23
data2 = car.no_previous, car.ins_premium
# Second positional argument (0) is the null-hypothesis correlation passed as test_stat
car_power2 = PTCorrelationHa(data2, 0, alternative = 'two-sided', method='pearson')
car_power2.Power()

0.12

In [54]:
# Next create PTMeans, PTDiffMeansH0, and PTDiffMeansHa - done
# Test all cases for correlation (eg. spearman too), and some edge cases like entering different length sequences
# Spearman works, giving lower power than pearson
# Different length sequences produce ValueError: arrays must all be same length

In [55]:
# Test case of different sequence lengths
# Produces ValueError: arrays must all be same length, as expected
# s1 = np.random.randint(1,50,100)
# s2 = np.random.randint(1,50,90)
# data3 = s1, s2
# test_power = PTCorrelationHa(data3, 0, alternative = 'two-sided', method='pearson')
# test_power.Power()

In [56]:
class PTMean(PowerTest):
    '''Power test for a single mean against a supplied null-hypothesis value.
    
    data is one sequence of values; test_stat is the null-hypothesis mean
    (eg. zero for no effect). Remaining arguments are as for PowerTest.
    '''
    def __init__(self, data, test_stat, alpha=0.05, alternative='two-sided', num_runs=100):
        super().__init__(data, alpha, alternative, num_runs)
        # The null-hypothesis value is handed back unchanged on every run
        self.test_stat = test_stat
    
    def PrepareData(self):
        # Replace the raw input with an array so it can be resampled
        self.data = np.array(self.data)
    
    def ComputeRVandTestStat(self):
        # One resample of the data stands in for a fresh sample from the population
        size = len(self.data)
        run_sample = np.random.choice(self.data, size=size, replace=True)
        
        # Bootstrap the run sample to get a distribution of mean estimates
        boot_means = []
        for _ in range(100):
            boot = np.random.choice(run_sample, size=len(run_sample), replace=True)
            boot_means.append(boot.mean())
        
        return self.test_stat, dsa.DiscreteRv(boot_means)

In [57]:
# 100 random integers from -8 to 10 inclusive (randint's upper bound is exclusive); unseeded
mean_data = np.random.randint(-8,11,size=100)

In [58]:
# Power of a two-sided test of the mean against a null-hypothesis value of 0
powmean = PTMean(mean_data, 0, alternative='two-sided')
powmean.Power()

0.42

In [59]:
class PTDiffMeansHa(PowerTest):
    '''Power test for a difference in means against a supplied null-hypothesis value.
    
    data is a pair of sequences (a, b); test_stat is the null-hypothesis
    difference (eg. zero for no effect). Remaining arguments are as for PowerTest.
    '''
    def __init__(self, data, test_stat, alpha=0.05, alternative='two-sided', num_runs=100):
        super().__init__(data, alpha, alternative, num_runs)
        # The null-hypothesis value is handed back unchanged on every run
        self.test_stat = test_stat
    
    def PrepareData(self):
        first_group, second_group = self.data
        self.a = np.array(first_group)
        self.b = np.array(second_group)
    
    def ComputeRVandTestStat(self):
        # Run data: resample each group independently, keeping its size
        run_a = np.random.choice(self.a, size=len(self.a), replace=True)
        run_b = np.random.choice(self.b, size=len(self.b), replace=True)
        
        # Sampling distribution for the run: bootstrap both run groups and
        # record the difference of their means (first group drawn first)
        boot_diffs = [
            np.random.choice(run_a, size=len(run_a), replace=True).mean()
            - np.random.choice(run_b, size=len(run_b), replace=True).mean()
            for _ in range(100)
        ]
        
        return self.test_stat, dsa.DiscreteRv(boot_diffs)

In [60]:
# `first` is a project-local module not shown in this notebook
# NOTE(review): this import sits mid-notebook; consider moving it to the top import cell
import first
# presumably live births, first babies, and all others — TODO confirm against first.MakeFrames
live, firsts, others = first.MakeFrames()

In [61]:
# Two groups of prglngth values in the [a, b] layout the diff-means classes unpack
# (rebinds `data`, previously the car correlation pair)
data = [firsts.prglngth.values, others.prglngth.values]

In [62]:
# Power against a null-hypothesis difference in means of 0 (default two-sided, alpha=0.05)
diffmeans = PTDiffMeansHa(data, 0, num_runs=100)
diffmeans.Power()

0.34

In [63]:
class PTDiffMeansH0(PowerTest):
    '''Difference-in-means power test whose per-run distribution is built under the null.
    
    data is a pair of sequences (a, b). Each run resamples both groups with
    replacement, uses the difference of the resampled means as the test stat,
    and builds the null distribution by permuting the run's pooled values.
    Constructor arguments are those of PowerTest.
    '''
    
    def PrepareData(self):
        self.a, self.b = self.data
        self.a = np.array(self.a)
        self.b = np.array(self.b)
        # Kept for callers that read it; the per-run null now pools the run data instead
        self.pooled_data = np.hstack((self.a, self.b))
        self.a_size = len(self.a)
    
    def ComputeRVandTestStat(self):
        # Create run data by resampling the two groups
        sample1 = np.random.choice(self.a, size=len(self.a), replace=True)
        sample2 = np.random.choice(self.b, size=len(self.b), replace=True)
        
        # Calculate test_stat for the run data
        test_stat = sample1.mean() - sample2.mean()
        
        # Pool the run's own samples: the test stat and its null distribution must
        # come from the same simulated sample (as PTCorrelationH0 permutes run_x)
        run_pooled = np.hstack((sample1, sample2))
        
        diff_mean_results = []
        
        # Build a sampling distribution for the run
        for _ in range(100):
            # permutation returns a shuffled copy, so no instance state is mutated
            shuffled = np.random.permutation(run_pooled)
            group1 = shuffled[:self.a_size]
            group2 = shuffled[self.a_size:]
            diff_mean_results.append(group1.mean() - group2.mean())
        
        # PowerTest only needs the returned object to expose cdf
        rv = dsa.DiscreteRv(diff_mean_results)
        
        return test_stat, rv

In [64]:
# Same two prglngth groups as `data`; rebinds data2 (previously the car correlation pair)
data2 = [firsts.prglngth.values, others.prglngth.values]

In [65]:
# I think both these diff means classes are working now!
# No test_stat argument here: PTDiffMeansH0 computes it from each run's resampled groups
diffmeans2 = PTDiffMeansH0(data2, num_runs=100)
diffmeans2.Power()

0.33

In [66]:
# Make a chi-square power test

In [67]:
# First play around to recall how my chi square function works
# Could also look again at Star Trek example to recall the type of situation to use this in

In [68]:
# Observed counts per category and the uniform counts expected under the null
observed = np.array([11, 7, 4, 7, 12, 16, 13])
expected = np.array([10] * 7)

In [69]:
# Total count, category indices, and category probabilities implied by `expected`
n = sum(expected)
values = [index for index in range(len(expected))]
p_exp = expected / n
n, values, p_exp

(70,
 [0, 1, 2, 3, 4, 5, 6],
 array([0.14285714, 0.14285714, 0.14285714, 0.14285714, 0.14285714,
        0.14285714, 0.14285714]))

In [70]:
# Start every category at zero so categories that are never drawn still appear
hist = Counter(dict.fromkeys(values, 0))
hist

Counter({0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0})

In [71]:
# Draw n category indices with the expected probabilities and tally them
# NOTE(review): mutates `hist` in place, so re-running this cell alone accumulates counts
hist.update(np.random.choice(values, size=n, replace=True, p=p_exp))
hist

Counter({0: 8, 1: 7, 2: 8, 3: 13, 4: 11, 5: 13, 6: 10})

In [72]:
# (category, count) pairs in ascending category order
sorted_hist = [(category, hist[category]) for category in sorted(hist)]
sorted_hist

[(0, 8), (1, 7), (2, 8), (3, 13), (4, 11), (5, 13), (6, 10)]

In [73]:
# Keep only the counts, in category order, as an array
model_observed = np.array([count for _category, count in sorted_hist])
model_observed

array([ 8,  7,  8, 13, 11, 13, 10])

In [74]:
# I need a resampling of the population assuming the alternative hypothesis is true
# Could I use the same kind of computation as done above for the null to simulate the alternative hypothesis?
# Just use observed instead and create a new resampled sequence?

In [75]:
# Total count, category indices, and category probabilities implied by `observed`
n = sum(observed)
values_obs = [index for index in range(len(observed))]
p_obs = observed / n
n, values_obs, p_obs

(70,
 [0, 1, 2, 3, 4, 5, 6],
 array([0.15714286, 0.1       , 0.05714286, 0.1       , 0.17142857,
        0.22857143, 0.18571429]))

In [76]:
# Fresh zeroed tally for the observed-probability draw (rebinds `hist`)
hist = Counter(dict.fromkeys(values_obs, 0))
hist

Counter({0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0})

In [77]:
# Draw n category indices with the observed probabilities and tally them
# NOTE(review): mutates `hist` in place, so re-running this cell alone accumulates counts
hist.update(np.random.choice(values_obs, size=n, replace=True, p=p_obs))
hist

Counter({0: 8, 1: 1, 2: 4, 3: 10, 4: 14, 5: 20, 6: 13})

In [78]:
# (category, count) pairs in ascending category order
sorted_hist_obs = [(category, hist[category]) for category in sorted(hist)]
sorted_hist_obs

[(0, 8), (1, 1), (2, 4), (3, 10), (4, 14), (5, 20), (6, 13)]

In [79]:
# I think this will work
# Keep only the counts, in category order, as an array
model_observed_obs = np.array([count for _category, count in sorted_hist_obs])
model_observed_obs

array([ 8,  1,  4, 10, 14, 20, 13])

In [80]:
class PTChiSquareH0(PowerTest):
    '''Chi-square power test whose per-run distribution is built under the null.
    
    Takes data in form of two sequences: data = observed, expected
    
    Each run redraws category counts using the observed proportions (assuming
    the alternative hypothesis) and computes their chi-square statistic against
    expected. The run's null distribution comes from chi-square statistics of
    counts redrawn using the expected proportions. Constructor arguments are
    those of PowerTest.
    '''
    
    def PrepareData(self):
        self.observed, self.expected = self.data
        self.observed = np.array(self.observed)
        self.expected = np.array(self.expected)
        # Without this check a length-1 sequence would broadcast silently
        if self.observed.shape != self.expected.shape:
            raise ValueError('observed and expected must have the same length')
    
    def _ResampleCounts(self, counts):
        '''Draw sum(counts) category indices with probabilities counts/sum(counts)
        and return how many draws landed in each category.'''
        total = int(counts.sum())
        probs = counts / counts.sum()
        draws = np.random.choice(len(counts), size=total, replace=True, p=probs)
        # minlength keeps a zero entry for categories that were never drawn
        return np.bincount(draws, minlength=len(counts))
    
    def _ChiSquared(self, counts):
        '''Chi-square statistic of counts against self.expected.'''
        return ((counts - self.expected)**2 / self.expected).sum()
    
    def ComputeRVandTestStat(self):
        # Create run data by resampling the observed sequence (assuming the alternative hypothesis)
        run_observed = self._ResampleCounts(self.observed)
        
        # Calculate test_stat for the run data using the observed sequence (alternative hypothesis)
        test_stat = self._ChiSquared(run_observed)
        
        # Build a chi square sampling distribution for the run using the expected sequence (null hypothesis)
        chis = [self._ChiSquared(self._ResampleCounts(self.expected)) for _ in range(100)]
        
        # PowerTest only needs the returned object to expose cdf
        rv = dsa.DiscreteRv(chis)
        
        return test_stat, rv

In [81]:
# Observed and expected category counts, bundled in the (observed, expected) order
# that PTChiSquareH0.PrepareData unpacks
observed = np.array([11, 7, 4, 7, 12, 16, 13])
expected = np.array([10] * 7)
data = (observed, expected)

In [82]:
# these results are 0.52~0.66
# Uses the PowerTest defaults (alpha=0.05, two-sided, num_runs=100)
ptchi = PTChiSquareH0(data)
ptchi.Power()

0.5

In [83]:
# Try with statsmodels
# Convert both count arrays to proportions, then get the effect size between them
p_exp = expected/sum(expected)
p_obs = observed/sum(observed)
effect = sms.gof.chisquare_effectsize(p_exp, p_obs)
effect

0.3854496446637727

In [84]:
# Analytic power from statsmodels, for comparison with the simulated result above
# nobs is the total number of observations (ie. sum(observed) or sum(expected))
# n_bins is the number of cells in the sequence/array (ie. len(observed) or len(expected))
# With those settings the result lands at the top of my simulated range (0.52~0.66)
smsptchi = sms.GofChisquarePower()
smsptchi.solve_power(effect_size=effect, nobs=70, alpha=0.05, n_bins=7)

0.6642702634182477

In [85]:
# Make an alternative hypothesis chi square power test too
# I've decided not to include this in my dsa module
# The main reason is that I don't have confidence in the methodology of calculating the test_stat
# It seems odd to calculate the test_stat using a run_observed calculated from the expected values
# This could be valid but I'm just not sure
# Plus having two of these functions is really overkill I think
class PTChiSquareHa(PowerTest):
    '''Chi-square power test with the roles of the two sequences swapped.
    
    Takes data in form of two sequences: data = observed, expected
    
    The run's test stat comes from counts redrawn with the expected
    proportions; the run's distribution comes from counts redrawn with the
    observed proportions. Both are scored against expected.
    '''
    
    def PrepareData(self):
        raw_observed, raw_expected = self.data
        self.observed = np.array(raw_observed)
        self.expected = np.array(raw_expected)
    
    def _DrawCounts(self, counts):
        # Draw sum(counts) category indices with probabilities counts/sum(counts)
        # and return the per-category tallies in category order
        categories = list(range(len(counts)))
        total = sum(counts)
        probs = counts/sum(counts)
        
        # Zero-initialised so categories that are never drawn still appear
        tally = Counter(dict.fromkeys(categories, 0))
        tally.update(np.random.choice(categories, size=total, replace=True, p=probs))
        return np.array([tally[category] for category in categories])
    
    def _ChiAgainstExpected(self, counts):
        # Chi-square statistic of counts measured against the expected sequence
        return sum((counts - self.expected)**2 / self.expected)
    
    def ComputeRVandTestStat(self):
        # Run data and test_stat: counts redrawn from the expected sequence (null hypothesis)
        run_observed = self._DrawCounts(self.expected)
        test_stat = self._ChiAgainstExpected(run_observed)
        
        # Sampling distribution for the run: counts redrawn from the observed
        # sequence (alternative hypothesis), each scored against expected
        chis = []
        for _ in range(100):
            model_observed = self._DrawCounts(self.observed)
            chis.append(self._ChiAgainstExpected(model_observed))
        
        return test_stat, dsa.DiscreteRv(chis)

In [86]:
# Same observed and expected counts as before, bundled as (observed, expected) for PTChiSquareHa
observed = np.array([11, 7, 4, 7, 12, 16, 13])
expected = np.array([10] * 7)
data2 = (observed, expected)

In [87]:
# This is 0.43~0.54 (lower than H0 version)
# Rebinds `ptchi`, which previously held the PTChiSquareH0 instance
ptchi = PTChiSquareHa(data2)
ptchi.Power()

0.44

In [88]:
# Think about changing the docstring for my ResampleMean, and alternative hypothesis functions in my module
# Is doubling the "one-sided" p-value really the right way to think about this?
# Rather than doubling the p-value wouldn't it make more sense to halve the alpha when testing for significance?
# This is the data we have and the p-value is what it is giving us
# Try testing some cases in which the sampling distributions will be all positive or all negative
# In this kind of case the p-values calculated should be the same regardless of the alternative used
# Stats exchange discussions say "don't halve the p-value" but they are using only analytical methods
# Also I'm talking about halving alpha, not changing the p-value
# https://stats.stackexchange.com/questions/267192/doubling-or-halving-p-values-for-one-vs-two-tailed-tests
# Try to find the discussion of this in the Statistics by Jim book

# Also try comparing results between Ha and H0 hypothesis tests
# to see whether or not doubling p-values from Ha distributions is comparable to two-sided H0 results