In [2]:
%matplotlib inline

import pandas as pd
import numpy as np

from pandas.api.types import CategoricalDtype

from collections import defaultdict, Counter

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

import datetime as dt
import matplotlib.dates as mdates

import scipy.stats as stats
import statsmodels.api as sm
import statsmodels.stats.api as sms
import statsmodels.formula.api as smf
import patsy

import dataStatsAnalysis_old as dsa
import dataStatsPlotting as dsp

dsp.SetParams()

In [3]:
class PowerTest():
    """Base class for estimating statistical power by simulation.

    A subclass supplies two hooks:

    * ``PrepareData`` converts ``self.data`` into whatever form the subclass needs
      (e.g. convert to array, split data into component groups).
    * ``ComputeRVandTestStat`` simulates one run and returns ``(test_stat, rv)``,
      where ``rv`` exposes a ``cdf(x)`` method.

    ``Power`` calls the second hook ``num_runs`` times and returns the fraction of
    runs whose p-value falls below the significance level.

    Args:
        data: Raw input; its expected layout is defined by the subclass.
        alpha: Significance level.
        alternative: 'two-sided', 'right', or 'left'.
        num_runs: Number of simulated runs used by ``Power``.
    """

    _ALTERNATIVES = ('two-sided', 'right', 'left')

    def __init__(self, data, alpha=0.05, alternative='two-sided', num_runs=100):
        self.data = data
        self.alpha = alpha
        self.alternative = alternative
        self.num_runs = num_runs
        # Defined up front so _RunPvalueCount can be called before Power().
        self.pvalue_count = 0
        self.PrepareData()

    def PrepareData(self):
        """Convert ``self.data`` into the format the subclass needs.

        Raises:
            NotImplementedError: Always; subclasses must override this hook.
        """
        raise NotImplementedError('PrepareData must be implemented by a subclass')

    def ComputeRVandTestStat(self):
        """Simulate one run and return ``(test_stat, rv)``.

        A run does one resample to simulate pulling an additional sample from
        the population, then calculates the test stat, builds a sampling
        distribution, and computes the rv.

        Raises:
            NotImplementedError: Always; subclasses must override this hook.
        """
        raise NotImplementedError('ComputeRVandTestStat must be implemented by a subclass')

    def _RunPvalueCount(self):
        """Run one simulated test and increment ``pvalue_count`` if it is significant.

        Raises:
            ValueError: If ``self.alternative`` is not a supported value.
        """
        # Validate before simulating so a bad argument does not cost a full run.
        if self.alternative not in self._ALTERNATIVES:
            raise ValueError("alternative has to be 'two-sided', 'right', or 'left'")

        test_stat, rv = self.ComputeRVandTestStat()

        p_value_left = rv.cdf(test_stat)
        p_value_right = 1 - p_value_left

        if self.alternative == 'two-sided':
            # Each tail is compared against half of alpha.
            significant = (p_value_right < self.alpha/2) or (p_value_left < self.alpha/2)
        elif self.alternative == 'right':
            significant = p_value_right < self.alpha
        else:
            significant = p_value_left < self.alpha

        if significant:
            self.pvalue_count += 1

    def Power(self):
        """Return the fraction of ``num_runs`` simulated runs that were significant.

        Raises:
            ValueError: If ``num_runs`` is less than 1, or ``alternative`` is invalid.
        """
        if self.num_runs < 1:
            raise ValueError('num_runs has to be at least 1')

        self.pvalue_count = 0
        for _ in range(self.num_runs):
            self._RunPvalueCount()

        return self.pvalue_count / self.num_runs

In [4]:
class PTCorrelationH0(PowerTest):
    """Power estimate for a correlation test using a permutation (null) distribution.

    Each run resamples the (x, y) pairs with replacement, computes the run's
    correlation as the test stat, and builds the rv from correlations of
    permuted x against y.

    Args:
        data: A pair ``(x, y)`` of equal-length sequences.
        alpha: Significance level.
        alternative: 'two-sided', 'right', or 'left'.
        num_runs: Number of simulated runs used by ``Power``.
        method: 'pearson' or 'spearman'.
    """

    # Number of permutations used to build the rv for each run.
    NUM_PERMUTATIONS = 100

    def __init__(self, data, alpha=0.05, alternative='two-sided', num_runs=100, method='pearson'):
        PowerTest.__init__(self, data, alpha, alternative, num_runs)
        self.method = method

    def PrepareData(self):
        """Split ``self.data`` into x and y arrays and pair them in ``self.df``."""
        self.x, self.y = self.data
        self.x = np.array(self.x)
        self.y = np.array(self.y)
        self.df = pd.DataFrame({'x': self.x, 'y': self.y})

    def _CorrelationFunc(self):
        """Return the scipy correlation function selected by ``self.method``.

        Raises:
            ValueError: If ``self.method`` is neither 'pearson' nor 'spearman'.
        """
        if self.method == 'pearson':
            return stats.pearsonr
        if self.method == 'spearman':
            return stats.spearmanr
        raise ValueError('Must enter either pearson or spearman as a string for method argument')

    def ComputeRVandTestStat(self):
        """Simulate one run and return ``(test_stat, rv)``.

        Raises:
            ValueError: If ``self.method`` is not supported.
        """
        # Resolve the method first so an invalid value fails before any resampling.
        corr_func = self._CorrelationFunc()

        # Create run data
        run_data = self.df.sample(n=len(self.df), replace=True)
        run_x = run_data.x.values
        run_y = run_data.y.values

        # Index [0] is the correlation coefficient; the analytic p-value is ignored.
        test_stat = corr_func(run_x, run_y)[0]

        # Permuting x breaks the pairing with y, which simulates the null hypothesis.
        corrs = [corr_func(np.random.permutation(run_x), run_y)[0]
                 for _ in range(self.NUM_PERMUTATIONS)]

        rv = dsa.DiscreteRv(corrs)

        return test_stat, rv

In [5]:
# Load seaborn's 'car_crashes' example dataset and preview the first three rows.
car = sns.load_dataset('car_crashes')
car.head(3)

Unnamed: 0,total,speeding,alcohol,not_distracted,no_previous,ins_premium,ins_losses,abbrev
0,18.8,7.332,5.64,18.048,15.04,784.55,145.08,AL
1,18.1,7.421,4.525,16.29,17.014,1053.48,133.93,AK
2,18.6,6.51,5.208,15.624,17.856,899.47,110.35,AZ


In [6]:
# Analytic Pearson correlation between no_previous and ins_premium, shown as (r, p-value).
stats.pearsonr(car.no_previous, car.ins_premium)

(-0.15689520004339752, 0.2715478689798989)

In [7]:
# Null-hypothesis (permutation) power estimate for the correlation above.
# It seems to be working, but results vary a lot between executions (0.17~0.35).
# This variability is likely due to the low sample size of just 51 for the car data set.
# NOTE(review): the default num_runs=100 adds Monte Carlo noise as well (standard error
# of roughly sqrt(p*(1-p)/100), about 0.04 for p near 0.2), and no random seed is set.
# G*Power gives 0.19 for this case.
data = car.no_previous, car.ins_premium
car_power = PTCorrelationH0(data, alternative = 'two-sided')
car_power.Power()

0.18

In [8]:
# Question: for the alternative hypothesis power test I think I need a test stat variable in my class
# I need to be able to provide zero or another value as the null-hypothesis test stat
# For the null hypothesis power test I can calculate the test stat from the run data
# Solved: the alternative hypothesis class takes the null test stat as a constructor argument

In [9]:
class PTCorrelationHa(PowerTest):
    """Power estimate for a correlation test using a bootstrap (alternative) distribution.

    Each run resamples the (x, y) pairs with replacement, then builds the rv
    from correlations of further resamples of that run data. The supplied
    ``test_stat`` is the null-hypothesis value located within that rv.

    Args:
        data: A pair ``(x, y)`` of equal-length sequences.
        test_stat: Null-hypothesis value of the correlation (e.g. zero for no effect).
        alpha: Significance level.
        alternative: 'two-sided', 'right', or 'left'.
        num_runs: Number of simulated runs used by ``Power``.
        method: 'pearson' or 'spearman'.
    """

    # Number of bootstrap resamples used to build the rv for each run.
    NUM_BOOTSTRAPS = 100

    def __init__(self, data, test_stat, alpha=0.05, alternative='two-sided', num_runs=100, method='pearson'):
        PowerTest.__init__(self, data, alpha, alternative, num_runs)
        self.method = method
        # Alternative hypothesis power tests require a test_stat be provided for null hypothesis (eg. zero for no effect)
        self.test_stat = test_stat

    def PrepareData(self):
        """Split ``self.data`` into x and y arrays and pair them in ``self.df``."""
        self.x, self.y = self.data
        # Convert to plain arrays so pairs are matched by position. Passing pandas
        # Series straight to DataFrame would align them on their index labels and
        # silently NaN-pad mismatched inputs instead of raising.
        self.x = np.array(self.x)
        self.y = np.array(self.y)
        self.df = pd.DataFrame({'x': self.x, 'y': self.y})

    def _CorrelationFunc(self):
        """Return the scipy correlation function selected by ``self.method``.

        Raises:
            ValueError: If ``self.method`` is neither 'pearson' nor 'spearman'.
        """
        if self.method == 'pearson':
            return stats.pearsonr
        if self.method == 'spearman':
            return stats.spearmanr
        raise ValueError('Must enter either pearson or spearman as a string for method argument')

    def ComputeRVandTestStat(self):
        """Simulate one run and return ``(test_stat, rv)``.

        Raises:
            ValueError: If ``self.method`` is not supported.
        """
        # Resolve the method first so an invalid value fails before any resampling.
        corr_func = self._CorrelationFunc()

        # Create run data
        run_data = self.df.sample(n=len(self.df), replace=True)

        # Build rv from bootstrap resamples of the run data
        corrs = []
        for _ in range(self.NUM_BOOTSTRAPS):
            sample = run_data.sample(n=len(run_data), replace=True)
            # Index [0] is the correlation coefficient; the analytic p-value is ignored.
            corrs.append(corr_func(sample.x, sample.y)[0])

        test_stat = self.test_stat
        rv = dsa.DiscreteRv(corrs)

        return test_stat, rv

In [10]:
# Alternative-hypothesis version of the same power test, against a null correlation of 0.
# This one seems to be working too, but gives somewhat lower values than the null version (0.11~0.23).
data2 = car.no_previous, car.ins_premium
car_power2 = PTCorrelationHa(data2, 0, alternative = 'two-sided')
car_power2.Power()

0.23

In [11]:
# Next create PTMean, PTDiffMeansH0, and PTDiffMeansHa
# Test all cases for correlation (e.g. spearman too), and some edge cases like entering different-length sequences

In [12]:
class PTMean(PowerTest):
    """Power estimate for a one-sample mean test built from bootstrap resamples.

    ``data`` is a sequence of observations and ``test_stat`` is the
    null-hypothesis value of the mean that each run's rv is evaluated at.
    """

    def __init__(self, data, test_stat, alpha=0.05, alternative='two-sided', num_runs=100):
        PowerTest.__init__(self, data, alpha, alternative, num_runs)
        # The null-hypothesis value (e.g. zero for no effect) must be supplied by the caller.
        self.test_stat = test_stat

    def PrepareData(self):
        # Replace the stored input with an array copy of it.
        self.data = np.array(self.data)

    def ComputeRVandTestStat(self):
        # One resample with replacement stands in for a fresh sample from the population.
        simulated_sample = np.random.choice(self.data, size=len(self.data), replace=True)

        # Means of 100 further resamples form the run's sampling distribution.
        bootstrap_means = []
        for _ in range(100):
            redraw = np.random.choice(simulated_sample, size=len(simulated_sample), replace=True)
            bootstrap_means.append(redraw.mean())

        return self.test_stat, dsa.DiscreteRv(bootstrap_means)

In [13]:
# 100 random integers from -8 to 10 inclusive (randint's upper bound of 11 is exclusive).
mean_data = np.random.randint(-8,11,size=100)

In [14]:
# Power of a two-sided test of the mean of mean_data against a null value of 0.
powmean = PTMean(mean_data, 0, alternative='two-sided')
powmean.Power()

0.49

In [15]:
class PTDiffMeansHa(PowerTest):
    """Power estimate for a difference-in-means test built from bootstrap resamples.

    ``data`` is a pair of sequences (group a, group b) and ``test_stat`` is the
    null-hypothesis value of the difference that each run's rv is evaluated at.
    """

    def __init__(self, data, test_stat, alpha=0.05, alternative='two-sided', num_runs=100):
        PowerTest.__init__(self, data, alpha, alternative, num_runs)
        # The null-hypothesis value (e.g. zero for no effect) must be supplied by the caller.
        self.test_stat = test_stat

    def PrepareData(self):
        # Unpack the two groups and keep each as an array.
        first_group, second_group = self.data
        self.a = np.array(first_group)
        self.b = np.array(second_group)

    def ComputeRVandTestStat(self):
        # One resample of each group stands in for a fresh pair of samples.
        run_a = np.random.choice(self.a, size=len(self.a), replace=True)
        run_b = np.random.choice(self.b, size=len(self.b), replace=True)

        def _BootstrapDiff():
            # Difference in means for one bootstrap redraw of both run groups.
            redraw_a = np.random.choice(run_a, size=len(run_a), replace=True)
            redraw_b = np.random.choice(run_b, size=len(run_b), replace=True)
            return redraw_a.mean() - redraw_b.mean()

        # 100 bootstrap differences form the run's sampling distribution.
        bootstrap_diffs = [_BootstrapDiff() for _ in range(100)]

        return self.test_stat, dsa.DiscreteRv(bootstrap_diffs)

In [16]:
# `first` is a project-local module; MakeFrames() returns three objects, unpacked here.
# Presumably these are all records, first-born records and the rest - TODO confirm against first.py.
# NOTE(review): consider moving this import up to the notebook's import cell.
import first
live, firsts, others = first.MakeFrames()

In [17]:
# Two-group input for the difference-in-means power tests: prglngth values of firsts vs. others.
# NOTE(review): this rebinds `data`, which an earlier cell used for the car columns.
data = [firsts.prglngth.values, others.prglngth.values]

In [18]:
# Alternative-hypothesis power of the difference in mean prglngth, against a null difference of 0.
diffmeans = PTDiffMeansHa(data, 0)
diffmeans.Power()

0.24

In [19]:
class PTDiffMeansH0(PowerTest):
    """Power estimate for a difference-in-means test using a permutation (null) distribution.

    ``data`` is a pair of sequences (group a, group b). Each run resamples both
    groups with replacement, uses the difference of their means as the test
    stat, and builds the rv by repeatedly shuffling that run's pooled values
    and splitting them back into groups of the original sizes.
    """

    # Number of permutations used to build the rv for each run.
    NUM_PERMUTATIONS = 100

    def PrepareData(self):
        """Split ``self.data`` into two arrays and record the pooled data and first group size."""
        self.a, self.b = self.data
        self.a = np.array(self.a)
        self.b = np.array(self.b)
        self.pooled_data = np.hstack((self.a, self.b))
        self.a_size = len(self.a)

    def ComputeRVandTestStat(self):
        """Simulate one run and return ``(test_stat, rv)``."""
        # Create run data by resampling the two groups
        sample1 = np.random.choice(self.a, size=len(self.a), replace=True)
        sample2 = np.random.choice(self.b, size=len(self.b), replace=True)

        # Calculate test_stat for the run data
        test_stat = sample1.mean() - sample2.mean()

        # Pool the run's own samples. The null distribution has to come from the
        # same simulated data as the test stat, as it does in the correlation
        # null test, and not from the original observations.
        run_pooled = np.hstack((sample1, sample2))

        diff_mean_results = []

        # Build a sampling distribution for the run
        for _ in range(self.NUM_PERMUTATIONS):
            # permutation() returns a shuffled copy, so no stored state is mutated.
            shuffled = np.random.permutation(run_pooled)
            group1 = shuffled[:self.a_size]
            group2 = shuffled[self.a_size:]
            diff_mean_results.append(group1.mean() - group2.mean())

        rv = dsa.DiscreteRv(diff_mean_results)

        return test_stat, rv

In [20]:
# Same two-group input as before: prglngth values of firsts vs. others.
# NOTE(review): this rebinds `data2`, which an earlier cell used for the car columns.
data2 = [firsts.prglngth.values, others.prglngth.values]

In [21]:
# Null-hypothesis (permutation) power of the difference in mean prglngth.
# I think both of these diff-means classes are working now!
diffmeans2 = PTDiffMeansH0(data2)
diffmeans2.Power()

0.36

In [22]:
# Make a chi-square power test

In [23]:
# First play around to recall how my chi square function works
# Could also look again at Star Trek example to recall the type of situation to use this in

In [24]:
# Observed category counts next to the counts expected under the model, both as arrays.
observed = np.array([11, 7, 8, 9, 10, 12, 13])
expected = np.array([10, 10, 10, 10, 10, 10, 10])

In [25]:
# Total count, category labels 0..k-1, and each category's expected proportion.
n = expected.sum()
values = [*range(len(expected))]
p_exp = expected / n
n, values, p_exp

(70,
 [0, 1, 2, 3, 4, 5, 6],
 array([0.14285714, 0.14285714, 0.14285714, 0.14285714, 0.14285714,
        0.14285714, 0.14285714]))

In [26]:
# Start every category at a zero count so categories never drawn still appear.
hist = Counter(dict.fromkeys(values, 0))
hist

Counter({0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0})

In [27]:
# Draw n category labels with the expected proportions and tally them into hist.
hist.update(np.random.choice(values, size=n, replace=True, p=p_exp))
hist

Counter({0: 16, 1: 13, 2: 11, 3: 6, 4: 9, 5: 9, 6: 6})

In [28]:
# Sort the (category, count) pairs by category so the counts line up with `expected`.
sorted_hist = sorted(hist.items())
sorted_hist

[(0, 16), (1, 13), (2, 11), (3, 6), (4, 9), (5, 9), (6, 6)]

In [29]:
# Keep only the counts, in category order.
model_observed = np.array([count for _category, count in sorted_hist])
model_observed

array([16, 13, 11,  6,  9,  9,  6])

In [30]:
# I need a resampling of the population assuming the alternative hypothesis is true
# Could I use the same kind of computation as done above for the null to simulate the alternative hypothesis?
# Just use observed instead and create a new resampled sequence?

In [31]:
# Same setup as for `expected`, but with proportions taken from the observed counts.
n = observed.sum()
values_obs = [*range(len(observed))]
p_obs = observed / n
n, values_obs, p_obs

(70,
 [0, 1, 2, 3, 4, 5, 6],
 array([0.15714286, 0.1       , 0.11428571, 0.12857143, 0.14285714,
        0.17142857, 0.18571429]))

In [32]:
# Reset the tally: every category starts again at a zero count.
hist = Counter(dict.fromkeys(values_obs, 0))
hist

Counter({0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0})

In [33]:
# Draw n category labels with the observed proportions and tally them into hist.
hist.update(np.random.choice(values_obs, size=n, replace=True, p=p_obs))
hist

Counter({0: 11, 1: 6, 2: 10, 3: 12, 4: 7, 5: 10, 6: 14})

In [34]:
# Sort the (category, count) pairs by category so the counts line up with `observed`.
sorted_hist_obs = sorted(hist.items())
sorted_hist_obs

[(0, 11), (1, 6), (2, 10), (3, 12), (4, 7), (5, 10), (6, 14)]

In [35]:
# Keep only the simulated counts, in category order. I think this approach will work.
model_observed_obs = np.array([count for _category, count in sorted_hist_obs])
model_observed_obs

array([11,  6, 10, 12,  7, 10, 14])

In [36]:
# Think about changing the docstring for my ResampleMean, and alternative hypothesis functions in my module
# Is doubling the "one-sided" p-value really the right way to think about this?
# Rather than doubling the p-value wouldn't it make more sense to halve the alpha when testing for significance?
# This is the data we have and the p-value is what it is giving us
# Try testing some cases in which the sampling distributions will be all positive or all negative
# In this kind of case the p-values calculated should be the same regardless of the alternative used
# Stack Exchange (Cross Validated) discussions say "don't halve the p-value", but they use only analytical methods
# https://stats.stackexchange.com/questions/267192/doubling-or-halving-p-values-for-one-vs-two-tailed-tests

# Also try comparing results between Ha and H0 hypothesis tests
# to see whether or not doubling p-values from Ha distributions is comparable to two-sided H0 results