In [2]:
%matplotlib inline

import pandas as pd
import numpy as np

from pandas.api.types import CategoricalDtype

from collections import defaultdict, Counter

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

import datetime as dt
import matplotlib.dates as mdates

import scipy.stats as stats
import statsmodels.api as sm
import statsmodels.stats.api as sms
import statsmodels.formula.api as smf

import dataStatsAnalysis_old as dsa
import dataStatsPlotting as dsp

# Apply the plotting setup defined in the local dataStatsPlotting module
# (presumably matplotlib defaults -- confirm in that module).
dsp.SetParams()

In [3]:
class UnimplementedMethodException(Exception):
    """Exception if someone calls a method that should be overridden."""


class HypothesisTest:
    """Hypothesis test superclass.

    This class cannot be used as is.
    It is to be used to construct hypothesis tests
    for various different test statistics.
    Child classes must override PrepareData and ComputeRv, and should
    override TestStat (null hypothesis based tests) and Power.

    Parameters
    ----------
    data:
        The data for the test, in whatever format the child class expects
    tail (str):
        The tail of the distribution to be used in the PValue function
        Accepts only 'right' or 'left'
        Defaults to 'right'
    iters (int):
        The number of resampling iterations available to ComputeRv
        Defaults to 1000

    Attributes
    ----------
    data, tail, iters:
        The constructor arguments, stored unchanged
    test_stat:
        The test statistic (set by the child's TestStat or by the child itself)
    sampling_dist:
        The sampling distribution returned by ComputeRv
    rv:
        The random variable object returned by ComputeRv
        PValue needs it to provide cdf(); PlotCdf also needs xk
    """
    def __init__(self, data, tail='right', iters=1000):
        self.data = data
        self.tail = tail
        self.iters = iters
        # Order matters: the child's hooks may rely on the attributes above,
        # and ComputeRv may rely on what PrepareData created.
        self.PrepareData(data)
        self.TestStat()
        self.sampling_dist, self.rv = self.ComputeRv() # pylint: disable=assignment-from-no-return

    # Provide the functionality to convert the data into the format needed
    # for use in ComputeRv and Power functions.
    # Ex. Convert to array, split data into component groups, etc.
    # The self data variables must be created in the function, not returned.
    def PrepareData(self, data):
        """Hook: store the prepared data on self. Must be overridden.

        raises: UnimplementedMethodException if the child does not override it
        """
        raise UnimplementedMethodException()

    # This function only needs to be written in the case of a null hypothesis based test.
    # The self.test_stat needs to be created in the function, not returned.
    # In the case of an alternative hypothesis based test
    # test_stat will be provided via a class parameter.
    def TestStat(self):
        """Hook: set self.test_stat. Optional, so the default does nothing."""

    # Provide the functionality that computes the sampling distribution and rv for the data.
    # Both the sampling distribution and the rv need to be returned by the function
    def ComputeRv(self):
        """Hook: return (sampling_dist, rv). Must be overridden.

        raises: UnimplementedMethodException if the child does not override it
        """
        raise UnimplementedMethodException()

    # Provide the functionality that computes the power by running multiple iterations of the hypothesis test.
    # The code in the for loop must first create new data for the run,
    # which simulates taking another sample from the population, and then run the hypothesis test.
    def Power(self, alpha=0.05, num_runs=1000):
        """Hook: compute the power of the test. Must be overridden to be used.

        alpha: the significance level for the hypothesis test
        num_runs: the number of hypothesis tests to run

        raises: UnimplementedMethodException if the child does not override it
        """
        raise UnimplementedMethodException()

    def PValue(self):
        """Computes the p-value for the hypothesis test.

        returns: float p-value
        raises: ValueError if self.tail is neither 'left' nor 'right'
        """
        if self.tail == 'left':
            return self.rv.cdf(self.test_stat) # pylint: disable=no-member
        if self.tail == 'right':
            return 1 - self.rv.cdf(self.test_stat) # pylint: disable=no-member

        # ValueError is still an Exception, so existing handlers keep working.
        raise ValueError("The value of 'tail' can only be either 'left' or 'right'")

    def MinMaxTestStat(self):
        """Returns the smallest and largest test statistics in the sampling distribution.
        """
        return min(self.sampling_dist), max(self.sampling_dist)

    def PlotCdf(self):
        """Draws a Cdf with a vertical line at the test stat.
        """
        plt.plot(self.rv.xk, self.rv.cdf(self.rv.xk), color='C0', lw=2) # pylint: disable=no-member

        plt.axvline(self.test_stat, color='C1', lw=1.3) # pylint: disable=no-member

In [58]:
class HTOnewayAnova(HypothesisTest):
    """A one-way ANOVA hypothesis test.
    Uses permutation of the pooled observations to simulate the null hypothesis
    and build the null hypothesis F statistic sampling distribution.
    Accepts data in the form of a list or tuple of sequences, one sequence per group.
    The groups may have different lengths.
    Each permutation shuffles all observations together and deals them back out
    into groups of the original sizes, then computes scipy.stats.f_oneway on them.

    Parameters
    ----------
    data (array-like):
        A list or tuple of sequences (one per group)
    tail (str):
        The tail of the distribution to be used in the PValue function
        Accepts only 'right' or 'left'
        Defaults to 'right'
    iters (int):
        The number of iterations to run in the ComputeRv function
        Defaults to 1000

    Attributes
    ----------
    data:
        The original data
    pooled_data:
        All observations from every group stacked into one array
    test_stat:
        The F statistic of the original groups
    sampling_dist:
        The sampling distribution generated by permutation (array of F statistics)
    rv:
        The object returned by dsa.DiscreteRv for the sampling distribution
        (presumably a scipy.stats discrete random variable -- see that module)

    Methods
    -------
    PValue():
        Computes the p-value for the hypothesis test
    MinMaxTestStat():
        Returns the smallest and largest test statistics in the sampling distribution
    PlotCdf():
        Draws a Cdf of the distribution with a vertical line at the test stat

    Power is not implemented for this test yet.
    """
    def PrepareData(self, data):
        """Pools every group's observations into self.pooled_data."""
        self.pooled_data = np.hstack(data)

    def TestStat(self):
        """Sets self.test_stat to the F statistic of the original groups."""
        # Use the instance's own data, never a same-named notebook global.
        self.test_stat, _ = stats.f_oneway(*self.data)

    def ComputeRv(self):
        """Builds the permutation sampling distribution of the F statistic.

        returns: tuple of (array of F statistics, dsa.DiscreteRv built from them)
        """
        # Boundaries at which a shuffled pool is cut back into groups
        # that have the same sizes as the original groups.
        group_sizes = [len(group) for group in self.data]
        split_points = np.cumsum(group_sizes)[:-1]

        # Build the sampling distribution
        f_stats = []

        for _ in range(self.iters):
            pooled_data_perm = np.random.permutation(self.pooled_data)
            data_perm_list = np.split(pooled_data_perm, split_points)

            # One F statistic per permutation, computed once all groups exist.
            f_stat, _ = stats.f_oneway(*data_perm_list)
            f_stats.append(f_stat)

        return np.array(f_stats), dsa.DiscreteRv(f_stats)
    
#     def Power(self, alpha=0.05, num_runs=1000):
#         """Computes the power of the hypothesis test. 

#         Args
#         ----
#         alpha (float):
#             The significance level for the hypothesis test.
#             Must be between 0 and 1. Defaults to 0.05
#         num_runs (int):
#             The number of times to run the hypothesis test to compute power.
#             Defaults to 1000.

#         Returns
#         -------
#         power:
#             Computed as the percentage of significant pvalues in num_runs of the test.
#             Returned value is between 0 and 1.
#         """    
#         pvalue_count = 0
        
#         for _ in range(num_runs):
#             # Create a new run_observed by resampling the observed sequence 
#             # Then create the new run_data using run_observed and the original expected sequence
#             n = sum(self.observed)
#             values_obs = list(range(len(self.observed)))
#             p_obs = self.observed/sum(self.observed)
        
#             hist = Counter({x:0 for x in values_obs})
#             hist.update(np.random.choice(values_obs, size=n, replace=True, p=p_obs))
#             sorted_hist = sorted(hist.items())
#             run_observed = np.array([x[1] for x in sorted_hist])
#             run_data = run_observed, self.expected

#             # Run the hypothesis test with run_data
#             test = HTChiSquare(run_data, tail=self.tail, iters=100)
#             pvalue = test.PValue()
            
#             if pvalue < alpha:
#                 pvalue_count += 1
            
#         return pvalue_count / num_runs

In [59]:
# Data to use: comes from the scipy.stats.f_oneway example
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.f_oneway.html

tillamook = [0.0571, 0.0813, 0.0831, 0.0976, 0.0817, 0.0859, 0.0735,
             0.0659, 0.0923, 0.0836]
newport = [0.0873, 0.0662, 0.0672, 0.0819, 0.0749, 0.0649, 0.0835,
           0.0725]
petersburg = [0.0974, 0.1352, 0.0817, 0.1016, 0.0968, 0.1064, 0.105]
magadan = [0.1033, 0.0915, 0.0781, 0.0685, 0.0677, 0.0697, 0.0764,
           0.0689]
tvarminne = [0.0703, 0.1026, 0.0956, 0.0973, 0.1039, 0.1045]

# Group the samples right where they are defined, so every cell that uses
# `data` works when the notebook is run top to bottom on a fresh kernel.
data = [tillamook, newport, petersburg, magadan, tvarminne]

In [60]:
# Baseline: SciPy's one-way ANOVA on the groups, for comparison with the
# resampling-based test.
stats.f_oneway(*data)

F_onewayResult(statistic=7.121019471642447, pvalue=0.0002812242314534544)

In [61]:
# Scratch check of the permutation step: shuffle the pooled observations once,
# then deal them back out into groups with the original group sizes.

data = [tillamook, newport, petersburg, magadan, tvarminne]
pooled_data = np.hstack(data)
pooled_data_perm = np.random.permutation(pooled_data)

data_perm_list = []
for x in data:
    # Take the next len(x) shuffled values as this group's permuted sample ...
    x_perm = pooled_data_perm[:len(x)]
    data_perm_list.append(x_perm)
    
    # ... and drop them so the next group starts from the remainder.
    pooled_data_perm = pooled_data_perm[len(x):]

# Show the permuted groups together with their F statistic and p-value.
data_perm_list, stats.f_oneway(*data_perm_list)
# The loop above replaces an earlier hand-written attempt that sliced each group
# by cumulative offsets, e.g. pooled_data_perm[:len(tillamook)] and
# pooled_data_perm[len(tillamook):len(tillamook)+len(newport)].

([array([0.0649, 0.0781, 0.0835, 0.0749, 0.1033, 0.1045, 0.0817, 0.0764,
         0.0689, 0.1352]),
  array([0.0735, 0.0974, 0.0873, 0.0976, 0.105 , 0.0725, 0.0677, 0.0659]),
  array([0.0813, 0.0836, 0.0697, 0.0685, 0.0831, 0.0923, 0.0819]),
  array([0.0703, 0.0968, 0.1064, 0.0817, 0.1026, 0.0672, 0.0571, 0.0956]),
  array([0.0915, 0.0662, 0.1039, 0.0973, 0.0859, 0.1016])],
 F_onewayResult(statistic=0.4118576440954995, pvalue=0.7988356024907164))

In [62]:
# Display the grouped input (a list of lists, one per group) to confirm its structure.
data

[[0.0571,
  0.0813,
  0.0831,
  0.0976,
  0.0817,
  0.0859,
  0.0735,
  0.0659,
  0.0923,
  0.0836],
 [0.0873, 0.0662, 0.0672, 0.0819, 0.0749, 0.0649, 0.0835, 0.0725],
 [0.0974, 0.1352, 0.0817, 0.1016, 0.0968, 0.1064, 0.105],
 [0.1033, 0.0915, 0.0781, 0.0685, 0.0677, 0.0697, 0.0764, 0.0689],
 [0.0703, 0.1026, 0.0956, 0.0973, 0.1039, 0.1045]]

In [135]:
# Status: this appears to work, but it is hard to confirm because the p-values
# for this data set are so low.
# TODO: change the numbers a bit to get a higher p-value and re-check.
# TODO: still need to create the Power part for HTOnewayAnova.
htanova = HTOnewayAnova(data)

In [136]:
# p-value from the resampled sampling distribution (right tail, the default).
htanova.PValue()

0.0009999999999992237

In [137]:
# Smallest and largest F statistics seen in the resampled sampling distribution.
htanova.MinMaxTestStat()

(0.016241385059404173, 8.013588563313762)

In [138]:
# Observed F statistic for the original groups.
htanova.test_stat

7.121019471642447