In [1]:
%matplotlib inline

import pandas as pd
import numpy as np

from pandas.api.types import CategoricalDtype

from collections import defaultdict, Counter

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

import datetime as dt
import matplotlib.dates as mdates

import scipy.stats as stats
import statsmodels.api as sm
import statsmodels.stats.api as sms
import statsmodels.formula.api as smf

import dataStatsAnalysis as dsa
import dataStatsPlotting as dsp

# Call SetParams() from the project-local dataStatsPlotting module (imported as dsp).
# NOTE(review): presumably this configures plotting defaults for the figures below —
# confirm against dataStatsPlotting, which is not visible from this notebook.
dsp.SetParams()

In [None]:
# Define the hypothesis test superclass (child classes supply the test statistic and sampling distribution)
class HypothesisTest():
    """Base class for a hypothesis test driven by a sampling distribution.

    Subclasses must override ComputeTestStatandRv(); it is called once from
    __init__ and must return a ``(test_stat, rv)`` pair. PrepareData() is an
    optional hook for subclasses and is not called by this base class.

    Attributes:
        data: the data passed to the constructor, stored unchanged.
        tail: 'left' or 'right'; selects which tail PValue() reports.
        iters: iteration count passed to the constructor. The base class only
            stores it; subclasses decide how to use it.
        test_stat: the test statistic returned by ComputeTestStatandRv().
        actual: alias of test_stat, set once in __init__.
        rv: the sampling-distribution object returned by ComputeTestStatandRv().
            It must provide ``cdf()``; MaxTestStat() and PlotCdf() additionally
            read its ``xk`` attribute (as exposed by a scipy.stats.rv_discrete
            built from ``values=(xk, pk)``).
    """

    def __init__(self, data, tail='right', iters=1000):
        """Initializes the hypothesis test and computes test_stat and rv.

        data: data in whatever form is relevant to the subclass
        tail: 'left' or 'right' (validated when PValue() is called)
        iters: number of iterations, stored for use by subclasses

        raises: NotImplementedError if ComputeTestStatandRv is not overridden
        """
        self.data = data
        self.tail = tail
        self.iters = iters
        # Must be dispatched through self so the subclass override is the one that runs.
        self.test_stat, self.rv = self.ComputeTestStatandRv()
        # Keep the documented .actual attribute available alongside .test_stat.
        self.actual = self.test_stat

    # Provide functionality to convert the data into format needed for use in BuildRv
    # Ex. Convert to array, split data into component groups, etc.
    # See child classes for examples
    def PrepareData(self):
        """Hook for subclasses to reshape self.data; not implemented in the base class.

        raises: NotImplementedError always, unless overridden
        """
        raise NotImplementedError('PrepareData must be implemented by a child class')

    # Provide functionality that creates the run data and then computes the run's test stat and rv
    # This involves doing one resample to simulate pulling an additional sample from the population,
    # then calculating the test_stat, building a sampling distribution, and computing the rv
    # See child classes for examples
    def ComputeTestStatandRv(self):
        """Computes the test statistic and the sampling-distribution rv.

        returns: tuple (test_stat, rv), unpacked by __init__

        raises: NotImplementedError always, unless overridden
        """
        raise NotImplementedError('ComputeTestStatandRv must be implemented by a child class')

    def PValue(self):
        """Computes the p-value of the test statistic under the sampling distribution.

        For tail == 'left' this is rv.cdf(test_stat); for tail == 'right' it is
        1 - rv.cdf(test_stat).

        returns: float p-value

        raises: ValueError if self.tail is neither 'left' nor 'right'
        """
        if self.tail == 'left':
            return self.rv.cdf(self.test_stat)
        if self.tail == 'right':
            return 1 - self.rv.cdf(self.test_stat)
        # ValueError is still an Exception, so callers catching Exception keep working.
        raise ValueError('The value of \'tail\' can only be either \'left\' or \'right\'')

    def MaxTestStat(self):
        """Returns the largest value in the rv's support (self.rv.xk).
        """
        return max(self.rv.xk)

    def PlotCdf(self, label=None):
        """Draws the rv's Cdf with a vertical line at the observed test stat.

        label: optional label attached to the Cdf line (used by a legend, if one is drawn)
        """
        def VertLine(x):
            """Draws a vertical line at x."""
            plt.plot([x, x], [0, 1], color='0.8')

        VertLine(self.test_stat)
        plt.plot(self.rv.xk, self.rv.cdf(self.rv.xk), label=label) # pylint: disable=no-member