In [1]:
%matplotlib inline

import pandas as pd
import numpy as np

from pandas.api.types import CategoricalDtype

from collections import defaultdict, Counter

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

import datetime as dt
import matplotlib.dates as mdates

import scipy.stats as stats
import statsmodels.api as sm
import statsmodels.stats.api as sms
import statsmodels.formula.api as smf

import dataStatsAnalysis_old as dsa
import dataStatsPlotting as dsp

from functools import reduce

dsp.SetParams()

In [2]:
# Load the existing chi-square function
def ResampleChiSquare(observed, expected, iters=1000):
    """Generates a chisquared statistic sampling distribution by randomly choosing values 
    according to the expected probablities to simulate the null hypothesis. 
    The sequences must be the same length, be integer counts of a categorical variable 
    and the sum of the sequence values must be the same. 
    If the sum of the sequence values is different, first normalize the expected values 
    and then create a new expected values sequence by multiplying by the total number of observed values. 
    adjust_expected = expected/sum(expected)*sum(observed) 
    Can then make an rv of this distribution to plot cdf and  
    compute a p-value for the actual chi-squared statistic (eg. rv.cdf at actual statistic (test_chi)). 
    Can also use the 'min' and 'max' built-ins to find what the most extreme values are from the simluations.

    Args:
        observed (array-like): observed values sequence
        expected (array-like): expected values sequence
        iters (int, optional): [description]. Defaults to 1000.

    Returns:
        test_chi: Original actual chi squared value
        chis (array): Sampling distribution for the null hypothesis obtained from resampling
    """
    observed, expected = np.array(observed), np.array(expected)
    
    # Check that sum of values are euqal
    if np.isclose(sum(observed), sum(expected)) == False:
        raise ValueError('The sum of the values for observed and expected must be equal.')
    
    # Calculate the chi square test statistic
    test_chi = sum((observed - expected)**2 / expected)
        
    # Calculate the variables needed for resampling
    n = sum(expected)
    values = list(range(len(expected)))
    p_exp = expected/sum(expected)
    
    # Build the chi square sampling distribution for the null hypothesis
    chis=[]
    for _ in range(iters):
        # Build a model_observed sequence generated by resampling using expected probabilities
        hist = Counter({x:0 for x in values})
        hist.update(np.random.choice(values, size=n, replace=True, p=p_exp))
        sorted_hist = sorted(hist.items())
        model_observed = np.array([x[1] for x in sorted_hist])

        # Compute chi square statistic and append
        chi = sum((model_observed - expected)**2 / expected)
        chis.append(chi)
    
    return test_chi, np.array(chis)

In [3]:
# Build the contingency chi square function
# Calculating the expected within my function should not be necessary
# The scipy chi square contingency function can do this for me
def ResampleChiSquareContingency_old(observed, iters=1000):
    # Put the data into array form
    observed = np.asarray(observed, dtype=np.float64)
    
    # Calculate the marginal sums
    # From https://github.com/scipy/scipy/blob/v1.7.1/scipy/stats/contingency.py
    margsums = []
    ranged = list(range(observed.ndim))
    for k in ranged:
        marg = np.apply_over_axes(np.sum, observed, [j for j in ranged if j != k])
        margsums.append(marg)
    
    # Calculate the expected contingency table
    # From https://github.com/scipy/scipy/blob/v1.7.1/scipy/stats/contingency.py
    d = observed.ndim
    expected = reduce(np.multiply, margsums) / observed.sum() ** (d - 1)

In [4]:
def ResampleChiSquareContingency(observed, iters=1000):
    # Put the data into array form
    observed = np.asarray(observed, dtype=np.float64)
    
    # Calculate the test chi square statistic and the expected array
    test_chi,_,_,expected = stats.chi2_contingency(observed)
    
    # Calculate variables to be used in resampling
    expected = np.asarray(expected, dtype=np.float64)
    expected_shape = expected.shape
    expected_ps = expected / np.sum(expected)
    values = np.array(list(range(len(expected.ravel())))) # Flatten the array and then reshape it later
    n= int(np.sum(expected))
      
    # Compute resampled expected values and compute chi square 
    # to build a sampling distribution that represents the null hypothesis
    chis=[]
    for _ in range(iters):
        hist = Counter({x:0 for x in values}) # Initiate an empty histogram to hold resampled values
        hist.update(np.random.choice(values, size=n, replace=True, p=expected_ps.ravel()))
        sorted_hist = sorted(hist.items())
        resampled_expected = np.array([x[1] for x in sorted_hist])
        resampled_expected_reshaped = resampled_expected.reshape(expected_shape) # Put back into original shape

        chi = stats.chi2_contingency(resampled_expected_reshaped)[0]
        chis.append(chi)

    return test_chi, np.array(chis)

Figure out how to resample a contingency table.<br>
Will first try method #2 suggested here:<br>
https://stats.stackexchange.com/questions/303939/bootstrap-resampling-for-contingency-table

I think I have to use the expected contingency table that is output from the scipy function.<br>
The reason is that I need to use the observed test_chi against the null to find the p-value.<br>
The below code seems to be working just need to change to expected I think.<br>
Changed to expected in the final function and it's now working.

In [5]:
observed = [[10,13,15],
            [13,14,15]]

In [6]:
observed = np.asarray(observed, dtype=np.float64)
observed

array([[10., 13., 15.],
       [13., 14., 15.]])

In [7]:
observed_shape = observed.shape
observed_shape

(2, 3)

In [8]:
observed_ps = observed / np.sum(observed)
observed_ps

array([[0.125 , 0.1625, 0.1875],
       [0.1625, 0.175 , 0.1875]])

In [9]:
values = np.array(list(range(len(observed.ravel()))))
values

array([0, 1, 2, 3, 4, 5])

In [10]:
n= int(np.sum(observed))

In [11]:
hist = Counter({x:0 for x in values})
hist

Counter({0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0})

In [12]:
hist.update(np.random.choice(values, size=n, replace=True, p=observed_ps.ravel()))
sorted_hist = sorted(hist.items())
model_observed = np.array([x[1] for x in sorted_hist])
model_observed

array([ 3, 11, 12, 12, 20, 22])

In [13]:
model_observed_reshaped = model_observed.reshape(observed_shape)
model_observed_reshaped

array([[ 3, 11, 12],
       [12, 20, 22]])

In [14]:
test_chi,p,dof,expected = stats.chi2_contingency(observed)
test_chi, p

(0.22891366903571353, 0.8918504347412783)

In [15]:
np.sum(expected)

80.0

Test out the new function

In [16]:
chi_cont_results = ResampleChiSquareContingency(observed)
chi_cont_results

(0.22891366903571353,
 array([3.76022120e-01, 5.61674718e-01, 4.66672807e-02, 1.29534741e+00,
        1.87742082e+00, 8.02235474e-01, 1.45364605e+00, 9.11060721e-01,
        2.96452251e+00, 6.74629299e-01, 7.10721949e+00, 3.25000000e-01,
        2.17560218e+00, 5.90812647e+00, 3.52979066e+00, 3.00450622e+00,
        6.22380203e+00, 1.71166208e+00, 2.06317760e+00, 9.42358067e-01,
        4.25568627e+00, 1.60487800e+00, 1.17930850e+00, 2.61483201e-01,
        3.03609434e-01, 1.40621905e+00, 3.59055172e+00, 1.48544266e-02,
        3.82409658e-01, 4.04098716e-01, 1.35642136e+00, 6.64943752e+00,
        1.00455958e+01, 1.32445416e+00, 7.62768836e-01, 7.62314471e-01,
        4.61434098e+00, 2.99682540e+00, 1.09989301e+00, 2.17942482e-01,
        2.61808792e+00, 1.67512927e-02, 2.86666391e+00, 2.26255293e+00,
        9.66616916e-01, 3.07284028e-01, 1.86945944e+00, 1.20808194e+00,
        1.06546951e+00, 1.50072150e+00, 3.14835620e-01, 1.81581167e+00,
        2.13837298e+00, 2.44461421e-01, 5.

In [17]:
# This is VERY close to the result from the scipy funtion above.
# I'm sure this is accurate
dsa.PvalueFromEstimates(chi_cont_results[1], chi_cont_results[0], tail='right')

AttributeError: module 'dataStatsAnalysis_old' has no attribute 'PvalueFromEstimates'

In [None]:
values.reshape([2,3])

In [None]:
values

In [None]:
margsums = []
ranged = list(range(observed.ndim))
for k in ranged:
    marg = np.apply_over_axes(np.sum, observed, [j for j in ranged if j != k])
    margsums.append(marg)
margsums

In [None]:

d = observed.ndim
expected = reduce(np.multiply, margsums) / observed.sum() ** (d - 1)
expected

In [None]:
reduce(np.multiply, margsums)