Author: Phu Dang

Purpose: Permutation testing to determine how statistically significant the disparities in data collection by study sites (states) are

Date: 05.12.2023

Null Hypothesis: The data collection sites among returned and non-returned patients have the same distribution, where any observed difference is due to chance

Alternative Hypothesis: The distributions of data collection sites among returned and non-returned patients are different


In [3]:
# Importing libraries and packages

%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.utils import shuffle
import os
import math
pd.options.plotting.backend = 'plotly'

print(os.getcwd())
from permutation_test_util import *

c:\Users\phuro\UCSD\Project_WHEL\hypothesis_tests


In [4]:
# Importing dataset

os.chdir("C:/Users/phuro/UCSD/Project_WHEL/data")
path = os.path.join('interim', 'return_status_by_site.csv')
df = pd.read_csv(path, index_col=0)

In [5]:
df

Unnamed: 0,site,status
0,AZ,unreturned
1,TX,returned
2,CA,returned
3,CA,returned
4,CA,returned
...,...,...
2856,TX,unreturned
2857,CA,returned
2858,CA,returned
2859,OR,returned


In [6]:
table = df.pivot_table(index='site', columns='status', aggfunc='size')
print(table)
table = table / table.sum()
table

table_log = df.pivot_table(index='site', columns='status', aggfunc='size')
table_log = np.log(table_log)
table_log

status  returned  unreturned
site                        
AZ           296         144
CA          1287         554
OR           194          38
TX           212         136


status,returned,unreturned
site,Unnamed: 1_level_1,Unnamed: 2_level_1
AZ,5.690359,4.969813
CA,7.160069,6.317165
OR,5.267858,3.637586
TX,5.356586,4.912655


In [7]:
# Plot observed distribution

fig = table.plot(kind='barh', title='Distribution of study site by return status (proportions)',
                 barmode='group')
fig.show()

# fig = table_log.plot(kind='barh', title='Distribution of study site by return status (log scale)',
#                  barmode='group')
# fig.show()

In [8]:
result, test_stats = permutation_simulation(df, 10000, shuffle_column='status', 
                                            cats_column='site', 
                                            significance_level=0.01)

0.03115068657432394

STEPS:
shuffled  returned  unreturned
site                          
AZ             307         133
CA            1278         563
OR             163          69
TX             241         107

shuffled  returned  unreturned
site                          
AZ        0.154349    0.152523
CA        0.642534    0.645642
OR        0.081951    0.079128
TX        0.121166    0.122706

shuffled  returned  unreturned
site                          
AZ             NaN   -0.001826
CA             NaN    0.003108
OR             NaN   -0.002822
TX             NaN    0.001540

site
AZ   -0.001826
CA    0.003108
OR   -0.002822
TX    0.001540
Name: unreturned, dtype: float64

site
AZ    0.001826
CA    0.003108
OR    0.002822
TX    0.001540
Name: unreturned, dtype: float64

0.004648271917564957
END

0.028233264606713063
0.030059247881697955
0.012896042914931245
0.0331819272051328
0.02775644485034661
0.034831481404606096
0.011832855937011368
0.03269069330861024
0.010183301737538071


In [9]:
# sanity check

# Calculate observed test statistic (TVD)
observed = tvd_of_groups(df, groups='status', cats='site')
observed

0.06569619143823138

In [10]:
result

'Obsersed TVD = 0.066, P-value = 0.001, Reject null hypothesis at 0.01'

In [11]:
test_stats

[0.03115068657432394,
 0.004648271917564957,
 0.00842420007287787,
 0.03327706053016363,
 0.02502928953279735,
 0.02669325787242678,
 0.027865992315533626,
 0.021143813912297485,
 0.026678843732270578,
 0.0426024326456059,
 0.016781518535431132,
 0.031041139109136973,
 0.030974257498812295,
 0.03153237300565959,
 0.025029289532797365,
 0.029977952131217102,
 0.018621915950572228,
 0.015213260086438737,
 0.0199710794691906,
 0.012896042914931204,
 0.018445486875060542,
 0.01348241013648463,
 0.05315300667432348,
 0.03427336589775879,
 0.013673253352152442,
 0.0052490532792745515,
 0.029801523055705464,
 0.0249341562077666,
 0.03162750633069036,
 0.00201682649065274,
 0.01191415168749222,
 0.03115068657432394,
 0.017844705513350913,
 0.016605089459919425,
 0.029801523055705464,
 0.031532373005659566,
 0.0360993491727437,
 0.025043703672953582,
 0.012023699152679188,
 0.029405999049819928,
 0.016686385210400334,
 0.014655144579591453,
 0.021143813912297443,
 0.019618221318167366,
 0.01777

In [12]:
# Plot observed tvd and simulated tvds

fig = px.histogram(pd.DataFrame({'simulated tvds': test_stats}), 
                x='simulated tvds', nbins=55, histnorm='probability', 
                title=f'Empirical Distribution of TVD <br><sup>{result} significance level</sup>')
fig.add_vline(x=observed, line_color='rgb(0,100,80)', 
              annotation_text='observed', annotation_position='top right')
p_99 = np.percentile(test_stats, 99)
fig.add_vline(x=p_99, line_color='rgb(0,176,246)', 
              annotation_text='significance level', annotation_position='top left')
fig.update_layout(xaxis_range=[0, 0.09])

In [13]:
fig.write_html('C:/Users/phuro/UCSD/Project_WHEL/hypothesis_tests/results.html')