In [28]:
import numpy as np
import numpy.random as rand
from numpy.testing import assert_allclose, assert_equal

import pandas as pd
from scipy.stats import chi2_contingency

import sys
import os
# let us import local app packages
PACKAGE_PARENT = '../..'
sys.path.append(os.path.normpath(PACKAGE_PARENT))

%load_ext autoreload
%autoreload 2
%load_ext rpy2.ipython

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
%config InlineBackend.figure_format = 'retina'

## A basic table:

In [3]:
age = np.random.choice(['18-36','37-54','55+'], size = 2000, p = [0.3,0.4,0.3]);
favorite_show = np.random.choice(['NCIS','House of Cards','Westworld'], size = 2000, p = [0.2,0.4,0.4])
survey_results = pd.DataFrame({"age": age, "favorite_show": favorite_show})
survey_results.index.name = "respondent_id"
survey_results.head(10)

Unnamed: 0_level_0,age,favorite_show
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,55+,House of Cards
1,55+,Westworld
2,37-54,House of Cards
3,37-54,Westworld
4,18-36,Westworld
5,18-36,House of Cards
6,55+,House of Cards
7,37-54,House of Cards
8,18-36,Westworld
9,37-54,House of Cards


In [4]:
%%R -i survey_results

table(survey_results)

       favorite_show
age     House of Cards NCIS Westworld
  18-36            245  113       264
  37-54            323  152       323
  55+              239  116       225


In [5]:
from statsmodels.stats.contingency_tables import Table

table =  Table.from_data(survey_results)
print(table)

Contingency Table: 
favorite_show  House of Cards  NCIS  Westworld
age                                           
18-36                     245   113        264
37-54                     323   152        323
55+                       239   116        225


In [6]:
independence_result = table.test_nominal_association()
print(independence_result)

Contingency Table Independence Result:
chi-squared statistic: 1.7704843257882488
degrees of freedom: 4
p value: 0.7778777647485371



In [7]:
%%R -i survey_results

chisq.test(table(survey_results))


	Pearson's Chi-squared test

data:  table(survey_results)
X-squared = 1.7705, df = 4, p-value = 0.7779



## A table with a relationship:

In [8]:
age = np.random.choice(['less than 18','19-36','37+'], size = 2000, p = [0.3,0.4,0.3]);
survey_results = pd.DataFrame({"age": age})
survey_results.index.name = "respondent_id"

def weighted_choice(age):
    # weight tuples: (snapchat, instagram, facebook)
    weights = {'less than 18': (.5, .4, .1),
               '19-36': (.3, .3, .4),
               '37+': (.1, .2, .7)}
    choices = ("snapchat", "instagram", "facebook")
    favorite_network = np.random.choice(choices, p=weights[age])
    return favorite_network

favorite_social_network = survey_results.age.apply(weighted_choice)
survey_results['favorite_social_network'] = favorite_social_network
survey_results.head(10)

Unnamed: 0_level_0,age,favorite_social_network
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,less than 18,facebook
1,19-36,facebook
2,19-36,instagram
3,19-36,instagram
4,less than 18,snapchat
5,19-36,snapchat
6,37+,facebook
7,37+,instagram
8,37+,facebook
9,37+,facebook


In [9]:
from statsmodels.stats.contingency_tables import Table

table =  Table.from_data(survey_results)
print(table)

Contingency Table: 
favorite_social_network  facebook  instagram  snapchat
age                                                   
19-36                         341        242       225
37+                           440        110        49
less than 18                   65        247       281


In [10]:
independence_result = table.test_nominal_association()
print(independence_result)

Contingency Table Independence Result:
chi-squared statistic: 494.16888956343263
degrees of freedom: 4
p value: 0.0



In [11]:
%%R -i survey_results

chisq.test(table(survey_results))


	Pearson's Chi-squared test

data:  table(survey_results)
X-squared = 494.17, df = 4, p-value < 2.2e-16



## A table with multiple response factors

In [15]:
import statsmodels.api as sm
from statsmodels.datasets import presidential2016

data = sm.datasets.presidential2016.load_pandas()
presidential_data = data.data
presidential_data.head()

Unnamed: 0,Hillary_Clinton,Donald_Trump,Jill_Stein,Gary_Johnson,None_Of_The_Above,I_Probably_Wont_Vote,Hillary_Clinton_is_involved_in_many_coverups,Trump_changes_his_positions_all_of_the_time,Hillary_Clinton_lied_to_the_families_of_Americans_killed_in_Benghazi,Trump_is_a_successful_businessman,Trumps_temper_could_get_the_country_into_trouble,I_wish_another_candidate_had_won_the_primary,Need_to_do_more_research,Dont_like__any_candidate,Not_sure_which_candidate_shares_my_values,Waiting_for_debates
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [89]:
narrow_row_factor = rows_factor.cast_wide_to_narrow()
srcv_presidential = pd.concat([narrow_row_factor.as_dataframe().loc[:, 'variable'], columns_factor.as_dataframe()], axis=1)
srcv_presidential.head()

Unnamed: 0,variable,Hillary_Clinton_is_involved_in_many_coverups,Trump_changes_his_positions_all_of_the_time,Hillary_Clinton_lied_to_the_families_of_Americans_killed_in_Benghazi,Trump_is_a_successful_businessman,Trumps_temper_could_get_the_country_into_trouble
0,None_Of_The_Above,0.0,0.0,0.0,1.0,0.0
1,None_Of_The_Above,0.0,0.0,0.0,0.0,0.0
2,Hillary_Clinton,0.0,0.0,0.0,0.0,0.0
3,Donald_Trump,0.0,0.0,0.0,0.0,0.0
4,Hillary_Clinton,1.0,0.0,0.0,0.0,1.0


### Single response versus multiple response

In [13]:
from statsmodels.stats.contingency_tables import Factor, MRCVTable

In [14]:
rows_factor = Factor(data.data.iloc[:, :6], data.data.columns[:6], "expected_choice", orientation="wide")
columns_factor = Factor(data.data.iloc[:, 6:11], data.data.columns[6:11], "believe_true", orientation="wide")
multiple_response_table = MRCVTable([rows_factor,], [columns_factor])
multiple_response_table

variable_col          Hillary_Clinton_is_involved_in_many_coverups  \
variable_row                                                         
Donald_Trump                                                    51   
Gary_Johnson                                                    23   
Hillary_Clinton                                                 22   
I_Probably_Wont_Vote                                            36   
Jill_Stein                                                       4   
None_Of_The_Above                                               34   

variable_col          Hillary_Clinton_lied_to_the_families_of_Americans_killed_in_Benghazi  \
variable_row                                                                                 
Donald_Trump                                                         45                      
Gary_Johnson                                                         18                      
Hillary_Clinton                                                

### MMI Item Response Table

In [19]:
srcv_item_response_table_python = multiple_response_table._build_item_response_table_for_MMI(rows_factor, columns_factor)
srcv_item_response_table_python

multiple_response_level,Hillary_Clinton_is_involved_in_many_coverups,Hillary_Clinton_is_involved_in_many_coverups,Hillary_Clinton_lied_to_the_families_of_Americans_killed_in_Benghazi,Hillary_Clinton_lied_to_the_families_of_Americans_killed_in_Benghazi,Trump_changes_his_positions_all_of_the_time,Trump_changes_his_positions_all_of_the_time,Trump_is_a_successful_businessman,Trump_is_a_successful_businessman,Trumps_temper_could_get_the_country_into_trouble,Trumps_temper_could_get_the_country_into_trouble
selected?,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
single_response_level,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
Donald_Trump,131,51,137,45,172,10,143,39,168,14
Gary_Johnson,86,23,91,18,92,17,93,16,96,13
Hillary_Clinton,205,22,209,18,176,51,206,21,185,42
I_Probably_Wont_Vote,180,36,197,19,192,24,194,22,197,19
Jill_Stein,35,4,36,3,27,12,34,5,32,7
None_Of_The_Above,193,34,200,27,201,26,206,21,203,24


In [20]:
srcv_item_response_table_python.iloc[:, (0,1)].sum().sum()

1000

In [77]:
result_dir = "/Users/gjlondon/programming/open_source/statsmodels/statsmodels/stats/tests/results/"
fpath = os.path.join(result_dir, "srcv_r_item_response_table_result.csv")
%R -i fpath

In [99]:
%%R -i srcv_presidential

library('MRCV')

srcv_item_response_table_r <- item.response.table(srcv_presidential, I=1, J=5)
srcv_item_response_table_r_dataframe = as.data.frame.matrix(srcv_item_response_table_r)
write.table(srcv_item_response_table_r, file = fpath, sep=",")
## R is apparently bad at writing out tables with nested headers so we just get "term", "term", "term"
colnames(srcv_item_response_table_r)

 [1] "term" "term" "term" "term" "term" "term" "term" "term" "term" "term"


In [105]:
result_dir = "/Users/gjlondon/programming/open_source/statsmodels/statsmodels/stats/tests/results/"
fpath = os.path.join(result_dir, "srcv_r_item_response_table_result.csv")
srcv_item_response_table_r = pd.DataFrame.from_csv(fpath)
srcv_item_response_table_r

Unnamed: 0,term,term.1,term.2,term.3,term.4,term.5,term.6,term.7,term.8,term.9
1,131,51,172,10,137,45,143,39,168,14
2,86,23,92,17,91,18,93,16,96,13
3,205,22,176,51,209,18,206,21,185,42
4,180,36,192,24,197,19,194,22,197,19
5,35,4,27,12,36,3,34,5,32,7
6,193,34,201,26,200,27,206,21,203,24


In [128]:
for i in range(0, len(columns_factor.labels)*2, 2):
    c = columns_factor.labels[i // 2]
    r_left_offset = i
    r_right_offset = i + 2
    py_group = srcv_item_response_table_python.loc[:, c]
    r_group = srcv_item_response_table_r.iloc[:, r_left_offset:r_right_offset]
    assert_allclose(py_group.values, r_group)

## MMI Full Chi-squard tables

In [122]:
python_result = multiple_response_table._calculate_pairwise_chi2s_for_MMI_item_response_table(rows_factor, columns_factor)
python_result

Hillary_Clinton_is_involved_in_many_coverups                            27.493592
Trump_changes_his_positions_all_of_the_time                             36.511000
Hillary_Clinton_lied_to_the_families_of_Americans_killed_in_Benghazi    33.065276
Trump_is_a_successful_businessman                                       19.305086
Trumps_temper_could_get_the_country_into_trouble                        16.238789
dtype: float64

In [87]:
%%R -i srcv_presidential

library('MRCV')
r_srcv_chis <- MI.stat(srcv_presidential, I=1, J=5)

$X.sq.S
[1] 132.6137

$X.sq.S.ij
     Hillary_Clinton_is_involved_in_many_coverups
[1,]                                     27.49359
     Trump_changes_his_positions_all_of_the_time
[1,]                                      36.511
     Hillary_Clinton_lied_to_the_families_of_Americans_killed_in_Benghazi
[1,]                                                             33.06528
     Trump_is_a_successful_businessman
[1,]                          19.30509
     Trumps_temper_could_get_the_country_into_trouble
[1,]                                         16.23879

$valid.margins
[1] 5



In [82]:
%R -o r_srcv_chis

In [134]:

r_results = pd.Series({k:v for k, v in zip(r_srcv_chis[1].names[1], r_srcv_chis[1])})
r_result_reordered = r_results.reindex(python_result.index, )
assert_allclose(python_result, r_result_reordered)

In [137]:
result_dir = "/Users/gjlondon/programming/open_source/statsmodels/statsmodels/stats/tests/results/"
fpath = os.path.join(result_dir, "srcv_r_all_chis_result.csv")
r_result_reordered.to_csv(fpath)

### MMI bonferroni test

In [134]:
bonferroni_test = multiple_response_table._test_for_marginal_mutual_independence_using_bonferroni_correction
results = bonferroni_test(rows_factor, columns_factor)
table_p_value_bonferroni_corrected, pairwise_bonferroni_corrected_p_values = results
print("Overall table p value: {}\n\n".format(table_p_value_bonferroni_corrected))
print("Pairwise p values (likelihood of independence between single select variable and specific multi-select option):")
pairwise_bonferroni_corrected_p_values

Overall table p value: 3.752624460014494e-06


Pairwise p values (likelihood of independence between single select variable and specific multi-select option):


Hillary_Clinton_is_involved_in_many_coverups                            0.000229
Trump_changes_his_positions_all_of_the_time                             0.000004
Hillary_Clinton_lied_to_the_families_of_Americans_killed_in_Benghazi    0.000018
Trump_is_a_successful_businessman                                       0.008431
Trumps_temper_could_get_the_country_into_trouble                        0.030972
dtype: float64

In [146]:
result_dir = "/Users/gjlondon/programming/open_source/statsmodels/statsmodels/stats/tests/results/"
fpath = os.path.join(result_dir, "srcv_r_bonferroni.csv")
%R -i fpath

In [169]:
%%R 

res <- MI.test(srcv_presidential, I=1, J=5, type="bon")
res$bon$X.sq.S.ij.p.bon
write.csv(res$bon, file=fpath)

In [166]:
r_result = pd.DataFrame.from_csv(fpath)
table_p_value_r = r_result["p.value.bon"] 
cell_p_values_r = r_result.iloc[:, 1:]
reshaped_python_values = pairwise_bonferroni_corrected_p_values.values.reshape(5,1)
assert_allclose(reshaped_python_values, cell_p_values_r.T)
assert_allclose(table_p_value_r, table_p_value_bonferroni_corrected)

### MMI Rao Scott 2 Test

In [295]:
import itertools
from scipy.stats import chi2_contingency, chi2

self = multiple_response_table
single_response_factor = rows_factor
multiple_response_factor = columns_factor

if single_response_factor.orientation == "wide":
    W = single_response_factor.cast_wide_to_narrow().as_dataframe()
else:
    W = single_response_factor.as_dataframe()
if not isinstance(W, pd.Series):
    W = W.iloc[:, 0]
Y = multiple_response_factor.as_dataframe()
n = len(W)
I = 1  # single response variable must have exactly one column
J = len(Y.columns)
c = J
r = len(W.unique())

def conjoint_combinations(srcv, mrcv):
    number_of_variables = 1 + len(mrcv.columns)
    srcv = srcv.copy()  # don't modify original dataframe
    mrcv = mrcv.copy()  # don't modify original dataframe
    srcv.name = "srcv"
    srcv_level_arguments = srcv.unique()
    mrcv_level_arguments = [[0, 1] for i in range(0, number_of_variables - 1)]
    level_arguments = [list(srcv_level_arguments), ] + mrcv_level_arguments
    variables = ['srcv', ] + list(mrcv.columns)
    level_combinations = list(itertools.product(*level_arguments))
    full_combinations = pd.DataFrame(level_combinations, columns=variables)
    full_combinations["_dummy"] = 0
    data = pd.concat([srcv, mrcv], axis=1)
    data.srcv.value_counts()
    data['_dummy'] = 1
    data = pd.concat([data, full_combinations]).reset_index(drop=True)
    grouped = data.groupby(list(variables))
    result = grouped.sum().reset_index()
    return result

def count_level_combinations(data, number_of_variables):
    data = data.copy()  # don't modify original dataframe
    level_arguments = [[0, 1] for i in range(0, number_of_variables)]
    variables = data.columns
    level_combinations = list(itertools.product(*level_arguments))
    full_combinations = pd.DataFrame(level_combinations, columns=variables)
    full_combinations["_dummy"] = 0
    data['_dummy'] = 1
    data = pd.concat([data, full_combinations]).reset_index(drop=True)
    grouped = data.groupby(list(variables))
    return grouped.sum().reset_index()

Y_count_ordered = count_level_combinations(Y, J)
n_count_ordered = conjoint_combinations(W, Y)
# n_count_ordered.sort_values("_dummy", inplace=True)

# need make n_iplus be in same order as SRCV options in the n_counts_ordered_table
srcv_table_order = n_count_ordered.groupby('srcv').first().index.values
n_iplus = W.value_counts().reindex(srcv_table_order)
tau = n_count_ordered.iloc[:, -1].astype(int) / np.repeat(n_iplus, repeats=(2 ** c)).reset_index(drop=True)
# the R version subtracts 1 from G_tilde because data.matrix converts 0->1 and 1->2
# (probably because it thinks they're factors and it's internally coding them)
G_tilde = Y_count_ordered.iloc[:, :-1].T
I_r = np.eye(r)
G = np.kron(I_r, G_tilde)
pi = G.dot(tau)
m = pi * np.repeat(n_iplus, c)
a_i = n_iplus / n
pi_not_j = (1 / n) * np.kron(np.ones(r), np.eye(c)).dot(m)
j_r = np.ones(r)
pi_not = np.kron(j_r, pi_not_j)
I_rc = np.eye(r * c)
I_c = np.eye(c)
J_rr = np.ones((r, r))
A = np.diag(a_i)
H = I_rc - np.kron(J_rr.dot(A), I_c)
D = np.kron(np.diag(n / n_iplus), np.diag(pi_not_j) * (1 - pi_not_j))
v_dim = r * (2 ** c)
V = np.zeros((v_dim, v_dim))

for i in range(1, r+1):
    a = ((i - 1) * (2 ** c) + 1) - 1
    b = ((i - 1) * (2 ** c) + (2 ** c)) - 1
    print(a)
    print(b)
    tau_range = tau[a:b]
    a_v = (1 / a_i[i - 1])
    tau_diag = np.diag(tau_range)
    tau_tcrossproduct = np.outer(tau_range, tau_range.T)
    v = a_v * (tau_diag - tau_tcrossproduct)
    V[a:b, a:b] = v

D_diag = np.diag(1 / np.diag(D))
tcrossprod_VG = V.dot(G.T)
tcrossprod_VGH = tcrossprod_VG.dot(H.T)

0
31
32
63
64
95
96
127
128
159
160
191


In [284]:
tcrossprod_VGH[176:]


array([[-0.05992742,  0.00807312,  0.00838363,  0.0065206 ,  0.00745211,
        -0.05992742,  0.00807312,  0.00838363,  0.0065206 ,  0.00745211,
        -0.05992742,  0.00807312,  0.00838363,  0.0065206 ,  0.00745211,
        -0.05992742,  0.00807312,  0.00838363,  0.0065206 ,  0.00745211,
        -0.05992742,  0.00807312,  0.00838363,  0.0065206 ,  0.00745211,
         0.20407002, -0.0274913 , -0.02854866, -0.02220451, -0.02537658],
       [-0.02247278,  0.00302742,  0.00314386,  0.00244523, -0.02363718,
        -0.02247278,  0.00302742,  0.00314386,  0.00244523, -0.02363718,
        -0.02247278,  0.00302742,  0.00314386,  0.00244523, -0.02363718,
        -0.02247278,  0.00302742,  0.00314386,  0.00244523, -0.02363718,
        -0.02247278,  0.00302742,  0.00314386,  0.00244523, -0.02363718,
         0.07652626, -0.01030924, -0.01070575, -0.00832669,  0.08049135],
       [-0.02996371,  0.00403656,  0.00419181, -0.03198199,  0.00372606,
        -0.02996371,  0.00403656,  0.00419181, -0

In [281]:
tcrossprod_VGH[175:, 175:]

array([], shape=(17, 0), dtype=float64)

In [275]:
G[175:, 175:]

array([], shape=(0, 17), dtype=float64)

In [None]:
Di_HG = D_diag.dot(H).dot(G)
Di_HGVGH = np.matmul(Di_HG, tcrossprod_VGH)

In [290]:
Di_HGVGH


array([[  1.08613205e+00,   4.30916853e-03,  -1.03167226e-01,
          2.83399467e-01,  -1.79802350e-02,  -2.97907691e-01,
          1.67903206e-02,   3.30986263e-02,  -5.66251887e-02,
          8.38301214e-03,  -1.96058928e-01,  -7.99913105e-03,
          1.60501906e-02,  -5.42555394e-02,   1.18233523e-02,
         -2.62313323e-01,  -7.96308985e-03,   2.47582749e-02,
         -6.92576425e-02,   6.44651058e-03,  -2.01890637e-01,
         -2.74326221e-02,   2.19962632e-02,  -2.04483615e-02,
         -3.09790421e-02,  -2.47424250e-01,   8.77219733e-03,
          2.34345815e-02,  -7.63586483e-02,  -2.24453811e-03],
       [  5.05003056e-03,   4.49648993e-01,   6.17919312e-02,
         -5.27174781e-03,  -4.25648721e-03,   1.96770286e-02,
         -1.02126747e-01,  -1.17682726e-02,   6.93153550e-03,
          1.52469322e-02,  -9.37439694e-03,  -1.66453199e-01,
         -8.25323478e-03,   1.81610685e-02,   2.33289295e-02,
         -9.33215928e-03,  -5.24345210e-02,  -2.24438050e-02,
       

In [296]:
eigenvalues, eigenvectors = np.linalg.eig(Di_HGVGH)
eigenvalues

array([  2.18652025e+00,   1.82297722e+00,   1.51739565e+00,
         1.50833538e+00,  -1.12449488e-15,   3.62206156e-17,
         1.35730999e+00,   4.33653721e-01,   1.28204725e+00,
         1.22394222e+00,   4.97534199e-01,   5.48117079e-01,
         5.68317028e-01,   5.93728437e-01,   6.73796880e-01,
         6.99785705e-01,   7.25101857e-01,   1.14648694e+00,
         1.11760946e+00,   8.34071100e-01,   8.50931406e-01,
         8.83476358e-01,   9.16463481e-01,   1.03159656e+00,
         1.00776387e+00,   9.73760416e-01,   9.87045624e-01,
        -2.98197989e-16,  -1.24293653e-16,   6.20393358e-17])

In [297]:
Di_HGVGH_eigen = np.real(eigenvalues)
sum_Di_HGVGH_eigen_sq = (Di_HGVGH_eigen ** 2).sum()
sum_Di_HGVGH_eigen_sq

30.152578631184642

In [298]:
observed = self._calculate_pairwise_chi2s_for_MMI_item_response_table(single_response_factor,
                                                                      multiple_response_factor)
observed_X_sq = observed.sum()

In [299]:
observed_X_sq

132.61374286431663

In [300]:
rows_by_columns = ((r - 1) * c)

X_sq_S_rs2 = rows_by_columns * observed_X_sq / sum_Di_HGVGH_eigen_sq
df_rs2 = ((r - 1) ** 2) * (c ** 2) / sum_Di_HGVGH_eigen_sq
df_rs2

20.727912118057041

In [301]:
X_sq_S_p_value_rs2 = chi2.sf(X_sq_S_rs2, df=df_rs2)
print(X_sq_S_p_value_rs2)

3.79886988451e-14


In [277]:
_test_for_marginal_mutual_independence_using_rao_scott_2(multiple_response_table, rows_factor, columns_factor)

0         None_Of_The_Above
1         None_Of_The_Above
2           Hillary_Clinton
3              Donald_Trump
4           Hillary_Clinton
5                Jill_Stein
6         None_Of_The_Above
7           Hillary_Clinton
8         None_Of_The_Above
9              Donald_Trump
10        None_Of_The_Above
11     I_Probably_Wont_Vote
12          Hillary_Clinton
13             Donald_Trump
14             Donald_Trump
15               Jill_Stein
16          Hillary_Clinton
17             Donald_Trump
18        None_Of_The_Above
19             Gary_Johnson
20        None_Of_The_Above
21        None_Of_The_Above
22     I_Probably_Wont_Vote
23     I_Probably_Wont_Vote
24             Gary_Johnson
25     I_Probably_Wont_Vote
26             Gary_Johnson
27             Gary_Johnson
28        None_Of_The_Above
29               Jill_Stein
               ...         
970            Gary_Johnson
971    I_Probably_Wont_Vote
972       None_Of_The_Above
973            Gary_Johnson
974         Hillary_

0.0

In [302]:
rao_scott_test = multiple_response_table._test_for_marginal_mutual_independence_using_rao_scott_2
table_p_value_rao_scott_corrected = rao_scott_test(rows_factor, columns_factor)
print("Overall table p value: {}\n\n".format(table_p_value_rao_scott_corrected))

Overall table p value: 3.798869884507482e-14




In [192]:
result_dir = "/Users/gjlondon/programming/open_source/statsmodels/statsmodels/stats/tests/results/"
fpath = os.path.join(result_dir, "srcv_r_rao_scott.csv")
%R -i fpath

In [203]:
%%R 

res <- MI.test(srcv_presidential, I=1, J=5, type="rs2")
res$rs2
write.csv(res$rs2, file=fpath)
res$rs2

$X.sq.S.rs2
[1] 109.9522

$df.rs2
[1] 20.72791

$p.value.rs2
[1] 3.796963e-14



In [178]:
r_result = pd.DataFrame.from_csv(fpath)
table_p_value_r = r_result["p.value.rs2"] 
assert_allclose(table_p_value_r, table_p_value_rao_scott_corrected)

AssertionError: 
Not equal to tolerance rtol=1e-07, atol=0

(mismatch 100.0%)
 x: array([  3.796963e-14])
 y: array(2.9278708403821406e-25)

In this case the Rao Scott adjusted test is much less conservative, even compared to the naive chi-squared test. In this case the p value is so low that it rounds to zero.

## Multiple response versus multiple response

When we compare two (or more) multiple response questions against each other (e.g. question 2 versus question 3 in our sample presidential survey), we test for **simultaneous pairwise mutual independence (SPMI)**, i.e. whether each possible choice on one multiple response question is simultaneously indepedent of each possible choice on a different multiple response question.

Once again, we treat each multiple response question as a compound of individual "sub-questions", one for each answer option (i.e. "does A apply (yes/no)? Does B apply (yes/no)?" Then we say that the multiple response questions have simultaneous pairwise mutual independence if each answer choice in the first question is simultaneously independent of each answer choice in the second question.

As before we start by building a full "item response table" comparing answer choice on the first question each answer choice on the second question:


In [25]:
rows_factor = Factor(data.data.iloc[:, 6:11], data.data.columns[6:11], "believe_true", orientation="wide")
columns_factor = Factor(data.data.iloc[:, 11:], data.data.columns[11:], "why_uncertain", orientation="wide")


Unnamed: 0_level_0,column_levels,Dont_like__any_candidate,Dont_like__any_candidate,I_wish_another_candidate_had_won_the_primary,I_wish_another_candidate_had_won_the_primary,Need_to_do_more_research,Need_to_do_more_research,Not_sure_which_candidate_shares_my_values,Not_sure_which_candidate_shares_my_values,Waiting_for_debates,Waiting_for_debates
Unnamed: 0_level_1,selected?,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
row_levels,selected?,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
Hillary_Clinton_is_involved_in_many_coverups,0.0,720,110,683,147,661,169,719,111,719,111
Hillary_Clinton_is_involved_in_many_coverups,1.0,126,44,121,49,155,15,148,22,158,12
Hillary_Clinton_lied_to_the_families_of_Americans_killed_in_Benghazi,0.0,748,122,685,185,713,157,752,118,777,93
Hillary_Clinton_lied_to_the_families_of_Americans_killed_in_Benghazi,1.0,98,32,119,11,103,27,115,15,100,30
Trump_changes_his_positions_all_of_the_time,0.0,721,139,703,157,702,158,750,110,747,113
Trump_changes_his_positions_all_of_the_time,1.0,125,15,101,39,114,26,117,23,130,10
Trump_is_a_successful_businessman,0.0,753,123,715,161,709,167,759,117,761,115
Trump_is_a_successful_businessman,1.0,93,31,89,35,107,17,108,16,116,8
Trumps_temper_could_get_the_country_into_trouble,0.0,763,118,692,189,715,166,774,107,771,110
Trumps_temper_could_get_the_country_into_trouble,1.0,83,36,112,7,101,18,93,26,106,13


In [26]:
multiple_response_table._build_item_response_table_for_SPMI(rows_factor, columns_factor)

Unnamed: 0_level_0,column_levels,Dont_like__any_candidate,Dont_like__any_candidate,I_wish_another_candidate_had_won_the_primary,I_wish_another_candidate_had_won_the_primary,Need_to_do_more_research,Need_to_do_more_research,Not_sure_which_candidate_shares_my_values,Not_sure_which_candidate_shares_my_values,Waiting_for_debates,Waiting_for_debates
Unnamed: 0_level_1,selected?,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
row_levels,selected?,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
Hillary_Clinton_is_involved_in_many_coverups,0.0,720,110,683,147,661,169,719,111,719,111
Hillary_Clinton_is_involved_in_many_coverups,1.0,126,44,121,49,155,15,148,22,158,12
Hillary_Clinton_lied_to_the_families_of_Americans_killed_in_Benghazi,0.0,748,122,685,185,713,157,752,118,777,93
Hillary_Clinton_lied_to_the_families_of_Americans_killed_in_Benghazi,1.0,98,32,119,11,103,27,115,15,100,30
Trump_changes_his_positions_all_of_the_time,0.0,721,139,703,157,702,158,750,110,747,113
Trump_changes_his_positions_all_of_the_time,1.0,125,15,101,39,114,26,117,23,130,10
Trump_is_a_successful_businessman,0.0,753,123,715,161,709,167,759,117,761,115
Trump_is_a_successful_businessman,1.0,93,31,89,35,107,17,108,16,116,8
Trumps_temper_could_get_the_country_into_trouble,0.0,763,118,692,189,715,166,774,107,771,110
Trumps_temper_could_get_the_country_into_trouble,1.0,83,36,112,7,101,18,93,26,106,13


Now as before we can calculate a chi-squared statistic for each individual pairing of column level and row level:


In [61]:
multiple_response_table._calculate_pairwise_chi2s_for_SPMI_item_response_table(rows_factor, columns_factor)

Unnamed: 0,I_wish_another_candidate_had_won_the_primary,Need_to_do_more_research,Dont_like__any_candidate,Not_sure_which_candidate_shares_my_values,Waiting_for_debates
Hillary_Clinton_is_involved_in_many_coverups,17.2742,11.0574,12.5105,0.0228698,5.21583
Trump_changes_his_positions_all_of_the_time,9.74002,11.7642,0.558638,0.402103,16.0883
Hillary_Clinton_lied_to_the_families_of_Americans_killed_in_Benghazi,2.74341,7.04331,0.00318631,1.38182,4.01368
Trump_is_a_successful_businessman,10.0131,6.68351,2.07403,0.0193256,4.48833
Trumps_temper_could_get_the_country_into_trouble,22.8695,16.1294,0.964289,8.56061,0.236957


Now we have to determine how likely it is to get chi-squared statistics this large in each of those cells of the same time. Once again the simplest way to do that is to apply a bonferroni correction, i.e. calculate a p value for each cell using a chi-square distribution and multiply the p value by the total number of cells. 

In [62]:
bonferroni_test = multiple_response_table._test_for_single_pairwise_mutual_independence_using_bonferroni
results = bonferroni_test(rows_factor, columns_factor)
table_p_value_bonferroni_corrected, pairwise_bonferroni_corrected_p_values = results
print("Overall table p value: {}\n\n".format(table_p_value_bonferroni_corrected))
print("Pairwise p values (likelihood of independence between single select variable and specific multi-select option):")
pairwise_bonferroni_corrected_p_values

Overall table p value: 4.3346430242129665e-05


Pairwise p values (likelihood of independence between single select variable and specific multi-select option):


Unnamed: 0,I_wish_another_candidate_had_won_the_primary,Need_to_do_more_research,Dont_like__any_candidate,Not_sure_which_candidate_shares_my_values,Waiting_for_debates
Hillary_Clinton_is_involved_in_many_coverups,0.000809,0.022083,0.010117,1.0,0.559553
Trump_changes_his_positions_all_of_the_time,0.045075,0.015095,1.0,1.0,0.001511
Hillary_Clinton_lied_to_the_families_of_Americans_killed_in_Benghazi,1.0,0.198904,1.0,1.0,1.0
Trump_is_a_successful_businessman,0.038857,0.243271,1.0,1.0,0.853173
Trumps_temper_could_get_the_country_into_trouble,4.3e-05,0.001479,1.0,0.085879,1.0


Let's compare this result versus the p value we would get by applying a traditional chi-squared test:

In [65]:
_,  traditional_chi2_p_value, _, _ = chi2_contingency(multiple_response_table.table)
print("Naive chi-squared p value: {}".format(traditional_chi2_p_value))
ratio = table_p_value_bonferroni_corrected / traditional_chi2_p_value  
print("The naive chi-squared method overstates our confidence by a factor of {}".format(ratio))

Naive chi-squared p value: 1.7079214808649325e-09
The naive chi-squared method overstates our confidence by a factor of 25379.63877600391


In this case the traditional chi-squared test overstates our confidence by four orders of magnitude.

However we know that the bonferroni-adjustment can be conservative so we can try a 2nd order Rao Scott correction:

In [66]:
rao_scott_test = multiple_response_table._test_for_single_pairwise_mutual_independence_using_rao_scott_2
table_p_value_rao_scott_corrected = rao_scott_test(rows_factor, columns_factor)
print("Overall table p value: {}\n\n".format(table_p_value_rao_scott_corrected))

Overall table p value: 6.256504667259361e-18




Both the bonferroni and the row scott methods require us to make some assumptions about the distribution of the deviations we observe in our response table. Ever not comfortable making assumptions we can instead use a nonparametric method by running a bootstrap simulation to approximate the distribution of deviations. The bootstrap method is more computationally intensive but allows us to estimate the p value using fewer assumptions.

In [None]:
bootstrap_test = multiple_response_table._test_for_single_pairwise_mutual_independence_using_bootstrap
results = bootstrap_test(rows_factor, columns_factor, verbose=True)
table_p_value_bootstrap, pairwise_bootstrap_p_values = results
print("Overall table p value: {}\n\n".format(table_p_value_bootstrap))
print("Pairwise p values (likelihood of independence between single select variable and specific multi-select option):")
pairwise_bootstrap_p_values

sample 0


  return self._int64index.union(other)


sample 50
sample 100
sample 150
sample 200
sample 250
sample 300
sample 350
sample 400
sample 450
sample 500
