In [34]:
import numpy as np
import numpy.random as rand
from numpy.testing import assert_allclose, assert_equal

import pandas as pd
from scipy.stats import chi2_contingency

import sys
import os
# Make the local statsmodels checkout (two directories up) importable.
PACKAGE_PARENT = '../..'
sys.path.append(os.path.normpath(PACKAGE_PARENT))

# Directory that receives the R reference results used by the test-suite.
# Set STATSMODELS_RESULT_DIR to point at another checkout instead of editing
# the machine-specific default below.
result_dir = os.environ.get(
    "STATSMODELS_RESULT_DIR",
    "/Users/gjlondon/programming/open_source/statsmodels/statsmodels/stats/tests/results/",
)

%load_ext autoreload
%autoreload 2
%load_ext rpy2.ipython

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


In [3]:
from IPython.core.interactiveshell import InteractiveShell
# Echo every top-level expression in a cell, not just the last one.
InteractiveShell.ast_node_interactivity = "all"
# Request high-resolution ("retina") inline figures.
%config InlineBackend.figure_format = 'retina'

## A basic table:

In [3]:
# Seed NumPy's global generator so the synthetic survey is identical on every
# Restart & Run All. Outputs recorded before the seed was added will differ
# until the notebook is re-run.
np.random.seed(2016)

# Age band and favourite show are drawn independently of each other, so the
# independence test below should not find an association.
age = np.random.choice(['18-36', '37-54', '55+'], size=2000, p=[0.3, 0.4, 0.3])
favorite_show = np.random.choice(['NCIS', 'House of Cards', 'Westworld'], size=2000, p=[0.2, 0.4, 0.4])
survey_results = pd.DataFrame({"age": age, "favorite_show": favorite_show})
survey_results.index.name = "respondent_id"
survey_results.head(10)

Unnamed: 0_level_0,age,favorite_show
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,55+,House of Cards
1,55+,Westworld
2,37-54,House of Cards
3,37-54,Westworld
4,18-36,Westworld
5,18-36,House of Cards
6,55+,House of Cards
7,37-54,House of Cards
8,18-36,Westworld
9,37-54,House of Cards


In [4]:
%%R -i survey_results

# Cross-tabulate the two columns of the imported data frame in R.
table(survey_results)

       favorite_show
age     House of Cards NCIS Westworld
  18-36            245  113       264
  37-54            323  152       323
  55+              239  116       225


In [5]:
from statsmodels.stats.contingency_tables import Table

# Build the contingency table from the raw survey rows; the printed counts can
# be compared with the R cross-tabulation of the same data frame.
table =  Table.from_data(survey_results)
print(table)

Contingency Table: 
favorite_show  House of Cards  NCIS  Westworld
age                                           
18-36                     245   113        264
37-54                     323   152        323
55+                       239   116        225


In [6]:
# Run the table's nominal association test and print the resulting
# chi-squared statistic, degrees of freedom and p value.
independence_result = table.test_nominal_association()
print(independence_result)

Contingency Table Independence Result:
chi-squared statistic: 1.7704843257882488
degrees of freedom: 4
p value: 0.7778777647485371



In [7]:
%%R -i survey_results

# Reference result: R's Pearson chi-squared test on the same cross-tabulation.
chisq.test(table(survey_results))


	Pearson's Chi-squared test

data:  table(survey_results)
X-squared = 1.7705, df = 4, p-value = 0.7779



## A table with a relationship:

In [8]:
# Seed the global generator so both the age draw here and the per-respondent
# draws made through np.random further down are reproducible on a linear run.
# Outputs recorded before the seed was added will differ until re-run.
np.random.seed(2017)

age = np.random.choice(['less than 18', '19-36', '37+'], size=2000, p=[0.3, 0.4, 0.3])
survey_results = pd.DataFrame({"age": age})
survey_results.index.name = "respondent_id"

def weighted_choice(age):
    """Draw one favourite social network for a respondent in the given age band.

    Parameters
    ----------
    age : str
        One of 'less than 18', '19-36' or '37+'. Any other value raises
        KeyError from the probability lookup.

    Returns
    -------
    The sampled network name ("snapchat", "instagram" or "facebook"), drawn
    with np.random.choice using the probabilities of the age band.
    """
    networks = ("snapchat", "instagram", "facebook")
    # Probabilities per age band, listed in the same order as `networks`.
    probabilities_by_age = {
        'less than 18': (.5, .4, .1),
        '19-36': (.3, .3, .4),
        '37+': (.1, .2, .7),
    }
    return np.random.choice(networks, p=probabilities_by_age[age])

# One independent draw per respondent, conditioned on that respondent's age
# band, so the new column is associated with age by construction.
favorite_social_network = survey_results.age.apply(weighted_choice)
survey_results['favorite_social_network'] = favorite_social_network
survey_results.head(10)

Unnamed: 0_level_0,age,favorite_social_network
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,less than 18,facebook
1,19-36,facebook
2,19-36,instagram
3,19-36,instagram
4,less than 18,snapchat
5,19-36,snapchat
6,37+,facebook
7,37+,instagram
8,37+,facebook
9,37+,facebook


In [9]:
# NOTE(review): Table is already imported by an earlier cell; this import is redundant.
from statsmodels.stats.contingency_tables import Table

# Re-binds `table` to the contingency table of the age/social-network survey.
table =  Table.from_data(survey_results)
print(table)

Contingency Table: 
favorite_social_network  facebook  instagram  snapchat
age                                                   
19-36                         341        242       225
37+                           440        110        49
less than 18                   65        247       281


In [10]:
# Same association test as before, now on data generated with an age-dependent
# preference; re-binds `independence_result`.
independence_result = table.test_nominal_association()
print(independence_result)

Contingency Table Independence Result:
chi-squared statistic: 494.16888956343263
degrees of freedom: 4
p value: 0.0



In [11]:
%%R -i survey_results

# Reference result from R for the age/social-network table.
chisq.test(table(survey_results))


	Pearson's Chi-squared test

data:  table(survey_results)
X-squared = 494.17, df = 4, p-value < 2.2e-16



## A table with multiple response factors

In [4]:
import statsmodels.api as sm
# NOTE(review): the name imported on the next line is not used directly; the
# dataset is reached through sm.datasets below.
from statsmodels.datasets import presidential2016

# Load the bundled survey as pandas objects; `presidential_data` is the frame
# of 0/1 indicator columns shown in the output.
data = sm.datasets.presidential2016.load_pandas()
presidential_data = data.data
presidential_data.head()

Unnamed: 0,Hillary_Clinton,Donald_Trump,Jill_Stein,Gary_Johnson,None_Of_The_Above,I_Probably_Wont_Vote,Hillary_Clinton_is_involved_in_many_coverups,Trump_changes_his_positions_all_of_the_time,Hillary_Clinton_lied_to_the_families_of_Americans_killed_in_Benghazi,Trump_is_a_successful_businessman,Trumps_temper_could_get_the_country_into_trouble,I_wish_another_candidate_had_won_the_primary,Need_to_do_more_research,Dont_like__any_candidate,Not_sure_which_candidate_shares_my_values,Waiting_for_debates
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [89]:
# NOTE(review): rows_factor and columns_factor are only defined in a later cell
# (and Factor is imported later still), so on a fresh kernel this cell fails
# when run in document order. It should move below the Factor definitions.
#
# Collapse the wide single-response indicators into one 'variable' column and
# put it next to the wide multiple-response columns; this is the frame handed to R.
narrow_row_factor = rows_factor.cast_wide_to_narrow()
srcv_presidential = pd.concat([narrow_row_factor.as_dataframe().loc[:, 'variable'], columns_factor.as_dataframe()], axis=1)
srcv_presidential.head()

Unnamed: 0,variable,Hillary_Clinton_is_involved_in_many_coverups,Trump_changes_his_positions_all_of_the_time,Hillary_Clinton_lied_to_the_families_of_Americans_killed_in_Benghazi,Trump_is_a_successful_businessman,Trumps_temper_could_get_the_country_into_trouble
0,None_Of_The_Above,0.0,0.0,0.0,1.0,0.0
1,None_Of_The_Above,0.0,0.0,0.0,0.0,0.0
2,Hillary_Clinton,0.0,0.0,0.0,0.0,0.0
3,Donald_Trump,0.0,0.0,0.0,0.0,0.0
4,Hillary_Clinton,1.0,0.0,0.0,0.0,1.0


### Single response versus multiple response

In [12]:
from statsmodels.stats.contingency_tables import Factor, MRCVTable

In [14]:
# Row factor: the first six indicator columns (one expected choice per respondent).
rows_factor = Factor(data.data.iloc[:, :6], data.data.columns[:6], "expected_choice", orientation="wide")
# Column factor: columns 6-10, the "believe true" statements (multiple selections allowed).
columns_factor = Factor(data.data.iloc[:, 6:11], data.data.columns[6:11], "believe_true", orientation="wide")
# MRCVTable takes a list of row factors and a list of column factors.
multiple_response_table = MRCVTable([rows_factor,], [columns_factor])
multiple_response_table

variable_col          Hillary_Clinton_is_involved_in_many_coverups  \
variable_row                                                         
Donald_Trump                                                    51   
Gary_Johnson                                                    23   
Hillary_Clinton                                                 22   
I_Probably_Wont_Vote                                            36   
Jill_Stein                                                       4   
None_Of_The_Above                                               34   

variable_col          Hillary_Clinton_lied_to_the_families_of_Americans_killed_in_Benghazi  \
variable_row                                                                                 
Donald_Trump                                                         45                      
Gary_Johnson                                                         18                      
Hillary_Clinton                                                

### MMI Item Response Table

In [19]:
# Uses the table's private helper to build the item response table: one
# (0.0, 1.0) column pair per multiple-response item, one row per single-response level.
srcv_item_response_table_python = multiple_response_table._build_item_response_table_for_MMI(rows_factor, columns_factor)
srcv_item_response_table_python

multiple_response_level,Hillary_Clinton_is_involved_in_many_coverups,Hillary_Clinton_is_involved_in_many_coverups,Hillary_Clinton_lied_to_the_families_of_Americans_killed_in_Benghazi,Hillary_Clinton_lied_to_the_families_of_Americans_killed_in_Benghazi,Trump_changes_his_positions_all_of_the_time,Trump_changes_his_positions_all_of_the_time,Trump_is_a_successful_businessman,Trump_is_a_successful_businessman,Trumps_temper_could_get_the_country_into_trouble,Trumps_temper_could_get_the_country_into_trouble
selected?,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
single_response_level,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
Donald_Trump,131,51,137,45,172,10,143,39,168,14
Gary_Johnson,86,23,91,18,92,17,93,16,96,13
Hillary_Clinton,205,22,209,18,176,51,206,21,185,42
I_Probably_Wont_Vote,180,36,197,19,192,24,194,22,197,19
Jill_Stein,35,4,36,3,27,12,34,5,32,7
None_Of_The_Above,193,34,200,27,201,26,206,21,203,24


In [20]:
# Sanity check: grand total of the first item's 0/1 column pair. The recorded
# value (1000) presumably equals the number of respondents — verify.
srcv_item_response_table_python.iloc[:, (0,1)].sum().sum()

1000

In [77]:
result_dir = "/Users/gjlondon/programming/open_source/statsmodels/statsmodels/stats/tests/results/"
fpath = os.path.join(result_dir, "srcv_r_item_response_table_result.csv")
%R -i fpath

In [99]:
%%R -i srcv_presidential

library('MRCV')

# Reference item response table from MRCV: I=1 row-variable column, J=5 column-variable items.
srcv_item_response_table_r <- item.response.table(srcv_presidential, I=1, J=5)
# NOTE(review): this data-frame copy is not used by the write.table call below.
srcv_item_response_table_r_dataframe = as.data.frame.matrix(srcv_item_response_table_r)
# Write the table to the path pushed in from Python (fpath).
write.table(srcv_item_response_table_r, file = fpath, sep=",")
## R is apparently bad at writing out tables with nested headers so we just get "term", "term", "term"
colnames(srcv_item_response_table_r)

 [1] "term" "term" "term" "term" "term" "term" "term" "term" "term" "term"


In [105]:
# result_dir comes from the configuration cell at the top of the notebook; the
# machine-specific absolute path is deliberately not repeated here.
fpath = os.path.join(result_dir, "srcv_r_item_response_table_result.csv")
# DataFrame.from_csv was deprecated and then removed from pandas. read_csv with
# index_col=0 reads R's row names into the index the same way (the file has no
# date index, so from_csv's implicit date parsing is not needed).
srcv_item_response_table_r = pd.read_csv(fpath, index_col=0)
srcv_item_response_table_r

Unnamed: 0,term,term.1,term.2,term.3,term.4,term.5,term.6,term.7,term.8,term.9
1,131,51,172,10,137,45,143,39,168,14
2,86,23,92,17,91,18,93,16,96,13
3,205,22,176,51,209,18,206,21,185,42
4,180,36,192,24,197,19,194,22,197,19
5,35,4,27,12,36,3,34,5,32,7
6,193,34,201,26,200,27,206,21,203,24


In [128]:
# The R export lost its nested header, so its columns are addressed by position
# (two per item: not selected / selected), while the Python table is addressed
# by item label. Each item's column pair must match.
for position, label in enumerate(columns_factor.labels):
    first_r_column = 2 * position
    py_group = srcv_item_response_table_python.loc[:, label]
    r_group = srcv_item_response_table_r.iloc[:, first_r_column:first_r_column + 2]
    assert_allclose(py_group.values, r_group)

## MMI Full Chi-square tables

In [122]:
# Private helper; per the recorded output it returns one chi-squared statistic
# per multiple-response item as a float Series indexed by item name.
python_result = multiple_response_table._calculate_pairwise_chi2s_for_MMI_item_response_table(rows_factor, columns_factor)
python_result

Hillary_Clinton_is_involved_in_many_coverups                            27.493592
Trump_changes_his_positions_all_of_the_time                             36.511000
Hillary_Clinton_lied_to_the_families_of_Americans_killed_in_Benghazi    33.065276
Trump_is_a_successful_businessman                                       19.305086
Trumps_temper_could_get_the_country_into_trouble                        16.238789
dtype: float64

In [87]:
%%R -i srcv_presidential

library('MRCV')
# Per the recorded output, the result holds the overall statistic ($X.sq.S),
# the per-item statistics ($X.sq.S.ij) and $valid.margins.
r_srcv_chis <- MI.stat(srcv_presidential, I=1, J=5)

$X.sq.S
[1] 132.6137

$X.sq.S.ij
     Hillary_Clinton_is_involved_in_many_coverups
[1,]                                     27.49359
     Trump_changes_his_positions_all_of_the_time
[1,]                                      36.511
     Hillary_Clinton_lied_to_the_families_of_Americans_killed_in_Benghazi
[1,]                                                             33.06528
     Trump_is_a_successful_businessman
[1,]                          19.30509
     Trumps_temper_could_get_the_country_into_trouble
[1,]                                         16.23879

$valid.margins
[1] 5



In [82]:
# Pull the MI.stat result out of the R session into the Python namespace.
%R -o r_srcv_chis

In [134]:

# Element 1 of the R list is the per-item statistics matrix ($X.sq.S.ij in the
# R output). Pair its column names with its values, then align to Python's
# item order before comparing.
r_pairwise_statistics = r_srcv_chis[1]
r_results = pd.Series(dict(zip(r_pairwise_statistics.names[1], r_pairwise_statistics)))
r_result_reordered = r_results.reindex(python_result.index)
assert_allclose(python_result, r_result_reordered)

In [137]:
# result_dir comes from the configuration cell at the top of the notebook; the
# machine-specific absolute path is deliberately not repeated here.
fpath = os.path.join(result_dir, "srcv_r_all_chis_result.csv")
# Persist the R statistics (in Python's item order) as a test fixture.
r_result_reordered.to_csv(fpath)

### MMI bonferroni test

In [134]:
# Bind the private Bonferroni test method to a short name, then unpack its two
# results: the table-level p value and the per-item p values.
bonferroni_test = multiple_response_table._test_for_marginal_mutual_independence_using_bonferroni_correction
results = bonferroni_test(rows_factor, columns_factor)
table_p_value_bonferroni_corrected, pairwise_bonferroni_corrected_p_values = results
print("Overall table p value: {}\n\n".format(table_p_value_bonferroni_corrected))
print("Pairwise p values (likelihood of independence between single select variable and specific multi-select option):")
pairwise_bonferroni_corrected_p_values

Overall table p value: 3.752624460014494e-06


Pairwise p values (likelihood of independence between single select variable and specific multi-select option):


Hillary_Clinton_is_involved_in_many_coverups                            0.000229
Trump_changes_his_positions_all_of_the_time                             0.000004
Hillary_Clinton_lied_to_the_families_of_Americans_killed_in_Benghazi    0.000018
Trump_is_a_successful_businessman                                       0.008431
Trumps_temper_could_get_the_country_into_trouble                        0.030972
dtype: float64

In [146]:
result_dir = "/Users/gjlondon/programming/open_source/statsmodels/statsmodels/stats/tests/results/"
fpath = os.path.join(result_dir, "srcv_r_bonferroni.csv")
%R -i fpath

In [169]:
%%R 

# NOTE(review): srcv_presidential is not passed with -i here; this cell relies
# on the copy already present in the R session from an earlier %%R cell.
res <- MI.test(srcv_presidential, I=1, J=5, type="bon")
res$bon$X.sq.S.ij.p.bon
# Save the whole $bon result to fpath for the Python comparison.
write.csv(res$bon, file=fpath)

In [166]:
# DataFrame.from_csv was deprecated and then removed from pandas. read_csv with
# index_col=0 reads R's row names into the index the same way.
r_result = pd.read_csv(fpath, index_col=0)
# First column: table-level p value. Remaining columns: one p value per item.
table_p_value_r = r_result["p.value.bon"]
cell_p_values_r = r_result.iloc[:, 1:]
# Column vector sized from the data rather than a hard-coded 5, to line up
# with the transposed single-row R frame.
reshaped_python_values = pairwise_bonferroni_corrected_p_values.values.reshape(-1, 1)
assert_allclose(reshaped_python_values, cell_p_values_r.T)
assert_allclose(table_p_value_r, table_p_value_bonferroni_corrected)

### MMI Rao Scott 2 Test

In [311]:
# Private Rao-Scott (second-order) test; returns the table-level p value.
# NOTE(review): the two bare numbers in the recorded output are presumably
# printed by the method itself (they match R's X.sq.S.rs2 and df.rs2) — confirm.
rao_scott_test = multiple_response_table._test_for_marginal_mutual_independence_using_rao_scott_2
table_p_value_rao_scott_corrected = rao_scott_test(rows_factor, columns_factor)
print("Overall table p value: {}\n\n".format(table_p_value_rao_scott_corrected))

109.95224031
20.7279121181
Overall table p value: 3.7969627442180354e-14




In [192]:
result_dir = "/Users/gjlondon/programming/open_source/statsmodels/statsmodels/stats/tests/results/"
fpath = os.path.join(result_dir, "srcv_r_rao_scott.csv")
%R -i fpath

In [304]:
%%R 

# Rao-Scott second-order adjusted test from MRCV on the data already in the R session.
res <- MI.test(srcv_presidential, I=1, J=5, type="rs2")
res$rs2
# Save statistic, degrees of freedom and p value for the Python comparison.
write.csv(res$rs2, file=fpath)
# NOTE(review): res$rs2 is echoed a second time here; one of the two is redundant.
res$rs2

$X.sq.S.rs2
[1] 109.9522

$df.rs2
[1] 20.72791

$p.value.rs2
[1] 3.796963e-14



In [312]:
# DataFrame.from_csv was deprecated and then removed from pandas. read_csv with
# index_col=0 reads R's row names into the index the same way.
r_result = pd.read_csv(fpath, index_col=0)
table_p_value_r = r_result["p.value.rs2"]
assert_allclose(table_p_value_r, table_p_value_rao_scott_corrected)

## Multiple response versus multiple response

### Item response table

In [31]:
# Re-binds both factors for the multiple-vs-multiple case: columns 11 onward
# become the column factor and columns 6-10 (previously the column factor)
# become the row factor. `multiple_response_table` is rebuilt accordingly.
columns_factor = Factor(data.data.iloc[:, 11:], presidential_data.columns[11:],
                              "reasons_undecided", orientation="wide")
rows_factor = Factor(data.data.iloc[:, 6:11], presidential_data.columns[6:11],
                             "believe_true", orientation="wide")
multiple_response_table = MRCVTable([rows_factor, ], [columns_factor])

In [24]:
# Quick shape check of the row factor's underlying data.
rows_factor.data.shape

(1000, 5)

In [30]:
# Columns 6 onward hold both multiple-response blocks side by side; this is
# the frame handed to R's MRCV functions with I=5, J=5.
mrcv_presidential = presidential_data.iloc[:, 6:]
mrcv_presidential.head()

Unnamed: 0,Hillary_Clinton_is_involved_in_many_coverups,Trump_changes_his_positions_all_of_the_time,Hillary_Clinton_lied_to_the_families_of_Americans_killed_in_Benghazi,Trump_is_a_successful_businessman,Trumps_temper_could_get_the_country_into_trouble,I_wish_another_candidate_had_won_the_primary,Need_to_do_more_research,Dont_like__any_candidate,Not_sure_which_candidate_shares_my_values,Waiting_for_debates
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [25]:
result_dir = "/Users/gjlondon/programming/open_source/statsmodels/statsmodels/stats/tests/results/"
fpath = os.path.join(result_dir, "spmi_r_item_response_table_result.csv")
%R -i fpath

In [26]:
%%R -i mrcv_presidential

library('MRCV')

# Reference item response table for two multiple-response variables of five items each.
spmi_item_response_table_r <- item.response.table(mrcv_presidential, I=5, J=5)
# Data-frame copy, used only for the display at the end of this cell.
spmi_item_response_table_r_dataframe = as.data.frame.matrix(spmi_item_response_table_r)
write.table(spmi_item_response_table_r, file = fpath, sep=",")
## R is apparently bad at writing out tables with nested headers so we just get "term", "term", "term"
colnames(spmi_item_response_table_r)
spmi_item_response_table_r_dataframe

   term term term term term term term term term term
1   683  147  661  169  720  110  719  111  719  111
2   121   49  155   15  126   44  148   22  158   12
3   703  157  702  158  721  139  750  110  747  113
4   101   39  114   26  125   15  117   23  130   10
5   685  185  713  157  748  122  752  118  777   93
6   119   11  103   27   98   32  115   15  100   30
7   715  161  709  167  753  123  759  117  761  115
8    89   35  107   17   93   31  108   16  116    8
9   692  189  715  166  763  118  774  107  771  110
10  112    7  101   18   83   36   93   26  106   13


In [27]:
# DataFrame.from_csv was deprecated and then removed from pandas. read_csv with
# index_col=0 reads R's row names into the index the same way.
spmi_item_response_table_r = pd.read_csv(fpath, index_col=0)
spmi_item_response_table_r

Unnamed: 0,term,term.1,term.2,term.3,term.4,term.5,term.6,term.7,term.8,term.9
1,683,147,661,169,720,110,719,111,719,111
2,121,49,155,15,126,44,148,22,158,12
3,703,157,702,158,721,139,750,110,747,113
4,101,39,114,26,125,15,117,23,130,10
5,685,185,713,157,748,122,752,118,777,93
6,119,11,103,27,98,32,115,15,100,30
7,715,161,709,167,753,123,759,117,761,115
8,89,35,107,17,93,31,108,16,116,8
9,692,189,715,166,763,118,774,107,771,110
10,112,7,101,18,83,36,93,26,106,13


In [32]:
# Private helper: builds the item response table with a (0.0, 1.0) pair of
# rows per row item and a (0.0, 1.0) pair of columns per column item.
spmi_item_response_table_python = multiple_response_table._build_item_response_table_for_SPMI(rows_factor, columns_factor)
spmi_item_response_table_python

Unnamed: 0_level_0,column_levels,I_wish_another_candidate_had_won_the_primary,I_wish_another_candidate_had_won_the_primary,Need_to_do_more_research,Need_to_do_more_research,Dont_like__any_candidate,Dont_like__any_candidate,Not_sure_which_candidate_shares_my_values,Not_sure_which_candidate_shares_my_values,Waiting_for_debates,Waiting_for_debates
Unnamed: 0_level_1,selected?,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
row_levels,selected?,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
Hillary_Clinton_is_involved_in_many_coverups,0.0,683,147,661,169,720,110,719,111,719,111
Hillary_Clinton_is_involved_in_many_coverups,1.0,121,49,155,15,126,44,148,22,158,12
Trump_changes_his_positions_all_of_the_time,0.0,703,157,702,158,721,139,750,110,747,113
Trump_changes_his_positions_all_of_the_time,1.0,101,39,114,26,125,15,117,23,130,10
Hillary_Clinton_lied_to_the_families_of_Americans_killed_in_Benghazi,0.0,685,185,713,157,748,122,752,118,777,93
Hillary_Clinton_lied_to_the_families_of_Americans_killed_in_Benghazi,1.0,119,11,103,27,98,32,115,15,100,30
Trump_is_a_successful_businessman,0.0,715,161,709,167,753,123,759,117,761,115
Trump_is_a_successful_businessman,1.0,89,35,107,17,93,31,108,16,116,8
Trumps_temper_could_get_the_country_into_trouble,0.0,692,189,715,166,763,118,774,107,771,110
Trumps_temper_could_get_the_country_into_trouble,1.0,112,7,101,18,83,36,93,26,106,13


In [33]:
# Compare raw values only: the R export lost its labels, so this check relies
# on both tables listing items in the same order.
assert_allclose(spmi_item_response_table_r.values, spmi_item_response_table_python.values)

### Pairwise Chi-squared

In [45]:
# Private helper: one chi-squared statistic per (row item, column item) pair,
# returned as a 5 x 5 frame according to the recorded output.
spmi_pairwise_chis_python = multiple_response_table._calculate_pairwise_chi2s_for_SPMI_item_response_table(rows_factor, columns_factor)
spmi_pairwise_chis_python

Unnamed: 0,I_wish_another_candidate_had_won_the_primary,Need_to_do_more_research,Dont_like__any_candidate,Not_sure_which_candidate_shares_my_values,Waiting_for_debates
Hillary_Clinton_is_involved_in_many_coverups,11.0574,12.5105,17.2742,0.0228698,5.21583
Trump_changes_his_positions_all_of_the_time,7.04331,0.00318631,2.74341,1.38182,4.01368
Hillary_Clinton_lied_to_the_families_of_Americans_killed_in_Benghazi,11.7642,0.558638,9.74002,0.402103,16.0883
Trump_is_a_successful_businessman,6.68351,2.07403,10.0131,0.0193256,4.48833
Trumps_temper_could_get_the_country_into_trouble,16.1294,0.964289,22.8695,8.56061,0.236957


Let's compare these pairwise chi-squared statistics against the ones computed by R's MRCV package (`MI.stat`):

In [42]:
# Output file for R's pairwise statistics; pushed into the R session below.
fpath = os.path.join(result_dir, "spmi_r_pairwise_chis_result.csv")
%R -i fpath

In [43]:
%%R -i mrcv_presidential

library('MRCV')

# Pairwise statistics for every (row item, column item) combination.
spmi_pairwise_chis_r <- MI.stat(mrcv_presidential, I=5, J=5)

spmi_pairwise_chis_r$X.sq.S.ij

# Convert the statistics matrix to a data frame and write it to fpath.
spmi_pairwise_chis_r_dataframe <- as.data.frame.matrix(spmi_pairwise_chis_r$X.sq.S.ij)
write.table(spmi_pairwise_chis_r_dataframe, file = fpath, sep=",")
## Unlike the item response tables, this data frame keeps the real item names as its column names.
colnames(spmi_pairwise_chis_r_dataframe)
spmi_pairwise_chis_r_dataframe

                                                                     I_wish_another_candidate_had_won_the_primary
Hillary_Clinton_is_involved_in_many_coverups                                                            11.057399
Trump_changes_his_positions_all_of_the_time                                                              7.043314
Hillary_Clinton_lied_to_the_families_of_Americans_killed_in_Benghazi                                    11.764200
Trump_is_a_successful_businessman                                                                        6.683514
Trumps_temper_could_get_the_country_into_trouble                                                        16.129399
                                                                     Need_to_do_more_research
Hillary_Clinton_is_involved_in_many_coverups                                      12.51047518
Trump_changes_his_positions_all_of_the_time                                        0.00318631
Hillary_Clinton_lied_to_the_famili

In [48]:
# DataFrame.from_csv was deprecated and then removed from pandas. read_csv with
# index_col=0 reads R's row names into the index the same way.
spmi_pairwise_chis_r = pd.read_csv(fpath, index_col=0)
spmi_pairwise_chis_r

# Cast both sides to float so the comparison does not depend on the frames' dtypes.
assert_allclose(spmi_pairwise_chis_r.values.astype(float), spmi_pairwise_chis_python.values.astype(float))

Unnamed: 0,I_wish_another_candidate_had_won_the_primary,Need_to_do_more_research,Dont_like__any_candidate,Not_sure_which_candidate_shares_my_values,Waiting_for_debates
Hillary_Clinton_is_involved_in_many_coverups,11.057399,12.510475,17.274174,0.02287,5.215833
Trump_changes_his_positions_all_of_the_time,7.043314,0.003186,2.743406,1.381818,4.013684
Hillary_Clinton_lied_to_the_families_of_Americans_killed_in_Benghazi,11.7642,0.558638,9.740022,0.402103,16.088256
Trump_is_a_successful_businessman,6.683514,2.07403,10.013109,0.019326,4.488333
Trumps_temper_could_get_the_country_into_trouble,16.129399,0.964289,22.869451,8.560605,0.236957


### SPMI bonferroni

In [64]:
# Re-binds `bonferroni_test` to the private SPMI variant; it returns the
# table-level p value and a frame of cell-wise p values.
bonferroni_test = multiple_response_table._test_for_single_pairwise_mutual_independence_using_bonferroni
table_p_value_bonferroni_corrected, cellwise_p_bonferroni_python = bonferroni_test(rows_factor, columns_factor)
print("Overall table p value: {}\n\n".format(table_p_value_bonferroni_corrected))
print("Cellwise p value: {}\n\n".format(cellwise_p_bonferroni_python))

Overall table p value: 4.3346430242129665e-05


Cellwise p value:                                                     I_wish_another_candidate_had_won_the_primary  \
Hillary_Clinton_is_involved_in_many_coverups                                            0.022083   
Trump_changes_his_positions_all_of_the_time                                             0.198904   
Hillary_Clinton_lied_to_the_families_of_America...                                      0.015095   
Trump_is_a_successful_businessman                                                       0.243271   
Trumps_temper_could_get_the_country_into_trouble                                        0.001479   

                                                    Need_to_do_more_research  \
Hillary_Clinton_is_involved_in_many_coverups                        0.010117   
Trump_changes_his_positions_all_of_the_time                         1.000000   
Hillary_Clinton_lied_to_the_families_of_America...                  1.000000   
Trump_is_a_su

In [59]:
# Output file for R's Bonferroni result; pushed into the R session below.
fpath = os.path.join(result_dir, "spmi_r_bonferroni.csv")
%R -i fpath

In [60]:
%%R -i mrcv_presidential

library('MRCV')

# Bonferroni-adjusted test for the two multiple-response variables.
res <- MI.test(mrcv_presidential, I=5, J=5, type="bon")
res$bon$X.sq.S.ij.p.bon
# Save the whole $bon result (table p value plus cell-wise p values) to fpath.
write.csv(res$bon, file=fpath)

In [61]:
# DataFrame.from_csv was deprecated and then removed from pandas. read_csv with
# index_col=0 reads R's row names into the index the same way.
spmi_bonferroni_r = pd.read_csv(fpath, index_col=0)
spmi_bonferroni_r

# First column: table-level p value (repeated on every row). Remaining five
# columns: the cell-wise Bonferroni p values.
table_p_value_r = spmi_bonferroni_r["p.value.bon"]
cell_p_values_r = spmi_bonferroni_r.iloc[:, 1:]

# NOTE(review): the traceback recorded under this cell compares a (5, 6) array
# with a (5, 5) array of chi-squared values, which does not match these
# operands. It appears to come from an earlier version of the cell; re-run to refresh.
assert_allclose(cellwise_p_bonferroni_python, cell_p_values_r)
assert_allclose(table_p_value_r, table_p_value_bonferroni_corrected)

Unnamed: 0,p.value.bon,X.sq.S.ij.p.bon.I_wish_another_candidate_had_won_the_primary,X.sq.S.ij.p.bon.Need_to_do_more_research,X.sq.S.ij.p.bon.Dont_like__any_candidate,X.sq.S.ij.p.bon.Not_sure_which_candidate_shares_my_values,X.sq.S.ij.p.bon.Waiting_for_debates
Hillary_Clinton_is_involved_in_many_coverups,4.3e-05,0.022083,0.010117,0.000809,1.0,0.559553
Trump_changes_his_positions_all_of_the_time,4.3e-05,0.198904,1.0,1.0,1.0,1.0
Hillary_Clinton_lied_to_the_families_of_Americans_killed_in_Benghazi,4.3e-05,0.015095,1.0,0.045075,1.0,0.001511
Trump_is_a_successful_businessman,4.3e-05,0.243271,1.0,0.038857,1.0,0.853173
Trumps_temper_could_get_the_country_into_trouble,4.3e-05,0.001479,1.0,4.3e-05,0.085879,1.0


AssertionError: 
Not equal to tolerance rtol=1e-07, atol=0

(shapes (5, 6), (5, 5) mismatch)
 x: array([[  4.334643e-05,   2.208349e-02,   1.011692e-02,   8.088865e-04,
          1.000000e+00,   5.595530e-01],
       [  4.334643e-05,   1.989042e-01,   1.000000e+00,   1.000000e+00,...
 y: array([[  1.105740e+01,   1.251048e+01,   1.727417e+01,   2.286978e-02,
          5.215833e+00],
       [  7.043314e+00,   3.186310e-03,   2.743406e+00,   1.381818e+00,...

### SPMI Rao Scott 2

In [77]:
# Private SPMI Rao-Scott (second-order) test; returns the table-level p value.
rao_scott_2_test = multiple_response_table._test_for_single_pairwise_mutual_independence_using_rao_scott_2
table_p_value = rao_scott_2_test(rows_factor, columns_factor)
# Output file for R's Rao-Scott result; pushed into the R session below.
fpath = os.path.join(result_dir, "spmi_r_rao_scott.csv")


%R -i fpath

124.68450287
18.1376800255


In [73]:
%%R -i mrcv_presidential

# NOTE(review): no library('MRCV') call here; this relies on the package
# already being attached in the R session by an earlier cell.
res <- MI.test(mrcv_presidential, I=5, J=5, type="rs2")
res$rs2
write.csv(res$rs2, file=fpath)
# NOTE(review): res$rs2 is echoed a second time here; one of the two is redundant.
res$rs2

$X.sq.S.rs2
[1] 124.6845

$df.rs2
[1] 18.13768

$p.value.rs2
[1] 0



In [78]:
# Display the Python Rao-Scott p value for comparison with R's p.value.rs2.
table_p_value

0.0

In [79]:
# DataFrame.from_csv was deprecated and then removed from pandas. read_csv with
# index_col=0 reads R's row names into the index the same way.
r_result = pd.read_csv(fpath, index_col=0)
table_p_value_r = r_result["p.value.rs2"]
assert_allclose(table_p_value_r, table_p_value)

Both the Bonferroni and the Rao-Scott methods require us to make some assumptions about the distribution of the deviations we observe in our response table. If we are not comfortable making those assumptions, we can instead use a nonparametric method: running a bootstrap simulation to approximate the distribution of deviations. The bootstrap method is more computationally intensive but allows us to estimate the p value using fewer assumptions.

In [None]:
# Private SPMI bootstrap test; with verbose=True it prints a progress line
# every 50 samples (see the recorded output), so expect this cell to be slow.
bootstrap_test = multiple_response_table._test_for_single_pairwise_mutual_independence_using_bootstrap
results = bootstrap_test(rows_factor, columns_factor, verbose=True)
table_p_value_bootstrap, pairwise_bootstrap_p_values = results
print("Overall table p value: {}\n\n".format(table_p_value_bootstrap))
# NOTE(review): the message below still says "single select variable", but both
# factors in this section are multiple-response — wording looks copied from the MMI section.
print("Pairwise p values (likelihood of independence between single select variable and specific multi-select option):")
pairwise_bootstrap_p_values

sample 0


  return self._int64index.union(other)


sample 50
sample 100
sample 150
sample 200
sample 250
sample 300
sample 350
sample 400
sample 450
sample 500
