<h1><span style="color:red">Generate Contingency Tables</span></h1>

### This notebook reads numeric and categorical variables from the survey dataset, lets users compute a contingency table from variables of interest and test for independence.

## 1. Retrieve survey parameters from the URL

In [None]:
%%javascript
/**
 * Return the value of the query-string parameter `key` from the page URL.
 *
 * The key is matched case-insensitively against window.location.search.
 * Returns "" when the parameter is absent or carries no value; otherwise
 * the value is passed through unescape() before being returned.
 */
function getQueryStringValue (key)
{
    // '.', '+' and '*' pass through escape() unchanged but are regex
    // metacharacters, so they are backslash-quoted before being embedded.
    var escapedKey = escape(key).replace(/[\.\+\*]/g, "\\$&");
    // The key must be followed by "=value", by "&", or by the end of the
    // string. Without that anchor a key such as "view" also matches the
    // prefix of a later "views=..." parameter and yields "".
    var pattern = new RegExp("^(?:.*[&\\?]" + escapedKey + "(?:\\=([^&]*)|(?=&|$)))?.*$", "i");
    return unescape(window.location.search.replace(pattern, "$1"));
}
// Copy each URL parameter the notebook uses into a Python string variable
// in the kernel.
// The values come straight from the URL, so they are untrusted.
// JSON.stringify produces a double-quoted, backslash-escaped literal that is
// also a valid Python string literal, so a quote or backslash in the URL can
// neither break the assignment nor inject extra Python code.
function setKernelString(name, value)
{
    IPython.notebook.kernel.execute(name + "=" + JSON.stringify(String(value)));
}
setKernelString("survey_url", getQueryStringValue("surveyurl"));
setKernelString("views", getQueryStringValue("views"));
setKernelString("view", getQueryStringValue("view"));
setKernelString("user", getQueryStringValue("user"));
setKernelString("csv_file", getQueryStringValue("csv"));
setKernelString("dzc_file", getQueryStringValue("dzc"));
setKernelString("params", getQueryStringValue("params"));
setKernelString("active_object", getQueryStringValue("activeobject"));
// String(window.location) is the full page URL, as in the original concatenation.
setKernelString("full_notebook_url", window.location);

In [None]:
# common imports
from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
from IPython.display import Markdown, display
from ipyfilechooser import FileChooser

import pandas as pd    
import numpy as np
import panel as pn
import statsmodels.api as sm
import statsmodels.formula.api as smf

pn.extension()
def printmd(string):
    """Render ``string`` as Markdown in the cell output."""
    rendered = Markdown(string)
    display(rendered)

# Path prefix that is concatenated with csv_file when the survey data is read.
# NOTE(review): despite the name, this is a relative path.
absolutePath = "../../temp_csvs/"

# local imports
import sys
sys.path.insert(1, '../../helpers')
import panel_libs as panellibs
import suave_integration as suaveint

## 2. Read the survey file

In [None]:
# load the survey data from the csv file named in the URL parameters
df = panellibs.extract_data(absolutePath + csv_file)

# keep the column names in a one-column frame and list them for the user
variables_df = pd.DataFrame(data={'varname': df.columns})
printmd("<b><span style='color:red'>All variables in the survey file:</span></b>")
print(variables_df['varname'].values)

## 3. Select variables from survey

In [None]:
# sliders for how many row / column variables the table will use (1 or 2 each)
n_row = pn.widgets.IntSlider(
    name='Select Number of Row Variables',
    start=1,
    end=2,
    value=1,
)
n_col = pn.widgets.IntSlider(
    name='Select Number of Column Variables',
    start=1,
    end=2,
    value=1,
)

pn.Column(n_row, n_col)

In [None]:
# pick the row and column variables; each chooser is capped at the count
# currently set on the matching slider
row_vars = pn.widgets.MultiChoice(
    name='Select Row Variables for Contingency Table',
    value=[],
    options=df.columns.tolist(),
    max_items=n_row.value,
)
col_vars = pn.widgets.MultiChoice(
    name='Select Column Variables for Contingency Table',
    value=[],
    options=df.columns.tolist(),
    max_items=n_col.value,
)
pn.Row(row_vars, col_vars, height=250)

## 4. Generate contingency table

In [None]:
def generate_table(df, row_variables, column_variables):
    """
    Build a contingency table from ``df`` for the given variables.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing every column named in the two lists.
    row_variables : list
        Column names cross-tabulated along the rows.
    column_variables : list
        Column names cross-tabulated along the columns.

    Returns
    -------
    tuple
        ``(table.table_orig, table)`` where ``table`` is the
        ``sm.stats.Table`` built from the cross-tabulation.

    Raises
    ------
    ValueError
        If ``row_variables`` or ``column_variables`` is empty.
    """
    # fail early with a readable message instead of an opaque pandas error
    if len(row_variables) == 0 or len(column_variables) == 0:
        raise ValueError("Select at least one row variable and one column "
                         "variable before generating the contingency table.")

    tab = pd.crosstab([df[name] for name in row_variables],
                      [df[name] for name in column_variables])
    table = sm.stats.Table(tab)
    return table.table_orig, table

In [None]:
# bin all numerical variables in original df
def _to_numeric_or_keep(column):
    """Return ``column`` converted to numeric, or unchanged if it cannot be parsed."""
    # explicit replacement for the deprecated pd.to_numeric(errors='ignore')
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column

df_binned = df.apply(_to_numeric_or_keep)
numeric_cols = df_binned.select_dtypes(include=np.number).columns.tolist()
# cut every numeric column into 5 equal-width bins (skipped when none exist)
if numeric_cols:
    df_binned[numeric_cols] = df_binned[numeric_cols].apply(pd.cut, bins=5)
# one count per survey row; summed later to obtain cell frequencies
df_binned['freq'] = 1

In [None]:
# build the contingency table from the current widget selections;
# tab is the (table_orig, Table) pair returned by generate_table
selected_row_vars, selected_col_vars = row_vars.value, col_vars.value
tab = generate_table(df_binned, selected_row_vars, selected_col_vars)
tab[0]

In [None]:
# label the table by the total number of selected variables:
# 2 -> two-way, 3 -> three-way, anything else -> multi-way
total_vars = len(row_vars.value) + len(col_vars.value)
table_shape = {2: 'two-way', 3: 'three-way'}.get(total_vars, 'multi-way')
table_shape

## 5. Test for mutual independence

For two-way tables, conduct a chi-square test of independence, under the null hypothesis that the two variables of interest are independent. For three-way and multi-way tables, fit a log-linear model to test for mutual independence between all variables of interest.  

In [None]:
def chi_square(table, alpha=.05):
    """
    Performs a chi-square test of independence in a
    two-way contingency table at the given significance level.

    Parameters
    ----------
    table : object
        Table exposing ``test_nominal_association()`` and ``fittedvalues``
        (the ``sm.stats.Table`` returned by ``generate_table``).
    alpha : float, optional
        Significance level the p-value is compared against.

    Returns
    -------
    object
        The result of ``table.test_nominal_association()``, so callers can
        store and inspect it. The verdict itself is printed.
    """
    chi2 = table.test_nominal_association()

    # display a warning if expected counts are < 5
    exp_freq = table.fittedvalues.to_numpy()
    if (exp_freq < 5).any():
        print("Warning: table contains expected frequencies less than 5.")

    # display results of chi-square test
    p_val = chi2.pvalue
    if p_val < alpha:
        print("P-value: {}. Table variables are associated at significance level: {}".format(p_val.round(4), alpha))
    else:
        print("P-value: {}. Table variables are independent at significance level: {}".format(p_val.round(4), alpha))

    return chi2
        
        
def llm_mutual(flat_table):
    """
    Fit the mutual-independence log-linear model to a flattened
    contingency table using Poisson regression.

    The 'freq' column of ``flat_table`` is the response; the predictors are
    the variables currently selected in the notebook-level ``row_vars`` and
    ``col_vars`` widgets, entered as main effects only. The model summary is
    printed and the fitted results object is returned.
    """
    def tidy(label):
        # drop the '#...' suave tag and replace spaces so the name can be
        # used inside a statsmodels formula
        return label.split('#')[0].replace(' ', '_')

    model_data = flat_table.copy()
    model_data.columns = [tidy(label) for label in flat_table.columns]

    # one additive term per selected variable
    terms = [tidy(label) for label in row_vars.value + col_vars.value]
    formula = '{} ~ {}'.format('freq', ' + '.join(terms))

    # fit the Poisson GLM and report it
    fitted = smf.glm(formula, data=model_data, family=sm.families.Poisson()).fit()
    print(fitted.summary())
    return fitted

In [None]:
# perform chi-square test of independence for two-way tables
if table_shape == 'two-way':
    result = chi_square(tab[1])
# perform log-linear analysis for three-way and multi-way tables
else:
    # Select 'freq' before summing: summing every column first is wasteful
    # and can fail on non-numeric columns in recent pandas.
    # observed=False keeps empty cells (count 0) for categorical groupers,
    # which is the behaviour the original default relied on.
    flat = (df_binned.groupby(col_vars.value + row_vars.value, observed=False)['freq']
            .sum().to_frame().reset_index())
    result = llm_mutual(flat)
result

<h2><span style="color:red">5a. Optional: Test for joint independence</span></h2>

<span style="color:red">The following log-linear models are for testing THREE-WAY tables only!</span> 

For three-way tables, test variables for joint independence. The log-linear model tests for joint independence under the assumption that a single variable of interest is independent of the other two. The model implies that the other two variables can have an arbitrary association, and that this association doesn't depend on the level of the variable of interest.

In [None]:
# select a variable to test for joint independence
# (label spelling corrected: it is shown to the user in the widget)
selector = pn.widgets.Select(name='Test joint independence of: ', options=col_vars.value + row_vars.value)
selector

In [None]:
def llm_joint(flat_table):
    """
    Fit the joint-independence log-linear model to a flattened
    contingency table using Poisson regression.

    The variable chosen in the notebook-level ``selector`` widget enters as
    a main effect only, while the two remaining variables from ``row_vars``
    and ``col_vars`` enter with their interaction. The formula is printed
    and the fitted results object is returned.
    """
    def tidy(label):
        # drop the '#...' suave tag and replace spaces so the name can be
        # used inside a statsmodels formula
        return label.split('#')[0].replace(' ', '_')

    model_data = flat_table.copy()
    model_data.columns = [tidy(label) for label in flat_table.columns]

    # split the selected variables into the tested one and the other two
    remaining = row_vars.value + col_vars.value
    remaining.remove(selector.value)
    tested = tidy(selector.value)
    others = [tidy(label) for label in remaining]

    formula = 'freq ~ {0} + {1} + {2} + {1} * {2}'.format(tested, others[0], others[1])
    print('Model: ' + formula)

    # fit the Poisson GLM
    fitted = smf.glm(formula, data=model_data, family=sm.families.Poisson()).fit()
    return fitted

In [None]:
# fit the joint-independence model; it is defined for three-way tables only
if table_shape == 'three-way':
    # Select 'freq' before summing: summing every column first is wasteful
    # and can fail on non-numeric columns in recent pandas.
    # observed=False keeps empty cells (count 0) for categorical groupers.
    flat = (df_binned.groupby(col_vars.value + row_vars.value, observed=False)['freq']
            .sum().to_frame().reset_index())
    result = llm_joint(flat)
    display(result.summary())
else:
    # do not summarize a stale `result` left over from an earlier cell
    print("Skipped: the joint independence model applies to three-way tables only.")

<h2><span style="color:red">5b. Optional: Test for conditional independence</span></h2>

For three-way tables, test variables for conditional independence. The log-linear model tests for conditional independence under the assumption that two variables are independent, given the third.

In [None]:
# select a variable to condition the independence of the remaining variables
# (label spelling corrected: it is shown to the user in the widget)
selector = pn.widgets.Select(name='Test if variables are independent given: ', options=col_vars.value + row_vars.value)
selector

In [None]:
def llm_conditional(flat_table):
    """
    Fit the conditional-independence log-linear model to a flattened
    contingency table using Poisson regression.

    The variable chosen in the notebook-level ``selector`` widget is the
    conditioning variable: each of the two remaining variables from
    ``row_vars`` and ``col_vars`` interacts with it, but not with the other.
    The formula is printed and the fitted results object is returned.
    """
    def tidy(label):
        # drop the '#...' suave tag and replace spaces so the name can be
        # used inside a statsmodels formula
        return label.split('#')[0].replace(' ', '_')

    model_data = flat_table.copy()
    model_data.columns = [tidy(label) for label in flat_table.columns]

    # split the selected variables into the conditioning one and the other two
    remaining = row_vars.value + col_vars.value
    remaining.remove(selector.value)
    given = tidy(selector.value)
    others = [tidy(label) for label in remaining]

    formula = 'freq ~ {0} + {1} + {2} + {1} * {0}+ {2} * {0}'.format(given, others[0], others[1])
    print('Model: ' + formula)

    # fit the Poisson GLM
    fitted = smf.glm(formula, data=model_data, family=sm.families.Poisson()).fit()
    return fitted

In [None]:
# fit the conditional-independence model; it is defined for three-way tables only
if table_shape == 'three-way':
    # Select 'freq' before summing: summing every column first is wasteful
    # and can fail on non-numeric columns in recent pandas.
    # observed=False keeps empty cells (count 0) for categorical groupers.
    flat = (df_binned.groupby(col_vars.value + row_vars.value, observed=False)['freq']
            .sum().to_frame().reset_index())
    result = llm_conditional(flat)
    display(result.summary())
else:
    # do not summarize a stale `result` left over from an earlier cell
    print("Skipped: the conditional independence model applies to three-way tables only.")

<h2><span style="color:red">5c. Test for homogeneous association</span></h2>

For three-way tables, test for homogeneous association between the variables. The log-linear model, also called the no-three-factor-interaction model, tests for homogeneous association and implies that the conditional relationship between any pair of variables given the third is the same at each level of the third variable.

In [None]:
def llm_association(flat_table):
    """
    Fit the homogeneous-association log-linear model to a flattened
    contingency table using Poisson regression.

    The variables selected in the notebook-level ``row_vars`` and
    ``col_vars`` widgets are combined as ``(a + b + ...)**2`` in the
    formula. The formula is printed and the fitted results object is
    returned.
    """
    def tidy(label):
        # drop the '#...' suave tag and replace spaces so the name can be
        # used inside a statsmodels formula
        return label.split('#')[0].replace(' ', '_')

    model_data = flat_table.copy()
    model_data.columns = [tidy(label) for label in flat_table.columns]

    # additive terms wrapped in (...)**2
    terms = ' + '.join(tidy(label) for label in row_vars.value + col_vars.value)
    formula = 'freq ~ ({})**2'.format(terms)
    print('Model: ' + formula)

    # fit the Poisson GLM
    fitted = smf.glm(formula, data=model_data, family=sm.families.Poisson()).fit()
    return fitted

In [None]:
# fit the homogeneous-association model; it is defined for three-way tables only
if table_shape == 'three-way':
    # Select 'freq' before summing: summing every column first is wasteful
    # and can fail on non-numeric columns in recent pandas.
    # observed=False keeps empty cells (count 0) for categorical groupers.
    flat = (df_binned.groupby(col_vars.value + row_vars.value, observed=False)['freq']
            .sum().to_frame().reset_index())
    result = llm_association(flat)
    display(result.summary())
else:
    # do not summarize a stale `result` left over from an earlier cell
    print("Skipped: the homogeneous association model applies to three-way tables only.")