<h1><span style="color:red">Generate Factor Contributions</span></h1>

### This notebook reads the numeric and categorical variables of a survey dataset and computes factor contributions for every level of each variable.

## 1. Retrieve survey parameters from the URL

In [None]:
%%javascript
// Return the decoded value of the query-string parameter `key` from the current
// page URL (window.location.search). Returns "" when the parameter is absent
// or has no value. Key matching is case-insensitive; if the parameter occurs
// more than once, the last occurrence wins (the leading `.*` is greedy).
function getQueryStringValue (key)
{
    // Percent-encode the key the same way URLs do, then neutralise anything
    // that would still be special inside a regular expression.
    var escapedKey = encodeURIComponent(key).replace(/[.*+?^${}()|[\]\\]/g, "\\$&");

    // The lookahead (?=&|$) forces the match to end at a parameter boundary.
    // Without it, asking for "view" in "?view=1&views=3" would match the
    // prefix of "&views", skip the optional value group and return "".
    var pattern = new RegExp("^(?:.*[&\\?]" + escapedKey + "(?:\\=([^&]*))?(?=&|$))?.*$", "i");
    var rawValue = window.location.search.replace(pattern, "$1");

    try {
        // decodeURIComponent understands UTF-8 percent-sequences; the
        // deprecated unescape() decodes them byte by byte (%C3%A9 -> "Ã©").
        return decodeURIComponent(rawValue);
    } catch (e) {
        // Malformed percent-sequence (URIError): fall back to the lenient
        // legacy decoder instead of failing the whole cell.
        return unescape(rawValue);
    }
}
// Copy the survey parameters from the page URL into Python string variables
// in the notebook kernel.
//
// The values come straight from the query string, i.e. they are untrusted.
// JSON.stringify emits a double-quoted literal with quotes, backslashes and
// control characters escaped, which is also a valid Python string literal, so
// a value containing ' or \ can no longer break the assignment or inject code
// into the kernel.
function setKernelString(name, value)
{
    IPython.notebook.kernel.execute(name + " = " + JSON.stringify(String(value)));
}
setKernelString("survey_url", getQueryStringValue("surveyurl"));
setKernelString("views", getQueryStringValue("views"));
setKernelString("view", getQueryStringValue("view"));
setKernelString("user", getQueryStringValue("user"));
setKernelString("csv_file", getQueryStringValue("csv"));
setKernelString("dzc_file", getQueryStringValue("dzc"));
setKernelString("params", getQueryStringValue("params"));
setKernelString("active_object", getQueryStringValue("activeobject"));
// String(window.location) yields the full URL, as the original "+" coercion did.
setKernelString("full_notebook_url", window.location);

In [None]:
# common imports
from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
from IPython.display import Markdown, display
from ipyfilechooser import FileChooser

import pandas as pd    
import panel as pn

pn.extension()
def printmd(string):
    """Render `string` as Markdown in the cell's output area."""
    rendered = Markdown(string)
    display(rendered)

# Directory prefix that is prepended to the survey CSV file name when loading.
# Despite the name, this is a relative path ("../../"), so it is resolved
# against the notebook's working directory.
absolutePath = "../../temp_csvs/"

# local imports
import sys
sys.path.insert(1, '../../helpers')
import panel_libs as panellibs
import suave_integration as suaveint

## 2. Read the survey file

In [None]:
# Load the survey CSV whose name arrived in the `csv` URL parameter.
df = panellibs.extract_data(absolutePath + csv_file)

# One-column frame listing every variable (column) name of the survey.
variables_df = pd.DataFrame({'varname': df.columns})
printmd("<b><span style='color:red'>All variables in the survey file:</span></b>")
print(variables_df['varname'].values)

In [3]:
# Preview the first rows to sanity-check that the survey file loaded as expected.
df.head()

Unnamed: 0,record_id,bio_sex_birth_2,gender_identity_term,race_ethn_hispanic,race_ethn_hispanic_detail_2___1,race_ethn_hispanic_detail_2___2,race_ethn_hispanic_detail_2___3,race_ethn_hispanic_detail_2___4,race_ethn_hispanic_detail_2___5,race_ethn_hispanic_detail_2___6,...,vaccine_reasons___6,vaccine_reasons___7,vaccine_reasons___8,vaccine_reasons___9,vaper_cur_stat,work_closecont,work_ppe,work_ppe_date_mdy,work_wash,zip_code
0,Aim1_english_1,1.0,1.0,1.0,1.0,,,,,,...,,,,,,,,,,
1,Aim1_english_2,1.0,1.0,1.0,,,,1.0,,,...,,,,,,,,,,
2,Aim1_english_3,1.0,1.0,1.0,1.0,,,,,,...,,,,,,,,,,
3,Aim1_english_4,1.0,1.0,1.0,1.0,,,,,,...,,,,,,,,,,
4,Aim1_english_5,1.0,1.0,1.0,,,,1.0,,,...,,,,,,,,,,


## 3. Generate Factor Contributions

In [10]:
# TODO: bin all numeric variables in df (not implemented yet; numeric columns are currently used with their raw values as levels)

In [73]:
def find_factor_contributions(var, filter_vars, data=None):
    """
    Compute the factor contributions of filter-variable levels to the levels
    of a variable of interest.

    For every level ``i`` of ``var`` and every level ``j`` of each filter
    variable ``f`` the contribution is::

        (P(var == i | f == j) - P(var == i)) * 100

    i.e. the percentage-point difference between the share of level ``i``
    among the rows where ``f == j`` and its share among all rows. Both shares
    count rows in which ``var`` is missing in their denominator. Levels are
    taken from ``value_counts()``, so missing values are not levels.

    Parameters
    ----------
    var : str
        Column whose levels are explained.
    filter_vars : iterable of str
        Candidate explanatory columns. ``var`` itself is skipped if present.
    data : pandas.DataFrame, optional
        Frame to analyse. Defaults to the notebook-level ``df``.

    Returns
    -------
    dict
        ``{var: {"<var>_<i>": {"<f>_<j>": contribution, ...}, ...}}`` with
        plain Python floats as values. Filter levels that occur in no row
        (e.g. unused categories of a categorical column) are omitted because
        their conditional share is undefined.
    """
    if data is None:
        data = df  # fall back to the survey frame loaded by the notebook

    out = {var: {}}
    n_rows = data.shape[0]
    if n_rows == 0:
        # No rows means no shares can be computed.
        return out

    # The levels of `var` do not depend on the filter variable: look them up once.
    x_col = data[var]
    x_levels = x_col.value_counts().index.to_list()

    for f in filter_vars:
        if var == f:
            continue
        f_col = data[f]
        # Row count of every level of f, computed once per filter variable.
        # Summing the boolean mask avoids copying the whole frame just to count rows.
        a_levels = [(j, int((f_col == j).sum()))
                    for j in f_col.value_counts().index.to_list()]

        for i in x_levels:
            x_mask = x_col == i
            # int() keeps the arithmetic in plain Python numbers, so the
            # result values are Python floats rather than numpy scalars.
            x_prop = int(x_mask.sum()) / n_rows
            level_out = out[var].setdefault(var + '_' + str(i), {})

            for j, a_count in a_levels:
                if a_count == 0:
                    # Zero-count level: P(var == i | f == j) is undefined.
                    continue
                ax_count = int((x_mask & (f_col == j)).sum())
                ax_prop = ax_count / a_count
                level_out[f + '_' + str(j)] = (ax_prop - x_prop) * 100

    return out

In [None]:
# second check on calculations

In [74]:
# Spot-check the helper on one variable of interest and two filter variables.
find_factor_contributions('cc_cancer', ['bio_sex_birth_2', 'cov_tst_mthd'])

{'cc_cancer': {'cc_cancer_0.0': {'bio_sex_birth_2_1.0': 5.269568853137524,
   'bio_sex_birth_2_0.0': -2.520863550947794,
   'bio_sex_birth_2_99.0': 4.706165703275528,
   'bio_sex_birth_2_96.0': 51.37283236994219,
   'bio_sex_birth_2_3.0': -48.627167630057805,
   'cov_tst_mthd_1.0': 25.813424869154293,
   'cov_tst_mthd_4.0': 27.197008194118016,
   'cov_tst_mthd_2.0': 20.603601600711425,
   'cov_tst_mthd_3.0': 30.784597075824543},
  'cc_cancer_1.0': {'bio_sex_birth_2_1.0': 0.1863619854210248,
   'bio_sex_birth_2_0.0': -0.19979946221577588,
   'bio_sex_birth_2_99.0': -0.9826589595375723,
   'bio_sex_birth_2_96.0': -0.9826589595375723,
   'bio_sex_birth_2_3.0': 99.01734104046243,
   'cov_tst_mthd_1.0': 0.5931368173297458,
   'cov_tst_mthd_4.0': 1.2151432382646257,
   'cov_tst_mthd_2.0': 4.145546168667556,
   'cov_tst_mthd_3.0': 1.9585175110506627}}}

In [66]:
# Build the contribution table for every column of the survey, using all
# columns as filter variables each time.
# NOTE(review): this also covers id-like columns such as record_id, and the
# trailing expression displays the complete nested dict.

contributions = {}
for col in df.columns:
    # Each call returns a single-key dict {col: {...}}; merging it into an
    # empty dict is the same as assigning it, so no first-iteration case is needed.
    contributions.update(find_factor_contributions(col, list(df.columns)))
contributions

## 4. Analyze Factor Contributions

In [None]:
# TODO: sort each level's contributions so that the largest ones come first (not implemented yet)



In [76]:
# Have the user select the variable of interest to investigate.
# The dropdown offers every column of df; the chosen name is read later
# through selector.value.
selector = pn.widgets.Select(name='Select a variable to investigate: ', options=list(df.columns))
selector

In [None]:
# Display the contributions to each level of the selected variable of interest.
# NOTE(review): the values are shown as stored in `contributions`; they are not
# yet sorted or limited to the largest ones. selector.value is read when this
# cell runs, so re-run the cell after changing the selection.

contributions[selector.value]