<h1><span style="color:red">Generate Factor Contributions</span></h1>

### This notebook reads numeric and categorical variables from the survey dataset and computes factor contributions for all levels of the variables in the survey dataset.

## 1. Retrieve survey parameters from the URL

In [None]:
%%javascript
/**
 * Return the decoded value of query-string parameter `key` from the page URL.
 *
 * The lookup is case-insensitive and uses the last occurrence of the parameter.
 * An empty string is returned when the parameter is absent or has no `=value`.
 *
 * The key must be followed by `=`, `&` or the end of the query string. Without
 * that check a key that is a prefix of another parameter name (e.g. "view" vs
 * "views") could match the longer parameter and yield an empty value.
 *
 * NOTE: escape()/unescape() are legacy functions; they are kept so that values
 * are decoded exactly as before.
 */
function getQueryStringValue (key)
{
    // escape() leaves ".", "+" and "*" untouched, so backslash them for the regex.
    var keyPattern = escape(key).replace(/[\.\+\*]/g, "\\$&");
    var matcher = new RegExp("^(?:.*[&\\?]" + keyPattern + "(?:\\=([^&]*)|(?=&|$)))?.*$", "i");
    // Replacing the whole query string with capture group 1 leaves only the value.
    return unescape(window.location.search.replace(matcher, "$1"));
}
/**
 * Assign a JavaScript value to a Python variable in the notebook kernel.
 *
 * The value comes from the page URL and is therefore untrusted. It is sent as
 * a JSON-encoded, double-quoted string literal, so quotes, backslashes and
 * newlines inside the value cannot terminate the literal and run as Python.
 */
function setKernelVariable(name, value)
{
    IPython.notebook.kernel.execute(name + "=" + JSON.stringify(String(value)));
}

// Copy each survey parameter from the query string into a kernel variable.
setKernelVariable("survey_url", getQueryStringValue("surveyurl"));
setKernelVariable("views", getQueryStringValue("views"));
setKernelVariable("view", getQueryStringValue("view"));
setKernelVariable("user", getQueryStringValue("user"));
setKernelVariable("csv_file", getQueryStringValue("csv"));
setKernelVariable("dzc_file", getQueryStringValue("dzc"));
setKernelVariable("params", getQueryStringValue("params"));
setKernelVariable("active_object", getQueryStringValue("activeobject"));
// window.location is an object; setKernelVariable converts it to its URL string.
setKernelVariable("full_notebook_url", window.location);

In [None]:
# common imports
from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
from IPython.display import Markdown, display
from ipyfilechooser import FileChooser

import pandas as pd    
import panel as pn
import numpy as np

pn.extension()
def printmd(string):
    """Render `string` as Markdown in the cell output."""
    rendered = Markdown(string)
    display(rendered)

# Directory prefix for the survey CSV file; the file name from the URL is appended to it.
# NOTE: despite its name this is a relative path.
absolutePath = "../../temp_csvs/"

# local imports
# Add the shared helpers directory to the module search path so that the
# project modules imported below can be found.
import sys
sys.path.insert(1, '../../helpers')
import panel_libs as panellibs
import suave_integration as suaveint

## 2. Read the survey file

In [None]:
# load the survey csv named in the URL
df = panellibs.extract_data(absolutePath + csv_file)

# put the column names in a one-column frame and list them
variables_df = pd.DataFrame({'varname': df.columns})
printmd("<b><span style='color:red'>All variables in the survey file:</span></b>")
print(variables_df['varname'].values)

In [None]:
# preview the first rows of the survey data
df.head()

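## 3. Generate Factor Contributions

For a level *x* of the selected variable and a level *a* of another variable, the factor contribution is $(P(x \mid a) - P(x)) \times 100$: the share of rows at level *x* among the rows at level *a*, minus the share of rows at level *x* in the whole survey, in percentage points. Numeric variables are first cut into 5 bins so that they have levels too.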

In [None]:
# bin numerical columns in survey dataframe
N_BINS = 5  # number of bins each numeric column is cut into


def _to_numeric_if_possible(column):
    """
    Return `column` converted to a numeric dtype, or unchanged when any value
    cannot be parsed as a number.

    Replaces pd.to_numeric(errors='ignore'), which is deprecated in recent
    pandas releases, with the explicit try/except that pandas recommends.
    """
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


df_binned = df.apply(_to_numeric_if_possible)
numeric_cols = df_binned.select_dtypes(include=np.number).columns.tolist()

# pd.cut derives the bin edges from the column's min/max, so a numeric column
# with no values at all cannot be binned; such columns are left untouched.
binnable_cols = [col for col in numeric_cols if df_binned[col].notna().any()]
if binnable_cols:
    df_binned[binnable_cols] = df_binned[binnable_cols].apply(pd.cut, bins=N_BINS)

In [None]:
# OPTIONAL: if survey has #img and #name columns, remove them
# errors='ignore' makes the drop a no-op for columns that are absent, so the cell
# works for surveys without these columns and can be re-run safely
df_binned = df_binned.drop(columns=['#img', '#name'], errors='ignore')

# OPTIONAL: if survey has columns with #date or #hidden, remove them
# str() guards against column labels that are not strings
date_cols = [i for i in list(df_binned.columns) if '#date' in str(i) or '#hidden' in str(i)]
df_binned = df_binned.drop(columns=date_cols)

In [None]:
# helper functions
def find_unique(var, data=None):
    """
    Helper function to return all unique entries for #multi survey variables.

    Each value of the column is split on '|' and the individual entries are
    collected without duplicates. Missing values are skipped; values that are
    not strings are kept as they are.

    Parameters
    ----------
    var : str
        Name of the column to inspect.
    data : pandas.DataFrame, optional
        Frame holding the column. Defaults to the notebook-level `df`.

    Returns
    -------
    list
        Unique entries, in order of first appearance.
    """
    if data is None:
        data = df

    # a dict keeps insertion order, which gives a deterministic result
    entries = {}
    for value in data[var].unique():
        if pd.isna(value):
            continue
        # every string is split, including the first unique value
        parts = value.split('|') if isinstance(value, str) else [value]
        for part in parts:
            entries[part] = None
    return list(entries)

def _split_multi_value(value):
    """Return the '|'-separated entries of one #multi cell ([] for a missing value)."""
    if pd.isna(value):
        return []
    if isinstance(value, str):
        return value.split('|')
    return [value]


def find_factor_contributions(var, filter_vars, data=None):
    """
    Helper function to find all the factor contributions from
    a list of filter variables to the levels of the variable of interest.

    For a level x of `var` and a level a of a filter variable the contribution
    is (P(x | a) - P(x)) * 100: the share of rows at level x among the rows at
    level a, minus the share of rows at level x in the whole frame.

    For filter variables whose name contains '#multi', a row belongs to level a
    when a is one of the '|'-separated entries of its value.

    Parameters
    ----------
    var : str
        Column whose levels are explained.
    filter_vars : iterable of str
        Columns whose levels act as factors; `var` itself is skipped.
    data : pandas.DataFrame, optional
        Frame holding all columns. Defaults to the notebook-level `df_binned`.

    Returns
    -------
    dict
        {var: {var + '_' + level: {filter + '_' + level: contribution}}}.
        The inner dictionary of `var` is empty when there is no filter
        variable other than `var`.
    """
    if data is None:
        data = df_binned

    out = {var: {}}
    filters = [f for f in filter_vars if f != var]
    if not filters:
        return out

    total_rows = len(data)
    target = data[var]
    x_levels = target.value_counts().index.to_list()

    # Everything that depends only on the level of `var` is computed once.
    # The masks come from the same frame as the filter masks, so binned levels
    # are compared against binned values.
    level_keys = {i: var + '_' + str(i) for i in x_levels}
    x_masks = {i: (target == i) for i in x_levels}
    x_props = {
        i: (int(x_masks[i].sum()) / total_rows if total_rows else 0.0)
        for i in x_levels
    }
    for i in x_levels:
        out[var].setdefault(level_keys[i], {})

    for f in filters:
        column = data[f]
        if '#multi' in f:
            row_entries = [_split_multi_value(value) for value in column]
            a_levels = list(dict.fromkeys(entry for row in row_entries for entry in row))
        else:
            row_entries = None
            a_levels = column.value_counts().index.to_list()

        for j in a_levels:
            if row_entries is None:
                a_mask = column == j
            else:
                a_mask = pd.Series([j in row for row in row_entries],
                                   index=column.index, dtype=bool)
            a_count = int(a_mask.sum())

            for i in x_levels:
                ax_count = int((a_mask & x_masks[i]).sum())
                # a level without rows has no conditional share; treat it as 0
                ax_prop = ax_count / a_count if a_count else 0
                out[var][level_keys[i]][f + '_' + str(j)] = (ax_prop - x_props[i]) * 100

    return out

In [None]:
# choose the variable of interest for which factor contributions are generated
selector = pn.widgets.Select(
    name='Select a variable to investigate: ',
    options=df_binned.columns.tolist(),
)
selector

In [None]:
# generate factor contributions: 
#     **will take approximately 30 seconds to run**
#     (based on survey size, and number of unique levels for variables)
# every column of df_binned, except the selected one, is used as a filter variable
contributions = find_factor_contributions(selector.value, list(df_binned.columns))

## 4. Analyze Factor Contributions

In [None]:
# sort contributions
def sort_contributions(dictionary, ascending=False):
    """
    Helper function to sort the factor contributions of every level by value.

    Parameters
    ----------
    dictionary : dict
        Result of find_factor_contributions:
        {variable: {level: {factor: contribution}}}.
    ascending : bool, optional
        False (default) puts the largest contribution first, which is what the
        default call has always returned. True puts the smallest first.
        Earlier versions applied this flag inverted.

    Returns
    -------
    dict
        {level: {factor: contribution}} with every inner dictionary ordered by
        contribution. The input dictionary is not modified.
    """
    if len(dictionary) == 1:
        # use the dictionary's own key, so the result does not depend on the
        # current state of the selector widget
        levels = next(iter(dictionary.values()))
    else:
        levels = dictionary[selector.value]

    return {
        level: dict(sorted(factors.items(), key=lambda item: item[1], reverse=not ascending))
        for level, factors in levels.items()
    }

In [None]:
# find largest contributions to each level of the selected variable of interest
# (with the default arguments each level's factors come back largest contribution first)
out_dict = sort_contributions(contributions)

In [None]:
# pick one level of the variable of interest to inspect
level = pn.widgets.Select(
    name='Select a level to analyze:',
    options=list(out_dict),
)
level

In [None]:
# output factor contributions to specified variable level
# (a dict keyed by '<filter variable>_<level>', in the order produced by sort_contributions)
out_dict[level.value]