<h1><span style="color:red">Generate Factor Contributions</span></h1>

### This notebook reads the numeric and categorical variables of a survey dataset and computes factor contributions for every level of those variables.

## 1. Retrieve survey parameters from the URL

In [None]:
%%javascript
/**
 * Look up one parameter in the page's query string (window.location.search).
 *
 * The regular expression always matches the whole query string and captures
 * the text after "key=" (up to the next "&") in group 1, so replacing the
 * match with "$1" leaves just that value. The "i" flag makes the key lookup
 * case-insensitive.
 *
 * @param {string} key - name of the query-string parameter
 * @returns {string} the unescaped value, or "" when the key is absent or has no value
 */
function getQueryStringValue (key)
{  
    return unescape(window.location.search.replace(new RegExp("^(?:.*[&\\?]" + escape(key).replace(/[\.\+\*]/g, "\\$&") + "(?:\\=([^&]*))?)?.*$", "i"), "$1"));
}
// Copy the URL parameters into the Python kernel as string variables.
//
// The values come straight from the URL, so they must not be pasted between
// hand-written quotes: a quote or backslash in a parameter would break the
// assignment or run arbitrary Python. JSON.stringify produces a double-quoted
// literal with every special character escaped, and those escapes are also
// valid in a Python string literal.
function setKernelString(pythonName, value) {
    IPython.notebook.kernel.execute(pythonName + "=" + JSON.stringify(String(value)));
}

setKernelString("survey_url", getQueryStringValue("surveyurl"));
setKernelString("views", getQueryStringValue("views"));
setKernelString("view", getQueryStringValue("view"));
setKernelString("user", getQueryStringValue("user"));
setKernelString("csv_file", getQueryStringValue("csv"));
setKernelString("dzc_file", getQueryStringValue("dzc"));
setKernelString("params", getQueryStringValue("params"));
setKernelString("active_object", getQueryStringValue("activeobject"));
setKernelString("full_notebook_url", window.location);

In [None]:
# common imports
from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
from IPython.display import Markdown, display

import re
import datetime
import pandas as pd    
import panel as pn
import numpy as np
import matplotlib.pyplot as plt

# load the Tabulator extension needed by the results table built at the end of the notebook
pn.extension('tabulator')
def printmd(string):
    """Render `string` as Markdown in the cell output."""
    rendered = Markdown(string)
    display(rendered)

# folder, relative to this notebook, that the survey CSV is read from
absolutePath = "../../temp_csvs/"

# local imports
import sys
sys.path.insert(1, '../../helpers')
import panel_libs as panellibs
import suave_integration as suaveint

## 2. Read the survey file

In [None]:
# load the survey CSV whose file name arrived through the URL parameters
df = panellibs.extract_data(absolutePath + csv_file)

# keep the column names in a one-column frame and show them
variables_df = pd.DataFrame({'varname': df.columns})
printmd("<b><span style='color:red'>All variables in the survey file:</span></b>")
print(variables_df['varname'].values)

## 3. Bin Numerical and Date Variables

Create custom bins for numerical and date variables, or skip to the next step.

<h3><span style="color:red">(OPTIONAL) Set custom bins for numerical variables:</span></h3>

In [None]:
# select numerical columns to bin -- **unselected columns will be automatically binned into 5 equal bins**
def _numeric_if_possible(column):
    """
    Return `column` parsed as numbers, or the column unchanged when any
    value cannot be parsed.

    Equivalent to pd.to_numeric(column, errors='ignore') without relying on
    the deprecated 'ignore' option.
    """
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column

df = df.apply(_numeric_if_possible)

# columns without a single value carry no information for the analysis
df = df.dropna(axis=1, how='all')

# only variables carrying the '#number' qualifier are offered for binning
op = [name for name in df.columns if '#number' in name]
binnable = pn.widgets.MultiChoice(name='Select Columns to Bin: ', options=op, height=350)
binnable

In [None]:
# set custom bins for a numeric (#number) variable
# NOTE(review): this cell indexes binnable.value, so it presumably fails with an
# IndexError when no column was selected above -- confirm how the optional step is skipped.
next_var = pn.widgets.Button(name='Continue to Next Variable', button_type='success')
# the click count of `next_var` doubles as the index of the column currently being binned
first_col = binnable.value[next_var.clicks]
# one list of slider values per selected column; filled in by the `b` callback below
selected_bins = [[] for i in binnable.value]

# read-only box showing which column is being binned
text = pn.widgets.TextInput(name='Current Variable: ', value=first_col, disabled=True)
# the user may ask for 1 to 5 bins
num = pn.widgets.Select(name='Select number of bins', options=[i+1 for i in range(5)])
next_bin = pn.widgets.Button(name='Next Bin', button_type='primary')
# range slider spanning the column's observed minimum and maximum
bin_range = pn.widgets.RangeSlider(name='Select Range for Bin #1',
                                   start=df[first_col].min(), end=df[first_col].max(),
                                   value=(df[first_col].min(), df[first_col].max()), step=2)

def plot_histogram(df, column, plotting_pane, x_range=None):
    """
    Draw a 50-bin histogram of df[column] and show it in `plotting_pane`.

    x_range, when given, is passed on as the x-axis limits of the plot.
    """
    figure, axes = plt.subplots(nrows=1, ncols=1)
    hist_options = {
        'bins': 50,
        'ax': axes,
        'title': 'Histogram of: ' + column,
        'xlim': x_range,
    }
    df[column].plot.hist(**hist_options)
    axes.set_xlabel(column)
    # hand the figure to the pane, then close it so it is not rendered a second time
    plotting_pane.object = figure
    plt.close()

def b(event):
    """
    Callback for the "Next Bin" button.

    Stores the current slider range as a bin for the column being edited.
    While fewer bins than requested have been recorded, the slider is moved
    to start one unit above the upper edge just stored and the histogram is
    redrawn for the remaining range. Once the requested number of bins is
    reached, the button is disabled and the layout shows either the
    "continue" button or a completion message.
    """
    # ignore calls made while the click counter is at zero (it is reset to 0 in `c`)
    if next_bin.clicks == 0:
        return
    
    # the click count of `next_var` is the index of the column being binned
    selected_col = binnable.value[next_var.clicks]

    if next_bin.clicks < num.value:
        bin_range.name = 'Select Range for Bin #' + str(next_bin.clicks + 1)
        selected_bins[next_var.clicks].append(bin_range.value)
        # the next bin begins one unit above the upper edge that was just recorded
        bin_range.start = bin_range.value[1] + 1
        bin_range.value = (bin_range.value[1] + 1, df[selected_col].max())
        plot_histogram(df, selected_col, plot, x_range = bin_range.value)
    else:
        # last requested bin: record it and stop accepting clicks
        selected_bins[next_var.clicks].append(bin_range.value)
        next_bin.disabled = True
        
        # slot 4 of the widget column is the placeholder created with the layout
        if selected_col != binnable.value[-1]:
            layout[0][4] = next_var
        else:
            layout[0][4] = "Variable Binning Complete!"

def c(event):
    """
    Callback for the "Continue to Next Variable" button.

    Resets the bin counter, points the widgets at the next selected column
    (indexed by the click count of `next_var`), restores the slider to that
    column's full range, clears the message slot and redraws the histogram.
    """
    next_bin.clicks = 0
    next_col = binnable.value[next_var.clicks] 
    text.value = next_col
    num.value = 1
    # slider bounds are reset before its value so the full range of the new column is available
    bin_range.start = df[next_col].min()
    bin_range.end = df[next_col].max()
    bin_range.value = (df[next_col].min(), df[next_col].max())
    next_bin.disabled = False
    # remove the "continue" button / message from the placeholder slot
    layout[0][4] = ""
    plot_histogram(df, next_col, plot)

# register the callbacks defined above
next_bin.on_click(b)
next_var.on_click(c)
# pane that plot_histogram draws into; callbacks `b` and `c` look up `plot` when they run
plot =  pn.pane.Matplotlib(dpi=80)
plot_histogram(df, binnable.value[next_var.clicks], plot)
# the trailing "" is a placeholder slot (index 4) that the callbacks overwrite
layout = pn.Row(pn.Column(text, num, bin_range, next_bin, ""), plot)
layout

In [None]:
# bin each column with chosen bins
# The binning widget starts every bin one unit above the upper edge of the
# previous bin, so the chosen edges never overlap. Both edges therefore have to
# be inclusive: with left-closed intervals every value equal to an upper edge
# (including the column maximum) would fall outside all bins and become NaN.
cols = binnable.value
for col, bins in zip(cols, selected_bins):
    intervals = pd.IntervalIndex.from_tuples(bins, closed='both')
    df[col] = pd.cut(df[col], intervals)

# bin remaining numeric columns into 5 equal bins
remainder = [name for name in binnable.options if name not in binnable.value]
if remainder:
    df[remainder] = df[remainder].apply(pd.cut, bins=5)

<h3><span style="color:red">(OPTIONAL) Set custom bins for date variables:</span></h3>

In [None]:
# parse every '#date' column and keep only the calendar date of each value
dates = [name for name in df.columns if '#date' in name]
df[dates] = df[dates].apply(pd.to_datetime, errors='coerce')
df[dates] = df[dates].apply(lambda column: column.dt.date)

# earliest and latest date found across all date columns
d_max = pd.Series(df[dates].to_numpy().flatten()).dropna().max()
d_min = pd.Series(df[dates].to_numpy().flatten()).dropna().min()

# widgets used to define one shared set of bins for all date variables
# NOTE(review): `num` is rebound here; the numeric callback `b` defined earlier reads the
# global `num` when it runs, so it will see this widget from now on.
date_bins = []
date_slider = pn.widgets.DateRangeSlider(name='Date Range for Bin #1 ',
                                         start=d_min,
                                         end=d_max,
                                         value=(d_min, d_max))
next_bin2 = pn.widgets.Button(name='Next Bin', button_type='primary')
num = pn.widgets.Select(name='Select number of bins', options=[count + 1 for count in range(5)])

def d(event):
    """
    Callback for the date "Next Bin" button.

    Stores the current slider range in `date_bins`. While fewer bins than
    requested have been recorded, the slider is moved to start at the upper
    edge just stored and to end at the latest date, and the date plot is
    redrawn. Once the requested number of bins is reached, the button is
    disabled and a completion message is shown.
    """
    if next_bin2.clicks < num.value:
        date_slider.name = 'Date Range for Bin #' + str(next_bin2.clicks + 1)
        date_bins.append(date_slider.value)
        # the next bin starts exactly where the recorded one ended
        date_slider.start = date_slider.value[1]
        # d_max is rebuilt as a datetime.datetime from its year/month/day for the slider's upper value
        date_slider.value = (date_slider.value[1], datetime.datetime(d_max.year, d_max.month, d_max.day))
        plot_dates(df, plot)
    else:
        # last requested bin: record it and stop accepting clicks
        date_bins.append(date_slider.value)
        next_bin2.disabled = True
        # slot 3 of the widget column is the placeholder created with layout2
        layout2[0][3] = 'Date Binning Complete!'
        
def plot_dates(df, plotting_pane):
    """
    Plot how often each date occurs, from the slider's lower bound onwards.

    All '#date' columns (global `dates`) are pooled. Dates on or after the
    lower end of `date_slider` are counted and drawn as a bar chart in
    `plotting_pane`; when no date passes the filter, all dates are plotted
    instead.
    """
    all_dates = pd.DataFrame(df[dates].to_numpy().flatten(), columns=['date'])

    # The pooled column holds plain date objects. Convert a copy to Timestamps
    # for the comparison, because ordering a date against a Timestamp is not
    # supported by recent pandas versions. Missing values become NaT and
    # compare as False, so they are filtered out.
    as_stamps = pd.to_datetime(all_dates['date'], errors='coerce')
    filtered = all_dates[as_stamps >= pd.Timestamp(date_slider.value[0])]

    if len(filtered) == 0:
        counts = all_dates.groupby('date').size()
        tick_step = 20
    else:
        counts = filtered.groupby('date').size()
        # thin out the tick labels when there are many distinct dates
        tick_step = int(len(counts) / 15) if len(counts) > 20 else 1

    fig, ax = plt.subplots(1, 1)
    if not counts.empty:
        # nothing to draw when no date is available at all
        counts.plot(kind='bar', ax=ax)
        ax.set_xticks(ax.get_xticks()[::tick_step])
    ax.set_ylabel('Frequency')
    for tick in ax.get_xticklabels():
        tick.set_rotation(70)
    plotting_pane.object = fig
    # close this figure explicitly so it is not rendered a second time
    plt.close(fig)

# register the date callback defined above
next_bin2.on_click(d)
# NOTE(review): `plot` is rebound here; the numeric callbacks `b` and `c` defined earlier
# read the global `plot` when they run, so they will draw into this pane from now on.
plot =  pn.pane.Matplotlib(dpi=80, tight=True)
plot_dates(df, plot)
# the trailing "" is a placeholder slot (index 3) that callback `d` overwrites
layout2 = pn.Row(pn.Column(num, date_slider, next_bin2, ""), plot)
layout2

In [None]:
# convert selected bins to binning intervals
def _day_start(moment):
    """Return the Timestamp at 00:00 of the calendar day of `moment`."""
    return pd.Timestamp(datetime.date(moment.year, moment.month, moment.day))

# every recorded slider range becomes a single interval; they are chained into one index
bins = pd.interval_range(start=_day_start(date_bins[0][0]),
                         end=_day_start(date_bins[0][1]),
                         periods=1)
for lower, upper in date_bins[1:]:
    interval = pd.interval_range(start=_day_start(lower), end=_day_start(upper), periods=1)
    bins = bins.append(interval)

# convert dates to date ranges
# (the loop variable is deliberately not called `d`, which is the name of the
#  slider callback defined in the previous cell)
for date_col in dates:
    df[date_col] = df[date_col].apply(pd.Timestamp)
    df[date_col] = pd.cut(df[date_col], bins=bins)

## 4. Generate Factor Contributions

In [None]:
# OPTIONAL: if survey has #img and #name columns, remove them
# errors='ignore' keeps this step truly optional: surveys without these columns
# (or a second run of this cell) no longer raise a KeyError.
df = df.drop(columns=['#img', '#name'], errors='ignore')

# OPTIONAL: if survey has columns with #long or #hidden, remove them
remove_cols = [name for name in df.columns if '#long' in name or '#hidden' in name]
df = df.drop(columns=remove_cols)

In [None]:
# helper functions
def find_unique(var):
    """
    Helper function to return all unique entries for #multi survey variables.

    Every distinct cell of df[var] is split on '|' and the individual entries
    are collected. Missing values (NaN / None) are skipped and non-string
    values are kept as they are.

    Returns a list of the distinct entries in order of first appearance.
    """
    entries = {}
    for cell in df[var].unique():
        if pd.isna(cell):
            continue
        # every string cell is split, including the first one
        parts = cell.split('|') if isinstance(cell, str) else [cell]
        for part in parts:
            # a dict keeps first-seen order, which makes the result reproducible
            entries.setdefault(part, None)
    return list(entries)

def find_tags(value):
    """
    Helper function to extract SuAVE qualifiers from variable names.

    A qualifier is a '#' followed by characters up to the next whitespace or
    the next '#', so adjacent qualifiers such as 'Age#number#sortquan' are
    returned as separate tags.

    Returns the list of qualifiers, or ['untagged'] when the name has none.
    """
    # raw string: avoids the invalid-escape warning raised for '\S' in a normal literal
    tags = re.findall(r'#[^\s#]+', value)
    return tags if tags else ['untagged']

def _level_mask(column, level, is_multi):
    """Boolean mask of the rows of `column` that belong to `level`."""
    if not is_multi:
        return column == level

    def _has_level(cell):
        # a #multi cell belongs to a level when the level is one of its
        # '|'-separated entries (or the cell as a whole)
        if isinstance(cell, str):
            return cell == level or level in cell.split('|')
        return cell == level

    return column.apply(_has_level).astype(bool)


def find_factor_contributions(var, filter_vars):
    """
    Helper function to find all the factor contributions from
    a list of filter variables to the levels of the variable of interest.

    For every level X of `var` and every level A of each filter variable
    (other than `var` itself) the result stores, under
    out[var][var + '_' + X][<filter name>: A], the list
        [contribution, completeness, accuracy, count(A and X), count(A), qualifiers]
    where accuracy = P(X | A) in percent, completeness = P(A | X) in percent
    and contribution = (P(X | A) - P(X)) in percentage points. The three
    ratios are rounded to 3 decimals, and accuracy and completeness are 0
    when either count(A) or count(X) is 0.

    Levels of '#multi' filter variables come from find_unique, and a row
    counts for such a level when the level is one of its '|'-separated entries.
    """
    total_rows = df.shape[0]

    # the masks and counts of the levels of `var` do not depend on the filter variable
    x_info = []
    for x_level in df[var].value_counts().index.to_list():
        x_mask = df[var] == x_level
        x_count = int(x_mask.sum())
        x_info.append((var + '_' + str(x_level), x_mask, x_count, x_count / total_rows))

    out = {var: {}}

    for f in filter_vars:
        if var == f:
            continue

        is_multi = '#multi' in f
        if is_multi:
            a_levels = find_unique(f)
        else:
            a_levels = df[f].value_counts().index.to_list()

        value_name = f.split('#')[0]
        tags = find_tags(f)

        for key, _, _, _ in x_info:
            out[var].setdefault(key, {})

        for a_level in a_levels:
            a_mask = _level_mask(df[f], a_level, is_multi)
            a_count = int(a_mask.sum())

            for key, x_mask, x_count, x_prop in x_info:
                ax_count = int((a_mask & x_mask).sum())
                if a_count and x_count:
                    ax_prop = ax_count / a_count
                    completeness = round((ax_count / x_count) * 100, 3)
                else:
                    # empty level (e.g. an unused category): no ratio can be formed
                    ax_prop = 0
                    completeness = 0
                contribution = round((ax_prop - x_prop) * 100, 3)
                accuracy = round(ax_prop * 100, 3)
                out[var][key][value_name + ': ' + str(a_level)] = [contribution,
                                                                   completeness,
                                                                   accuracy,
                                                                   ax_count,
                                                                   a_count,
                                                                   list(tags)]

    return out

# select a variable of interest to generate factor contributions
# (every column still present in df is offered; the chosen name is read via selector.value)
selector = pn.widgets.Select(name='Select a variable to investigate: ', options=list(df.columns))
selector

In [None]:
# generate factor contributions:
#     **will take approximately 30 seconds to run**
#     (based on survey size, and number of unique levels for variables)
# every column of df is passed as a filter variable; the selected variable itself is skipped
contributions = find_factor_contributions(selector.value, list(df.columns))

## 5. Analyze Factor Contributions

In [None]:
# sort contributions
def sort_contributions(dictionary, ascending=False):
    """
    Helper function to sort dictionary by values in ascending/descending order.

    `dictionary` is the output of find_factor_contributions; the entry for the
    currently selected variable (selector.value) is sorted in place, level by
    level, and returned. Entries are compared by their value lists, i.e. by
    contribution first.

    ascending: False (default) puts the largest contributions first,
               True puts the smallest contributions first.
    """
    out_dict = dictionary[selector.value]
    for key in out_dict:
        ranked = sorted(out_dict[key].items(),
                        key=lambda item: item[1],
                        reverse=not ascending)
        out_dict[key] = dict(ranked)
    return out_dict

# find largest contributions to each level of the selected variable of interest
# (out_dict maps '<variable>_<level>' to that level's sorted contributions)
out_dict = sort_contributions(contributions)

# investigate levels of variable of interest
level = pn.widgets.Select(name = 'Select a level to analyze (A):',options=list(out_dict.keys()))
level

In [None]:
# display contribution outputs for specified filter level (A)
def color(val):
    """
    Styling function: CSS colour rule for a scalar value.

    Negative values are shown in red, everything else in green.
    """
    if val < 0:
        shade = 'red'
    else:
        shade = 'green'
    return 'color: %s' % shade

def filter_counts(df):
    """
    Helper function to drop the rows of the output dataframe whose
    'Completeness' is not greater than 0.
    """
    has_overlap = df['Completeness'] > 0
    return df[has_overlap]

def search_filter(df, pattern, column):
    """
    Helper function to filter dataframe values from text input.

    Returns `df` unchanged when `pattern` is empty; otherwise returns the rows
    whose `column` contains `pattern`, ignoring case.

    The text is matched literally rather than as a regular expression, so
    characters such as '(' or '[' (which occur in interval labels of binned
    variables) can be searched for without raising. Missing values never match.
    """
    if not pattern:
        return df
    matches = df[column].str.contains(pattern, case=False, regex=False, na=False)
    return df[matches]

# build output dataframe
# (one row per explanatory value of the selected level; the six stored fields become columns)
output = pd.DataFrame.from_dict(out_dict[level.value]).T.reset_index()
output.columns = ['Potential Explanatory Values (X)',
                  'Contribution of A',
                  'Completeness',
                  'Accuracy',
                  'Count (AX)',
                  'Count (A)',
                  'SuAVE Qualifiers']
# only the first qualifier of each variable is kept, so the checkbox filter can match on it
output['SuAVE Qualifiers'] = output['SuAVE Qualifiers'].apply(lambda x: x[0])

# create output table for display
# (rows with zero completeness are removed; the qualifier column is hidden but still filterable)
tab = pn.widgets.Tabulator(filter_counts(output), pagination='remote',
                           show_index=False, hidden_columns=['SuAVE Qualifiers'])
# colour the contribution column red/green by sign
# NOTE(review): Styler.applymap may be deprecated in favour of Styler.map in recent pandas -- TODO confirm
tab.style.applymap(color, subset=pd.IndexSlice[:, ['Contribution of A']])

# define filtering widgets
checkbox = pn.widgets.CheckBoxGroup(options=['#sortquan', '#number', '#multi', '#date', 'untagged'], width=100)
contribution_slider = pn.widgets.RangeSlider(start=-100, end=100, name='Contribution Filter', width=200)
completeness_slider = pn.widgets.RangeSlider(start=0, end=100, name='Completeness Filter', width=200)
accuracy_slider = pn.widgets.RangeSlider(start=0, end=100, name='Accuracy Filter', width=200)
search = pn.widgets.TextInput(name='Search Explanatory Values', placeholder='Enter text to filter values', width=200)

# apply filtering widgets to respective columns
tab.add_filter(checkbox, 'SuAVE Qualifiers')
tab.add_filter(contribution_slider, 'Contribution of A')
tab.add_filter(completeness_slider, 'Completeness')
tab.add_filter(accuracy_slider, 'Accuracy')
# the text search is bound to search_filter, which receives the table's frame plus the current text
tab.add_filter(pn.bind(search_filter, pattern=search, column='Potential Explanatory Values (X)'))  

# display output table and widgets
pn.Column(pn.Row(search, contribution_slider, completeness_slider, accuracy_slider, checkbox), tab)