<h1><span style="color:red">Generate Factor Contributions</span></h1>

### This notebook reads numeric and categorical variables from the survey dataset and computes factor contributions for all levels of the variables in the survey dataset.

## 1. Retrieve survey parameters from the URL

In [1]:
%%javascript
function getQueryStringValue (key)
{  
    return unescape(window.location.search.replace(new RegExp("^(?:.*[&\\?]" + escape(key).replace(/[\.\+\*]/g, "\\$&") + "(?:\\=([^&]*))?)?.*$", "i"), "$1"));
}
IPython.notebook.kernel.execute("survey_url='".concat(getQueryStringValue("surveyurl")).concat("'"));
IPython.notebook.kernel.execute("views='".concat(getQueryStringValue("views")).concat("'"));
IPython.notebook.kernel.execute("view='".concat(getQueryStringValue("view")).concat("'"));
IPython.notebook.kernel.execute("user='".concat(getQueryStringValue("user")).concat("'"));
IPython.notebook.kernel.execute("csv_file='".concat(getQueryStringValue("csv")).concat("'")); 
IPython.notebook.kernel.execute("dzc_file='".concat(getQueryStringValue("dzc")).concat("'")); 
IPython.notebook.kernel.execute("params='".concat(getQueryStringValue("params")).concat("'")); 
IPython.notebook.kernel.execute("active_object='".concat(getQueryStringValue("activeobject")).concat("'")); 
IPython.notebook.kernel.execute("full_notebook_url='" + window.location + "'"); 

<IPython.core.display.Javascript object>

In [3]:
# common imports
from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
from IPython.display import Markdown, display

from io import StringIO
import re
import datetime
import pandas as pd    
import panel as pn
import numpy as np
import matplotlib.pyplot as plt
from helper import *

pn.extension('tabulator')
def printmd(string):
    display(Markdown(string))

absolutePath = "/home/jovyan/jupyter-suave/temp_csvs/"

# local imports
import sys
sys.path.insert(1, '../../helpers')
import panel_libs as panellibs
import suave_integration as suaveint

In [4]:
# helper functions for widget interaction and events
def b(event):
    """
    Updates bin sliders when "next" is clicked
    """
    if next_bin.clicks == 0:
        return
    selected_col = binnable.value[next_var.clicks]
    if next_bin.clicks < num.value:
        bin_range.name = 'Select Range for Bin #' + str(next_bin.clicks + 1)
        selected_bins[next_var.clicks].append(bin_range.value)
        bin_range.start = bin_range.value[1] + 1
        bin_range.value = (bin_range.value[1] + 1, df[selected_col].max())
        plot_histogram(df, selected_col, plot, x_range = bin_range.value)
    else:
        selected_bins[next_var.clicks].append(bin_range.value)
        next_bin.disabled = True
        if selected_col != binnable.value[-1]:
            layout[0][4] = next_var
        else:
            layout[0][4] = "Variable Binning Complete!"
            # bin each column with chosen bins
            cols = binnable.value
            for col, bins in list(zip(cols, selected_bins)):
                bins = pd.IntervalIndex.from_tuples(bins, closed='left')
                df[col] = pd.cut(df[col], bins)
            # bin remaining numeric columns into 5 equal bins
            remainder = list(set(binnable.options) - set(binnable.value))
            df[remainder] = df[remainder].apply(pd.cut, bins=5, right=False)

def c(event):
    """
    Resets layout when "continue" is clicked
    """
    next_bin.clicks = 0
    next_col = binnable.value[next_var.clicks] 
    text.value = next_col
    num.value = 1
    bin_range.start = df[next_col].min()
    bin_range.end = df[next_col].max()
    bin_range.value = (df[next_col].min(), df[next_col].max())
    next_bin.disabled = False
    layout[0][4] = ""
    plot_histogram(df, next_col, plot)
    
def d(event):
    """
    Updates bin sliders when "next" is clicked
    """
    if next_bin2.clicks < num.value:
        date_slider.name = 'Date Range for Bin #' + str(next_bin2.clicks + 1)
        date_bins.append(date_slider.value)
        date_slider.start = date_slider.value[1]
        date_slider.value = (date_slider.value[1], datetime.datetime(d_max.year, d_max.month, d_max.day))
        plot_dates(df[dates], plot, date_slider.value[0])
    else:
        date_bins.append(date_slider.value)
        next_bin2.disabled = True
        layout2[0][3] = 'Date Binning Complete!'
        # apply bins to date variables
        bins = pd.interval_range(start=pd.Timestamp(datetime.date(date_bins[0][0].year, date_bins[0][0].month, date_bins[0][0].day)),
                         end=pd.Timestamp(datetime.date(date_bins[0][1].year, date_bins[0][1].month, date_bins[0][1].day)),
                         periods=1, closed='left')
        for d_bin in date_bins[1:]:
            out = (datetime.date(d_bin[0].year, d_bin[0].month, d_bin[0].day),
                   datetime.date(d_bin[1].year, d_bin[1].month, d_bin[1].day))
            interval = pd.interval_range(start=pd.Timestamp(out[0]), end=pd.Timestamp(out[1]), periods=1, closed='left')
            bins = bins.append(interval)
        # convert dates to date ranges
        for d in dates:
            df[d] = df[d].apply(pd.Timestamp)
            df[d] = pd.cut(df[d], bins=bins)

## 2. Read the survey file

In [5]:
# read the csv file
df = panellibs.extract_data(absolutePath + csv_file)

# create a list of variable names
variables_df = pd.DataFrame({'varname':df.columns})
printmd("<b><span style='color:red'>All variables in the survey file:</span></b>")
print(variables_df.varname.values)

<b><span style='color:red'>All variables in the survey file:</span></b>

['Name' 'OAID#link#multi' 'Affiliation#sortquan' 'City#sortquan'
 'Region#sortquan' 'Country#sortquan' 'Latitude#hidden' 'Longitude#hidden'
 'Collaborators#multi#link#sortquan' 'Scope#multi#sortquan'
 'Keywords#multi#sortquan' 'OA concepts#multi#sortquan'
 'Publications#hidden' 'Publication Dates#multi#sortquan' '#img' '#netvis']


<h2><span style="color:red">3. OPTIONAL: Bin Numerical and Date Variables</span></h3>

<span style="color:red">Create custom bins for numerical and date variables, or skip to the next step.
</span> 

<h3><span style="color:red">Set custom bins for numerical variables:</span></h3>

<span style="color:red">Please select at least one numeric variable to bin. The remaining unselected variables will be automatically grouped into 5 equal bins</span> 

In [None]:
# select numerical columns to bin -- **unselected columns will be automatically binned into 5 equal bins**
df = df.apply(pd.to_numeric, errors='ignore')
df = df.dropna(axis=1, how='all')
op = [i for i in list(df.columns) if '#number' in i]
binnable = pn.widgets.MultiChoice(name='Select Columns to Bin: ', options=op, height=350)
binnable

<span style="color:red">Set custom bins for the selected numerical variables.</span> 

In [None]:
# set custom bins for a numeric (#number) variable
next_var = pn.widgets.Button(name='Continue to Next Variable', button_type='success')
first_col = binnable.value[next_var.clicks]
selected_bins = [[] for i in binnable.value]

text = pn.widgets.TextInput(name='Current Variable: ', value=first_col, disabled=True)
num = pn.widgets.Select(name='Select number of bins', options=[i+1 for i in range(5)])
next_bin = pn.widgets.Button(name='Next Bin', button_type='primary')
bin_range = pn.widgets.RangeSlider(name='Select Range for Bin #1',
                                   start=df[first_col].min(), end=df[first_col].max(),
                                   value=(df[first_col].min(), df[first_col].max()), step=1)

next_bin.on_click(b)
next_var.on_click(c)
plot = pn.pane.Matplotlib(dpi=80)
plot_histogram(df, binnable.value[next_var.clicks], plot)
layout = pn.Row(pn.Column(text, num, bin_range, next_bin, ""), plot)
layout

<h3><span style="color:red">Set custom bins for date variables:</span></h3>

<span style="color:red">Convert dates into custom date ranges.</span> 

In [None]:
# convert date variables in survey
dates = [i for i in list(df.columns) if '#date' in i]
df[dates] = df[dates].apply(pd.to_datetime, errors='coerce')
df[dates] = df[dates].apply(lambda x: x.dt.date)
d_max = pd.Series(df[dates].to_numpy().flatten()).dropna().max()
d_min = pd.Series(df[dates].to_numpy().flatten()).dropna().min()

# define bins for all date variables
date_bins = []
date_slider = pn.widgets.DateRangeSlider(name='Date Range for Bin #1 ', start=d_min, end=d_max, value=(d_min,d_max))
next_bin2 = pn.widgets.Button(name='Next Bin', button_type='primary')
num = pn.widgets.Select(name='Select number of bins', options=[i+1 for i in range(5)])
        
next_bin2.on_click(d)
plot =  pn.pane.Matplotlib(dpi=80, tight=True)
plot_dates(df[dates], plot, date_slider.value[0])
layout2 = pn.Row(pn.Column(num, date_slider, next_bin2, ""), plot)
layout2

## 4. Generate Factor Contributions

Drop survey variables that are unnecessary for analysis, such as SuAVE #img, #name, #long, and #hidden variables.

In [6]:
# OPTIONAL: if survey has #img, #name, #long, or #hidden columns, remove them
df.drop(columns=['#img', '#name'], inplace=True)
remove_cols = [i for i in list(df.columns) if '#long' in i or '#hidden' in i]
df.drop(columns=remove_cols, inplace=True)

KeyError: "['#name'] not found in axis"

Please select a variable of interest to generate factor contributions.

In [11]:
# select a variable of interest to generate factor contributions
selector = pn.widgets.Select(name='Select a variable to investigate: ', options=list(df.columns))
selector

Please select a level of the selected variable to further investigate

In [14]:
# select a variable level to investigate
op = [selector.value + '_' + str(i) for i in df[selector.value].value_counts().index.to_list()]
level = pn.widgets.Select(name = 'Select a level to analyze (A):', options=op)
level

Run the following cell to generate factor contributions for the specified variable and level

In [15]:
# generate factor contributions: 
selected_var = selector.value
selected_level = level.value.split('_')[-1].strip()
factors = get_factors(df)
contributions = find_factor_contributions(df, selected_var, selected_level, factors)

IndexError: list index out of range

## 5. Analyze Factor Contributions

Run the following cell to analyze all factor contributions for the selected variable and level of interest.

In [None]:
# build output dataframe
output = build_df(contributions)

# build output table and widgets for display
search, checkbox, accuracy_slider, completeness_slider, contribution_slider, count_slider, tab = build_table(output)

# display output table and widgets
pn.Column(
    checkbox,
    pn.Row(search, accuracy_slider, completeness_slider, contribution_slider, count_slider),
    tab
)

Run the following cell to download the filtered table

In [None]:
# download the output table
filename = pn.widgets.TextInput(name='Enter Filename: ')

@pn.depends(name=filename, watch=True)
def file_download(name):
    download.filename = name

sio = StringIO()
final_output = tab.selected_dataframe[tab.selected_dataframe.columns[:-1]]
final_output.to_csv(sio)
sio.seek(0)
download = pn.widgets.FileDownload(sio, embed=True, filename='table.csv', button_type='primary')
pn.Column(filename, download)