<h1><span style="color:red">Converting a series of binary variables to a single #multi variable</span></h1>

Using this notebook, you can define groups of binary variables and organize each group into a multiple-response (#multi) variable, optionally dropping the original binary variables. You will have the option to process a survey file received from the current SuAVE application, or to import a local CSV file. The notebook will then let you create a new SuAVE survey with the updated survey file.



In [None]:
import pandas as pd
import panel as pn

pn.extension()

In [None]:
# Read the survey data into a DataFrame. 'test.csv' is a placeholder:
# replace it with the survey file retrieved from SuAVE.

df = pd.read_csv('test.csv')
# Preview the first rows to confirm the file was read as expected
df.head()

In [None]:
# this function organizes all values in a row into a #multi variable
# the options are:
# a) the binary variables have a non-empty value that needs to be included; the other value (such as "not selected") is empty
# b) the binary variables have specific non-empty conditions specified in condition_values

def summarize_row(row, condition_values=[True], any_not_nan=False, separator='|'):
    """
    join the names of the "selected" binary variables of one row
    parameters:
        * row: pandas series holding one row of binary variables (index = column names)
        * condition_values: values that mark a variable as selected (case b);
          only read, never modified, and ignored when any_not_nan is True
        * any_not_nan: when True, every non-null value counts as selected (case a)
        * separator: string placed between the column names
    returns a string with the names of the selected columns, in row order,
    joined by [separator]; an empty string when nothing is selected
    """
    # Series.iteritems() was removed in pandas 2.0; items() yields the same
    # (label, value) pairs and is available in older versions as well
    if any_not_nan:
        matching_columns = [col_name for col_name, val in row.items() if not pd.isnull(val)]
    else:
        matching_columns = [col_name for col_name, val in row.items() if val in condition_values]
    return separator.join(matching_columns)


# this function builds one #multi column by running summarize_row
# over every row of a selected set of binary variables

def unbinarize(df, columns, condition_values=[True], any_not_nan=False, separator='|'):
    """
    for every row: join the names of the selected binary columns with [separator]
    parameters:
        * df: dataframe holding the binary columns
        * columns: labels of the binary columns you want to unbinarize
        * condition_values: the values these binary columns take when they are selected. e.g. True or 1
        * any_not_nan: when True, every non-null cell counts as selected and condition_values is ignored
        * separator: the separator you want to use in the resulting list column
    returns the result of the row-wise apply (one joined string per row);
    df is only read, no column is added or removed here
    """
    def _join_selected(row):
        # forward the selection settings unchanged to the per-row helper
        return summarize_row(row, condition_values, any_not_nan, separator)

    binary_subset = df[columns]
    return binary_subset.apply(_join_selected, axis=1)

# this function calls unbinarize for all defined mappings

def unbinarize_mapping(df, map_columns, condition_values=[True], any_not_nan=False, separator='|'):
    """
    add one #multi column per mapping
    parameters:
        * df: dataframe holding the binary columns
        * map_columns: dict of {new column name: list of binary column names}
        * condition_values: values that mark a binary column as selected;
          the same list is used for every mapping
        * any_not_nan: when True, every non-null cell counts as selected
        * separator: the separator used inside each #multi value
    returns a copy of df with the new columns added; df itself is not modified
    """
    # Work on a copy: this function is used in df.pipe(...) chains, and writing
    # the new columns into the caller's frame would leave them behind in df
    result = df.copy()
    for new_col, dummy_cols in map_columns.items():
        result[new_col] = unbinarize(
            result, dummy_cols, condition_values=condition_values, any_not_nan=any_not_nan, separator=separator
        )
    return result

# this function deletes variables that have been integrated into #multi

def delete_dummies(df, map_columns):
    """
    drop the binary columns listed in the mappings
    parameters:
        * df: dataframe that still contains the binary columns
        * map_columns: dict of {new column name: list of binary column names}
    returns a new dataframe without the binary columns; df itself is not modified
    raises KeyError if a listed column is not present in df
    """
    to_drop = []
    for dummy_cols in map_columns.values():
        # a bare string is a single label, not a sequence of labels
        labels = [dummy_cols] if isinstance(dummy_cols, str) else dummy_cols
        to_drop.extend(labels)
    # A column may belong to several mappings; dropping per mapping would fail
    # on the second occurrence, so de-duplicate (keeping order) and drop once
    return df.drop(columns=list(dict.fromkeys(to_drop)))

In [None]:
# NOTE: Not needed as widgets in the next cell speed up this process.

# here we specify the mappings
# In the example cases, the three sets of binary variables will be combined into 3 #multi variables
# The values of the #multi variables will be formed from column names of the binary vars

# Eventually, this will be done via some widgets

remappings = {
    # 'New column name': ['Original', 'Column', 'List'],
    'Role#multi': ['Faculty', 'Researcher', 'Developer', 'Administrator', 'Post-Doc', 'Other Role'],
    'Participated in apps#multi': ['No participation in apps', 'Would participate in apps', 'As PI or Project Manager',
                                     'As Developer', 'As grad or post-doc', 'As UI Designer','As Outreach Specialist',
                                     'As Content Specialist', 'As User Support','On Advisory Board','Other Participation'],
    # the key is spelled with a space so that it matches the
    # 'Resources created#multi' name looked up by the checking cells further down
    'Resources created#multi': ['Computational tools', 'Data collections', 'Data analysis tools', 'Collaboration tools',
                                'Workflows', 'Interfaces to scientific instruments','Interfaces to sensor data',
                                'Educational tools','Frameworks or platforms','Citizen science resources','Other applications']

}

# Values of the binary variables that count as "selected", for case b) described above.
# One entry was written per mapping, but unbinarize_mapping hands the whole list
# to every mapping, so a cell matching any of these values is included.

condition_values = ['Selected', 'Selected','Have created' ]

In [None]:
# Left panel: heading plus a cross-selector offering every column of df
left_text = pn.Row("####Select Binary Variables", margin=(0,0,-15,270))
binary_selector = pn.widgets.CrossSelector(options=list(df.columns), width=630)
left_panel = pn.Column(left_text, binary_selector, css_classes=['widget-box'], margin=(0,30,0,0))

# Right panel: condition drop-down (created without options here; remap below
# assigns condition.options once binary variables are selected)
condition_text = pn.Row("####Select Condition", margin=(0,0,-20, 40))
condition = pn.widgets.Select(width=200, margin=(11,10,10,10))
condition_select = pn.Column(condition_text, condition, margin=(5,0,0,0))

# Text box for the new variable's name; remap appends '#multi' to whatever is typed
multi_text = pn.Row("####Specify #multi Name", margin=(0,0,-20, 25))
multi_name = pn.widgets.TextInput(placeholder='Enter Name', width=200, margin=(11,10,10,10))
name_input = pn.Column(multi_text, multi_name, margin=(-5,0,0,0))
right_panel = pn.Column(condition_select, name_input, css_classes=['widget-box'])

# Toggle used as the "create" trigger; remap sets it back to False on every call
apply_button = pn.widgets.Toggle(name='Create Mapping', margin=(30,0,10,10), width=200)

# Remappings display: markdown pane that remap extends by one line per mapping
remap_text = pn.pane.Markdown('#### Remappings ', width=950)

# Start from empty containers (this replaces any remappings / condition_values
# assigned earlier in the notebook); remap fills both as mappings are created
remappings = {}
condition_values = []
@pn.depends(binary_selector.param.value, apply_button.param.value)
def remap(b_var, apply):
    """
    remap reacts to changes of the binary-variable selection and of
    the 'Create Mapping' toggle.

    It refreshes the condition drop-down from the values found in the
    selected columns and, when the toggle was on and a name has been
    entered, records the mapping in remappings, appends the chosen
    condition to condition_values and adds a line to the display.

    :param b_var: list of column names currently selected
    :param apply: state of the toggle when the callback fired
    :returns: the markdown pane listing the mappings created so far
    """
    # Release the toggle right away; `apply` still holds the state that
    # triggered this call (presumably so the Toggle acts like a push button)
    apply_button.value = False

    # Nothing selected: leave the drop-down and the mappings as they are
    if b_var == []:
        return remap_text

    # Offer every distinct non-null value of the selected columns as a
    # condition, preceded by the 'Not Empty' entry
    distinct_values = pd.unique(df[b_var].values.ravel('K'))
    non_null_values = [candidate for candidate in distinct_values if not pd.isnull(candidate)]
    condition.options = ['Not Empty'] + non_null_values

    # A mapping is only created on an apply request with a non-empty name
    if multi_name.value == '' or not apply:
        return remap_text

    # Record the mapping and the condition selected for it
    new_multi = multi_name.value + '#multi'
    remappings[new_multi] = b_var
    condition_values.append(condition.value)

    # Clear the selection and the name for the next mapping
    binary_selector.value = []
    multi_name.value = ''

    # Append the new mapping to the markdown list
    mapping_line = f"- **{new_multi}** &rarr; {remappings[new_multi]}"
    remap_text.object = '\n'.join([remap_text.object, mapping_line])
    return remap_text

# Display widgets: selector on the left, condition/name inputs with the
# 'Create Mapping' toggle on the right, and the output of remap underneath
widgets = pn.Row(left_panel, pn.Column(right_panel, apply_button))
full_display = pn.Column(widgets, remap)
# Last expression of the cell, so the notebook renders the layout
full_display

In [None]:
# Example for case b) described above: a binary variable is included when its
# value is one of condition_values.
# Inputs: remappings (which columns to combine) and condition_values.
# Step 1 adds the #multi columns, step 2 drops the binary columns they replace.

df_new = (
    df
    .pipe(unbinarize_mapping, remappings, condition_values=condition_values, separator='|')
    .pipe(delete_dummies, remappings)
)

df_new.head(30)

In [None]:
# just checking the new #multi variables
# The new columns are named after the keys of remappings, so selecting by those
# keys works for the example mappings and for mappings created with the widgets
df_new[list(remappings)]

In [None]:
# checking the input binary variables for one of the remappings
# (shows, from the original df, the binary columns registered under the
# 'Resources created#multi' key of remappings; raises KeyError when
# remappings has no entry with exactly that name)
df[remappings['Resources created#multi']]

In [None]:
# Example for case a) described above: a binary variable is included whenever
# its cell is not empty.
# Inputs: remappings (which columns to combine) and any_not_nan=True.
# Step 1 adds the #multi columns, step 2 drops the binary columns they replace.

df_new2 = (
    df
    .pipe(unbinarize_mapping, remappings, any_not_nan=True, separator='|')
    .pipe(delete_dummies, remappings)
)

df_new2.head(30)

In [None]:
def slider(df):
    """
    slider creates an interactive display of a
    data frame: two sliders choose the top-left corner
    of a window of up to 5 rows and 10 columns.
    
    :param df: data frame
    :returns: panel Column holding the two sliders and the windowed view
    """
    
    # Row Selector widget: upper bound is the last row position
    row_selection = pn.widgets.IntSlider(name='Navigate Rows', width=350, 
                                         margin=(0,50,-15,0), end=len(df)-1)

    # Column Selector widget: upper bound is the last column position.
    # An end of len(df.columns) would allow a position past the last column,
    # where the view has no columns; max() keeps end at 0 for a frame without columns.
    col_selection = pn.widgets.IntSlider(name='Navigate Columns', width=350, 
                                         margin=(0,0,5,0), end=max(len(df.columns)-1, 0))
    
    @pn.depends(row_selection.param.value, col_selection.param.value)
    def navigate_data(row=0, col=0):
        # iloc slices are clipped at the frame edges, so a window that
        # starts near the end simply contains fewer rows/columns
        return df.iloc[row:row+5, col:col+10]
    
    sliders = pn.Row(row_selection, col_selection, margin=(0,0,0,10))
    full_widget = pn.Column(sliders, navigate_data)
    return full_widget

# Browse df_new2 with the row/column sliders built by slider()
slider(df_new2)

In [None]:
# now write this back, or upload to SuAVE.
# index=False (the documented boolean, rather than None) keeps the
# DataFrame's row index out of the CSV file

# df_new.to_csv('test_multi.csv', index=False)
#  or
df_new2.to_csv('test_2multi.csv', index=False)