In [1]:
def helloworld():
    """Print the greeting ``hello world`` to standard output."""
    greeting = 'hello world'
    print(greeting)

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
######################################
## Weighted Onehot Encoding options ##
######################################

##############
# Throughput #
##############
# TP1: weighted by 1 average per set of OHE, i.e. (a, b, c, d) -> (w*a, w*b, w*c, w*d)
# TP2: weighted by 1 linear variance per set of OHE, i.e. (a, b, c, d) -> (w*a, w*b, w*c, w*d)
# TP3 + k: weighted by 1 inverted k-power U-shaped variance per set of OHE, i.e. (a, b, c, d) -> (w*a, w*b, w*c, w*d)
# TP4 + k: weighted by 1 upright k-power U-shaped variance per set of OHE, i.e. (a, b, c, d) -> (w*a, w*b, w*c, w*d)
# (For TP3 & TP4, k=1 results in V-shaped variance, and as k>1 increases, sides will curve into U-shaped variance)

############
# Worktime #
############
# WT1: weighted by 1 average per set of OHE, i.e. (a, b, c, d) -> (w*a, w*b, w*c, w*d)
# WT2: weighted by 1 linear variance per set of OHE, i.e. (a, b, c, d) -> (w*a, w*b, w*c, w*d)

################
# PC agreement #
################
# PC1: weighted by 1 average per set of OHE, i.e. (a, b, c, d) -> (w*a, w*b, w*c, w*d)
# PC2: weighted by 1 linear variance per set of OHE, i.e. (a, b, c, d) -> (w*a, w*b, w*c, w*d)
# PC3: weighted by 1 PC agreement weight per annotation in each OHE, i.e. (a, b, c, d) -> (w1*a, w2*b, w3*c, w4*d)

#####################
# Input text length #
#####################
# TL1: weighted by 1 normalised number of characters per set of OHE, i.e. (a, b, c, d) -> (w*a, w*b, w*c, w*d)
# TL2: weighted by 1 normalised number of words per set of OHE, i.e. (a, b, c, d) -> (w*a, w*b, w*c, w*d)

###################
# Special Options #
###################
# SP1: weighted by average of TP1 and TP2 per set of OHE, i.e. (a, b, c, d) -> (w*a, w*b, w*c, w*d)
# SP2: weighted by average of WT1 and WT2 per set of OHE, i.e. (a, b, c, d) -> (w*a, w*b, w*c, w*d)
# SP3: weighted by average of PC1 and PC2 per set of OHE, i.e. (a, b, c, d) -> (w*a, w*b, w*c, w*d)
# RAND_UNI: weighted by 1 uniformly distributed random number between 0 and 1 per set of OHE, i.e. (a, b, c, d) -> (w*a, w*b, w*c, w*d)
# RAND_NORM: weighted by 1 normally distributed random number (mean 0, standard deviation 1; unbounded, may be negative) per set of OHE, i.e. (a, b, c, d) -> (w*a, w*b, w*c, w*d)

# Select one option from each group above (e.g. TP2, WT1, PC3, TL1, SP3) and pass them to the function
# set_OHE_pipeline_options. The input k (option_k) is only used by TP3 and TP4; it is ignored for every
# other throughput option. After editing the options, run the entire notebook to regenerate the results.


In [4]:
######################################
## Weighted Onehot Encoding options ##
######################################
# During runtime: no need to edit anything in this cell

def _scale_to_unit_max(series):
    """Divide ``series`` by its own maximum so that the largest value becomes 1."""
    return series / series.max()


def _fold_variance(variance, k, upright=False):
    """Fold per-row variances around their midpoint and apply a 1/k power.

    Values above the midpoint of ``[min, max]`` are mirrored back below it,
    giving an inverted V (k=1) or inverted U (k>1) shape. With
    ``upright=True`` the result is flipped so the shape opens upwards.

    Returns a NumPy array with one (not yet normalised) weight per row.
    """
    lowest = variance.min()
    highest = variance.max()
    # True division: floor division would move the fold point off-centre and
    # could make ``2 * midpoint - value`` negative, which turns into a complex
    # number (or NaN) once raised to a fractional power.
    midpoint = (highest - lowest) / 2 + lowest
    folded = np.where(variance > midpoint, 2 * midpoint - variance, variance)
    # Guards against a tiny negative rounding residue at the far end of the fold.
    folded = np.clip(folded, 0.0, None)
    curved = folded ** (1 / k)
    if upright:
        curved = (curved * -1) + curved.min() + curved.max()
    return curved


def _mean_variance_blend(raw, avg_col, var_col):
    """Return a copy of ``raw`` with normalised row mean, row variance and their average.

    Both statistics are computed from the columns of ``raw`` only, so the
    variance is not contaminated by the freshly added mean column.
    """
    blended = raw.copy()
    row_mean = raw.mean(axis=1)
    row_var = raw.var(axis=1)
    blended[avg_col] = _scale_to_unit_max(row_mean)
    blended[var_col] = _scale_to_unit_max(row_var)
    blended['average_avg_var'] = blended[[avg_col, var_col]].mean(axis=1)
    return blended


def set_OHE_pipeline_options(dataframe, option_TP, option_WT, option_PC, option_TL, option_SP, option_k):
    """Build the five weight tables used for weighted one-hot encoding.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the eight ``Throughput.*`` columns, the eight
        ``WorkTime.*`` columns, the four ``*_pc_agree`` columns and, depending
        on ``option_TL``, ``num_chars`` or ``num_words``.
    option_TP : str
        'TP1', 'TP2', 'TP3' or 'TP4'. Any other value leaves the raw
        throughput columns untouched.
    option_WT : str
        'WT1' or 'WT2'. Any other value leaves the raw work-time columns
        untouched.
    option_PC : str
        'PC1', 'PC2' or 'PC3'. 'PC3' (and any other value) leaves the raw
        agreement columns untouched.
    option_TL : str
        'TL1' (characters) or 'TL2' (words).
    option_SP : str
        'SP1', 'SP2', 'SP3', 'RAND_UNI' or 'RAND_NORM'.
    option_k : number
        Curvature for TP3/TP4 (weights are raised to the power ``1/k``);
        ignored for every other throughput option.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_throughput, df_worktime, df_agreement, df_textlength, df_special)``.
        For every computed option the weight is the last column of its frame.

    Raises
    ------
    ValueError
        If ``option_TL`` or ``option_SP`` is not one of the supported values.

    Notes
    -----
    TP2, TP3 and TP4 draw a matplotlib plot as a side effect. RAND_UNI and
    RAND_NORM use NumPy's global random state; RAND_NORM weights are standard
    normal draws and can therefore be negative or larger than 1.
    """
    throughput_cols = ['Throughput.1_x', 'Throughput.2_x', 'Throughput.3_x', 'Throughput.4_x', 'Throughput.1_y', 'Throughput.2_y', 'Throughput.3_y', 'Throughput.4_y']
    worktime_cols = ['WorkTime.1_x', 'WorkTime.2_x', 'WorkTime.3_x', 'WorkTime.4_x', 'WorkTime.1_y', 'WorkTime.2_y', 'WorkTime.3_y', 'WorkTime.4_y']
    agreement_cols = ['emo_disc_pc_agree', 'info_disc_pc_agree', 'emo_supp_pc_agree', 'info_supp_pc_agree']

    ##############
    # Throughput #
    ##############
    df_throughput = dataframe[throughput_cols].copy()
    if option_TP == 'TP1':
        print("TP1: weighted by 1 average per set of OHE, i.e. (a, b, c, d) -> (w*a, w*b, w*c, w*d)")
        df_throughput['avg_throughput'] = _scale_to_unit_max(df_throughput.mean(axis=1))
    elif option_TP == 'TP2':
        print("TP2: weighted by 1 linear variance per set of OHE, i.e. (a, b, c, d) -> (w*a, w*b, w*c, w*d)")
        df_throughput['var_throughput'] = df_throughput.var(axis=1)
        print("Plot below: old throughput (x-axis) vs new throughput (y-axis)")
        plt.plot(df_throughput['var_throughput'], df_throughput['var_throughput'])
        df_throughput['var_throughput'] = _scale_to_unit_max(df_throughput['var_throughput'])
    elif option_TP in ('TP3', 'TP4'):
        if option_TP == 'TP3':
            print("TP3 + k: weighted by 1 inverted k-power U-shaped variance per set of OHE, i.e. (a, b, c, d) -> (w*a, w*b, w*c, w*d)")
        else:
            print("TP4 + k: weighted by 1 upright k-power U-shaped variance per set of OHE, i.e. (a, b, c, d) -> (w*a, w*b, w*c, w*d)")
        df_throughput['var_throughput'] = df_throughput.var(axis=1)
        df_throughput['var_throughput_u_shaped'] = _fold_variance(
            df_throughput['var_throughput'], option_k, upright=(option_TP == 'TP4')
        )
        print("Plot below: old throughput (x-axis) vs new throughput (y-axis)")
        df_throughput.plot(x='var_throughput', y='var_throughput_u_shaped', style='o')
        df_throughput['var_throughput_u_shaped'] = _scale_to_unit_max(df_throughput['var_throughput_u_shaped'])
        # Mirror the final weight into 'var_throughput' as well (column order is kept).
        df_throughput['var_throughput'] = df_throughput['var_throughput_u_shaped']

    ############
    # Worktime #
    ############
    df_worktime = dataframe[worktime_cols].copy()
    if option_WT == 'WT1':
        print("WT1: weighted by 1 average per set of OHE, i.e. (a, b, c, d) -> (w*a, w*b, w*c, w*d)")
        df_worktime['avg_worktime'] = _scale_to_unit_max(df_worktime.mean(axis=1))
    elif option_WT == 'WT2':
        print("WT2: weighted by 1 linear variance per set of OHE, i.e. (a, b, c, d) -> (w*a, w*b, w*c, w*d)")
        df_worktime['var_worktime'] = _scale_to_unit_max(df_worktime.var(axis=1))

    ################
    # PC agreement #
    ################
    df_agreement = dataframe[agreement_cols].copy()
    if option_PC == 'PC1':
        print("PC1: weighted by 1 average per set of OHE, i.e. (a, b, c, d) -> (w*a, w*b, w*c, w*d)")
        df_agreement['avg_agreement'] = _scale_to_unit_max(df_agreement.mean(axis=1))
    elif option_PC == 'PC2':
        print("PC2: weighted by 1 linear variance per set of OHE, i.e. (a, b, c, d) -> (w*a, w*b, w*c, w*d)")
        df_agreement['var_agreement'] = _scale_to_unit_max(df_agreement.var(axis=1))
    elif option_PC == 'PC3':
        print("PC3: weighted by 1 PC agreement weight per annotation in each OHE, i.e. (a, b, c, d) -> (w1*a, w2*b, w3*c, w4*d)")
        # Do nothing, df_agreement is ready as it is

    #####################
    # Input text length #
    #####################
    if option_TL == 'TL1':
        print("TL1: weighted by 1 normalised number of characters per set of OHE, i.e. (a, b, c, d) -> (w*a, w*b, w*c, w*d)")
        df_textlength = dataframe[['num_chars']].copy()
        df_textlength['num_chars'] = _scale_to_unit_max(df_textlength['num_chars'])
    elif option_TL == 'TL2':
        print("TL2: weighted by 1 normalised number of words per set of OHE, i.e. (a, b, c, d) -> (w*a, w*b, w*c, w*d)")
        df_textlength = dataframe[['num_words']].copy()
        df_textlength['num_words'] = _scale_to_unit_max(df_textlength['num_words'])
    else:
        # Without this the function would fail later with an opaque UnboundLocalError.
        raise ValueError("option_TL must be 'TL1' or 'TL2', got %r" % (option_TL,))

    ###################
    # Special Options #
    ###################
    if option_SP == 'SP1':
        print("SP1: weighted by average of TP1 and TP2 per set of OHE, i.e. (a, b, c, d) -> (w*a, w*b, w*c, w*d)")
        df_special = _mean_variance_blend(dataframe[throughput_cols], 'avg_throughput', 'var_throughput')
    elif option_SP == 'SP2':
        print("SP2: weighted by average of WT1 and WT2 per set of OHE, i.e. (a, b, c, d) -> (w*a, w*b, w*c, w*d)")
        df_special = _mean_variance_blend(dataframe[worktime_cols], 'avg_worktime', 'var_worktime')
    elif option_SP == 'SP3':
        print("SP3: weighted by average of PC1 and PC2 per set of OHE, i.e. (a, b, c, d) -> (w*a, w*b, w*c, w*d)")
        df_special = _mean_variance_blend(dataframe[agreement_cols], 'avg_agreement', 'var_agreement')
    elif option_SP == 'RAND_UNI':
        print("RAND_UNI: weighted by 1 uniformly distributed random number between 0 to 1 per set of OHE, i.e. (a, b, c, d) -> (w*a, w*b, w*c, w*d)")
        df_special = pd.DataFrame(np.random.uniform(low=0.0, high=1.0, size=(dataframe.shape[0], 1)), columns=['special_uni'])
    elif option_SP == 'RAND_NORM':
        print("# RAND_NORM: weighted by 1 normally distributed random number between 0 to 1 per set of OHE, i.e. (a, b, c, d) -> (w*a, w*b, w*c, w*d)")
        df_special = pd.DataFrame(np.random.normal(0.0, 1.0, size=(dataframe.shape[0], 1)), columns=['special_norm'])
    else:
        # Without this the function would fail later with an opaque UnboundLocalError.
        raise ValueError(
            "option_SP must be one of 'SP1', 'SP2', 'SP3', 'RAND_UNI', 'RAND_NORM', got %r" % (option_SP,)
        )
    return df_throughput, df_worktime, df_agreement, df_textlength, df_special

In [5]:
def construct_weighted_dataframe(indices, df_throughput, df_worktime,
                                 df_agreement, df_textlength, df_special,
                                 pred_df):
    """Scale the rows of ``pred_df`` by the selected weight columns.

    For the throughput, work-time, agreement and special frames, the weight
    is the LAST column of the frame, picked positionally at ``indices``.
    For ``df_textlength`` the whole frame is taken at ``indices`` and its
    values are broadcast across the prediction columns.

    The throughput and work-time results are each built from a fresh copy of
    ``pred_df``. The agreement, text-length and special results are built on
    top of the throughput-weighted frame, so they carry the throughput
    weight as well.
    NOTE(review): confirm that stacking on the throughput weights is intended.

    Returns
    -------
    tuple
        ``(pred_df_throughput, pred_df_worktime, pred_df_agreement,
        pred_df_textlength, pred_df_special)``.
    """

    def _weights_from_last_column(weight_frame):
        # One weight per selected row, read from the frame's final column.
        last_name = weight_frame.columns[-1]
        return weight_frame[last_name].take(indices).values

    # Independent weightings, each starting from an untouched copy.
    weighted_by_throughput = pred_df.copy().mul(
        _weights_from_last_column(df_throughput), axis=0
    )
    weighted_by_worktime = pred_df.copy().mul(
        _weights_from_last_column(df_worktime), axis=0
    )

    # Everything below is layered on the throughput-weighted predictions.
    weighted_by_agreement = weighted_by_throughput.mul(
        _weights_from_last_column(df_agreement), axis=0
    )

    text_weights = df_textlength.take(indices).values
    weighted_by_textlength = weighted_by_throughput.mul(text_weights, axis=0)

    weighted_by_special = weighted_by_throughput.mul(
        _weights_from_last_column(df_special), axis=0
    )

    return (
        weighted_by_throughput,
        weighted_by_worktime,
        weighted_by_agreement,
        weighted_by_textlength,
        weighted_by_special,
    )
