In [4]:
def helloworld():
    """Print the greeting ``hello world`` to standard output."""
    greeting = 'hello world'
    print(greeting)

In [2]:
######################################
## Weighted Onehot Encoding options ##
######################################
# During runtime: no need to edit anything in this cell

##############
# Throughput #
##############
# TP1: weighted by 1 average per set of OHE, i.e. (a, b, c, d) -> (w*a, w*b, w*c, w*d)
# TP2: weighted by 1 linear variance per set of OHE, i.e. (a, b, c, d) -> (w*a, w*b, w*c, w*d)
# TP3 + k: weighted by 1 inverted k-power U-shaped variance per set of OHE, i.e. (a, b, c, d) -> (w*a, w*b, w*c, w*d)
# TP4 + k: weighted by 1 upright k-power U-shaped variance per set of OHE, i.e. (a, b, c, d) -> (w*a, w*b, w*c, w*d)
# (For TP3 & TP4, k=1 results in V-shaped variance, and as k>1 increases, sides will curve into U-shaped variance)

############
# Worktime #
############
# WT1: weighted by 1 average per set of OHE, i.e. (a, b, c, d) -> (w*a, w*b, w*c, w*d)
# WT2: weighted by 1 linear variance per set of OHE, i.e. (a, b, c, d) -> (w*a, w*b, w*c, w*d)

################
# PC agreement #
################
# PC1: weighted by 1 average per set of OHE, i.e. (a, b, c, d) -> (w*a, w*b, w*c, w*d)
# PC2: weighted by 1 linear variance per set of OHE, i.e. (a, b, c, d) -> (w*a, w*b, w*c, w*d)
# PC3: weighted by 1 PC agreement weight per annotation in each OHE, i.e. (a, b, c, d) -> (w1*a, w2*b, w3*c, w4*d)

#####################
# Input text length #
#####################
# TL1: weighted by 1 normalised number of characters per set of OHE, i.e. (a, b, c, d) -> (w*a, w*b, w*c, w*d)
# TL2: weighted by 1 normalised number of words per set of OHE, i.e. (a, b, c, d) -> (w*a, w*b, w*c, w*d)

def _scale_by_max(series):
    """Return *series* divided by its own maximum value."""
    return series / series.max()


def _k_power_u_shape(variance, k, upright=False):
    """Fold *variance* around its midpoint and bend it with the 1/k power.

    Values above the midpoint between the smallest and largest variance are
    mirrored back below it, then every value is raised to ``1/k``. With
    ``upright=True`` the result is flipped vertically while keeping the same
    minimum and maximum.

    Args:
        variance: pandas Series of per-row variances.
        k: curve exponent; ``k == 1`` leaves the folded values unchanged.
        upright: flip the curve vertically when true.

    Returns:
        A pandas Series with the same index as *variance*.
    """
    lowest = variance.min()
    highest = variance.max()
    # True division: floor division here would move the fold point away from
    # the real midpoint and let mirrored values go negative, which breaks the
    # fractional power below.
    midpoint = (highest - lowest) / 2 + lowest
    # Keep values that are not above the midpoint (NaN compares False and is
    # kept as NaN); mirror the rest around the midpoint.
    folded = variance.where(~(variance > midpoint), 2 * midpoint - variance)
    # Guard against a rounding error leaving a value a hair below zero.
    curved = folded.clip(lower=0) ** (1 / k)
    if upright:
        curved = -curved + curved.min() + curved.max()
    return curved


def set_OHE_pipeline_options(dataframe, option_TP, option_WT, option_PC, option_TL, option_k):
    """Build the weight tables used for weighted one-hot encoding.

    Each option selects how one group of columns of *dataframe* is turned into
    weights. A description of every selected option is printed, and the
    TP2/TP3/TP4 options additionally draw a plot.

    Args:
        dataframe: DataFrame holding the ``Throughput.1``..``Throughput.5``,
            ``WorkTime.1``..``WorkTime.5``, the four ``Answer.*.yes_pc_agree``
            columns and ``Input.num_characters`` / ``Input.num_words``.
        option_TP: ``'TP1'`` (row mean), ``'TP2'`` (row variance), ``'TP3'``
            (inverted k-power U-shaped variance) or ``'TP4'`` (upright k-power
            U-shaped variance). Any other value leaves the throughput columns
            untouched.
        option_WT: ``'WT1'`` (row mean) or ``'WT2'`` (row variance). Any other
            value leaves the worktime columns untouched.
        option_PC: ``'PC1'`` (row mean), ``'PC2'`` (row variance) or ``'PC3'``
            (per-annotation agreement columns used as they are). Any other
            value behaves like ``'PC3'`` without the printed description.
        option_TL: ``'TL1'`` (number of characters) or ``'TL2'`` (number of
            words); required.
        option_k: curve exponent, only used by ``'TP3'`` and ``'TP4'``.

    Returns:
        Tuple ``(df_throughput, df_worktime, df_agreement, df_textlength)`` of
        DataFrames copied from *dataframe*. Computed weight columns are divided
        by their own maximum.

    Raises:
        ValueError: if *option_TL* is neither ``'TL1'`` nor ``'TL2'``.
    """
    # Fail early with a clear message: without this check an unknown option_TL
    # would leave the text-length table undefined at the return statement.
    if option_TL not in ('TL1', 'TL2'):
        raise ValueError(
            "option_TL must be 'TL1' or 'TL2', got {!r}".format(option_TL)
        )

    ##############
    # Throughput #
    ##############
    df_throughput = dataframe[['Throughput.1', 'Throughput.2', 'Throughput.3', 'Throughput.4', 'Throughput.5']].copy()
    if option_TP == 'TP1':
        print("TP1: weighted by 1 average per set of OHE, i.e. (a, b, c, d) -> (w*a, w*b, w*c, w*d)")
        df_throughput['avg_throughput'] = _scale_by_max(df_throughput.mean(axis=1))
    elif option_TP == 'TP2':
        print("TP2: weighted by 1 linear variance per set of OHE, i.e. (a, b, c, d) -> (w*a, w*b, w*c, w*d)")
        df_throughput['var_throughput'] = df_throughput.var(axis=1)
        print("Plot below: old throughput (x-axis) vs new throughput (y-axis)")
        # The linear option does not reshape the variance, so this is the
        # identity line drawn before normalisation.
        plt.plot(df_throughput['var_throughput'], df_throughput['var_throughput'])
        df_throughput['var_throughput'] = _scale_by_max(df_throughput['var_throughput'])
    elif option_TP == 'TP3' or option_TP == 'TP4':
        upright = option_TP == 'TP4'
        if upright:
            print("TP4 + k: weighted by 1 upright k-power U-shaped variance per set of OHE, i.e. (a, b, c, d) -> (w*a, w*b, w*c, w*d)")
        else:
            print("TP3 + k: weighted by 1 inverted k-power U-shaped variance per set of OHE, i.e. (a, b, c, d) -> (w*a, w*b, w*c, w*d)")
        df_throughput['var_throughput'] = df_throughput.var(axis=1)
        df_throughput['var_throughput_u_shaped'] = _k_power_u_shape(
            df_throughput['var_throughput'], option_k, upright=upright
        )
        print("Plot below: old throughput (x-axis) vs new throughput (y-axis)")
        df_throughput.plot(x='var_throughput', y='var_throughput_u_shaped', style='o')
        df_throughput['var_throughput_u_shaped'] = _scale_by_max(df_throughput['var_throughput_u_shaped'])
        # Expose the reshaped weights under the usual 'var_throughput' name.
        df_throughput = df_throughput.assign(var_throughput=df_throughput['var_throughput_u_shaped'])

    ############
    # Worktime #
    ############
    df_worktime = dataframe[['WorkTime.1', 'WorkTime.2', 'WorkTime.3', 'WorkTime.4', 'WorkTime.5']].copy()
    if option_WT == 'WT1':
        print("WT1: weighted by 1 average per set of OHE, i.e. (a, b, c, d) -> (w*a, w*b, w*c, w*d)")
        df_worktime['avg_worktime'] = _scale_by_max(df_worktime.mean(axis=1))
    elif option_WT == 'WT2':
        print("WT2: weighted by 1 linear variance per set of OHE, i.e. (a, b, c, d) -> (w*a, w*b, w*c, w*d)")
        df_worktime['var_worktime'] = _scale_by_max(df_worktime.var(axis=1))

    ################
    # PC agreement #
    ################
    df_agreement = dataframe[['Answer.1gamemove.yes_pc_agree', 'Answer.2reasoning.yes_pc_agree', 'Answer.4shareinformation.yes_pc_agree', 'Answer.3rapport.yes_pc_agree']].copy()
    if option_PC == 'PC1':
        print("PC1: weighted by 1 average per set of OHE, i.e. (a, b, c, d) -> (w*a, w*b, w*c, w*d)")
        df_agreement['avg_agreement'] = _scale_by_max(df_agreement.mean(axis=1))
    elif option_PC == 'PC2':
        print("PC2: weighted by 1 linear variance per set of OHE, i.e. (a, b, c, d) -> (w*a, w*b, w*c, w*d)")
        df_agreement['var_agreement'] = _scale_by_max(df_agreement.var(axis=1))
    elif option_PC == 'PC3':
        print("PC3: weighted by 1 PC agreement weight per annotation in each OHE, i.e. (a, b, c, d) -> (w1*a, w2*b, w3*c, w4*d)")
        # Nothing to compute: the agreement columns are used as they are.

    #####################
    # Input text length #
    #####################
    if option_TL == 'TL1':
        print("TL1: weighted by 1 normalised number of characters per set of OHE, i.e. (a, b, c, d) -> (w*a, w*b, w*c, w*d)")
        length_column = 'Input.num_characters'
    else:
        print("TL2: weighted by 1 normalised number of words per set of OHE, i.e. (a, b, c, d) -> (w*a, w*b, w*c, w*d)")
        length_column = 'Input.num_words'
    df_textlength = dataframe[[length_column]].copy()
    df_textlength[length_column] = _scale_by_max(df_textlength[length_column])

    return df_throughput, df_worktime, df_agreement, df_textlength