In [1]:
import os
import pandas as pd
from typing import List, Dict
from rich import print
from collections import Counter

In [2]:
# Load the test-split result CSVs, one per LLM (file names end in
# gpt4o / command_r_plus / llama3_70b). The matching train-split loads are
# kept below as commented-out lines and are not executed.
# Note: the Command R+ file name uses "with_suggestions_df" (plural), while the
# GPT-4o and Llama 3 file names use "with_suggestion_df".
# train_gpt4o = pd.read_csv('../../data/result/train_with_suggestion_df_llm_gpt4o.csv')
test_gpt4o = pd.read_csv('../../data/result/test_with_suggestion_df_llm_gpt4o.csv')

# train_command = pd.read_csv('../../data/result/train_with_suggestions_df_llm_command_r_plus.csv')
test_command = pd.read_csv('../../data/result/test_with_suggestions_df_llm_command_r_plus.csv')

# train_llama3 = pd.read_csv('../../data/result/train_with_suggestion_df_llm_llama3_70b.csv')
test_llama3 = pd.read_csv('../../data/result/test_with_suggestion_df_llm_llama3_70b.csv')

In [3]:
def majority_vote(labels_dict: Dict, labels_list: List[str], break_tie: str):
    """
    Determines the majority vote from multiple annotators' labels for a given example.

    A label wins outright when it has strictly more votes than every other
    label. Any other outcome is a tie: two or more labels share the highest
    vote count (e.g. three annotators who all disagree, or a 2-2 split between
    four annotators), or there are no votes at all.

    Parameters:
    -----------
    labels_dict : dict
        A dictionary with annotator names as keys and their corresponding labels as values.
        Labels must be hashable.
    labels_list : list
        A list containing all possible labels. Only consulted when a tie has to be broken.
    break_tie : str
        The annotator whose label should be favored in case of a tie. If `break_tie` is not
        a key in `labels_dict`, its value itself is used as the label, provided it is in
        `labels_list`. The tie-breaker's label is returned as-is, even when it is not one
        of the labels sharing the top vote count.

    Returns:
    --------
    str
        The label that represents the majority vote.
    bool
        If there is a tie or not.

    Raises:
    -------
    ValueError
        If a tie occurs and the tie-breaking label is not in `labels_list`.
    """
    # (label, count) pairs ordered from most to least frequent.
    ranked = Counter(labels_dict.values()).most_common()

    # Every label that shares the highest count; empty when there are no votes.
    leaders = [label for label, count in ranked if count == ranked[0][1]]

    # Exactly one leader means a clear winner, so no tie-breaking is needed.
    if len(leaders) == 1:
        return leaders[0], False

    # Tie (or no votes): defer to the preferred annotator, or treat `break_tie`
    # itself as the label when it does not name an annotator.
    label = labels_dict.get(break_tie, break_tie)
    if label not in labels_list:
        raise ValueError(f"The label '{label}' is not in the list of possible labels.")

    return label, True

In [4]:
# Preview the first rows of the Command R+ frame.
test_command.head()

Unnamed: 0,column,text,topic_id,topic_name,suggestions_label
0,add_suggestions,اقتراحاتي للإضافة: everything is spectacular a...,2,,none
1,add_suggestions,اقتراحاتي للإضافة: thank you for everything,2,,none
2,add_suggestions,اقتراحاتي للإضافة: أتمنى ان يتم تطبيقها بشكل فعال,3,موازنة الجزء العملي مع الجزء النظري,needs to be added
3,add_suggestions,اقتراحاتي للإضافة: ان يتم طليق العملي بشكل مكثف,3,موازنة الجزء العملي مع الجزء النظري,needs to be added
4,add_suggestions,اقتراحاتي للإضافة: change the contents all,0,محتوى ومعلومات المقرر,needs to be added


In [5]:
# Label vocabulary: the distinct values of the GPT-4o frame's `suggestion_label` column.
# NOTE(review): labels that appear only in the Llama 3 or Command R+ frames are
# not included here, and majority_vote validates tie-break labels against this
# list — confirm that building it from GPT-4o alone is intended.
labels_list = list(test_gpt4o['suggestion_label'].unique())

In [6]:
# Show the label vocabulary (`print` is rich's print, imported in the first cell).
print(labels_list)

In [7]:
def get_label(label_cols: List[List[str]], break_tie: str, labels_list: List[str]):
    """
    Computes the majority-vote label for every example across three annotators.

    Parameters:
    -----------
    label_cols : list of list of str
        Exactly three equally long label sequences, in the order
        GPT, Llama 3, Command. Position `k` of every sequence must refer to the
        same example.
    break_tie : str
        Passed through to `majority_vote`; one of 'gpt', 'llama3', 'command'
        to prefer that annotator on a tie, or a label from `labels_list`.
    labels_list : list of str
        All possible labels, passed through to `majority_vote`.

    Returns:
    --------
    list
        The voted label for each example.
    list of bool
        For each example, whether the vote was a tie.

    Raises:
    -------
    ValueError
        If the label sequences differ in length (which would otherwise be
        silently truncated to the shortest one and misalign the votes), or if
        `majority_vote` rejects a tie-break label.
    """
    # Materialise each column once so lengths can be compared before zipping.
    columns = [list(col) for col in label_cols]
    lengths = [len(col) for col in columns]
    if len(set(lengths)) > 1:
        raise ValueError(f"All label columns must have the same length, got lengths {lengths}.")

    labels = []
    ties = []
    for gpt_label, llama_label, command_label in zip(*columns):
        labels_dict = {
            'gpt': gpt_label,
            'llama3': llama_label,
            'command': command_label,
        }

        label, tie = majority_vote(labels_dict, labels_list, break_tie=break_tie)

        labels.append(label)
        ties.append(tie)

    return labels, ties

In [8]:
# Pull the per-model label column out of each frame.
# The Command R+ frame names its column `suggestions_label` (plural), whereas
# the GPT-4o and Llama 3 frames use `suggestion_label`.
# NOTE(review): the vote below pairs these columns by position, so this assumes
# the three CSVs list the same examples in the same order — confirm.
test_gpt_labels = test_gpt4o['suggestion_label']
test_llama_labels = test_llama3['suggestion_label']
test_command_labels = test_command['suggestions_label']

In [9]:
# Vote per example across the three models; break_tie='gpt' makes the GPT-4o
# label the one returned whenever the vote is tied.
test_labels, test_ties = get_label([test_gpt_labels, test_llama_labels, test_command_labels], break_tie='gpt', labels_list=labels_list)

In [10]:
# test_labels, test_ties = get_label([test_gpt_labels, test_llama_labels, test_command_labels], break_tie='gpt', labels_list=labels_list)

In [11]:
# Preview the GPT-4o frame before the vote columns are added.
test_gpt4o.head()

Unnamed: 0,column,text,topic_id,topic_name,suggestion_label
0,add_suggestions,اقتراحاتي للإضافة: everything is spectacular a...,2,,none
1,add_suggestions,اقتراحاتي للإضافة: thank you for everything,2,,none
2,add_suggestions,اقتراحاتي للإضافة: أتمنى ان يتم تطبيقها بشكل فعال,3,موازنة الجزء العملي مع الجزء النظري,needs to be added
3,add_suggestions,اقتراحاتي للإضافة: ان يتم طليق العملي بشكل مكثف,3,موازنة الجزء العملي مع الجزء النظري,needs to be added
4,add_suggestions,اقتراحاتي للإضافة: change the contents all,0,محتوى ومعلومات المقرر,needs to be added


In [12]:
# Attach the voted label and the tie flag to the GPT-4o frame. This mutates
# test_gpt4o in place; its own `suggestion_label` column is left as it was.
test_gpt4o['majority_vote'] = test_labels
test_gpt4o['tie'] = test_ties

In [13]:
# Preview the GPT-4o frame with the new `majority_vote` and `tie` columns.
test_gpt4o.head()

Unnamed: 0,column,text,topic_id,topic_name,suggestion_label,majority_vote,tie
0,add_suggestions,اقتراحاتي للإضافة: everything is spectacular a...,2,,none,none,False
1,add_suggestions,اقتراحاتي للإضافة: thank you for everything,2,,none,none,False
2,add_suggestions,اقتراحاتي للإضافة: أتمنى ان يتم تطبيقها بشكل فعال,3,موازنة الجزء العملي مع الجزء النظري,needs to be added,needs to be added,False
3,add_suggestions,اقتراحاتي للإضافة: ان يتم طليق العملي بشكل مكثف,3,موازنة الجزء العملي مع الجزء النظري,needs to be added,needs to be added,False
4,add_suggestions,اقتراحاتي للإضافة: change the contents all,0,محتوى ومعلومات المقرر,needs to be added,needs to be added,False


In [14]:
# Number of examples whose vote was a tie (each True in test_ties counts as 1).
sum(test_ties)

0

In [15]:
# Destination for the GPT-4o frame augmented with the majority-vote columns.
test_save_path = '../../data/result/test_with_suggestions_df_majority_vote_gpt4o_preferred.csv'

# Write the file only when nothing is there yet, so an existing result is
# never overwritten; otherwise just report that the file is already present.
if not os.path.exists(test_save_path):
    print(f'Saving to {test_save_path}...')
    test_gpt4o.to_csv(test_save_path, index=False)
else:
    print('The path exists!')
