In [1]:
import pandas as pd
import numpy as np

In [2]:
# `*_maj` splits. Paths are relative to the notebook's working directory; adjust them to your setup.
train, test, val = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../dataset/llm_annotations/train_maj.csv',
        '../../dataset/llm_annotations/test_maj.csv',
        '../../dataset/llm_annotations/val_maj.csv',
    )
)

# `*_llm_ann` splits (adjust these paths as well).
train_soft, test_soft, val_soft = (
    pd.read_csv(csv_path)
    for csv_path in (
        'llm_annotations/train_llm_ann.csv',
        'llm_annotations/test_llm_ann.csv',
        'llm_annotations/val_llm_ann.csv',
    )
)

# Author's note, reworded: the `*_maj` files hold the items that had no majority label in the
# human dataset (LLM annotations were still collected for them), while the `*_llm_ann` files
# hold the LLM annotations without those no-majority items. The two sets must be combined.

In [3]:
# Restrict `train` to the columns of `train_soft`, in the same order.
# NOTE(review): only `train` is restricted here; `test` and `val` keep all of their
# columns - confirm that this asymmetry is intended.
columns = list(train_soft.columns)
train = train.loc[:, columns]

In [None]:
# Stack each split on top of its `*_soft` counterpart and renumber the rows.
# Caution: the result is stored back under the same names, so running this cell twice
# without reloading the data appends the `*_soft` rows a second time.
train, test, val = (
    pd.concat([base_frame, soft_frame], ignore_index=True)
    for base_frame, soft_frame in (
        (train, train_soft),
        (test, test_soft),
        (val, val_soft),
    )
)

# Report the size of every combined split.
for combined in (train, test, val):
    print(combined.shape)

In [5]:
def multi_label(df, col1, col2, col3, col_name):
    """Gather three columns into a single list-valued column.

    For every row, the values found in ``col1``, ``col2`` and ``col3`` (in that
    order) are collected into a list and stored under ``col_name``.

    The frame is modified in place; the same object is returned so the call can
    be used on the right-hand side of an assignment.
    """
    source_columns = [col1, col2, col3]
    per_row_lists = df[source_columns].values.tolist()
    df[col_name] = per_row_lists
    return df

In [6]:
# Collect the three per-model columns into one list-valued column on every split.
train, test, val = (
    multi_label(split_frame, 'olmo', 'llama', 'mistral', 'llm_labels_noninstruct')
    for split_frame in (train, test, val)
)



In [8]:
from collections import Counter

# Function to determine the majority label
def get_majority_label(annotations):
    """Return the label that wins the vote, or ``'No Majority'``.

    :param annotations: iterable of hashable labels (one per annotator).
    :return: the most frequent label if it occurs more than once; otherwise
        the sentinel string ``'No Majority'``. An empty (or ``None``) input
        also yields the sentinel instead of raising ``IndexError``.

    When several labels share the highest count, the one that appears first
    in ``annotations`` is returned (``Counter.most_common`` keeps
    first-encountered order among equal counts).
    """
    tally = Counter(annotations)

    # Nothing was annotated: there is no label to elect.
    if not tally:
        return 'No Majority'

    top_label, top_votes = tally.most_common(1)[0]

    # A label needs at least two votes to count as the majority.
    return top_label if top_votes > 1 else 'No Majority'


In [9]:
# Derive the majority vote of the three LLM labels for every row of every split.
for split_frame in (train, test, val):
    split_frame['majority_llm_noninst'] = split_frame['llm_labels_noninstruct'].apply(get_majority_label)

In [11]:
# Class names; the position of a name in this list is its integer id.
labels = ['Pro', 'Against', 'Neutral', 'Not-about']
num_labels = len(labels)

# Lookup tables in both directions (id -> name and name -> id).
id2label = dict(enumerate(labels))
label2id = {name: index for index, name in id2label.items()}

In [None]:
import ast


def ensure_list(val):
    """Coerce a cell value into a list of labels.

    :param val: a list, a tuple, a string, or anything else (e.g. NaN/None).
    :return: always a ``list``:

        * a list is returned unchanged (same object);
        * a tuple is converted to a list;
        * a string holding a Python list/tuple literal (``"['Pro', 'Against']"``)
          is parsed and returned as a list;
        * any other string - one that is not a valid literal (``"Not-about"``)
          or that parses to something other than a list/tuple (``"3"``) - is
          treated as a single label and returned as ``[val]``;
        * every other type yields ``[]``.
    """
    if isinstance(val, list):
        return val
    if isinstance(val, tuple):
        return list(val)
    if not isinstance(val, str):
        # Missing values (NaN, None) and other non-sequence cells carry no labels.
        return []

    try:
        parsed = ast.literal_eval(val)
    except (ValueError, SyntaxError, TypeError):
        # Not a Python literal: the string itself is the label.
        return [val]

    if isinstance(parsed, list):
        return parsed
    if isinstance(parsed, tuple):
        return list(parsed)

    # literal_eval succeeded but did not produce a sequence (number, dict, ...):
    # fall back to the raw string so the caller still receives a list.
    return [val]




In [None]:
for split_frame in (train, test, val):
    # Make sure every cell of the label column is a real Python list.
    split_frame['llm_labels_noninstruct'] = split_frame['llm_labels_noninstruct'].apply(ensure_list)

    # Translate label names into integer ids; names missing from `label2id` are skipped.
    split_frame['label_indices_noninstruct'] = split_frame['llm_labels_noninstruct'].apply(
        lambda names: [label2id[name] for name in names if name in label2id]
    )

# Show the first rows of the training split as a sanity check.
print(train.loc[:, ['llm_labels_noninstruct', 'label_indices_noninstruct']].head())

In [None]:
# Union of every label that occurs anywhere in the training label lists.
unique_labels = set().union(*train['llm_labels_noninstruct'])
# The message below is Italian for "Unique labels in the DataFrame:".
print("Etichette uniche nel DataFrame:", unique_labels)
print(train['llm_labels_noninstruct'].head())

In [15]:
from scipy.special import softmax
import ast
import pandas as pd

def create_soft_labels_index(data, label_column, col_name, all_labels=None):
    """
    Add a column of per-class soft labels computed from the labels of each row.

    For every row, the occurrences of each class in ``label_column`` are counted,
    the count vector is passed through ``softmax`` and every probability is
    rounded to one decimal. Each class in ``all_labels`` always receives a value,
    in the order given by ``all_labels``.

    Note: because of the rounding, a row does not necessarily sum to exactly 1
    (three identical votes give 0.9 for that class and 0.0 for the others).

    :param data: pandas DataFrame containing the data; it is not modified.
    :param label_column: Name of the column holding, per row, the list of labels
        (or the string representation of such a list).
    :param col_name: Name of the column that receives the soft labels.
    :param all_labels: Ordered collection of all possible labels. If None, the
        fixed set ['Pro', 'Against', 'Neutral', 'Not-about'] is used.
    :return: A copy of ``data`` with ``col_name`` added; every cell is a list of
        plain Python floats (not NumPy scalars), so the lists serialise as
        ordinary numbers when the frame is written to CSV.
    """
    if all_labels is None:
        all_labels = ['Pro', 'Against', 'Neutral', 'Not-about']
    else:
        # Materialise once so that any iterable can be traversed for every row.
        all_labels = list(all_labels)

    soft_labels_list = []  # one list of probabilities per row

    for labels in data[label_column]:
        # Lists read back from CSV arrive as their string representation.
        if isinstance(labels, str):
            labels = ast.literal_eval(labels)

        if isinstance(labels, str):
            # A quoted single label counts as one vote.
            votes = [labels]
        else:
            try:
                votes = list(labels)
            except TypeError:
                # Missing or scalar cell (NaN, None, number): no votes at all,
                # which softmax turns into a uniform distribution.
                votes = []

        # Votes per class, in the order of all_labels (0 for classes never chosen).
        counts = [votes.count(label) for label in all_labels]

        soft_probs = softmax(counts)

        # Round first (same rounding as before), then convert the NumPy scalar
        # to a built-in float.
        soft_labels_list.append([float(round(prob, 1)) for prob in soft_probs])

    _data = data.copy()  # leave the caller's frame untouched
    _data[col_name] = soft_labels_list

    return _data


In [16]:
# Attach the rounded softmax soft labels to every split.
train, test, val = (
    create_soft_labels_index(split_frame, 'llm_labels_noninstruct', 'soft_labels_noninst')
    for split_frame in (train, test, val)
)

In [18]:
# Write every enriched split to disk without the index column.
# NOTE(review): the destinations are absolute, user-specific paths - adjust them before running elsewhere.
for split_frame, destination in (
    (train, '/home/benedetta.muscato/multiperspective/dataset/df_train_llm_soft.csv'),
    (test, '/home/benedetta.muscato/multiperspective/dataset/df_test_llm_soft.csv'),
    (val, '/home/benedetta.muscato/multiperspective/dataset/df_val_llm_soft.csv'),
):
    split_frame.to_csv(destination, index=False)

In [None]:
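# Disabled: optional filter that drops the rows whose majority label is the no-majority sentinel.
# The sentinel must match the string returned by get_majority_label ('No Majority', capital M).
# def clean_data(df, col):
#     df = df.loc[~(df[col] == 'No Majority')]
#     return df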


# train = clean_data(train, 'majority_llm_noninst')
# test = clean_data(test, 'majority_llm_noninst')
# val = clean_data(val, 'majority_llm_noninst')
