In [1]:
import pandas as pd
import numpy as np

In [2]:
# Splits stored in the `*_maj.csv` files (presumably majority-vote labels —
# TODO confirm), read relative to the notebook's working directory.
train, test, val = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../dataset/llm_annotations/train_maj.csv',
        '../../dataset/llm_annotations/test_maj.csv',
        '../../dataset/llm_annotations/val_maj.csv',
    )
)

# Splits stored in the `*_llm_ann.csv` files (presumably the per-model LLM
# annotations — TODO confirm).
# NOTE(review): these paths are absolute and user-specific, unlike the relative
# ones above; the notebook will not run on another machine without editing them.
train_soft, test_soft, val_soft = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/home/benedetta.muscato/multiperspective/dataset/llm_annotations/train_llm_ann.csv',
        '/home/benedetta.muscato/multiperspective/dataset/llm_annotations/test_llm_ann.csv',
        '/home/benedetta.muscato/multiperspective/dataset/llm_annotations/val_llm_ann.csv',
    )
)


In [3]:
# Keep in `train` only the columns that `train_soft` has, in the same order.
# NOTE(review): `test` and `val` are not restricted the same way, so after the
# concatenation below they end up with more columns than `train` — confirm
# this asymmetry is intended.
columns = train_soft.columns.tolist()
train = train.loc[:, columns]

In [4]:
# Stack each split on top of its `*_soft` counterpart (rows are appended and
# the index is renumbered from 0).
train, test, val = (
    pd.concat([base_df, soft_df], ignore_index=True)
    for base_df, soft_df in (
        (train, train_soft),
        (test, test_soft),
        (val, val_soft),
    )
)

# Report (rows, columns) of every combined split.
for split_df in (train, test, val):
    print(split_df.shape)

(734, 14)
(157, 27)
(157, 27)


In [5]:
def multi_label(df, col1, col2, col3, col_name):
    """Gather three columns into a single list-valued column.

    For every row, the values of `col1`, `col2` and `col3` (in that order) are
    collected into a Python list and stored under `col_name`.

    :param df: DataFrame to extend; it is modified in place.
    :param col1: name of the first source column.
    :param col2: name of the second source column.
    :param col3: name of the third source column.
    :param col_name: name of the column to create (or overwrite).
    :return: the same `df` object, with `col_name` added.
    """
    source_columns = [col1, col2, col3]
    row_lists = df[source_columns].values.tolist()
    df[col_name] = row_lists
    return df

In [6]:
# Collect the olmo / llama / mistral columns of every split into one
# list-valued column, `llm_labels_noninstruct`.
train, test, val = (
    multi_label(split_df, 'olmo', 'llama', 'mistral', 'llm_labels_noninstruct')
    for split_df in (train, test, val)
)



In [7]:
# Same as above for the `*_instruct` columns, stored as `llm_labels_instruct`.
train, test, val = (
    multi_label(split_df, 'olmo_instruct', 'llama_instruct', 'mistral_instruct', 'llm_labels_instruct')
    for split_df in (train, test, val)
)

In [8]:
from collections import Counter

# Function to determine the majority label
def get_majority_label(annotations):
    """Return the most frequent annotation, or 'No Majority' if none repeats.

    The most common value is returned only when it occurs more than once; with
    three annotations that is a strict majority. When several values share the
    top count, the one encountered first wins (Counter.most_common ordering).

    :param annotations: iterable of hashable labels; may be empty or None.
    :return: the most common label, or the string 'No Majority' when every
        label occurs at most once or there are no annotations at all.
    """
    count = Counter(annotations)
    # An empty (or None) input yields an empty Counter, for which
    # most_common(1) is [] — guard it instead of raising IndexError.
    if not count:
        return 'No Majority'
    top_label, top_count = count.most_common(1)[0]
    if top_count > 1:
        return top_label
    return 'No Majority'


In [9]:
# Majority label over the three non-instruct annotations, for every split.
for split_df in (train, test, val):
    split_df['majority_llm_noninst'] = split_df['llm_labels_noninstruct'].apply(get_majority_label)

In [10]:
# Majority label over the three instruct annotations, for every split.
for split_df in (train, test, val):
    split_df['majority_llm_inst'] = split_df['llm_labels_instruct'].apply(get_majority_label)

In [11]:
# Label names; a label's position in this list is its integer id.
labels = ['Pro', 'Against', 'Neutral', 'Not-about']
num_labels = len(labels)
# id -> name and name -> id lookup tables built from the list order.
id2label = dict(enumerate(labels))
label2id = {name: idx for idx, name in id2label.items()}

In [12]:
import ast


def ensure_list(val):
    """Coerce a cell value to a list.

    * A list is returned unchanged.
    * A string is parsed with ast.literal_eval. A parsed list is returned as
      is, a parsed tuple or set is converted to a list, and any other parsed
      literal (number, quoted string, dict, ...) is wrapped in a one-element
      list. A string that is not a valid Python literal (e.g. a bare label
      such as Not-about) is wrapped as-is in a one-element list.
    * Anything else (None, NaN, numbers, ...) yields an empty list.

    :param val: the value to normalise.
    :return: always a list.
    """
    if isinstance(val, list):
        return val
    if not isinstance(val, str):
        return []

    try:
        parsed = ast.literal_eval(val)
    except (ValueError, SyntaxError, TypeError):
        # Not a Python literal: treat the raw string as a single label.
        return [val]

    if isinstance(parsed, list):
        return parsed
    if isinstance(parsed, (tuple, set)):
        return list(parsed)
    # literal_eval succeeded but produced a scalar or dict; wrap it so that
    # callers iterating over the result never receive a non-list.
    return [parsed]


# For every split: make sure each instruct annotation cell is a Python list,
# then translate the label names to their integer ids. Names that are not
# keys of label2id are skipped.
for split_df in (train, test, val):
    split_df['llm_labels_instruct'] = split_df['llm_labels_instruct'].apply(ensure_list)
    split_df['label_indices_instruct'] = split_df['llm_labels_instruct'].apply(
        lambda names: [label2id[name] for name in names if name in label2id]
    )

# Print the results
print(train[['llm_labels_instruct', 'label_indices_instruct']].head())

               llm_labels_instruct label_indices_instruct
0        [Not-about, Against, Pro]              [3, 1, 0]
1    [Against, Against, Not-about]              [1, 1, 3]
2          [Against, Neutral, Pro]              [1, 2, 0]
3    [Against, Neutral, Not-about]              [1, 2, 3]
4  [Not-about, Neutral, Not-about]              [3, 2, 3]


In [13]:
# For every split: make sure each non-instruct annotation cell is a Python
# list, then translate the label names to their integer ids. Names that are
# not keys of label2id are skipped.
for split_df in (train, test, val):
    split_df['llm_labels_noninstruct'] = split_df['llm_labels_noninstruct'].apply(ensure_list)
    split_df['label_indices_noninstruct'] = split_df['llm_labels_noninstruct'].apply(
        lambda names: [label2id[name] for name in names if name in label2id]
    )

# Print the results
print(train[['llm_labels_noninstruct', 'label_indices_noninstruct']].head())

      llm_labels_noninstruct label_indices_noninstruct
0  [Pro, Against, Not-about]                 [0, 1, 3]
1  [Not-about, Against, Pro]                 [3, 1, 0]
2    [Against, Against, Pro]                 [1, 1, 0]
3  [Against, Not-about, Pro]                 [1, 3, 0]
4        [Pro, Neutral, Pro]                 [0, 2, 0]


In [14]:
# Sanity check: the set of distinct label names that occur anywhere in the
# non-instruct annotations of the training split.
unique_labels = {
    label
    for sublist in train['llm_labels_noninstruct']
    for label in sublist
}
print("Etichette uniche nel DataFrame:", unique_labels)
print(train['llm_labels_noninstruct'].head())

Etichette uniche nel DataFrame: {'Against', 'Pro', 'Neutral', 'Not-about'}
0    [Pro, Against, Not-about]
1    [Not-about, Against, Pro]
2      [Against, Against, Pro]
3    [Against, Not-about, Pro]
4          [Pro, Neutral, Pro]
Name: llm_labels_noninstruct, dtype: object


In [15]:

from scipy.special import softmax
import ast
import pandas as pd

def create_soft_labels_index(data, label_column,col_name, all_labels=None, ):
    """
    Build a soft-label vector for every row from the labels in `label_column`.

    For each row, the occurrences of every class in `all_labels` are counted
    and the count vector is passed through a softmax, so every class is
    assigned a probability (classes with zero votes included). Each
    probability is then rounded to one decimal place; because of that
    rounding the stored values do not necessarily sum to exactly 1.

    :param data: pandas DataFrame with the data. It is not modified; the
        function works on a copy.
    :param label_column: name of the column holding the labels of each row,
        either as a list or as the string representation of a list.
    :param col_name: name of the new column that receives the soft labels.
    :param all_labels: ordered list of all possible labels. If None, it
        defaults to ['Pro', 'Against', 'Neutral', 'Not-about'].
    :return: a copy of `data` with the extra column `col_name`; each cell is a
        list of built-in floats ordered like `all_labels`.
    """
    _data = data.copy()

    if all_labels is None:
        all_labels = ['Pro', 'Against', 'Neutral', 'Not-about']

    soft_labels_list = []

    for labels in _data[label_column]:
        # A cell may hold the list as text (e.g. after a CSV round trip).
        if isinstance(labels, str):
            labels = ast.literal_eval(labels)

        label_counts = pd.Series(labels).value_counts().to_dict()

        # Vote count per class, in the order given by all_labels.
        counts = [label_counts.get(label, 0) for label in all_labels]

        soft_probs = softmax(counts)

        # Convert to built-in float after rounding: with NumPy >= 2 a list of
        # np.float64 is written to CSV as "[np.float64(0.6), ...]", which
        # ast.literal_eval cannot read back.
        soft_labels = [float(round(prob, 1)) for prob in soft_probs]

        soft_labels_list.append(soft_labels)

    _data[col_name] = soft_labels_list

    return _data

In [16]:
# Soft-label vectors derived from the non-instruct annotations, per split.
train, test, val = (
    create_soft_labels_index(split_df, 'llm_labels_noninstruct', 'soft_labels_noninst')
    for split_df in (train, test, val)
)

In [17]:
# Soft-label vectors derived from the instruct annotations, per split.
train, test, val = (
    create_soft_labels_index(split_df, 'llm_labels_instruct', 'soft_labels_inst')
    for split_df in (train, test, val)
)

In [18]:
# Write every split to CSV without the index column.
# NOTE(review): the destination paths are absolute and user-specific.
for split_df, out_path in (
    (train, '/home/benedetta.muscato/multiperspective/dataset/df_train_llm_soft.csv'),
    (test, '/home/benedetta.muscato/multiperspective/dataset/df_test_llm_soft.csv'),
    (val, '/home/benedetta.muscato/multiperspective/dataset/df_val_llm_soft.csv'),
):
    split_df.to_csv(out_path, index=False)

In [None]:
# def clean_data(df, col):
#     df = df.loc[~(df[col] == 'No Majority')]
#     return df


# train = clean_data(train, 'majority_llm_noninst')
# test = clean_data(test, 'majority_llm_noninst')
# val = clean_data(val, 'majority_llm_noninst')
