In [2]:
import pandas as pd
import numpy as np
from utils import *

In [7]:
# Load the LLM-annotated train / test / validation splits, in that order.
train_llm, test_llm, val_llm = map(
    pd.read_csv,
    (
        '../dataset/llm_annotations/train_llm_ann.csv',
        '../dataset/llm_annotations/test_llm_ann.csv',
        '../dataset/llm_annotations/val_llm_ann.csv',
    ),
)


In [None]:
from collections import Counter

def create_majority_label(df, annotator_cols=('olmo', 'mistral', 'llama')):
    """Add a ``majority_label`` column with the label most annotators agree on.

    For every row the values found in ``annotator_cols`` are counted. A label
    wins when it is shared by a strict majority of the annotators (with the
    default three columns: at least two of them); otherwise the row receives
    the string ``"No majority"``.

    Missing annotations (NaN / None) are treated as abstentions: they are never
    counted as votes, so a missing value can never become the majority label.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing one column per annotator. It is modified in place.
    annotator_cols : sequence of str, optional
        Names of the annotator columns. Defaults to the three columns the
        notebook uses (``'olmo'``, ``'mistral'``, ``'llama'``).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now carrying the ``majority_label`` column.
    """
    annotator_cols = list(annotator_cols)
    # A label needs strictly more than half of all annotators behind it.
    quorum = len(annotator_cols) / 2

    def get_majority(row):
        # Count only real labels; a missing annotation is not a vote.
        votes = Counter(
            label
            for label in (row[col] for col in annotator_cols)
            if not pd.isna(label)
        )
        if not votes:
            return "No majority"
        # Most frequent label together with the number of annotators behind it.
        label, count = votes.most_common(1)[0]
        return label if count > quorum else "No majority"

    df['majority_label'] = df.apply(get_majority, axis=1)
    return df

In [13]:
# Attach the majority-vote column to every LLM-annotated split.
train_llm, test_llm, val_llm = (
    create_majority_label(split_df) for split_df in (train_llm, test_llm, val_llm)
)

In [15]:
# Number of rows per split whose majority_label is "No majority" (0 when the label never occurs).
count_no_majority_train_llm, count_no_majority_test_llm, count_no_majority_val_llm = (
    split_df['majority_label'].value_counts().get("No majority", 0)
    for split_df in (train_llm, test_llm, val_llm)
)

In [None]:
# Display the "No majority" counts as (train, test, val).
(
    count_no_majority_train_llm,
    count_no_majority_test_llm,
    count_no_majority_val_llm,
)

In [8]:
# Load the train / test / validation splits from the *_gpt.csv files, in that order.
train, test, val = map(
    pd.read_csv,
    (
        '../dataset/train_gpt.csv',
        '../dataset/test_gpt.csv',
        '../dataset/val_gpt.csv',
    ),
)

In [9]:
def clean_data_new(df):
    """Return a copy of ``df`` without the rows labelled ``'link broken'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``majority_label`` column. It is left untouched.

    Returns
    -------
    pandas.DataFrame
        An independent copy holding every row whose ``majority_label`` is not
        exactly ``'link broken'``. The original index values are kept.
    """
    # NOTE(review): the comparison is exact and case-sensitive; another cell of this
    # notebook searches for the spelling 'Link-broken' -- confirm which one the data uses.
    keep = df['majority_label'] != 'link broken'
    # .copy() detaches the result from `df`, so later column assignments on the
    # returned frame do not raise SettingWithCopyWarning and cannot touch `df`.
    return df.loc[keep].copy()

def verify_filtering_new(df):
    """Print the number of rows and the number of columns of ``df``."""
    n_rows, n_cols = df.shape
    print(f"Numero di righe: {n_rows}")
    print(f"Numero di colonne: {n_cols}")

In [None]:
# Drop the 'link broken' rows from each LLM-annotated split, then print the resulting sizes
# in train / test / val order.
train_llm, test_llm, val_llm = map(clean_data_new, (train_llm, test_llm, val_llm))
for split_df in (train_llm, test_llm, val_llm):
    verify_filtering_new(split_df)

In [None]:
# Drop the 'link broken' rows from each split, then print the resulting sizes
# in train / test / val order.
train, test, val = map(clean_data_new, (train, test, val))
for split_df in (train, test, val):
    verify_filtering_new(split_df)

In [11]:
# Number of rows per split whose majority_label is "No majority" (0 when the label never occurs).
count_no_majority_train, count_no_majority_test, count_no_majority_val = (
    split_df['majority_label'].value_counts().get("No majority", 0)
    for split_df in (train, test, val)
)

In [None]:
# Display the "No majority" counts as (train, test, val).
(
    count_no_majority_train,
    count_no_majority_test,
    count_no_majority_val,
)

In [13]:
# Replace missing GPT summaries with empty strings; doc_cocat uses '' to detect "no summary".
for split_df in (train, test, val):
    split_df['gpt_summaries'] = split_df['gpt_summaries'].fillna('')

In [None]:
# Print the (rows, columns) shape of each split: train, test, val.
for split_df in (train, test, val):
    print(split_df.shape)

In [15]:
def doc_cocat(row):
    """Pick the document text for one row.

    Returns ``row['gpt_summaries']`` unless it is the empty string, in which
    case ``row['docCont']`` is returned instead.
    """
    summary = row['gpt_summaries']
    return summary if summary != '' else row['docCont']

# Build the 'doc' column row by row: the GPT summary when present, otherwise the raw content.
for split_df in (train, test, val):
    split_df['doc'] = split_df.apply(doc_cocat, axis=1)

In [16]:
def combine_columns(df):
    """Create the ``Input`` column as ``"<Query> <docTitle>. <doc>"``.

    The column is added to ``df`` in place and the same frame is returned.
    A missing value in any of the three source columns makes ``Input`` missing
    for that row, because concatenation with NaN yields NaN.
    """
    query, title, body = df['Query'], df['docTitle'], df['doc']
    df['Input'] = query + ' ' + title + '. ' + body
    return df

# Add the concatenated 'Input' column to every split.
train, test, val = (combine_columns(split_df) for split_df in (train, test, val))

In [17]:
# Flatten line breaks to single spaces in the text columns of every split.
# regex=False makes the literal replacement explicit: the default of `regex`
# differs between pandas versions, and on pandas 1.x a single-character
# pattern without an explicit `regex` emits a FutureWarning.
for text_col in ('doc', 'Input'):
    for split_df in (train, test, val):
        split_df[text_col] = split_df[text_col].str.replace('\n', ' ', regex=False)

In [None]:
# Label distribution of the training split.
train.loc[:, 'majority_label'].value_counts()

In [None]:
import os


def create_directory(directory_path):
    """Create ``directory_path`` (with any missing parents) unless the path already exists.

    A message reporting whether the directory was created or was already
    present is printed in either case.
    """
    if os.path.exists(directory_path):
        print(f"Directory '{directory_path}' già esistente.")
        return
    os.makedirs(directory_path)
    print(f"Directory '{directory_path}' creata.")

def save_dataset(train, test, val, output_directory, prefix):
    """Write the three splits as CSV files inside ``output_directory``.

    The directory is created first if needed. The files are named
    ``<prefix>_train.csv``, ``<prefix>_test.csv`` and ``<prefix>_val.csv`` and
    are written without the index column, in that order.
    """
    # Make sure the target directory is there before writing anything.
    create_directory(output_directory)

    for split_name, split_df in (('train', train), ('test', test), ('val', val)):
        destination = os.path.join(output_directory, f'{prefix}_{split_name}.csv')
        split_df.to_csv(destination, index=False)

    print(f"Dataset salvati nella directory '{output_directory}'.")


# Specifica il percorso della nuova directory e i nomi dei file
# Filename prefix and target directory for the exported splits.
prefix = 'multip'
output_directory = '../dataset/multip_new'

# Write <prefix>_train.csv, <prefix>_test.csv and <prefix>_val.csv into the target directory.
save_dataset(train, test, val, output_directory=output_directory, prefix=prefix)


In [None]:
# Second export of the same splits; the original note on this cell reads "with no majority instances".
# NOTE(review): no visible cell changes train/test/val between the previous export and this one,
# so both directories receive identical data -- confirm this is intended.
output_directory = '../dataset/multip'
save_dataset(train, test, val, output_directory, prefix='multip')

In [None]:
# Label distribution of the training split.
train.loc[:, 'majority_label'].value_counts()

In [25]:
# Columns to scan for the 'Link-broken' marker.
col_name = ["answer1", "answer2", "answer3", "majority_label"]

# True for every row in which at least one scanned column contains 'Link-broken';
# missing values count as "does not contain".
condition = (
    train[col_name]
    .apply(lambda column: column.str.contains('Link-broken', na=False))
    .any(axis=1)
)

# Keep only the rows that satisfy the condition.
link_broken_rows = train.loc[condition]

In [19]:
import os
# NOTE(review): `directory_path` is not assigned in any visible cell (it only appears as a
# parameter of create_directory); presumably it comes from `from utils import *` -- confirm,
# otherwise this cell raises NameError on a fresh kernel.
directory = directory_path
train_file, test_file, val_file = (
    os.path.join(directory, file_name)
    for file_name in ('train_multip_human.csv', 'test_multip_human.csv', 'val_multip_human.csv')
)

# Write each split to its CSV file without the index column.
for split_df, destination in zip((train, test, val), (train_file, test_file, val_file)):
    split_df.to_csv(destination, index=False)

In [3]:
# Read the train / validation / test CSV files, in that order.
# NOTE(review): the multip_*_path names are not assigned in any visible cell; presumably they
# come from `from utils import *` -- confirm.
train_df, val_df, test_df = map(
    pd.read_csv,
    (multip_train_path, multip_val_path, multip_test_path),
)

In [None]:
# Show the training rows in which at least one of the three answer columns is missing.
cols_of_interest = ['answer1', 'answer2', 'answer3']
rows_with_nan = train_df.loc[train_df[cols_of_interest].isnull().any(axis='columns')]
print(rows_with_nan)

In [None]:
# Label distribution of the test split.
test_df.loc[:, 'majority_label'].value_counts()

In [27]:

import os
# NOTE(review): `directory_path` is not assigned in any visible cell (it only appears as a
# parameter of create_directory); presumably it comes from `from utils import *` -- confirm,
# otherwise this cell raises NameError on a fresh kernel.
directory = directory_path
train_file, test_file, val_file = (
    os.path.join(directory, file_name)
    for file_name in ('train_llm.csv', 'test_llm.csv', 'val_llm.csv')
)

# Write each LLM-annotated split to its CSV file without the index column.
for split_df, destination in zip((train_llm, test_llm, val_llm), (train_file, test_file, val_file)):
    split_df.to_csv(destination, index=False)

In [None]:
import os
directory = directory_path
train_file = os.path.join(directory, 'train_llm_instr.csv')
test_file = os.path.join(directory, 'test_llm_instr.csv')
val_file = os.path.join(directory, 'val_llm_instr.csv')
