In [1]:
from minineedle import needle, smith, core
import os
import numpy as np
import pandas as pd
import itertools

In [2]:
# Root folder of the mapped CSV results. Every setting folder below lives at
# <root>/<context setting>/<model>/<consistency mode>/.
# NOTE(review): absolute, machine-specific path -- adjust when running elsewhere.
mapped_csv_root = '/Users/brain/Documents/GitHub/LLMs-topic-classification/results/mapped_csv/'

# strict consistency
path_no_context_strict_consistency_chatgpt = mapped_csv_root + 'no-context/ChatGPT/strict_consistency/'
path_no_context_strict_consistency_googlebard = mapped_csv_root + 'no-context/GoogleBard/strict_consistency/'
path_no_context_strict_consistency_googlegemini = mapped_csv_root + 'no-context/GoogleGemini/strict_consistency/'
path_context_strict_consistency_chatgpt = mapped_csv_root + 'context/ChatGPT/strict_consistency/'
path_context_strict_consistency_googlegemini = mapped_csv_root + 'context/GoogleGemini/strict_consistency/'

# soft consistency
path_no_context_soft_consistency_chatgpt = mapped_csv_root + 'no-context/ChatGPT/soft_consistency/'
path_no_context_soft_consistency_googlebard = mapped_csv_root + 'no-context/GoogleBard/soft_consistency/'
path_no_context_soft_consistency_googlegemini = mapped_csv_root + 'no-context/GoogleGemini/soft_consistency/'
path_context_soft_consistency_chatgpt = mapped_csv_root + 'context/ChatGPT/soft_consistency/'
path_context_soft_consistency_googlegemini = mapped_csv_root + 'context/GoogleGemini/soft_consistency/'

# Every setting folder that gets scored; the order here is the row order of
# the results table.
dataset_paths_list = [path_no_context_strict_consistency_chatgpt,
                      path_no_context_strict_consistency_googlebard,
                      path_no_context_strict_consistency_googlegemini,
                      path_context_strict_consistency_chatgpt,
                      path_context_strict_consistency_googlegemini,
                      path_no_context_soft_consistency_chatgpt,
                      path_no_context_soft_consistency_googlebard,
                      path_no_context_soft_consistency_googlegemini,
                      path_context_soft_consistency_chatgpt,
                      path_context_soft_consistency_googlegemini]

# File names looked up inside each setting folder.
dataset_names_list = ['Education_expenditure_and_indicators.csv',
                      'Health_expectancy.csv',
                      'Listed_monuments.csv',
                      'Livestock.csv',
                      'Milk_supply_and_dairy_production.csv',
                      'Mobility.csv',
                      'Plant_protection_products.csv',
                      'Population_dynamics.csv',
                      'Social_security.csv',
                      'Trade_and_industry.csv']

# All unordered pairs of the ten run labels ('run1' .. 'run10'), 45 in total.
# itertools.combinations yields them in the same order as the list that used
# to be written out by hand: ('run1', 'run2'), ('run1', 'run3'), ...,
# ('run9', 'run10').
run_labels = [f'run{run_number}' for run_number in range(1, 11)]
row_pairs = list(itertools.combinations(run_labels, 2))

In [3]:
def remove_id_column(df):
    """Return ``df`` without its 'ID' column.

    If the frame has no 'ID' column, the very same object is handed back.
    Otherwise a new frame lacking that column is returned and the input is
    left as it was.
    """
    has_id_column = 'ID' in df.columns
    return df.drop(columns=['ID']) if has_id_column else df

In [4]:
def set_index_column(df):
    """Turn the 'run_index' column of ``df`` into its index.

    The frame is modified in place (the column is moved out of the regular
    columns) and the same object is returned, so callers may either use the
    return value or keep working with the frame they passed in.
    Raises KeyError (from pandas) when there is no 'run_index' column, which
    includes calling this a second time on the same frame.
    """
    # inplace=True is deliberate here: the caller's frame itself is changed.
    df.set_index(keys='run_index', inplace=True)
    return df

In [5]:
def transform_row_to_sequence(df, row_pair):
    """Fetch two rows of ``df`` by index label and return them as plain lists.

    Parameters:
        df: frame whose index contains the labels in ``row_pair``.
        row_pair: indexable holding the two row labels at positions 0 and 1.

    Returns:
        A ``(seq1, seq2)`` tuple with the cell values of the first and the
        second row, each in column order.
    """
    first_label, second_label = row_pair[0], row_pair[1]
    return df.loc[first_label].tolist(), df.loc[second_label].tolist()

In [6]:
def remove_not_found_from_sequence(seq):
    """Return a new list with every 'NOT_FOUND' entry of ``seq`` left out.

    The remaining items keep their relative order and ``seq`` itself is not
    modified.
    """
    return [item for item in seq if item != 'NOT_FOUND']

In [7]:
def get_alignment_score(seq1, seq2, n_cols, *, match=1, miss=-0.5, gap=-2, missing_penalty=-2):
    """Score how well two runs agree, normalised by the number of columns.

    The two sequences are aligned with minineedle's Needleman-Wunsch
    implementation. On top of the alignment score, every column that is
    absent even from the longer sequence (i.e. ``n_cols`` minus the longer
    length, when positive) is charged ``missing_penalty``. The total is
    divided by ``n_cols``.

    Parameters:
        seq1, seq2: the two sequences to compare.
        n_cols: number of columns the sequences were taken from; used both
            for the extra penalty and as the normalising divisor.
        match, miss, gap: values passed to ``core.ScoreMatrix``. The defaults
            are the values this function has always used.
        missing_penalty: charge per column missing from both sequences.
            Defaults to -2, the value this function has always used.

    Returns:
        The normalised score. When both sequences are empty this is
        ``missing_penalty`` itself (every one of the ``n_cols`` columns is
        charged once, then divided by ``n_cols``), i.e. -2 by default.

    Raises:
        ZeroDivisionError: if ``n_cols`` is 0 while at least one sequence is
            non-empty.
    """
    # Nothing to align: answer directly, before building an aligner that
    # would never be used.
    if len(seq1) == 0 and len(seq2) == 0:
        return missing_penalty

    alignment: needle.NeedlemanWunsch[str] = needle.NeedlemanWunsch(seq1, seq2)
    alignment.change_matrix(core.ScoreMatrix(match=match, miss=miss, gap=gap))
    alignment.align()

    # Columns that are missing from the longer sequence are missing from both;
    # the alignment cannot see them, so they are penalised separately.
    longer_length = max(len(seq1), len(seq2))
    columns_missing_in_both = max(0, n_cols - longer_length)
    additional_penalty = missing_penalty * columns_missing_in_both

    return (alignment.get_score() + additional_penalty) / n_cols

In [8]:
# One record per (setting folder, dataset file). The records are collected in
# a plain list and turned into a DataFrame once at the end, instead of
# enlarging the frame row by row inside the loop.
result_records = []

for dataset_path in dataset_paths_list:
    for dataset_name in dataset_names_list:
        df = pd.read_csv(os.path.join(dataset_path, dataset_name))
        df = remove_id_column(df)
        df = set_index_column(df)
        # Number of columns left once 'ID' is dropped and 'run_index' has
        # become the index; used to normalise every pair score.
        n_cols = df.shape[1]

        dataset_alignment_score = 0

        for row_pair in row_pairs:
            seq1, seq2 = transform_row_to_sequence(df, row_pair)
            seq1 = remove_not_found_from_sequence(seq1)
            seq2 = remove_not_found_from_sequence(seq2)
            pair_alignment_score = get_alignment_score(seq1, seq2, n_cols)
            dataset_alignment_score += pair_alignment_score

        # Mean pair score over all run pairs of this dataset.
        result_records.append({'setting': dataset_path,
                               'dataset': dataset_name,
                               'alignment_score': dataset_alignment_score/len(row_pairs)})

# Passing `columns` keeps the column order fixed and still yields the three
# expected columns when no record was produced.
results_df = pd.DataFrame(result_records, columns=['setting', 'dataset', 'alignment_score'])

In [9]:
# Write the per-setting, per-dataset mean alignment scores to disk.
# NOTE(review): absolute, machine-specific output path.
results_df.to_csv(
    path_or_buf='/Users/brain/Documents/GitHub/LLMs-topic-classification/results/statistics/NeedlemanWunsch_score.csv'
)