In [17]:
import os
import json
from sklearn.metrics import multilabel_confusion_matrix, accuracy_score as accuracy

import numpy as np
import pandas as pd

def read_json(path):
    """Load and return the JSON document stored at ``path``.

    The file is opened as UTF-8 text and parsed with ``json.load``; whatever
    object the document decodes to (list, dict, ...) is returned unchanged.
    """
    with open(path, mode='r', encoding="utf-8") as handle:
        return json.load(handle)

def write_json(data, path):
    """Serialize ``data`` to ``path`` as indented, UTF-8 encoded JSON.

    Any missing parent directories of ``path`` are created first.

    Args:
        data: JSON-serializable object.
        path: Destination file path. May be a bare file name, in which case
            the file is written to the current working directory.
    """
    parent_dir = os.path.dirname(path)
    # dirname() is '' for a bare file name and os.makedirs('') raises
    # FileNotFoundError, so only create a directory when there is one.
    # exist_ok avoids a race between the existence check and the creation.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(path, 'w', encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        json.dump(data, f, indent=4, ensure_ascii=False)

In [26]:
def get_all_tests(folder):
    """Concatenate the test items of tasks 1-10 found under ``folder``.

    Reads ``<folder>/task<k>/test_1.json`` for k = 1..10, in task order, and
    extends one flat list with the contents of each file.

    Args:
        folder: Directory holding the ``task1`` ... ``task10`` sub-folders.
            A trailing path separator is accepted but not required.

    Returns:
        A single list with the items of all ten files, in task order.
        (Assumes each file decodes to a JSON array -- TODO confirm.)
    """
    tests = []
    # `task_id` rather than `id`, so the builtin id() is not shadowed.
    for task_id in range(1, 11):
        # os.path.join gives the same path whether or not `folder` ends
        # with a separator; plain concatenation required the trailing slash.
        test_path = os.path.join(folder, f"task{task_id}", "test_1.json")
        tests.extend(read_json(test_path))
    return tests

In [None]:
def calculate_accuracy(
    y_pred_folder,
    gt_root="/Users/sefika/phd_projects/CRE_PTM/data/tacred/data/llama_format_data/test",
    run_ids=range(1, 6),
):
    """Compute the per-run accuracy (in percent) of cleaned predictions.

    For every run id, the ground-truth ``relation`` labels of all ten tasks
    are loaded from ``<gt_root>/run_<id>/`` and compared with the ``clean``
    predictions stored in
    ``<y_pred_folder>/model<id>/task_10_seen_task.json``.

    Args:
        y_pred_folder: Folder containing one ``model<id>`` sub-folder per run.
        gt_root: Root folder of the ground-truth test data. Defaults to the
            location that was previously hard-coded in this function.
        run_ids: Iterable of run ids to evaluate (default: 1..5).

    Returns:
        List with one accuracy value (0-100) per run, in ``run_ids`` order.

    Raises:
        ValueError: If, for some run, no (label, prediction) pair remains
            after dropping pairs that contain ``None``.

    NOTE(review): pairs whose prediction is ``None`` are excluded from the
    denominator rather than counted as errors -- confirm this is intended.
    """
    import warnings  # local import keeps this cell self-contained

    acc_list = []
    for run_id in run_ids:
        gt_folder = f"{gt_root}/run_{run_id}/"
        y_true = [item['relation'] for item in get_all_tests(gt_folder)]
        y_pred_path = f"{y_pred_folder}/model{run_id}/task_10_seen_task.json"
        y_pred = [item['clean'] for item in read_json(y_pred_path)]
        print(f"Run {run_id} - y_true: {len(y_true)}, y_pred: {len(y_pred)}")

        if len(y_true) != len(y_pred):
            # zip() below truncates to the shorter list without complaint,
            # which would hide misaligned label / prediction files.
            warnings.warn(
                f"Run {run_id}: {len(y_true)} labels vs {len(y_pred)} predictions; "
                "only the leading pairs are compared",
                stacklevel=2,
            )

        # Keep only pairs where both the label and the prediction are present.
        filtered = [(yt, yp) for yt, yp in zip(y_true, y_pred) if yt is not None and yp is not None]
        if not filtered:
            # zip(*[]) would otherwise fail with an opaque unpacking error.
            raise ValueError(
                f"Run {run_id}: no (label, prediction) pairs left after dropping None values"
            )
        y_true_filtered, y_pred_filtered = zip(*filtered)

        acc_list.append(accuracy(y_true_filtered, y_pred_filtered) * 100)
    return acc_list

In [100]:
import numpy as np
np.random.seed(42)
def permutation_test(group1, group2, num_permutations=10000, alternative='two-sided', seed=None):
    """Monte Carlo permutation test for a difference in group means.

    The pooled observations are shuffled ``num_permutations`` times; after
    each shuffle the first ``len(group1)`` values form the new group 1 and the
    rest the new group 2. The p-value is the fraction of shuffles whose mean
    difference is at least as extreme as the observed one.

    Args:
        group1, group2: Non-empty sequences of numbers.
        num_permutations: Number of random shuffles (must be >= 1).
        alternative: ``'two-sided'`` compares absolute differences,
            ``'greater'`` counts ``new_diff >= observed``, ``'less'`` counts
            ``new_diff <= observed``.
        seed: Optional seed for a private ``RandomState``. When ``None``
            (default) NumPy's global RNG is used, so the result depends on
            the global seed and on every earlier draw from it.

    Returns:
        ``(observed_diff, p_value)`` where ``observed_diff`` is
        ``mean(group1) - mean(group2)``.

    Raises:
        ValueError: On an unknown ``alternative``, ``num_permutations < 1``
            or an empty group.

    NOTE(review): the estimate is ``count / num_permutations`` and can be
    exactly 0; the ``(count + 1) / (num_permutations + 1)`` correction is not
    applied so that previously reported values stay unchanged.
    """
    # An unknown alternative used to fall through every branch and silently
    # report p = 0, i.e. a spuriously "significant" result.
    if alternative not in ('two-sided', 'greater', 'less'):
        raise ValueError(
            f"alternative must be 'two-sided', 'greater' or 'less', got {alternative!r}"
        )
    if num_permutations < 1:
        raise ValueError("num_permutations must be at least 1")

    group1 = np.asarray(group1, dtype=float)
    group2 = np.asarray(group2, dtype=float)
    if group1.size == 0 or group2.size == 0:
        raise ValueError("both groups must contain at least one observation")

    # A private generator makes a single call reproducible regardless of the
    # order in which notebook cells were executed.
    shuffle = np.random.shuffle if seed is None else np.random.RandomState(seed).shuffle

    observed_diff = np.mean(group1) - np.mean(group2)
    # concatenate() copies, so shuffling never touches the caller's data.
    combined = np.concatenate([group1, group2])
    n_first = len(group1)
    count = 0

    for _ in range(num_permutations):
        shuffle(combined)
        new_diff = np.mean(combined[:n_first]) - np.mean(combined[n_first:])

        if alternative == 'two-sided':
            count += abs(new_diff) >= abs(observed_diff)
        elif alternative == 'greater':
            count += new_diff >= observed_diff
        else:  # 'less'
            count += new_diff <= observed_diff

    p_value = count / num_permutations
    return observed_diff, p_value


In [52]:
# Load the Mistral `seen_metric.json` results for each replay-memory setting
# (m_0 = no replay, then memory sizes 5, 10 and 15).
# NOTE(review): absolute, machine-specific paths; the directory is spelled
# `mistal_results` -- presumably matching the folder on disk, confirm before renaming.
no_replay_mistral = read_json("/Users/sefika/phd_projects/CRE_PTM copy/src/test/results_memory_cl_tacred/mistal_results/m_0/seen_metric.json")
memory_5_mistral = read_json("/Users/sefika/phd_projects/CRE_PTM copy/src/test/results_memory_cl_tacred/mistal_results/m_5/seen_metric.json")
memory_10_mistral = read_json("/Users/sefika/phd_projects/CRE_PTM copy/src/test/results_memory_cl_tacred/mistal_results/m_10/seen_metric.json")
memory_15_mistral = read_json("/Users/sefika/phd_projects/CRE_PTM copy/src/test/results_memory_cl_tacred/mistal_results/m_15/seen_metric.json")

In [53]:
# Wrap each loaded Mistral metric payload in a DataFrame so it can be
# filtered by its 'run' and 'task' columns in the cells below.
(
    no_replay_mistral_df,
    memory_5_mistral_df,
    memory_10_mistral_df,
    memory_15_mistral_df,
) = map(
    pd.DataFrame,
    (no_replay_mistral, memory_5_mistral, memory_10_mistral, memory_15_mistral),
)

In [49]:
# 'acc' of the task-10 row of each run 1..5 (no replay), scaled by 100.
# Series.item() raises unless exactly one row matches a given run.
no_replay_mistral_samples = [
    no_replay_mistral_df.loc[
        (no_replay_mistral_df['run'] == run_id) & (no_replay_mistral_df['task'] == 10),
        'acc',
    ].item() * 100
    for run_id in range(1, 6)
]


In [None]:
# Echo the list so the notebook renders the no-replay Mistral sample values.
no_replay_mistral_samples


In [47]:
# 'acc' of the task-10 row of each run 1..5 (memory size 5), scaled by 100.
# Series.item() raises unless exactly one row matches a given run.
memory_5_mistral_samples = [
    memory_5_mistral_df.loc[
        (memory_5_mistral_df['run'] == run_id) & (memory_5_mistral_df['task'] == 10),
        'acc',
    ].item() * 100
    for run_id in range(1, 6)
]


In [46]:
# 'acc' of the task-10 row of each run 1..5 (memory size 10), scaled by 100.
# Series.item() raises unless exactly one row matches a given run.
memory_10_mistral_samples = [
    memory_10_mistral_df.loc[
        (memory_10_mistral_df['run'] == run_id) & (memory_10_mistral_df['task'] == 10),
        'acc',
    ].item() * 100
    for run_id in range(1, 6)
]


In [45]:
# 'acc' of the task-10 row of each run 1..5 (memory size 15), scaled by 100.
# Series.item() raises unless exactly one row matches a given run.
memory_15_mistral_samples = [
    memory_15_mistral_df.loc[
        (memory_15_mistral_df['run'] == run_id) & (memory_15_mistral_df['task'] == 10),
        'acc',
    ].item() * 100
    for run_id in range(1, 6)
]


In [44]:
# Quick two-sided check: no replay vs. memory size 15 (Mistral).
# NOTE(review): permutation_test returns count / num_permutations, so with
# num_permutations=10 the p-value can only be a multiple of 0.1. The call
# also advances NumPy's global RNG, which affects later permutation tests.
diff, p_val = permutation_test(
    no_replay_mistral_samples,
    memory_15_mistral_samples,
    num_permutations=10,
    alternative='two-sided'
)
print(f"Observed difference in means: {diff:.3f}")
print(f"P-value: {p_val:.4f}")


Observed difference in means: 0.016
P-value: 1.0000


## Flan T5

In [43]:
# "acc" of the first entry of each model's task-10 result file (memory size
# 10), scaled by 100. A comprehension avoids leaking the scratch names
# `model_id`, `file` and `acc` into the notebook namespace.
memory_10_t5_samples = [
    read_json(
        f"/Users/sefika/phd_projects/CRE_PTM copy/src/test/results_memory_cl_tacred/memory_experiments/m10/model{model_id}/task_10_seen_task_result.json"
    )[0]["acc"] * 100
    for model_id in range(1, 6)
]

In [42]:
# "acc" of the first entry of each model's task-10 result file (memory size
# 5), scaled by 100. A comprehension avoids leaking the scratch names
# `model_id`, `file` and `acc` into the notebook namespace.
memory_5_t5_samples = [
    read_json(
        f"/Users/sefika/phd_projects/CRE_PTM copy/src/test/results_memory_cl_tacred/memory_experiments/m5/model{model_id}/task_10_seen_task_result.json"
    )[0]["acc"] * 100
    for model_id in range(1, 6)
]

In [41]:
# "acc" of the first entry of each model's task-10 result file (memory size
# 15), scaled by 100. A comprehension avoids leaking the scratch names
# `model_id`, `file` and `acc` into the notebook namespace.
memory_15_t5_samples = [
    read_json(
        f"/Users/sefika/phd_projects/CRE_PTM copy/src/test/results_memory_cl_tacred/memory_experiments/m15/model{model_id}/task_10_seen_task_result.json"
    )[0]["acc"] * 100
    for model_id in range(1, 6)
]

In [40]:
# "acc" of the first entry of each model's task-10 result file (no-replay
# fine-tuning), scaled by 100. A comprehension avoids leaking the scratch
# names `model_id`, `file` and `acc` into the notebook namespace.
no_replay_t5_samples = [
    read_json(
        f"/Users/sefika/phd_projects/CRE_PTM copy/src/test/results_memory_cl_tacred/fine-tuning/relatedwork/model{model_id}/task_10_seen_task_result.json"
    )[0]["acc"] * 100
    for model_id in range(1, 6)
]

## Llama2 - 7B

In [None]:

# Per-run accuracies for the Llama predictions stored under m_15.
# NOTE(review): absolute, machine-specific path.
pred_folder = "/Users/sefika/phd_projects/CRE_PTM copy/src/clean/llama_results_clean/m_15"
memory_15_llama_samples = calculate_accuracy(pred_folder)
    

In [None]:

# Per-run accuracies for the Llama predictions stored under m_10.
# The path has no placeholders, so it is a plain string, not an f-string.
pred_folder = "/Users/sefika/phd_projects/CRE_PTM copy/src/clean/llama_results_clean/m_10"
memory_10_llama_samples = calculate_accuracy(pred_folder)

In [None]:
# Echo the list so the notebook renders the memory-10 Llama sample values.
memory_10_llama_samples

In [None]:

# Per-run accuracies for the Llama predictions stored under m_5.
# The path has no placeholders, so it is a plain string, not an f-string.
pred_folder = "/Users/sefika/phd_projects/CRE_PTM copy/src/clean/llama_results_clean/m_5"
memory_5_llama_samples = calculate_accuracy(pred_folder)

In [None]:

# Per-run accuracies for the Llama predictions stored under m_0 (no replay).
# The path has no placeholders, so it is a plain string, not an f-string.
# NOTE(review): unlike the other Llama cells this path ends with '/', so
# calculate_accuracy builds ".../m_0//model<id>/..." with a doubled separator.
pred_folder = "/Users/sefika/phd_projects/CRE_PTM copy/src/clean/llama_results_clean/m_0/"
no_replay_llama_samples = calculate_accuracy(pred_folder)

In [74]:
# Gather every sample list under its variable name: three model families
# (mistral, t5, llama) x four settings (no replay, memory 5 / 10 / 15).
# Every list referenced here must already exist in the kernel.
samples= {"no_replay_mistral_samples": no_replay_mistral_samples,
            "memory_5_mistral_samples": memory_5_mistral_samples,
            "memory_10_mistral_samples": memory_10_mistral_samples,
            "memory_15_mistral_samples": memory_15_mistral_samples,
            "no_replay_t5_samples": no_replay_t5_samples,
            "memory_5_t5_samples": memory_5_t5_samples,
            "memory_10_t5_samples": memory_10_t5_samples,
            "memory_15_t5_samples": memory_15_t5_samples,
            "no_replay_llama_samples": no_replay_llama_samples,
            "memory_5_llama_samples": memory_5_llama_samples,
            "memory_10_llama_samples": memory_10_llama_samples,
            "memory_15_llama_samples": memory_15_llama_samples
            }

In [101]:
def return_group_samples(group1, group2, iterations):
    """Run a two-sided permutation test and package the outcome as a dict.

    Args:
        group1, group2: The two samples handed to ``permutation_test``.
        iterations: Forwarded as ``num_permutations``.

    Returns:
        ``{'diff': <observed mean difference>, 'p_val': <p-value>}``.
    """
    observed, p_estimate = permutation_test(group1, group2, iterations, alternative='two-sided')
    return {'diff': observed, 'p_val': p_estimate}

In [None]:
# Flan-T5: no replay vs. each memory size, at 100 / 1000 / 10000 permutations.
# The nine hand-copied calls are generated instead. Iteration order (all
# memory sizes at 100, then 1000, then 10000; sizes 15, 10, 5) reproduces the
# original call and key order, which matters because permutation_test draws
# from NumPy's global random state.
result_t5 = {
    f'no_replay_vs_memory_{memory_size}_iter_{iterations}': return_group_samples(
        no_replay_t5_samples, memory_samples, iterations
    )
    for iterations in (100, 1000, 10000)
    for memory_size, memory_samples in (
        (15, memory_15_t5_samples),
        (10, memory_10_t5_samples),
        (5, memory_5_t5_samples),
    )
}

In [111]:
# Persist the Flan-T5 permutation-test results as JSON in the current working directory.
write_json(result_t5, "./t5_permutation_test_results.json")

In [112]:
# Mistral: no replay vs. each memory size, at 100 / 1000 / 10000 permutations.
# The nine hand-copied calls are generated instead. Iteration order (all
# memory sizes at 100, then 1000, then 10000; sizes 15, 10, 5) reproduces the
# original call and key order, which matters because permutation_test draws
# from NumPy's global random state.
result_mistral = {
    f'no_replay_vs_memory_{memory_size}_iter_{iterations}': return_group_samples(
        no_replay_mistral_samples, memory_samples, iterations
    )
    for iterations in (100, 1000, 10000)
    for memory_size, memory_samples in (
        (15, memory_15_mistral_samples),
        (10, memory_10_mistral_samples),
        (5, memory_5_mistral_samples),
    )
}

In [113]:

# Persist the Mistral permutation-test results as JSON in the current working directory.
write_json(result_mistral, "./mistral_permutation_test_results.json")

In [None]:
# Llama: no replay vs. each memory size, at 100 / 1000 / 10000 permutations.
# The nine hand-copied calls are generated instead. Iteration order (all
# memory sizes at 100, then 1000, then 10000; sizes 15, 10, 5) reproduces the
# original call and key order, which matters because permutation_test draws
# from NumPy's global random state.
result_llama = {
    f'no_replay_vs_memory_{memory_size}_iter_{iterations}': return_group_samples(
        no_replay_llama_samples, memory_samples, iterations
    )
    for iterations in (100, 1000, 10000)
    for memory_size, memory_samples in (
        (15, memory_15_llama_samples),
        (10, memory_10_llama_samples),
        (5, memory_5_llama_samples),
    )
}

In [115]:
# Persist the Llama permutation-test results as JSON in the current working directory.
write_json(result_llama, "./llama_permutation_test_results.json")

In [93]:
# Echo the list so the notebook renders the memory-5 Flan-T5 sample values.
memory_5_t5_samples

[94.11283728536387,
 94.84873262469338,
 96.97465249386754,
 96.32052330335242,
 93.54047424366311]

In [None]:
# Echo the dict so the notebook renders the Llama permutation-test results.
result_llama