### Libraries

In [2]:
import os
import json
import random
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
from transformers import BertModel, PreTrainedModel, BertTokenizer, BertConfig
from tqdm import tqdm
from sklearn.metrics import accuracy_score
# Set random seeds for reproducibility
def set_seed(seed: int):
    """Seed the Python, NumPy and PyTorch RNGs and make cuDNN deterministic.

    Args:
        seed: Integer handed unchanged to every seeding call below.
    """
    # The same seed goes to the stdlib RNG, NumPy's global RNG, torch's
    # default generator and the generators of all CUDA devices.
    for seed_fn in (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    ):
        seed_fn(seed)

    # Turn off the cuDNN autotuner and request deterministic kernels.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

# Seed every RNG once, up front, with a fixed value.
set_seed(42)


In [4]:
import pandas as pd


### source code

In [3]:

def read_json(path):
    """Load and return the JSON document stored at ``path``.

    The file is decoded explicitly as UTF-8 so that it round-trips with
    ``write_json`` (which writes UTF-8 with ``ensure_ascii=False``) regardless
    of the platform's default text encoding.

    Args:
        path: Path of the JSON file to read.

    Returns:
        The deserialized object produced by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    with open(path, 'r', encoding="utf-8") as f:
        return json.load(f)

def write_json(data, path):
    """Serialize ``data`` as pretty-printed UTF-8 JSON at ``path``.

    Missing parent directories are created first. A bare filename (no
    directory component) is written into the current working directory.

    Args:
        data: JSON-serializable object to write.
        path: Destination file path.

    Raises:
        OSError: If the directory or file cannot be created.
        TypeError: If ``data`` is not JSON-serializable.
    """
    parent = os.path.dirname(path)
    # dirname() is '' for a bare filename and os.makedirs('') raises, so only
    # create the parent when there is one. exist_ok avoids the race between
    # checking for the directory and creating it.
    if parent:
        os.makedirs(parent, exist_ok=True)

    with open(path, 'w', encoding="utf-8") as f:
        json.dump(data, f, indent=4, ensure_ascii=False)


In [20]:
# Pool the result records of runs 1-5 from the non-neurogenesis results folder,
# then average every column per task_id across the pooled runs.
# Assumes each results file holds a list of records that include `task_id`
# and `test_acc` — TODO confirm against the code that writes them.
data = []
for run_id in range(1, 6):
    # Despite its name, `output_dir` holds the path of one run's results file.
    output_dir = f"drive/MyDrive/EMNLP-neurogenesis/non_neurogenesis_results/results_{run_id}.json"
    data.extend(read_json(output_dir))
data_df = pd.DataFrame(data)
non_mean_df = data_df.groupby(by=['task_id']).mean()

In [21]:
# Per-task mean test accuracy of the non-neurogenesis runs, as a plain list.
non_sample = non_mean_df['test_acc'].tolist()

In [22]:
# Pool the result records of runs 1-5 from the neurogenesis (performer)
# results folder and reduce them to the per-task_id mean test accuracy.
# Assumes each results file holds a list of records that include `task_id`
# and `test_acc` — TODO confirm against the code that writes them.
data = []
for run_id in range(1, 6):
    # Despite its name, `output_dir` holds the path of one run's results file.
    output_dir = f"drive/MyDrive/EMNLP-neurogenesis/neurogenesis_results_performer/results_{run_id}.json"
    data.extend(read_json(output_dir))
data_df = pd.DataFrame(data)
neuro_samples = list(data_df.groupby(by=['task_id']).mean()['test_acc'])

In [23]:
# Average of the per-task mean test accuracies for the non-neurogenesis runs.
np.mean(non_sample)

np.float64(0.8728827120830054)

In [24]:
# Average of the per-task mean test accuracies for the neurogenesis runs.
np.mean(neuro_samples)

np.float64(0.9419464096805346)

In [25]:
from scipy.stats import ttest_ind


# Two-sample t-test comparing the per-task mean test accuracies of the
# neurogenesis runs against those of the non-neurogenesis runs.
# equal_var=False selects Welch's variant, which does not assume equal
# population variances.
# NOTE(review): both lists are per-task_id means, so the samples may be paired
# by task. If they cover the same tasks in the same order, a paired test
# (scipy.stats.ttest_rel) may be the better fit — confirm.
t_stat, p_value = ttest_ind(neuro_samples, non_sample, equal_var=False)

# Show the statistic and the p-value as the cell output.
t_stat, p_value


(np.float64(2.477506128998837), np.float64(0.030368935662589636))

The t-test results can be interpreted as follows:

**Test Statistic (t-value):**

2.477506128998837
This value is the difference between the two sample means (neurogenesis minus non-neurogenesis) expressed in units of the standard error of that difference. A larger absolute t-value indicates stronger evidence against the null hypothesis that the two means are equal.

**p-value**
0.030368935662589636
This value is the probability of obtaining a result at least as extreme as the observed one, assuming the null hypothesis is true. If the p-value is below a chosen significance threshold (commonly 0.05), the null hypothesis is rejected in favor of the alternative hypothesis.

**Conclusion:**
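Since the p-value (0.0304) is less than 0.05, the difference is statistically significant at the 5% level.

The null hypothesis of equal means can therefore be rejected: there is evidence of a significant difference in mean per-task test accuracy, with the neurogenesis runs (0.942) scoring higher than the non-neurogenesis runs (0.873).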