In [1]:
%matplotlib inline

In [2]:
import numpy as np
import pathlib
import matplotlib.pyplot as plt
import time
import json
import itertools
from statsmodels.stats.contingency_tables import cochrans_q

In [3]:
def load_head_data(experiments_path):
    """Load the head mask of every seed of every task under a root directory.

    The expected layout is ``<experiments_path>/<task>/<seed>/head_mask.npy``.
    Entries that are not directories (at the task or the seed level) are
    skipped, so stray files next to the task/seed folders do not break loading.

    Args:
        experiments_path: ``pathlib.Path`` of the experiments root directory.

    Returns:
        dict: ``{task: {seed: {"head_mask": array}}}`` keyed by the ``stem``
        of each task and seed directory; the array is what ``np.load``
        returns for ``head_mask.npy``.

    Errors raised by ``np.load`` (e.g. a seed directory without
    ``head_mask.npy``) are propagated unchanged.
    """
    head_data = {}
    for task_dir in experiments_path.iterdir():
        if not task_dir.is_dir():
            # Stray files in the root are not tasks.
            continue
        task_entry = {}
        for seed_dir in task_dir.iterdir():
            if not seed_dir.is_dir():
                # Stray files inside a task folder are not seeds.
                continue
            task_entry[seed_dir.stem] = {
                "head_mask": np.load(seed_dir / "head_mask.npy"),
            }
        head_data[task_dir.stem] = task_entry
    return head_data
def load_mlp_data(experiments_path):
    """Load the MLP mask and importance array of every seed of every task.

    The expected layout is ``<experiments_path>/<task>/<seed>/`` containing
    ``mlp_mask.npy`` and ``mlp_importance.npy``. Entries that are not
    directories (at the task or the seed level) are skipped, so stray files
    next to the task/seed folders do not break loading.

    Args:
        experiments_path: ``pathlib.Path`` of the experiments root directory.

    Returns:
        dict: ``{task: {seed: {"mlp_mask": array, "mlp_importance": array}}}``
        keyed by the ``stem`` of each task and seed directory; the arrays are
        what ``np.load`` returns for the two files.

    Errors raised by ``np.load`` (e.g. a seed directory missing one of the
    files) are propagated unchanged.
    """
    mlp_data = {}
    for task_dir in experiments_path.iterdir():
        if not task_dir.is_dir():
            # Stray files in the root are not tasks.
            continue
        task_entry = {}
        for seed_dir in task_dir.iterdir():
            if not seed_dir.is_dir():
                # Stray files inside a task folder are not seeds.
                continue
            task_entry[seed_dir.stem] = {
                "mlp_mask": np.load(seed_dir / "mlp_mask.npy"),
                "mlp_importance": np.load(seed_dir / "mlp_importance.npy"),
            }
        mlp_data[task_dir.stem] = task_entry
    return mlp_data

In [4]:
# Head masks: load the main experiment directory, then the HANS runs, and
# merge the HANS tasks in (a task with the same name is replaced).
experiments_path = pathlib.Path("../masks/heads_mlps")
heads = load_head_data(experiments_path)

experiments_path = pathlib.Path("../masks/heads_mlps_hans")
hans_heads = load_head_data(experiments_path)
heads.update(hans_heads)

# MLP masks and importances: same two directories, same merge.
experiments_path = pathlib.Path("../masks/heads_mlps")
mlps = load_mlp_data(experiments_path)

experiments_path = pathlib.Path("../masks/heads_mlps_hans")
hans_mlps = load_mlp_data(experiments_path)
mlps.update(hans_mlps)


In [9]:

def cochrans_q_masks(masks):
    """Apply Cochran's Q test to a collection of masks.

    ``masks`` is stacked into one array and transposed before being handed to
    ``cochrans_q``; for 1-D masks this makes each mask one column of the test
    input. The result object of ``cochrans_q`` is returned as is.
    """
    stacked = np.array(masks)
    return cochrans_q(stacked.transpose())


def print_p_value_tasks_separate(data, mask="head_mask"):
    """Print the seed-comparison report of every task, one task at a time.

    Tasks are visited in sorted name order and each one is passed on its own
    to ``print_p_value_tasks`` together with the requested ``mask`` key.
    """
    task_names = sorted(data.keys())
    for name in task_names:
        print_p_value_tasks(data, [name], mask)

        
def print_p_value_tasks(data, tasks, mask="head_mask", alpha=0.05):
    """Run Cochran's Q test on the masks of ``tasks`` and print a report.

    The flattened masks of every seed of every listed task are first tested
    jointly. Selected pairs of masks are then tested one pair at a time, and
    the pairs whose p-value is at least ``alpha`` are listed as similar.

    Pair selection:
      * one task  -- every unordered pair of its seeds;
      * otherwise -- seed ``i`` of ``tasks[0]`` against seed ``j`` of
        ``tasks[1]``, for ``i < j`` only.

    Args:
        data: Nested mapping ``data[task][seed][mask]`` whose leaves support
            ``reshape(-1)``.
        tasks: List of task names (keys of ``data``). Seed names are taken
            from ``tasks[0]``; a ``KeyError`` is raised if another listed
            task lacks one of them.
        mask: Key of the mask to compare inside each seed entry.
        alpha: Significance threshold applied to every p-value.

    Returns:
        None. The report is written to stdout.
    """
    seeds = sorted(data[tasks[0]].keys())
    n_seeds = len(seeds)

    # Masks are stored task-major: index = task_position * n_seeds + seed_position.
    masks = [
        data[task][seed][mask].reshape(-1)
        for task in tasks
        for seed in seeds
    ]

    test_result = cochrans_q_masks(masks)
    print(','.join(tasks))
    print("---------------------------------------------------------------------")
    print(f'p-value: {test_result.pvalue}')
    if test_result.pvalue < alpha:
        print('Null hypothesis (all seeds are similar) is rejected.')
    else:
        print('Null hypothesis (all seeds are similar) is not rejected.')

    if len(tasks) == 1:
        masks_combos = list(itertools.combinations(range(len(masks)), 2))
    else:
        # NOTE(review): only pairs with i < j are compared across the first two
        # tasks, so same-seed pairs and pairs with i > j are never tested, and
        # tasks beyond the second are ignored here -- confirm this is intended.
        masks_combos = [
            (i, n_seeds + j)
            for i in range(n_seeds)
            for j in range(i + 1, n_seeds)
        ]

    similar_masks_combos = []
    for mask_1, mask_2 in masks_combos:
        r = cochrans_q_masks([masks[mask_1], masks[mask_2]])
        if r.pvalue >= alpha:
            # Decode the flat mask indices back into (task, seed) names.
            task1_name, seed1_name = tasks[mask_1 // n_seeds], seeds[mask_1 % n_seeds]
            task2_name, seed2_name = tasks[mask_2 // n_seeds], seeds[mask_2 % n_seeds]
            similar_masks_combos.append((f"{task1_name}-{seed1_name}", f"{task2_name}-{seed2_name}"))

    # With fewer than two seeds there are no pairs; report nan instead of
    # raising ZeroDivisionError.
    if masks_combos:
        percentage = len(similar_masks_combos) / len(masks_combos)
    else:
        percentage = float("nan")

    print(f"Total mask pairs where Null hypothesis is not rejected - {len(similar_masks_combos)}")
    print(f"Total mask pairs - {len(masks_combos)}")
    print(f"Percentage - {percentage}")
    print("\nSimilar Mask Pairs:\n")
    print("\t".join([",".join(p) for p in similar_masks_combos]))
    print("\n\n")

# Seeds in a Task

## Heads

In [10]:
# Compare the head masks of the different seeds within each task.
print_p_value_tasks_separate(heads)

CoLA
---------------------------------------------------------------------
p-value: 2.113894110905964e-11
Null hypothesis (all seeds are similar) is rejected.
Total mask pairs where Null hypothesis is not rejected - 3
Total mask pairs - 10
Percentage - 0.3

Similar Mask Pairs:

CoLA-seed_1337,CoLA-seed_42	CoLA-seed_1337,CoLA-seed_71	CoLA-seed_166,CoLA-seed_42



HANS
---------------------------------------------------------------------
p-value: 0.004749695087342902
Null hypothesis (all seeds are similar) is rejected.
Total mask pairs where Null hypothesis is not rejected - 7
Total mask pairs - 10
Percentage - 0.7

Similar Mask Pairs:

HANS-seed_1337,HANS-seed_42	HANS-seed_1337,HANS-seed_71	HANS-seed_1337,HANS-seed_86	HANS-seed_166,HANS-seed_86	HANS-seed_42,HANS-seed_71	HANS-seed_42,HANS-seed_86	HANS-seed_71,HANS-seed_86



HANS_MNLI
---------------------------------------------------------------------
p-value: 1.0701419296602784e-05
Null hypothesis (all seeds are similar) is rejected.


## MLPs

In [11]:
# Compare the MLP masks of the different seeds within each task.
print_p_value_tasks_separate(mlps, mask="mlp_mask")

CoLA
---------------------------------------------------------------------
p-value: 0.3232398928813502
Null hypothesis (all seeds are similar) is not rejected.
Total mask pairs where Null hypothesis is not rejected - 9
Total mask pairs - 10
Percentage - 0.9

Similar Mask Pairs:

CoLA-seed_1337,CoLA-seed_166	CoLA-seed_1337,CoLA-seed_42	CoLA-seed_1337,CoLA-seed_86	CoLA-seed_166,CoLA-seed_42	CoLA-seed_166,CoLA-seed_71	CoLA-seed_166,CoLA-seed_86	CoLA-seed_42,CoLA-seed_71	CoLA-seed_42,CoLA-seed_86	CoLA-seed_71,CoLA-seed_86



HANS
---------------------------------------------------------------------
p-value: 0.9594907270068882
Null hypothesis (all seeds are similar) is not rejected.
Total mask pairs where Null hypothesis is not rejected - 10
Total mask pairs - 10
Percentage - 1.0

Similar Mask Pairs:

HANS-seed_1337,HANS-seed_166	HANS-seed_1337,HANS-seed_42	HANS-seed_1337,HANS-seed_71	HANS-seed_1337,HANS-seed_86	HANS-seed_166,HANS-seed_42	HANS-seed_166,HANS-seed_71	HANS-seed_166,HANS-seed_8

# Pairwise Task-to-Task Comparison

## Heads

In [13]:
# Compare head masks across every unordered pair of tasks (sorted by name).
tasks = sorted(heads.keys())
for t1, t2 in itertools.combinations(tasks, 2):
    print_p_value_tasks(heads, [t1, t2])

CoLA,HANS
---------------------------------------------------------------------
p-value: 1.770648917866998e-23
Null hypothesis (all seeds are similar) is rejected.
Total mask pairs where Null hypothesis is not rejected - 4
Total mask pairs - 10
Percentage - 0.4

Similar Mask Pairs:

CoLA-seed_166,HANS-seed_42	CoLA-seed_166,HANS-seed_71	CoLA-seed_166,HANS-seed_86	CoLA-seed_42,HANS-seed_71



CoLA,HANS_MNLI
---------------------------------------------------------------------
p-value: 6.504899929147611e-15
Null hypothesis (all seeds are similar) is rejected.
Total mask pairs where Null hypothesis is not rejected - 5
Total mask pairs - 10
Percentage - 0.5

Similar Mask Pairs:

CoLA-seed_1337,HANS_MNLI-seed_166	CoLA-seed_1337,HANS_MNLI-seed_42	CoLA-seed_166,HANS_MNLI-seed_71	CoLA-seed_166,HANS_MNLI-seed_86	CoLA-seed_42,HANS_MNLI-seed_86



CoLA,MNLI
---------------------------------------------------------------------
p-value: 1.027240403810406e-08
Null hypothesis (all seeds are similar) i

## MLPs

In [14]:
# Compare MLP masks across every unordered pair of tasks. The task list is
# derived from `mlps` itself so this cell does not depend on the `tasks`
# variable left behind by the heads cell.
for t1, t2 in itertools.combinations(sorted(mlps.keys()), 2):
    print_p_value_tasks(mlps, [t1, t2], mask="mlp_mask")

CoLA,HANS
---------------------------------------------------------------------
p-value: 0.02432935515164265
Null hypothesis (all seeds are similar) is rejected.
Total mask pairs where Null hypothesis is not rejected - 5
Total mask pairs - 10
Percentage - 0.5

Similar Mask Pairs:

CoLA-seed_1337,HANS-seed_166	CoLA-seed_1337,HANS-seed_71	CoLA-seed_166,HANS-seed_42	CoLA-seed_166,HANS-seed_71	CoLA-seed_166,HANS-seed_86



CoLA,HANS_MNLI
---------------------------------------------------------------------
p-value: 0.15879295358012627
Null hypothesis (all seeds are similar) is not rejected.
Total mask pairs where Null hypothesis is not rejected - 8
Total mask pairs - 10
Percentage - 0.8

Similar Mask Pairs:

CoLA-seed_1337,HANS_MNLI-seed_166	CoLA-seed_1337,HANS_MNLI-seed_42	CoLA-seed_1337,HANS_MNLI-seed_71	CoLA-seed_1337,HANS_MNLI-seed_86	CoLA-seed_166,HANS_MNLI-seed_42	CoLA-seed_166,HANS_MNLI-seed_71	CoLA-seed_166,HANS_MNLI-seed_86	CoLA-seed_71,HANS_MNLI-seed_86



CoLA,MNLI
-------------