### BDD overlap analysis

In [10]:
import json
import os
kgs = ["kg/" + i for i in os.listdir("../../results/nli/")]
nlis = ["nli/" + i for i in os.listdir("../../results/nli/")]
dirs = ["../../results/bdd/" + i for i in kgs + nlis]
all_rights = {}
for i in ["e", "c"]:
    all_rights[i] = {}
    for dir in dirs:
        with open(dir + "/" + i + "/preds.json")as f:
            preds = json.load(f)
            rights = set(preds["rights"])
            all_rights[i][dir.split("/")[-1]] = rights
for i in ["e", "c"]:
    with open("../../results/analysis/bddoverlap" + i + ".txt", "w")as f:
        f.write("\t")
        for j in all_rights[i].keys():
            f.write(j)
            f.write("\t")
        f.write("\n")
        for m in all_rights[i].keys():
            f.write(m)
            f.write("\t")
            for n in all_rights[i].keys():
                overlap = str(len(all_rights[i][m].intersection(all_rights[i][n])))
                f.write(overlap)
                f.write("\t")
            f.write("\n")

### BDD data average similarity

In [21]:
import json
import numpy as np
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2', device="cuda")
pre_embeds = []
hy_embeds = []
with open("../../data/bdd/kgnli/1.jsonl")as f:
    for line in tqdm(f.readlines()):
        data = json.loads(line)
        premise = data["premise"]
        hypothesis = data["hypothesis"][2]
        pembed = model.encode(premise)
        hembed = model.encode(hypothesis)
        pre_embeds.append(pembed)
        hy_embeds.append(hembed)
num_samples = len(hy_embeds)
pre_em_sum = np.sum(pre_embeds, axis=0)
hy_em_sum = np.sum(hy_embeds, axis=0)
avg_simi_pre = (np.linalg.norm(pre_em_sum)**2 - num_samples)/(num_samples**2)
avg_simi_hy = (np.linalg.norm(hy_em_sum)**2 - num_samples)/(num_samples**2)
print(avg_simi_pre)
print(avg_simi_hy)

100%|██████████| 3139/3139 [02:12<00:00, 23.64it/s]

0.40338223035867654
0.5534353785203416





### Kmeans for clustering, save the sentences and sets

In [None]:
from sklearn.cluster import KMeans
import numpy as np
import json
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm
hy_embeds = []
hypothesises = []
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2', device="cuda")
with open("../../data/bdd/kgnli/1.jsonl")as f:
    for line in tqdm(f.readlines()):
        data = json.loads(line)
        hypothesis = data["hypothesis"][2]
        hembed = model.encode(hypothesis)
        hy_embeds.append(hembed)
        hypothesises.append(hypothesis)
x = np.array(hy_embeds)
kmeans = KMeans(n_clusters=5, random_state=0).fit(x)
classes = [i for i in kmeans.labels_]
sentence_clusters = {}
for i, theclass in enumerate(classes):
    if theclass not in sentence_clusters:
        sentence_clusters[theclass] = [hypothesises[i]]
    else:
        sentence_clusters[theclass].append(hypothesises[i])
answer_sets = {}
for i, theclass in enumerate(classes):
    if str(int(theclass)) not in answer_sets:
        answer_sets[str(int(theclass))] = [i]
    else:
        answer_sets[str(int(theclass))].append(i)
with open("../../results/bdd/analysis/ef_classes.txt", "w")as f:
    for i, sentences in sentence_clusters.items():
        for sen in sentences[:20]:
            f.write(sen)
            f.write("\t")
        f.write("\n")
with open("../../results/bdd/analysis/set_classes.jsonl", "w")as f:
    f.write(json.dumps(answer_sets))

### get the acc of each class

In [3]:
import os
import json
files = ["kg", "nli"]
ms = [os.listdir("../../results/bdd/" + f) for f in files]
models = [f + "/" + m + "/" + i for i in ["1", "2"] for f, model in zip(files, ms) for m in model ]
base_dir = "../../results/bdd/"
with open("../../results/bdd/analysis/set_classes.jsonl")as f:
    sets = json.loads(f.readlines()[0])
all_acc = {}
for model_dir in models:
    all_acc[model_dir] = {}
    pre_dir = base_dir + model_dir + "/preds.json"
    with open(pre_dir)as f:
        rights = json.load(f)["rights"]
    for cls, the_set in sets.items():
        right_num = len(set(the_set).intersection(set(rights)))
        all_num = len(the_set)
        acc = right_num/all_num
        all_acc[model_dir][cls] = acc

with open("../../results/bdd/analysis/class_rights.txt", "w")as f:
    f.write("\t")
    for cls in ["accelerate", "Slow", "Stop", "Merge", "turn"]:
        f.write(cls)
        f.write("\t")
    f.write("\n")
    for model, model_right in all_acc.items():
        f.write(model)
        f.write("\t")
        for cls in ["0", "1", "2", "3", "4"]:
            acc = str(model_right[cls])
            f.write(acc)
            f.write("\t")
        f.write("\n")

with open("../../data/bdd/kgnli/1.jsonl")as f:
    lines = f.readlines()
    datas = [json.loads(line) for line in lines]
for cls, members in sets.items():
    with open("../../results/bdd/analysis/bdddata/" + cls + ".jsonl", "w")as f:
        for member in members:
            f.write(json.dumps(datas[member]))
            f.write("\n")