In [7]:
import pickle
import json
from pathlib import Path
from typing import Dict, Any

import numpy as np
from tqdm import tqdm

def filter_by_knowledge(int_grad: Dict[str, Dict[str, Any]], samples_to_eval: Dict[str, Dict[str, Any]], with_knowledge: bool):
    """Keep only the samples whose ``has_knowledge`` flag equals ``with_knowledge``.

    Args:
        int_grad: mapping from sample id to that sample's entry; kept entries
            are returned as-is (not copied).
        samples_to_eval: mapping from sample id to a dict holding a
            ``"has_knowledge"`` entry.
        with_knowledge: value ``"has_knowledge"`` must equal for a sample to be kept.

    Returns:
        A new dict with the kept ``int_grad`` entries, in their original order.

    Raises:
        KeyError: if an id of ``int_grad`` is missing from ``samples_to_eval``
            or its entry has no ``"has_knowledge"`` key.
    """
    return {
        key: attribution
        for key, attribution in int_grad.items()
        if samples_to_eval[key]["has_knowledge"] == with_knowledge
    }

## WoW

### Full

In [3]:
from utils import compute_average_attribution

# Average integrated-gradients attribution per input segment for every
# WizardOfWikipedia run found on disk. Results are keyed
# "<model>_<ft|prompt>_<run>" in `scores` and `top_k_scores`.

# top_k_percentage tokens with highest attribution
top_k_percentage = 0.25

scores = {}
top_k_scores = {}
for model in Path("../output/WizardOfWikipedia/").iterdir():
    if not model.is_dir():
        continue
    # Per setting: (name of the segments, tokens to separate the segments).
    if model.name == "mistral":
        # tokens to remove from the attribution
        tokens_to_remove = [1, 2, 28705, 733, 16289, 28793, 28748]
        # NOTE(review): unlike the llama branch, mistral prompt runs are not
        # split into a separate "prompt" segment (their "prompt" column is NaN
        # in the table below) - confirm this is intended.
        layouts = {
            "ft": (["topic", "dialogue_history", "knowledge"],
                   [[7082, 441, 28747], [11308, 3829, 28747]]),
            "prompt": (["topic", "dialogue_history", "knowledge"],
                       [[7082, 441, 28747], [11308, 3829, 28747]]),
        }
    else:
        # tokens to remove from the attribution
        tokens_to_remove = [1, 2, 518, 25580, 29962, 3532, 14816, 29903, 6778, 29966, 829]
        layouts = {
            "ft": (["topic", "dialogue_history", "knowledge"],
                   [[7647, 434, 29901], [19320, 5485, 29901]]),
            "prompt": (["prompt", "topic", "dialogue_history", "knowledge"],
                       [[13, 7031, 293, 29901], [7647, 434, 29901], [19320, 5485, 29901]]),
        }
    for item in model.iterdir():
        # Files inside a sub-directory are keyed "ft"; files lying directly in
        # the model directory are keyed "prompt".
        if item.is_dir():
            setting, candidates = "ft", item.iterdir()
        else:
            setting, candidates = "prompt", [item]
        segments, tokens_to_find = layouts[setting]
        for file in candidates:
            if not file.name.startswith("integrated_gradients"):
                continue
            # pickle.load can execute arbitrary code: only point this at
            # attribution files produced by trusted runs.
            with open(file, "rb") as f:
                int_grad = pickle.load(f)
            sc, top_k_sc = compute_average_attribution(int_grad, top_k_percentage, tokens_to_remove, tokens_to_find)
            # `.stem` drops the extension whatever its length (was a fixed [:-4] slice).
            run_name = f'{model.name}_{setting}_{file.stem.split("integrated_gradients_").pop()}'
            scores[run_name] = dict(zip(segments, sc))
            top_k_scores[run_name] = dict(zip(segments, top_k_sc))



  from .autonotebook import tqdm as notebook_tqdm


In [4]:
import pandas as pd

# One row per run, one column per segment; values scaled by 100 for display.
df = pd.DataFrame(top_k_scores).T * 100
# reindex instead of df[[...]]: a segment that no run on disk reports (e.g. no
# prompt-based run) becomes an all-NaN column instead of raising KeyError.
display(df.reindex(columns=["prompt", "topic", "dialogue_history", "knowledge"]).round(2))

Unnamed: 0,prompt,topic,dialogue_history,knowledge
llama_ft_gold,,49.22,16.35,34.43
llama_ft_retrieved_top-3,,51.58,22.9,25.52
llama_prompt_retrieved_top-1,21.42,28.9,17.49,32.19
llama_ft_none,,60.99,19.81,19.2
llama_prompt_none,24.47,30.68,17.18,27.67
llama_ft_retrieved_top-1,,50.37,16.55,33.08
llama_prompt_gold,21.15,27.96,15.61,35.28
llama_prompt_retrieved_top-3,25.72,28.92,15.81,29.55
mistral_ft_gold,,69.6,13.62,16.78
mistral_prompt_retrieved_top-1,,66.09,15.08,18.82


### Knowledge only

In [12]:
from utils import compute_average_attribution

# Same attribution summary as the "Full" section, restricted to the samples
# whose "has_knowledge" flag in samples_to_eval.json equals `with_knowledge`.

with open("../data/WizardOfWikipedia/samples_to_eval.json", "r") as f:
    samples_to_eval = json.load(f)

# top_k_percentage tokens with highest attribution
top_k_percentage = 0.25

with_knowledge = True

scores = {}
top_k_scores = {}
for model in Path("../output/WizardOfWikipedia/").iterdir():
    if not model.is_dir():
        continue
    # Per setting: (name of the segments, tokens to separate the segments).
    if model.name == "mistral":
        # tokens to remove from the attribution
        tokens_to_remove = [1, 2, 28705, 733, 16289, 28793, 28748]
        # NOTE(review): unlike the llama branch, mistral prompt runs are not
        # split into a separate "prompt" segment (their "prompt" column is NaN
        # in the table below) - confirm this is intended.
        layouts = {
            "ft": (["topic", "dialogue_history", "knowledge"],
                   [[7082, 441, 28747], [11308, 3829, 28747]]),
            "prompt": (["topic", "dialogue_history", "knowledge"],
                       [[7082, 441, 28747], [11308, 3829, 28747]]),
        }
    else:
        # tokens to remove from the attribution
        tokens_to_remove = [1, 2, 518, 25580, 29962, 3532, 14816, 29903, 6778, 29966, 829]
        layouts = {
            "ft": (["topic", "dialogue_history", "knowledge"],
                   [[7647, 434, 29901], [19320, 5485, 29901]]),
            "prompt": (["prompt", "topic", "dialogue_history", "knowledge"],
                       [[13, 7031, 293, 29901], [7647, 434, 29901], [19320, 5485, 29901]]),
        }
    for item in model.iterdir():
        # Files inside a sub-directory are keyed "ft"; files lying directly in
        # the model directory are keyed "prompt".
        if item.is_dir():
            setting, candidates = "ft", item.iterdir()
        else:
            setting, candidates = "prompt", [item]
        segments, tokens_to_find = layouts[setting]
        for file in candidates:
            if not file.name.startswith("integrated_gradients"):
                continue
            # pickle.load can execute arbitrary code: only point this at
            # attribution files produced by trusted runs.
            with open(file, "rb") as f:
                int_grad = pickle.load(f)
            # Filtering needs no file handle, so it happens after the file is closed.
            int_grad = filter_by_knowledge(int_grad, samples_to_eval, with_knowledge)
            sc, top_k_sc = compute_average_attribution(int_grad, top_k_percentage, tokens_to_remove, tokens_to_find)
            # `.stem` drops the extension whatever its length (was a fixed [:-4] slice).
            run_name = f'{model.name}_{setting}_{file.stem.split("integrated_gradients_").pop()}'
            scores[run_name] = dict(zip(segments, sc))
            top_k_scores[run_name] = dict(zip(segments, top_k_sc))



In [13]:
import pandas as pd

# One row per run, one column per segment; values scaled by 100 for display.
df = pd.DataFrame(top_k_scores).T * 100
# reindex instead of df[[...]]: a segment that no run on disk reports (e.g. no
# prompt-based run) becomes an all-NaN column instead of raising KeyError.
display(df.reindex(columns=["prompt", "topic", "dialogue_history", "knowledge"]).round(2))

Unnamed: 0,prompt,topic,dialogue_history,knowledge
llama_ft_gold,,39.43,13.8,46.78
llama_ft_retrieved_top-3,,51.68,22.71,25.61
llama_prompt_retrieved_top-1,24.24,30.54,18.53,26.69
llama_ft_none,,60.83,18.64,20.53
llama_prompt_none,27.22,33.44,17.99,21.34
llama_ft_retrieved_top-1,,60.01,19.96,20.04
llama_prompt_gold,21.85,28.6,15.96,33.58
llama_prompt_retrieved_top-3,29.14,30.34,15.54,24.98
mistral_ft_gold,,65.55,11.0,23.45
mistral_prompt_retrieved_top-1,,70.68,17.5,11.83


### No knowledge

In [14]:
from utils import compute_average_attribution

# Same attribution summary as the "Full" section, restricted to the samples
# whose "has_knowledge" flag in samples_to_eval.json equals `with_knowledge`.

with open("../data/WizardOfWikipedia/samples_to_eval.json", "r") as f:
    samples_to_eval = json.load(f)

# top_k_percentage tokens with highest attribution
top_k_percentage = 0.25

with_knowledge = False

scores = {}
top_k_scores = {}
for model in Path("../output/WizardOfWikipedia/").iterdir():
    if not model.is_dir():
        continue
    # Per setting: (name of the segments, tokens to separate the segments).
    if model.name == "mistral":
        # tokens to remove from the attribution
        tokens_to_remove = [1, 2, 28705, 733, 16289, 28793, 28748]
        # NOTE(review): unlike the llama branch, mistral prompt runs are not
        # split into a separate "prompt" segment (their "prompt" column is NaN
        # in the table below) - confirm this is intended.
        layouts = {
            "ft": (["topic", "dialogue_history", "knowledge"],
                   [[7082, 441, 28747], [11308, 3829, 28747]]),
            "prompt": (["topic", "dialogue_history", "knowledge"],
                       [[7082, 441, 28747], [11308, 3829, 28747]]),
        }
    else:
        # tokens to remove from the attribution
        tokens_to_remove = [1, 2, 518, 25580, 29962, 3532, 14816, 29903, 6778, 29966, 829]
        layouts = {
            "ft": (["topic", "dialogue_history", "knowledge"],
                   [[7647, 434, 29901], [19320, 5485, 29901]]),
            "prompt": (["prompt", "topic", "dialogue_history", "knowledge"],
                       [[13, 7031, 293, 29901], [7647, 434, 29901], [19320, 5485, 29901]]),
        }
    for item in model.iterdir():
        # Files inside a sub-directory are keyed "ft"; files lying directly in
        # the model directory are keyed "prompt".
        if item.is_dir():
            setting, candidates = "ft", item.iterdir()
        else:
            setting, candidates = "prompt", [item]
        segments, tokens_to_find = layouts[setting]
        for file in candidates:
            if not file.name.startswith("integrated_gradients"):
                continue
            # pickle.load can execute arbitrary code: only point this at
            # attribution files produced by trusted runs.
            with open(file, "rb") as f:
                int_grad = pickle.load(f)
            # Filtering needs no file handle, so it happens after the file is closed.
            int_grad = filter_by_knowledge(int_grad, samples_to_eval, with_knowledge)
            sc, top_k_sc = compute_average_attribution(int_grad, top_k_percentage, tokens_to_remove, tokens_to_find)
            # `.stem` drops the extension whatever its length (was a fixed [:-4] slice).
            run_name = f'{model.name}_{setting}_{file.stem.split("integrated_gradients_").pop()}'
            scores[run_name] = dict(zip(segments, sc))
            top_k_scores[run_name] = dict(zip(segments, top_k_sc))



In [15]:
import pandas as pd

# One row per run, one column per segment; values scaled by 100 for display.
df = pd.DataFrame(top_k_scores).T * 100
# reindex instead of df[[...]]: a segment that no run on disk reports (e.g. no
# prompt-based run) becomes an all-NaN column instead of raising KeyError.
display(df.reindex(columns=["prompt", "topic", "dialogue_history", "knowledge"]).round(2))

Unnamed: 0,prompt,topic,dialogue_history,knowledge
llama_ft_gold,,59.01,18.9,22.09
llama_ft_retrieved_top-3,,51.48,23.1,25.42
llama_prompt_retrieved_top-1,18.6,27.26,16.45,37.69
llama_ft_none,,61.15,20.98,17.87
llama_prompt_none,21.71,27.92,16.37,34.0
llama_ft_retrieved_top-1,,40.73,13.14,46.13
llama_prompt_gold,20.44,27.32,15.26,36.99
llama_prompt_retrieved_top-3,22.31,27.5,16.08,34.11
mistral_ft_gold,,73.66,16.24,10.1
mistral_prompt_retrieved_top-1,,61.51,12.67,25.81


## DSTC9

### Full

In [16]:
from utils import compute_average_attribution

# Average integrated-gradients attribution per input segment for every DSTC9
# run found on disk. Results are keyed "<model>_<ft|prompt>_<run>" in `scores`
# and `top_k_scores`.

# top_k_percentage tokens with highest attribution
top_k_percentage = 0.25

scores = {}
top_k_scores = {}
for model in Path("../output/DSTC9/").iterdir():
    if not model.is_dir():
        continue
    if model.name == "mistral":
        # tokens to remove from the attribution
        tokens_to_remove = [1, 2, 28705, 733, 16289, 28793, 28748]
        # tokens separating dialogue_history | dialogue_state
        state_marker = [27304, 441, 1665, 28747]
        # tokens separating dialogue_state | knowledge
        knowledge_marker = [11308, 3829, 28747]
        # prompt to separate from the rest of the dialogue
        prompt_to_find = [[560, 272, 2296, 7114, 264, 2188, 5659, 298, 6619, 741, 5541, 304, 3208, 1316, 477, 396, 13892, 28723, 13718, 441, 272, 7114, 395, 272, 2899, 302, 272, 13892, 28723]]
    else:
        # tokens to remove from the attribution
        tokens_to_remove = [1, 2, 518, 25580, 29962, 3532, 14816, 29903, 6778, 29966, 829]
        # tokens separating dialogue_history | dialogue_state
        state_marker = [18878, 434, 2106, 29901]
        # tokens separating dialogue_state | knowledge
        knowledge_marker = [19320, 5485, 29901]
        # prompt to separate from the rest of the dialogue
        prompt_to_find = [[797, 278, 1494, 14983, 263, 1404, 10753, 304, 6176, 777, 7306, 322, 4225, 1371, 515, 385, 20255, 29889, 2866, 14150, 278, 14983, 411, 278, 2933, 310, 278, 20255, 29889]]
    for item in model.iterdir():
        # Files inside a sub-directory are keyed "ft"; files lying directly in
        # the model directory are keyed "prompt".
        if item.is_dir():
            setting, candidates = "ft", item.iterdir()
        else:
            setting, candidates = "prompt", [item]
        for file in candidates:
            if not file.name.startswith("integrated_gradients"):
                continue
            # name of the segments and tokens to separate them; the "none"
            # runs are scored without a knowledge segment
            segments = ["dialogue_history", "dialogue_state"]
            tokens_to_find = [state_marker]
            if not file.name.endswith("none.pkl"):
                segments.append("knowledge")
                tokens_to_find.append(knowledge_marker)
            # "prompt" runs get a leading "prompt" segment and pass
            # prompt_to_find as an extra argument; "ft" runs keep the
            # four-argument call.
            extra_args = ()
            if setting == "prompt":
                segments.insert(0, "prompt")
                extra_args = (prompt_to_find,)
            # pickle.load can execute arbitrary code: only point this at
            # attribution files produced by trusted runs.
            with open(file, "rb") as f:
                int_grad = pickle.load(f)
            sc, top_k_sc = compute_average_attribution(int_grad, top_k_percentage, tokens_to_remove, tokens_to_find, *extra_args)
            # `.stem` drops the extension whatever its length (was a fixed [:-4] slice).
            run_name = f'{model.name}_{setting}_{file.stem.split("integrated_gradients_").pop()}'
            scores[run_name] = dict(zip(segments, sc))
            top_k_scores[run_name] = dict(zip(segments, top_k_sc))


In [17]:
import pandas as pd

# One row per run, one column per segment; values scaled by 100 for display.
df = pd.DataFrame(top_k_scores).T * 100
# reindex instead of df[[...]]: a segment that no run on disk reports (e.g.
# only "none" runs, which have no knowledge) becomes an all-NaN column instead
# of raising KeyError.
display(df.reindex(columns=["prompt", "dialogue_history", "dialogue_state", "knowledge"]).round(2))

Unnamed: 0,prompt,dialogue_history,dialogue_state,knowledge
llama_ft_gold,,27.09,13.61,59.31
llama_prompt_retrieved_top-1,30.28,23.62,12.6,33.5
llama_ft_retrieved_top-3,,35.59,16.61,47.8
llama_ft_none,,55.0,45.0,
llama_prompt_none,40.7,31.86,27.44,
llama_prompt_gold,28.33,22.37,15.75,33.55
llama_prompt_retrieved_top-3,31.52,19.47,16.96,32.05
llama_ft_retrieved_top-1,,27.92,17.22,54.86
mistral_ft_gold,,16.46,31.65,51.89
mistral_prompt_retrieved_top-1,67.28,10.11,15.02,7.58


### Knowledge only

In [19]:
from utils import compute_average_attribution

# Same attribution summary as the DSTC9 "Full" section, restricted to the
# samples whose "has_knowledge" flag in samples_to_eval.json equals
# `with_knowledge`.

with open("../data/DSTC9/samples_to_eval.json", "r") as f:
    samples_to_eval = json.load(f)

# top_k_percentage tokens with highest attribution
top_k_percentage = 0.25

with_knowledge = True

scores = {}
top_k_scores = {}
for model in Path("../output/DSTC9/").iterdir():
    if not model.is_dir():
        continue
    if model.name == "mistral":
        # tokens to remove from the attribution
        tokens_to_remove = [1, 2, 28705, 733, 16289, 28793, 28748]
        # tokens separating dialogue_history | dialogue_state
        state_marker = [27304, 441, 1665, 28747]
        # tokens separating dialogue_state | knowledge
        knowledge_marker = [11308, 3829, 28747]
        # prompt to separate from the rest of the dialogue
        prompt_to_find = [[560, 272, 2296, 7114, 264, 2188, 5659, 298, 6619, 741, 5541, 304, 3208, 1316, 477, 396, 13892, 28723, 13718, 441, 272, 7114, 395, 272, 2899, 302, 272, 13892, 28723]]
    else:
        # tokens to remove from the attribution
        tokens_to_remove = [1, 2, 518, 25580, 29962, 3532, 14816, 29903, 6778, 29966, 829]
        # tokens separating dialogue_history | dialogue_state
        state_marker = [18878, 434, 2106, 29901]
        # tokens separating dialogue_state | knowledge
        knowledge_marker = [19320, 5485, 29901]
        # prompt to separate from the rest of the dialogue
        prompt_to_find = [[797, 278, 1494, 14983, 263, 1404, 10753, 304, 6176, 777, 7306, 322, 4225, 1371, 515, 385, 20255, 29889, 2866, 14150, 278, 14983, 411, 278, 2933, 310, 278, 20255, 29889]]
    for item in model.iterdir():
        # Files inside a sub-directory are keyed "ft"; files lying directly in
        # the model directory are keyed "prompt".
        if item.is_dir():
            setting, candidates = "ft", item.iterdir()
        else:
            setting, candidates = "prompt", [item]
        for file in candidates:
            if not file.name.startswith("integrated_gradients"):
                continue
            # name of the segments and tokens to separate them; the "none"
            # runs are scored without a knowledge segment
            segments = ["dialogue_history", "dialogue_state"]
            tokens_to_find = [state_marker]
            if not file.name.endswith("none.pkl"):
                segments.append("knowledge")
                tokens_to_find.append(knowledge_marker)
            # "prompt" runs get a leading "prompt" segment and pass
            # prompt_to_find as an extra argument; "ft" runs keep the
            # four-argument call.
            extra_args = ()
            if setting == "prompt":
                segments.insert(0, "prompt")
                extra_args = (prompt_to_find,)
            # pickle.load can execute arbitrary code: only point this at
            # attribution files produced by trusted runs.
            with open(file, "rb") as f:
                int_grad = pickle.load(f)
            # Filtering needs no file handle, so it happens after the file is closed.
            int_grad = filter_by_knowledge(int_grad, samples_to_eval, with_knowledge)
            sc, top_k_sc = compute_average_attribution(int_grad, top_k_percentage, tokens_to_remove, tokens_to_find, *extra_args)
            # `.stem` drops the extension whatever its length (was a fixed [:-4] slice).
            run_name = f'{model.name}_{setting}_{file.stem.split("integrated_gradients_").pop()}'
            scores[run_name] = dict(zip(segments, sc))
            top_k_scores[run_name] = dict(zip(segments, top_k_sc))


In [20]:
import pandas as pd

# One row per run, one column per segment; values scaled by 100 for display.
df = pd.DataFrame(top_k_scores).T * 100
# reindex instead of df[[...]]: a segment that no run on disk reports (e.g.
# only "none" runs, which have no knowledge) becomes an all-NaN column instead
# of raising KeyError.
display(df.reindex(columns=["prompt", "dialogue_history", "dialogue_state", "knowledge"]).round(2))

Unnamed: 0,prompt,dialogue_history,dialogue_state,knowledge
llama_ft_gold,,27.19,8.03,64.77
llama_prompt_retrieved_top-1,28.31,21.14,14.16,36.39
llama_ft_retrieved_top-3,,46.64,12.94,40.41
llama_ft_none,,65.28,34.72,
llama_prompt_none,41.14,31.63,27.24,
llama_prompt_gold,25.98,19.54,16.45,38.02
llama_prompt_retrieved_top-3,38.24,14.81,17.49,29.46
llama_ft_retrieved_top-1,,36.04,18.72,45.25
mistral_ft_gold,,14.54,29.06,56.39
mistral_prompt_retrieved_top-1,67.34,9.89,13.21,9.56


### No knowledge

In [21]:
from utils import compute_average_attribution

with open("../data/DSTC9/samples_to_eval.json", "r") as f:
    samples_to_eval = json.load(f)

# top_k_percentage tokens with highest attribution
top_k_percentage = 0.25

# Only samples whose "has_knowledge" flag equals this value are kept.
# NOTE(review): this section is titled "Knowledge only" but the flag is False,
# i.e. samples flagged has_knowledge == True are dropped -- confirm which of
# the two (title or flag) is the intended one.
with_knowledge = False

# Token ids per tokenizer family. Every model directory that is not named
# "mistral" is processed with the "llama" ids.
#   tokens_to_remove    : tokens to remove from the attribution
#   state_separator     : separator passed for every run
#   knowledge_separator : second separator, passed for every run except "*none.pkl"
#   prompt              : prompt to separate from the rest of the dialogue
DSTC9_TOKEN_IDS = {
    "mistral": {
        "tokens_to_remove": [1, 2, 28705, 733, 16289, 28793, 28748],
        "state_separator": [27304, 441, 1665, 28747],
        "knowledge_separator": [11308, 3829, 28747],
        "prompt": [560, 272, 2296, 7114, 264, 2188, 5659, 298, 6619, 741, 5541, 304, 3208, 1316, 477, 396, 13892, 28723, 13718, 441, 272, 7114, 395, 272, 2899, 302, 272, 13892, 28723],
    },
    "llama": {
        "tokens_to_remove": [1, 2, 518, 25580, 29962, 3532, 14816, 29903, 6778, 29966, 829],
        "state_separator": [18878, 434, 2106, 29901],
        "knowledge_separator": [19320, 5485, 29901],
        "prompt": [797, 278, 1494, 14983, 263, 1404, 10753, 304, 6176, 777, 7306, 322, 4225, 1371, 515, 385, 20255, 29889, 2866, 14150, 278, 14983, 411, 278, 2933, 310, 278, 20255, 29889],
    },
}

scores = {}
top_k_scores = {}
for model in Path("../output/DSTC9/").iterdir():
    if not model.is_dir():
        continue
    token_ids = DSTC9_TOKEN_IDS["mistral" if model.name == "mistral" else "llama"]
    # tokens to remove from the attribution
    tokens_to_remove = list(token_ids["tokens_to_remove"])
    for item in model.iterdir():
        if item.is_dir():
            # Sub-directory: its integrated-gradients files are stored under "<model>_ft_*".
            setting = "ft"
            files = [file for file in item.iterdir() if file.name.startswith("integrated_gradients")]
        elif item.name.startswith("integrated_gradients"):
            # File directly inside the model directory: stored under "<model>_prompt_*".
            setting = "prompt"
            files = [item]
        else:
            continue
        for file in files:
            # name of the segments / tokens to separate the segments
            # (fresh lists for every file, so the callee never sees shared state)
            segments = ["dialogue_history", "dialogue_state"]
            tokens_to_find = [list(token_ids["state_separator"])]
            if not file.name.endswith("none.pkl"):
                segments.append("knowledge")
                tokens_to_find.append(list(token_ids["knowledge_separator"]))

            # SECURITY: pickle.load can execute arbitrary code; only load
            # attribution dumps that come from a trusted source.
            with open(file, "rb") as f:
                int_grad = pickle.load(f)
            int_grad = filter_by_knowledge(int_grad, samples_to_eval, with_knowledge)

            if setting == "prompt":
                # "prompt" runs have an additional leading segment and pass the prompt ids
                segments.insert(0, "prompt")
                prompt_to_find = [list(token_ids["prompt"])]
                sc, top_k_sc = compute_average_attribution(int_grad, top_k_percentage, tokens_to_remove, tokens_to_find, prompt_to_find)
            else:
                sc, top_k_sc = compute_average_attribution(int_grad, top_k_percentage, tokens_to_remove, tokens_to_find)

            # Text after "integrated_gradients_" minus the last four characters
            # (presumably the ".pkl" extension).
            run_name = f'{model.name}_{setting}_{file.name.split("integrated_gradients_").pop()[:-4]}'
            scores[run_name] = dict(zip(segments, sc))
            top_k_scores[run_name] = dict(zip(segments, top_k_sc))


In [22]:
import pandas as pd

# Rows are the run names collected in `top_k_scores`, columns are the segment
# names; values are scaled by 100 and rounded to two decimals for display.
segment_columns = ["prompt", "dialogue_history", "dialogue_state", "knowledge"]
df = pd.DataFrame(top_k_scores).transpose().mul(100)
display(df.loc[:, segment_columns].round(2))

Unnamed: 0,prompt,dialogue_history,dialogue_state,knowledge
llama_ft_gold,,26.98,19.18,53.84
llama_prompt_retrieved_top-1,32.26,26.11,11.03,30.61
llama_ft_retrieved_top-3,,24.54,20.28,55.18
llama_ft_none,,44.73,55.27,
llama_prompt_none,40.27,32.1,27.64,
llama_prompt_gold,30.68,25.19,15.05,29.08
llama_prompt_retrieved_top-3,24.81,24.13,16.42,34.63
llama_ft_retrieved_top-1,,19.81,15.72,64.47
mistral_ft_gold,,18.37,34.24,47.38
mistral_prompt_retrieved_top-1,67.23,10.33,16.84,5.61
