In [1]:
import copy
import os
import shutil

import matplotlib.pyplot as plt
import nlp
import numpy as np
import pandas as pd
import scipy as sp
import shap
import torch
from shap.utils import cal_conditional_logits
from transformers import AutoModelForSeq2SeqLM, AutoModelWithLMHead, AutoTokenizer

%matplotlib inline
# Display every column and row, and never truncate cell text.
# Fully-qualified option names are used throughout: an abbreviated pattern
# such as "max_rows" can match more than one registered pandas option, in
# which case set_option raises OptionError instead of setting anything.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

In [2]:
# Checkpoint used for both the tokenizer and the model (English -> Spanish).
MODEL_NAME = "Helsinki-NLP/opus-mt-en-es"
# Prefer the GPU, but fall back to CPU so this cell also runs without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# AutoModelForSeq2SeqLM is the supported replacement for the deprecated
# AutoModelWithLMHead when loading encoder-decoder checkpoints.
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(device)

In [19]:
def gen_kwargs(x):
    """Build the additional keyword arguments that are passed on to the model function.

    Tokenizes ``x`` (truncated to at most 512 tokens), lets the model generate
    its output for it, and returns the generated ids without their last
    position as the decoder inputs whose conditional logits are explained.

    Args:
        x: A single input sentence (string).

    Returns:
        dict with two entries, both located on the model's device:
            'decoder_inputs': ``model.generate`` output minus its last column.
            'input_ids': the tokenized input sentence.
    """
    # `min_length` is not a tokenizer argument; passing it only triggered a
    # "Keyword arguments {'min_length': 0} not recognized." warning, so it is
    # no longer forwarded.
    inputs = tokenizer([x], max_length=512, return_tensors='pt', truncation=True)
    # Follow the model's device instead of assuming CUDA is available.
    input_ids = inputs['input_ids'].to(model.device)
    with torch.no_grad():
        # Generate the ids of the output translation which we aim to explain.
        generated = model.generate(input_ids)
    return {'decoder_inputs': generated[:, :-1], 'input_ids': input_ids}

In [20]:
def f_kwargs(x):
    """Return the extra arguments handed to the model function for input ``x``.

    These are needed in order to obtain the conditional logits that correspond
    to the translation of the original input sentence. The work is delegated
    entirely to ``gen_kwargs``.
    """
    return gen_kwargs(x)

In [21]:
def f(x_batch, **kwargs):
    """Compute conditional logits for every input string in ``x_batch``.

    Args:
        x_batch: Iterable of input strings.
        **kwargs: Must contain 'decoder_inputs', the decoder input ids for
            which the conditional logits are generated (as produced by
            ``gen_kwargs``).

    Returns:
        ``np.array`` built from one ``cal_conditional_logits`` result per
        element of ``x_batch``, in input order.

    Raises:
        KeyError: if 'decoder_inputs' is missing from ``kwargs``.
    """
    # Extract decoder inputs for which we want to generate conditional logits.
    decoder_inputs = kwargs['decoder_inputs']
    output_batch = []
    for x in x_batch:
        # Use the same 512-token limit as gen_kwargs so the encoder input seen
        # here is truncated exactly like the one the translation came from.
        inputs = tokenizer([x], max_length=512, return_tensors='pt', truncation=True)
        conditional_logits = cal_conditional_logits(
            inputs['input_ids'], model, tokenizer, decoder_inputs
        )
        output_batch.append(conditional_logits)
    return np.array(output_batch)

In [22]:
def example_summarize(x, model, tokenizer):
    """Generate output ids for ``x``, print the input and decoded output, and return the ids.

    The function is generic over the model passed in: it tokenizes ``x``
    (truncated to at most 512 tokens), calls ``model.generate`` and decodes
    the result with ``tokenizer.decode``.

    Args:
        x: A single input sentence (string).
        model: Object exposing ``generate(input_ids)`` and a ``device`` attribute.
        tokenizer: Callable tokenizer that also exposes ``decode``.

    Returns:
        NumPy array with the generated ids of the first (only) sequence,
        with its first and last positions dropped.
    """
    print(f"Input: {x}")
    inputs = tokenizer([x], max_length=512, return_tensors='pt', truncation=True)
    # Run on whatever device the given model lives on rather than assuming CUDA.
    input_ids = inputs['input_ids'].to(model.device)
    summary_ids = model.generate(input_ids).detach().cpu().numpy()
    del input_ids
    summary = [
        tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False)
        for g in summary_ids
    ]
    print(f"summary: {summary[0]}")
    return summary_ids[0, 1:-1]

In [23]:
# Example sentence that is translated and then explained.
s = "In this picture, there are four persons: my father, my mother, my brother and my sister."
# Generated ids with the first and last positions stripped (see example_summarize).
ids = example_summarize(s, model=model, tokenizer=tokenizer)

Input: In this picture, there are four persons: my father, my mother, my brother and my sister.
summary: En este cuadro, hay cuatro personas: mi padre, mi madre, mi hermano y mi hermana.


In [8]:
# Build the SHAP explainer around the model function `f`. The tokenizer is the
# second positional argument (presumably used as the text masker - confirm
# against the installed shap version), and `f_kwargs` supplies the per-input
# keyword arguments that are forwarded to `f`.
explainer = shap.Explainer(f,tokenizer,model_kwargs=f_kwargs)

In [9]:
# Explain the example sentence; it is passed as a one-element batch, so the
# explanation for `s` is later read back as shap_values[0].
shap_values = explainer([s])

Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Keyword arguments {'min_length': 0} not recognized.
Partition explainer: 2it [00:11,  5.87s/it]               


In [24]:
# Print each generated token and render a text plot for it. The plot for
# position i uses the slice shap_values[0][:, i] (presumably the attributions
# of the input tokens towards that output position - confirm against shap).
# The loop variable is named token_id so the builtin `id` is not shadowed in
# the notebook's global namespace.
for i, token_id in enumerate(ids):
    print(f'Output token: {tokenizer.convert_ids_to_tokens(int(token_id))}')
    shap.plots.text(shap_values[0][:, i])

Output token: ▁En


Output token: ▁este


Output token: ▁cuadro


Output token: ,


Output token: ▁hay


Output token: ▁cuatro


Output token: ▁personas


Output token: :


Output token: ▁mi


Output token: ▁padre


Output token: ,


Output token: ▁mi


Output token: ▁madre


Output token: ,


Output token: ▁mi


Output token: ▁hermano


Output token: ▁y


Output token: ▁mi


Output token: ▁hermana


Output token: .
