In [2]:
import ipywidgets as widgets
import ipyvuetify as v
from fastai.text.all import *
from fastai.callback.fp16 import *
import pathlib
from pathlib import Path
#for download
import gdown
# for attribution
from captum.attr import LayerIntegratedGradients
import gc

In [2]:
# Local filename under which the exported learner is stored.
model_file = 'fastai_133_langs_v3'

# Google Drive location the exported learner is downloaded from.
MODEL_URL = "https://drive.google.com/uc?id=1Qf8ZMbzoEFSGxQ04DOPo01BEcL-43qeu"

# Only hit the network when the file is not already on disk, so that
# re-running the notebook does not download the model again every time.
if not Path(model_file).is_file():
    downloaded_to = gdown.download(MODEL_URL, model_file, quiet=True)
    # NOTE(review): some gdown versions signal failure by returning None
    # instead of raising - fail loudly here rather than later in load_learner.
    if downloaded_to is None:
        raise RuntimeError(f'Could not download the model from {MODEL_URL}')


'fastai_133_langs_v3'

In [3]:
# Character-level tokeniser; it has to be defined in this notebook so that the saved fastai model can be loaded.
from collections.abc import Iterable

def flatten(l):
    """Lazily yield the leaf elements of an arbitrarily nested iterable.

    Strings and bytes are treated as leaves (they are yielded whole and never
    split into characters); every other iterable is descended into, depth
    first and in order.
    """
    atomic_types = (str, bytes)
    for item in l:
        is_nested = isinstance(item, Iterable) and not isinstance(item, atomic_types)
        if not is_nested:
            yield item
            continue
        # Recurse into the nested container and pass its leaves through.
        for leaf in flatten(item):
            yield leaf
class CharTokenizer():
    """Callable tokenizer that splits text into single characters.

    Each input string is split on whitespace; every resulting word is broken
    into its characters, except for the fastai special tokens, which are kept
    as single tokens.
    """

    def __call__(self, items):
        # fastai's special tokens must survive tokenisation untouched.
        reserved_tokens = ['xxunk','xxpad','xxbos','xxeos','xxfld','xxrep','xxwrep','xxup','xxmaj']

        # One token list per input string, e.g.
        # ['xxbos', 'xxmaj', 'h', 'e', 'l', 'l', 'o', ...]
        tokenised = []
        for text in items:
            pieces = []
            for chunk in text.split():
                if chunk in reserved_tokens:
                    pieces.append(chunk)
                else:
                    # Extending with a string appends its characters one by one.
                    pieces.extend(chunk)
            tokenised.append(pieces)

        # The token lists are handed back through a generator.
        return (tokens for tokens in tokenised)


In [4]:
# Windows only: un-comment the next two lines before loading the model.
# They alias pathlib.PosixPath to pathlib.WindowsPath (presumably because the
# exported learner contains PosixPath objects - verify against the export).
# temp = pathlib.PosixPath
# pathlib.PosixPath = pathlib.WindowsPath

# Linux: no patching needed, load the exported learner directly.
# NOTE(review): fastai documents load_learner as pickle-based, so only load
# model files from a trusted source - confirm the provenance of the download.
learner = load_learner(model_file)

In [5]:
# calculate attribution for each character
def get_attributions_for_sentence(sentence, 
                                  awd=learner, 
                                  target = None, 
                                  lig_n_steps = 200,
                                  baseline_token='xxunk'):
    """Compute a normalised integrated-gradients attribution per input token.

    Args:
        sentence: text to explain; it is numericalized with ``awd.dls``.
        awd: fastai learner whose model is explained (defaults to the
            notebook's ``learner``).
        target: class index to attribute towards; when ``None`` the class
            index predicted by ``awd.predict`` is used.
        lig_n_steps: ``n_steps`` passed to ``LayerIntegratedGradients.attribute``.
        baseline_token: vocab token that fills the baseline input.

    Returns:
        A ``(attributions, prediction)`` tuple: a ``pd.Series`` of attribution
        values indexed by token, scaled to unit L2 norm (left unscaled when
        the norm is zero), and the raw result of ``awd.predict(sentence)``.

    Raises:
        ValueError: if numericalizing ``sentence`` yields no tokens.
    """
    # getting to the actual layer that holds embeddings
    embedding_layer = awd.model[0]._modules['module']._modules['encoder_dp']

    # working around the model prediction - first output only, apply softmax
    def forward_func(x):
        return torch.softmax(awd.model(x)[0], dim=-1)

    # make integrated gradients instance
    lig = LayerIntegratedGradients(forward_func, embedding_layer)

    vocab = awd.dls.vocab[0]
    num_sentence_tokens = awd.dls.numericalize(sentence).view(1, -1)
    if num_sentence_tokens.numel() == 0:
        # Without this guard the baseline[0, 0] assignment below fails with
        # an opaque IndexError.
        raise ValueError('sentence must contain at least one token')
    sentence_tokens = [vocab[i] for i in num_sentence_tokens[0]]

    # Baseline with the same shape/dtype/device as the input, filled with the
    # baseline token. torch.full avoids re-wrapping an existing tensor with
    # torch.tensor(), which triggers a copy-construct UserWarning.
    baseline = torch.full(num_sentence_tokens.shape,
                          vocab.index(baseline_token),
                          dtype=num_sentence_tokens.dtype,
                          device=num_sentence_tokens.device)
    # NOTE(review): position 0 of the baseline is forced to 'xxbos'; confirm
    # that numericalize() also puts 'xxbos' first in num_sentence_tokens.
    baseline[0,0] = vocab.index('xxbos')

    y = awd.predict(sentence)
    if target is None:
        # second element of predict()'s result is the predicted class index
        target = y[1].item()

    attrs = lig.attribute(num_sentence_tokens, baseline, target, n_steps=lig_n_steps)
    # collapse the embedding dimension: one attribution value per token
    a = attrs.sum(-1)
    norm = torch.norm(a)
    if norm > 0:
        # skip the division for an all-zero result, which would produce NaNs
        a = a / norm
    return (
        # detach()/cpu() keep the conversion valid for grad-tracking or GPU tensors
        pd.Series(a.detach().cpu().numpy()[0], index=sentence_tokens),
        y
    )

In [6]:
# display
class Chip(v.Chip):
    """Small labelled chip showing one token, tinted by its attribution.

    Non-negative attributions use the ``positive`` colour, negative ones the
    ``negative`` colour; the absolute attribution (two decimals) becomes the
    alpha channel of the chip colour.
    """
    # RGB components (without alpha) for the two attribution directions.
    positive = '0, 255, 0'
    negative = '255, 0, 0'

    def __init__(self, word, attribution):
        if attribution >= 0:
            rgb = self.positive
        else:
            rgb = self.negative
        opacity = format(abs(attribution), '.2f')
        super().__init__(
            class_='mx-0 px-1',
            children=[word],
            color=f'rgba({rgb}, {opacity})',
            value=attribution,
            label=True,
            small=True,
        )
        
def saliency_chips(attributions:pd.Series) -> v.ChipGroup:
    """Build a chip group with one coloured ``Chip`` per (token, attribution) pair.

    Args:
        attributions: Series whose index holds the tokens and whose values
            hold the attribution of each token.

    Returns:
        A ``v.ChipGroup`` (``column=True``) containing the chips in Series order.
    """
    # Series.iteritems() was removed in pandas 2.0; items() yields the same
    # (index, value) pairs and is available in older pandas as well.
    children = [Chip(w, a) for w, a in attributions.items()]
    return v.ChipGroup(column=True, children=children)

In [12]:
# ipywidgets
# Free-text box holding the sentence to analyse.
inp_text = widgets.Text(description='Text:', placeholder='Type your text', disabled=False)

# Sample sentences in several languages, shown one per line.
lbl_example = widgets.HTML(
    description='Example texts:',
    placeholder='Some HTML',
    value=(
        "彼の発言で私の希望は失われた。<br/>"
        "他的話讓我失去了希望。<br/>"
        "His remarks lost my hope.<br/>"
        "Ses remarques m'ont fait perdre espoir.<br/>"
        "Sus comentarios perdieron mi esperanza.<br/>"
        "उनकी टिप्पणियों ने मेरी आशा खो दी।"
    ),
)

# Label that receives the detected language.
lbl_pred = widgets.Label()

# Output area that receives the attribution chips.
out_pl = widgets.Output()

# Button that starts detection and explanation.
btn_run = widgets.Button(description='Detect & Explain')

In [8]:
# trigger function
def on_click_classify(change):
    """Button handler: detect the language of the input text and explain it.

    Clears the previous result, runs ``get_attributions_for_sentence`` on the
    stripped text of ``inp_text``, writes the detected language to
    ``lbl_pred`` and renders the attribution chips into ``out_pl``.

    Args:
        change: the argument ipywidgets passes to click callbacks (unused).
    """
    out_pl.clear_output()
    lbl_pred.value = ''
    text = inp_text.value.strip()
    if not text:
        # Nothing to analyse - tell the user instead of failing in the model code.
        lbl_pred.value = 'Please enter some text first.'
        return

    gc.collect()
    # Attribution is slow; block further clicks until this run has finished.
    btn_run.disabled = True
    try:
        attributions, prediction = get_attributions_for_sentence(text)
    except Exception as err:
        # Surface the failure in the UI, then re-raise so it is still logged.
        lbl_pred.value = f'Could not analyse the text: {err}'
        raise
    finally:
        btn_run.disabled = False

    lbl_pred.value = f'Detected language: {prediction[0]}'
    with out_pl:
        display(saliency_chips(attributions))

btn_run.on_click(on_click_classify)


In [9]:
#final layout
# Bordered panel holding the prediction label and the attribution chips.
results_panel = widgets.VBox(
    [lbl_pred, widgets.Label('Attribution...'), out_pl],
    layout={'border': '1px solid black'},
)

# The whole app, stacked vertically; as the last expression it is displayed.
widgets.VBox([
    widgets.Label('Detect Language!'),
    inp_text,
    lbl_example,
    btn_run,
    results_panel,
])

VBox(children=(Label(value='Detect Language!'), Text(value='彼の発言で私の希望は失われた。', description='Text:', placeholder…