In [None]:
#export
from local.torch_basics import *
from local.test import *
from local.core import *
from local.data.all import *
from local.text.core import *

In [None]:
from local.notebook.showdoc import *

In [None]:
#default_exp text.data
#default_cls_lvl 3

# Text data

> Functions and transforms to help gather text data in a `DataSource`

## Numericalizing

In [None]:
#export
def make_vocab(count, min_freq=3, max_vocab=60000, special_toks=None):
    """Create a vocab of at most `max_vocab` tokens from `Counter` `count`, keeping items seen at least `min_freq` times.

    count:        `Counter` of token frequencies
    min_freq:     minimum count (inclusive) a token needs to be kept
    max_vocab:    maximum number of real tokens, special tokens included
    special_toks: tokens forced to the front of the vocab, in this order (defaults to `defaults.text_spec_tok`)

    The returned list is padded with 'xxfake' entries so that its length is a multiple of 8.
    """
    if special_toks is None: special_toks = defaults.text_spec_tok
    vocab = [o for o,c in count.most_common(max_vocab) if c >= min_freq]
    for o in reversed(special_toks): #Make sure all special tokens are in the vocab
        if o in vocab: vocab.remove(o)
        vocab.insert(0, o)
    vocab = vocab[:max_vocab]
    # `-len % 8` is 0 when the length is already a multiple of 8, so no filler is added in that case
    return vocab + ['xxfake'] * (-len(vocab) % 8)

In [None]:
# Frequencies: 'a' x4, 'b' x2, 'c' x2, 'd' x1
count = Counter(['a', 'a', 'a', 'a', 'b', 'b', 'c', 'c', 'd'])
# With the default min_freq=3 only 'a' survives next to the special tokens
test_eq(set([x for x in make_vocab(count) if not x.startswith('xxfake')]), 
        set(defaults.text_spec_tok + 'a'.split()))
# The 'xxfake' filler makes the vocab length a multiple of 8
test_eq(len(make_vocab(count))%8, 0)
# min_freq=1 keeps every token
test_eq(set([x for x in make_vocab(count, min_freq=1) if not x.startswith('xxfake')]), 
        set(defaults.text_spec_tok + 'a b c d'.split()))
# max_vocab=12 truncates after the special tokens were inserted, which drops the rarest token 'd'
test_eq(set([x for x in make_vocab(count,max_vocab=12, min_freq=1) if not x.startswith('xxfake')]), 
        set(defaults.text_spec_tok + 'a b c'.split()))

In [None]:
#export
class TensorText(TensorBase):
    "Tensor type for numericalized text (what `Numericalize.encodes` returns)"

class LMTensorText(TensorText):
    "`TensorText` used for the inputs of language-model batches (see `LMDataLoader.create_item`)"

In [None]:
# export
class Numericalize(Transform):
    "Reversible transform of tokenized texts to numericalized ids"
    def __init__(self, vocab=None, min_freq=3, max_vocab=60000, sep=' '):
        self.vocab,self.min_freq,self.max_vocab,self.sep = vocab,min_freq,max_vocab,sep
        # Same mapping as the one built in `setup`: the 'xxfake' filler entries are not real tokens,
        # so they get no id of their own and fall back to the default id 0 like any unknown token
        self.o2i = None if vocab is None else defaultdict(int, {v:k for k,v in enumerate(vocab) if v != 'xxfake'})

    def setup(self, dsrc):
        "Build `vocab` and `o2i` from the tokens of `dsrc`, unless a vocab was passed at construction"
        if dsrc is None: return
        if self.vocab is None:
            # `dsrc` is iterated as a collection of token lists
            count = Counter(p for o in dsrc for p in o)
            self.vocab = make_vocab(count, min_freq=self.min_freq, max_vocab=self.max_vocab)
            self.o2i = defaultdict(int, {v:k for k,v in enumerate(self.vocab) if v != 'xxfake'})

    # Tokens missing from `o2i` get id 0 (the `defaultdict(int)` default)
    def encodes(self, o): return TensorText(tensor([self.o2i[o_] for o_ in o]))
    # Ids are mapped back through `vocab` and joined with `sep`; `PAD` tokens are left out
    def decodes(self, o): return Str(self.sep.join([self.vocab[o_] for o_ in o if self.vocab[o_] != PAD]))

In [None]:
# No vocab is given, so `setup` builds it from the two tokenized sentences
num = Numericalize(min_freq=1, sep=' ')
num.setup(L('This is an example of text'.split(), 'this is another text'.split()))
# min_freq=1 keeps every token; 'This' and 'this' are distinct entries
test_eq(set([x for x in num.vocab if not x.startswith('xxfake')]), 
        set(defaults.text_spec_tok + 'This is an example of text this another'.split()))
test_eq(len(num.vocab)%8, 0)
start = 'This is an example of text'
# `t` is checked in the next cell
t = num(start.split())

In [None]:
# 'is' and 'text' occur twice, so they get the lowest word ids (9 and 10); the other words follow in first-seen order.
# Presumably ids 0-8 are taken by the special tokens -- TODO confirm against `defaults.text_spec_tok`
test_eq(t, tensor([11, 9, 12, 13, 14, 10]))
# Decoding gives back the original sentence
test_eq(num.decode(t), start)

In [None]:
# With min_freq=2 only the tokens seen twice ('is', 'text') are kept
num = Numericalize(min_freq=2, sep=' ')
num.setup(L('This is an example of text'.split(), 'this is another text'.split()))
test_eq(set([x for x in num.vocab if not x.startswith('xxfake')]), 
        set(defaults.text_spec_tok + 'is text'.split()))
test_eq(len(num.vocab)%8, 0)
# Reuses `start` from the cell above: every out-of-vocab token is encoded as 0, which decodes to `UNK`
t = num(start.split())
test_eq(t, tensor([0, 9, 0, 0, 0, 10]))
test_eq(num.decode(t), f'{UNK} is {UNK} {UNK} {UNK} text')

## LM_DataLoader -

In [None]:
#export
#TODO: add backward
@delegates()
class LMDataLoader(TfmdDL):
    "Data loader for language modelling: serves `seq_len`-token windows read across the texts, with targets shifted by one token"
    def __init__(self, dataset, lens=None, cache=2, bs=64, seq_len=72, num_workers=0, **kwargs):
        super().__init__(dataset=dataset, bs=bs, num_workers=num_workers, **kwargs)
        # Keep only the text of each sample (its first element when samples are tuples)
        self.items = ReindexCollection([(o[0] if isinstance(o, tuple) else o)
                                          for o in dataset], cache=cache)
        self.seq_len = seq_len
        # Callers can pass precomputed lengths in `lens`; otherwise every item is measured here
        if lens is None: lens = [len(o) for o in self.items]
        # Shares `idxs` with `self.items` -- presumably so that lengths stay aligned when items are shuffled (TODO confirm)
        self.lens = ReindexCollection(lens, idxs=self.items.idxs)
        # The "-1" is to allow for final label
        self.m = round_multiple(sum(lens)-1, bs*seq_len, round_down=True)
        # `m`: number of tokens used, `n`: number of sequences of `seq_len` tokens, `spb`: sequences per batch row
        self.n = self.m//(seq_len)
        self.spb = self.n//bs
        self.make_chunks()

    # `Chunks` is what `create_item` slices with positions counted over the whole token stream
    def make_chunks(self): self.chunks = Chunks(self.items, self.lens)
    # Shuffling reorders the texts (then rebuilds the chunks); the sequence indices `idxs` are returned unchanged
    def shuffle_fn(self,idxs):
        self.items.shuffle()
        self.make_chunks()
        return idxs

    def create_item(self, seq):
        "Return the (input, target) pair number `seq`, each `seq_len` tokens long; raises `IndexError` when `seq>=self.n`"
        if seq>=self.n: raise IndexError
        # `seq%bs` is the row inside the batch and `seq//bs` the batch number: row r reads the contiguous
        # stretch of `spb` sequences starting at r*spb, so a row continues where it stopped in the previous batch
        st = ((seq%self.bs)*self.spb + (seq//self.bs)) * self.seq_len
        # One extra token is read so that the target can be the input shifted by one position
        txt = self.chunks[st : st+self.seq_len+1]
        return LMTensorText(txt[:-1]),txt[1:]

In [None]:
# Six "texts" of various lengths holding the consecutive ints 0..24, so a token's value is its position in the stream
bs,sl = 4,3
ints = L([0,1,2,3,4],[5,6,7,8,9,10],[11,12,13,14,15,16,17,18],[19,20],[21,22,23],[24]).map(tensor)

In [None]:
# 25 tokens: (25-1) rounded down to a multiple of bs*sl=12 gives 24 usable tokens, i.e. 8 sequences = 2 batches of 4 rows.
# Each row reads 6 consecutive tokens over the two batches, and targets are the inputs shifted by one
dl = LMDataLoader(ints, bs=bs, seq_len=sl)
test_eq(list(dl),
    [[tensor([[0, 1, 2], [6, 7, 8], [12, 13, 14], [18, 19, 20]]),
      tensor([[1, 2, 3], [7, 8, 9], [13, 14, 15], [19, 20, 21]])],
     [tensor([[3, 4, 5], [ 9, 10, 11], [15, 16, 17], [21, 22, 23]]),
      tensor([[4, 5, 6], [10, 11, 12], [16, 17, 18], [22, 23, 24]])]])

In [None]:
#hide
# Check that passing precomputed `lens` gives the same batches as letting the loader measure the items
dl = LMDataLoader(ints, lens=ints.map(len), bs=bs, seq_len=sl)
test_eq(list(dl),
    [[tensor([[0, 1, 2], [6, 7, 8], [12, 13, 14], [18, 19, 20]]),
      tensor([[1, 2, 3], [7, 8, 9], [13, 14, 15], [19, 20, 21]])],
     [tensor([[3, 4, 5], [ 9, 10, 11], [15, 16, 17], [21, 22, 23]]),
      tensor([[4, 5, 6], [10, 11, 12], [16, 17, 18], [22, 23, 24]])]])

In [None]:
dl = LMDataLoader(ints, bs=bs, seq_len=sl, shuffle=True)
# Whatever the order of the shuffled texts, targets are the inputs shifted by one token
for x,y in dl: test_eq(x[:,1:], y[:,:-1])
((x0,y0), (x1,y1)) = tuple(dl)
#Second batch begins where first batch ended
test_eq(y0[:,-1], x1[:,0]) 
# Batched inputs keep their `LMTensorText` type
test_eq(type(x0), LMTensorText)

### Showing

In [None]:
#export
@typedispatch
def show_batch(x: TensorText, y, samples, ctxs=None, max_n=10, **kwargs):
    "Fill `ctxs` with the generic `show_batch`, display them as a DataFrame and return them"
    if ctxs is None:
        # One context per sample shown, capped at `max_n`
        n_ctxs = min(len(samples), max_n)
        ctxs = get_empty_df(n_ctxs)
    generic_show = show_batch[object]
    ctxs = generic_show(x, y, samples, max_n=max_n, ctxs=ctxs, **kwargs)
    frame = pd.DataFrame(ctxs)
    display_df(frame)
    return ctxs

In [None]:
#export
@typedispatch
def show_batch(x: LMTensorText, y, samples, ctxs=None, max_n=10, **kwargs):
    "Show a language-model batch through the `TensorText` implementation, passing `None` in place of `y`"
    show_text = show_batch[TensorText]
    return show_text(x, None, samples, ctxs=ctxs, max_n=max_n, **kwargs)

## Integration example

In [None]:
# Fetch the IMDB sample dataset and load its `texts.csv` into a DataFrame
path = untar_data(URLs.IMDB_SAMPLE)
df = pd.read_csv(path/'texts.csv')
df.head(2)

Unnamed: 0,label,text,is_valid
0,negative,"Un-bleeping-believable! Meg Ryan doesn't even look her usual pert lovable self in this, which normally makes me forgive her shallow ticky acting schtick. Hard to believe she was the producer on this dog. Plus Kevin Kline: what kind of suicide trip has his career been on? Whoosh... Banzai!!! Finally this was directed by the guy who did Big Chill? Must be a replay of Jonestown - hollywood style. Wooofff!",False
1,positive,"This is a extremely well-made film. The acting, script and camera-work are all first-rate. The music is good, too, though it is mostly early in the film, when things are still relatively cheery. There are no really superstars in the cast, though several faces will be familiar. The entire cast does an excellent job with the script.<br /><br />But it is hard to watch, because there is no good end to a situation like the one presented. It is now fashionable to blame the British for setting Hindus and Muslims against each other, and then cruelly separating them into two countries. There is som...",False


In [None]:
# Tokenize the 'text' column. `count` is passed to `make_vocab` below, so it is presumably a `Counter`
# of token frequencies -- TODO confirm against `tokenize_df`
df_tok,count = tokenize_df(df, 'text')
df_tok.head(2)

Unnamed: 0,label,is_valid,text
0,negative,False,"[xxbos, xxmaj, un, -, bleeping, -, believable, !, xxmaj, meg, xxmaj, ryan, does, n't, even, look, her, usual, pert, lovable, self, in, this, ,, which, normally, makes, me, forgive, her, shallow, ticky, acting, schtick, ., xxmaj, hard, to, believe, she, was, the, producer, on, this, dog, ., xxmaj, plus, xxmaj, kevin, xxmaj, kline, :, what, kind, of, suicide, trip, has, his, career, been, on, ?, xxmaj, whoosh, …, xxmaj, banzai, xxrep, 3, !, xxmaj, finally, this, was, directed, by, the, guy, who, did, xxmaj, big, xxmaj, chill, ?, xxmaj, must, be, a, replay, of, xxmaj, jonestown, -, hollywood,..."
1,positive,False,"[xxbos, xxmaj, this, is, a, extremely, well, -, made, film, ., xxmaj, the, acting, ,, script, and, camera, -, work, are, all, first, -, rate, ., xxmaj, the, music, is, good, ,, too, ,, though, it, is, mostly, early, in, the, film, ,, when, things, are, still, relatively, cheery, ., xxmaj, there, are, no, really, superstars, in, the, cast, ,, though, several, faces, will, be, familiar, ., xxmaj, the, entire, cast, does, an, excellent, job, with, the, script, ., \n\n, xxmaj, but, it, is, hard, to, watch, ,, because, there, is, no, good, end, to, a, situation, like, the, one, ...]"


In [None]:
# Random split of the row indices
splits = RandomSplitter()(range_of((df_tok)))
# The vocab is built up front from the token counts, so `Numericalize` does not need to compute it in `setup`
tfm = Numericalize(make_vocab(count))
# Single pipeline: take the `text` column, then numericalize it; batches will come from `LMDataLoader`
dsrc = DataSource(df_tok, [[attrgetter('text'), tfm]], splits=splits, dl_type=LMDataLoader)

In [None]:
# Show the first training text decoded back to tokens (out-of-vocab tokens appear as xxunk)
show_at(dsrc.train, 0)

xxbos xxmaj weaker entry in the xxmaj xxunk xxmaj drummond series , with xxmaj john xxmaj howard in the role . xxmaj usual funny xxunk and antics , but not much plot . xxmaj barrymore gets something to do as the inspector , xxunk xxunk to follow xxmaj drummond , xxmaj algy , and xxmaj xxunk on a wild xxunk chase ( mostly in circles ; perhaps the budget was tighter than usual ) to rescue poor xxmaj xxunk , who is being held captive by people who want to lure xxmaj drummond to his doom . xxmaj for those keeping score , in this one , xxmaj drummond is planning to ask xxmaj xxunk to marry him and xxmaj algy is worried about missing the baby 's xxunk . xxmaj it 's fun to see xxmaj algy and xxmaj xxunk dressed up as xxunk to blend in at xxmaj the xxmaj xxunk 's xxmaj rest , but little of it rises above silly .


In [None]:
# `bs` and `seq_len` are presumably forwarded to `LMDataLoader`, the `dl_type` of `dsrc` -- TODO confirm
dbunch = dsrc.databunch(bs=16, seq_len=72)

In [None]:
# Each row shows an input sequence (`text`) and its target (`text_`), i.e. the input shifted by one token
dbunch.show_batch(max_n=6)

Unnamed: 0,text,text_
0,"xxbos xxmaj adrian has just gone out of the asylum , being rich and with no parents , his life seems empty . xxmaj one day , he meets xxmaj xxunk , a poor boy whom mother is prostitute . xxmaj desperate for xxunk some money , xxmaj xxunk helps xxmaj adrian to search about his life and who where his parents . xxmaj this is a movie from a new director","xxmaj adrian has just gone out of the asylum , being rich and with no parents , his life seems empty . xxmaj one day , he meets xxmaj xxunk , a poor boy whom mother is prostitute . xxmaj desperate for xxunk some money , xxmaj xxunk helps xxmaj adrian to search about his life and who where his parents . xxmaj this is a movie from a new director ,"
1,"sense of xxunk xxunk , do n't be surprised . xxmaj all they did was change the setting of the story and tell it differently but the differences are not significant . xxmaj and it does n't get any better because the plot is flawed to begin with . xxmaj it never works . xxmaj and like its predecessors , the acting is mediocre . \n\n xxmaj the plot has a unique","of xxunk xxunk , do n't be surprised . xxmaj all they did was change the setting of the story and tell it differently but the differences are not significant . xxmaj and it does n't get any better because the plot is flawed to begin with . xxmaj it never works . xxmaj and like its predecessors , the acting is mediocre . \n\n xxmaj the plot has a unique ending"
2,"xxmaj he was deeply moved by the audience xxunk of him and film . xxmaj both he and xxmaj clark were as likable in person as they were in the film . ) \n\n xxmaj riding xxmaj giants pays homage to these extraordinary xxunk while at the same time rewarding us with an insight into the xxunk and terrifying power of the waves they seek to xxunk , the gut - wrenching","he was deeply moved by the audience xxunk of him and film . xxmaj both he and xxmaj clark were as likable in person as they were in the film . ) \n\n xxmaj riding xxmaj giants pays homage to these extraordinary xxunk while at the same time rewarding us with an insight into the xxunk and terrifying power of the waves they seek to xxunk , the gut - wrenching xxunk"
3,"wilson and xxmaj robert xxmaj blake are excellent as the killers as is the supporting cast , including xxmaj john xxmaj xxunk and xxmaj paul xxmaj stewart as the reporter ( the xxmaj capote "" character ? "" ) xxmaj the landmark photography is by the great xxmaj xxunk xxmaj hall . xxbos xxmaj the xxmaj blob starts with one of the most bizarre theme songs ever , xxunk by an uncredited","and xxmaj robert xxmaj blake are excellent as the killers as is the supporting cast , including xxmaj john xxmaj xxunk and xxmaj paul xxmaj stewart as the reporter ( the xxmaj capote "" character ? "" ) xxmaj the landmark photography is by the great xxmaj xxunk xxmaj hall . xxbos xxmaj the xxmaj blob starts with one of the most bizarre theme songs ever , xxunk by an uncredited xxmaj"
4,", its ' realistic depiction of a young woman just starting out in life , its ' fine depiction of the struggles she has to go through to make her mark in life , the decisions she makes based on real things , the people she meets - there is nothing wrong with this movie . xxmaj it is as close to movie magic as i have ever seen outside of the","its ' realistic depiction of a young woman just starting out in life , its ' fine depiction of the struggles she has to go through to make her mark in life , the decisions she makes based on real things , the people she meets - there is nothing wrong with this movie . xxmaj it is as close to movie magic as i have ever seen outside of the """
5,"such usage okay because a member of the group uses a xxunk term to refer to the group because he or she is a member of the group ? xxmaj that may be okay to make a point , but it did n't seem to be used that way here . xxmaj and in any case , i do n't care what the reason , it xxunk me , a xxmaj pole","usage okay because a member of the group uses a xxunk term to refer to the group because he or she is a member of the group ? xxmaj that may be okay to make a point , but it did n't seem to be used that way here . xxmaj and in any case , i do n't care what the reason , it xxunk me , a xxmaj pole ."


In [None]:
b = dbunch.one_batch()
# Check the batch that was just fetched (not a variable left over from an earlier cell):
# its input tensor must keep the `LMTensorText` type
test_eq(type(b[0]), LMTensorText)

In [None]:
# The lengths stored by the validation data loader must match the actual length of the validation texts
test_eq(len(dbunch.valid_ds[0][0]), dbunch.valid_dl.lens[0])

## Classification

In [None]:
#export
def pad_input(samples, pad_idx=1, pad_first=False, backwards=False):
    """Pad the first element of every sample to the longest length found in `samples`; optionally reverse token order.

    samples:   iterable of tuples whose first element is a 1-d tensor; the other elements are passed through untouched
    pad_idx:   value used for the padding
    pad_first: put the padding before the tokens instead of after them
    backwards: reverse each padded sequence; the padding still ends up on the side selected by `pad_first`

    Returns a list of tuples in the same order as `samples` (an empty list for empty input).
    """
    # Materialize once: `samples` is traversed twice below, which would silently drop everything for a one-shot iterable
    samples = list(samples)
    if not samples: return []
    max_len = max(len(s[0]) for s in samples)
    # The flip below moves the padding to the other end, so pad on the opposite side beforehand
    if backwards: pad_first = not pad_first
    def _f(x, *y):
        # `new_zeros` keeps the dtype and device of `x`
        pad = x.new_zeros(max_len-x.shape[0])+pad_idx
        x1 = torch.cat([pad, x] if pad_first else [x, pad])
        if backwards: x1 = x1.flip(0)
        # Give the result the same tensor subclass as the input
        return (retain_type(x1, x), *y)
    return [_f(x,*y) for x,*y in samples]

In [None]:
# Default: padding is appended
test_eq(pad_input([(tensor([1,2,3]),1), (tensor([4,5]), 2), (tensor([6]), 3)], pad_idx=0), 
        [(tensor([1,2,3]),1), (tensor([4,5,0]),2), (tensor([6,0,0]), 3)])
# pad_first: padding is prepended
test_eq(pad_input([(tensor([1,2,3]),1), (tensor([4,5]), 2), (tensor([6]), 3)], pad_idx=0, pad_first=True), 
        [(tensor([1,2,3]),1), (tensor([0,4,5]),2), (tensor([0,0,6]), 3)])
# backwards: tokens are reversed, padding stays at the end
test_eq(pad_input([(tensor([1,2,3]),1), (tensor([4,5]), 2), (tensor([6]), 3)], pad_idx=0, backwards=True), 
        [(tensor([3,2,1]),1), (tensor([5,4,0]),2), (tensor([6,0,0]), 3)])
# backwards + pad_first: tokens are reversed, padding stays at the start
test_eq(pad_input([(tensor([1,2,3]),1), (tensor([4,5]), 2), (tensor([6]), 3)], pad_idx=0, pad_first=True, backwards=True), 
        [(tensor([3,2,1]),1), (tensor([0,5,4]),2), (tensor([0,0,6]), 3)])

In [None]:
#hide
# Check that `pad_input` gives back the tensor subclass it received (here `TensorText`)
x = [(TensorText([1,2,3]),1), (TensorText([4,5]), 2), (TensorText([6]), 3)]
y = pad_input(x, pad_idx=0)
for s in y: test_eq(type(s[0]), TensorText)

In [None]:
#export
def _default_sort(x): return len(x[0])

@delegates(TfmdDL)
class SortedDL(TfmdDL):
    "`TfmdDL` that groups samples of similar size, using `sort_func` as the size (length of the first element by default)"
    def __init__(self, dataset, sort_func=None, res=None, **kwargs):
        super().__init__(dataset, **kwargs)
        self.sort_func = _default_sort if sort_func is None else sort_func
        # `res` holds the sort key of every item; pass it in to skip the pass over the dataset done here
        self.res = [self.sort_func(self.do_item(i)) for i in range_of(self.dataset)] if res is None else res
        # Dataset index of the item with the largest key
        self.idx_max = np.argmax(self.res)

    def get_idxs(self):
        "Indices for one pass: returned untouched when shuffling, otherwise sorted by decreasing key"
        idxs = super().get_idxs()
        # NOTE(review): presumably the parent already applied `shuffle_fn` in this case -- confirm in `TfmdDL`
        if self.shuffle: return idxs
        return sorted(idxs, key=lambda i: self.res[i], reverse=True)

    def shuffle_fn(self,idxs):
        "Random order that keeps similar sizes together: sort inside chunks of 50 batches, then shuffle the batches"
        # The incoming `idxs` are ignored: a fresh permutation of the whole dataset is drawn
        idxs = np.random.permutation(len(self.dataset))
        # Position (not value) of the largest item in the permutation, so that the swap really brings it to the front
        idx_max = np.where(idxs==self.idx_max)[0][0]
        idxs[0],idxs[idx_max] = idxs[idx_max],idxs[0]
        # Sort by decreasing key inside chunks of 50 batches; the largest item sits in the first chunk, so it ends up first
        sz = self.bs*50
        chunks = [idxs[i:i+sz] for i in range(0, len(idxs), sz)]
        chunks = [sorted(s, key=lambda i: self.res[i], reverse=True) for s in chunks]
        sort_idx = np.concatenate(chunks)

        # Cut into batches and shuffle them, keeping the first one (it holds the largest item) and the
        # last one (it may be incomplete) in place
        sz = self.bs
        batches = [sort_idx[i:i+sz] for i in range(0, len(sort_idx), sz)]
        # Builtin `int` replaces the `np.int` alias, which NumPy deprecated and later removed
        sort_idx = np.concatenate(np.random.permutation(batches[1:-1])) if len(batches) > 2 else np.array([],dtype=int)
        sort_idx = np.concatenate((batches[0], sort_idx) if len(batches)==1 else (batches[0], sort_idx, batches[-1]))
        return iter(sort_idx)

In [None]:
# Without shuffling, samples come out longest first and each batch is padded only up to its own longest item
ds = [(tensor([1,2]),1), (tensor([3,4,5,6]),2), (tensor([7]),3), (tensor([8,9,10]),4)]
dl = SortedDL(ds, bs=2, before_batch=partial(pad_input, pad_idx=0))
test_eq(list(dl), [(tensor([[ 3,  4,  5,  6], [ 8,  9, 10,  0]]), tensor([2, 4])), 
                   (tensor([[1, 2], [7, 0]]), tensor([1, 3]))])

In [None]:
# 101 samples of random length 1..10 (no seed is set, so the data differs between runs)
ds = [(tensor(range(random.randint(1,10))),i) for i in range(101)]
# NOTE(review): padding is plugged in as `create_batch` here, not `before_batch` as in the other cells -- confirm this is intended
dl = SortedDL(ds, bs=2, create_batch=partial(pad_input, pad_idx=-1), shuffle=True, num_workers=0)
batches = list(dl)
# NOTE(review): the intent looks like "no batch is longer than the first one, and the first sample of a batch is
# never padded". `len(b[0])` and `b[0][-1]` only check that if `b[0]` is a 1-d token tensor -- verify what a
# batch looks like with this `create_batch`, otherwise these checks are trivially true
max_len = len(batches[0][0])
for b in batches: 
    assert(len(b[0])) <= max_len 
    test_ne(b[0][-1], -1)

In [None]:
# Classification setup: reuses `df_tok` and `count` from the integration example above
splits = RandomSplitter()(range_of(df_tok))
# Two pipelines: numericalized text as input, categorized label as target
dsrc = DataSource(df_tok, splits=splits, tfms=[
    [attrgetter("text"), Numericalize(make_vocab(count))],
    [attrgetter("label"), Categorize()]], dl_type=SortedDL)
# `pad_input` is applied to the list of samples so that texts of different lengths can be batched together
dbch = dsrc.databunch(before_batch=pad_input)

In [None]:
# Show two decoded samples with their category
dbch.show_batch(max_n=2)

Unnamed: 0,text,category
0,"xxbos xxmaj raising xxmaj victor xxmaj vargas : a xxmaj review \n\n xxmaj you know , xxmaj raising xxmaj victor xxmaj vargas is like sticking your hands into a big , xxunk bowl of xxunk . xxmaj it 's warm and gooey , but you 're not sure if it feels right . xxmaj try as i might , no matter how warm and gooey xxmaj raising xxmaj victor xxmaj vargas became i was always aware that something did n't quite feel right . xxmaj victor xxmaj vargas suffers from a certain xxunk on the director 's part . xxmaj apparently , the director thought that the ethnic backdrop of a xxmaj latino family on the ...",negative
1,"xxbos xxup the xxup shop xxup around xxup the xxup corner is one of the xxunk and most feel - good romantic comedies ever made . xxmaj there 's just no getting around that , and it 's hard to actually put one 's feeling for this film into words . xxmaj it 's not one of those films that tries too hard , nor does it come up with the xxunk possible scenarios to get the two protagonists together in the end . xxmaj in fact , all its charm is xxunk , contained within the characters and the setting and the plot … which is highly believable to xxunk . xxmaj it 's easy to think that such a love sto...",positive


## TransformBlock for text -

In [None]:
#export
def TextBlock(vocab=None, is_lm=False):
    "`TransformBlock` for text: numericalizes with `vocab`; uses `LMDataLoader` if `is_lm`, else `SortedDL` with padding"
    if is_lm:
        dl_type,dbunch_kwargs = LMDataLoader,{}
    else:
        # Texts of different lengths are padded with `pad_input` before being batched
        dl_type,dbunch_kwargs = SortedDL,{'before_batch': pad_input}
    return TransformBlock(type_tfms=Numericalize(vocab), dl_type=dl_type, dbunch_kwargs=dbunch_kwargs)

## Export -

In [None]:
#hide
# Convert the notebooks to their python modules (with `all_fs=True` every notebook is processed, see the log below)
from local.notebook.export import notebook2script
notebook2script(all_fs=True)

Converted 00_test.ipynb.
Converted 01_core.ipynb.
Converted 01a_utils.ipynb.
Converted 01b_dispatch.ipynb.
Converted 01c_transform.ipynb.
Converted 02_script.ipynb.
Converted 03_torch_core.ipynb.
Converted 03a_layers.ipynb.
Converted 04_dataloader.ipynb.
Converted 05_data_core.ipynb.
Converted 06_data_transforms.ipynb.
Converted 07_data_block.ipynb.
Converted 08_vision_core.ipynb.
Converted 09_vision_augment.ipynb.
Converted 10_pets_tutorial.ipynb.
Converted 11_vision_models_xresnet.ipynb.
Converted 12_optimizer.ipynb.
Converted 13_learner.ipynb.
Converted 13a_metrics.ipynb.
Converted 14_callback_schedule.ipynb.
Converted 14a_callback_data.ipynb.
Converted 15_callback_hook.ipynb.
Converted 15a_vision_models_unet.ipynb.
Converted 16_callback_progress.ipynb.
Converted 17_callback_tracker.ipynb.
Converted 18_callback_fp16.ipynb.
Converted 19_callback_mixup.ipynb.
Converted 21_vision_learner.ipynb.
Converted 22_tutorial_imagenette.ipynb.
Converted 23_tutorial_transfer_learning.ipynb.
Conve