# Text tokenizers

In [3]:
#| default_exp text.tokenizers

In [4]:
#| hide
%load_ext autoreload
%autoreload 2
from nbdev.showdoc import *

In [5]:
#| export

# torch
import torch.nn as nn
import torch
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
from torch.optim import SGD

# torchtext
import torchtext
from torchtext.vocab import vocab
from torchtext.data.utils import get_tokenizer
from torchtext.datasets import AG_NEWS

# hf
import datasets
from transformers import AutoTokenizer, DataCollatorForLanguageModeling

# data 
import pandas as pd
import numpy as np

# ui
from matplotlib import pyplot as plt
from tqdm.notebook import tqdm

# python
from typing import Dict, List, Tuple, Optional, Set, Iterable
from collections import Counter, OrderedDict
from dataclasses import dataclass, asdict
from plum import dispatch

# nimrod
from nimrod.models.lm import Vocab

## spaCy

In [6]:
#| export 

class Tokenizer:
    """Thin wrapper around `torchtext.data.utils.get_tokenizer`.

    Calling an instance is dispatched (via `plum`) on the argument type:
    a single string, a list of strings, or an iterable of 2-item records
    whose second item is the text (e.g. AG_NEWS samples).
    """

    def __init__(self, backend='spacy', language='en'):
        """Build the underlying tokenizer.

        Args:
            backend: tokenizer name forwarded to `get_tokenizer`.
            language: language / model name forwarded to `get_tokenizer`.
        """
        # Expand the 'en' shortcut to the small English spaCy model, but only
        # for the spaCy backend: other backends (e.g. 'basic_english') expect
        # the plain 'en' code and reject anything else.
        if backend == 'spacy' and language == 'en':
            language = 'en_core_web_sm'
        self.tokenizer = get_tokenizer(backend, language=language)

    @dispatch
    def __call__(self, text:str)->List[str]:
        """Tokenize a single string."""
        return self.tokenizer(text)

    @dispatch
    def __call__(self, texts:List[str])->List[List[str]]:
        """Tokenize every string of a list, preserving order."""
        return [self.tokenizer(text) for text in texts]

    @dispatch
    def __call__(self, data_iter:Iterable)->Iterable:
        """Lazily tokenize an iterable of 2-item records.

        Each record is unpacked as `(_, text)`; only the text is tokenized.
        Works with AG_NEWS-style datasets.
        """
        for _, text in data_iter:
            yield self.tokenizer(text)

    @dispatch
    def inverse(self, tokens:List[str])->str:
        """Join tokens back into one space-separated string."""
        # TODO: take care of white spaces
        return ' '.join(tokens)

    @dispatch
    def inverse(self, list_of_tokens:List[List[str]])->List[str]:
        """Join each token list back into a space-separated string."""
        return [' '.join(tokens) for tokens in list_of_tokens]

In [7]:
# default tokenizer: spaCy backend, language 'en' (expanded to 'en_core_web_sm' by Tokenizer)
tok = Tokenizer()

In [8]:
# str -> List[str], and back to a string with `inverse`
s = "Oh, yeah I don't know dude..."
tokenized = tok(s)
print(s, tokenized, tok.inverse(tokenized), sep="\n")

# List[str] -> List[List[str]], and back to a list of strings
s = ["Oh, yeah I don't know dude...", "this is a test"]
tokenized = tok(s)
print(tokenized, tok.inverse(tokenized), sep="\n")

# Iterable -> Iterable: tokens are produced lazily while iterating
ds = AG_NEWS(split='test') # data pipe
sample = next(iter(ds)) # peek at one (label, text) record
it = tok(ds)
tokens = list(it)
print(tokens[:2])

Oh, yeah I don't know dude...
['Oh', ',', 'yeah', 'I', 'do', "n't", 'know', 'dude', '...']
Oh , yeah I do n't know dude ...
[['Oh', ',', 'yeah', 'I', 'do', "n't", 'know', 'dude', '...'], ['this', 'is', 'a', 'test']]
["Oh , yeah I do n't know dude ...", 'this is a test']




[['Fears', 'for', 'T', 'N', 'pension', 'after', 'talks', 'Unions', 'representing', 'workers', 'at', 'Turner', '  ', 'Newall', 'say', 'they', 'are', "'", 'disappointed', "'", 'after', 'talks', 'with', 'stricken', 'parent', 'firm', 'Federal', 'Mogul', '.'], ['The', 'Race', 'is', 'On', ':', 'Second', 'Private', 'Team', 'Sets', 'Launch', 'Date', 'for', 'Human', 'Spaceflight', '(', 'SPACE.com', ')', 'SPACE.com', '-', 'TORONTO', ',', 'Canada', '--', 'A', 'second\\team', 'of', 'rocketeers', 'competing', 'for', 'the', ' ', '#', '36;10', 'million', 'Ansari', 'X', 'Prize', ',', 'a', 'contest', 'for\\privately', 'funded', 'suborbital', 'space', 'flight', ',', 'has', 'officially', 'announced', 'the', 'first\\launch', 'date', 'for', 'its', 'manned', 'rocket', '.']]


In [None]:
#| export
# TODO: add more special characters
class Numericalizer():
    """Map tokens to integer ids, and ids back to tokens, using a torchtext vocab.

    The vocabulary is built once, at construction time, from an iterable of
    token lists. Calling an instance is dispatched (via `plum`) on the
    argument type: a single token, a list of tokens, or a list of token lists.
    """

    def __init__(self, tokens_iter:Iterable, specials=["<pad>", "<unk>", "<bos>", "<eos>"]):
        """
        Args:
            tokens_iter: iterable yielding lists of tokens.
            specials: special tokens to add to the vocabulary.
        """
        self._vocab = self.build_map_from_iter(tokens_iter, specials)

    def build_map_from_iter(self,data_iter:Iterable, specials=None):
        """Build, store and return the vocabulary.

        Args:
            data_iter: iterable yielding lists of tokens.
            specials: special tokens to add; `None` means no special tokens.

        Returns:
            The vocab built by `torchtext.vocab.build_vocab_from_iterator`.
        """
        # Normalise to a fresh list: avoids a TypeError on `in None` below and
        # never hands the caller's (or the default) list to torchtext.
        specials = [] if specials is None else list(specials)
        self._vocab = torchtext.vocab.build_vocab_from_iterator(data_iter, specials=specials)
        # Unknown tokens fall back to '<unk>' only when that special exists.
        if "<unk>" in specials:
            self._vocab.set_default_index(self._vocab["<unk>"])
        return self._vocab

    @dispatch
    def __call__(self, texts:List[str])->List[int]:
        """Convert a list of tokens to a list of ids."""
        return [self._vocab[text] for text in texts]

    @dispatch
    def __call__(self, texts:List[List[str]])->List[List[int]]:
        """Convert a list of token lists to a nested list of ids."""
        return [[self._vocab[text] for text in row] for row in texts]

    @dispatch
    def __call__(self, text:str)->int:
        """Convert a single token to its id."""
        return self._vocab[text]

    @property
    def vocab(self):
        """The underlying torchtext vocab object."""
        return self._vocab

    @dispatch
    def inverse(self, idx:int)->str:
        """Convert a single id back to its token."""
        return self._vocab.get_itos()[idx]

    @dispatch
    def inverse(self, indices:List[int])->List[str]:
        """Convert a list of ids back to their tokens."""
        # Fetch the id -> token table once instead of once per index.
        itos = self._vocab.get_itos()
        return [itos[i] for i in indices]

In [None]:
tok = Tokenizer()

def token_iterator(data_iter:Iterable)->Iterable:
    """Yield the token list of each text in an iterable of 2-item records (AG_NEWS style)."""
    for _label, text in data_iter:
        yield tok(text)

# the numericalizer consumes the lazy token stream to build its vocabulary
tok_it = token_iterator(ds)
num = Numericalizer(tok_it)

In [None]:
# ids of the '<pad>' and '<unk>' special tokens (single-token overload of __call__)
print(num('<pad>'), num('<unk>'))

0 1


In [None]:
# the underlying vocab and the Numericalizer return the same id for a token
print(num.vocab['the'])
print(num('the'))
# List[str] -> List[int]; 'lkjsdf' is presumably out-of-vocabulary and maps to the '<unk>' id
print(num(['<bos>', '<pad>', '<unk>', 'a', 'this', 'the', 'lkjsdf']))
# id -> token, for a single int and for a list of ints
print(num.inverse(0))
print(num.inverse([6,55]))
# List[List[str]] -> nested list of ids
print(num([['<bos>', '<pad>'], ['<unk>', 'a', 'this', 'the', 'lkjsdf']]))

4
4
[2, 0, 1, 9, 58, 4, 1]
<pad>
['.', 'Monday']
[[2, 0], [1, 9, 58, 4, 1]]


In [None]:
# tokenize a small batch, then numericalize it in two equivalent ways
tokens = tok(["here we go. asdflkj", "it was time..."])
print(tokens)
# one sentence at a time (List[str] overload) ...
print([num(sentence) for sentence in tokens])
# ... or the whole batch in one call (List[List[str]] overload)
print(num(tokens))

[['here', 'we', 'go', '.', 'asdflkj'], ['it', 'was', 'time', '...']]
[[534, 1040, 310, 6, 1], [34, 40, 101, 67]]
[[534, 1040, 310, 6, 1], [34, 40, 101, 67]]


In [None]:
#| hide
# write the cells marked `#| export` out to the library module via nbdev
import nbdev; nbdev.nbdev_export()