In [1]:
import re

In [2]:
# Read the whole corpus into memory as one string. The encoding is pinned so
# the decoded text does not depend on the platform's locale default.
with open("verdict_plain.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

In [3]:
# Report the corpus size in characters.
print(f"Total Number of Characters: {len(raw_text)}")

Total Number of Characters: 13316


In [4]:
def preprocess_data(raw_text):
    """Split raw text into word tokens, discarding punctuation and whitespace.

    Delimiters are single characters: hyphen, any whitespace, parentheses,
    single and double quotes, backslash, and ``; , : ! . ? _``. Delimiters are
    dropped from the result, as are the empty strings that appear between
    adjacent delimiters.

    Args:
        raw_text: The text to tokenize.

    Returns:
        A list of non-empty word strings in their order of appearance.
    """
    # Newlines and tabs are treated as delimiters (through \s) rather than
    # being deleted, so the words on either side of a line break remain
    # separate tokens instead of being fused into one.
    split_words = re.split(r"""[-\s()'";,:!.?_\\]""", raw_text)
    return [word for word in split_words if word.strip()]

In [15]:
def vocabulary(preprocessed_text):
    """Build the sorted list of unique tokens followed by the special tokens.

    Args:
        preprocessed_text: Iterable of word tokens.

    Returns:
        A list holding every distinct token in sorted order, then
        ``"<|endoftext|>"`` and ``"<|unk|>"`` as the last two entries.
        The list is also printed.
    """
    special_tokens = ["<|endoftext|>", "<|unk|>"]
    # Drop the special tokens from the corpus set first: if the text already
    # contains one of them it would otherwise be listed twice, which leaves a
    # gap in the ids once the list is enumerated into a token -> id mapping.
    unique_tokens = set(preprocessed_text).difference(special_tokens)
    vocab = sorted(unique_tokens)
    vocab.extend(special_tokens)
    print(vocab)
    return vocab

In [16]:
preprocessed_text = preprocess_data(raw_text)
# `vocabulary` names the builder function until the assignment below rebinds
# it to the resulting list. Keep the function reachable as `build_vocabulary`
# so that re-running this cell does not fail with
# "TypeError: 'list' object is not callable".
if callable(vocabulary):
    build_vocabulary = vocabulary
vocabulary = build_vocabulary(preprocessed_text)

['A', 'Ah', 'Among', 'And', 'Arrt', 'As', 'Be', 'Burlington', 'But', 'By', 'Carlo', 'Chicago', 'Claude', 'Come', 'Croft', 'Destroyed', 'Don', 'Dubarry', 'Emperors', 'Florence', 'For', 'Gallery', 'Gideon', 'Gisburn', 'Gisburns', 'Grafton', 'Grindle', 'Grindles', 'HAD', 'Had', 'Has', 'He', 'Her', 'Hermia', 'His', 'I', 'If', 'In', 'It', 'Jack', 'Jove', 'Made', 'Miss', 'Money', 'Monte', 'Moon', 'Mr', 'Mrs', 'My', 'Never', 'No', 'Nutley', 'Of', 'Oh', 'On', 'Only', 'Or', 'Perhaps', 'Poor', 'Professional', 'Renaissance', 'Rickham', 'Riviera', 'Rome', 'Russian', 'Sevres', 'She', 'Stroud', 'Strouds', 'That', 'The', 'This', 'Though', 'Thwing', 'Thwings', 'To', 'Usually', 'Venetian', 'Victor', 'Was', 'We', 'Well', 'What', 'When', 'Why', 'Yes', 'You', 'a', 'abdication', 'able', 'about', 'above', 'abruptly', 'absolute', 'absurdity', 'academic', 'accuse', 'accustomed', 'across', 'activity', 'add', 'added', 'admirers', 'adulation', 'after', 'again', 'ago', 'ah', 'air', 'all', 'almost', 'always', 'ama

In [17]:
# Bare expression: the notebook renders the vocabulary list as this cell's output.
vocabulary

['A',
 'Ah',
 'Among',
 'And',
 'Arrt',
 'As',
 'Be',
 'Burlington',
 'But',
 'By',
 'Carlo',
 'Chicago',
 'Claude',
 'Come',
 'Croft',
 'Destroyed',
 'Don',
 'Dubarry',
 'Emperors',
 'Florence',
 'For',
 'Gallery',
 'Gideon',
 'Gisburn',
 'Gisburns',
 'Grafton',
 'Grindle',
 'Grindles',
 'HAD',
 'Had',
 'Has',
 'He',
 'Her',
 'Hermia',
 'His',
 'I',
 'If',
 'In',
 'It',
 'Jack',
 'Jove',
 'Made',
 'Miss',
 'Money',
 'Monte',
 'Moon',
 'Mr',
 'Mrs',
 'My',
 'Never',
 'No',
 'Nutley',
 'Of',
 'Oh',
 'On',
 'Only',
 'Or',
 'Perhaps',
 'Poor',
 'Professional',
 'Renaissance',
 'Rickham',
 'Riviera',
 'Rome',
 'Russian',
 'Sevres',
 'She',
 'Stroud',
 'Strouds',
 'That',
 'The',
 'This',
 'Though',
 'Thwing',
 'Thwings',
 'To',
 'Usually',
 'Venetian',
 'Victor',
 'Was',
 'We',
 'Well',
 'What',
 'When',
 'Why',
 'Yes',
 'You',
 'a',
 'abdication',
 'able',
 'about',
 'above',
 'abruptly',
 'absolute',
 'absurdity',
 'academic',
 'accuse',
 'accustomed',
 'across',
 'activity',
 'add',
 'a

In [18]:
# Token -> integer id lookup, where each id is the token's position in the list.
tokenized_vocab = dict(zip(vocabulary, range(len(vocabulary))))

In [19]:
class Tokenizer:
    """Word-level tokenizer backed by a fixed token -> id vocabulary.

    Text is split on single delimiter characters (hyphen, whitespace,
    parentheses, quotes, backslash and ``; , : ! . ? _``); the delimiters
    themselves are discarded, so ``decode(encode(text))`` returns the words
    joined by single spaces rather than the original string.
    """

    # Token substituted for any word that is missing from the vocabulary.
    UNKNOWN_TOKEN = "<|unk|>"

    # Compiled once for all instances; each match consumes exactly one
    # delimiter character.
    _DELIMITERS = re.compile(r"""[-\s()'";,:!.?_\\]""")

    def __init__(self, vocabulary):
        """Store the vocabulary and derive the reverse (id -> token) mapping.

        Args:
            vocabulary: Mapping from token string to integer id.
        """
        self.str_to_int = vocabulary
        self.int_to_str = {integer: token for token, integer in vocabulary.items()}

    def encode(self, text):
        """Convert text into a list of token ids.

        Args:
            text: The string to tokenize.

        Returns:
            One id per word; words absent from the vocabulary map to the id
            of ``"<|unk|>"``.

        Raises:
            KeyError: If a word is absent from the vocabulary and the
                vocabulary has no ``"<|unk|>"`` entry to fall back on.
        """
        words = [piece for piece in self._DELIMITERS.split(text) if piece.strip()]
        ids = []
        for word in words:
            if word in self.str_to_int:
                ids.append(self.str_to_int[word])
            elif self.UNKNOWN_TOKEN in self.str_to_int:
                ids.append(self.str_to_int[self.UNKNOWN_TOKEN])
            else:
                # Name the offending word instead of surfacing a bare
                # KeyError for the fallback token.
                raise KeyError(
                    "token {!r} is not in the vocabulary and the vocabulary "
                    "has no {!r} entry".format(word, self.UNKNOWN_TOKEN)
                )
        return ids

    def decode(self, ids):
        """Convert token ids back into a space-separated string.

        Args:
            ids: Iterable of integer token ids.

        Returns:
            The corresponding tokens joined by single spaces.

        Raises:
            KeyError: If an id has no entry in the reverse mapping.
        """
        return " ".join(self.int_to_str[token_id] for token_id in ids)

In [22]:
# Sample sentence used to exercise an encode -> decode round trip.
sample_sentence = """"Hello would you like to have some tea ? It's the last he painted, you know," Mrs. Gisburn said"""
tokenizer = Tokenizer(tokenized_vocab)
encoded_words = tokenizer.encode(sample_sentence)
decoded_words = tokenizer.decode(encoded_words)
# Bare expression so the decoded string is shown as the cell's output.
decoded_words

'<|unk|> would you like to have some tea It s the last he painted you know Mrs Gisburn said'