In [1]:
# Load the full text of the corpus into memory.
# The encoding is pinned so the character count and the tokens derived from
# this text do not depend on the platform's default locale encoding.
with open("Data/the-verdict.txt", "r", encoding="utf-8") as file:
    raw_text = file.read()

In [2]:
# Number of characters in the loaded text
print(len(raw_text))

20479


In [3]:
import re

In [4]:
# Short sample sentence for experimenting with the split patterns in the following cells
text="Hello, world! This is a test-text for tokenization."

In [11]:
# Split on single whitespace characters; the capturing group makes re.split
# keep each separator as its own item in the result
result=re.split(r'(\s)', text)

In [12]:
# Punctuation is still attached to the words here (e.g. 'Hello,'), since only whitespace is a delimiter
print(result)

['Hello,', ' ', 'world!', ' ', 'This', ' ', 'is', ' ', 'a', ' ', 'test-text', ' ', 'for', ' ', 'tokenization.']


In [13]:
# Also treat commas and periods as delimiters, keeping them as separate items
result=re.split(r'([,.]|\s)',text)

In [14]:
# Adjacent delimiters (such as ',' followed by ' ') leave empty strings in the result
print(result)

['Hello', ',', '', ' ', 'world!', ' ', 'This', ' ', 'is', ' ', 'a', ' ', 'test-text', ' ', 'for', ' ', 'tokenization', '.', '']


In [31]:
# Delimiters: a double dash, any single listed punctuation character, or one whitespace character.
pattern = r'(--|[,\.\-?!_;:\'"(){}\[\]<>/\\@#$%^&*+=]|\s)'
result = re.split(pattern, raw_text)
# Trim every piece and discard the ones that end up empty (whitespace and the
# empty strings between adjacent delimiters); punctuation tokens are kept.
preprocessed = [piece for piece in map(str.strip, result) if piece]
print(preprocessed[:30])

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


In [32]:
# Total number of tokens produced by the preprocessing step
len(preprocessed)

4766

In [39]:
# Vocabulary candidates: the distinct tokens, in sorted order
all_words=sorted(set(preprocessed))
# Number of distinct tokens
vocab_size=len(all_words)
print(vocab_size)

1140


In [44]:
import itertools

In [43]:
# Assign each distinct token an integer ID equal to its position in the sorted all_words list
vocab={token:integer for integer,token in enumerate(all_words)}
# NOTE(review): this prints the entire mapping; the next cell shows a 50-item preview instead
print(vocab)

{'!': 0, '"': 1, "'": 2, '(': 3, ')': 4, ',': 5, '-': 6, '--': 7, '.': 8, ':': 9, ';': 10, '?': 11, 'A': 12, 'Ah': 13, 'Among': 14, 'And': 15, 'Are': 16, 'Arrt': 17, 'As': 18, 'At': 19, 'Be': 20, 'Begin': 21, 'Burlington': 22, 'But': 23, 'By': 24, 'Carlo': 25, 'Chicago': 26, 'Claude': 27, 'Come': 28, 'Croft': 29, 'Destroyed': 30, 'Devonshire': 31, 'Don': 32, 'Dubarry': 33, 'Emperors': 34, 'Florence': 35, 'For': 36, 'Gallery': 37, 'Gideon': 38, 'Gisburn': 39, 'Gisburns': 40, 'Grafton': 41, 'Greek': 42, 'Grindle': 43, 'Grindles': 44, 'HAD': 45, 'Had': 46, 'Hang': 47, 'Has': 48, 'He': 49, 'Her': 50, 'Hermia': 51, 'His': 52, 'How': 53, 'I': 54, 'If': 55, 'In': 56, 'It': 57, 'Jack': 58, 'Jove': 59, 'Just': 60, 'Lord': 61, 'Made': 62, 'Miss': 63, 'Money': 64, 'Monte': 65, 'Moon': 66, 'Mr': 67, 'Mrs': 68, 'My': 69, 'Never': 70, 'No': 71, 'Now': 72, 'Nutley': 73, 'Of': 74, 'Oh': 75, 'On': 76, 'Once': 77, 'Only': 78, 'Or': 79, 'Perhaps': 80, 'Poor': 81, 'Professional': 82, 'Renaissance': 83, 'R

In [52]:
# Preview only the first 50 (token, ID) pairs instead of the whole vocabulary
print(dict(itertools.islice(vocab.items(), 50)))

{'!': 0, '"': 1, "'": 2, '(': 3, ')': 4, ',': 5, '-': 6, '--': 7, '.': 8, ':': 9, ';': 10, '?': 11, 'A': 12, 'Ah': 13, 'Among': 14, 'And': 15, 'Are': 16, 'Arrt': 17, 'As': 18, 'At': 19, 'Be': 20, 'Begin': 21, 'Burlington': 22, 'But': 23, 'By': 24, 'Carlo': 25, 'Chicago': 26, 'Claude': 27, 'Come': 28, 'Croft': 29, 'Destroyed': 30, 'Devonshire': 31, 'Don': 32, 'Dubarry': 33, 'Emperors': 34, 'Florence': 35, 'For': 36, 'Gallery': 37, 'Gideon': 38, 'Gisburn': 39, 'Gisburns': 40, 'Grafton': 41, 'Greek': 42, 'Grindle': 43, 'Grindles': 44, 'HAD': 45, 'Had': 46, 'Hang': 47, 'Has': 48, 'He': 49}


In [70]:
class SimpleTokenizerV1:
    """
    Tokenizer that splits text on whitespace and punctuation and maps the
    resulting tokens to integer IDs through a fixed vocabulary.

    Tokens that are missing from the vocabulary are encoded as -1; IDs that
    cannot be mapped back to a token are decoded as '|<unk>|'.
    """

    # Delimiters: a double dash, any single listed punctuation character, or
    # one whitespace character. The capturing group makes split() return the
    # delimiters as well. Compiled once here instead of inside every encode().
    _SPLIT_PATTERN = re.compile(r'(--|[,\.\-?!_;:\'"(){}\[\]<>/\\@#$%^&*+=]|\s)')

    def __init__(self, vocab):
        """
        Initialize tokenizer with vocabulary.
        vocab: dictionary mapping tokens to integers
        """
        self.vocab = vocab
        # Reverse lookup (ID -> token) used by decode()
        self.inverse_vocab = {v: k for k, v in vocab.items()}

    def encode(self, text):
        """
        Encode text to token IDs.
        text: string to tokenize
        Returns: dict with 'token_ids' (list of IDs in order), 'tokens' (list of
        token strings in order), and 'mapper_dict' (token→ID mapping for the
        tokens found in the vocabulary). Tokens not in the vocabulary get ID -1
        and are left out of 'mapper_dict'.
        """
        pieces = self._SPLIT_PATTERN.split(text)

        # Strip each piece once and drop the ones that end up empty
        # (whitespace and the empty strings between adjacent delimiters)
        tokens = [piece for piece in map(str.strip, pieces) if piece]

        token_ids = []
        mapper_dict = {}
        for token in tokens:
            if token in self.vocab:
                token_id = self.vocab[token]
                token_ids.append(token_id)
                mapper_dict[token] = token_id
            else:
                token_ids.append(-1)  # marker ID for tokens outside the vocabulary

        return {
            'token_ids': token_ids,
            'tokens': tokens,
            'mapper_dict': mapper_dict
        }

    def decode(self, token_ids):
        """
        Decode token IDs back to text.
        token_ids: list of token IDs (as produced by encode)
        Returns: dict with 'tokens' (list of token strings, one per input ID),
        'text' (tokens joined with single spaces), and 'mapper_dict'
        (ID→token mapping for the IDs found in the vocabulary).
        """
        tokens = []
        mapper_dict = {}

        for token_id in token_ids:
            if token_id == -1 or token_id not in self.inverse_vocab:
                # -1 is the unknown marker written by encode(); any other ID
                # missing from the vocabulary is shown the same way rather than
                # being dropped, so the output keeps one token per input ID.
                tokens.append('|<unk>|')
            else:
                token = self.inverse_vocab[token_id]
                tokens.append(token)
                mapper_dict[token_id] = token

        return {
            'tokens': tokens,
            'text': ' '.join(tokens),
            'mapper_dict': mapper_dict
        }

In [71]:
# Usage example: build a tokenizer from the vocabulary created above
tokenizer = SimpleTokenizerV1(vocab)

In [72]:
# Encode a passage; the printed IDs are the vocabulary IDs of its tokens, in order
text = "I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no great surprise to me to hear that, in the height of his glory, he had dropped his painting, married a rich widow, and established himself in a villa on the Riviera. (Though I rather thought it would have been Rome or Florence.)"
encoded = tokenizer.encode(text)
# print("Encoded tokens:", encoded['tokens'])
print("Encoded IDs:", encoded['token_ids'])
# print("Token→ID Mapper:", encoded['mapper_dict'])

Encoded IDs: [54, 45, 150, 1013, 58, 39, 826, 116, 259, 493, 7, 1012, 116, 507, 441, 398, 7, 917, 589, 1088, 714, 513, 971, 1026, 667, 1026, 538, 997, 5, 572, 998, 541, 727, 552, 503, 5, 536, 519, 376, 552, 753, 5, 665, 116, 849, 1112, 5, 158, 403, 550, 572, 116, 1077, 732, 998, 85, 8, 3, 100, 54, 826, 1013, 589, 1130, 533, 207, 86, 739, 35, 8, 4]


In [73]:
# Decode the IDs back to text; tokens are joined with single spaces, so
# punctuation ends up separated from the neighbouring words
decoded = tokenizer.decode(encoded['token_ids'])
# print("\nDecoded tokens:", decoded['tokens'])
print("Decoded text:", decoded['text'])
# print("ID→Token Mapper:", decoded['mapper_dict'])

Decoded text: I HAD always thought Jack Gisburn rather a cheap genius -- though a good fellow enough -- so it was no great surprise to me to hear that , in the height of his glory , he had dropped his painting , married a rich widow , and established himself in a villa on the Riviera . ( Though I rather thought it would have been Rome or Florence . )


In [74]:
# Round trip with words that are not in the vocabulary: they come back as '|<unk>|'
tokenizer.decode(tokenizer.encode("Hello world!, How are you?")['token_ids'])['text']

'|<unk>| |<unk>| ! , How are you ?'

In [77]:
# Extend the vocabulary with special tokens for unknown words and end of text.
# Only tokens that are not already present are appended, so re-running this
# cell does not add duplicates and shift the special-token IDs.
all_words.extend([
    special for special in ('|<unk>|', '|<endoftext>|') if special not in all_words
])

vocab_updated={token:integer for integer,token in enumerate(all_words)}

In [84]:
# Show the last 10 entries to confirm the two special tokens received the highest IDs
dict(list(vocab_updated.items())[-10:])

{'year': 1132,
 'years': 1133,
 'yellow': 1134,
 'yet': 1135,
 'you': 1136,
 'younger': 1137,
 'your': 1138,
 'yourself': 1139,
 '|<unk>|': 1140,
 '|<endoftext>|': 1141}

In [None]:
class SimpleTokenizerV2:
    """
    Tokenizer that splits text on whitespace and punctuation and maps the
    resulting tokens to integer IDs, using the vocabulary's '|<unk>|' entry
    for unknown tokens and appending its '|<endoftext>|' entry to every
    encoded sequence.
    """

    # Delimiters: a double dash, any single listed punctuation character, or
    # one whitespace character. The capturing group makes split() return the
    # delimiters as well. Compiled once here instead of inside every encode().
    _SPLIT_PATTERN = re.compile(r'(--|[,\.\-?!_;:\'"(){}\[\]<>/\\@#$%^&*+=]|\s)')

    def __init__(self, vocab):
        """
        Initialize tokenizer with vocabulary.
        vocab: dictionary mapping tokens to integers; expected to contain the
        special tokens '|<unk>|' and '|<endoftext>|'
        """
        self.vocab = vocab
        # Reverse lookup (ID -> token) used by decode()
        self.inverse_vocab = {v: k for k, v in vocab.items()}
        # Special token IDs; -1 when the vocabulary does not define the token
        self.unk_token_id = vocab.get('|<unk>|', -1)
        self.endoftext_token_id = vocab.get('|<endoftext>|', -1)

    def encode(self, text):
        """
        Encode text to token IDs using the vocabulary given to the constructor.
        text: string to tokenize
        Returns: dict with 'token_ids' (list of IDs in order, followed by the
        end-of-text ID), 'tokens' (list of token strings; the end-of-text
        marker is not included, so it is one shorter than 'token_ids'), and
        'mapper_dict' (token→ID mapping). Unknown tokens use the |<unk>| token ID.
        """
        pieces = self._SPLIT_PATTERN.split(text)

        # Strip each piece once and drop the ones that end up empty
        # (whitespace and the empty strings between adjacent delimiters)
        tokens = [piece for piece in map(str.strip, pieces) if piece]

        token_ids = []
        mapper_dict = {}
        for token in tokens:
            # Fall back to the unknown-token ID for tokens outside the vocabulary
            token_id = self.vocab.get(token, self.unk_token_id)
            token_ids.append(token_id)
            mapper_dict[token] = token_id

        # Add end-of-text token at the end
        token_ids.append(self.endoftext_token_id)

        return {
            'token_ids': token_ids,
            'tokens': tokens,
            'mapper_dict': mapper_dict
        }

    def decode(self, token_ids):
        """
        Decode token IDs back to text.
        token_ids: list of token IDs (as produced by encode)
        Returns: dict with 'tokens' (list of token strings, one per input ID,
        including '|<endoftext>|' when its ID is present), 'text' (tokens
        joined with single spaces), and 'mapper_dict' (ID→token mapping for
        the IDs found in the vocabulary).
        """
        tokens = []
        mapper_dict = {}

        for token_id in token_ids:
            if token_id in self.inverse_vocab:
                token = self.inverse_vocab[token_id]
                tokens.append(token)
                mapper_dict[token_id] = token
            else:
                # IDs with no vocabulary entry (including the -1 fallback that
                # encode() emits when the special tokens are missing) are shown
                # as unknown rather than dropped, so the output keeps one
                # token per input ID.
                tokens.append('|<unk>|')

        return {
            'tokens': tokens,
            'text': ' '.join(tokens),
            'mapper_dict': mapper_dict
        }

In [93]:
# Demo: Using SimpleTokenizerV2 with vocab_updated
tokenizer_v2 = SimpleTokenizerV2(vocab_updated)

# Test sentence; 'Hello' and 'world' are not in the vocabulary
test_text = "Hello world!, How are you?"

In [94]:
# Encode the test sentence; unknown words map to the |<unk>| ID and the
# |<endoftext>| ID is appended at the end.
encoded_v2 = tokenizer_v2.encode(test_text)

# print("\nEncoded tokens (first 20):", encoded_v2['tokens'][:20])
# Slice so the output matches the "(first 20)" label even for longer inputs
print("\nEncoded IDs (first 20):", encoded_v2['token_ids'][:20])
# print(f"\nTotal token IDs: {len(encoded_v2['token_ids'])}")
# print(f"Unknown token ID (|<unk>|): {tokenizer_v2.unk_token_id}")
# print(f"End-of-text token ID (|<endoftext>|): {tokenizer_v2.endoftext_token_id}")
# print(f"Last ID in token_ids: {encoded_v2['token_ids'][-1]} (should be endoftext token)")



Encoded IDs (first 20): [1140, 1140, 0, 5, 53, 170, 1136, 11, 1141]


In [None]:
# Decode the IDs from the previous cell and show all three parts of the result:
# the token list, the space-joined text, and the ID→token mapping
decoded_v2 = tokenizer_v2.decode(encoded_v2['token_ids'])
print("\nDecoded tokens:")
print(decoded_v2['tokens'])
print("\nDecoded text:")
print(decoded_v2['text'])
print("\nMapper dict:")
print(decoded_v2['mapper_dict'])


Decoded text:
|<unk>| |<unk>| ! , How are you ?
