In [1]:
filename = 'the-verdict.txt'

with open(filename, 'r', encoding='utf-8') as f:
    raw_text = f.read()

print(f"Total no of characters: {len(raw_text)}")
print(f"First 100 characters: {raw_text[:99]}")

Total no of characters: 20479
First 100 characters: I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


The python library 'regex' is used to evaluate and split expressions based on whitespaces or special characters.

In [2]:
import re

In [3]:
text = "Hello, world! This is a test."
result = re.split(r'(\s)', text)

print(result)

['Hello,', ' ', 'world!', ' ', 'This', ' ', 'is', ' ', 'a', ' ', 'test.']


The above method is splitting whenever it finds a whitespace.

We have to split it when it encounters a special characters as well.

In [4]:
text = "Hello, world! This is a test."
result = re.split(r'([,.!]|\s)', text)

print(result)

['Hello', ',', '', ' ', 'world', '!', '', ' ', 'This', ' ', 'is', ' ', 'a', ' ', 'test', '.', '']


Remove the whitespace characters.

In [5]:
result = [item for item in result if item.strip()]
print(result)

['Hello', ',', 'world', '!', 'This', 'is', 'a', 'test', '.']


Should we remove whitespaces or not ? depends on the LLM application. if the application is sensitive to indentation and spacing, it is required

Let's remove all special characters from the text.

In [6]:
text = "Hello, world! This is a-- test."
result = re.split(r'([,."\'?_:;()!]|--|\s)', text)
result = [item for item in result if item.strip()]
print(result)

['Hello', ',', 'world', '!', 'This', 'is', 'a', '--', 'test', '.']


So the basic structure of a tokeniser is done. apply it to the dataset.

In [7]:
pre_processed = re.split(r'([,.:;?!"\'()]|--|\s)', raw_text)
pre_processed = [item.strip() for item in pre_processed if item.strip()]
print(pre_processed[:30])
print(f"Length of tokens is : {len(pre_processed)}")


['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']
Length of tokens is : 4654


Now all the tokens have to be sorted out in such a way that only the unique tokens remain and everything is sorted in an alphabetical order. 

In [8]:
all_words = sorted(set(pre_processed))
vocab_size = len(all_words)
print(f" Unique tokens in vocab is : {(vocab_size)}")

 Unique tokens in vocab is : 1139


A vocab is actually a dictionary with all the unique words and each of them have a associated unique ID.

In [9]:
vocab = {token:integer for integer,token in enumerate(all_words)}

In [10]:
for i, item in enumerate(vocab.items()):
    print(item)
    if i>=50:
        break

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)
('Ah', 12)
('Among', 13)
('And', 14)
('Are', 15)
('Arrt', 16)
('As', 17)
('At', 18)
('Be', 19)
('Begin', 20)
('Burlington', 21)
('But', 22)
('By', 23)
('Carlo', 24)
('Chicago', 25)
('Claude', 26)
('Come', 27)
('Croft', 28)
('Destroyed', 29)
('Devonshire', 30)
('Don', 31)
('Dubarry_', 32)
('Emperors', 33)
('Florence', 34)
('For', 35)
('Gallery', 36)
('Gideon', 37)
('Gisburn', 38)
('Gisburns', 39)
('Grafton', 40)
('Greek', 41)
('Grindle', 42)
('Grindles', 43)
('HAD', 44)
('Had', 45)
('Hang', 46)
('Has', 47)
('He', 48)
('Her', 49)
('Hermia', 50)


Furthermore there must be a way to transform the ID's back to the unique words as well since the network will only be dealing with numbers.

To handle unknown words and end of text, we need to introduce two additional tokens as well. 

In [11]:
all_tokens = sorted(set(pre_processed))
all_tokens.extend(["<|endoftext|>","<|unk|>"])

vocab = {token:integer for integer,token in enumerate(all_tokens)}

print(f" Unique tokens in vocab is : {(len(vocab.items()))}")

 Unique tokens in vocab is : 1141
