# Neural Network
---

## Going from raw text to tokens

### Installing the necessary libraries

In [None]:
%pip install transformers
%pip install datasets
%pip install tokenizers
%pip install torch
%pip install tqdm
%pip install requests
%pip install nltk

### Create a dataset

Load the book *Pride and Prejudice* from Project Gutenberg and split the text into sentences.

In [None]:
import requests
import nltk
import re

# Download the book
# No need for a local file
url = "https://www.gutenberg.org/files/1342/1342-0.txt"
# A timeout keeps the cell from hanging indefinitely if the server stalls.
response = requests.get(url, timeout=30)
response.raise_for_status()
print("Downloaded the book successfully!")

# Extract the main content
text = response.text
print("Raw text length:", len(text))

# Locate the first and last sentences of the novel; only the text between
# (and including) them is kept.
start_marker = "It is a truth universally acknowledged"
end_marker = "had been the means of uniting them."
start_index = text.find(start_marker)
end_index = text.rfind(end_marker)
# str.find / str.rfind return -1 on a miss, which would silently slice the
# wrong span instead of failing, so check explicitly.
if start_index == -1 or end_index == -1 or end_index < start_index:
    raise ValueError("Could not locate the start/end of the novel in the downloaded text.")
# Extend the slice past the end marker so the final sentence is kept whole.
clean_text = text[start_index:end_index + len(end_marker)].strip()

# Remove unwanted formatting using regex
clean_text = re.sub(r"Heading to", "", clean_text)  # Remove 'Heading to'
clean_text = re.sub(r"\[.*?\]", "", clean_text)  # Remove content inside square brackets
clean_text = re.sub(r"\d+", "", clean_text)  # Remove numbers
clean_text = re.sub(r"\s+", " ", clean_text).strip()  # Normalize spaces

print("Cleaned text length:", len(clean_text))

# sent_tokenize needs the Punkt sentence model, which is downloaded separately
# from the nltk package. Newer NLTK releases look for "punkt_tab", older ones
# for "punkt"; requesting both keeps a fresh kernel working on either.
for resource in ("punkt", "punkt_tab"):
    nltk.download(resource, quiet=True)

# Tokenize into sentences
sentences = nltk.sent_tokenize(clean_text)
print(f"Number of sentences: {len(sentences)}")

# Display the first few sentences
for i, sentence in enumerate(sentences[:5]):
    print(f"{i+1}: {sentence}")



In [None]:
def count_distinct_words(text):
    """
    Count the distinct case-insensitive words in a string.

    The text is lower-cased and split on runs of non-word characters
    (regex ``\\W+``). Letters, digits and the underscore all count as word
    characters, so ``_her_`` and ``her`` are treated as different words.

    Parameters:
        text (str): The text to analyse.

    Returns:
        int: Number of unique non-empty words.
    """
    pieces = re.split(r'\W+', text.lower())

    # Splitting can yield empty strings at the edges; drop them before counting.
    return len({piece for piece in pieces if piece})

### Tokenization

In [None]:
from tokenizers import Tokenizer, models, pre_tokenizers, trainers
from transformers import AutoTokenizer

1. Pre-trained tokenizer from Hugging Face

In [None]:
# Load the pre-trained "bert-base-uncased" tokenizer by name.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Only the first two sentences are tokenized, as a quick illustration.
sample_sentences = sentences[:2]
tokenized = list(map(tokenizer.tokenize, sample_sentences))
for i, tokens in enumerate(tokenized):
    print(f"Sentence {i+1}: {tokens}")

2. Function to train WordLevel Tokenizer

In [None]:
def train_and_tokenize_word_level(sentences, vocab_size):
    """
    Train and apply a word-level tokenizer.

    Parameters:
        sentences (iterable of str): Sentences to train the tokenizer on.
            Any iterable is accepted; it is materialised into a list once
            because it is consumed twice (training, then encoding).
        vocab_size (int): Vocabulary size handed to the trainer.

    Returns:
        tokenizer: The trained word-level tokenizer.
        tokenized_sentences: One list of token strings per input sentence.
    """
    # A generator would be exhausted by training and leave nothing to encode.
    sentences = list(sentences)

    # Initialize a word-level tokenizer; out-of-vocabulary words map to [UNK]
    tokenizer = Tokenizer(models.WordLevel(unk_token="[UNK]"))

    # Set up pre-tokenization and trainer
    tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
    trainer = trainers.WordLevelTrainer(vocab_size=vocab_size, special_tokens=["[UNK]"])

    # Train the tokenizer
    tokenizer.train_from_iterator(sentences, trainer)

    # Encode all sentences in one batched call instead of one call per sentence
    tokenized_sentences = [encoding.tokens for encoding in tokenizer.encode_batch(sentences)]

    return tokenizer, tokenized_sentences

3. Function to train subword Tokenizer

In [None]:
def train_and_tokenize_subword(sentences, vocab_size):
    """
    Train and apply a subword-level (BPE) tokenizer.

    Parameters:
        sentences (iterable of str): Sentences to train the tokenizer on.
            Any iterable is accepted; it is materialised into a list once
            because it is consumed twice (training, then encoding).
        vocab_size (int): Vocabulary size handed to the trainer.

    Returns:
        tokenizer: The trained subword-level tokenizer.
        tokenized_sentences: One list of token strings per input sentence.
    """
    # A generator would be exhausted by training and leave nothing to encode.
    sentences = list(sentences)

    # Initialize a subword-level tokenizer (BPE model)
    tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))

    # Set up pre-tokenization and trainer
    tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
    trainer = trainers.BpeTrainer(vocab_size=vocab_size, special_tokens=["[UNK]"])

    # Train the tokenizer
    tokenizer.train_from_iterator(sentences, trainer)

    # Encode all sentences in one batched call instead of one call per sentence
    tokenized_sentences = [encoding.tokens for encoding in tokenizer.encode_batch(sentences)]

    return tokenizer, tokenized_sentences


Calculate the number of unique words

In [None]:
# Distinct-word count of the cleaned book; later cells derive vocabulary sizes from it.
num_unique_words = count_distinct_words(text=clean_text)
print(f"Total Unique Words: {num_unique_words}")

Experiment with different vocabulary sizes (10%, 50%, and 100% of the unique-word count) for each tokenizer

In [None]:
# Vocabulary budgets: a tenth, a half, and all of the distinct-word count.
vocab_sizes = [num_unique_words // divisor for divisor in (10, 2, 1)]

# For each budget, train a word-level tokenizer and show the first two
# tokenized sentences. The names assigned in the loop keep the result of the
# last (full-vocabulary) run once the loop ends.
print("\nWord-Level Tokenizer Experiment:")
for vocab_size in vocab_sizes:
    word_tokenizer, word_tokenized = train_and_tokenize_word_level(sentences, vocab_size)
    print(f"Vocab Size {vocab_size}: \n{word_tokenized[0]}\n{word_tokenized[1]}\n")

# Same sweep for the subword tokenizer.
print("Subword-Level Tokenizer Experiment:")
for vocab_size in vocab_sizes:
    subword_tokenizer, subword_tokenized = train_and_tokenize_subword(sentences, vocab_size)
    print(f"Vocab Size {vocab_size}: \n{subword_tokenized[0]}\n{subword_tokenized[1]}")

Test generating text using the trained tokenizers

In [None]:
def generate_text(tokenizer, tokenized_sentences, num_sentences=3):
    """
    Round-trip the leading tokenized sentences back into text.

    Each sentence's tokens are joined with single spaces, re-encoded with
    ``tokenizer`` and the resulting ids are decoded again.

    NOTE(review): because the joined tokens are re-encoded, a subword
    tokenizer presumably returns space-separated pieces rather than the
    original wording -- confirm against the tokenizer's decoder setup.

    Parameters:
        tokenizer: Object exposing ``encode(text).ids`` and ``decode(ids)``.
        tokenized_sentences: List of token lists, one per sentence.
        num_sentences: How many sentences from the start of the list to use.

    Returns:
        str: The decoded sentences, one per line.
    """
    def _round_trip(tokens):
        # Join tokens into one string, encode it, then decode the ids.
        joined = " ".join(tokens)
        return tokenizer.decode(tokenizer.encode(joined).ids)

    selected = tokenized_sentences[:num_sentences]
    return "\n".join(_round_trip(tokens) for tokens in selected)


In [None]:
# Word-level preview: uses the tokenizer left over from the last
# (full-vocabulary) iteration of the experiment loop.
print("Generated Text (Word-Level, Vocab=100%):")
word_level_preview = generate_text(word_tokenizer, word_tokenized, num_sentences=3)
print(word_level_preview)

# Subword-level preview, same settings.
print("\nGenerated Text (Subword-Level, Vocab=100%):")
subword_level_preview = generate_text(subword_tokenizer, subword_tokenized, num_sentences=3)
print(subword_level_preview)