<a href="https://colab.research.google.com/github/simply-pouria/The-LMs-Book/blob/main/TheLMBook_Chapter2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Chapter 2
### Training a Bag-of-Words Neural Network
using the sofmax activation function and cross-entropy loss function


In [None]:
import re
import torch
import torch.nn as nn
torch.manual_seed(42)

docs = [
 "Movies are fun for everyone.",
 "Watching movies is great fun.",
 "Enjoy a great movie today.",
 "Research is interesting and important.",
 "Learning math is very important.",
 "Science discovery is interesting.",
 "Rock is great to listen to.",
 "Listen to music for fun.",
 "Music is fun for everyone.",
 "Listen to folk music!"
]
labels = [1, 1, 1, 3, 3, 3, 2, 2, 2, 2]
num_classes = len(set(labels))

def tokenize(text):
  return re.findall(r"\w+", text.lower())
def get_vocabulary(texts):
  tokens = {token for text in texts for token in tokenize(text)}
  return {word: idx for idx, word in enumerate(sorted(tokens))}

vocabulary = get_vocabulary(docs)

def doc_to_bow(doc, vocabulary):
  tokens = set(tokenize(doc))
  bow = [0] * len(vocabulary)
  for token in tokens:
    if token in vocabulary:
      bow[vocabulary[token]] = 1
      return bow

vectors = torch.tensor(
 [doc_to_bow(doc, vocabulary) for doc in docs],
 dtype=torch.float32
)
labels = torch.tensor(labels, dtype=torch.long) - 1

input_dim = len(vocabulary)
hidden_dim = 50
output_dim = num_classes

class SimpleClassifier(nn.Module):
  def __init__(self, input_dim, hidden_dim, output_dim):
    super().__init__()
    self.fc1 = nn.Linear(input_dim, hidden_dim)
    self.relu = nn.ReLU()
    self.fc2 = nn.Linear(hidden_dim, output_dim)
  def forward(self, x):
    x = self.fc1(x)
    x = self.relu(x)
    x = self.fc2(x)
    return x
model = SimpleClassifier(input_dim, hidden_dim, output_dim)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

for step in range(3000):
  optimizer.zero_grad()
  loss = criterion(model(vectors), labels)
  loss.backward()
  optimizer.step()





### Inferencing the Trained Model

In [None]:
new_docs = [
 "Listening to rock music is fun.",
 "I like to listen to this song."
]
class_names = ["Cinema", "Music", "Science"]
new_doc_vectors = torch.tensor(
 [doc_to_bow(new_doc, vocabulary) for new_doc in new_docs]
,
 dtype=torch.float32
)
with torch.no_grad():
 outputs = model(new_doc_vectors)
 predicted_ids = torch.argmax(outputs, dim=1) + 1
for i, new_doc in enumerate(new_docs):
 print(f'{new_doc}: {class_names[predicted_ids[i].item() -1]}')

Listening to rock music is fun.: Music
I like to listen to this song.: Music


## Implementning Byte-Pair Encoding


In [4]:
from collections import defaultdict

def initialize_vocabulary(corpus):
  vocabulary = defaultdict(int)
  charset = set()
  for word in corpus:
    word_with_marker = '_' + word
    characters = list(word_with_marker)
    charset.update(characters)
    tokenized_word = ' '.join(characters)
    vocabulary[tokenized_word] += 1
  return vocabulary, charset


def get_pair_counts(vocabulary):
  pair_counts = defaultdict(int)
  for tokenized_word, count in vocabulary.items():
    tokens = tokenized_word.split()
    for i in range(len(tokens) - 1):
      pair = (tokens[i], tokens[i + 1])
      pair_counts[pair] += count
  return pair_counts


def merge_pair(vocabulary, pair):
  new_vocabulary = {}
  bigram = re.escape(' '.join(pair))
  pattern = re.compile(r"(?<!\S)" + bigram + r"(?!\S)")
  for tokenized_word, count in vocabulary.items():
    new_tokenized_word = pattern.sub("".join(pair), tokenized_word)
    new_vocabulary[new_tokenized_word] = count
  return new_vocabulary


def byte_pair_encoding(corpus, vocab_size):
  vocabulary, charset = initialize_vocabulary(corpus)
  merges = []
  tokens = set(charset)
  while len(tokens) < vocab_size:
    pair_counts = get_pair_counts(vocabulary)
    if not pair_counts:
     break
    most_frequent_pair = max(pair_counts, key=pair_counts.get)
    merges.append(most_frequent_pair)
    vocabulary = merge_pair(vocabulary, most_frequent_pair)
    new_token = ''.join(most_frequent_pair) ➎
    tokens.add(new_token) ➏

  return vocabulary, merges, charset, tokens

def tokenize_word(word, merges, vocabulary, charset, unk_token="<UNK>"):
  word = '_' + word
  if word in vocabulary:
    return [word]
  tokens = [char if char in charset else unk_token for char in word]

  for left, right in merges:
    i = 0
    while i < len(tokens) - 1:
      if tokens[i:i+2] == [left, right]:
        tokens[i:i+2] = [left + right]
      else:
        i += 1
  return tokens

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 33)