In [1]:
!wget https://sherlock-holm.es/stories/plain-text/cano.txt
!mkdir data
!mv cano.txt data/sherlock-holmes.txt

--2024-03-15 05:43:56--  https://sherlock-holm.es/stories/plain-text/cano.txt
Resolving sherlock-holm.es (sherlock-holm.es)... 157.90.249.21, 2a01:4f8:1c17:5725::1
Connecting to sherlock-holm.es (sherlock-holm.es)|157.90.249.21|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3868223 (3.7M) [text/plain]
Saving to: ‘cano.txt’


2024-03-15 05:43:57 (3.17 MB/s) - ‘cano.txt’ saved [3868223/3868223]

mkdir: cannot create directory ‘data’: File exists


In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [3]:
class Embedding:
  """One-hot encoder for a token sequence and its target token.

  Args:
    vocab: Ordered sequence of tokens. A token's position in ``vocab`` is its
      class index, and ``len(vocab)`` is the width of the one-hot vectors.
  """

  def __init__(self, vocab):
    self.vocab = vocab
    self.vocab_size = len(vocab)
    # Build the token -> position table once so encoding a sample does not
    # rescan the vocabulary for every character. ``setdefault`` keeps the
    # first position of a repeated token, matching ``vocab.index``.
    self._token_to_idx = {}
    try:
      for position, token in enumerate(vocab):
        self._token_to_idx.setdefault(token, position)
    except TypeError:
      # Unhashable tokens cannot be cached; every lookup then falls back to
      # ``vocab.index``.
      self._token_to_idx = {}

  def _index_of(self, token):
    """Return the position of ``token`` in the vocabulary.

    Raises:
      ValueError: If ``token`` is not present in ``self.vocab``.
    """
    try:
      return self._token_to_idx[token]
    except (KeyError, TypeError):
      # Defer to the sequence itself so that a miss behaves exactly like a
      # direct ``vocab.index`` call (including the ValueError it raises).
      return self.vocab.index(token)

  def __call__(self, X, y):
    """Encode one sample.

    Args:
      X: Iterable of tokens with a length (e.g. a string of characters).
      y: The single target token.

    Returns:
      tuple: ``(X_tensor, y_tensor)`` where ``X_tensor`` is a float tensor of
      shape ``(len(X), vocab_size)`` with one ``1.0`` per row, and
      ``y_tensor`` is a 0-dim integer tensor holding the index of ``y``.

    Raises:
      ValueError: If any token is not in the vocabulary.
    """
    sequence_length = len(X)
    X_tensor = torch.zeros((sequence_length, self.vocab_size))
    for i, x in enumerate(X):
      X_tensor[i, self._index_of(x)] = 1.0

    y_tensor = torch.tensor(self._index_of(y))

    return X_tensor, y_tensor

In [4]:
class TextDataset(Dataset):
  """Map-style dataset that pairs each input sequence with its target.

  Samples are stored raw and passed through ``embedding`` only when indexed.
  """

  def __init__(self, X, Y, embedding):
    self.embedding = embedding
    self.X, self.Y = X, Y
    # The sample count is taken from the inputs once, at construction time.
    self.no_of_samples = len(X)

  def __len__(self):
    """Number of samples recorded at construction."""
    return self.no_of_samples

  def __getitem__(self, idx):
    """Return the encoded ``(input, target)`` pair stored at ``idx``."""
    encoded_input, encoded_target = self.embedding(self.X[idx], self.Y[idx])
    return encoded_input, encoded_target

In [5]:
class CharacterPredictor(nn.Module):
  """Stacked ReLU RNN that scores the next token from a one-hot sequence.

  Args:
    vocab_size: Width of each input vector and number of output scores.
    hidden_size: Width of the recurrent hidden state (default 128).
    num_layers: Number of stacked recurrent layers (default 3).
  """

  def __init__(self, vocab_size, hidden_size=128, num_layers=3):
    super().__init__()
    self.rnn = nn.RNN(input_size=vocab_size, hidden_size=hidden_size, num_layers=num_layers, nonlinearity='relu', batch_first=True)
    self.fc = nn.Linear(in_features=hidden_size, out_features=vocab_size)

  def forward(self, x):
    """Score the next token for every sequence in the batch.

    Args:
      x: Tensor of shape ``(batch, sequence, vocab_size)``.

    Returns:
      Tensor of shape ``(batch, vocab_size)`` with unnormalised scores.
    """
    output, _status = self.rnn(x)
    # Only the top layer's output at the final time step feeds the classifier.
    last_step = output[:, -1, :]
    return self.fc(last_step)

In [6]:
def create_dataset(lines, sequence_length):
  """Slide a fixed-size window over every line to build next-token samples.

  Each window of ``sequence_length`` items becomes an input and the item
  immediately after it becomes the target. Windows never span two lines.

  Args:
    lines: Iterable of sliceable sequences (e.g. strings).
    sequence_length: Number of items in each input window.

  Returns:
    tuple: ``(X, Y)`` lists of equal length holding the windows and targets.
  """
  windows = []
  targets = []
  for text_line in lines:
    # The range is empty when the line is not longer than the window, so
    # such lines contribute no samples.
    for start in range(len(text_line) - sequence_length):
      stop = start + sequence_length
      windows.append(text_line[start:stop])
      targets.append(text_line[stop])
  return windows, targets

In [7]:
def train(model, train_loader, criterion, optimizer, log_interval=None, device="cpu"):
    """Run one optimisation pass over ``train_loader``.

    Args:
        model: Module to train; switched to training mode.
        train_loader: Iterable of ``(data, target)`` batches that also
            exposes ``.dataset`` (its length is the normalisation factor).
        criterion: Callable ``criterion(output, target)`` returning the loss.
        optimizer: Optimizer stepped once per batch.
        log_interval: If not ``None``, print progress every this many batches.
        device: Device that each batch is moved to.

    Returns:
        tuple: ``(acc, running_loss)`` as plain Python floats: accuracy in
        percent and the batch-size-weighted loss divided by the dataset size.
    """
    model.train()
    running_loss = 0.0
    correct = 0
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        # Weight by batch size so the final division gives a per-sample
        # figure (assumes the criterion averages over the batch — confirm
        # for custom criteria).
        running_loss += loss.item() * len(data)
        pred = output.max(1, keepdim=True)[1]
        # .item() keeps the counter a Python int, so the returned accuracy is
        # a float rather than a tensor living on ``device``.
        correct += pred.eq(target.view_as(pred)).sum().item()

        if log_interval is not None and batch_idx % log_interval == 0:
            print('\t\t{}/{} ({:.0f}%)\tLoss: {:.6f}'.format(
                    batch_idx * len(data), len(train_loader.dataset),
                    100. * batch_idx / len(train_loader), loss.item()))
    num_samples = len(train_loader.dataset)
    running_loss /= num_samples
    acc = 100. * correct / num_samples
    return acc, running_loss

def test(model, test_loader, criterion, device="cpu"):
    model.eval()
    running_loss = 0.0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            running_loss += criterion(output, target).item() * len(data)
            pred = output.data.max(1, keepdim=True)[1]
            correct += pred.eq(target.data.view_as(pred)).sum()
    running_loss /= len(test_loader.dataset)
    acc = 100. * correct / len(test_loader.dataset)
    return acc, running_loss

In [8]:
# Run configuration: everything a reader might want to tune lives here.
FILE_PATH = 'data/sherlock-holmes.txt'  # corpus location on disk
SEQUENCE_LENGTH = 10                    # characters per input window
EPOCHS = 20
BATCH_SIZE_TRAIN = 32
BATCH_SIZE_TEST = 1000
LOG_INTERVAL = 10000                    # batches between progress lines
LR = 0.001
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed PyTorch's global RNG so weight initialisation and DataLoader shuffling
# are repeatable from one Restart & Run All to the next.
SEED = 42
torch.manual_seed(SEED)

In [9]:
# Read the corpus one line per list entry. The encoding is pinned so that
# decoding does not depend on the machine's locale.
# NOTE(review): assumes the downloaded file is UTF-8 (or plain ASCII) — confirm.
with open(FILE_PATH, encoding="utf-8") as f:
  lines = f.readlines()

In [10]:
# Trim surrounding whitespace from every line and drop the lines left empty.
stripped_lines = [
  trimmed
  for trimmed in (raw_line.strip() for raw_line in lines)
  if trimmed != ''
]

In [11]:
# Concatenate the cleaned lines in a single pass; join avoids rebuilding the
# growing string on every iteration.
text = ''.join(stripped_lines)

# The vocabulary is every distinct character in the corpus, in sorted order,
# so a character's position is stable for a given corpus.
vocab = sorted(set(text))

In [12]:
X_dataset, Y_dataset = create_dataset(stripped_lines, SEQUENCE_LENGTH)

# Pin the shuffle so the same samples land in train and test on every run;
# without random_state the split differs each time the cell executes.
# NOTE(review): samples are overlapping windows of the same lines, so a
# random split puts near-identical contexts on both sides — consider
# splitting by line before windowing.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_dataset, Y_dataset, test_size=0.25, random_state=42)

In [13]:
# Both datasets share one encoder so class indices agree between them.
embedding = Embedding(vocab)
train_dataset = TextDataset(X_train, Y_train, embedding)
test_dataset = TextDataset(X_test, Y_test, embedding)

# Training batches are reshuffled every epoch. Evaluation sums over every
# sample, so its order is irrelevant and shuffling would only add work.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE_TRAIN, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE_TEST, shuffle=False)

In [14]:
# Build the network with one input/output unit per vocabulary entry and place
# it on the selected device.
model = CharacterPredictor(len(vocab)).to(DEVICE)

In [15]:
# Plain SGD over every model parameter, paired with a cross-entropy loss on
# the model's unnormalised class scores.
optimizer = optim.SGD(params=model.parameters(), lr=LR)
criterion = nn.CrossEntropyLoss()

In [None]:
# Per-epoch loss history for the training and held-out sets.
train_losses = []
test_losses = []

# Each epoch is one full training pass followed by one evaluation pass.
for epoch in range(1, EPOCHS + 1):
    print(f"Epoch: {epoch}")

    print("\tTraining: ")
    # train() returns (accuracy in percent, loss averaged over the dataset).
    train_acc, train_loss = train(model, train_loader, criterion, optimizer, LOG_INTERVAL, DEVICE)

    print("\t\t***********************************")
    print("\t\t\tAccuracy: {:.4}%".format(train_acc))
    print("\t\t\tLoss: {:.4}".format(train_loss))
    print("\t\t***********************************")

    print("\tValidation: ")
    # test() evaluates without gradient tracking and returns the same pair.
    test_acc, test_loss = test(model, test_loader, criterion, DEVICE)
    print("\t\tAccuracy: {:.4}%".format(test_acc))
    print("\t\tLoss: {:.4}".format(test_loss))

    # Both histories are appended only after the epoch's two passes finish,
    # so the lists stay the same length if a run is interrupted mid-epoch.
    train_losses.append(train_loss)
    test_losses.append(test_loss)

Epoch: 1
	Training: 
		0/2164078 (0%)	Loss: 4.590468
		320000/2164078 (15%)	Loss: 3.099102
		640000/2164078 (30%)	Loss: 3.512105
		960000/2164078 (44%)	Loss: 2.968599
		1280000/2164078 (59%)	Loss: 3.121705
		1600000/2164078 (74%)	Loss: 3.261446
		1920000/2164078 (89%)	Loss: 2.736673
		***********************************
			Accuracy: 17.94%
			Loss: 3.06
		***********************************
	Validation: 
		Accuracy: 24.45%
		Loss: 2.892
Epoch: 2
	Training: 
		0/2164078 (0%)	Loss: 2.551030
		320000/2164078 (15%)	Loss: 3.082278
		640000/2164078 (30%)	Loss: 2.761587
		960000/2164078 (44%)	Loss: 3.002820
		1280000/2164078 (59%)	Loss: 2.844057
		1600000/2164078 (74%)	Loss: 2.490162
		1920000/2164078 (89%)	Loss: 2.159872
		***********************************
			Accuracy: 27.88%
			Loss: 2.602
		***********************************
	Validation: 
		Accuracy: 32.66%
		Loss: 2.386
Epoch: 3
	Training: 
		0/2164078 (0%)	Loss: 2.194847
		320000/2164078 (15%)	Loss: 2.370729
		640000/2164078 (30%)	Los