# プロジェクト2：文字レベルの言語モデルをPyTorchで実装する

In [1]:
import numpy as np

# Read the whole text file as UTF-8.
with open('1268-0.txt', 'r', encoding='utf-8') as fp:
    text = fp.read()

# Keep only the span between the title marker and the footer marker.
# str.find() returns -1 when a marker is absent, which would silently yield
# a wrong slice, so fail loudly instead of training on the wrong text.
start_marker = 'THE MYSTERIOUS ISLAND'
end_marker = 'END of the Project Gutenberg'
start_indx = text.find(start_marker)
end_indx = text.find(end_marker)
if start_indx == -1:
    raise ValueError(f'start marker {start_marker!r} not found in text')
if end_indx == -1:
    raise ValueError(f'end marker {end_marker!r} not found in text')
if end_indx <= start_indx:
    raise ValueError('end marker occurs before the start marker')

text = text[start_indx:end_indx]
# The set of distinct characters is the model's vocabulary.
char_set = set(text)
print('Total Length:', len(text))
print('Unique Characters:', len(char_set))

Total Length: 1112309
Unique Characters: 80


In [2]:
# Sorting the vocabulary makes the character -> id mapping deterministic.
chars_sorted = sorted(char_set)
char2int = dict(zip(chars_sorted, range(len(chars_sorted))))
# Inverse lookup table (id -> character). The spelling of this name is kept
# as-is because other cells refer to it.
cahr_array = np.array(chars_sorted)
# Encode the full text as an int32 array of character ids.
text_encoded = np.array(list(map(char2int.__getitem__, text)), dtype=np.int32)

print('Text encoded shape:', text_encoded.shape)
# Round-trip check: encode the first 15 characters, decode the next 6 ids.
print(text[:15], '== Encoding ==>', text_encoded[:15])
ids_to_decode = text_encoded[15:21]
print(ids_to_decode, '== Reverse ==>', ''.join(cahr_array[ids_to_decode]))

Text encoded shape: (1112309,)
THE MYSTERIOUS  == Encoding ==> [44 32 29  1 37 48 43 44 29 42 33 39 45 43  1]
[33 43 36 25 38 28] == Reverse ==> ISLAND


In [3]:
# Show the id -> character mapping for the first five encoded characters.
for char_id in text_encoded[:5]:
    decoded = cahr_array[char_id]
    print(f'{char_id} -> {decoded}')

44 -> T
32 -> H
29 -> E
1 ->  
37 -> M


In [4]:
import torch
from torch.utils.data import Dataset
# Each chunk holds seq_length input characters plus one extra, so the target
# sequence can be the input shifted by one position.
seq_length = 40
chunk_size = seq_length + 1
# One chunk per start offset (stride 1), so consecutive chunks overlap.
num_chunks = len(text_encoded) - chunk_size + 1
text_chunks = [
    text_encoded[start:start + chunk_size]
    for start in range(num_chunks)
]

class TextDataset(Dataset):
    """Dataset over pre-cut chunks that yields (input, target) pairs.

    For a chunk of length n, the input is its first n-1 elements and the
    target is its last n-1 elements, i.e. the input shifted by one position.
    Both are returned as int64 tensors. Chunks must support ``.long()``
    (e.g. rows of a tensor).
    """

    def __init__(self, text_chunks):
        # Indexable collection of chunks; one dataset item per chunk.
        self.text_chunks = text_chunks

    def __len__(self):
        """Return the number of chunks."""
        return len(self.text_chunks)

    def __getitem__(self, idx):
        """Return the (input, target) pair for chunk ``idx``."""
        chunk = self.text_chunks[idx]
        inputs = chunk[:-1].long()
        targets = chunk[1:].long()
        return inputs, targets
    
# Stack the chunks into a single 2-D NumPy array before converting: building
# a tensor directly from a Python list of ndarrays is extremely slow and
# makes PyTorch emit a UserWarning.
seq_dataset = TextDataset(torch.tensor(np.array(text_chunks)))

  seq_dataset = TextDataset(torch.tensor(text_chunks))


In [5]:
# Decode and display the first two (input, target) pairs; zip() with a
# two-element range stops after exactly two dataset items.
for _, (seq, target) in zip(range(2), seq_dataset):
    input_text = ''.join(cahr_array[seq])
    target_text = ''.join(cahr_array[target])
    print(' Input (x): ', repr(input_text))
    print('Target (y): ', repr(target_text))
    print()

 Input (x):  'THE MYSTERIOUS ISLAND\n\nby Jules Verne\n\n1'
Target (y):  'HE MYSTERIOUS ISLAND\n\nby Jules Verne\n\n18'

 Input (x):  'HE MYSTERIOUS ISLAND\n\nby Jules Verne\n\n18'
Target (y):  'E MYSTERIOUS ISLAND\n\nby Jules Verne\n\n187'



In [6]:
from torch.utils.data import DataLoader
batch_size = 64
# Seed before building the loader so the shuffling order is reproducible.
torch.manual_seed(1)
# drop_last=True discards a final incomplete batch, so every batch has
# exactly batch_size rows.
seq_dl = DataLoader(
    seq_dataset,
    batch_size=batch_size,
    shuffle=True,
    drop_last=True,
)

In [7]:
import torch.nn as nn

class RNN(nn.Module):
    """Character-level language model: embedding -> single-layer LSTM -> linear.

    The model is stepped one character at a time: ``forward`` consumes one
    token id per batch element and returns logits over the vocabulary for the
    next character, together with the updated LSTM state.

    Args:
        vocab_size: Number of distinct characters (embedding rows and
            output logits).
        embed_size: Dimension of the character embeddings.
        rnn_hidden_size: Size of the LSTM hidden and cell states.
    """

    def __init__(self, vocab_size, embed_size, rnn_hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn_hidden_size = rnn_hidden_size
        self.rnn = nn.LSTM(embed_size, rnn_hidden_size, batch_first=True)
        self.fc = nn.Linear(rnn_hidden_size, vocab_size)

    def forward(self, x, hidden, cell):
        """Advance the model by one time step.

        Args:
            x: Token ids for the current step, one per batch element
                (shape ``(batch,)``).
            hidden: LSTM hidden state, shape ``(1, batch, rnn_hidden_size)``.
            cell: LSTM cell state, same shape as ``hidden``.

        Returns:
            Tuple ``(logits, hidden, cell)`` where ``logits`` has shape
            ``(batch, vocab_size)`` and the states are the updated LSTM state.
        """
        # Insert a sequence dimension of length 1 (the LSTM is batch_first).
        out = self.embedding(x).unsqueeze(1)
        out, (hidden, cell) = self.rnn(out, (hidden, cell))
        # Flatten (batch, 1, vocab) -> (batch, vocab).
        out = self.fc(out).reshape(out.size(0), -1)
        return out, hidden, cell

    def init_hidden(self, batch_size):
        """Return zero-initialised ``(hidden, cell)`` states for a new sequence.

        The states are created with the same dtype and device as the model's
        parameters, so they stay compatible after ``model.to(...)`` or
        ``model.double()``.

        Args:
            batch_size: Number of sequences processed in parallel.

        Returns:
            Tuple ``(hidden, cell)``, each of shape
            ``(1, batch_size, rnn_hidden_size)``.
        """
        ref = self.fc.weight
        # Leading 1: a single-layer, unidirectional LSTM.
        shape = (1, batch_size, self.rnn_hidden_size)
        hidden = torch.zeros(shape, dtype=ref.dtype, device=ref.device)
        cell = torch.zeros(shape, dtype=ref.dtype, device=ref.device)
        return hidden, cell

In [8]:
# One embedding row / output logit per distinct character.
vocab_size = len(char_set)
embed_size, rnn_hidden_size = 256, 512
# Seed before construction so the weight initialisation is reproducible.
torch.manual_seed(1)
model = RNN(
    vocab_size=vocab_size,
    embed_size=embed_size,
    rnn_hidden_size=rnn_hidden_size,
)
print(model)

RNN(
  (embedding): Embedding(80, 256)
  (rnn): LSTM(256, 512, batch_first=True)
  (fc): Linear(in_features=512, out_features=80, bias=True)
)


In [9]:
# Cross-entropy over next-character logits, optimised with Adam.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(
    params=model.parameters(),
    lr=0.005,
)

In [10]:
num_epochs = 10000
torch.manual_seed(1)
for epoch in range(num_epochs):
    # Each "epoch" here is a single optimisation step on one mini-batch:
    # the LSTM state is reset and a fresh iterator over the shuffled loader
    # supplies its first batch.
    hidden, cell = model.init_hidden(batch_size)
    seq_batch, target_batch = next(iter(seq_dl))
    optimizer.zero_grad()

    # Feed the sequence one character at a time, collecting the loss of
    # every time step.
    step_losses = []
    for t in range(seq_length):
        pred, hidden, cell = model(seq_batch[:, t], hidden, cell)
        step_losses.append(loss_fn(pred, target_batch[:, t]))
    total_loss = sum(step_losses)

    total_loss.backward()
    optimizer.step()

    # Report the mean per-character loss every 100 steps.
    loss = total_loss.item() / seq_length
    if epoch % 100 == 0:
        print(f'Epoch {epoch}, loss: {loss:.4f}')

Epoch 0, loss: 4.3720
Epoch 100, loss: 1.7148
Epoch 200, loss: 1.5688
Epoch 300, loss: 1.4150
Epoch 400, loss: 1.3449
Epoch 500, loss: 1.4155
Epoch 600, loss: 1.3027
Epoch 700, loss: 1.3343
Epoch 800, loss: 1.2885
Epoch 900, loss: 1.2950
Epoch 1000, loss: 1.3157
Epoch 1100, loss: 1.2778
Epoch 1200, loss: 1.2536
Epoch 1300, loss: 1.2551
Epoch 1400, loss: 1.2739
Epoch 1500, loss: 1.3049
Epoch 1600, loss: 1.2745
Epoch 1700, loss: 1.2224
Epoch 1800, loss: 1.2883
Epoch 1900, loss: 1.2362
Epoch 2000, loss: 1.2192
Epoch 2100, loss: 1.2425
Epoch 2200, loss: 1.2555
Epoch 2300, loss: 1.1883
Epoch 2400, loss: 1.1441
Epoch 2500, loss: 1.1865
Epoch 2600, loss: 1.2089
Epoch 2700, loss: 1.1773
Epoch 2800, loss: 1.1980
Epoch 2900, loss: 1.2071
Epoch 3000, loss: 1.1528
Epoch 3100, loss: 1.2140
Epoch 3200, loss: 1.1547
Epoch 3300, loss: 1.1545
Epoch 3400, loss: 1.1960
Epoch 3500, loss: 1.1764
Epoch 3600, loss: 1.2323
Epoch 3700, loss: 1.1993
Epoch 3800, loss: 1.1715
Epoch 3900, loss: 1.1584
Epoch 4000, 

In [16]:
# Serialize the entire model object (not only its state_dict) to disk.
path = 'RNN_Project2.pt'
torch.save(model, path)

In [17]:
# Restore the full model object saved above.
# SECURITY: weights_only=False lets torch.load unpickle arbitrary Python
# objects, which can execute code; only load checkpoint files you trust.
# NOTE(review): saving/loading model.state_dict() would avoid this.
model = torch.load(path, weights_only=False)

In [14]:
from torch.distributions.categorical import Categorical
def sample(model, starting_str, len_generated_text=500, scale_factor=1.0):
    """Generate text by sampling one character at a time from ``model``.

    The prompt is first fed through the model to warm up the LSTM state;
    then characters are drawn from the categorical distribution defined by
    the (scaled) output logits, each sampled character becoming the next
    input. Relies on the module-level ``char2int`` and ``cahr_array``
    lookup tables.

    Args:
        model: Model exposing ``init_hidden(batch_size)`` and
            ``model(x, hidden, cell) -> (logits, hidden, cell)``.
        starting_str: Non-empty prompt; every character must be a key of
            ``char2int``.
        len_generated_text: Number of characters to generate after the
            prompt.
        scale_factor: Multiplier applied to the logits before sampling.
            Values below 1 flatten the distribution (more random output);
            values above 1 sharpen it (more predictable output).

    Returns:
        ``starting_str`` followed by ``len_generated_text`` sampled
        characters.

    Raises:
        ValueError: If ``starting_str`` is empty.
        KeyError: If ``starting_str`` contains a character missing from
            ``char2int``.

    Note:
        Puts ``model`` into eval mode and leaves it there.
    """
    if not starting_str:
        raise ValueError('starting_str must contain at least one character')

    encoded_input = torch.tensor([char2int[s] for s in starting_str])
    encoded_input = torch.reshape(encoded_input, (1, -1))
    # Collect pieces and join once instead of repeated string concatenation.
    generated_chars = [starting_str]

    model.eval()
    # Inference only: without no_grad() every step would extend the autograd
    # graph through the carried hidden/cell state, wasting memory and time.
    with torch.no_grad():
        hidden, cell = model.init_hidden(1)
        # Warm up the state on all prompt characters except the last one.
        for c in range(len(starting_str) - 1):
            _, hidden, cell = model(encoded_input[:, c], hidden, cell)

        # The last prompt character is the first input of the sampling loop.
        last_char = encoded_input[:, -1]
        for _ in range(len_generated_text):
            logits, hidden, cell = model(last_char.view(1), hidden, cell)
            logits = torch.squeeze(logits, 0)
            scaled_logits = logits * scale_factor
            m = Categorical(logits=scaled_logits)
            last_char = m.sample()
            generated_chars.append(str(cahr_array[last_char]))

    return ''.join(generated_chars)

In [15]:
# Fix the RNG seed so the sampled text is reproducible.
torch.manual_seed(1)
generated_text = sample(model, starting_str='The island')
print(generated_text)

The island would distant
supply of steel, but there was--”

“We, VINEYART mothing some islet by the roads to be affect, when Western would half it than nothing but year, led by that without animined which the electricity, the engineer mounted by the war of question. You see.”

“Perhaps,” answered the reporter.

“You have they are of a large conimina, and had just to some perpendicularly low-troughly before than their presence of the Chimneys, they advanced by the castaway, our mountain will be best, movem


`scale_factor` でロジットをスケーリングすることで、温度調整（`scale_factor` は温度の逆数に相当）もできる。値を小さくするほど出力のランダム性が増し、大きくするほど予測しやすいテキストになる。生成されたテキストの新奇さと正確さの間にはトレードオフがある。