In [1]:
# Data Collection
!curl -O https://www.gutenberg.org/files/1268/1268-0.txt

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1143k  100 1143k    0     0   358k      0  0:00:03  0:00:03 --:--:--  358k


In [2]:
import numpy as np

In [3]:
# Reading and preprocessing text
with open("../Chapter_15/1268-0.txt", "r", encoding="utf-8") as fp:
	text = fp.read()

start_idx = text.find("THE MYSTERIOUS ISLAND")
end_idx = text.find("End of the Project Gutenberg")
text = text[start_idx:end_idx]
char_set = set(text)
print("total length: ", len(text))
print("Unique Characters:", len(char_set))

total length:  1130711
Unique Characters: 85


In [4]:
chars_sorted = sorted(char_set)
char2int = {ch:i for i,ch in enumerate(chars_sorted)}
char_array = np.array(chars_sorted)
text_encoded = np.array(
	[char2int[ch] for ch in text], dtype=np.int32
)
print("Text encoded shape: ", text_encoded.shape)


Text encoded shape:  (1130711,)


In [5]:
print(text[:15], "== Encoding ==>", text_encoded[:15])

THE MYSTERIOUS  == Encoding ==> [48 36 33  1 41 53 47 48 33 46 37 43 49 47  1]


In [6]:
print(text_encoded[15:21], "== Reverse ==>",''.join(char_array[text_encoded[15:21]]))

[37 47 40 29 42 32] == Reverse ==> ISLAND


In [7]:
for ex in text_encoded[:5]:
	print("{}->{}".format(ex,char_array[ex]))

48->T
36->H
33->E
1-> 
41->M


In [8]:
# Data Preprocessing
import torch
from torch.utils.data import Dataset

seq_length = 40
chunk_size = seq_length+1
text_chunks = [text_encoded[i:i+chunk_size] for i in range(len(text_encoded)-chunk_size)]
text_chunks[:5]

[array([48, 36, 33,  1, 41, 53, 47, 48, 33, 46, 37, 43, 49, 47,  1, 37, 47,
        40, 29, 42, 32,  1, 10, 10, 10,  0,  0,  0,  0,  0, 48, 36, 33,  1,
        41, 53, 47, 48, 33, 46, 37], dtype=int32),
 array([36, 33,  1, 41, 53, 47, 48, 33, 46, 37, 43, 49, 47,  1, 37, 47, 40,
        29, 42, 32,  1, 10, 10, 10,  0,  0,  0,  0,  0, 48, 36, 33,  1, 41,
        53, 47, 48, 33, 46, 37, 43], dtype=int32),
 array([33,  1, 41, 53, 47, 48, 33, 46, 37, 43, 49, 47,  1, 37, 47, 40, 29,
        42, 32,  1, 10, 10, 10,  0,  0,  0,  0,  0, 48, 36, 33,  1, 41, 53,
        47, 48, 33, 46, 37, 43, 49], dtype=int32),
 array([ 1, 41, 53, 47, 48, 33, 46, 37, 43, 49, 47,  1, 37, 47, 40, 29, 42,
        32,  1, 10, 10, 10,  0,  0,  0,  0,  0, 48, 36, 33,  1, 41, 53, 47,
        48, 33, 46, 37, 43, 49, 47], dtype=int32),
 array([41, 53, 47, 48, 33, 46, 37, 43, 49, 47,  1, 37, 47, 40, 29, 42, 32,
         1, 10, 10, 10,  0,  0,  0,  0,  0, 48, 36, 33,  1, 41, 53, 47, 48,
        33, 46, 37, 43, 49, 47,  1],

In [9]:
class TextDataset(Dataset):
	def __init__(self, text_chunks):
		self.text_chunks = text_chunks

	def len(self):
		return len(self.text_chunks)
	
	def __getitem__(self, index):
		text_chunk = self.text_chunks[index]
		return text_chunk[:-1].long(), text_chunk[1:].long()
	
seq_dataset = TextDataset(torch.tensor(text_chunks))

for i, (seq, target) in enumerate(seq_dataset):
	print("Input (x):", repr(''.join(char_array[seq])))
	print("Target (y):", repr(''.join(char_array[target])))
	print()
	if i == 1:
		break


Input (x): 'THE MYSTERIOUS ISLAND ***\n\n\n\n\nTHE MYSTER'
Target (y): 'HE MYSTERIOUS ISLAND ***\n\n\n\n\nTHE MYSTERI'

Input (x): 'HE MYSTERIOUS ISLAND ***\n\n\n\n\nTHE MYSTERI'
Target (y): 'E MYSTERIOUS ISLAND ***\n\n\n\n\nTHE MYSTERIO'



  seq_dataset = TextDataset(torch.tensor(text_chunks))


In [10]:
from torch.utils.data import DataLoader
batch_size = 32
torch.manual_seed(1)
seq_dl = DataLoader(list(seq_dataset), batch_size, shuffle=True, drop_last=True)

In [15]:
# Building a Character Level RNN Model
import torch.nn as nn
class RNN(nn.Module):
	def __init__(self, vocab_size, embed_dim, rnn_hidden_size):
		super().__init__()
		self.embedding = nn.Embedding(vocab_size, embed_dim)
		self.rnn_hidden_size = rnn_hidden_size
		self.rnn = nn.LSTM(embed_dim,rnn_hidden_size,batch_first=True)
		self.fc = nn.Linear(rnn_hidden_size, vocab_size)

	def forward(self, x, hidden, cell):
		out = self.embedding(x).unsqueeze(1)
		out,(hidden, cell) = self.rnn(out, (hidden, cell))
		out = self.fc(out).reshape(out.size(0), -1)
		return out, hidden, cell
	
	def init_hidden(self, batch_size):
		hidden = torch.zeros(1, batch_size, self.rnn_hidden_size)
		cell = torch.zeros(1,batch_size, self.rnn_hidden_size)
		return hidden, cell

In [16]:
vocab_size = len(char_array)
embed_dim = 256
rnn_hidden_size = 512
torch.manual_seed(1)
model = RNN(vocab_size, embed_dim,rnn_hidden_size)
model

RNN(
  (embedding): Embedding(85, 256)
  (rnn): LSTM(256, 512, batch_first=True)
  (fc): Linear(in_features=512, out_features=85, bias=True)
)

In [20]:
import torch.utils
import torch.utils.tensorboard
from torch.utils.tensorboard import SummaryWriter


loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
writer = SummaryWriter("runs/Character_runs/")
num_epochs = 500
for epoch in range(num_epochs):
	hidden, cell = model.init_hidden(batch_size)
	seq_batch, target_batch = next(iter(seq_dl))
	optimizer.zero_grad()
	loss = 0
	for c in range(seq_length):
		pred, hidden, cell = model(seq_batch[:,c], hidden, cell)
		loss += loss_fn(pred, target_batch[:,c])
	loss.backward()
	optimizer.step()
	loss = loss.item()/seq_length
	if epoch % 50 == 0:
		print(f"Epoch : {epoch} Loss :{loss:.4f}")
	writer.add_scalar("Training Loss", loss, epoch)



Epoch : 0 Loss :2.0109
Epoch : 50 Loss :1.9667
Epoch : 100 Loss :1.8885
Epoch : 150 Loss :1.7178
Epoch : 200 Loss :1.6165
Epoch : 250 Loss :1.6553
Epoch : 300 Loss :1.5549
Epoch : 350 Loss :1.5901
Epoch : 400 Loss :1.5912
Epoch : 450 Loss :1.4833


In [21]:
# Evaluating phase - Generating new text passages
from torch.distributions.categorical import Categorical
logits = torch.tensor([[1.0,1.0,1.0]])
print("Probabilities:", nn.functional.softmax(logits, dim=1).numpy()[0])
m = Categorical(logits=logits)
samples = m.sample((10,))
print(samples.numpy())

Probabilities: [0.33333334 0.33333334 0.33333334]
[[0]
 [0]
 [0]
 [2]
 [0]
 [0]
 [0]
 [2]
 [0]
 [2]]


In [22]:
logits = torch.tensor([[1.0,1.0,3.0]])
print("Probabilities:", nn.functional.softmax(logits, dim=1).numpy()[0])
m = Categorical(logits=logits)
samples = m.sample((10,))
print(samples.numpy())

Probabilities: [0.10650698 0.10650698 0.78698605]
[[2]
 [2]
 [0]
 [2]
 [2]
 [2]
 [2]
 [2]
 [1]
 [2]]


In [23]:
def sample(model, starting_str, len_generated_text=500,scale_factor=1.0):
	encoded_input = torch.tensor(
		[char2int[s] for s in starting_str]
	)
	encoded_input = torch.reshape(
		encoded_input, (1,-1)
	)
	generated_str = starting_str

	model.eval()
	hidden, cell = model.init_hidden(1)
	for c in range(len(starting_str)-1):
		_, hidden, cell = model(
			encoded_input[:,c].view(1), hidden, cell
		)
	last_char = encoded_input[:, -1]
	for c in range(len_generated_text):
		logits, hidden, cell = model(
			last_char.view(1), hidden, cell
		)
		logits = torch.squeeze(logits, 0)
		scaled_logits = logits*scale_factor
		m = Categorical(logits=scaled_logits)
		last_char = m.sample()
		generated_str += str(char_array[last_char])
	return generated_str

In [25]:
print(sample(model, starting_str="It was Found"))

It was Found,
orenough the gran in the good all begt his prevable
sailor lazed the rechimented low’t loward
Harding to taking and make as
hey nabreards
was in Captain a Chatter, no hearge bord, the lave aften by aimal. from the islet had on o’clorcy.

At I
chirest-gut be
did the tabor doing.

“A pieff nature
fuch innithan which. The mose, by than manter, which it, therewn yet did give man?” replied Pendroved bay, which habumbited. The forrally some leme a rrest secence malures in some vold over it. But that


In [27]:
logits = torch.tensor([[1.0,1.0,3.0]])
print("Probabilities before Scaling: ", nn.functional.softmax(logits, dim=1).numpy()[0])
print("Probabilities after Scaling with 0.5: ", nn.functional.softmax(0.5*logits, dim=1).numpy()[0])
print("Probabilities after Scaling with 0.1: ", nn.functional.softmax(0.1*logits, dim=1).numpy()[0])


Probabilities before Scaling:  [0.10650698 0.10650698 0.78698605]
Probabilities after Scaling with 0.5:  [0.21194156 0.21194156 0.57611686]
Probabilities after Scaling with 0.1:  [0.3104238  0.3104238  0.37915248]


In [28]:
print(sample(model, starting_str="It was Found", scale_factor=2.0))

It was Found be summined on the island. The shape of the shourced and a beand still the preported all gunting was a struck the sailor was sea. The from the firest for the shore of the propass, and as the sea. The stranger and herew did not his provision was neary of the fire been seep a strugge, and a wholes were the engineer and an more of the mate of the dar at the island. But it was not can the forest for a stall greation of the streath the engineer have been an arrow and not it was prodiced the more was


In [29]:
print(sample(model, starting_str="It was Found", scale_factor=0.5))

It was FounddublaRiuj?”:.-R
OVe
vg-y6us Herbs!-AGStrak!-I-MMmukI0g
CPurmo?
The opicipCy
Angmate any!” sixstout’s lave opuring ochic
Payh,; “yo. GiditpAV, M.xNe, Mhpaul
tyyruls 
Tunteej ts yauOjlsect
faunsheq"
wad trohddel, coj melu; YOp.
“Ap-baj!”’
AnyoFne.

HowTs pout sitate,, and
knorscice:jA
“rading,” xuppent feLItrm.
A3
togk cPanazon?” part
of
wicch pNery
habd0k,-umodFUqa9)e,
knok’g/.
E1SODhb8T(”YAUvCTyQ
et1”
brepo
as le-tok!”
” row,”
wahd Dll!”’-quatwea” ret Gucy!H-”“ITay
A”
Dlat
Mazn?ZiVhatiw, “bvenif
