In [1]:
#Libraries
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import os
from collections import Counter
# Show the contents of the current working directory.
print(os.listdir())

['.ipynb_checkpoints', 'TextGeneration.ipynb', 'HPBook1.txt', 'final_model']


In [2]:
# Read the whole book into memory. The context manager closes the file handle
# as soon as the text has been read instead of leaving it open for the life
# of the kernel.
with open('./HPBook1.txt', encoding = 'utf-8') as file:
    file_contents = file.read()

In [3]:
# Display the first 1000 characters of the raw text (newlines appear as \n
# because the slice is shown via its repr as the cell's last expression).
print('Sample text from book')
file_contents[:1000]

Sample text from book


"Harry Potter and the Sorcerer's Stone \n\nCHAPTER ONE \n\nTHE BOY WHO LIVED \n\nMr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. They were the last people you'd expect to be involved in anything strange or mysterious, because they just didn't hold with such nonsense. \n\nMr. Dursley was the director of a firm called Grunnings, which made drills. He was a big, beefy man with hardly any neck, although he did have a very large mustache. Mrs. Dursley was thin and blonde and had nearly twice the usual amount of neck, which came in very useful as she spent so much of her time craning over garden fences, spying on the neighbors. The Dursleys had a small son called Dudley and in their opinion there was no finer boy anywhere. \n\nThe Dursleys had everything they wanted, but they also had a secret, and their greatest fear was that somebody would discover it. They didn't think they could bear it if anyone found out about 

In [4]:
# Character-level statistics.
total_characters = len(file_contents)
unique_characters = len(set(file_contents))

# Word-level statistics; a "word" is any whitespace-delimited token.
total_words = len(file_contents.split())
unique_words = len(set(file_contents.split()))

# Paragraphs are separated by a blank line in the source text.
paragraphs = file_contents.split('\n\n')

print("Total words in book :", total_words)
print("Total characters in book :", total_characters)
print("Unique words in book :", unique_words)
print("Unique characters in book :", unique_characters)
print("Total Paragraphs :", len(paragraphs))

Total words in book : 78449
Total characters in book : 442744
Unique words in book : 11897
Unique characters in book : 80
Total Paragraphs : 3033


In [5]:
# Frequency table over every character of every paragraph. Despite the name,
# the keys are single characters, not words: iterating a paragraph string
# yields its characters one at a time.
most_words = Counter(
    character
    for paragraph in paragraphs
    for character in paragraph
)

most_words.most_common()

[(' ', 78449),
 ('e', 39628),
 ('t', 27993),
 ('a', 25887),
 ('o', 25809),
 ('n', 21337),
 ('r', 20990),
 ('h', 19535),
 ('i', 19422),
 ('s', 18870),
 ('d', 15932),
 ('l', 14385),
 ('u', 9562),
 ('y', 8293),
 ('g', 8127),
 ('w', 7744),
 ('m', 6729),
 ('f', 6431),
 ('c', 6403),
 ('.', 6136),
 (',', 5658),
 ('b', 4980),
 ('p', 4909),
 ('"', 4747),
 ('k', 3930),
 ("'", 3141),
 ('H', 2996),
 ('v', 2716),
 ('-', 1986),
 ('I', 1393),
 ('T', 1055),
 ('S', 844),
 ('?', 754),
 ('A', 703),
 ('D', 685),
 ('M', 665),
 ('R', 660),
 ('W', 653),
 ('P', 639),
 ('G', 492),
 ('N', 488),
 ('!', 474),
 ('F', 426),
 ('x', 381),
 ('B', 348),
 ('O', 332),
 ('Y', 326),
 ('j', 319),
 ('C', 293),
 ('E', 287),
 ('z', 259),
 ('q', 217),
 ('L', 209),
 ('Q', 203),
 ('U', 193),
 ('V', 192),
 (';', 135),
 ('K', 79),
 (':', 69),
 ('J', 51),
 (')', 33),
 ('(', 30),
 ('“', 11),
 ('1', 11),
 ('3', 8),
 ('4', 6),
 ('Z', 5),
 ('0', 5),
 ('7', 4),
 ('9', 4),
 ('2', 3),
 ('X', 2),
 ('5', 2),
 ('*', 2),
 ('–', 1),
 ('~', 1),


In [6]:
# Character vocabulary in sorted order, with lookup tables in both directions.
vocab = sorted(set(file_contents))
int_to_vocab = dict(enumerate(vocab))
vocab_to_int = {char: index for index, char in int_to_vocab.items()}

# The whole book as one array of integer character ids.
embeddings = np.array(
    [vocab_to_int[char] for char in file_contents],
    dtype=np.int32,
)

vocab_to_int

{'\n': 0,
 ' ': 1,
 '!': 2,
 '"': 3,
 "'": 4,
 '(': 5,
 ')': 6,
 '*': 7,
 ',': 8,
 '-': 9,
 '.': 10,
 '0': 11,
 '1': 12,
 '2': 13,
 '3': 14,
 '4': 15,
 '5': 16,
 '6': 17,
 '7': 18,
 '8': 19,
 '9': 20,
 ':': 21,
 ';': 22,
 '?': 23,
 'A': 24,
 'B': 25,
 'C': 26,
 'D': 27,
 'E': 28,
 'F': 29,
 'G': 30,
 'H': 31,
 'I': 32,
 'J': 33,
 'K': 34,
 'L': 35,
 'M': 36,
 'N': 37,
 'O': 38,
 'P': 39,
 'Q': 40,
 'R': 41,
 'S': 42,
 'T': 43,
 'U': 44,
 'V': 45,
 'W': 46,
 'X': 47,
 'Y': 48,
 'Z': 49,
 '\\': 50,
 'a': 51,
 'b': 52,
 'c': 53,
 'd': 54,
 'e': 55,
 'f': 56,
 'g': 57,
 'h': 58,
 'i': 59,
 'j': 60,
 'k': 61,
 'l': 62,
 'm': 63,
 'n': 64,
 'o': 65,
 'p': 66,
 'q': 67,
 'r': 68,
 's': 69,
 't': 70,
 'u': 71,
 'v': 72,
 'w': 73,
 'x': 74,
 'y': 75,
 'z': 76,
 '~': 77,
 '–': 78,
 '“': 79}

In [7]:
# Print the first 120 characters of the text above their integer encoding,
# framed by two 100-character rules.
print(
    "Examples encodings",
    '-'*100,
    file_contents[:120],
    embeddings[:120],
    '-'*100,
    sep='\n',
)

Examples encodings
----------------------------------------------------------------------------------------------------
Harry Potter and the Sorcerer's Stone 

CHAPTER ONE 

THE BOY WHO LIVED 

Mr. and Mrs. Dursley, of number four, Privet D
[31 51 68 68 75  1 39 65 70 70 55 68  1 51 64 54  1 70 58 55  1 42 65 68
 53 55 68 55 68  4 69  1 42 70 65 64 55  1  0  0 26 31 24 39 43 28 41  1
 38 37 28  1  0  0 43 31 28  1 25 38 48  1 46 31 38  1 35 32 45 28 27  1
  0  0 36 68 10  1 51 64 54  1 36 68 69 10  1 27 71 68 69 62 55 75  8  1
 65 56  1 64 71 63 52 55 68  1 56 65 71 68  8  1 39 68 59 72 55 70  1 27]
----------------------------------------------------------------------------------------------------


In [8]:
def generate_batches(inputs, batch_size, num_steps):
    """Yield ``(x, y)`` training windows for next-character prediction.

    ``inputs`` is truncated to a whole number of ``batch_size * num_steps``
    chunks and reshaped to ``(batch_size, -1)``, so each row is one contiguous
    slice of the sequence. The rows are then walked in windows of
    ``num_steps`` columns.

    Args:
        inputs: 1-D NumPy array of encoded characters.
        batch_size: number of rows (parallel sequences) per batch.
        num_steps: number of columns (time steps) per window.

    Yields:
        Tuples ``(x, y)`` of arrays shaped ``(batch_size, num_steps)``.
        ``y[:, t]`` is the element that follows ``x[:, t]`` in the same row.
        The final window of a row has no successor inside the row, so its
        last target wraps around to the row's first element.
    """
    char_batch = batch_size * num_steps
    num_batches = len(inputs)//char_batch

    # Not enough data for even a single full batch: yield nothing.
    if num_batches == 0:
        return

    # Drop the trailing remainder so the data reshapes cleanly.
    inputs = inputs[:char_batch * num_batches]
    inputs = inputs.reshape((batch_size, -1))
    row_length = inputs.shape[1]

    for start in range(0, row_length, num_steps):
        stop = start + num_steps
        x = inputs[:, start:stop]
        y = np.zeros_like(x)
        y[:, :-1] = x[:, 1:]
        # The target for the last step is the character right after the
        # window (not the window's own first character). ``stop % row_length``
        # is 0 only for the final window, which wraps to the row start.
        y[:, -1] = inputs[:, stop % row_length]

        yield x, y

In [9]:
class charLSTM(nn.Module):
    """Character-level language model: embedding -> stacked LSTM -> linear -> log-softmax.

    Args:
        num_emb: number of distinct input symbols (rows of the embedding table).
        emb_dim: size of each embedding vector.
        hidden_dim: size of the LSTM hidden state.
        tagset_size: number of output classes scored at every time step.
        num_layers: number of stacked LSTM layers. Defaults to 2, the depth
            that was previously hard-coded, so existing checkpoints still load.
    """

    def __init__(self, num_emb, emb_dim, hidden_dim, tagset_size, num_layers=2):
        super().__init__()

        self.hidden_dim = hidden_dim
        self.num_emb = num_emb
        self.emb_dim = emb_dim
        self.num_layers = num_layers

        # Attribute names are part of the state-dict keys; keep them stable.
        self.word_embeddings = nn.Embedding(num_emb, emb_dim)

        # batch_first=True: inputs and outputs are (batch, seq_len, features).
        self.lstm = nn.LSTM(emb_dim, hidden_dim, num_layers, batch_first = True)

        self.hidden2out = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        """Score every position of a batch of index sequences.

        Args:
            sentence: integer tensor of symbol indices shaped (batch, seq_len).

        Returns:
            Tensor shaped (batch, seq_len, tagset_size) of log-probabilities
            (log-softmax over the last dimension), suitable for ``nn.NLLLoss``.
            The LSTM's final hidden state is discarded.
        """
        embeds = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embeds)
        out_space = self.hidden2out(lstm_out)
        out_scores = F.log_softmax(out_space, dim = 2)

        return out_scores


In [10]:
# Hyperparameters. The model predicts a character from the same vocabulary it
# reads, so the input and output sizes are both the vocabulary size.
emb_dim, hidden_dim = 16, 512
num_emb = target_size = len(int_to_vocab)

model = charLSTM(
    num_emb=num_emb,
    emb_dim=emb_dim,
    hidden_dim=hidden_dim,
    tagset_size=target_size,
)

In [11]:
# Number of sequences per batch; passed to generate_batches when the
# training windows are built.
batch_size = 32

In [12]:
# Training setup. NLLLoss pairs with the log-softmax output of the model.
print_every = 256
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)
loss_function = nn.NLLLoss()

In [13]:
# Materialize every (x, y) window up front, using 100 time steps per window.
batches = list(generate_batches(embeddings, batch_size, 100))

In [14]:
# Convert the list of (x, y) pairs into a single ndarray so one batch can be
# picked out with plain indexing.
batches = np.array(batches)

In [15]:
# Layout: (number of windows, 2 for x/y, batch_size, num_steps).
print(batches.shape)

(138, 2, 32, 100)


In [None]:
#------------------------------------------------------------
#MODEL TRAINING
#------------------------------------------------------------


# for epoch in range(100):
#     print('-'*50)
#     print('NEW EPOCH:',epoch+1)
#     print('-'*50)
    
#     count = 0
    
#     for i in range(len(batches)):
        
#         batch = batches[i]
#         batch_x = batch[0]
#         batch_y = batch[1]
#         batch_x = torch.tensor(batch_x, dtype = torch.long)
#         batch_y = torch.tensor(batch_y, dtype = torch.long)
        
#         softmax = model(batch_x)
#         loss = loss_function(softmax.view(-1,num_emb),batch_y.view(-1))
        
#         optimizer.zero_grad()  # reset gradients; otherwise they accumulate across batches
#         loss.backward()
#         torch.nn.utils.clip_grad_norm_(model.parameters(), .25)
#         optimizer.step()
        
#         count += batch_size
        
#         if(count%print_every == 0):
#             print(loss,count,sep = ':')

In [None]:
#------------------------------------------------------------
#MODEL SAVING
#------------------------------------------------------------

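# NOTE: the model-loading cell reads './final_model'; save to that path (or
# rename the saved file) so that saving and loading refer to the same file.
# torch.save(model.state_dict(), './modelParams')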

In [16]:
#------------------------------------------------------------
#MODEL LOADING
#------------------------------------------------------------

# Rebuild the architecture, then restore the trained weights into it.
model = charLSTM(num_emb, emb_dim, hidden_dim, target_size)
# map_location='cpu' lets the checkpoint load on a machine without a GPU even
# if it was saved from CUDA tensors; the model itself is never moved off the CPU.
# NOTE(security): torch.load unpickles the file -- only load checkpoints you trust.
model.load_state_dict(torch.load('./final_model', map_location='cpu'))

IncompatibleKeys(missing_keys=[], unexpected_keys=[])

In [17]:
def _greedy_step(net, token_ids, hidden=None):
    """Feed ``token_ids`` through ``net`` and return ``(next_id, hidden)``.

    ``next_id`` is the argmax class at the last time step; ``hidden`` is the
    LSTM state after consuming ``token_ids``, ready to be passed back in.
    """
    batch = torch.tensor(token_ids, dtype = torch.long).view(1, -1)
    lstm_out, hidden = net.lstm(net.word_embeddings(batch), hidden)
    # log-softmax is monotonic, so the argmax of the raw scores is the same
    # prediction without the extra normalization.
    scores = net.hidden2out(lstm_out)[0, -1]
    return int(scores.argmax()), hidden


def generateSentence(starting_text, length, net=None, char_to_int=None, int_to_char=None):
    """Print ``starting_text`` followed by ``length`` greedily generated characters.

    The prompt is run through the network once to build up the LSTM state;
    after that each predicted character is fed back in, one step at a time,
    together with the carried state. The most likely character is always
    chosen, so the output is deterministic for a given model and prompt.

    Args:
        starting_text: non-empty prompt; every character must be in the vocabulary.
        length: number of characters to generate after the prompt. Values
            <= 0 print only the prompt.
        net: model exposing ``word_embeddings``, ``lstm`` and ``hidden2out``.
            Defaults to the notebook-level ``model``.
        char_to_int: character -> index mapping. Defaults to ``vocab_to_int``.
        int_to_char: index -> character mapping. Defaults to ``int_to_vocab``.

    Returns:
        None. The text is written to standard output without a trailing newline.

    Raises:
        ValueError: if ``starting_text`` is empty.
        KeyError: if ``starting_text`` contains characters missing from the vocabulary.
    """
    # Fall back to the notebook globals only when nothing was injected.
    net = model if net is None else net
    char_to_int = vocab_to_int if char_to_int is None else char_to_int
    int_to_char = int_to_vocab if int_to_char is None else int_to_char

    # Validate before printing anything so a bad prompt produces no partial output.
    if not starting_text:
        raise ValueError("starting_text must contain at least one character")
    unknown = sorted(set(starting_text) - set(char_to_int))
    if unknown:
        raise KeyError("characters not in vocabulary: {!r}".format(unknown))

    print(starting_text, end = '')

    # First step consumes the whole prompt; later steps consume one character.
    token_ids = [char_to_int[ch] for ch in starting_text]
    hidden = None

    # Inference only: no autograd graph is needed for any of the forward passes.
    with torch.no_grad():
        for _ in range(length):
            next_id, hidden = _greedy_step(net, token_ids, hidden)
            print(int_to_char[next_id], end = '')
            token_ids = [next_id]



In [18]:
# Continue the prompt with the loaded model and print the generated text.
generateSentence('Ronald Weasley', 2000)

Ronald Weasley, who has always been overshadonsil eyes watching down the walls. 

"All right, but I warn yeh, he's a coward," said Hermione grimly. "Flitwi have no fining somethin', has is but the night my parents died." 

Hagrid leaned across the table, his eyes from the ground and the knight turteing him. 

"Shut up, both of you!" said Harry sharplykon in his glass, as fast-scase-Fin a cat wand. He caught Harry's bathrobe for a while, but Snape didn't look at him again, and one of hiuse to ask you. I think Hermione does, though, which was measuring bewarely through the whirl ofing and noise. 

Percy was allowed to fly it, snaking all over the floor. He didn't say anything. 

"Where is this school, anyway?" 

"I don't know, sit," said Hagrid, giving Harry another of his sideways looked just a chance or witches. 

It was a nice laugh at the spill show he had no fore about Snape.... 

Our he didn't want to talk to him anywhere fell from the corner of his eye he saw the fluttering banner