In [1]:
import re
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import random
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/ralampay/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
filename = "story.txt"
f = open(filename, 'r')

raw_text = f.read()

In [3]:
raw_text[0:1000]

'Once upon a time, there was a boy named Mike\nHe was raised in the hood, life was never quite\nGrowing up, he faced many struggles and strife\nBut through it all, he had one love in his life\n\nRap music was his escape from reality\nThe beats and lyrics helped him to see\nA world beyond his troubled neighborhood\nHe knew that with hard work, he could change his mood\n\nHe started writing rhymes and practicing his flow\nIn the mirror, he would rap and watch himself grow\nHe knew that if he could just make it to the top\nHe could change his life, and make a better stop\n\nOne day, he had the chance to perform on stage\nIn front of a crowd, he killed it, he killed it with rage\nHe had finally made it, he had achieved his dream\nAnd now he is a successful rapper, or so it seems\n\nHe never forgot where he came from, and he never will\nHe always remember the struggles that he had to fulfill\nHe is now a role model to kids in the hood\nShowing them that with hard work, they too could\n\nRis

In [4]:
processed_text = raw_text.lower()
processed_text = re.sub(r'[^\x00-\x7f]', r'', processed_text)

processed_text

'once upon a time, there was a boy named mike\nhe was raised in the hood, life was never quite\ngrowing up, he faced many struggles and strife\nbut through it all, he had one love in his life\n\nrap music was his escape from reality\nthe beats and lyrics helped him to see\na world beyond his troubled neighborhood\nhe knew that with hard work, he could change his mood\n\nhe started writing rhymes and practicing his flow\nin the mirror, he would rap and watch himself grow\nhe knew that if he could just make it to the top\nhe could change his life, and make a better stop\n\none day, he had the chance to perform on stage\nin front of a crowd, he killed it, he killed it with rage\nhe had finally made it, he had achieved his dream\nand now he is a successful rapper, or so it seems\n\nhe never forgot where he came from, and he never will\nhe always remember the struggles that he had to fulfill\nhe is now a role model to kids in the hood\nshowing them that with hard work, they too could\n\nris

In [5]:
print("Corpus Length: {}".format(len(processed_text)))

characters = sorted(list(set(processed_text)))
print("Total Characters: {}".format(len(characters)))

character_indices = dict((c, i) for i,c in enumerate(characters))
indices_characters = dict((i, c) for i, c in enumerate(characters))

Corpus Length: 1073
Total Characters: 27


In [6]:
character_indices

{'\n': 0,
 ' ': 1,
 ',': 2,
 'a': 3,
 'b': 4,
 'c': 5,
 'd': 6,
 'e': 7,
 'f': 8,
 'g': 9,
 'h': 10,
 'i': 11,
 'j': 12,
 'k': 13,
 'l': 14,
 'm': 15,
 'n': 16,
 'o': 17,
 'p': 18,
 'q': 19,
 'r': 20,
 's': 21,
 't': 22,
 'u': 23,
 'v': 24,
 'w': 25,
 'y': 26}

In [7]:
indices_characters

{0: '\n',
 1: ' ',
 2: ',',
 3: 'a',
 4: 'b',
 5: 'c',
 6: 'd',
 7: 'e',
 8: 'f',
 9: 'g',
 10: 'h',
 11: 'i',
 12: 'j',
 13: 'k',
 14: 'l',
 15: 'm',
 16: 'n',
 17: 'o',
 18: 'p',
 19: 'q',
 20: 'r',
 21: 's',
 22: 't',
 23: 'u',
 24: 'v',
 25: 'w',
 26: 'y'}

In [8]:
maxlen = 40
step = 5

sentences = []
next_characters = []

for i in range(0, len(processed_text) - maxlen, step):
    sentences.append(processed_text[i: i+maxlen])
    next_characters.append(processed_text[i + maxlen])

In [9]:
sentences

['once upon a time, there was a boy named ',
 'upon a time, there was a boy named mike\n',
 'a time, there was a boy named mike\nhe wa',
 'e, there was a boy named mike\nhe was rai',
 'ere was a boy named mike\nhe was raised i',
 'as a boy named mike\nhe was raised in the',
 'boy named mike\nhe was raised in the hood',
 'amed mike\nhe was raised in the hood, lif',
 'mike\nhe was raised in the hood, life was',
 'he was raised in the hood, life was neve',
 's raised in the hood, life was never qui',
 'sed in the hood, life was never quite\ngr',
 'n the hood, life was never quite\ngrowing',
 ' hood, life was never quite\ngrowing up, ',
 ', life was never quite\ngrowing up, he fa',
 'e was never quite\ngrowing up, he faced m',
 ' never quite\ngrowing up, he faced many s',
 'r quite\ngrowing up, he faced many strugg',
 'te\ngrowing up, he faced many struggles a',
 'owing up, he faced many struggles and st',
 ' up, he faced many struggles and strife\n',
 'he faced many struggles and strife\n

In [10]:
next_characters

['m',
 'h',
 's',
 's',
 'n',
 ' ',
 ',',
 'e',
 ' ',
 'r',
 't',
 'o',
 ' ',
 'h',
 'c',
 'a',
 't',
 'l',
 'n',
 'r',
 'b',
 'h',
 'h',
 'a',
 'h',
 'd',
 ' ',
 ' ',
 'i',
 'f',
 'a',
 's',
 'a',
 's',
 'a',
 'r',
 'e',
 'y',
 ' ',
 's',
 ' ',
 'c',
 'l',
 'h',
 'o',
 '\n',
 'r',
 'e',
 ' ',
 't',
 'l',
 'e',
 'o',
 'd',
 'k',
 't',
 'w',
 'h',
 'w',
 ' ',
 'o',
 'c',
 'e',
 ' ',
 '\n',
 's',
 'e',
 'i',
 ' ',
 'e',
 'd',
 'c',
 'n',
 's',
 'w',
 't',
 'i',
 ',',
 'w',
 ' ',
 'a',
 'a',
 'h',
 'l',
 'o',
 ' ',
 ' ',
 ' ',
 'e',
 'l',
 's',
 'k',
 ' ',
 'h',
 'p',
 'c',
 ' ',
 'g',
 's',
 'e',
 'd',
 'e',
 'e',
 ' ',
 '\n',
 ' ',
 ' ',
 'a',
 'e',
 'n',
 'o',
 'f',
 'o',
 'a',
 'n',
 'n',
 ' ',
 'o',
 'h',
 'l',
 'i',
 'e',
 'l',
 't',
 'h',
 'e',
 'h',
 'i',
 'y',
 'e',
 ' ',
 'a',
 'h',
 'd',
 ' ',
 'm',
 ' ',
 'h',
 ' ',
 'c',
 'f',
 'a',
 ',',
 's',
 ' ',
 's',
 ' ',
 'r',
 'g',
 'h',
 'h',
 'm',
 'o',
 'n',
 ' ',
 'r',
 'l',
 'a',
 's',
 'e',
 ' ',
 's',
 'g',
 't',
 'h',
 'd',
 

In [11]:
print("Vectorization")
device = 'cuda'
x = np.zeros((len(sentences), maxlen, len(characters)), dtype=np.float64)
y = np.zeros((len(sentences), len(characters)), dtype=np.float64)

for i, sentence in enumerate(sentences):
    for t, character, in enumerate(sentence):
        x[i, t, character_indices[character]] = 1
    y[i, character_indices[next_characters[i]]] = 1
    
x = torch.tensor(x).float().to(device)
y = torch.tensor(y).float().to(device)

Vectorization


RuntimeError: No CUDA GPUs are available

In [None]:
x.shape

In [None]:
x = torch.flatten(x, start_dim=1)

x.shape

In [None]:
y.shape

In [None]:
def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [None]:
def callback(model):
    start = 0
    stop = len(processed_text) - maxlen - 1

    #print("Start: {}".format(start))
    #print("Stop: {}".format(stop))

    start_index = random.randint(start, stop)

    #print("Start Index: {}".format(start_index))

    sentence = processed_text[start_index: start_index + maxlen]

    #print("Sentence: {}".format(sentence))
    #print("Sentence Length: {}".format(len(sentence)))

    generated = ''

    for i in range(400):
        x_predictions = np.zeros((1, maxlen, len(characters)))

        for t, char in enumerate(sentence):
            x_predictions[0, t, character_indices[char]] = 1

            # print(x_predictions)
        x_predictions = torch.tensor(x_predictions).float().to(device)
        x = torch.flatten(x_predictions, start_dim=1)

        preds = model.forward(x)[0].detach().cpu().numpy()

        next_index = sample(preds)
        #print("next_index: {}".format(next_index))
        next_char = indices_characters[next_index]
        #print("next_char: {}".format(next_char))

        generated += next_char
        sentence = sentence[1:] + next_char

    return sentence

In [None]:
class MultiLayerPerceptron(nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()

        self.hidden = nn.Linear(input_dim, 500)
        self.output = nn.Linear(500, output_dim)
        
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        # f(x) = a(f(x))
        x = self.relu(self.hidden(x))
        y = self.sigmoid(self.output(x))

        return y

In [None]:
model = MultiLayerPerceptron(x.shape[1], y.shape[1]).to(device)

model

In [None]:
optimizer = optim.Adam(model.parameters(), lr=0.00001)
criterion = nn.CrossEntropyLoss()

def train_fn(model, optimizer, loss_fn, device):
    ave_loss = 0
    count = 0
    
    for i, data in enumerate(x):
        data = x[i]
        targets = y[i]
        
        # Forward
        predictions = model.forward(data)
        
        predictions = F.softmax(predictions, dim=-1)
        
        loss = loss_fn(predictions, targets)
        
        # Backward
        optimizer.zero_grad()
        
        loss.backward()
        
        optimizer.step()

        count += 1
        ave_loss += loss.item()
    
    ave_loss = ave_loss / count

    return ave_loss

epochs = 1000

average_losses = []

for epoch in range(epochs):
    print("Epoch: {}".format(epoch))
    ave_loss = train_fn(model, optimizer, criterion, device)
    
    average_losses.append(ave_loss)
        
    print("Ave Loss: {}".format(ave_loss))
    
    generated_sentence = callback(model)
    
    print("Generated sentence:")
    print(generated_sentence)
    print("Length: {}".format(len(generated_sentence)))