In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

In [2]:
text = ["Get busy living or get busy dying",
        'You only live once, but if you do it right, once is enough',
        'If you want to live a happy life, tie it to a goal, not to people or things',
       'Never let the fear of striking out keep you from playing the game',
       "Money and success don’t change people; they merely amplify what is already there"]

In [3]:
len(text)

5

In [4]:
letters = set("".join(text)) #unique words

In [5]:
len(letters)

31

In [6]:
ws = WordNetLemmatizer()

In [7]:
def split_word_review(data):
    lemma_word = []
    for i in range(len(data)):
        data[i] = data[i].lower()
        word_token = word_tokenize(data[i])
        clean_data = [i for i in word_token if not i in stopwords.words() and i.isalnum()]
        
        a = []
        for i in clean_data:
            a.append(ws.lemmatize(i))
        lemma_word.append(a)
    return lemma_word     

In [36]:
split_word_review(text)

[['get', 'busy', 'living', 'get', 'busy', 'dying'],
 ['live', 'right', 'enough'],
 ['live', 'happy', 'life', 'tie', 'goal', 'people', 'thing'],
 ['never', 'let', 'fear', 'striking', 'keep', 'playing', 'game'],
 ['money', 'success', 'change', 'people', 'merely', 'amplify', 'already']]

In [8]:
int_to_word = dict(enumerate(letters))
word_to_int = {w:i for i,w in int_to_word.items()}

In [9]:
int_to_word[10]

'Y'

In [10]:
word_to_int['I']

23

In [11]:
len_text = len(max(text, key=len))

In [12]:
len_text

80

In [13]:
#Padding
for i in range(len(text)):
    while len(text[i]) < len_text:
        text[i] += " "

In [14]:
text

['Get busy living or get busy dying                                               ',
 'You only live once, but if you do it right, once is enough                      ',
 'If you want to live a happy life, tie it to a goal, not to people or things     ',
 'Never let the fear of striking out keep you from playing the game               ',
 'Money and success don’t change people; they merely amplify what is already there']

In [15]:
#Creating input sequence and target sequence
input_seq = []
target_seq = []

for i in range(len(text)):
    input_seq.append(text[i][:-1])
    target_seq.append(text[i][1:])
    print("Input Sequence: {}, \nTarget Sequence: {}".format(input_seq[i], target_seq[i]))

Input Sequence: Get busy living or get busy dying                                              , 
Target Sequence: et busy living or get busy dying                                               
Input Sequence: You only live once, but if you do it right, once is enough                     , 
Target Sequence: ou only live once, but if you do it right, once is enough                      
Input Sequence: If you want to live a happy life, tie it to a goal, not to people or things    , 
Target Sequence: f you want to live a happy life, tie it to a goal, not to people or things     
Input Sequence: Never let the fear of striking out keep you from playing the game              , 
Target Sequence: ever let the fear of striking out keep you from playing the game               
Input Sequence: Money and success don’t change people; they merely amplify what is already ther, 
Target Sequence: oney and success don’t change people; they merely amplify what is already there


In [16]:
for i in range(len(text)):
    input_seq[i] = [word_to_int[char] for char in input_seq[i]]
    target_seq[i] = [word_to_int[char] for char in target_seq[i]]

In [17]:
dict_size = len(word_to_int)
seq_len = len_text-1
batch_size = len(text)

def one_hot_encoding(sequence, dict_size, seq_len, batch_size):
    features = np.zeros((batch_size, seq_len, dict_size), dtype = np.float32)
    
    for i in range(batch_size):
        for j in range(seq_len):
            features[i,j, sequence[i][j]] = 1
            
    return features

In [18]:
input_seq = one_hot_encoding(input_seq, dict_size, seq_len, batch_size)

In [19]:
input_seq

array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 1., 0., 0.],
        [0., 0., 0., ..., 1., 0., 0.],
        [0., 0., 0., ..., 1., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 1., 0., 0.],
        [0., 0., 0., ..., 1., 0., 0.],
        [0., 0., 0., ..., 1., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 1., 0.],
        [0., 0., 0., ..., 1., 0., 0.],
        ...,
        [0., 0., 0., ..., 1., 0., 0.],
        [0., 0., 0., ..., 1., 0., 0.],
        [0., 0., 0., ..., 1., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 1., 0., 0.],
        [0., 0., 0., ..., 1., 0., 0.],
        [0., 0., 0., ..., 1., 0., 0.]],

    

In [20]:
input_seq = torch.from_numpy(input_seq)
target_seq = torch.Tensor(target_seq)

In [21]:
class Model(nn.Module):
    def __init__(self, input_size, output_size, hidden_dim, n_layers):
        super(Model, self).__init__()
        
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        
        self.rnn = nn.RNN(input_size, hidden_dim, n_layers, batch_first = True)
        self.fc = nn.Linear(hidden_dim, output_size)
    
    def forward(self, x):
        batch_size = x.size(0)
        
        hidden = self.init_hidden(batch_size)
        out, hidden = self.rnn(x, hidden)
        out = out.contiguous().view(-1, self.hidden_dim)
        out = self.fc(out)
        return out, hidden
    
    def init_hidden(self, batch_size):
        hidden = torch.zeros(self.n_layers, batch_size, self.hidden_dim)
        return hidden

In [22]:
model = Model(input_size=dict_size, output_size=dict_size, hidden_dim=12, n_layers=1)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [23]:
epochs = 1000

for epoch in range(1, epochs+1):
    optimizer.zero_grad()
    output, hidden = model(input_seq)
    loss = loss_fn(output, target_seq.view(-1).long())
    loss.backward()
    optimizer.step()
    
    if epoch%100 == 0:
        print("Epochs:{}/{}, Loss:{:.2f}".format(epoch, epochs, loss.item()))

Epochs:100/1000, Loss:2.60
Epochs:200/1000, Loss:2.48
Epochs:300/1000, Loss:2.37
Epochs:400/1000, Loss:2.18
Epochs:500/1000, Loss:1.94
Epochs:600/1000, Loss:1.73
Epochs:700/1000, Loss:1.53
Epochs:800/1000, Loss:1.34
Epochs:900/1000, Loss:1.16
Epochs:1000/1000, Loss:1.04


In [24]:
def predict(model, char):
    char = np.array([[word_to_int[i] for i in char]])
    char = one_hot_encoding(char, dict_size, char.shape[1],1)
    char = torch.from_numpy(char)
    
    out, hidden = model(char)
    prob = nn.functional.softmax(out[-1], dim=0).data
    result = torch.max(prob, dim=0)[1].item()
    
    return int_to_word[result], hidden

In [30]:
def sample(model, out_len, start = 'You'):
    model.eval()
    start = start.lower()
    chars = [i for i in start]
    size = out_len-len(chars)
    
    for i in range(size):
        char, h = predict(model, chars)
        chars.append(char)
        
    return ''.join(chars)

In [31]:
sample(model, out_len=10, start="Get")

'get the g '

In [34]:
sample(model, out_len=20, start="You")

'youge  tot the g the'

In [35]:
sample(model, out_len=20, start="Money")

'money               '