In [None]:
import helper
import os
import pickle
import glob
import torch
import numpy as np
data_dir = '../input/friends-tv-series-screenplay-script/'
# Read every .txt script in data_dir and concatenate them into one corpus string.
# sorted(): glob yields files in arbitrary filesystem order; a fixed order keeps the
#   corpus (and the fixed-offset header trim applied to it later) reproducible.
# with: closes each handle even if read() raises.
script_parts = []
for file in sorted(glob.glob(data_dir + "*.txt")):
    with open(file, 'r') as f:
        script_parts.append(f.read())
text = "".join(script_parts)

In [None]:
# Preview the first 500 characters of the loaded corpus.
text[:500]

In [None]:
# (Re)load the corpus: concatenate every .txt script in folder_name into `text`.
folder_name = "../input/friends-tv-series-screenplay-script/"
# os.path.join avoids the doubled slash that folder_name + '/*.txt' produced.
# sorted(): glob order is arbitrary; a fixed order keeps the corpus reproducible.
# with: closes each handle even if read() raises.
script_parts = []
for f in sorted(glob.glob(os.path.join(folder_name, '*.txt'))):
    with open(f, 'r') as temp:
        script_parts.append(temp.read())
text = "".join(script_parts)

In [None]:
# Print a window of corpus lines, selected by (start, stop) line indices.
view_line_range = (0, 25)
first_line, last_line = view_line_range
print('The lines {} to {}:'.format(first_line, last_line))
corpus_lines = text.split('\n')
print('\n'.join(corpus_lines[first_line:last_line]))

In [None]:
# Quick corpus statistics: distinct whitespace-separated words, line count,
# and the mean number of words per line.
print('Dataset Stats')
unique_words = len(set(text.split()))
print("Total # of Unique Words: ", unique_words)
lines = text.split('\n')
print("Total # of Lines: ", len(lines))
# Despite its name, this list holds the word count of each individual line.
avg_words_perline = list(map(lambda line: len(line.split()), lines))
print("Average number of words in each line: ", np.mean(avg_words_perline))

# Pre-Processing

In [None]:
from collections import Counter
# Word-frequency vocabulary over the raw corpus.
# `text` is a single string at this point, so Counter(text) would count characters;
# split it into words first. If the cell is re-run after tokenisation (when `text`
# is already a list of words) the list is counted as-is.
words = text.split() if isinstance(text, str) else text
freq_word = Counter(words)
# Most frequent word gets id 0.
vocab_sorted = sorted(freq_word, key=freq_word.get, reverse=True)
int_to_vocab = dict(enumerate(vocab_sorted))
vocab_to_int = {word: i for i, word in int_to_vocab.items()}

In [None]:
# Punctuation symbol -> placeholder token, so that each symbol can be treated as its
# own "word" after whitespace splitting.
# NOTE(review): '||right_Parentheses||' is the only token with a capital letter.
pun_dic = dict([
    ('.', '||period||'),
    (',', '||comma||'),
    ('"', '||quotation_mark||'),
    (';', '||semicolon||'),
    ('!', '||exclamation_mark||'),
    ('?', '||question_mark||'),
    ('(', '||left_parentheses||'),
    (')', '||right_Parentheses||'),
    ('-', '||dash||'),
    ('\n', '||return||'),
])

In [None]:
# Drop the leading notice (per the original note: the first two lines, 57 characters).
# NOTE: not idempotent - re-running this cell trims another 57 characters.
notice_length = 57
text = text[notice_length:]

In [None]:
# Surround every punctuation symbol with spaces (as its placeholder token) so it
# splits into a separate word, then lowercase and split on whitespace.
# After this cell `text` is a list of tokens, no longer a string.
for symbol, placeholder in pun_dic.items():
    text = text.replace(symbol, ' ' + placeholder + ' ')
text = text.lower().split()

In [None]:
from collections import Counter
# Vocabulary over the tokenised corpus plus the special padding word.
SPECIAL_WORDS = {'PADDING': '<PAD>'}
L_text = text + list(SPECIAL_WORDS.values())

freq_word = Counter(L_text)
# most_common() orders by descending count (ties keep first-seen order),
# so the most frequent token gets id 0.
vocab_sorted = [word for word, _count in freq_word.most_common()]
int_to_vocab = dict(enumerate(vocab_sorted))
vocab_to_int = {word: index for index, word in enumerate(vocab_sorted)}

In [None]:
# Encode the token list as integer ids using the vocabulary lookup.
int_text = list(map(vocab_to_int.__getitem__, text))

In [None]:
# True when PyTorch reports an available CUDA device; read by the model/training code.
train_on_gpu = torch.cuda.is_available()

# Model

In [None]:
from torch.utils.data import TensorDataset, DataLoader
def get_dataloader(text, seq_length, batch_size):
    """
    Build a shuffled DataLoader of (context window, next word id) pairs.

    The input is first truncated to a whole number of ``batch_size`` chunks. Every
    run of ``seq_length`` consecutive ids then becomes one feature row, and the id
    that immediately follows it becomes that row's target.

    :param text: Sequence of integer word ids
    :param seq_length: Number of ids in each feature window (must be >= 1)
    :param batch_size: Number of samples per batch (must be >= 1)
    :return: DataLoader over TensorDataset(features, targets); features have shape
             (n_samples, seq_length), targets have shape (n_samples,), both int64
    :raises ValueError: if a size is not positive or the text is too short to
                        produce a single (window, target) pair
    """
    if seq_length < 1 or batch_size < 1:
        raise ValueError(
            'seq_length and batch_size must be >= 1, got {} and {}'.format(seq_length, batch_size))

    batch_num = len(text) // batch_size
    # Explicit int64: the platform default integer can be 32-bit, and class-index
    # targets for CrossEntropyLoss must be 64-bit.
    batch_words = np.asarray(text[:batch_num * batch_size], dtype=np.int64)

    n_samples = len(batch_words) - seq_length
    if n_samples < 1:
        # Fail here with a clear message instead of deep inside the DataLoader sampler.
        raise ValueError(
            'text too short: {} usable ids cannot form a window of {} plus a target'.format(
                len(batch_words), seq_length))

    feature = np.stack([batch_words[i:i + seq_length] for i in range(n_samples)])
    # The target of window i is the id at i + seq_length, i.e. the tail of the array.
    target = batch_words[seq_length:]

    feature_tensors = torch.from_numpy(feature)
    target_tensors = torch.from_numpy(target)

    data = TensorDataset(feature_tensors, target_tensors)
    data_loader = DataLoader(data, batch_size=batch_size, shuffle=True)

    return data_loader

In [None]:
import torch.nn as nn
class RNN(nn.Module):
    """Next-word predictor: embedding -> stacked LSTM -> dropout -> linear layer."""

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, dropout=0.3):
        """
        :param vocab_size: Number of rows in the embedding table (distinct word ids)
        :param output_size: Number of output scores produced per sample
        :param embedding_dim: Size of each word embedding vector
        :param hidden_dim: Size of the LSTM hidden state
        :param n_layers: Number of stacked LSTM layers
        :param dropout: Dropout probability, used both between LSTM layers and
                        before the final linear layer
        """
        super(RNN, self).__init__()
        self.n_layers = n_layers
        self.output_size = output_size
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.dropout = nn.Dropout(dropout)
        self.embedding_dim = embedding_dim

        # Model Layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=dropout, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_size)

    def forward(self, nn_input, hidden):
        """
        Score the word that follows each input sequence.

        :param nn_input: Tensor of word ids, shape (batch, seq_len)
        :param hidden: (h, c) LSTM state tuple, each (n_layers, batch, hidden_dim)
        :return: (scores for the last time step with shape (batch, output_size),
                  updated (h, c) state)
        """
        nn_input = nn_input.long()

        embed_out = self.embedding(nn_input)
        lstm_out, hidden = self.lstm(embed_out, hidden)

        # Only the last time step is returned, so apply dropout and the linear layer
        # to that step alone instead of projecting every step and discarding the rest.
        last_step = lstm_out[:, -1, :]
        lstm_output = self.fc(self.dropout(last_step))

        return lstm_output, hidden

    def init_hidden(self, batch_size):
        """
        Create a zeroed (h, c) LSTM state.

        The tensors take their dtype and device from the model's own parameters, so
        the state always lives where the model lives.

        :param batch_size: Number of sequences the state is created for
        :return: Tuple of two zero tensors, each (n_layers, batch_size, hidden_dim)
        """
        weight = next(self.parameters())
        state_shape = (self.n_layers, batch_size, self.hidden_dim)

        hidden = (weight.new_zeros(state_shape), weight.new_zeros(state_shape))

        return hidden


In [None]:
def forward_back_prop(rnn, optimizer, criterion, inp, target, hidden, clip=5):
    """
    Run one training step: forward pass, loss, backward pass, clipped update.

    :param rnn: The PyTorch Module being trained; called as rnn(inp, hidden)
    :param optimizer: Optimizer over rnn's parameters
    :param criterion: Loss function called as criterion(output, target)
    :param inp: Batch of input word-id sequences
    :param target: Batch of target word ids
    :param hidden: Tuple of hidden-state tensors carried over from the previous step
    :param clip: Maximum gradient norm passed to clip_grad_norm_
    :return: (loss as a Python float, hidden state returned by the model)
    """
    # Place the batch on the device that holds the model's weights, rather than
    # trusting a global GPU flag that may disagree with where the model actually is.
    device = next(rnn.parameters()).device
    inp, target = inp.to(device), target.to(device)

    # Detach the carried state so backprop stops at this batch instead of running
    # through the graph of every earlier batch.
    hidden = tuple(state.detach().to(device) for state in hidden)

    rnn.zero_grad()
    out, hidden = rnn(inp, hidden)

    loss = criterion(out, target)
    loss.backward()

    # Guard against exploding gradients before the parameter update.
    nn.utils.clip_grad_norm_(rnn.parameters(), clip)

    optimizer.step()

    return loss.item(), hidden

In [None]:
def train_rnn(rnn, batch_size, optimizer, criterion, n_epochs, show_every_n_batches=100,
              data_loader=None):
    """
    Train the network for a number of epochs, printing the running average loss.

    :param rnn: The PyTorch Module to train; must provide init_hidden(batch_size)
    :param batch_size: Batch size the hidden state is created for
    :param optimizer: Optimizer over rnn's parameters
    :param criterion: Loss function
    :param n_epochs: Number of passes over the data
    :param show_every_n_batches: Print the average loss every this many batches
    :param data_loader: DataLoader yielding (inputs, labels); when omitted, the
                        notebook-level ``train_loader`` is used
    :return: The trained module (the same object as ``rnn``)
    """
    # Prefer an explicitly passed loader; fall back to the notebook global so that
    # existing calls without the argument keep working.
    loader = train_loader if data_loader is None else data_loader

    # Number of full batches. The trailing partial batch is skipped because the
    # hidden state is created for exactly `batch_size` sequences.
    n_batches = len(loader.dataset) // batch_size

    batch_losses = []
    rnn.train()
    print("Training for %d epoch(s)..." % n_epochs)
    for epoch_i in range(1, n_epochs + 1):
        # Fresh zero state at the start of every epoch.
        hidden = rnn.init_hidden(batch_size)

        for batch_i, (inputs, labels) in enumerate(loader, 1):
            if batch_i > n_batches:
                break
            loss, hidden = forward_back_prop(rnn, optimizer, criterion, inputs, labels, hidden)

            batch_losses.append(loss)
            if batch_i % show_every_n_batches == 0:
                print('Epoch: {:>4}/{:<4}  Loss: {}\n'.format(
                    epoch_i, n_epochs, np.average(batch_losses)))
                # Restart the running average after each report.
                batch_losses = []

    return rnn

In [None]:
# Each sample is a window of `sequence_length` word ids; batches hold `batch_size` samples.
sequence_length, batch_size = 8, 256
train_loader = get_dataloader(int_text, seq_length=sequence_length, batch_size=batch_size)

In [None]:
# Training schedule
epoch = 25
lr = 3e-4
show_every_n_batches = 500

# Model dimensions: the network reads and predicts over the same vocabulary,
# so the input and output sizes are both the vocabulary size.
vocab_size = output_size = len(vocab_to_int)
embedding_dim = 256
hidden_dim = 512
n_layers = 3

In [None]:
# Build the network and move it to the GPU when one is available.
rnn = RNN(
    vocab_size,
    output_size,
    embedding_dim,
    hidden_dim,
    n_layers,
    dropout=0.5,
)
if train_on_gpu:
    rnn.cuda()

# Adam optimiser with cross-entropy loss over the vocabulary.
optimizer = torch.optim.Adam(rnn.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()

trained_rnn = train_rnn(rnn, batch_size, optimizer, criterion, epoch, show_every_n_batches)

# Save the whole module object (not just its state_dict) next to the notebook.
checkpoint_stem, _extension = os.path.splitext(os.path.basename('./rnn_trained'))
save_filename = checkpoint_stem + '.pt'
torch.save(trained_rnn, save_filename)
print('Model Trained and Saved')

In [None]:
# Print the module's layer summary.
print(rnn)

# RNN, LSTM The Directors

In [None]:
import torch.nn.functional as F

def generate(rnn, prime_id, int_to_vocab, token_dict, pad_value, predict_len=100,
             seq_length=None, top_k=5):
    """
    Generate text using the neural network
    :param rnn: The PyTorch Module that holds the trained neural network; must
                provide init_hidden(batch_size) and be callable as rnn(seq, hidden)
    :param prime_id: The word id to start the first prediction
    :param int_to_vocab: Dict of word id keys to word values
    :param token_dict: Dict of punctuation symbol keys to placeholder token values
    :param pad_value: The value used to pad a sequence
    :param predict_len: The number of words to generate after the prime word
    :param seq_length: Length of the input window fed to the network; defaults to
                       the notebook-level ``sequence_length``
    :param top_k: Sample the next word from this many most likely candidates
                  (capped at the number of scores the network outputs)
    :return: The generated text
    """
    if seq_length is None:
        # Fall back to the window size the notebook trained with.
        seq_length = sequence_length

    rnn.eval()
    # Feed inputs on whichever device holds the model's weights.
    device = next(rnn.parameters()).device

    # Start from an all-padding window whose last slot is the prime word.
    current_seq = np.full((1, seq_length), pad_value)
    current_seq[-1][-1] = prime_id
    predicted = [int_to_vocab[prime_id]]

    # Inference only: skip building autograd graphs.
    with torch.no_grad():
        for _ in range(predict_len):
            seq_tensor = torch.as_tensor(current_seq, dtype=torch.long, device=device)

            # The full window is re-fed every step, so the state starts from zero each time.
            hidden = rnn.init_hidden(seq_tensor.size(0))
            output, _hidden = rnn(seq_tensor, hidden)
            p = F.softmax(output, dim=1).cpu()

            # Cap k so a small vocabulary does not make topk raise.
            k = min(top_k, p.size(1))
            top_p, top_i = p.topk(k)
            # Index row 0 rather than squeeze(): squeeze would yield a 0-d array when k == 1.
            candidates = top_i[0].numpy()
            weights = top_p[0].numpy().astype(np.float64)

            # Sample among the top-k candidates, weighted by their renormalised probability.
            word_i = int(np.random.choice(candidates, p=weights / weights.sum()))
            predicted.append(int_to_vocab[word_i])

            # Slide the window left by one and append the new word.
            current_seq = np.roll(current_seq, -1, 1)
            current_seq[-1][-1] = word_i

    gen_sentences = ' '.join(predicted)
    # Turn placeholder tokens back into punctuation, dropping the space before them.
    # Tokens are matched lowercased.
    for key, token in token_dict.items():
        gen_sentences = gen_sentences.replace(' ' + token.lower(), key)
    gen_sentences = gen_sentences.replace('\n ', '\n')
    gen_sentences = gen_sentences.replace('( ', '(')
    return gen_sentences

In [None]:
gen_length = 400  # modify the length to your preference
prime_word = 'joey'  # name for starting the script
pad_word = SPECIAL_WORDS['PADDING']

# Look up the ids of the starting word and the padding word, then sample a script.
prime_id = vocab_to_int[prime_word]
pad_id = vocab_to_int[pad_word]
generated_script = generate(trained_rnn, prime_id, int_to_vocab, pun_dic, pad_id, gen_length)
print(generated_script)