In [96]:
import numpy as np 
import pandas as pd 
import os
import matplotlib.pyplot as plt
import seaborn as sns
plt.switch_backend('agg')
import matplotlib.ticker as ticker
from nltk.corpus import stopwords
import string
import re

import nltk
nltk.download('stopwords')
stop_words = stopwords.words('english')

from io import open
import unicodedata
import random
import pickle
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from rouge import Rouge

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/patel.ayushj/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [97]:
train_data = pd.read_csv('Training_data.csv')
validate_data = pd.read_csv('Validation_data.csv')
test_data = pd.read_csv('Testing_data.csv')

train_data = train_data[['summary_id','chapter','chapter_length','summary_name','summary_text','summary_analysis','summary_length','analysis_length']]
validate_data = validate_data[['summary_id','chapter','chapter_length','summary_name','summary_text','summary_analysis','summary_length','analysis_length']]
test_data = test_data[['summary_id','chapter','chapter_length','summary_name','summary_text','summary_analysis','summary_length','analysis_length']]

In [98]:
def pre_processing(sentence):
    #lower text
    sentence = sentence.lower()

    pattern = r"\s*\([a-zA-Z]\s_\)"
    sentence = re.sub(pattern, "", sentence)

    sentence = sentence.replace("\n"," ")

    # replacing everything with space
    sentence = re.sub(r"[=.!,¿?.!+,;¿/:|%()<>।॰{}#_'\"@$^&*']", " ", sentence)
    sentence = re.sub(r"…", " ", sentence)

    #remove double quotes
    sentence = re.sub(r'"', " ", sentence)

    #remove numbers
    sentence = re.sub(r'[0-9]', "", sentence)
    #sentence = re.sub(r'#([^s]+)', r'1', sentence)

    #remove website links
    sentence = re.sub('((www.[^s]+)|(https?://[^s]+))','',sentence)

    #remove @anythin here
    #sentence = re.sub('@[^s]+','',sentence)

    #remove multiple spaces
    sentence = re.sub(r'[" "]+', " ", sentence)

    # remove extra space
    sentence = sentence.strip()

    return sentence

In [99]:
train_data['summary_text'] = train_data['summary_text'].apply(lambda x: pre_processing(x))
train_data['chapter'] = train_data['chapter'].apply(lambda x: pre_processing(x))

validate_data['summary_text'] = validate_data['summary_text'].apply(lambda x: pre_processing(x))
validate_data['chapter'] = validate_data['chapter'].apply(lambda x: pre_processing(x))

test_data['summary_text'] = test_data['summary_text'].apply(lambda x: pre_processing(x))
test_data['chapter'] = test_data['chapter'].apply(lambda x: pre_processing(x))

In [100]:
train_data.describe()

Unnamed: 0,chapter_length,summary_length,analysis_length
count,9600.0,9600.0,9600.0
mean,3897.230625,376.896354,274.324063
std,4203.548176,331.915025,385.446081
min,42.0,2.0,1.0
25%,1674.0,171.0,1.0
50%,2779.0,283.0,133.0
75%,4571.0,467.0,466.0
max,114226.0,4852.0,5761.0


In [101]:
# Function to extract first 300 words
def extract_first_60_words(text):
    words = text.split()[:100]
    return ' '.join(words)

# Apply the function to each chapter
train_data['first_60_words'] = train_data['chapter'].apply(extract_first_60_words)
validate_data['first_60_words'] = validate_data['chapter'].apply(extract_first_60_words)
test_data['first_60_words'] = test_data['chapter'].apply(extract_first_60_words)

# Display the result
print(train_data['first_60_words'])

0       mine ear is open and my heart prepared the wor...
1       before these fields were shorn and tilled full...
2       well go thy way thou shalt not from this grove...
3       in such a night did thisbe fearfully o ertrip ...
4       those strains that once did sweet in zion glid...
                              ...                        
9595    there was a train for turin and paris that eve...
9596    it was not with surprise it was with a feeling...
9597    isabel s arrival at gardencourt on this second...
9598    he had told her the first evening she ever spe...
9599    the life and death of scyld the famous race of...
Name: first_60_words, Length: 9600, dtype: object


In [102]:
# Function to extract first 300 words
def extract_first_60_words(text):
    words = text.split()[:100]
    return ' '.join(words)

# Apply the function to each chapter
train_data['first_60_words_summary'] = train_data['summary_text'].apply(extract_first_60_words)
validate_data['first_60_words_summary'] = validate_data['summary_text'].apply(extract_first_60_words)
test_data['first_60_words_summary'] = test_data['summary_text'].apply(extract_first_60_words)

# Display the result
print(train_data['first_60_words_summary'])


0       before any characters appear the time and geog...
1       in another part of the forest by the river a f...
2       when the mounted party from fort howard approa...
3       the pursuit of magua is unsuccessful but hawke...
4       heyward and the girls are uneasy and gamut is ...
                              ...                        
9595    before isabel leaves rome she goes to see pans...
9596    isabel is greeted by henrietta stackpole at ch...
9597    isabel arrives at gardencourt the house is ver...
9598    isabel remembers that when she first came to g...
9599    beowulf begins with the legends of the warrior...
Name: first_60_words_summary, Length: 9600, dtype: object


In [103]:
train_data['first_60_words'][0]

'mine ear is open and my heart prepared the worst is worldly loss thou canst unfold say is my kingdom lost shakespeare it was a feature peculiar to the colonial wars of north america that the toils and dangers of the wilderness were to be encountered before the adverse hosts could meet a wide and apparently an impervious boundary of forests severed the possessions of the hostile provinces of france and england the hardy colonist and the trained european who fought at his side frequently expended months in struggling against the rapids of the streams or in effecting the rugged'

In [104]:
train_data = train_data[0:10]
validate_data = validate_data[0:10]
test_data = test_data[0:10]

In [105]:
train_data.describe()

Unnamed: 0,chapter_length,summary_length,analysis_length
count,10.0,10.0,10.0
mean,5305.4,356.9,257.3
std,2350.395626,134.096524,136.337856
min,3075.0,198.0,75.0
25%,3166.0,270.25,150.75
50%,4527.5,325.0,252.0
75%,7428.75,386.0,331.25
max,8710.0,612.0,473.0


In [106]:
SOS_token = 0
EOS_token = 1


class Vocab:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # Count SOS and EOS

    def addSentence(self, sentence):
        ''' Add every word in a sentence to the vocabulary '''
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        ''' Add a word to the vocabulary'''
        if word not in self.word2index:
            #Include the word in the mapping from word to index
            self.word2index[word] = self.n_words
            #Set the count of ocurrencies of the word to 1
            self.word2count[word] = 1
            # Include the word in the indexes
            self.index2word[self.n_words] = word
            # Increment by 1 the number of words
            self.n_words += 1
        else:
            self.word2count[word] += 1
            
    def save_to_file(self, filename):
        ''' Save the Vocab object to a file'''
        with open(filename,'wb') as f:
            pickle.dump(self,f) 

def load_vocab(filename):
    ''' Load a Vocab instance from a file'''
    with open(filename,'rb') as f:
        v = pickle.load(f)
    return v

def read_vocabs(text, summary, reverse=False):
    print("Reading lines...")
    
    # Split every line into pairs and normalize
    pairs = [[text[i],summary[i]] for i in range(len(text))]

    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Vocab(summary)
        output_lang = Vocab(text)
    else:
        input_lang = Vocab(text)
        output_lang = Vocab(summary)

    return input_lang, output_lang, pairs

def prepare_data(lang1, lang2, reverse=False):
    input_lang, output_lang, pairs = read_vocabs(lang1, lang2, reverse)
    print("Read %s sentence pairs" % len(pairs))
    print("Counting words...")
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs

In [107]:
x_train = train_data['first_60_words']
y_train = train_data['first_60_words_summary']

# x_validate = validate_data['chapter']
# y_validate = validate_data['summary_text']

# x_test = test_data['chapter']
# y_test = test_data['summary_text']

In [108]:
# Create the vocabularies of the inout and output data and return the data in pairs of (source text, summary)
input_lang, output_lang, pairs = prepare_data( x_train, y_train , False)
print(random.choice(pairs))

Reading lines...
Read 10 sentence pairs
Counting words...
Counted words:
0    mine ear is open and my heart prepared the wor...
1    before these fields were shorn and tilled full...
2    well go thy way thou shalt not from this grove...
3    in such a night did thisbe fearfully o ertrip ...
4    those strains that once did sweet in zion glid...
5    they do not sleep on yonder cliffs a grisly ba...
6    be gay securely dispel my fair with smiles the...
7    i fear we shall outsleep the coming morn as mu...
8    clo --i am gone sir and anon sir i ll be with ...
9    i ll seek a readier path parnell the route tak...
Name: first_60_words, dtype: object 545
0    before any characters appear the time and geog...
1    in another part of the forest by the river a f...
2    when the mounted party from fort howard approa...
3    the pursuit of magua is unsuccessful but hawke...
4    heyward and the girls are uneasy and gamut is ...
6    in the stillness that follows heyward finds it...
7    th

In [109]:
# For training data
x_train = train_data['first_60_words']
y_train = train_data['first_60_words_summary']
max_chapter_length_train = max(len(chapter.split(' ')) for chapter in x_train)
max_summary_length_train = max(len(summary.split(' ')) for summary in y_train)

# For validation data
x_validate = validate_data['first_60_words']
y_validate = validate_data['first_60_words_summary']
max_chapter_length_validate = max(len(chapter.split(' ')) for chapter in x_validate)
max_summary_length_validate = max(len(summary.split(' ')) for summary in y_validate)

# For test data
x_test = test_data['first_60_words']
y_test = test_data['first_60_words_summary']
max_chapter_length_test = max(len(chapter.split(' ')) for chapter in x_test)
max_summary_length_test = max(len(summary.split(' ')) for summary in y_test)

# Determine the overall maximum length for each type of data
max_length_train = max(max_chapter_length_train, max_summary_length_train) + 1
max_length_validate = max(max_chapter_length_validate, max_summary_length_validate) + 1
max_length_test = max(max_chapter_length_test, max_summary_length_test) + 1

print("Training data: Maximum chapter length =", max_chapter_length_train, ", Maximum summary length =", max_summary_length_train)
print("Validation data: Maximum chapter length =", max_chapter_length_validate, ", Maximum summary length =", max_summary_length_validate)
print("Test data: Maximum chapter length =", max_chapter_length_test, ", Maximum summary length =", max_summary_length_test)

print("Overall maximum length for training data:", max_length_train)
print("Overall maximum length for validation data:", max_length_validate)
print("Overall maximum length for test data:", max_length_test)

Training data: Maximum chapter length = 100 , Maximum summary length = 100
Validation data: Maximum chapter length = 100 , Maximum summary length = 100
Test data: Maximum chapter length = 100 , Maximum summary length = 100
Overall maximum length for training data: 101
Overall maximum length for validation data: 101
Overall maximum length for test data: 101


In [121]:
MAX_LENGTH = 101
print(MAX_LENGTH)

101


In [122]:
class EncoderRNN(nn.Module):
    ''' Define an encoder in a seq2seq architecture'''
    def __init__(self, input_size, hidden_size):
        ''' Initialize tyhe encoder instance defining its parameters:
            Input:
                - input_size: the size of the vocabulary
                - hidden:size: size of the hidden layer
        '''
        super(EncoderRNN, self).__init__()
        # Set the hidden size
        self.hidden_size = hidden_size
        # Create the embedding layer of size (vocabulary length, hidden_size) 
        self.embedding = nn.Embedding(input_size, hidden_size)
        # Create a GRU layer
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        ''' Run a Forward pass of the encoder to return outputs
            Input:
                Input: a tensor element (integer) representing the next word in the sentence
                hidden: a tensor, the previous hidden state of the encoder
        '''
        # Get the embedding of the input
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        
        # Apply a forward step of the GRU returning the output features and
        # the hidden state of the actual time step
        output, hidden = self.gru(output, hidden)
        
        return output, hidden

    def initHidden(self):
        ''' Initialize the hidden state of the encoder, tensor of zeros'''
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [123]:
class AttnDecoderRNN(nn.Module):
    ''' Define a decoder with atention in a seq2seq architecture'''
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        ''' Initialize the decoder instance defining its parameters:
            Input:
                - hidden_size:size: size of the hidden layer (Hyperparameter)
                - output_size: the size of the vocabulary of the output summary
                - dropout_p: dropout probability to apply
                - max_length: max length (number of words) of an output or summary
        '''

        super(AttnDecoderRNN, self).__init__()
        # Set parameters of the decoder
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        #Create an embedding layer for the input (output vocabulary, hidden size)
        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        # Create some linear layers to build the attention mechanism
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        # A dropout layer
        self.dropout = nn.Dropout(self.dropout_p)
        # A GRU layer
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        # A Fully-connected layer
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        ''' Run a Forward pass of the decoder to return outputs
            Input:
                Input: a tensor element (integer) representing the previous output of the decoder
                hidden: a tensor, the previous hidden state of the decoder
                Encoder outputs: a tensor, outputs of the encoder
        '''
        
        #Get the embedding representation of the input
        embedded = self.embedding(input).view(1, 1, -1)
        # Apply dropout 
        embedded = self.dropout(embedded)
        #Calculate the attention weights of the attention mechanism using the encoder states
        #in previous time steps
        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        
        #Calculate the context vectors fo the attention mechanism using the attention weights
        # and the encoder outputs
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))
        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)
        output = F.relu(output)
        
        # Apply a forward pass to the GRU layer of the decider using the output from the attention
        # as input and the hidden state
        output, hidden = self.gru(output, hidden)
        output = F.log_softmax(self.out(output[0]), dim=1)
        # return the output features, the hidden state and the attention weights
        return output, hidden, attn_weights

    def initHidden(self):
        ''' Initialize the hidden state of the encoder, tensor of zeros'''
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [124]:
def indexesFromSentence(lang, sentence):
    ''' Transform a sentence in string format to a list of indexes or integers.
            The model need to be feeded with numbers, not characters
            Input:
                - sentence: a string
            Output:
                - a list of integers, the representation of the sentence in the vector space.
    '''
    return [lang.word2index[word] for word in sentence.split(' ')]

def tensorFromSentence(lang, sentence):
    ''' Transform a sentence in string format to tensor of indexes or integers.
            Out pytorch model work with tensor objects
            Input:
                - sentence: a string
            Output:
                - a tensor of integers, the representation of the sentence in the vector space.
    '''
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)


def tensorsFromPair(pair):
    ''' Convert a pair of text data (source text, summary) to tensors
        Input:
        - pair: tuple of strings, the source text and its summary
        Output:
        - tuple of tensors, the input tensor and the outout one
    '''
    # Convert the source text to the input tensor
    input_tensor = tensorFromSentence(input_lang, pair[0])
    # Convert the summary to the output tensor
    target_tensor = tensorFromSentence(output_lang, pair[1])
    return (input_tensor, target_tensor)

In [125]:
teacher_forcing_ratio = 0.5

In [126]:
def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    ''' Run all the steps in the training phase of a batch of examples
        Input:
        - input_tensor: a tensor, vector representation of the input text
        - target_tensor: a tensor, vector representation of the expect or labelled output or summary
        - encoder: a Class Encoder object, the encoder
        - decoder: a Class AttnDecoder object, the decoder
        - encoder_optimizer: a torch optimizer, the optimizer of the encoder
        - decoer_optimizer: a torch optimizer, the optimizer of the decoder
        - criterion: a pytoch loss function
        - max_length: an integer, maximun length of an output
    '''
    #Init the encoder hidden state
    encoder_hidden = encoder.initHidden()
    
    # Reset the optimizer
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    # Set the length if the source text and the summary
    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)
    # Create the initial encoder output, all zeros
    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    loss = 0
    # For every token in the source text or inout
    for ei in range(input_length):
        # Forward pass of the encoder to get the encoder output and hidden state
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]
        
    # Set the initial decoder input as the SOS token
    decoder_input = torch.tensor([[SOS_token]], device=device)
    #Set the initial decoder hidden state equals to the last encoder hidden state
    decoder_hidden = encoder_hidden

    # Active teacher forcing with probability teacher_forcing_ratio 
    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            # Forward pass of the decoder returning the decoder output, hidden state and context vector
            # of the attention mechanism
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            # Increment the loss function by the loss of the decoder output in the actual time step
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]  # Teacher forcing

    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):
            # Forward pass of the decoder returning the decoder output, hidden state and context vector
            # of the attention mechanism
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
             # Select the decoder output with the highest probability
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input
            # Increment the loss function by the loss of the decoder output in the actual time step
            loss += criterion(decoder_output, target_tensor[di])
            # Stop training if the EOS token is returned
            if decoder_input.item() == EOS_token:
                break
   # Apply the backward pass to calculate and propagate the loss
    loss.backward()
    # Apply a step of the optimizers
    encoder_optimizer.step()
    decoder_optimizer.step()
    
    # Return the final loss
    return loss.item() / target_length

In [127]:
import time
import math

def asMinutes(s):
    ''' Return the seconds, s, to a string in the format: Xm Ys'''
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

def timeSince(since, percent):
    ''' Return '''
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

def showPlot(points):
    ''' Plot the points in a line graph to show a training metric'''
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

In [128]:
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    ''' Train a encoder-decoder model on the input x for n_iters iterations
        Input:
        - encoder: a Class Encoder object, the encoder
        - decoder: a Class AttnDecoder object, the decoder
        - x: array of strings, source texts of the training dataset
        - y: array of strings, target texts or summaries of the training dataset
        - vocab_input: a Vocab Class object, vocabulary of the source texts
        - vocab_output: a Vocab Class object, vocabulary of the target texts
        - n_iters: integer, number of iterations
        - print_every: integer, print the progress every print_every iteration
        - plot_every: integer, plot the losses every plot_every iteration
        - learning_rate: float, learning rate
    '''

    print("Training....")
    # Get the current time
    start = time.time()
    # Initialize variables for progress tracking
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every
    # Create the optimizer for the encoder and the decoder
    encoder_optimizer = optim.AdamW(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.AdamW(decoder.parameters(), lr=learning_rate)
    # Extract the training set randomly for all the iterations
    training_pairs = [tensorsFromPair(random.choice(pairs))
                      for i in range(n_iters)]
    # Set the function loss to apply
    criterion = nn.NLLLoss()

    for iter in range(1, n_iters + 1):
        if iter% 1000 == 0:
            print(iter,"/",n_iters + 1) # Plot progress
            
        # Get the next pair of source text and target to train on
        training_pair = training_pairs[iter - 1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]
        # Train on the pair of data selected
        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        # Set the variable to plot the progress
        print_loss_total += loss
        plot_loss_total += loss

        if iter % print_every == 0:
            # Print the ETA and current loss
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                         iter, iter / n_iters * 100, print_loss_avg))

        if iter % plot_every == 0:
            # Plot the current loss
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)

In [129]:
def predict(encoder, decoder, sentence, input_lang, output_lang, max_length=MAX_LENGTH):
    ''' Function to predict the summary of the source text sentence with a max length
        Input:
        - encoder: a Class Encoder object, the encoder
        - decoder: a Class AttnDecoder object, the decoder
        - input_lang: a Vocab Class object, vocabulary of the source texts
        - output_lang: a Vocab Class object, vocabulary of the target texts
        - sentence: string, source text to predict
    '''
    with torch.no_grad():
        # Get the tensor of the source text
        input_tensor = tensorFromSentence(input_lang, sentence)
        # Calculate the length of the source text
        input_length = input_tensor.size()[0]
        # Set the initial hidden state of the encoder
        encoder_hidden = encoder.initHidden()
        # Set the initial encoder outputs
        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
        # For every word in the input
        for ei in range(input_length):
            # Forward pass of the encoder
            encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
            encoder_outputs[ei] = encoder_output[0, 0]  # Update encoder_outputs

        # Initialize decoder_attentions with the correct dimensions
        decoder_attentions = torch.zeros(max_length, input_length)

        # Set the initial input of the decoder 
        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS
        # Set the initial hidden state of the decoder to the hidden state of the decoder in the last time step
        decoder_hidden = encoder_hidden

        decoded_words = []
        # For every word or step in the output sequence
        for di in range(max_length):
            # Forward pass of the decoder
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            # Save the decoder attention vector of the step
            decoder_attentions[di, :input_length] = decoder_attention.data.squeeze()
            # Get the element in the decoder output with the highest probability (the best output)
            topv, topi = decoder_output.data.topk(1)
            # If the token returned is EOS then finish
            if topi.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            else:
                # Append the token in the summary returned by the decoder
                decoded_words.append(output_lang.index2word[topi.item()])
            # Set the decoder input to the output selected
            decoder_input = topi.squeeze().detach()

        return decoded_words, decoder_attentions[:di + 1]


In [130]:
def generate_predictions(x_test, encoder, decoder, input_vocab, output_vocab, max_length, print_every=20):
    ''' Generate the predicted summaries of the source texts on x_test
        Input:
        - x_test: list of strings, the source texts
        - encoder: a Class Encoder object, the encoder
        - decoder: a Class AttnDecoder object, the decoder
        - input_vocab: a Vocab Class object, vocabulary of the source texts
        - output_vocab: a Vocab Class object, vocabulary of the target texts
        - max_length: integer, max length of the output summary
        - print_every: integer, print progress every print_every iterations
    '''
    predicted_summaries = []
    # Set a progress bar
    #kbar = pkbar.Kbar(target=len(x_test), width=8)
    # Para cada text or document in the validation dataset
    for i,doc in enumerate(x_test):
        # Predict the summary for the document
        #pred_summ = predict(doc,vocab,params,batch_size=1)
        pred_summ,_ = predict(encoder, decoder, doc, input_vocab, output_vocab, max_length)
        predicted_summaries.append(' '.join(pred_summ[:-1]))
        #predicted_summaries.append(' '.join(pred_summ))
        
        #if i%print_every==0:
        #    kbar.update(i)
            
    # Set teh labeled summaries as the y_test variable, column summary of our dataset
    return predicted_summaries

In [131]:
def evaluateRandomly(encoder, decoder, n=10):
    for i in range(n):
        pair = random.choice(pairs)
        print('>', pair[0])
        print('=', pair[1])
        output_words, attentions = predict(encoder, decoder, pair[0])
        output_sentence = ' '.join(output_words)
        print('<', output_sentence)
        print('')

In [132]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [133]:
hidden_size = 100

encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.2).to(device)

trainIters(encoder1, attn_decoder1, 5000, print_every=100)

Training....
0m 16s (- 13m 9s) (100 2%) 5.1421
0m 32s (- 12m 48s) (200 4%) 4.2405
0m 47s (- 12m 29s) (300 6%) 4.0311
1m 3s (- 12m 12s) (400 8%) 3.9682
1m 19s (- 11m 55s) (500 10%) 3.8542
1m 34s (- 11m 36s) (600 12%) 3.6254
1m 50s (- 11m 20s) (700 14%) 3.9851
2m 6s (- 11m 1s) (800 16%) 3.1434
2m 21s (- 10m 44s) (900 18%) 3.7077
1000 / 5001
2m 37s (- 10m 28s) (1000 20%) 3.7631
2m 52s (- 10m 11s) (1100 22%) 3.4893
3m 7s (- 9m 54s) (1200 24%) 2.9858
3m 23s (- 9m 38s) (1300 26%) 3.1708
3m 38s (- 9m 22s) (1400 28%) 3.2616
3m 54s (- 9m 6s) (1500 30%) 3.5877
4m 9s (- 8m 50s) (1600 32%) 3.7465
4m 25s (- 8m 34s) (1700 34%) 3.4385
4m 40s (- 8m 18s) (1800 36%) 2.9138
4m 56s (- 8m 3s) (1900 38%) 3.8292
2000 / 5001
5m 11s (- 7m 47s) (2000 40%) 3.4716
5m 26s (- 7m 30s) (2100 42%) 3.6602
5m 42s (- 7m 15s) (2200 44%) 3.8371
5m 57s (- 6m 59s) (2300 46%) 3.6999
6m 12s (- 6m 43s) (2400 48%) 3.9038
6m 28s (- 6m 28s) (2500 50%) 3.3541
6m 43s (- 6m 12s) (2600 52%) 2.9057
6m 58s (- 5m 56s) (2700 54%) 3.3697
7

In [134]:
torch.save(encoder1.state_dict(), './enc.w')
torch.save(attn_decoder1.state_dict(), './att.w')
# Save the vocabularies
input_lang.save_to_file('input_vocab.pkl')
output_lang.save_to_file('output_vocab.pkl')

In [143]:
# Save the model parameters
torch.save(encoder1.state_dict(), 'encoder_checkpoint.pth')
torch.save(attn_decoder1.state_dict(), 'decoder_checkpoint.pth')

In [135]:
x_test = validate_data['first_60_words'].values
y_test = validate_data['first_60_words_summary'].values
# Generate the predctions on the validation dataset
predicted_summaries = generate_predictions(x_test, encoder1, attn_decoder1, input_lang, output_lang, MAX_LENGTH, 100)
# Set teh labeled summaries as the y_test variable, column summary of our dataset
labeled_summaries = y_test

In [137]:
print('\n Pred: ',predicted_summaries[1],'\n Target: ', labeled_summaries[1])
print('\n Pred: ',predicted_summaries[2],'\n Target: ', labeled_summaries[2])
print('\n Pred: ',predicted_summaries[3],'\n Target: ', labeled_summaries[3])
print('\n Pred: ',predicted_summaries[4],'\n Target: ', labeled_summaries[4])
print('\n Pred: ',predicted_summaries[5],'\n Target: ', labeled_summaries[5])


 Pred:  in another part of the forest to the river is now of his tribe of the forest habits of his tribe of the forest by a long semi-nude war-painted body and chingachgook with his hunting river and chingachgook with his hunting shirt to be his semi-nude war-painted body the forest hawkeye and chingachgook to be forest by the river and chingachgook to be 
 Target:  in another part of the forest by the river a few miles to the west hawkeye and chingachgook appear to be waiting for someone as they talk with low voices it is now afternoon the indian and the scout are attired according to their forest habits chingachgook with his semi-nude war-painted body and scalping tuft of hair his tomahawk scalping knife and short rifle hawkeye with his hunting shirt skin cap buckskin leggings knife pouch and horn and long rifle they discuss their respective forefathers and chingachgook relates the slow demise of his tribe of mohicans so that only he

 Pred:  when the mounted party from fort howard 

In [139]:
def save_textfile(filename, strings):
    ''' Save the contect of a list of strings to a file called filename
    
        Input:
           - filename: name of the file to save the strings
           - strings: a list of string to save to disk
    '''
    
    with open(filename, 'w') as f:
        for item in strings:
            #Remove any \n in the string
            item = remove_CTL(item)
            f.write("%s\n" % item)

def eval_metrics(preds, targets, avg=True):
    ''' Evaluate the ROUGE metrics ROUGE-2 and ROUGE-L for every pair predicted summary - target summary
    
        Input:
           - preds: list of strings, predicted summaries
           - targets: list of string, target summaries
        Output:
            - rouge2_f_metric: list of float, the Rouge-2 fscore for every predicted summary
            - rougel_f_metric: list of float, the Rouge-L fscore for every predicted summary
    '''
    #Lets calculate the rouge metrics for every document
    rouge = Rouge()
    scores = rouge.get_scores(preds, targets, avg)
    # Create the output variables
    if avg:
        rouge2_f_metric = scores['rouge-2']['f']
        rouge2_p_metric = scores['rouge-2']['p']
        rouge2_r_metric = scores['rouge-2']['r']
        rougel_f_metric = scores['rouge-l']['f']
        rougel_p_metric = scores['rouge-l']['p']
        rougel_r_metric = scores['rouge-l']['r']
    else:
        rouge2_f_metric = [score['rouge-2']['f'] for score in scores]
        rouge2_p_metric = [score['rouge-2']['p'] for score in scores]
        rouge2_r_metric = [score['rouge-2']['r'] for score in scores]
        rougel_f_metric = [score['rouge-l']['f'] for score in scores]
        rougel_p_metric = [score['rouge-l']['p'] for score in scores]
        rougel_r_metric = [score['rouge-l']['r'] for score in scores]

    return rouge2_f_metric, rouge2_p_metric, rouge2_r_metric, rougel_f_metric, rougel_p_metric, rougel_r_metric

In [141]:
# Calculate the Rouge-2 and Rouge-L metrics for the validation dataset
r2_f, r2_p, r2_r, rl_f, rl_p, rl_r = eval_metrics(predicted_summaries, list(labeled_summaries), False)
print('Mean Rouge-2 FScore: ',np.mean(r2_f), 'Mean Rouge-L FScore: ',np.mean(rl_f))
#Store the results on the dataframe
validate_data['pred_summary'] = predicted_summaries
validate_data['rouge2-f'] = r2_f
validate_data['rouge2-p'] = r2_p
validate_data['rouge2-r'] = r2_r
validate_data['rougel-f'] = rl_f
validate_data['rougel-p'] = rl_p
validate_data['rougel-r'] = rl_r

Mean Rouge-2 FScore:  0.2669670780243491 Mean Rouge-L FScore:  0.4223856568250451


In [142]:
validate_data.to_csv('results.csv', index=False)
validate_data.head(5)

Unnamed: 0,summary_id,chapter,chapter_length,summary_name,summary_text,summary_analysis,summary_length,analysis_length,first_60_words,first_60_words_summary,pred_summary,rouge2-f,rouge2-p,rouge2-r,rougel-f,rougel-p,rougel-r
0,chapters 1-2,mine ear is open and my heart prepared the wor...,6471.0,Chapters 1-2,before any characters appear the time and geog...,These two chapters introduce the reader to the...,388.0,473.0,mine ear is open and my heart prepared the wor...,before any characters appear the time and geog...,in the time and is the last war to the horrors...,0.151515,0.151515,0.151515,0.263736,0.705882,0.162162
1,chapter 3,before these fields were shorn and tilled full...,3132.0,Chapter 3,in another part of the forest by the river a f...,This chapter introduces the other three main a...,198.0,149.0,before these fields were shorn and tilled full...,in another part of the forest by the river a f...,in another part of the forest to the river is ...,0.335404,0.435484,0.272727,0.395833,0.730769,0.271429
2,chapter 4,well go thy way thou shalt not from this grove...,3075.0,Chapter 4,when the mounted party from fort howard approa...,Since this chapter is mostly one of surface ac...,319.0,75.0,well go thy way thou shalt not from this grove...,when the mounted party from fort howard approa...,when the mounted party from fort howard approa...,0.30303,0.30303,0.30303,0.490196,0.862069,0.342466
3,chapter 5,in such a night did thisbe fearfully o ertrip ...,3268.0,Chapter 5,the pursuit of magua is unsuccessful but hawke...,Here the reader encounters the first bloodshed...,329.0,156.0,in such a night did thisbe fearfully o ertrip ...,the pursuit of magua is unsuccessful but hawke...,the pursuit of is unsuccessful has wounded him...,0.10101,0.10101,0.10101,0.288889,0.764706,0.178082
4,chapter 6,those strains that once did sweet in zion glid...,3873.0,Chapter 6,heyward and the girls are uneasy and gamut is ...,This chapter shows Cooper in his most inventiv...,321.0,128.0,those strains that once did sweet in zion glid...,heyward and the girls are uneasy and gamut is ...,heyward and the girls have gamut have uneasy a...,0.292398,0.347222,0.252525,0.455446,0.821429,0.315068
