In [0]:
import numpy as np
import pandas as pd
import nltk
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import re, string
from sklearn.model_selection import train_test_split

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.preprocessing import sequence
from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer

from keras.layers import Input, LSTM, Embedding, Dense
from keras.models import Model

In [0]:
# Read the exported synopsis/summary data from CSV.
# NOTE(review): `error_bad_lines` was deprecated in later pandas releases in
# favour of `on_bad_lines` -- confirm against the installed pandas version.
df = pd.read_csv(
    'export_dataframe.csv',
    error_bad_lines=False,
    na_values=['nan'],
)
# Keep handles on the two columns the following cells work with
text, target = df['Synopsis'], df['Summary']

In [26]:
# Build the two-column working frame (Synopsis, Summary) as strings.
# Rows with a missing Summary are dropped *before* the string cast, so the
# filter does not rely on NaN being rendered as the literal text 'nan'.
# The chain produces a fresh frame rather than a boolean-filtered slice, which
# avoids SettingWithCopyWarning when later cells assign new columns to `df`.
df = (
    pd.DataFrame({'Synopsis': text, 'Summary': target})
    .dropna(subset=['Summary'])
    .astype(str)
)
df

Unnamed: 0,Synopsis,Summary
0,The unforgettable novel of a childhood in a sl...,harper lee consid book simpl love story
1,"The year 1984 has come and gone, but George Or...",deni novel hold imagin generations power admon...
2,Alternate Cover Edition ISBN: 0743273567 (ISBN...,stori fabul wealthi jay gatsbi new love beauti...
3,Harry Potter's life is miserable. His parents ...,harri potter life miserable
4,George Orwell's timeless and timely allegorica...,anim farm published stalinist russia seen target
...,...,...
9996,Ballu is a gangster who is arrested by Inspec...,ram show compass ballu tri provid inform lead ...
9997,The central figure of the film is represented ...,scienc use brain art use brain heart
9998,"Set in 2003, Huck Cheever is a young and tale...",billi older sister suzann warn huck hustl 10 c...
9999,The Namesake depicts the struggles of Ashoke a...,short after gogol vacat maxin family ashok dies


In [0]:
# Method for cleaning the synopsis text data
def clean_text(text, stop_words=None):
    """Normalise a synopsis into a lower-case, punctuation-free token string.

    Steps, in order: cast to ``str``, lower-case, collapse whitespace runs to a
    single space, delete every character in ``string.punctuation`` (characters
    are removed, not replaced by a space), then drop stop words.

    Parameters
    ----------
    text : object
        Value to clean; it is passed through ``str()`` first, so non-string
        inputs (e.g. NaN) are cleaned as their string representation.
    stop_words : collection of str, optional
        Words to discard. Defaults to ``ENGLISH_STOP_WORDS`` when omitted.

    Returns
    -------
    str
        Remaining words joined by single spaces.
    """
    if stop_words is None:
        stop_words = ENGLISH_STOP_WORDS
    text = str(text).lower()
    # Raw string: '\s' in a plain literal is an invalid escape sequence
    text = re.sub(r'\s+', ' ', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    return ' '.join(w for w in text.split() if w not in stop_words)

# Replace each raw synopsis with its cleaned form (element-wise)
df['Synopsis'] = df['Synopsis'].map(clean_text)

In [45]:
# Method for cleaning summary text data
def clean_title(title, stop_words=None):
    """Normalise a summary into a lower-case, punctuation-free token string.

    Steps, in order: cast to ``str``, lower-case, collapse whitespace runs to a
    single space, delete every character in ``string.punctuation`` (characters
    are removed, not replaced by a space), then drop stop words.

    Parameters
    ----------
    title : object
        Value to clean; it is passed through ``str()`` first.
    stop_words : collection of str, optional
        Words to discard. Defaults to ``ENGLISH_STOP_WORDS`` when omitted.

    Returns
    -------
    str
        Remaining words joined by single spaces.
    """
    if stop_words is None:
        stop_words = ENGLISH_STOP_WORDS
    title = str(title).lower()
    # Raw string: '\s' in a plain literal is an invalid escape sequence
    title = re.sub(r'\s+', ' ', title)
    title = title.translate(str.maketrans('', '', string.punctuation))
    return ' '.join(w for w in title.split() if w not in stop_words)

# Clean every summary and wrap it in the start/end marker tokens in one pass
df['CleanTitle'] = '_START_ ' + df['Summary'].apply(clean_title) + ' _END_'
print(df['CleanTitle'])

0        _START_ harper lee consid book simpl love stor...
1        _START_ deni novel hold imagin generations pow...
2        _START_ stori fabul wealthi jay gatsbi new lov...
3                _START_ harri potter life miserable _END_
4        _START_ anim farm published stalinist russia s...
                               ...                        
9996     _START_ ram compass ballu tri provid inform le...
9997     _START_ scienc use brain art use brain heart _...
9998     _START_ billi older sister suzann warn huck hu...
9999     _START_ short gogol vacat maxin family ashok d...
10000    _START_ practic line bathroom younger brother ...
Name: CleanTitle, Length: 9606, dtype: object


In [0]:
# Some globals required for later
latentdim = 500
max_text_length = 125
max_target_length = 20

# Source side: fit a tokenizer on the cleaned synopses, turn every synopsis
# into a list of integer word ids, then zero-pad at the end so all rows have
# length max_text_length.
x_tokenizer = Tokenizer()
x_tokenizer.fit_on_texts(df['Synopsis'])
x_tr = sequence.pad_sequences(
    x_tokenizer.texts_to_sequences(df['Synopsis']),
    maxlen=max_text_length,
    padding='post',
)

# Target side: same procedure for the marker-wrapped summaries, padded to
# max_target_length.
y_tokenizer = Tokenizer()
y_tokenizer.fit_on_texts(df['CleanTitle'])
y_tr = sequence.pad_sequences(
    y_tokenizer.texts_to_sequences(df['CleanTitle']),
    maxlen=max_target_length,
    padding='post',
)


In [54]:
# Load in the pretrained model output by Training_Model.ipynb and print its
# layer table; later cells pick layers out of `model.layers` by position.
# NOTE(review): the tokenizers above are re-fitted in this notebook, so the
# word ids presumably need to match the ones used during training -- confirm.
from keras.models import load_model
model = load_model('model.h5')
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 125)          0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 125, 500)     48378000    input_1[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, None, 500)    18539000    input_2[0][0]                    
____________________________________________________________________________________________

In [0]:
# Build the two inference-time models (encoder and single-step decoder) by
# re-using layers of the loaded training model, selected by position.

# Decoder token input: variable-length sequence of word ids
dec_inputs = Input(shape=(None,))

# Encoder model: maps the first input layer to the output of layers[4].
# NOTE(review): layers[4] is presumably the encoder LSTM returning
# (sequence output, h, c) -- decode_sequence unpacks three values; confirm
# against the full model summary.
enc_model = Model(inputs=model.layers[0].input, outputs=model.layers[4].output)

# Placeholders for the decoder's incoming hidden/cell states and for the
# encoder's per-timestep output
dec_state_in_h = Input(shape=(latentdim,))
dec_state_in_c = Input(shape=(latentdim,))
dec_hidden_in = Input(shape=(max_text_length,latentdim))

# Grab the decoding embedding layer (layers[3]) and apply it to the decoder input
dec_emb = model.layers[3]
dec_emb_pred = dec_emb(dec_inputs)

# Grab the LSTM from the decoding part of the model
# NOTE(review): assumes layers[5] is the decoder LSTM -- confirm
dec_lstm = model.layers[5]

# Run the decoder LSTM from the supplied states and collect its output and new states
out, state_h, state_c = dec_lstm(dec_emb_pred, initial_state=[dec_state_in_h, dec_state_in_c])

# Grab the Dense layer of the model and apply it to the LSTM output
# NOTE(review): assumes layers[6] is the output Dense layer -- confirm
dec_dense = model.layers[6]
out = dec_dense(out)

# Assemble the decoder inference model: token input plus encoder output and
# states in, word scores plus updated states out.
# NOTE(review): dec_hidden_in is listed as a model input but is not used to
# compute any of the outputs in this cell.
dec_model = Model([dec_inputs] + [dec_hidden_in,dec_state_in_h, dec_state_in_c], [out, state_h, state_c])

In [0]:
# Lookup tables taken from the fitted tokenizers:
#   id -> word for the summary (target) and synopsis (source) vocabularies
reverse_target_word_index = y_tokenizer.index_word
reverse_source_word_index = x_tokenizer.index_word
#   word -> id for the summary (target) vocabulary
target_word_index = y_tokenizer.word_index

In [0]:
def decode_sequence(input_seq):
    """Greedily decode a summary for one encoded synopsis.

    The encoder model is run once on ``input_seq``; the decoder model is then
    called one token at a time, starting from the 'start' token and feeding
    each predicted token (with the updated states) back in.

    Decoding stops when any of the following happens:
      * the predicted id is 0 (the padding value),
      * the predicted word is 'end',
      * ``max_target_length - 1`` words have been produced.

    Parameters
    ----------
    input_seq : array
        Padded synopsis ids in the form accepted by ``enc_model.predict``
        (callers in this notebook pass shape ``(1, max_text_length)``).

    Returns
    -------
    str
        The predicted words, each preceded by a single space (so a non-empty
        result starts with a space); ``''`` if nothing was produced.
    """
    # Encoder output and final states seed the decoder
    e_out, e_h, e_c = enc_model.predict(input_seq)

    # One-token decoder input, initialised with the start marker
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = target_word_index['start']

    decoded_tokens = []
    while len(decoded_tokens) < max_target_length - 1:
        # Predict the next-word scores and the updated decoder states
        output_tokens, h, c = dec_model.predict([target_seq] + [e_out, e_h, e_c])
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))

        # Id 0 is the padding value: treat it as the summary being finished
        if sampled_token_index == 0:
            break

        sampled_token = reverse_target_word_index[sampled_token_index]

        # The end marker terminates decoding and is not part of the text
        if sampled_token == 'end':
            break

        decoded_tokens.append(sampled_token)

        # Feed the predicted word and the new states into the next step
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        e_h, e_c = h, c

    return ''.join(' ' + token for token in decoded_tokens)

In [0]:
# Converts a sequence of target ids back into its summary text
def seq2summary(input_seq):
    """Return the words for ``input_seq``, skipping padding (0) and the
    'start'/'end' marker ids; every word is followed by one space."""
    return ''.join(
        reverse_target_word_index[token_id] + ' '
        for token_id in input_seq
        if token_id != 0
        and token_id != target_word_index['start']
        and token_id != target_word_index['end']
    )
# Converts a sequence of source ids back into its synopsis text
def seq2text(input_seq):
    """Return the words for ``input_seq``, skipping padding (0); every word
    is followed by one space."""
    return ''.join(
        reverse_source_word_index[token_id] + ' '
        for token_id in input_seq
        if token_id != 0
    )

In [60]:
# Loop through all rows, printing and collecting the synopsis, the original
# summary and the model's predicted summary for export.
new_df = {'Synopsis':[], 'Extracted Summary':[], 'Abstracted Summary': [] }
for i in range(len(x_tr)):
    # Each value is computed exactly once per row; decode_sequence in
    # particular runs the decoder model repeatedly and is the slow step.
    synopsis = seq2text(x_tr[i])
    print("Synopsis:", synopsis)

    original_summary = seq2summary(y_tr[i])
    print("Original summary:", original_summary)

    predicted_summary = decode_sequence(x_tr[i].reshape(1, max_text_length))
    print("Predicted summary:", predicted_summary)
    print("\n")

    # Append all three together so the lists stay the same length even if
    # the loop is interrupted while a row is being decoded.
    new_df['Synopsis'].append(synopsis)
    new_df['Extracted Summary'].append(original_summary)
    new_df['Abstracted Summary'].append(predicted_summary)

Synopsis: unforgettable novel childhood sleepy southern town crisis conscience rocked kill mockingbird instant bestseller critical success published 1960 went win pulitzer prize 1961 later academy awardwinning film classiccompassionate dramatic deeply moving kill mockingbird takes readers roots human behavior innocence experience kindness cruelty love hatred humor pathos 18 million copies print translated languages regional story young alabama woman claims universal appeal harper lee considered book simple love story today regarded masterpiece american literature 
Original summary: harper lee consid book simpl love story 
Predicted summary:  movi lesson live life live hari aarthi deaf mute


Synopsis: year 1984 come gone george orwells prophetic nightmarish vision 1949 world timelier 1984 great modern classic negative utopia—a startlingly original haunting novel creates imaginary world completely convincing sentence words deny novels hold imaginations generations power admonitions—a po

KeyboardInterrupt: ignored

In [0]:
# Turn the collected columns into a dataframe and write it to csv for topic modelling
ndf = pd.DataFrame(new_df)
ndf.to_csv('all_summaries.csv', index=False)