Import all the required packages

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate, TimeDistributed
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [124]:
# Read at most the first 100,000 rows of the pre-processed reviews file.
data = pd.read_csv(
    'processed_reviews.csv',
    nrows=100_000,
)

Now we will set a maximum length for both the text and the summary. First, we compute the number of words in every text and every summary and take their means.

In [125]:
# Number of whitespace-separated words in every review text and in every summary.
text_len_list = [len(text.split()) for text in data['text']]
summ_len_list = [len(summary.split()) for summary in data['summary']]

In [126]:
# Average word count per review text and per summary.
mean_text_len = np.mean(text_len_list)
mean_summ_len = np.mean(summ_len_list)
print(mean_text_len, mean_summ_len)

38.792962162284525 4.0105488336295005


With that, we will cap the text length at 30 words (somewhat below the mean of about 38). A summary of just 4 words is too short, so we'll allow around 8 words for each summary.

In [127]:
# Fixed sequence lengths (in tokens) that texts and summaries are padded/truncated to.
text_len, summ_len = 30, 8

Now we will add unique start and end tokens to each sentence in data['summary'].

In [128]:
# Surround every summary with the 'summstart' / 'summend' marker words.
data['summary'] = data['summary'].map(
    lambda summary: 'summstart ' + summary + ' summend'
)

With that, each sentence in data['summary'] will look something like this:

In [129]:
# Show the summary stored under index label 0 to check the added markers.
data.loc[0, 'summary']

'summstart good quality dog food summend'

Splitting the data into training and validation sets

In [142]:
# Hold out 10% of the (text, summary) pairs for validation. The split is
# shuffled with a fixed seed so it is reproducible. `train_test_split` is
# imported in the notebook's top import cell.
x_train, x_val, y_train, y_val = train_test_split(
    np.array(data['text']),
    np.array(data['summary']),
    test_size=0.1,
    random_state=0,
    shuffle=True,
)

### Tokenizer

We will only consider words that are more frequent and remove any words that occur very rarely.

In [143]:
def token_words_len(tokenizer, min_occr):
    """Report how rare the rare words are and count the words that remain.

    A word is "rare" when its count in ``tokenizer.word_counts`` is strictly
    below ``min_occr``. Two ratios are printed: the fraction of distinct words
    that are rare, and the fraction of all word occurrences that they cover.

    Parameters
    ----------
    tokenizer : object exposing a ``word_counts`` mapping of word -> count.
    min_occr : int
        Minimum count a word needs in order not to be treated as rare.

    Returns
    -------
    int
        Number of distinct words whose count is at least ``min_occr``.
    """
    occurrences = list(tokenizer.word_counts.values())
    rare_occurrences = [n for n in occurrences if n < min_occr]

    vocab_size = len(occurrences)
    rare_size = len(rare_occurrences)

    print("% of rare words: ", rare_size / vocab_size)
    print("total coverage of rare words: ", sum(rare_occurrences) / sum(occurrences))
    return vocab_size - rare_size

In [144]:
# First pass: fit on the training texts only, to collect per-word counts.
text_tokenizer = Tokenizer()
text_tokenizer.fit_on_texts(list(x_train))

# Second pass: rebuild the tokenizer restricted to the non-rare words
# (those seen at least 4 times). Keras keeps only word indices strictly below
# `num_words`, i.e. the `num_words - 1` most frequent words, so add 1 to make
# sure every non-rare word keeps its index.
x_tokenizer = Tokenizer(num_words=token_words_len(text_tokenizer, 4) + 1)
x_tokenizer.fit_on_texts(list(x_train))

% of rare words:  0.6423736937973349
total coverage of rare words:  0.014954988379241722


In [145]:
# First pass: fit on the training summaries only, to collect per-word counts.
summ_tokenizer = Tokenizer()
summ_tokenizer.fit_on_texts(list(y_train))

# Second pass: rebuild the tokenizer restricted to the non-rare words
# (those seen at least 6 times). Keras keeps only word indices strictly below
# `num_words`, i.e. the `num_words - 1` most frequent words, so add 1 to make
# sure every non-rare word keeps its index.
y_tokenizer = Tokenizer(num_words=token_words_len(summ_tokenizer, 6) + 1)
y_tokenizer.fit_on_texts(list(y_train))

% of rare words:  0.7533709067560819
total coverage of rare words:  0.0373879678987757


In [146]:
# Encode every review text as a sequence of integer word indices.
x_train_seq, x_val_seq = [
    x_tokenizer.texts_to_sequences(texts) for texts in (x_train, x_val)
]

# Bring every sequence to exactly `text_len` entries, padding at the end.
x_train, x_val = [
    pad_sequences(seq, maxlen=text_len, padding='post')
    for seq in (x_train_seq, x_val_seq)
]

# Vocabulary size handed to the encoder embedding layer.
x_voc = 1 + x_tokenizer.num_words

In [147]:
# Encode every summary as a sequence of integer word indices.
y_train_seq, y_val_seq = [
    y_tokenizer.texts_to_sequences(summaries) for summaries in (y_train, y_val)
]

# Bring every sequence to exactly `summ_len` entries, padding at the end.
# NOTE(review): pad_sequences truncates from the front by default, so a summary
# longer than `summ_len` tokens would lose its leading 'summstart' marker --
# confirm this is intended.
y_train, y_val = [
    pad_sequences(seq, maxlen=summ_len, padding='post')
    for seq in (y_train_seq, y_val_seq)
]

# Vocabulary size handed to the decoder embedding and output layers.
y_voc = 1 + y_tokenizer.num_words

Remove any rows that have only start and end tokens

In [148]:
def delete_empty_sentence(data):
    """Return the positions of rows holding exactly two non-zero entries.

    Parameters
    ----------
    data : sequence of integer sequences (e.g. a 2-D array of padded token ids).

    Returns
    -------
    list of int
        Row positions, in ascending order, whose number of non-zero entries
        equals 2.
    """
    return [
        row_pos
        for row_pos, row in enumerate(data)
        if sum(1 for token in row if token != 0) == 2
    ]

In [150]:
# Find the training summaries with exactly two non-zero tokens, then drop
# those rows from both the summaries and their paired texts.
train_index = delete_empty_sentence(y_train)
x_train, y_train = (
    np.delete(split, train_index, axis=0) for split in (x_train, y_train)
)
        

In [151]:
# Find the validation summaries with exactly two non-zero tokens and drop them
# from the validation arrays. The indices refer to rows of y_val, so they must
# be applied to y_val / x_val and never to the training arrays.
val_index = delete_empty_sentence(y_val)
y_val = np.delete(y_val, val_index, axis=0)
# Deleting the corresponding text to the summary
x_val = np.delete(x_val, val_index, axis=0)

With that done, it's time to build our model. Before that, let's understand how the encoder-decoder model works.

In [None]:
def summarizer_model(input_shape):
    """Build the encoder-decoder summarization model with attention.

    The encoder embeds the input tokens and runs them through three stacked
    LSTM layers. The decoder embeds its own input, runs one LSTM initialised
    with the final encoder states, combines its output with the attention
    output, and applies a per-step softmax over ``y_voc`` classes.

    Relies on the notebook-level names ``x_voc`` and ``y_voc`` and on an
    ``AttentionLayer`` class that must already be in scope.
    NOTE(review): ``AttentionLayer`` is not among the layers imported at the
    top of the notebook -- confirm where it is defined.

    Parameters
    ----------
    input_shape : tuple
        Shape given to the encoder ``Input`` layer.

    Returns
    -------
    Model
        Uncompiled Keras model mapping ``[encoder_input, decoder_input]`` to
        the decoder's per-step softmax output.
    """
    embedding_dim = 100
    latent_dim = 300

    # ENCODER

    # Input Layer
    e_input_layer = Input(shape=input_shape)

    # Embedding layer, applied to the encoder input defined just above.
    e_embedding = Embedding(x_voc, embedding_dim, trainable=True)(e_input_layer)

    # LSTM Layers: each layer feeds its full output sequence to the next one.
    # Only the states of the last layer are used (to initialise the decoder).
    e_lstm_1 = LSTM(latent_dim, return_sequences=True, return_state=True, dropout=0.4, recurrent_dropout=0.4)
    e_output_1, _, _ = e_lstm_1(e_embedding)
    e_lstm_2 = LSTM(latent_dim, return_sequences=True, return_state=True, dropout=0.4, recurrent_dropout=0.4)
    e_output_2, _, _ = e_lstm_2(e_output_1)
    e_lstm_3 = LSTM(latent_dim, return_state=True, return_sequences=True, dropout=0.4, recurrent_dropout=0.4)
    encoder_outputs, state_h, state_c = e_lstm_3(e_output_2)

    # DECODER
    d_input_layer = Input(shape=(None,))

    # Embedding layer
    d_embedding_layer = Embedding(y_voc, embedding_dim, trainable=True)
    d_embedding = d_embedding_layer(d_input_layer)

    # The decoder LSTM starts from the final hidden/cell states of the encoder.
    d_lstm = LSTM(latent_dim, return_sequences=True, return_state=True, dropout=0.4, recurrent_dropout=0.2)
    d_output, _, _ = d_lstm(d_embedding, initial_state=[state_h, state_c])

    # Attention layer
    attn_layer = AttentionLayer(name='attention_layer')
    attn_out, _ = attn_layer([encoder_outputs, d_output])

    # Concat attention input and decoder LSTM output
    decoder_concat_input = Concatenate(axis=-1, name='concat_layer')([d_output, attn_out])

    # Dense layer: softmax over the summary vocabulary at every decoder step.
    decoder_dense = TimeDistributed(Dense(y_voc, activation='softmax'))
    d_probs = decoder_dense(decoder_concat_input)

    # Define the model and hand it back to the caller.
    model = Model([e_input_layer, d_input_layer], d_probs)
    return model
    


    