In [0]:
from pandas import read_csv
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense,Embedding,LSTM,Input,TimeDistributed,Concatenate
from attention import AttentionLayer
from tensorflow.keras.models import Model
from tensorflow import __version__

In [2]:
__version__

'2.2.0-rc3'

In [0]:
oo=read_csv('smallFINALFINALREVIEW.csv')
texts=[str(e) for e in list(oo.Text)]
summaries=[str(e) for e in list(oo.Summary)]
texts_maxlen=500
summary_maxlen=50

In [0]:
text_tokenizer=Tokenizer()
text_tokenizer.fit_on_texts(texts)
text_sequences=text_tokenizer.texts_to_sequences(texts)
padded_text_sequences=pad_sequences(text_sequences,maxlen=texts_maxlen,padding='post',truncating='post')

summary_tokenizer=Tokenizer()
summary_tokenizer.fit_on_texts(summaries)
summary_sequences=summary_tokenizer.texts_to_sequences(summaries)
padded_summary_sequences=pad_sequences(summary_sequences,maxlen=summary_maxlen,padding='post',truncating='post')

In [5]:
padded_text_sequences.shape

(20000, 500)

In [6]:
padded_summary_sequences.shape

(20000, 50)

In [0]:
X=padded_text_sequences
y=padded_summary_sequences

In [8]:
X.shape

(20000, 500)

In [9]:
y.shape

(20000, 50)

In [0]:
#Glove Embedding
import numpy as np
latent_dim=500
embedding_dim=50
embeddings_index={}
f=open("glove.6B.50d.txt")
for line in f:
  values=line.split()
  word=values[0]
  coefs=np.asarray(values[1:],dtype='float32')
  embeddings_index[word]=coefs
f.close()

encoder_embedding_matrix=np.zeros((len(text_tokenizer.word_index)+1,embedding_dim))
for word, i in text_tokenizer.word_index.items():
  embedding_vector=embeddings_index.get(word)
  if embedding_vector is not None:
    encoder_embedding_matrix[i]=embedding_vector

decoder_embedding_matrix=np.zeros((len(summary_tokenizer.word_index)+1,embedding_dim))
for word, i in summary_tokenizer.word_index.items():
  embedding_vector=embeddings_index.get(word)
  if embedding_vector is not None:
    decoder_embedding_matrix[i]=embedding_vector

In [11]:
x_voc_size=len(text_tokenizer.word_index)
y_voc_size=len(summary_tokenizer.word_index)

#ENCODER
encoder_inputs=Input(shape=(texts_maxlen,))
enc_emb=Embedding(x_voc_size+1,
                  embedding_dim,
                  weights=[encoder_embedding_matrix],
                  trainable=False)(encoder_inputs)

encoder_lstm1=LSTM(latent_dim,return_sequences=True,return_state=True)
encoder_output1, state_h1, state_c1=encoder_lstm1(enc_emb)

encoder_lstm2=LSTM(latent_dim,return_sequences=True,return_state=True)
encoder_output2, state_h2, state_c2=encoder_lstm2(encoder_output1)

encoder_lstm3=LSTM(latent_dim,return_sequences=True,return_state=True)
encoder_outputs, state_h, state_c=encoder_lstm3(encoder_output2)

encoder_states=[state_h,state_c]
#experiment with using embeddings of higher dimensions

#DECODER
decoder_inputs=Input(shape=(None,))
dec_emb_layer=Embedding(y_voc_size+1,
                        embedding_dim,
                        weights=[decoder_embedding_matrix],
                        trainable=False)
dec_emb=dec_emb_layer(decoder_inputs)

decoder_lstm=LSTM(latent_dim,return_sequences=True,return_state=True)
decoder_outputs, decoder_fwd_state, decoder_back_state = decoder_lstm(dec_emb,initial_state=encoder_states)

attn_layer = AttentionLayer(name='attention_layer')
attn_out, attn_states = attn_layer([encoder_outputs,decoder_outputs])

decoder_concat_input = Concatenate(axis=-1, name='concat_layer')([decoder_outputs, attn_out])

decoder_dense=TimeDistributed(Dense(y_voc_size,activation='softmax'))
decoder_outputs=decoder_dense(decoder_concat_input)

model=Model([encoder_inputs, decoder_inputs],decoder_outputs)

model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 500)]        0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 500, 50)      1360100     input_1[0][0]                    
__________________________________________________________________________________________________
lstm (LSTM)                     [(None, 500, 500), ( 1102000     embedding[0][0]                  
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None)]       0                                            
______________________________________________________________________________________________

In [0]:
model.compile('adam',loss='sparse_categorical_crossentropy',metrics=['accuracy'])

In [0]:
encoder_input=X
decoder_input=y[:,:-1]
decoder_target=y.reshape(y.shape[0],y.shape[1],1)[:,1:]
epochs=10

In [14]:
decoder_input.shape

(20000, 49)

In [15]:
decoder_target.shape

(20000, 49, 1)

In [16]:
decoder_input[:1]

array([[  2, 129,  11,  80,   1,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0]], dtype=int32)

In [17]:
decoder_target[:1]

array([[[129],
        [ 11],
        [ 80],
        [  1],
        [  0],
        [  0],
        [  0],
        [  0],
        [  0],
        [  0],
        [  0],
        [  0],
        [  0],
        [  0],
        [  0],
        [  0],
        [  0],
        [  0],
        [  0],
        [  0],
        [  0],
        [  0],
        [  0],
        [  0],
        [  0],
        [  0],
        [  0],
        [  0],
        [  0],
        [  0],
        [  0],
        [  0],
        [  0],
        [  0],
        [  0],
        [  0],
        [  0],
        [  0],
        [  0],
        [  0],
        [  0],
        [  0],
        [  0],
        [  0],
        [  0],
        [  0],
        [  0],
        [  0],
        [  0]]], dtype=int32)

In [18]:
history=model.fit([encoder_input[:15000],decoder_input[:15000]],decoder_target[:15000],epochs=10,validation_split=0.1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [0]:
max_len_text=500
max_len_summary=50
reverse_target_word_index=text_tokenizer.index_word 
reverse_source_word_index=summary_tokenizer.index_word 
target_word_index=summary_tokenizer.word_index
encoder_model = Model(inputs=encoder_inputs,outputs=[encoder_outputs, state_h, state_c])
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_hidden_state_input = Input(shape=(max_len_text,latent_dim))
dec_emb2= dec_emb_layer(decoder_inputs)
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=[decoder_state_input_h, decoder_state_input_c])
attn_out_inf, attn_states_inf = attn_layer([decoder_hidden_state_input, decoder_outputs2])
decoder_inf_concat = Concatenate(axis=-1, name='concat')([decoder_outputs2, attn_out_inf])
decoder_outputs2 = decoder_dense(decoder_inf_concat)
decoder_model = Model(
[decoder_inputs] + [decoder_hidden_state_input,decoder_state_input_h, decoder_state_input_c],
[decoder_outputs2] + [state_h2, state_c2])

In [0]:
reverse_target_word_index=summary_tokenizer.index_word
reverse_source_word_index=text_tokenizer.index_word
def decode_sequence(input_seq):
    e_out, e_h, e_c = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1,1))

    target_seq[0, 0] = target_word_index['alpha']

    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + [e_out, e_h, e_c])


        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_token = reverse_target_word_index[sampled_token_index]

        if(sampled_token!='omega'):
            decoded_sentence += ' '+sampled_token
            if len(decoded_sentence)>=max_len_summary:
              stop_condition=True
        else:
          stop_condition=True

        target_seq = np.zeros((1,1))
        target_seq[0, 0] = sampled_token_index
        e_h, e_c = h, c

    return decoded_sentence

def seq2summary(input_seq):
    newString=''
    for i in input_seq:
      if((i!=0 and i!=target_word_index['alpha']) and i!=target_word_index['omega']):
        newString=newString+reverse_target_word_index[i]+' '
    return newString

def seq2text(input_seq):
    newString=''
    for i in input_seq:
      if(i!=0):
        newString=newString+reverse_source_word_index[i]+' '
    return newString

In [26]:
for i in range(len(X[15000:])):
  print("Review:",seq2text(X[i]))
  print("Original summary:",seq2summary(y[i]))
  print("Predicted summary:",decode_sequence(X[i].reshape(1,max_len_text)))
  print("\n")

Review: nespresso capsules are fantastic but do some homework nespresso offers the same introductory pack when you buy a machine for cents a capsule why would you pay over a dollar a capsule for the same thing amazon why are you allowing this type of price gouging it reflects badly on your site some oversight is needed here 
Original summary: do not buy 
Predicted summary:  great product


Review: this product when received became an instant household staple i share with many others br br the unsalted snack foods offered through amazon can be delivered in case quantity then set out at social events to introduce others to healthier eating br br and one can produce dips for selection at such social events by simply food processing ordinary or ideally organic vegetables also shipped through amazon sellers or found locally br br experiment that s healthy to do your own body is your own body feed your own body carefully br br clint williams br oakland michigan usa 
Original summary: highest

KeyError: ignored