In [0]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, files_here in os.walk('/kaggle/input'):
    full_paths = [os.path.join(folder, file_name) for file_name in files_here]
    for full_path in full_paths:
        print(full_path)

# Any results you write to the current directory are saved as output.

In [0]:
import tensorflow as tf
import os
from tensorflow.python.keras.layers import Layer
from tensorflow.python.keras import backend as K


class AttentionLayer(Layer):
    """
    This class implements Bahdanau attention (https://arxiv.org/pdf/1409.0473.pdf).
    There are three sets of weights introduced: W_a, U_a, and V_a.

    The layer is called on ``[encoder_output_sequence, decoder_output_sequence]``
    and returns ``(context_vectors, attention_weights)``, with one entry per
    decoder time step.
    """

    def __init__(self, **kwargs):
        super(AttentionLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create W_a, U_a and V_a from the [encoder, decoder] input shapes."""
        assert isinstance(input_shape, list)
        # Create the trainable weight variables for this layer.

        # W_a: (en_hidden, en_hidden), applied to the encoder outputs.
        self.W_a = self.add_weight(name='W_a',
                                   shape=tf.TensorShape((input_shape[0][2], input_shape[0][2])),
                                   initializer='uniform',
                                   trainable=True)
        # U_a: (de_hidden, en_hidden), applied to a single decoder output.
        self.U_a = self.add_weight(name='U_a',
                                   shape=tf.TensorShape((input_shape[1][2], input_shape[0][2])),
                                   initializer='uniform',
                                   trainable=True)
        # V_a: (en_hidden, 1), reduces each combined vector to a scalar energy.
        self.V_a = self.add_weight(name='V_a',
                                   shape=tf.TensorShape((input_shape[0][2], 1)),
                                   initializer='uniform',
                                   trainable=True)

        super(AttentionLayer, self).build(input_shape)  # Be sure to call this at the end

    def call(self, inputs, verbose=False):
        """
        Compute the context vectors and attention weights.

        inputs: [encoder_output_sequence, decoder_output_sequence]
        verbose: when True, print the shapes of the intermediate tensors.

        Returns (c_outputs, e_outputs):
            c_outputs => (batch_size, de_seq_len, en_hidden) context vectors
            e_outputs => (batch_size, de_seq_len, en_seq_len) attention weights
        """
        # Accept a tuple as well as a list; only the two-element unpacking matters.
        assert isinstance(inputs, (list, tuple))
        encoder_out_seq, decoder_out_seq = inputs
        if verbose:
            print('encoder_out_seq>', encoder_out_seq.shape)
            print('decoder_out_seq>', decoder_out_seq.shape)

        # Some parameters required for shaping tensors
        en_seq_len, en_hidden = encoder_out_seq.shape[1], encoder_out_seq.shape[2]

        # Computing S.Wa where S=[s0, s1, ..., si].
        # This term does not depend on the decoder step, so it is computed once
        # here instead of inside the step function.
        # <= batch_size*en_seq_len, latent_dim
        reshaped_enc_outputs = K.reshape(encoder_out_seq, (-1, en_hidden))
        # <= batch_size, en_seq_len, latent_dim
        W_a_dot_s = K.reshape(K.dot(reshaped_enc_outputs, self.W_a), (-1, en_seq_len, en_hidden))
        if verbose:
            print('wa.s>',W_a_dot_s.shape)

        def energy_step(step_input, states):
            """ Step function for computing energy for a single decoder state """

            # The message is only formatted if the assertion actually fails.
            assert isinstance(states, (list, tuple)), \
                "States must be a list. However states {} is of type {}".format(states, type(states))

            # Computing hj.Ua
            U_a_dot_h = K.expand_dims(K.dot(step_input, self.U_a), 1)  # <= batch_size, 1, latent_dim
            if verbose:
                print('Ua.h>',U_a_dot_h.shape)

            # tanh(S.Wa + hj.Ua)
            # <= batch_size*en_seq_len, latent_dim
            reshaped_Ws_plus_Uh = K.tanh(K.reshape(W_a_dot_s + U_a_dot_h, (-1, en_hidden)))
            if verbose:
                print('Ws+Uh>', reshaped_Ws_plus_Uh.shape)

            # softmax(va.tanh(S.Wa + hj.Ua))
            # <= batch_size, en_seq_len
            e_i = K.reshape(K.dot(reshaped_Ws_plus_Uh, self.V_a), (-1, en_seq_len))
            # <= batch_size, en_seq_len
            e_i = K.softmax(e_i)

            if verbose:
                print('ei>', e_i.shape)

            return e_i, [e_i]

        def context_step(step_input, states):
            """ Step function for computing ci using ei """
            # <= batch_size, hidden_size
            c_i = K.sum(encoder_out_seq * K.expand_dims(step_input, -1), axis=1)
            if verbose:
                print('ci>', c_i.shape)
            return c_i, [c_i]

        def create_initial_state(reference, hidden_size):
            # We are not using initial states, but need to pass something to the K.rnn function.
            # Build an all-zero tensor of shape (batch_size, hidden_size) from `reference`.
            fake_state = K.zeros_like(reference)  # <= (batch_size, enc_seq_len, latent_dim)
            fake_state = K.sum(fake_state, axis=[1, 2])  # <= (batch_size)
            fake_state = K.expand_dims(fake_state)  # <= (batch_size, 1)
            fake_state = K.tile(fake_state, [1, hidden_size])  # <= (batch_size, hidden_size)
            return fake_state

        fake_state_c = create_initial_state(encoder_out_seq, encoder_out_seq.shape[-1])  # <= (batch_size, latent_dim)
        fake_state_e = create_initial_state(encoder_out_seq, encoder_out_seq.shape[1])  # <= (batch_size, enc_seq_len)

        # Computing energy outputs
        # e_outputs => (batch_size, de_seq_len, en_seq_len)
        last_out, e_outputs, _ = K.rnn(
            energy_step, decoder_out_seq, [fake_state_e],
        )

        # Computing context vectors
        # c_outputs => (batch_size, de_seq_len, en_hidden)
        last_out, c_outputs, _ = K.rnn(
            context_step, e_outputs, [fake_state_c],
        )

        return c_outputs, e_outputs

    def compute_output_shape(self, input_shape):
        """ Outputs produced by the layer """
        return [
            # Context vectors are weighted sums of encoder outputs, so their last
            # dimension is the encoder hidden size (input_shape[0][2]).
            tf.TensorShape((input_shape[1][0], input_shape[1][1], input_shape[0][2])),
            tf.TensorShape((input_shape[1][0], input_shape[1][1], input_shape[0][1]))
        ]

In [0]:
import numpy as np
import pandas as pd
import re
from bs4 import BeautifulSoup
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
from tensorflow.keras.layers import Input,LSTM,Embedding,Dense,Concatenate,TimeDistributed,Bidirectional
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
import warnings 
pd.set_option("display.max_colwidth",200)
warnings.filterwarnings("ignore")


Using TensorFlow backend.


In [0]:
# Read only the first 200000 rows of the reviews file rather than the full dataset.
data = pd.read_csv(
    "../input/amazon-fine-food-reviews/Reviews.csv",
    nrows=200000,
)


In [0]:
# Preview the first five rows of the raw frame.
data.head(5)

NameError: ignored

In [0]:
# Data preprocessing steps:
# 1) drop duplicates and rows with missing values
# 2) expand contractions
# 3) remove stopwords
# 4) clean the text (remove special characters)
# 5) pad the sequences



In [0]:
# Keep one row per distinct review text, then discard rows with any missing value.
data = data.drop_duplicates(subset=['Text']).dropna(axis=0)

In [0]:
# Dictionary for expanding contractions: maps each contraction to its expanded form.
# Lookups are done per whitespace-separated token in preprocessing(), which lowercases
# the text first, so the capitalised "I..." keys are never matched there.
contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not",

                           "didn't": "did not", "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not",

                           "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",

                           "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would",

                           "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would",

                           "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam",

                           "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have",

                           "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock",

                           "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have",

                           "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is",

                           "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as",

                           "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would",

                           "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have",

                           "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have",

                           "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are",

                           "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",

                           "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is",

                           "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have",

                           "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have",

                           "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all",

                           "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",

                           "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have",

                           "you're": "you are", "you've": "you have"}

In [0]:
# NLTK's English stopword list, held as a set for fast membership tests.
english_stopword_list = stopwords.words('english')
stop_words = set(english_stopword_list)

In [0]:
# Matches @mentions and URLs. Written as a raw string so "\S" is a regex escape
# rather than an invalid Python string escape. The previous third alternative
# ("http?:\S\[^A-Za-z0-9]+") was garbled and could only match a literal "[^A-Za-z0-9]"
# sequence, so it has been dropped.
TEXT_CLEANING_RE = r"@\S+|https?:\S+"


def preprocessing(text):
    """Normalise one raw review or summary string for tokenisation.

    Steps, in order: lowercase, strip HTML/XML markup, remove mentions/URLs,
    remove parenthesised spans and double quotes, expand contractions, remove
    possessive 's, replace every non-letter with a space, drop stopwords, and
    drop words shorter than three characters.

    text: the raw string (must support ``.lower()``).
    Returns the surviving words joined by single spaces ('' if none survive).
    """
    newstring = text.lower()
    newstring = BeautifulSoup(newstring, 'lxml').text  # to get text from html and xml pages
    newstring = re.sub(TEXT_CLEANING_RE, '', newstring)
    newstring = re.sub(r'\([^)]*\)', '', newstring)
    newstring = re.sub('"', '', newstring)

    # Expand contractions while the apostrophes are still present. Doing this
    # after the non-letter substitution below would turn "don't" into "don t"
    # and no key of contraction_mapping could ever match.
    newstring = ' '.join(contraction_mapping.get(token, token) for token in newstring.split())

    newstring = re.sub(r"'s\b", "", newstring)
    newstring = re.sub("[^a-zA-Z]", " ", newstring)

    # Drop stopwords and very short words in a single pass.
    kept_words = [word for word in newstring.split()
                  if word not in stop_words and len(word) >= 3]

    return " ".join(kept_words)



            
    
    

In [0]:
# Build a Series holding the cleaned version of every review text.

cleaned_text = data["Text"].apply(preprocessing)

In [0]:
# Preview the first five cleaned reviews.
cleaned_text.head(5)

In [0]:
# Clean the summaries with the same routine used for the review text.
cleaned_summary = data["Summary"].apply(preprocessing)

In [0]:
# Summaries that are empty after cleaning are marked as missing and dropped.
# Leaving them in place would let the later str() conversion turn each of them
# into the literal summary "nan". The reviews are re-selected by the surviving
# index labels so text and summary stay row-aligned.
cleaned_summary = cleaned_summary.replace('', np.nan).dropna()
cleaned_text = cleaned_text.loc[cleaned_summary.index]

In [0]:
# Preview the first five cleaned summaries.
cleaned_summary.head(5)
# np.dtype(cleaned_summary)

In [0]:
# Preview the first five raw summaries for comparison with the cleaned ones.
data["Summary"].head(5)

In [0]:
#defining function to add start and end word in summary

def addstartend(text):
    """Wrap a summary in the '_START_' and '_END_' sentinel tokens.

    The sentinels are separated from the summary by single spaces so that
    whitespace splitting (as used for the word-count statistics) treats them as
    tokens of their own instead of gluing them onto the first and last word.

    text: the summary; converted with ``str()`` before wrapping.
    Returns the string '_START_ <text> _END_'.
    """
    return '_START_ ' + str(text) + ' _END_'

In [0]:
# Add the start/end sentinels to every cleaned summary.
cleaned_summary = cleaned_summary.apply(addstartend)

In [0]:
# Preview the first five summaries after the sentinels were added.
cleaned_summary.head(5)

In [0]:
#counting number of words to get padding
import matplotlib.pyplot as plt
# Whitespace-separated word counts per review and per summary.
text_word_count = [len(review.split()) for review in cleaned_text]
summary_word_count = [len(summary.split()) for summary in cleaned_summary]

# Histogram of both length distributions, used to choose the padding lengths.
length_df = pd.DataFrame({'text': text_word_count, 'summary': summary_word_count})
length_df.hist(bins=20)
plt.show()

NameError: ignored

In [0]:
# Sequence lengths (in tokens) that reviews and summaries are padded/truncated to.
max_len_text = 80
max_len_summary = 10

In [0]:
#splitting data for train and validation
from sklearn.model_selection import train_test_split
# Hold out 10% of the (review, summary) pairs for validation, with a fixed seed.
x_tr, x_val, y_tr, y_val = train_test_split(
    cleaned_text,
    cleaned_summary,
    test_size=.1,
    random_state=0,
    shuffle=True,
)

In [0]:
# Review tokenizer: the vocabulary is fitted on the training reviews only.
x_tokenizer = Tokenizer()
x_tokenizer.fit_on_texts(x_tr)

# Map each review to its integer sequence, then zero-pad at the end up to max_len_text.
x_tr = pad_sequences(
    x_tokenizer.texts_to_sequences(x_tr),
    maxlen=max_len_text,
    padding='post',
)
x_val = pad_sequences(
    x_tokenizer.texts_to_sequences(x_val),
    maxlen=max_len_text,
    padding='post',
)

# One more than the number of known words.
x_voc_size = len(x_tokenizer.word_index) + 1

In [0]:
# Summary tokenizer: the vocabulary is fitted on the training summaries only.
y_tokenizer = Tokenizer()
y_tokenizer.fit_on_texts(list(y_tr))

# Convert summary sequences into integer sequences.
y_tr    =   y_tokenizer.texts_to_sequences(y_tr)
y_val   =   y_tokenizer.texts_to_sequences(y_val)

# Zero-pad at the end up to max_len_summary. Over-length summaries are also cut
# at the END (truncating='post'): the default truncating='pre' would cut off the
# leading start token, while decoding always begins from that token.
y_tr    =   pad_sequences(y_tr, maxlen=max_len_summary, padding='post', truncating='post')
y_val   =   pad_sequences(y_val, maxlen=max_len_summary, padding='post', truncating='post')

# One more than the number of known words.
y_voc_size  =   len(y_tokenizer.word_index) +1

In [0]:
from tensorflow.keras import backend as K
# Clear the Keras backend session before the model is built.
K.clear_session()

# Size used for both embeddings and all LSTM layers.
latent_dim = 500

In [0]:
# Define the layers one by one, then wire them into the training model.

# ---- Encoder ----
encoder_inputs = Input(shape=(max_len_text,))

# Embedding of the review tokens
enc_emb = Embedding(input_dim=x_voc_size, output_dim=latent_dim, trainable=True)(encoder_inputs)

# First encoder LSTM
encoder_lstm1 = LSTM(units=latent_dim, return_sequences=True, return_state=True)
encoder_output1, state_h1, state_c1 = encoder_lstm1(enc_emb)

# Second encoder LSTM; its sequence output and final states feed the decoder
encoder_lstm2 = LSTM(units=latent_dim, return_sequences=True, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm2(encoder_output1)

# ---- Decoder ----
decoder_inputs = Input(shape=(None,))

# Embedding of the summary tokens (the layer object is kept for reuse)
dec_emb_layer = Embedding(input_dim=y_voc_size, output_dim=latent_dim, trainable=True)
dec_emb = dec_emb_layer(decoder_inputs)

# Decoder LSTM, initialised with the final states of the second encoder LSTM
decoder_lstm = LSTM(units=latent_dim, return_sequences=True, return_state=True)
decoder_outputs, decoder_fwd_state, decoder_back_state = decoder_lstm(
    dec_emb,
    initial_state=[state_h, state_c],
)

# Attention over the encoder outputs for every decoder step
attn_layer = AttentionLayer(name='attention_layer')
attn_out, attn_state = attn_layer([encoder_outputs, decoder_outputs])

# Concatenate the attention output with the decoder output
decoder_concat_input = Concatenate(axis=-1, name='concat_layer')([decoder_outputs, attn_out])

# Softmax over the summary vocabulary, applied at every time step
decoder_dense = TimeDistributed(Dense(units=y_voc_size, activation='softmax'))
decoder_outputs = decoder_dense(decoder_concat_input)

# Assemble the training model
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)

model.summary()

In [0]:
#to create picture of model
from tensorflow.keras.utils import plot_model
# Render the model architecture to model.png.
plot_model(model=model, to_file='model.png')

In [0]:
# Sparse categorical cross-entropy is used with integer token ids as targets.
model.compile(
    optimizer='rmsprop',
    loss='sparse_categorical_crossentropy',
)

In [0]:
# Early-stopping callback that watches the validation loss (lower is better).
es = EarlyStopping(
    monitor='val_loss',
    mode='min',
    verbose=1,
)

# es = EarlyStopping(monitor='val_acc', min_delta=.001, patience=5,mode="max")

In [0]:

# Train the model.
# ModelCheckpoint is imported from tensorflow.keras so it matches the
# tensorflow.keras model and the EarlyStopping callback used above.
from tensorflow.keras.callbacks import ModelCheckpoint

# Save the model to disk whenever the monitored quantity improves.
modelcheckpoint = ModelCheckpoint("/kaggle/working/bestmodel1.h5", save_best_only=True, verbose=1)

# Teacher forcing: the decoder input is the summary without its last token and
# the target is the same summary shifted left by one step, with a trailing axis
# of size 1 added by the reshape.
history = model.fit(
    [x_tr, y_tr[:, :-1]],
    y_tr.reshape(y_tr.shape[0], y_tr.shape[1], 1)[:, 1:],
    epochs=20,
    # The checkpoint was previously created but never passed to fit(), so no
    # best-model file was ever written.
    callbacks=[es, modelcheckpoint],
    batch_size=512,
    validation_data=(
        [x_val, y_val[:, :-1]],
        y_val.reshape(y_val.shape[0], y_val.shape[1], 1)[:, 1:],
    ),
)


In [0]:
# Save the trained model to the Kaggle working directory.
model.save(filepath='/kaggle/working/modelf.h5')

In [0]:

from matplotlib import pyplot 
# Training and validation loss per epoch. The legend is required for the curve
# labels to appear on the figure.
pyplot.plot(history.history['loss'], label='train')
pyplot.plot(history.history['val_loss'], label='test')
pyplot.legend()
pyplot.show()

In [0]:
# Lookup tables taken from the fitted tokenizers.
reverse_target_word_index = y_tokenizer.index_word   # summary: id -> word
reverse_source_word_index = x_tokenizer.index_word   # review:  id -> word
target_word_index = y_tokenizer.word_index           # summary: word -> id

In [0]:
# Inference models, built from the layers that were trained above.

# Encoder: review -> (encoder output sequence, final h state, final c state)
encoder_model = Model(inputs=encoder_inputs, outputs=[encoder_outputs, state_h, state_c])

# Decoder inputs: previous h/c states plus the encoder output for the review
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_hidden_state_input = Input(shape=(max_len_text, latent_dim))  # encoder output for each text

# Embed the decoder input with the trained embedding layer
dec_emb2 = dec_emb_layer(decoder_inputs)

# Run the trained decoder LSTM from the supplied states to predict the next word
decoder_outputs2, state_h2, state_c2 = decoder_lstm(
    dec_emb2,
    initial_state=[decoder_state_input_h, decoder_state_input_c],
)

# Attention at inference time, followed by concatenation with the decoder output
attn_out_inf, attn_states_inf = attn_layer([decoder_hidden_state_input, decoder_outputs2])
decoder_inf_concat = Concatenate(axis=-1, name='concat')([decoder_outputs2, attn_out_inf])

# Trained softmax layer producing the probability distribution over the vocabulary
decoder_outputs2 = decoder_dense(decoder_inf_concat)

# Final decoder model: returns the token distribution and the updated states
decoder_model = Model(
    [decoder_inputs] + [decoder_hidden_state_input, decoder_state_input_h, decoder_state_input_c],
    [decoder_outputs2] + [state_h2, state_c2],
)


In [0]:
# Show which summary word is mapped to id 2.
reverse_target_word_index[2]

In [0]:
#defing function for inference
# updating the  input of decoder and h and c 

def decode_sequence(input_seq):
    """Greedily decode a summary for a single encoded review.

    input_seq: the padded review handed to ``encoder_model.predict``.
    Returns the predicted words, each preceded by one space (so a non-empty
    result starts with a space), or '' if the first prediction ends decoding.

    Decoding stops when the 'end' token is predicted, when an id with no word
    in ``reverse_target_word_index`` is predicted, or once
    ``max_len_summary - 1`` words have been produced.
    """
    # Encode the input once; the decoder then runs one token at a time.
    e_out, e_h, e_c = encoder_model.predict(input_seq)

    # Seed the decoder with the start token.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = target_word_index['start']

    decoded_words = []
    while True:
        output_tokens, h, c = decoder_model.predict([target_seq] + [e_out, e_h, e_c])

        # Greedy choice: the most probable token at the last time step.
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        # Padding uses id 0, which is not expected to have a word in the
        # id -> word map. A plain [] lookup raised KeyError in that case, so
        # any unmapped id is now treated as the end of the summary.
        sampled_token = reverse_target_word_index.get(sampled_token_index, 'end')

        if sampled_token != 'end':
            decoded_words.append(sampled_token)

        # Exit condition: end token reached or maximum summary length hit.
        if sampled_token == 'end' or len(decoded_words) >= (max_len_summary - 1):
            break

        # Feed the predicted word and the updated states back into the decoder.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        e_h, e_c = h, c

    return ''.join(' ' + word for word in decoded_words)

In [0]:
#now lets make a function to convert all the input text ,summary from int to text

def seq2summary(input_seq):
    """Convert a summary id sequence to text, skipping padding (0) and the start/end ids.

    Every emitted word is followed by a single space.
    """
    return ''.join(
        reverse_target_word_index[token_id] + ' '
        for token_id in input_seq
        if not (token_id == 0
                or token_id == target_word_index['start']
                or token_id == target_word_index['end'])
    )

def seq2text(input_seq):
    """Convert a review id sequence to text, skipping padding (0).

    Every emitted word is followed by a single space.
    """
    return ''.join(
        reverse_source_word_index[token_id] + ' '
        for token_id in input_seq
        if not token_id == 0
    )


In [0]:
# Persist the decoder inference model: architecture as JSON, weights in a separate file.
model_json = decoder_model.to_json()
with open("decoder_model.json", mode="w") as json_file:
    json_file.write(model_json)

decoder_model.save_weights(filepath="decoder_modelf.h5")

In [0]:
# Persist the encoder inference model: architecture as JSON, weights in a separate file.
model_json = encoder_model.to_json()
with open("encoder_model.json", mode="w") as json_file:
    json_file.write(model_json)

encoder_model.save_weights(filepath="encoder_modelf.h5")

In [0]:

# Show each validation review with its reference summary and the model's prediction.
# All three lines now come from the same validation row. Previously the review
# and reference summary were read from the training arrays while the prediction
# was made on x_val[i], and the loop ran over len(x_tr), which is longer than x_val.
for i in range(len(x_val)):
    print("Review:", seq2text(x_val[i]))
    print("Original summary:", seq2summary(y_val[i]))
    print("Predicted summary:", decode_sequence(x_val[i].reshape(1, max_len_text)))
    print("\n")