# ECSE 484 Final project - Text summarizer

<i>Ruthvik thanda - rxt309</i>

This project uses the Amazon Fine Food Reviews dataset.

Source : https://www.kaggle.com/snap/amazon-fine-food-reviews

Importing libraries

In [479]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords

# English stop-word list taken from NLTK's `stopwords` corpus.
# NOTE(review): `stop_words` is not referenced by any other cell visible in this notebook.
stop_words = stopwords.words('english')

In [480]:
# Load the raw reviews; the path is relative to the current working directory.
data = pd.read_csv('Reviews.csv')

In [481]:
# Preview the first rows to see which columns are available.
data.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [482]:
# Keep only the two columns the model uses. The explicit .copy() makes `data`
# an independent frame, so adding columns below does not raise pandas'
# SettingWithCopyWarning.
data = data[['Summary', 'Text']].copy()
# Number of spaces in each review, used as a cheap proxy for its word count.
data['revLength'] = data['Text'].str.count(' ')
data['revLength'].describe()

count    568454.000000
mean         81.005522
std          80.807102
min           2.000000
25%          33.000000
50%          57.000000
75%          99.000000
max        3525.000000
Name: revLength, dtype: float64

In [483]:
# Number of spaces in each summary (proxy for its word count).
summarySpaces = data['Summary'].str.count(' ')
data['sumLen'] = summarySpaces
data['sumLen'].describe()

count    568427.000000
mean          3.128462
std           2.619420
min           0.000000
25%           1.000000
50%           3.000000
75%           4.000000
max          41.000000
Name: sumLen, dtype: float64

Based on the summary statistics above, keep only short reviews (fewer than 30 spaces) that have short summaries (fewer than 8 spaces), so that the model is faster to train and its output is easier to verify.

In [484]:
# Keep rows whose summary has fewer than 8 spaces AND whose review has fewer
# than 30 spaces. Rows with a missing value fail the comparison and are dropped.
shortRows = (data['sumLen'] < 8) & (data['revLength'] < 30)
# .copy() detaches the filtered rows from the original frame so that columns
# added afterwards do not raise SettingWithCopyWarning.
data = data.loc[shortRows].copy()

Preprocessing: lowercase the text and remove punctuation

In [485]:
# Lowercase the review text.
data['textLowercase'] = data['Text'].str.lower()
# Drop every character that is neither a word character nor whitespace.
# regex=True is passed explicitly because the pandas default for `regex`
# changed to False, which would treat the pattern as a literal string and
# leave punctuation in place. The raw string avoids invalid-escape warnings.
data['textNopunc'] = data['textLowercase'].str.replace(r'[^\w\s]', '', regex=True)

In [486]:
# Lowercase the summary.
data['summaryLowercase'] = data["Summary"].str.lower()
# Strip punctuation (explicit regex=True: the pandas default is version
# dependent and a literal match would leave punctuation untouched), then wrap
# each summary in the `_begin_` / `_end_` markers used by the decoder.
summaryClean = data['summaryLowercase'].str.replace(r'[^\w\s]', '', regex=True)
data['summaryNopunc'] = '_begin_' + ' ' + summaryClean + ' ' + '_end_'

Creating text and summary tokens

In [487]:
textFeature = 5000   # `num_words` passed to the review tokenizer
textLen = 30         # every review sequence is padded/truncated to this length

# Materialise the cleaned reviews once; the same list is used for fitting and encoding.
reviewTexts = list(data['textNopunc'].astype(str))

textTokens = tf.keras.preprocessing.text.Tokenizer(num_words=textFeature)
textTokens.fit_on_texts(reviewTexts)

# Convert to integer id sequences, then pad to a fixed length
# (pad_sequences defaults are used, i.e. no explicit `padding` argument).
textSeq = tf.keras.preprocessing.sequence.pad_sequences(
    textTokens.texts_to_sequences(reviewTexts),
    maxlen=textLen,
)



In [488]:
summaryFeature = 5000   # `num_words` passed to the summary tokenizer
summaryLen = 8          # every summary sequence is padded/truncated to this length

# Materialise the marked-up summaries once for both fitting and encoding.
summaryTexts = list(data['summaryNopunc'].astype(str))

# Only '*' is filtered, so the underscores in `_begin_` / `_end_` are kept.
summaryTokens = tf.keras.preprocessing.text.Tokenizer(num_words=summaryFeature, filters='*')
summaryTokens.fit_on_texts(summaryTexts)

# Convert to integer id sequences and pad at the end ('post').
summarySeq = tf.keras.preprocessing.sequence.pad_sequences(
    summaryTokens.texts_to_sequences(summaryTexts),
    maxlen=summaryLen,
    padding='post',
)

In [489]:
# Encoder side: the padded review sequences are fed in as-is.
textVector = textSeq
encoderData = textVector
length = encoderData.shape[1]   # number of time steps per review

# Decoder side: the input drops the last token and the target drops the first,
# so position t of the input is paired with token t+1 of the same summary.
summaryVector = summarySeq
decoderTarget = summaryVector[:, 1:]
decoderData = summaryVector[:, :-1]

In [490]:
# Vocabulary sizes for the Embedding layers: one slot per known word plus one
# extra slot for id 0.
encoderVocab = 1 + len(textTokens.word_index)
decoderVocab = 1 + len(summaryTokens.word_index)

In [491]:
# Shared width: output dimension of both Embedding layers and unit count of both GRUs.
dim = 50

Create encoder and decoder with TF

In [492]:
# --- Encoder ---------------------------------------------------------------
# Review ids -> embedding -> batch norm -> GRU; only the final GRU state is kept.
encoderInp = tf.keras.Input(shape=(length,), name='Encoder')
encEmbedded = tf.keras.layers.Embedding(encoderVocab, dim, name='BWEmbed', mask_zero=False)(encoderInp)
encNormed = tf.keras.layers.BatchNormalization(name='encoderBN1')(encEmbedded)

_, hidden_state = tf.keras.layers.GRU(dim, return_state=True, name='EGRU')(encNormed)

# Stand-alone encoder model; it is reused at inference time to encode new text.
eModel = tf.keras.Model(inputs=encoderInp, outputs=hidden_state, name='EModel')
eOut = eModel(encoderInp)

# --- Decoder ---------------------------------------------------------------
# Variable-length summary ids -> embedding -> batch norm -> GRU seeded with the
# encoder state -> batch norm.
decoderInp = tf.keras.Input(shape=(None,), name='Decoder')
dEmbedd = tf.keras.layers.Embedding(decoderVocab, dim, name='DEmbedding', mask_zero=False)(decoderInp)

dBN = tf.keras.layers.BatchNormalization(name='decoderBN1')(dEmbedd)
dGru = tf.keras.layers.GRU(dim, return_state=True, return_sequences=True, name='Decoder-GRU')
dGruOp, _ = dGru(dBN, initial_state=eOut)
decNormed = tf.keras.layers.BatchNormalization(name='decoderBN2')(dGruOp)

# --- Output layer ----------------------------------------------------------
# Per-time-step softmax over the summary vocabulary.
dense = tf.keras.layers.Dense(decoderVocab, activation='softmax', name='OP')
dOP = dense(decNormed)

# --- Seq2Seq training model -------------------------------------------------
seq2seq = tf.keras.Model([encoderInp, decoderInp], dOP)
# `learning_rate` replaces the deprecated `lr` alias, which newer Keras
# optimizers no longer accept.
seq2seq.compile(
    optimizer=tf.keras.optimizers.Nadam(learning_rate=0.001),
    loss='sparse_categorical_crossentropy',
)

In [493]:

# Print the layers, output shapes and parameter counts of the training model.
seq2seq.summary()


Model: "functional_71"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Decoder (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
DEmbedding (Embedding)          (None, None, 50)     692150      Decoder[0][0]                    
__________________________________________________________________________________________________
Encoder (InputLayer)            [(None, 30)]         0                                            
__________________________________________________________________________________________________
decoderBN1 (BatchNormalization) (None, None, 50)     200         DEmbedding[0][0]                 
______________________________________________________________________________________

In [494]:
batch = 64
epoch_no = 3

# Add a trailing axis to the integer targets: (samples, steps) -> (samples, steps, 1).
fitTargets = np.expand_dims(decoderTarget, -1)

# Train on review/summary pairs; the last 15% of the samples are held out for validation.
mod = seq2seq.fit(
    [encoderData, decoderData],
    fitTargets,
    batch_size=batch,
    epochs=epoch_no,
    validation_split=0.15,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


Testing: place the text you want to summarize in the `test` list

In [532]:
test = ['definitely not worth buying flavored water with a few teaspoons of beans and rice that doesnt taste like normal beans and rice i wont ever buy this again']

# Encode the test text with the tokenizer exactly as it was fitted for training.
# The tokenizer must NOT be re-fitted here: fit_on_texts rebuilds word_index
# from the updated counts, which can change the ids the embeddings were
# trained on.
testTokens = textTokens.texts_to_sequences(test)
testTokens = tf.keras.preprocessing.sequence.pad_sequences(testTokens, maxlen=textLen)
# Final encoder GRU state for the test text; used as the decoder's initial state.
encoding = eModel.predict(testTokens)
dim = seq2seq.get_layer('DEmbedding').output_shape[-1]

# Rebuild the decoder as a stand-alone model from the trained layers, with the
# GRU's initial state exposed as an explicit input so decoding can be run one
# step at a time.
decoderInp = seq2seq.get_layer('Decoder').input
dEmbedd = seq2seq.get_layer('DEmbedding')(decoderInp)
dBN = seq2seq.get_layer('decoderBN1')(dEmbedd)
gruInp = tf.keras.Input(shape=(dim,), name='hidden_state_input')
# Same call convention as in the training graph (initial_state keyword).
gruOut, gruOutState = seq2seq.get_layer('Decoder-GRU')(dBN, initial_state=gruInp)
dBN2 = seq2seq.get_layer('decoderBN2')(gruOut)
dOut = seq2seq.get_layer('OP')(dBN2)
dModel = tf.keras.Model([decoderInp, gruInp], [dOut, gruOutState])
# Extra reference to the encoder state for this test text.
oEncoding = encoding
# First decoder input: the id of the `_begin_` marker, shaped (1, 1).
startState = np.array(summaryTokens.word_index['_begin_']).reshape(1, 1)

Predict one word at a time and append each predicted word to the sentence

In [533]:
sentence = []
# Reverse lookup for the summary vocabulary: token id -> word.
resVocab = {idx: word for word, idx in summaryTokens.word_index.items()}

# Loop-local decoder inputs. `startState` and `encoding` themselves are left
# untouched, so re-running this cell always decodes from the beginning instead
# of continuing from the state left behind by a previous run.
prevToken = startState
hiddenState = encoding

while True:
    # One decoder step: distribution over the vocabulary plus the new GRU state.
    p, s = dModel.predict([prevToken, hiddenState])

    # Greedy choice over ids >= 2. Id 0 is the padding value; id 1 is presumably
    # the `_begin_` marker (TODO confirm against summaryTokens.word_index).
    p_i = int(np.argmax(p[0, -1, 2:])) + 2
    word = resVocab[p_i]
    # Stop at the end marker or once the summary has reached its maximum length.
    if word == '_end_' or len(sentence) >= summaryLen:
        break
    sentence.append(word)

    # Feed the predicted token and the updated state into the next step.
    hiddenState = s
    prevToken = np.array(p_i).reshape(1, 1)




Result

In [534]:
# Concatenate the decoded words, each followed by a single space
# (the trailing space after the last word is kept).
result = "".join(token + " " for token in sentence)

print(result)

not as good as the flavor 
