# **Mount Google Drive**

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
ls

[0m[01;34mgdrive[0m/  [01;34msample_data[0m/


In [3]:
%cd /content/gdrive/MyDrive/testzzz/data

/content/gdrive/MyDrive/testzzz/data


In [4]:
ls

glove.6B.100d.txt  glove.6B.300d.txt  glove.6B.zip
glove.6B.200d.txt  glove.6B.50d.txt   Reviews.csv


# **Import necessary libraries for text summarisation**

In [5]:
import numpy as np
import pandas as pd
import re
from bs4 import BeautifulSoup
from keras.preprocessing.text import Tokenizer
#from attention import AttentionLayer
from keras.preprocessing.sequence import pad_sequences
#from nltk.corpus import stopwords
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate, TimeDistributed
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
import warnings
pd.set_option("display.max_colwidth", 200)
warnings.filterwarnings("ignore")

In [6]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
data = pd.read_csv("/content/gdrive/MyDrive/testzzz/data/Reviews.csv")

# **Clean dataset by dropping and removing duplicated values, and rows**

In [8]:
data.drop_duplicates(subset=['Text'],inplace=True)
data.dropna(axis=0,inplace=True)

In [9]:
data.replace('', np.nan, inplace=True)
data.dropna(axis=0,inplace=True)

# **Text Pre-processing**

In [10]:
contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not",
                           "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not",
                           "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",
                           "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would",
                           "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would",
                           "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam",
                           "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have",
                           "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock",
                           "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have",
                           "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is",
                           "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as",
                           "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would",
                           "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have",
                           "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have",
                           "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are",
                           "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",
                           "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is",
                           "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have",
                           "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have",
                           "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all",
                           "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",
                           "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have",
                           "you're": "you are", "you've": "you have"}

Preprocessing Steps

In [11]:
import re
import nltk
from nltk.corpus import stopwords
from bs4 import BeautifulSoup

# Ensure that NLTK stopwords are downloaded
nltk.download('stopwords')

# Define contraction_mapping if not already defined in your code

stop_words = set(stopwords.words('english'))

def text_cleaner(text,num):
    # lower
    newString = text.lower()
    # remove HTML
    newString = BeautifulSoup(newString, "lxml").text
    # Remove any text inside the parenthesis
    newString = re.sub(r'\([^)]*\)', '', newString)
    # remove double quotes
    newString = re.sub('"','', newString)
    # contraction mapping
    newString = ' '.join([contraction_mapping[t] if t in contraction_mapping else t for t in newString.split(" ")])
    # remove 's
    newString = re.sub(r"'s\b","",newString)
    # Eliminate punctuations and special characters
    newString = re.sub("[^a-zA-Z]", " ", newString)
    # Remove stopwords
    if(num==0):
        tokens = [w for w in newString.split() if not w in stop_words]
    else:
        tokens=newString.split()
    long_words=[]
    # Remove short words
    for i in tokens:
        if len(i)>1:
            long_words.append(i)
    return (" ".join(long_words)).strip()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
# Cleaning the "Text" Column

cleaned_text = []
for t in data['Text']:
    cleaned_text.append(text_cleaner(t,0))

In [13]:
cleaned_text[:2]

['bought several vitality canned dog food products found good quality product looks like stew processed meat smells better labrador finicky appreciates product better',
 'product arrived labeled jumbo salted peanuts peanuts actually small sized unsalted sure error vendor intended represent product jumbo']

In [14]:
# Cleaning the "Summary" Column

cleaned_summary = []
for t in data['Summary']:
    cleaned_summary.append(text_cleaner(t,1))

In [15]:
cleaned_summary[:2]

['good quality dog food', 'not as advertised']

In [16]:
data['cleaned_text']=cleaned_text
data['cleaned_summary']=cleaned_summary

In [17]:
#Drop empty rows
data.replace('', np.nan, inplace=True)
data.dropna(axis=0,inplace=True)

In [18]:
max_text_len=50
max_summary_len=10

In [19]:
cleaned_text =np.array(data['cleaned_text'])
cleaned_summary=np.array(data['cleaned_summary'])

short_text=[]
short_summary=[]

for i in range(len(cleaned_text)):
    if(len(cleaned_summary[i].split())<=max_summary_len and len(cleaned_text[i].split())<=max_text_len):
        short_text.append(cleaned_text[i])
        short_summary.append(cleaned_summary[i])

df=pd.DataFrame({'text':short_text,'summary':short_summary}) # new dataframe to use

In [20]:
df['summary'] = df['summary'].apply(lambda x : 'sostok '+ x + ' eostok')
df.head()

Unnamed: 0,text,summary
0,bought several vitality canned dog food products found good quality product looks like stew processed meat smells better labrador finicky appreciates product better,sostok good quality dog food eostok
1,product arrived labeled jumbo salted peanuts peanuts actually small sized unsalted sure error vendor intended represent product jumbo,sostok not as advertised eostok
2,confection around centuries light pillowy citrus gelatin nuts case filberts cut tiny squares liberally coated powdered sugar tiny mouthful heaven chewy flavorful highly recommend yummy treat famil...,sostok delight says it all eostok
3,looking secret ingredient robitussin believe found got addition root beer extract ordered made cherry soda flavor medicinal,sostok cough medicine eostok
4,great taffy great price wide assortment yummy taffy delivery quick taffy lover deal,sostok great taffy eostok


In [21]:
from sklearn.model_selection import train_test_split

x_tr,x_val,y_tr,y_val=train_test_split(np.array(df['text']), np.array(df['summary']),
                                       test_size=0.1, random_state=0, shuffle=True)

# A tokenizer builds the vocabulary and converts a word sequence to an integer sequence.
# Build tokenizers for text and summary.

x_tokenizer_fixed = Tokenizer()
x_tokenizer_fixed.fit_on_texts(list(x_tr))

In [22]:
# Tokenization with Max Vocabulary Size
from keras.preprocessing.sequence import pad_sequences

# Threshold for rare words
thresh = 5

# Count and frequency calculation for rare words
cnt = 0
tot_cnt = 0
freq = 0
tot_freq = 0

for key, value in x_tokenizer_fixed.word_counts.items():
    tot_cnt = tot_cnt + 1
    tot_freq = tot_freq + value
    if value < thresh:
        cnt = cnt + 1
        freq = freq + value
# Tokenizer initialization with fixed vocabulary size
max_vocab_size = tot_cnt  #adjust as needed
x_tokenizer_fixed = Tokenizer(num_words=max_vocab_size)
x_tokenizer_fixed.fit_on_texts(list(x_tr))

# Tokenize text sequences into integer sequences
x_tr_seq_fixed = x_tokenizer_fixed.texts_to_sequences(x_tr)
x_val_seq_fixed = x_tokenizer_fixed.texts_to_sequences(x_val)

# Padding sequences
x_tr_fixed = pad_sequences(x_tr_seq_fixed, maxlen=max_text_len, padding='post', truncating='post')
x_val_fixed = pad_sequences(x_val_seq_fixed, maxlen=max_text_len, padding='post', truncating='post')

# Size of vocabulary (+1 for padding token)
x_voc_fixed = max_vocab_size + 1


In [23]:
#prepare a tokenizer for reviews on training data

y_tokenizer_fixed = Tokenizer()
y_tokenizer_fixed.fit_on_texts(list(y_tr))

In [24]:
#  Tokenization with Max Vocab Size
# Tokenization with Max Vocabulary Size
from keras.preprocessing.sequence import pad_sequences

# Threshold for rare words
thresh = 5

# Count and frequency calculation for rare words
cnt = 0
tot_cnt = 0
freq = 0
tot_freq = 0

for key, value in y_tokenizer_fixed.word_counts.items():
    tot_cnt = tot_cnt + 1
    tot_freq = tot_freq + value
    if value < thresh:
        cnt = cnt + 1
        freq = freq + value

# 初始化目标序列的 Tokenizer，并设置合适的词汇表大小
max_vocab_size = tot_cnt  # Example value, adjust as needed
y_tokenizer_fixed = Tokenizer(num_words=max_vocab_size + 1)  # 注意这里的 num_words 参数要加1
y_tokenizer_fixed.fit_on_texts(list(y_tr))

# Tokenize text sequences into integer sequences
y_tr_seq_fixed = y_tokenizer_fixed.texts_to_sequences(y_tr)
y_val_seq_fixed = y_tokenizer_fixed.texts_to_sequences(y_val)

# Padding sequences
y_tr_fixed = pad_sequences(y_tr_seq_fixed, maxlen=max_text_len, padding='post', truncating='post')
y_val_fixed = pad_sequences(y_val_seq_fixed, maxlen=max_text_len, padding='post', truncating='post')

# Size of vocabulary (+1 for padding token)
y_voc_fixed = max_vocab_size + 1


In [25]:
# !pip install wget
# import wget
# import zipfile
# import os

# url = 'http://nlp.stanford.edu/data/glove.6B.zip'
# target_path = 'glove.6B.zip'

# if not os.path.exists(target_path):
#     wget.download(url, target_path)

# with zipfile.ZipFile(target_path, 'r') as zip_ref:
#     zip_ref.extractall('.')

# print("done")


In [26]:
from keras.layers import Attention, Concatenate, Dense, Embedding, LSTM, Input, TimeDistributed
from keras.models import Model
import numpy as np

# Define the input shapes and dimensions
max_text_len = 50
max_summary_len = 10
latent_dim = 300
embedding_dim = 100

# Load GloVe embeddings
def load_glove_embeddings(embedding_file):
    embedding_index = {}
    with open(embedding_file, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embedding_index[word] = coefs
    return embedding_index

glove_embedding_file = 'glove.6B.100d.txt'
glove_embeddings_index = load_glove_embeddings(glove_embedding_file)

# Function to generate GloVe embedding matrix
def generate_embedding_matrix(vocab_size, embedding_dim, word_index, embeddings_index):
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for word, i in word_index.items():
        if i < vocab_size:
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
    return embedding_matrix

# Generate GloVe embedding matrices for encoder and decoder
enc_embedding_matrix = generate_embedding_matrix(x_voc_fixed, embedding_dim, x_tokenizer_fixed.word_index, glove_embeddings_index)
dec_embedding_matrix = generate_embedding_matrix(y_voc_fixed, embedding_dim, y_tokenizer_fixed.word_index, glove_embeddings_index)

# Define Encoder
encoder_inputs = Input(shape=(max_text_len,))
enc_emb = Embedding(input_dim=x_voc_fixed, output_dim=embedding_dim, weights=[enc_embedding_matrix], input_length=max_text_len, trainable=True)(encoder_inputs)
encoder_lstm1 = LSTM(latent_dim, return_sequences=True, return_state=True, dropout=0.4, recurrent_dropout=0.4)
encoder_output1, state_h1, state_c1 = encoder_lstm1(enc_emb)
encoder_lstm2 = LSTM(latent_dim, return_sequences=True, return_state=True, dropout=0.4, recurrent_dropout=0.4)
encoder_output2, state_h2, state_c2 = encoder_lstm2(encoder_output1)
encoder_lstm3 = LSTM(latent_dim, return_state=True, return_sequences=True, dropout=0.4, recurrent_dropout=0.4)
encoder_outputs, state_h, state_c = encoder_lstm3(encoder_output2)

# Define Decoder
decoder_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(input_dim=y_voc_fixed, output_dim=embedding_dim, weights=[dec_embedding_matrix], input_length=max_summary_len, trainable=True)
dec_emb = dec_emb_layer(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True, dropout=0.4, recurrent_dropout=0.2)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=[state_h, state_c])

# Attention Layer
attn_layer = Attention()
attn_out = attn_layer([encoder_outputs, decoder_outputs])

# Concatenate attention input and decoder LSTM output
decoder_concat_input = Concatenate(axis=-1)([decoder_outputs, attn_out])

# Dense layer
decoder_dense = TimeDistributed(Dense(y_voc_fixed, activation='softmax'))
decoder_outputs = decoder_dense(decoder_concat_input)

# Define the model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Print model summary
model.summary()





Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 50)]                 0         []                            
                                                                                                  
 embedding (Embedding)       (None, 50, 100)              7631900   ['input_1[0][0]']             
                                                                                                  
 lstm (LSTM)                 [(None, 50, 300),            481200    ['embedding[0][0]']           
                              (None, 300),                                                        
                              (None, 300)]                                                        
                                                                                              

In [27]:
#Convert integer sequence to one-hot vector
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])


In [28]:
#Stop training as validation loss increases
early_stopping = EarlyStopping(monitor='val_loss', mode='min', verbose=1,patience=2)

In [29]:
print(y_tr_fixed.shape)
print(x_tr_fixed.shape)
print(y_val_fixed.shape)
print(x_val_fixed.shape)
print(x_voc_fixed)
print(y_voc_fixed)
print(max_vocab_size)

(273746, 50)
(273746, 50)
(30417, 50)
(30417, 50)
76319
25731
25730


In [None]:
history = model.fit(
    [x_tr_fixed, y_tr_fixed],
    y_tr_fixed,
    epochs=5,
    callbacks=[early_stopping],
    batch_size=8,
    validation_data=(
        [x_val_fixed, y_val_fixed],
        y_val_fixed
    )
)

Epoch 1/5
  402/34219 [..............................] - ETA: 6:50:03 - loss: 0.6084 - accuracy: 0.9251