# FUNCTIONS AND GLOBAL VARIABLES

## Install & Import Libraries

In [25]:
#Python >=3.9, only run for the first time
#%pip install -r requirements.txt

In [26]:
from tensorflow.keras.layers import Attention as AttentionLayer
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate, TimeDistributed
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
import warnings
import nltk
import os
import tensorflow as tf
from keras import backend as K
from sklearn.model_selection import train_test_split
from rouge_score import rouge_scorer
import re
from bs4 import BeautifulSoup as BS
import gdown
import os

## Global Variables

In [27]:
# Padding/truncation length (in tokens) applied to review texts.
MAX_LEN_TXT = 100
# Padding/truncation length (in tokens) applied to summaries.
MAX_LEN_SUM = 20
# Directory holding Reviews.csv and the generated training/validation csv files.
PATH_TO_DATA = './data/'
# Directory holding the saved model weights (.h5) and the GloVe vector files.
PATH_TO_MODELS = './model/'
# Root directory for per-model training checkpoints.
PATH_TO_CHECKPOINT = './ckpt/'
# Number of rows read from Reviews.csv.
SAMPLES = 400000
# When True, English stopwords are removed from the review text (summaries keep them).
STOPWORDS_REMOVAL = True
# Embedding size; 300 selects glove.6B.300d.txt, any other value selects glove.6B.100d.txt.
EMBED_DIM = 300

## Download Models

In [28]:

# Files fetched from Google Drive on first run: (target directory, file name, share link).
# Files that already exist locally are left untouched.
_DOWNLOADS = [
    (PATH_TO_MODELS, '200000_100d_nostop_80_10.h5',
     "https://drive.google.com/file/d/1V0cjoCCDB7JWwta8bnWTAlNSNNtyXy6Q/view?usp=drive_link"),
    (PATH_TO_MODELS, '200000_100d_nostop_100_20.h5',
     "https://drive.google.com/file/d/1cNhTrgYjbKa1J6UBn9tN0U8LRnpIxq0-/view?usp=drive_link"),
    (PATH_TO_MODELS, '400000_300d_fullstop_100_20.h5',
     "https://drive.google.com/file/d/1M4fxm7isYtxcrflOsa_lIQDcs_k8JWvH/view?usp=drive_link"),
    (PATH_TO_MODELS, '400000_300d_nostop_100_20.h5',
     "https://drive.google.com/file/d/1U0vWsb1xuduEIOO6-cALJEfwby9vD3hR/view?usp=drive_link"),
    (PATH_TO_MODELS, 'glove.6B.100d.txt',
     "https://drive.google.com/file/d/1hxQIMX-TjR8eXbjmUEDanPzLkKH2a9ac/view?usp=drive_link"),
    (PATH_TO_MODELS, 'glove.6B.300d.txt',
     "https://drive.google.com/file/d/1Jc7BBiC18uSC7tZxNRn7FLijpBzLL4U9/view?usp=drive_link"),
    (PATH_TO_DATA, 'Reviews.csv',
     "https://drive.google.com/file/d/1Mg6iyiwvrC4YzejDmmSDt4ajv3cLoVLT/view?usp=drive_link"),
]

for _target_dir, _file_name, url in _DOWNLOADS:
    output = "".join([_target_dir, _file_name])
    if os.path.isfile(output):
        continue
    # The destination directory must exist before the download can be written.
    os.makedirs(_target_dir, exist_ok=True)
    gdown.download(url=url, output=output, quiet=False, fuzzy=True)



## Data Preprocessing Functions

### Data Cleaning

In [29]:

# Show up to 200 characters per column when pandas displays a DataFrame.
pd.set_option("display.max_colwidth", 200)
# Suppress every warning for the rest of the session.
warnings.filterwarnings("ignore")

# Make sure the NLTK stopword corpus is available locally.
nltk.download('stopwords')
# English stopwords, as a set for fast membership tests; read by clean_line.
stop_words = set(stopwords.words('english'))


# Lookup table from English contractions to their expanded form; clean_line
# replaces every whitespace-separated token that appears here as a key.
# NOTE: clean_line lower-cases the text before the lookup, so the capitalised
# "I'..." keys are never matched by that caller.
contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not",
                           "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not",
                           "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",
                           "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would",
                           "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would",
                           "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam",
                           "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have",
                           "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock",
                           "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have",
                           "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is",
                           "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as",
                           "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would",
                           "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have",
                           "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have",
                           "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are",
                           "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",
                           "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is",
                           "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have",
                           "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have",
                           "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all",
                           "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",
                           "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have",
                           "you're": "you are", "you've": "you have"}



def clean_line(text,num):
    '''Normalise a single raw text entry from the dataset.

    The text is lower-cased, stripped of HTML markup, parenthesised spans and
    double quotes; contractions are expanded, possessive 's is dropped, every
    non-letter becomes a space and runs of "m" are collapsed to "mm".
    When num == 0 stopwords are removed as well; any other value keeps them.
    Words of a single character are always discarded.

    Returns the remaining words joined by single spaces.
    '''
    lowered = BS(text.lower(), "lxml").text
    without_parens = re.sub(r'\([^)]*\)', '', lowered)
    without_quotes = re.sub('"','', without_parens)
    expanded = ' '.join(contraction_mapping.get(tok, tok) for tok in without_quotes.split(" "))
    expanded = re.sub(r"'s\b","",expanded)
    letters_only = re.sub("[^a-zA-Z]", " ", expanded)
    squeezed = re.sub('[m]{2,}', 'mm', letters_only)

    words = squeezed.split()
    if num == 0:
        words = [w for w in words if w not in stop_words]
    return " ".join(w for w in words if len(w) > 1).strip()

def cleaner(option):
    '''Load the raw reviews and return (cleaned_text, cleaned_sum) as two lists.

    Reads the first SAMPLES rows (columns 8 and 9) of Reviews.csv, drops rows
    with a duplicated 'Text' or any missing value, then cleans both columns
    with clean_line. Stopwords are removed from the text only when
    STOPWORDS_REMOVAL is True; summaries always keep them.

    option: not used by this function.
    '''
    reviews = (
        pd.read_csv("".join([PATH_TO_DATA,"Reviews.csv"]),nrows=SAMPLES,usecols=[8,9])
        .drop_duplicates(subset=['Text'])
        .dropna(axis=0)
        .reset_index(drop=True)
    )

    # clean_line strips stopwords for mode 0 and keeps them for mode 1.
    text_mode = 0 if STOPWORDS_REMOVAL==True else 1
    cleaned_text = [clean_line(entry, text_mode) for entry in reviews['Text']]
    cleaned_sum = [clean_line(entry, 1) for entry in reviews['Summary']]

    return cleaned_text, cleaned_sum

def make_csv(cleaned_text, cleaned_sum):
    '''Write the cleaned texts and summaries to training.csv in PATH_TO_DATA.

    Both columns ('Text', 'Summary') are converted to str before saving and
    the index is not written.
    '''
    frame = pd.DataFrame({'Text': cleaned_text, 'Summary': cleaned_sum})
    for column in ('Text', 'Summary'):
        frame[column] = frame[column].apply(str)

    frame.to_csv("".join([PATH_TO_DATA,'training.csv']),index=False)

def preprocess(option):
    '''Clean the raw dataset and store the result as the training csv.'''
    cleaned_text, cleaned_sum = cleaner(option)
    make_csv(cleaned_text, cleaned_sum)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Embedding

In [30]:
def count_words(count_dict, text):
    '''Add the word frequencies of every sentence in `text` to `count_dict`.

    Sentences are split on whitespace; `count_dict` is updated in place and
    nothing is returned.
    '''
    for sentence in text:
        for token in sentence.split():
            count_dict[token] = count_dict.get(token, 0) + 1

def create_windex(txt_cleaned,summary_cleaned):
    '''Build one word -> frequency dictionary over summaries and texts.

    Summaries are counted first, then texts, so dictionary order follows
    first appearance in that sequence.
    '''
    frequencies = {}
    for corpus in (summary_cleaned, txt_cleaned):
        count_words(frequencies, corpus)
    print("Create Word Index - Done")
    return frequencies

def indexing_ematrix():
    '''Load the GloVe vectors into a word -> float32 vector dictionary.

    glove.6B.300d.txt is read when EMBED_DIM is 300, glove.6B.100d.txt
    otherwise; both are looked up in PATH_TO_MODELS. Each non-empty line is
    split on whitespace: the first field is the word, the rest its
    coefficients.

    Returns:
        dict mapping each word to a numpy float32 array.
    '''
    glove_name = "glove.6B.300d.txt" if EMBED_DIM == 300 else "glove.6B.100d.txt"
    embeddings_index = {}
    # The context manager closes the file even if a malformed line raises.
    with open("".join([PATH_TO_MODELS, glove_name]), encoding='utf-8') as glove_file:
        for line in glove_file:
            values = line.split()
            if not values:
                # Skip blank lines instead of failing on values[0].
                continue
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')
    print("Create Embedding Index - Done")
    return embeddings_index
def count_missing_emb(embeddings_index,word_counts,threshold=20):
    '''Measure how many frequent words have no pre-trained embedding.

    Args:
        embeddings_index: mapping (or set) of words that have an embedding.
        word_counts: mapping word -> number of occurrences.
        threshold: a word is considered only when its count is strictly
            greater than this value (default 20, the previous fixed value).

    Returns:
        (missing_ratio, threshold) where missing_ratio is the percentage of
        all words in word_counts that are frequent and missing, rounded to
        two decimals. An empty word_counts yields 0.0 rather than dividing
        by zero.
    '''
    if not word_counts:
        print("Count Missing Embedding - Done")
        return 0.0, threshold

    missing_words = sum(
        1
        for word, count in word_counts.items()
        if count > threshold and word not in embeddings_index
    )
    # Round the percentage itself so the result carries no float artefacts.
    missing_ratio = round(missing_words / len(word_counts) * 100, 2)
    print("Count Missing Embedding - Done")
    return missing_ratio,threshold
def voc_n_int(word_counts,embeddings_index,threshold):
    '''Build the vocabulary lookup tables.

    A word is kept when it occurs at least `threshold` times or when it has
    an entry in `embeddings_index`. Kept words receive consecutive ids in
    the iteration order of `word_counts`; the special tokens <UNK>, <PAD>,
    <EOS> and <GO> are appended afterwards.

    Returns:
        (vocab_to_int, int_to_vocab): the word -> id mapping and its inverse.
    '''
    kept_words = [
        word
        for word, count in word_counts.items()
        if count >= threshold or word in embeddings_index
    ]
    vocab_to_int = {word: position for position, word in enumerate(kept_words)}

    # Special tokens take the next free ids after the regular words.
    for code in ("<UNK>","<PAD>","<EOS>","<GO>"):
        vocab_to_int[code] = len(vocab_to_int)

    # Inverse table: id -> word.
    int_to_vocab = {index: word for word, index in vocab_to_int.items()}

    # Debug hint: len(vocab_to_int) / len(word_counts) is the share of unique
    # words that made it into the vocabulary.
    print("Vocabulary to Ints - Done")
    return vocab_to_int,int_to_vocab


def create_emb_matrix(vocab_to_int,embeddings_index):
    '''Assemble the embedding matrix for the vocabulary.

    Row i holds the vector of the word whose id is i. Words missing from
    `embeddings_index` get a random vector drawn uniformly from [-1, 1);
    that vector is also stored back into `embeddings_index`, so the caller's
    dictionary is modified.

    Returns:
        float32 array of shape (len(vocab_to_int), EMBED_DIM).
    '''
    # The width follows the configured embedding dimension.
    dim = EMBED_DIM
    matrix = np.zeros((len(vocab_to_int), dim), dtype=np.float32)

    for token, row in vocab_to_int.items():
        if token not in embeddings_index:
            # No pre-trained vector available: fall back to a random one.
            embeddings_index[token] = np.array(np.random.uniform(-1.0, 1.0, dim))
        matrix[row] = embeddings_index[token]

    print("Create Embedding matrix - Done")
    return matrix

# Main running:
def create_embedding(txt_clean, sum_cleaned):
    '''Run the whole embedding pipeline.

    Counts words, loads the GloVe index, derives the vocabulary and builds
    the embedding matrix.

    Returns:
        (word_embedding_matrix, vocab2int)
    '''
    frequencies = create_windex(txt_clean,sum_cleaned)
    glove_index = indexing_ematrix()
    # Only the threshold is needed here; the missing ratio is discarded.
    threshold = count_missing_emb(glove_index,frequencies)[1]
    vocab2int = voc_n_int(frequencies,glove_index,threshold)[0]
    return create_emb_matrix(vocab2int,glove_index), vocab2int



### Splitting

In [31]:
def convert_to_ints(text, word_count, unk_count,vocab_to_int, eos=False):
    '''Encode every sentence of `text` as a list of vocabulary ids.

    Words absent from `vocab_to_int` are encoded as the <UNK> id. The running
    totals `word_count` and `unk_count` are increased by the number of words
    and unknown words seen. With eos=True the <EOS> id is appended to each
    encoded sentence.

    Returns:
        (encoded_sentences, word_count, unk_count)
    '''
    encoded_sentences = []
    for sentence in text:
        encoded = []
        for token in sentence.split():
            word_count += 1
            if token not in vocab_to_int:
                unk_count += 1
                token = "<UNK>"
            encoded.append(vocab_to_int[token])
        if eos:
            encoded.append(vocab_to_int["<EOS>"])
        encoded_sentences.append(encoded)
    return encoded_sentences, word_count, unk_count
def check_unk(vocab_to_int,txt_cleaned,summary_cleaned):
    '''Encode summaries and texts as ids and count the unknown words.

    Summaries are encoded first (without <EOS>), then texts (with <EOS>
    appended); the unknown-word counter accumulates over both.

    Returns:
        (int_texts, int_summaries, unk_count)
    '''
    int_summaries, seen_words, seen_unknown = convert_to_ints(
        summary_cleaned, 0, 0, vocab_to_int)
    int_texts, seen_words, seen_unknown = convert_to_ints(
        txt_cleaned, seen_words, seen_unknown, vocab_to_int, eos=True)
    # Share of unknown words, if needed: seen_unknown / seen_words.
    return int_texts,int_summaries,seen_unknown

def create_lengths(text):
    '''Return a one-column DataFrame ('counts') with the length of each item in `text`.'''
    return pd.DataFrame([len(item) for item in text], columns=['counts'])

def max_sum_txt(int_texts,int_summaries):
    '''Return the fixed maximum lengths (MAX_LEN_TXT, MAX_LEN_SUM).

    A data-driven maximum was considered but a fixed length is used instead,
    so both arguments are ignored; they remain in the signature so existing
    callers keep working. The length statistics that used to be computed
    here were never read and are no longer built.
    '''
    return MAX_LEN_TXT,MAX_LEN_SUM

def unk_counter(sentence, vocab_to_int):
    '''Return how many items of `sentence` equal the <UNK> id of `vocab_to_int`.'''
    return sum(1 for item in sentence if item == vocab_to_int["<UNK>"])

def pre_tokenizing(dataset,int_texts,int_summaries,vocab_to_int):
    '''Select the usable (text, summary) pairs and mark summary boundaries.

    A pair from the 'txt_cleaned' / 'summary_cleaned' columns is kept when
    both word counts are within the limits returned by max_sum_txt and the
    unknown-token check reports at most one hit for each side.

    Returns:
        DataFrame with columns 'txt' and 'summary'; every summary is wrapped
        as 'sostok <summary> eostok'.
    '''
    text_limit, summary_limit = max_sum_txt(int_texts,int_summaries)
    texts = np.array(dataset['txt_cleaned'])
    summaries = np.array(dataset['summary_cleaned'])

    def within_limits(text, summary):
        # Checks run in the same order as before: lengths first, then UNKs.
        if len(summary.split()) > summary_limit:
            return False
        if len(text.split()) > text_limit:
            return False
        # NOTE(review): unk_counter compares each item with the integer <UNK>
        # id, but it receives word strings here, so these two checks look like
        # they can never reject a pair. Behaviour is kept as is; confirm intent.
        if unk_counter(summary.split(), vocab_to_int) > 1:
            return False
        return unk_counter(text.split(), vocab_to_int) <= 1

    kept = [(text, summary) for text, summary in zip(texts, summaries)
            if within_limits(text, summary)]

    dataframe = pd.DataFrame({'txt': [text for text, _ in kept],
                              'summary': [summary for _, summary in kept]})

    # Wrap each summary in the start and end marker tokens.
    dataframe['summary'] = dataframe['summary'].apply(lambda x : 'sostok '+ x + ' eostok')
    return dataframe


def splitset(dataset,txt_cleaned,summary_cleaned,vocab_to_int,random_state=0):
    '''Filter the cleaned data and split it into train / test / validation sets.

    The cleaned columns are attached to `dataset` (which is modified in
    place), rows containing an empty string or a missing value are dropped,
    and the remaining pairs are filtered by pre_tokenizing. 80% of the pairs
    become the training set; the remaining 20% are split again 80/20 into
    test and validation sets.

    Args:
        dataset: DataFrame receiving the 'txt_cleaned' / 'summary_cleaned' columns.
        txt_cleaned, summary_cleaned: cleaned texts and summaries, one per row.
        vocab_to_int: word -> id vocabulary.
        random_state: seed used for BOTH splits so the partition is
            reproducible between runs (the second split used to be unseeded).

    Returns:
        x_train, x_test, x_val, y_train, y_test, y_val
    '''
    dataset['txt_cleaned']=txt_cleaned
    dataset['summary_cleaned']=summary_cleaned
    # Drop rows where cleaning left an empty string.
    dataset.replace('', np.nan, inplace=True)
    dataset.dropna(axis=0,inplace=True)
    int_txt,int_sum,_=check_unk(vocab_to_int,txt_cleaned,summary_cleaned)
    df = pre_tokenizing(dataset,int_txt,int_sum,vocab_to_int)
    x_train,x_test,y_train,y_test=train_test_split(
        np.array(df['txt']),np.array(df['summary']),
        test_size=0.2,random_state=random_state,shuffle=True)
    # Seeded as well, otherwise test/validation membership changes on every run.
    x_test, x_val, y_test, y_val = train_test_split(
        x_test, y_test, test_size=0.2, random_state=random_state)
    return x_train,x_test,x_val,y_train,y_test,y_val

### Tokenizing

In [32]:
def max_featuretxt_count(x_train):
    '''Return the number of distinct text words that occur at least 4 times in x_train.'''
    tokenizer_x = Tokenizer()
    tokenizer_x.fit_on_texts(list(x_train))
    min_occurrences = 4
    # Words seen fewer than min_occurrences times are excluded from the count.
    return sum(1 for freq in tokenizer_x.word_counts.values() if not freq < min_occurrences)

def tokenizing_text(max_features,x_train,x_test,x_val):
    '''Fit a tokenizer on the training texts and encode all three splits.

    Returns, in order: the padded train / test / validation arrays (length
    MAX_LEN_TXT, zero-padded at the end), the three unpadded id sequences,
    the vocabulary size (num_words + 1) and the fitted tokenizer.
    '''
    tokenizer_x = Tokenizer(num_words=max_features)
    tokenizer_x.fit_on_texts(list(x_train))

    # Words -> integer ids for each split, in train / test / validation order.
    x_tr_seq, x_test_seq, x_val_seq = [
        tokenizer_x.texts_to_sequences(split) for split in (x_train, x_test, x_val)
    ]

    x_voc = tokenizer_x.num_words + 1
    x_train, x_test, x_val = [
        pad_sequences(seq, maxlen=MAX_LEN_TXT, padding='post')
        for seq in (x_tr_seq, x_test_seq, x_val_seq)
    ]
    return x_train,x_test,x_val,x_tr_seq,x_test_seq,x_val_seq,x_voc,tokenizer_x

def max_featuresum_count(y_train):
    '''Return the number of distinct summary words that occur at least 6 times in y_train.'''
    tokenizer_y = Tokenizer()
    tokenizer_y.fit_on_texts(list(y_train))
    min_occurrences = 6
    # Words seen fewer than min_occurrences times are excluded from the count.
    return sum(1 for freq in tokenizer_y.word_counts.values() if not freq < min_occurrences)

def tokenizing_sum(max_features,y_train,y_test,y_val):
    '''Fit a tokenizer on the training summaries and encode all three splits.

    Returns, in order: the padded train / test / validation arrays (length
    MAX_LEN_SUM, zero-padded at the end), the three unpadded id sequences,
    the vocabulary size (num_words + 1) and the fitted tokenizer.
    '''
    tokenizer_y = Tokenizer(num_words=max_features)
    tokenizer_y.fit_on_texts(list(y_train))

    # Words -> integer ids for each split, in train / test / validation order.
    y_train_seq, y_test_seq, y_val_seq = [
        tokenizer_y.texts_to_sequences(split) for split in (y_train, y_test, y_val)
    ]
    # Zero-pad (or truncate) every sequence to the fixed summary length.
    y_train, y_test, y_val = [
        pad_sequences(seq, maxlen=MAX_LEN_SUM, padding='post')
        for seq in (y_train_seq, y_test_seq, y_val_seq)
    ]
    y_vocab = tokenizer_y.num_words + 1
    return y_train,y_test,y_val,y_train_seq,y_test_seq,y_val_seq,y_vocab,tokenizer_y

def cleanup_token(x,y):
    '''Drop the pairs whose summary row has exactly two non-zero entries.

    Such a row holds nothing but the start and end tokens. The matching rows
    are removed from both `x` and `y`.

    Returns:
        (x, y) with those rows deleted.
    '''
    marker_only_rows = [
        row_number
        for row_number, row in enumerate(y)
        if sum(1 for token in row if token != 0) == 2
    ]
    y = np.delete(y, marker_only_rows, axis=0)
    x = np.delete(x, marker_only_rows, axis=0)
    return x,y

def cleanup_blanksum(x,y):
    '''Drop the pairs whose summary row consists only of zeros.

    Returns:
        (x, y) with those rows deleted from both arrays.
    '''
    blank_rows = [
        row_number
        for row_number, row in enumerate(y)
        if np.count_nonzero(row == 0) == len(row)
    ]
    y = np.delete(y, blank_rows, axis=0)
    x = np.delete(x, blank_rows, axis=0)
    return x,y


## Validating Functions

In [33]:
def make_valcsv(model_class,x_val,y_val,num_val=1000):
    '''Decode validation samples and store reference/generated pairs as csv.

    Args:
        model_class: object providing sequence2summary, greedy_decode and name.
        x_val: padded input sequences; each row is reshaped to (1, MAX_LEN_TXT).
        y_val: padded reference summary sequences.
        num_val: maximum number of samples to process; it is capped at the
            number of available samples.

    Returns:
        (human_summ, mach_summ): two lists of equal length. A sample whose
        decoding raises KeyError is skipped entirely, so the lists stay aligned.

    Side effects:
        Writes '<PATH_TO_DATA><model name>_val.csv' with the columns
        'original' and 'generated'.
    '''
    human_summ = []
    mach_summ = []
    # Never index past the end of either array.
    sample_count = min(num_val, len(x_val), len(y_val))
    for i in range(sample_count):
        try:
            reference = model_class.sequence2summary(y_val[i])
            generated = model_class.greedy_decode(x_val[i].reshape(1,MAX_LEN_TXT))
        except KeyError:
            # A token id without a word cannot be decoded; skip the whole pair
            # so that no reference is stored without its generated summary.
            continue
        human_summ.append(reference)
        mach_summ.append(generated)
        print(f"Eval Step {i}")

    df = pd.DataFrame({'original': human_summ, 'generated': mach_summ})
    df['original'] = df['original'].apply(str)
    df['generated'] = df['generated'].apply(str)

    df.to_csv("".join([PATH_TO_DATA,model_class.name,'_val.csv']),index=False)
    return human_summ,mach_summ

def avg_rouge(candidates, references):
    '''Average ROUGE-1, ROUGE-2 and ROUGE-L F-measures over paired summaries.

    Args:
        candidates: generated summaries.
        references: reference summaries; pairs are formed by position and
            surplus items of the longer sequence are ignored.

    Returns:
        [rouge1, rouge2, rougeL] averaged F-measures. When there is no pair
        to score the result is [0.0, 0.0, 0.0] instead of a division by zero.
    '''
    pair_count = min(len(candidates), len(references))
    if pair_count == 0:
        return [0.0, 0.0, 0.0]

    metric_names = ['rouge1', 'rouge2', 'rougeL']
    scorer = rouge_scorer.RougeScorer(metric_names, use_stemmer=True)
    totals = {metric: 0.0 for metric in metric_names}
    for candidate, reference in zip(candidates, references):
        # RougeScorer.score takes (target, prediction).
        scores = scorer.score(reference, candidate)
        for metric in metric_names:
            totals[metric] += scores[metric].fmeasure
    return [totals[metric] / pair_count for metric in metric_names]


# MODEL BUILDING

In [34]:
class MyModel:
    '''LSTM encoder-decoder with attention that generates review summaries.

    The constructor builds and compiles the training model. The separate
    encoder and decoder models used for step-by-step decoding are created
    lazily on the first call to greedy_decode.
    '''

    def __init__(self,name,word_embedding_matrix,y_vocab,tokenizer_x,tokenizer_y):
        '''Build and compile the training graph.

        Args:
            name: model name; the weight file and checkpoint paths derive from it.
            word_embedding_matrix: initial weights of the encoder embedding;
                its second dimension is the embedding size of both embeddings.
            y_vocab: size of the decoder embedding and of the softmax output.
            tokenizer_x: fitted tokenizer of the input texts.
            tokenizer_y: fitted tokenizer of the summaries; it must know the
                'sostok' and 'eostok' markers for decoding.
        '''
        K.clear_session()
        self.name = name
        self.dim_latent = 300
        self.dim_emb=word_embedding_matrix.shape[1]
        self.encoder=None
        self.decoder=None
        self.tokenizer_x = tokenizer_x
        self.tokenizer_y = tokenizer_y
        self.reverce_targat_word_index=tokenizer_y.index_word
        self.reverce_sourse_word_index=tokenizer_x.index_word
        self.targat_word_index=tokenizer_y.word_index

        # Encoder
        self.en_input = Input(shape=(MAX_LEN_TXT,))
        # Embedding layer initialised from the supplied matrix.
        # NOTE(review): the rows of word_embedding_matrix follow the caller's
        # own vocabulary ids while the encoder inputs are tokenizer_x ids;
        # the two index spaces look unrelated - confirm against the caller.
        self.en_embedding =  Embedding(word_embedding_matrix.shape[0], self.dim_emb,weights=[word_embedding_matrix],trainable=True)(self.en_input)

        # lstm 1
        self.en_lstm1 = LSTM(self.dim_latent,return_sequences=True,return_state=True,dropout=0.4,recurrent_dropout=0.4)
        self.en_output1, self.state_hidden1, self.state_c1 = self.en_lstm1(self.en_embedding)

        # lstm 2
        self.en_lstm2 = LSTM(self.dim_latent,return_sequences=True,return_state=True,dropout=0.4,recurrent_dropout=0.4)
        self.en_output2, self.state_hidden2, self.state_c2 = self.en_lstm2(self.en_output1)

        # lstm 3
        self.en_lstm3=LSTM(self.dim_latent, return_state=True, return_sequences=True,dropout=0.4,recurrent_dropout=0.4)
        self.en_outputs, self.state_hidden, self.state_c= self.en_lstm3(self.en_output2)

        # Set up the decoder
        self.dec_input = Input(shape=(None,))

        # Embedding layer
        self.emb_layer_dec = Embedding(y_vocab,self.dim_emb,trainable=True)
        self.dec_embedding = self.emb_layer_dec(self.dec_input)

        # The decoder LSTM starts from the final states of the last encoder LSTM.
        self.dec_lstm = LSTM(self.dim_latent, return_sequences=True, return_state=True,dropout=0.4,recurrent_dropout=0.2)
        self.dec_output,self.dec_fwd_state, self.dec_back_state = self.dec_lstm(self.dec_embedding,initial_state=[self.state_hidden, self.state_c])

        # Attention layer: decoder outputs are the query, encoder outputs the value.
        self.attention_layer = AttentionLayer(name='attention_layer')
        self.attention_output = self.attention_layer([self.dec_output,self.en_outputs])

        # Concatenate attention output and decoder LSTM output
        self.dec_concate_input = Concatenate(axis=-1, name='concat_layer')([self.dec_output, self.attention_output])

        # Dense softmax layer; note that self.dec_output is rebound to its output.
        self.dec_dense =  TimeDistributed(Dense(y_vocab, activation='softmax'))
        self.dec_output = self.dec_dense(self.dec_concate_input)

        # Define the model
        self.model = Model([self.en_input, self.dec_input], self.dec_output)
        self.model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
        self.model_path = "".join([PATH_TO_MODELS,self.name,".h5"])

    def save(self):
        '''Save the model to self.model_path.'''
        self.model.save(self.model_path)

    def train(self,x_train,x_test,y_train,y_test,epoch=5):
        '''Fit the model with batch size 128 and save it afterwards.

        The decoder receives each summary without its last token and is
        trained to predict the same summary shifted by one position.
        x_test / y_test serve as validation data. Training stops early when
        val_loss has not improved for 2 epochs; weights are checkpointed
        under PATH_TO_CHECKPOINT/<name>/.

        Returns:
            The history object returned by fit().
        '''
        self.checkpoint_path = "".join([PATH_TO_CHECKPOINT,self.name,"/cp.ckpt"])
        self.checkpoint_dir = os.path.dirname(self.checkpoint_path)
        earlystopping = EarlyStopping(monitor='val_loss', mode='min', verbose=1,patience=2)
        checkpointing = tf.keras.callbacks.ModelCheckpoint(filepath=self.checkpoint_path,
                                                           save_weights_only=True,
                                                           verbose=1)
        self.history=self.model.fit(
            [x_train, y_train[:,:-1]],
            y_train.reshape(y_train.shape[0],y_train.shape[1], 1)[:,1:],
            epochs=epoch,
            callbacks=[earlystopping,checkpointing],
            batch_size=128,
            validation_data=([x_test,y_test[:,:-1]],
                             y_test.reshape(y_test.shape[0],
                                            y_test.shape[1], 1)[:,1:],
                             )
        )
        self.save()
        return self.history

    def load(self,cp=True):
        '''Load weights from the latest checkpoint (cp=True) or from the .h5 file.

        Raises:
            FileNotFoundError: when cp is true and no checkpoint exists in
                PATH_TO_CHECKPOINT/<name>/.

        The model is recompiled afterwards; the method returns the result of
        compile(), as before.
        '''
        if cp:
            # latest_checkpoint expects the DIRECTORY containing the
            # checkpoints, not the 'cp.ckpt' file prefix used when saving.
            checkpoint_dir = os.path.dirname("".join([PATH_TO_CHECKPOINT,self.name,"/cp.ckpt"]))
            latest = tf.train.latest_checkpoint(checkpoint_dir)
            if latest is None:
                raise FileNotFoundError("No checkpoint found in " + checkpoint_dir)
            self.model.load_weights(latest)
        else:
            self.model.load_weights("".join([PATH_TO_MODELS,self.name,".h5"]))
        return self.model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy',metrics=['acc'])

    def create_en_de_model(self):
        '''Build the inference-time encoder and decoder models.

        Both reuse the layers created in __init__ and are stored in
        self.encoder and self.decoder.
        '''
        # Encode the input sequence to get the feature vector
        self.encoder = Model(inputs=self.en_input,outputs=[self.en_outputs, self.state_hidden, self.state_c])

        # Decoder
        # Below tensors will hold the states of the previous time step
        decoder_state_input_h = Input(shape=(self.dim_latent,))
        decoder_state_input_c= Input(shape=(self.dim_latent,))
        dec_hid_stat_ip = Input(shape=(MAX_LEN_TXT,self.dim_latent))

        # Get the embeddings of the decoder sequence
        dec_embedding2= self.emb_layer_dec(self.dec_input)

        dec_output2, state_hidden2, state_c2 = self.dec_lstm(dec_embedding2, initial_state=[decoder_state_input_h, decoder_state_input_c])

        # Attention inference
        attention_output_inf = self.attention_layer([dec_output2, dec_hid_stat_ip])
        dec_inf_con = Concatenate(axis=-1, name='concat')([dec_output2, attention_output_inf])

        # A dense softmax layer to generate prob dist. over the target vocabulary
        dec_output2 = self.dec_dense(dec_inf_con)

        # Decoder model
        self.decoder = Model(
            [self.dec_input] + [dec_hid_stat_ip,decoder_state_input_h, decoder_state_input_c],
            [dec_output2] + [state_hidden2, state_c2])

    def greedy_decode(self,input_sequence):
        '''Generate a summary by always picking the most probable next word.

        Decoding starts from 'sostok' and stops at 'eostok' or once the
        output holds MAX_LEN_SUM-1 words.

        Raises:
            KeyError: when the most probable id has no entry in the target
                index-to-word table.

        Returns:
            The generated words, each preceded by a space.
        '''
        if self.encoder is None or self.decoder is None:
            self.create_en_de_model()
        # Encode the input.
        en_out, en_h, en_c = self.encoder.predict(input_sequence)

        # Target sequence of length 1 holding the start word.
        targat_seq = np.zeros((1,1))
        targat_seq[0, 0] = self.targat_word_index['sostok']

        stop_cond = False
        dec_sent = ''
        while not stop_cond:
            output_tokans, h, c = self.decoder.predict([targat_seq] + [en_out, en_h, en_c])
            sample_index_tokan = np.argmax(output_tokans[0, -1, :])
            sample_tokan = self.reverce_targat_word_index[sample_index_tokan]

            if(sample_tokan!='eostok'):
                dec_sent += ' '+sample_tokan

            # Exit condition
            if (sample_tokan == 'eostok'  or len(dec_sent.split()) >= (MAX_LEN_SUM-1)):
                stop_cond = True

            # The word just produced becomes the next decoder input.
            targat_seq = np.zeros((1,1))
            targat_seq[0, 0] = sample_index_tokan

            # Carry the decoder states over to the next step.
            en_h, en_c = h, c

        return dec_sent

    def sequence2summary(self,input_sequence):
        '''Convert a summary id sequence to words, skipping padding and the start/end markers.

        Every word is followed by a space.
        '''
        string=''
        for i in input_sequence:
            if((i!=0 and i!=self.targat_word_index['sostok']) and i!=self.targat_word_index['eostok']):
                string=string+self.reverce_targat_word_index[i]+' '
        return string

    def sequence2text(self,input_sequence):
        '''Convert a text id sequence to words, skipping padding.

        Every word is followed by a space.
        '''
        string=''
        for i in input_sequence:
            if(i!=0):
                string=string+self.reverce_sourse_word_index[i]+' '
        return string

    def summarize(self,input_text):
        '''Summarise a single raw text with the input tokenizer and greedy decoding.'''
        list_txt= []
        list_txt.append(input_text)
        list_tr=self.tokenizer_x.texts_to_sequences(list_txt)
        list2seq=pad_sequences(list_tr,  maxlen=MAX_LEN_TXT, padding='post')
        summary = self.greedy_decode(list2seq[0].reshape(1,MAX_LEN_TXT))
        return summary

    def evaluate(self,x,y):
        '''Evaluate the model on (x, y) and print the accuracy.'''
        # Same input/target shifting as in train().
        loss, acc = self.model.evaluate([x, y[:,:-1]],
                                        y.reshape(y.shape[0],y.shape[1], 1)[:,1:], verbose=1)
        print("Restored model, accuracy: {:5.2f}%".format(100 * acc))


# MAIN

## Loading Dataset

In [35]:
# Remember to delete the old training csv when using new hyperparameters or
# choosing a new embedding dimension.
training_csv = "".join([PATH_TO_DATA,'training.csv'])
if not os.path.isfile(training_csv):
    # No cached training file yet: build it from the raw reviews.
    preprocess('review')
data = pd.read_csv(training_csv)
data = data.astype({'Text': str, 'Summary': str})

## Creating Embedding

In [36]:
# Embedding: build the embedding matrix and the word -> id vocabulary from the cleaned data.
embb_matrix,vocab_to_int = create_embedding(data['Text'].tolist(), data['Summary'].tolist())

Create Word Index - Done
Create Embedding Index - Done
Count Missing Embedding - Done
Vocabulary to Ints - Done
Create Embedding matrix - Done


## Splitting Data

In [37]:
# Split the cleaned data into train / test / validation sets.
x_train,x_test,x_val,y_train,y_test,y_val = splitset(data,data['Text'].tolist(), data['Summary'].tolist(),vocab_to_int)

# Tokenize the texts; the vocabulary is limited to the words counted by max_featuretxt_count.
maxfeat = max_featuretxt_count(x_train)
x_train,x_test,x_val,x_tr_seq,x_test_seq,x_val_seq,x_voc,tokenizer_x = tokenizing_text(maxfeat,x_train,x_test,x_val)

# Tokenize the summaries; maxfeat is reused for the summary vocabulary limit.
maxfeat = max_featuresum_count(y_train)
y_train,y_test,y_val,y_tr_seq,y_test_seq,y_val_seq,y_voc,tokenizer_y = tokenizing_sum(maxfeat,y_train,y_test,y_val)

# Drop the pairs whose summary holds nothing but the start and end tokens.
x_train,y_train = cleanup_token(x_train,y_train)
x_test,y_test = cleanup_token(x_test,y_test)
x_val,y_val = cleanup_token(x_val,y_val)

## Training / Loading

In [38]:
'''When loading, try to match the model name to the weight file name.'''
# Set to True to train from scratch; False loads the saved weights instead.
TRAINING = False

# The model name selects the weight file '<PATH_TO_MODELS><name>.h5'.
current = MyModel("400000_300d_nostop_100_20",embb_matrix,y_voc,tokenizer_x,tokenizer_y)
current.model.summary()

if TRAINING:
    current.train(x_train,x_test,y_train,y_test,epoch=20)
else:
    # Load the previously saved weights from the .h5 file (not from a checkpoint)
    current.load(cp=False)
model = current.model

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 100)]                0         []                            
                                                                                                  
 embedding (Embedding)       (None, 100, 300)             1650390   ['input_1[0][0]']             
                                                          0                                       
                                                                                                  
 lstm (LSTM)                 [(None, 100, 300),           721200    ['embedding[0][0]']           
                              (None, 300),                                                        
                              (None, 300)]                                                    

## Validating

In [None]:
'''Re-evaluate the model Accuarcy with train set'''
# Evaluate the model's accuracy (disabled; uncomment to run on the training set)
#current.evaluate(x_train,y_train)

In [None]:
'''Make a csv file for safe keeping'''
# Decode every validation sample; returns the reference and generated summaries.
human_summ,mach_summ = make_valcsv(current,x_val,y_val,num_val=x_val.shape[0])

Eval Step {0} 0
Eval Step {0} 1
Eval Step {0} 2
Eval Step {0} 3
Eval Step {0} 4
Eval Step {0} 5
Eval Step {0} 6
Eval Step {0} 7
Eval Step {0} 8
Eval Step {0} 9
Eval Step {0} 10
Eval Step {0} 11
Eval Step {0} 12
Eval Step {0} 13
Eval Step {0} 14
Eval Step {0} 15
Eval Step {0} 16
Eval Step {0} 17
Eval Step {0} 18
Eval Step {0} 19
Eval Step {0} 20
Eval Step {0} 21
Eval Step {0} 22
Eval Step {0} 23
Eval Step {0} 24
Eval Step {0} 25
Eval Step {0} 26
Eval Step {0} 27
Eval Step {0} 28
Eval Step {0} 29
Eval Step {0} 30
Eval Step {0} 31
Eval Step {0} 32
Eval Step {0} 33
Eval Step {0} 34
Eval Step {0} 35
Eval Step {0} 36
Eval Step {0} 37
Eval Step {0} 38
Eval Step {0} 39
Eval Step {0} 40
Eval Step {0} 41
Eval Step {0} 42
Eval Step {0} 43
Eval Step {0} 44
Eval Step {0} 45
Eval Step {0} 46
Eval Step {0} 47
Eval Step {0} 48
Eval Step {0} 49
Eval Step {0} 50
Eval Step {0} 51
Eval Step {0} 52
Eval Step {0} 53
Eval Step {0} 54
Eval Step {0} 55
Eval Step {0} 56
Eval Step {0} 57
Eval Step {0} 58
Eval St

[0.14133075728671451, 0.032170655138658126, 0.14020987967631868]

In [None]:
'''Avergae ROUGE metric F1-score'''
# Displays the averaged [ROUGE-1, ROUGE-2, ROUGE-L] scores.
avg_rouge(mach_summ,human_summ)

[0.14133075728671451, 0.032170655138658126, 0.14020987967631868]

In [39]:
'''Print out some cases from validation set'''
# Show at most 50 examples; fewer when the validation set is smaller, so the
# loop cannot index past the end of x_val / y_val.
for i in range(min(50, len(x_val), len(y_val))):
    print("Review:",current.sequence2text(x_val[i]))
    print("Original summary:",current.sequence2summary(y_val[i]))
    print("Predicted summary:",current.greedy_decode(x_val[i].reshape(1,MAX_LEN_TXT)))
    print("\n")

Review: bread moist flavorful however ordered pack arrived every single dented people buy product long term storage definitely problem probably buy bread future purchase another vendor 
Original summary: dented cans 
Predicted summary:  great product


Review: lucky enough able buy bread local farmer market go back every saturday morning buy absolutely delicious high cinnamon sugar ratio right 
Original summary: delicious cinnamon loaf 
Predicted summary:  best gluten free bread


Review: typical bread mix thought ordering like cinnamon bread comes several recipes box used make banana nut muffins coffee cake muffins absolutely delicious spongy gritty texture gluten free taste even gluten eating family loves nobody would know gluten free unless told make half box time yields muffins keep rest family eating freeze quick breakfast go still delicious defrosting microwave complaint mix bit pricey amazon usually best price pack continue look sales definitely one thing keep hand times 
Origin