## Imports

In [None]:
# !pip3 install os
from os import listdir
import string
from pickle import dump,load

## Loading the Data

In [None]:
class LoadData:
    def __init__(self, directory):
        self.directory= directory
        
    def load_stories(self):
        """
        Load the data and store it in a list of dictionaries
        
        """
        all_stories= list()
        
        def load_doc(filename):
            """
            Return the data from a given filename
            """
            file = open(filename, encoding='utf-8')
            text = file.read()
            file.close()
            return text
        
        def split_story(doc):
            """
            Split story from summaries based on the separater -> "@highlight"
            """
            index = doc.find('@highlight')
            story, highlights = doc[:index], doc[index:].split('@highlight')
            highlights = [h.strip() for h in highlights if len(h) > 0]
            return story, highlights
        
        list_of_files= listdir(self.directory)
        for name in list_of_files:
            filename = self.directory + '/' + name
            doc = load_doc(filename)
            story, highlights= split_story(doc)
            all_stories.append({'story': story, 'highlights': highlights})
        
        return all_stories

In [None]:
DIR_PATH= "/home/nikhil/Music/Text-Summarization/Original_data/cnn/stories"
obj= LoadData(DIR_PATH)
stories= obj.load_stories()

In [None]:
len(stories)

92579

In [None]:
print(stories[10]['highlights'])
print()
print(stories[10]['story'])

['Shirley Sotloff pleads directly to the leader of ISIS', '"Please release my child," she says', 'Steven Sotloff disappeared while reporting in Syria last year']

A mother's plea to the terrorists holding her son hostage: No individual should be punished for events he cannot control.

The mother is Shirley Sotloff, and she speaks directly to ISIS leader  Abu Bakr al-Baghdadi in a video broadcast Wednesday on Al Arabiya Network.

Her son, freelance journalist Steven Sotloff, appeared last week in an ISIS video showing the decapitation of American journalist James Foley.

The militant in the video warns that Steven Sotloff's fate depends on what President Barack Obama does next in Iraq.

A day after the video was posted, Obama vowed that the United States would be "relentless" in striking back against ISIS.

"Steven is a journalist who traveled to the Middle East to cover the suffering of Muslims at the hands of tyrants. Steven is a loyal and generous son, brother and grandson," Shirley 

In [None]:
stories[:2]

[{'story': '(CNN) -- The mayor of crime-ridden Camden, New Jersey, has announced layoffs of nearly half of the city\'s police force and close to a third of its fire department.\n\nOne hundred sixty-eight police officers and 67 firefighters were laid off Tuesday, as officials struggle to close a $26.5 million budget gap through a series of belt-tightening measures, Mayor Dana Redd told reporters. The layoffs take effect immediately.\n\nRedd said she was unable to secure the $8 million in budget concessions that she says she needed to save the jobs of up to 100 police officers and many of the city\'s firefighters.\n\nThe mayor -- who said she will continue negotiations with police and fire unions -- had been asking the workers to pay more for their health care, freeze or reduce their salaries and take furlough days.\n\nThe apparent impasse has left administrators of a city with the second-highest crime rate in the nation scrambling to figure out solutions to keep residents safe. Camden i

## Data Cleaning

In [None]:
class Clean_data:
    def __init__(self):
        pass
           
    def clean_lines(self, lines):
        cleaned = list()
        table = str.maketrans('', '', string.punctuation)
        
        for line in lines:
            index = line.find('(CNN)')
            if index >= 0:
                line = line[index + len('(CNN)'):]

            split_line = line.split()
            
            split_line = [word.lower() for word in split_line]
            split_line = [w.translate(table) for w in split_line]
            
            split_line = [word for word in split_line if word.isalpha()]
            cleaned.append(' '.join(split_line))
        cleaned = [c for c in cleaned if len(c) > 0]
        return cleaned

In [None]:
obj1= Clean_data()
cleaned_stories= list()
for example in stories:
    cleaned_stories.append({'story': obj1.clean_lines(example['story'].split('\n')), 'highlights': obj1.clean_lines(example['highlights'])})    

In [None]:
cleaned_stories[60]

{'story': ['the stepmother of zahra baker told police the girl was killed two weeks before she was reported missing according to search warrants released tuesday',
  'stepmother elisa baker also told police in hickory north carolina that the disabled girls body was disposed of the next day september in various locations according to the documents',
  'she told police on november that the girls father adam baker dismembered the girl and the couple disposed of the remains',
  'while elisa baker has been charged with obstruction of justice for writing a fake ransom note and leaving it at the familys hickory home no one has been charged directly in the girls death elisa baker also is accused of writing worthless checks',
  'police have said she had been cooperating with investigators',
  'one of the search warrants details an online conversation a web user said she had with adam andor elisa baker regarding their involvement with chainsaw massacre roleplaying',
  'according to the warrant t

In [None]:
dump(cleaned_stories, open('/home/nikhil/Music/Text-Summarization/Processed_data/CNN_data/processed_data/full_cnn_dataset.pkl', 'wb'))

In [None]:
cleaned_stories = load(open('/home/nikhil/Music/Text-Summarization/Processed_data/CNN_data/processed_data/full_cnn_dataset.pkl', 'rb'))
print('Loaded Stories %d' % len(cleaned_stories))

Loaded Stories 92579


In [None]:
cleaned_stories[0]

{'story': ['the mayor of crimeridden camden new jersey has announced layoffs of nearly half of the citys police force and close to a third of its fire department',
  'one hundred sixtyeight police officers and firefighters were laid off tuesday as officials struggle to close a million budget gap through a series of belttightening measures mayor dana redd told reporters the layoffs take effect immediately',
  'redd said she was unable to secure the million in budget concessions that she says she needed to save the jobs of up to police officers and many of the citys firefighters',
  'the mayor who said she will continue negotiations with police and fire unions had been asking the workers to pay more for their health care freeze or reduce their salaries and take furlough days',
  'the apparent impasse has left administrators of a city with the secondhighest crime rate in the nation scrambling to figure out solutions to keep residents safe camden is second only to st louis missouri in annu

---

## Amazon Food reviews Dataset

## Imports

In [None]:
import pandas as pd
import numpy as np
import re

In [None]:
AMAZON_DATA_PATH= '/home/nikhil/Music/Text-Summarization/Original_data/amazon-fine-food-reviews/Reviews.csv'

In [None]:
class Load_amazon_data:
    
    def __init__(self, dir_path, seed= 0):
        """
        Initialization
        """
        self.dir_path= dir_path
        np.random.seed(seed)
        
    def load(self):
        """
        Reads data from the given directory path
        """
        return pd.read_csv(self.dir_path)
    
    def drop(self):
        """
        Drops unnecessary columns
        """
        
        data= self.load()
        
        data = data.dropna()
        data= data.iloc[:, -2:]
        data = data.reset_index(drop= True)
        
        return data
    
    def analyze_data(self):
        """
        Prints some sample data points from the cleaned data
        """
        data= self.drop()
        
        for sr_no, i in enumerate(np.random.randint(10, 100, size= 5)):
            print("_________________________")
            print("Data Point {0}".format(sr_no + 1))
            print("Summary:")
            print(data['Summary'].iloc[i])
            print("Full Text:")
            print(data['Text'].iloc[i])

In [None]:
obj= Load_amazon_data(AMAZON_DATA_PATH, seed= 1)

## Load the Data

In [None]:
data= obj.load()
data.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


## Dropping Unnecessary columns

In [None]:
data= obj.drop()
data.head()

Unnamed: 0,Summary,Text
0,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,"""Delight"" says it all",This is a confection that has been around a fe...
3,Cough Medicine,If you are looking for the secret ingredient i...
4,Great taffy,Great taffy at a great price. There was a wid...


In [None]:
data.shape

(568411, 2)

## Analyze the data

In [None]:
obj.analyze_data()

_________________________
Data Point 1
Summary:
Mushy
Full Text:
The flavors are good.  However, I do not see any differce between this and Oaker Oats brand - they are both mushy.
_________________________
Data Point 2
Summary:
Delicious product!
Full Text:
I can remember buying this candy as a kid and the quality hasn't dropped in all these years. Still a superb product you won't be disappointed with.
_________________________
Data Point 3
Summary:
Forget Molecular Gastronomy - this stuff rockes a coffee creamer!
Full Text:
I know the product title says Molecular Gastronomy, but don't let that scare you off.  I have been looking for this for a while now, not for food science, but for something more down to earth.  I use it to make my own coffee creamer.<br /><br />I have to have my coffee blonde and sweet - but the flavored creamers are full of the bad kinds of fat, and honestly, I hate to use manufactured "food" items.  I really don't think they are good for the body.  On the other h

## A list of english contractions for data transformation

In [None]:
contractions = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he's": "he is",
"how'd": "how did",
"how'll": "how will",
"how's": "how is",
"i'd": "i would",
"i'll": "i will",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'll": "it will",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"must've": "must have",
"mustn't": "must not",
"needn't": "need not",
"oughtn't": "ought not",
"shan't": "shall not",
"sha'n't": "shall not",
"she'd": "she would",
"she'll": "she will",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"that'd": "that would",
"that's": "that is",
"there'd": "there had",
"there's": "there is",
"they'd": "they would",
"they'll": "they will",
"they're": "they are",
"they've": "they have",
"wasn't": "was not",
"we'd": "we would",
"we'll": "we will",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"where'd": "where did",
"where's": "where is",
"who'll": "who will",
"who's": "who is",
"won't": "will not",
"wouldn't": "would not",
"you'd": "you would",
"you'll": "you will",
"you're": "you are"
}

In [None]:
class Data_cleaning:
    def __init__(self):
        self.clean_summaries= []
        self.clean_texts= []

    def clean_text(self, text, remove_stopwords = False):
        """
        Defines a series of cleaning operations 
        """
        text = text.lower()

        if True:
            text = text.split()
            new_text = []
            for word in text:
                if word in contractions:
                    new_text.append(contractions[word])
                else:
                    new_text.append(word)
            text = " ".join(new_text)

        text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
        text = re.sub(r'\<a href', ' ', text)
        text = re.sub(r'&amp;', '', text) 
        text = re.sub(r'[_"\-;%()|+&=*%.!,?:#$@\[\]/]', ' ', text)
        text = re.sub(r'<br />', ' ', text)
        text = re.sub(r'<br >', ' ', text)
        text = re.sub(r'<br  >', ' ', text)
        text = re.sub(r'\'', ' ', text)

        # Optionally, remove stop words
        if remove_stopwords:
            text = text.split()
            stops = set(stopwords.words("english"))
            text = [w for w in text if not w in stops]
            text = " ".join(text)

        return text
    
    def clean(self, data):
        """
        Applies the clean_text() to the entire dataset
        """
        for summary in data.Summary:
            self.clean_summaries.append(self.clean_text(summary))

        print("Summaries are complete.")

        for text in data.Text:
            self.clean_texts.append(self.clean_text(text))

        print("Texts are complete.")
        
        return self.clean_summaries, self.clean_texts

In [None]:
# import nltk
# nltk.download('stopwords')

clean_obj= Data_cleaning()
clean_summaries, clean_texts= clean_obj.clean(data)

Summaries are complete.
Texts are complete.


## Display random points

In [None]:
np.random.seed(1)

for sr_no, i in enumerate(np.random.randint(10, 100, size= 5)):
    print("_________________________")
    print("Data Point #{0}".format(sr_no + 1))
    print("Summary:")
    print(clean_summaries[i])
    print("Full Text:")
    print(clean_texts[i])

_________________________
Data Point #1
Summary:
mushy
Full Text:
the flavors are good  however  i do not see any differce between this and oaker oats brand   they are both mushy 
_________________________
Data Point #2
Summary:
delicious product 
Full Text:
i can remember buying this candy as a kid and the quality has not dropped in all these years  still a superb product you will not be disappointed with 
_________________________
Data Point #3
Summary:
forget molecular gastronomy   this stuff rockes a coffee creamer 
Full Text:
i know the product title says molecular gastronomy  but do not let that scare you off  i have been looking for this for a while now  not for food science  but for something more down to earth  i use it to make my own coffee creamer   i have to have my coffee blonde and sweet   but the flavored creamers are full of the bad kinds of fat  and honestly  i hate to use manufactured  food  items  i really do not think they are good for the body  on the other hand  i

## Make a dict for storing the data in a file

In [None]:
#Appending new data 
amazon_data= list()
for (summ, story) in zip(clean_summaries, clean_texts):
    amazon_data.append({'story': [story], 'highlights':[summ]})

In [None]:

for sr_no, i in enumerate(np.random.randint(10, 10000, size= 5)):
    print("_________________________")
    print("Data Point #{0}".format(sr_no + 1))
    print("Summary:")
    print(amazon_data[i]['highlights'])
    print("Full Text:")
    print(amazon_data[i]['story'])

_________________________
Data Point #1
Summary:
['celebration s hibiscus tea']
Full Text:
['in an effort to reverse augmenting high blood pressure  a physician recommended i drink hibiscus herbal tea  kudos to celebration herbals for delivering the tea one day after the order was placed  i followed instructions on the box and used spring water which was steeped for approximately ten minutes  although the primary reason for drinking the tea was medicinal  i also wanted a flavorable tea  sadly the taste of the tea was unexceptional  accordingly  the next box of hibiscus tea that i purchase will not be from celebration herbals ']
_________________________
Data Point #2
Summary:
['yummy']
Full Text:
['when you have a craving for a sweet but you do not will not too much  this bar is just perfect  it is yummy ']
_________________________
Data Point #3
Summary:
['taste good but not as good as i had hoped']
Full Text:
['if you love vegetable soup you should like this  yes it has come chicken 

In [None]:
from pickle import dump,load

dump(amazon_data, open('/home/nikhil/Music/Text-Summarization/Processed_data/Amazon_data/full_amazon_data.pkl', 'wb'))

In [None]:
from pickle import dump,load

amazon_data= load(open('/home/nikhil/Music/Text-Summarization/Processed_data/Amazon_data/full_amazon_data.pkl', 'rb'))
amazon_data[0]

{'story': ['i have bought several of the vitality canned dog food products and have found them all to be of good quality  the product looks more like a stew than a processed meat and it smells better  my labrador is finicky and she appreciates this product better than most '],
 'highlights': ['good quality dog food']}

In [None]:
amazon_data[5]

{'story': ['i got a wild hair for taffy and ordered this five pound bag  the taffy was all very enjoyable with many flavors  watermelon  root beer  melon  peppermint  grape  etc  my only complaint is there was a bit too much red black licorice flavored pieces  just not my particular favorites   between me  my kids  and my husband  this lasted only two weeks  i would recommend this brand of taffy    it was a delightful treat '],
 'highlights': ['nice taffy']}

In [None]:
len(amazon_data)

568411

___

---

## Limiting the data to 2l points

In [None]:
# amazon_data= amazon_data[:200000]

___

## Counts the occurrences of each word

In [None]:
def count_words(count_dict, text):
    '''Count the number of occurrences of each word in a set of text'''
    for word in text.split():
        if word not in count_dict:
            count_dict[word] = 1
        else:
            count_dict[word] += 1

In [None]:
word_counts = {}
for data_point in amazon_data:
    count_words(word_counts, data_point['highlights'][0])
    count_words(word_counts, data_point['story'][0])

print("Size of Vocabulary:", len(word_counts))

Size of Vocabulary: 118436


In [None]:
print(word_counts)



## Load the Embeddings

In [None]:
import numpy as np
embeddings_index = {}
with open('/home/nikhil/Music/Text-Summarization/Embeddings/numberbatch-en-17.06.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split(' ')
        word = values[0]
        embedding = np.array(values[1:], dtype='float32')
        embeddings_index[word] = embedding

print('Word embeddings:', len(embeddings_index))

Word embeddings: 417195


In [None]:
del embeddings_index['417194']

In [None]:
embeddings_index

{'##': array([-2.640e-02,  4.680e-02, -9.900e-03, -2.420e-02, -7.620e-02,
         5.620e-02,  8.630e-02,  1.150e-02, -4.710e-02,  4.420e-02,
        -8.750e-02,  3.760e-02, -4.040e-02, -8.600e-03,  1.610e-02,
        -1.689e-01,  1.485e-01, -2.010e-02,  1.021e-01, -6.350e-02,
        -3.170e-02,  1.420e-02,  5.880e-02, -1.299e-01, -9.050e-02,
         3.890e-02, -4.520e-02,  1.352e-01,  7.310e-02,  6.480e-02,
         1.309e-01,  4.930e-02,  7.850e-02,  1.540e-02, -1.015e-01,
         1.603e-01,  4.290e-02, -8.600e-02,  1.120e-02, -2.340e-02,
        -3.170e-02, -1.770e-02,  2.620e-02, -8.540e-02, -5.020e-02,
         6.000e-03, -3.890e-02, -3.060e-02,  1.039e-01, -5.900e-03,
         5.810e-02,  3.870e-02, -7.100e-03, -4.590e-02,  8.890e-02,
        -2.250e-02, -1.519e-01,  1.830e-02, -4.990e-02,  3.100e-03,
        -1.350e-02,  7.810e-02,  7.790e-02,  7.640e-02, -3.490e-02,
         2.370e-02, -9.130e-02, -1.350e-02,  1.970e-02, -8.590e-02,
         9.770e-02,  5.550e-02, -1.392e-01

In [None]:
# Find the number of words that are missing from CN, and are used more than our threshold.
missing_words = 0
threshold = 20

for word, count in word_counts.items():
    if count > threshold:
        if word not in embeddings_index:
            missing_words += 1
            
missing_ratio = round(missing_words/len(word_counts),5)*100
            
print("Number of words missing from CN:", missing_words)
print("Percent of words that are missing from vocabulary: {0}%".format(missing_ratio))

Number of words missing from CN: 2529
Percent of words that are missing from vocabulary: 2.1350000000000002%


In [None]:
# with open("/home/nikhil/Music/Text-Summarization/Embeddings/wiki-news-300d-1M.vec") as f:
#     count= 0
#     for line in f:
#         vals= line.split()
#         word= vals[0]
#         vector= vals[1:]
#         count= count+1
#         print(len(vector))
#         if count==10:
#             break
#         print(word)
        
        

In [None]:
word_counts

{'good': 249392,
 'quality': 40585,
 'dog': 81862,
 'food': 138785,
 'i': 1692375,
 'have': 433720,
 'bought': 50661,
 'several': 20281,
 'of': 799282,
 'the': 1889227,
 'vitality': 478,
 'canned': 11514,
 'products': 32379,
 'and': 1300685,
 'found': 54697,
 'them': 215506,
 'all': 162635,
 'to': 1017135,
 'be': 189817,
 'product': 169622,
 'looks': 9322,
 'more': 136854,
 'like': 265350,
 'a': 1221915,
 'stew': 1228,
 'than': 118247,
 'processed': 3853,
 'meat': 14226,
 'it': 1089345,
 'smells': 9986,
 'better': 76617,
 'my': 486120,
 'labrador': 383,
 'is': 935565,
 'finicky': 2417,
 'she': 77822,
 'appreciates': 146,
 'this': 689266,
 'most': 44610,
 'not': 622310,
 'as': 283559,
 'advertised': 3268,
 'arrived': 17505,
 'labeled': 1526,
 'jumbo': 364,
 'salted': 1897,
 'peanuts': 4561,
 'were': 83420,
 'actually': 26980,
 'small': 34796,
 'sized': 4566,
 'unsalted': 738,
 'sure': 31985,
 'if': 170787,
 'was': 328910,
 'an': 91548,
 'error': 893,
 'or': 180471,
 'vendor': 2642,
 'in

In [None]:
#dictionary to convert words to integers
vocab_to_int = {}

value = 0
for word, count in word_counts.items():
    if count >= threshold or word in embeddings_index: #evevn if the count is less than threshold, we have its embedding
        vocab_to_int[word] = value
        value += 1

# Special tokens that will be added to our vocab
codes = ["<UNK>","<PAD>","<EOS>","<GO>"]

# Add codes to vocab
for code in codes:
    vocab_to_int[code] = len(vocab_to_int)

# Dictionary to convert integers to words
int_to_vocab = {}
for word, value in vocab_to_int.items():
    int_to_vocab[value] = word

usage_ratio = round(len(vocab_to_int) / len(word_counts),4)*100

print("Total number of unique words:", len(word_counts))
print("Number of words we will use:", len(vocab_to_int))
print("Percent of words we will use: {}%".format(usage_ratio))

Total number of unique words: 118436
Number of words we will use: 58469
Percent of words we will use: 49.370000000000005%


In [None]:
# Need to use 300 for embedding dimensions to match CN's vectors.
embedding_dim = 300
nb_words = len(vocab_to_int)

word_embedding_matrix = np.zeros((nb_words, embedding_dim), dtype=np.float32)
for word, i in vocab_to_int.items():
    if word in embeddings_index:
        word_embedding_matrix[i] = embeddings_index[word]
    else:
        # If word not in CN, create a random embedding for it
        new_embedding = np.array(np.random.uniform(-1.0, 1.0, embedding_dim))
        embeddings_index[word] = new_embedding
        word_embedding_matrix[i] = new_embedding

# Check if value matches len(vocab_to_int)
print(len(word_embedding_matrix))

58469


In [None]:
word_embedding_matrix.shape

(58469, 300)

In [None]:
def convert_to_ints(sentence, word_count, unk_count, eos=False):
    '''Convert words in text to an integer.
       If word is not in vocab_to_int, use UNK's integer.
       Total the number of words and UNKs.
       Add EOS token to the end of texts'''
    
    sentence_ints = []
    for word in sentence.split():
        word_count += 1
        if word in vocab_to_int:
            sentence_ints.append(vocab_to_int[word])
        else:
            sentence_ints.append(vocab_to_int["<UNK>"])
            unk_count += 1
    if eos:
        sentence_ints.append(vocab_to_int["<EOS>"])
    return sentence_ints, word_count, unk_count

In [None]:
word_count = 0
unk_count = 0

int_summaries= list()
int_texts= list()

for data_point in amazon_data:
    summaries, word_count, unk_count = convert_to_ints(data_point['highlights'][0], word_count, unk_count)
    texts, word_count, unk_count = convert_to_ints(data_point['story'][0], word_count, unk_count, eos=True)
    int_summaries.append(summaries)
    int_texts.append(texts)
    

unk_percent = round(unk_count/word_count,4)*100

print("Total number of words in headlines:", word_count)
print("Total number of UNKs in headlines:", unk_count)
print("Percent of words that are UNK: {}%".format(unk_percent))

Total number of words in headlines: 47820426
Total number of UNKs in headlines: 148883
Percent of words that are UNK: 0.31%


In [None]:
import pandas as pd
def create_lengths(text):
    '''Create a data frame of the sentence lengths from a text'''
    lengths = []
    for sentence in text:
        lengths.append(len(sentence))
    return pd.DataFrame(lengths, columns=['counts'])

lengths_summaries = create_lengths(int_summaries)
lengths_texts = create_lengths(int_texts)

In [None]:
lengths_texts.head()

Unnamed: 0,counts
0,49
1,33
2,94
3,42
4,28


In [None]:
print("Summaries:")
print(lengths_summaries.describe())
print("Texts:")
print(lengths_texts.describe())

Summaries:
              counts
count  568411.000000
mean        4.181624
std         2.657872
min         0.000000
25%         2.000000
50%         4.000000
75%         5.000000
max        48.000000
Texts:
              counts
count  568411.000000
mean       80.948391
std        77.920658
min         1.000000
25%        35.000000
50%        58.000000
75%        99.000000
max      3475.000000


In [None]:
print(lengths_summaries.shape)
lengths_summaries

(568411, 1)


Unnamed: 0,counts
0,4
1,3
2,4
3,2
4,2
5,2
6,8
7,3
8,2
9,3


In [None]:
# Inspect the length of texts
print(np.percentile(lengths_texts.counts, 90))
print(np.percentile(lengths_texts.counts, 95))
print(np.percentile(lengths_texts.counts, 99))
print(np.percentile(lengths_texts.counts, 10))

161.0
217.0
382.0
25.0


In [None]:
# Inspect the length of summaries
print(np.percentile(lengths_summaries.counts, 90))
print(np.percentile(lengths_summaries.counts, 95))
print(np.percentile(lengths_summaries.counts, 99))
print(np.percentile(lengths_summaries.counts, 10))

8.0
9.0
13.0
2.0


In [None]:
def unk_counter(sentence):
    '''Counts the number of time UNK appears in a sentence.'''
    unk_count = 0
    for word in sentence:
        if word == vocab_to_int["<UNK>"]:
            unk_count += 1
    return unk_count

In [None]:
min(lengths_texts.counts)

1

In [None]:
# takes a long time  , this is normal

# Sort the summaries and texts by the length of the texts, shortest to longest
# Limit the length of summaries and texts based on the min and max ranges.
# Remove reviews that include too many UNKs

sorted_summaries = []
sorted_texts = []
max_text_length = 161
max_summary_length = 8
summ_min_length = 2
text_min_length= 25
unk_text_limit = 1
unk_summary_limit = 0

for length in range(min(lengths_texts.counts), max_text_length): 
    for count, words in enumerate(int_summaries):
        if (len(int_summaries[count]) >= summ_min_length and
            len(int_texts[count]) >= text_min_length and
            len(int_summaries[count]) <= max_summary_length and
            unk_counter(int_summaries[count]) <= unk_summary_limit and
            unk_counter(int_texts[count]) <= unk_text_limit and
            length == len(int_texts[count])  #SO that points are not repeated
           ):
            
            sorted_summaries.append(int_summaries[count])
            sorted_texts.append(int_texts[count])
        
# Compare lengths to ensure they match
print(len(sorted_summaries))
print(len(sorted_texts))

363600
363600


In [None]:
sorted_texts

[[37,
  137,
  33,
  209,
  0,
  28,
  33,
  92,
  188,
  13,
  91,
  9,
  156,
  99,
  210,
  4,
  181,
  211,
  95,
  98,
  212,
  28,
  92,
  213,
  58467],
 [9,
  295,
  54,
  136,
  252,
  9,
  1293,
  187,
  1269,
  677,
  848,
  522,
  76,
  1294,
  53,
  98,
  848,
  76,
  655,
  1291,
  98,
  637,
  1295,
  134,
  58467],
 [37,
  33,
  55,
  560,
  1258,
  255,
  8,
  9,
  241,
  4,
  5,
  439,
  187,
  28,
  33,
  916,
  136,
  271,
  98,
  597,
  28,
  74,
  23,
  2597,
  58467],
 [304,
  54,
  136,
  252,
  4,
  3225,
  17,
  518,
  3223,
  3224,
  1519,
  243,
  39,
  9,
  1933,
  226,
  9,
  243,
  5,
  868,
  17,
  689,
  9,
  1933,
  58467],
 [298,
  3227,
  40,
  257,
  22,
  17,
  245,
  3228,
  3021,
  138,
  747,
  261,
  23,
  334,
  226,
  37,
  19,
  458,
  1138,
  40,
  1767,
  181,
  518,
  522,
  58467],
 [423,
  3435,
  33,
  31,
  405,
  182,
  3431,
  33,
  31,
  405,
  563,
  469,
  423,
  3435,
  92,
  953,
  134,
  39,
  90,
  975,
  13,
  39,
  90,
  27

In [None]:
len(vocab_to_int)

58469

In [None]:
dump(sorted_texts, open('/home/nikhil/Music/Text-Summarization/Processed_data/Amazon_data/sorted_texts.pkl', 'wb'))
dump(sorted_summaries, open('/home/nikhil/Music/Text-Summarization/Processed_data/Amazon_data/sorted_summaries.pkl', 'wb'))
dump(word_embedding_matrix, open('/home/nikhil/Music/Text-Summarization/Processed_data/Amazon_data/word_embedding_matrix.pkl', 'wb'))
dump(vocab_to_int, open('/home/nikhil/Music/Text-Summarization/Processed_data/Amazon_data/vocab_to_int.pkl', 'wb'))
dump(int_to_vocab, open('/home/nikhil/Music/Text-Summarization/Processed_data/Amazon_data/int_to_vocab.pkl', 'wb'))

In [None]:
from pickle import dump, load
sorted_texts= load(open('/home/nikhil/Music/Text-Summarization/Processed_data/Amazon_data/sorted_texts.pkl', 'rb'))
sorted_summaries= load(open('/home/nikhil/Music/Text-Summarization/Processed_data/Amazon_data/sorted_summaries.pkl', 'rb'))
word_embedding_matrix= load(open('/home/nikhil/Music/Text-Summarization/Processed_data/Amazon_data/word_embedding_matrix.pkl', 'rb'))
vocab_to_int= load(open('/home/nikhil/Music/Text-Summarization/Processed_data/Amazon_data/vocab_to_int.pkl', 'rb'))
int_to_vocab= load(open('/home/nikhil/Music/Text-Summarization/Processed_data/Amazon_data/int_to_vocab.pkl', 'rb'))

In [None]:
index= 200000

sample_text= sorted_texts[index]
sample_summary= sorted_summaries[index]

sent= ""
for word in sample_text:
    word= int_to_vocab[word]
    sent += ' ' + word
    
sent

' i have just found out today that the gelatin in this product is coming from pork i do not know why they would add gelatin in the first place dha from fish oil is fine but the lady that answered my phone told me that all dha gerber baby food items have pork gelatin do infants need gelatin at this age <EOS>'

In [None]:
summ= ""
for word in sample_summary:
    summ += int_to_vocab[word] + " "
    
summ

'pork gelatin '

## Text Padding

In [None]:
max_text_length= 217
max_summary_length= 9 + 1

def transform_input_text(texts):
    padded_texts= list()
    for text in texts:
        if len(text) < max_text_length:
            text= text + [vocab_to_int['<PAD>'] for i in range(max_text_length - len(text))]
            
        else:
            text= text[:max_text_length]
        padded_texts.append(text)
        
    return padded_texts

# def transform_output_text(texts):
#     padded_texts= list()
#     max_length_texts= list()
    
#     for text in texts:
#         text= text[:max_summary_length]
#         max_length_texts.append(text)
    
#     for text in max_length_texts:
#         text= [vocab_to_int['<GO>']] + text + [vocab_to_int['<EOS>']]
#         if len(text) < max_summary_length:
#             text= text + [vocab_to_int['<PAD>'] for i in range(max_summary_length - len(text))]
            
#         padded_texts.append(text)
        
#     return padded_texts


In [None]:
padded_sorted_texts= transform_input_text(sorted_texts)
padded_sorted_summaries= sorted_summaries

# padded_sorted_summaries= transform_output_text(sorted_summaries)

In [None]:
len(padded_sorted_texts)

363600

In [None]:
word_embedding_matrix.shape

(58469, 300)

## Model Building
The encoder-decoder model for recurrent neural networks is an architecture for sequence-to-sequence prediction problems.

It is comprised of two sub-models, as its name suggests:

    Encoder: The encoder is responsible for stepping through the input time steps and encoding the entire sequence into a fixed length vector called a context vector.
    Decoder: The decoder is responsible for stepping through the output time steps while reading from the context vector.


## Real-time Data Generation for Keras model

In [None]:
def pad_text(text):
    '''Pad text upto a max length'''
    if len(text) < max_summary_length:
        text= text + [vocab_to_int['<PAD>'] for i in range(max_summary_length - len(text))]
        
    return text

In [None]:
import keras

BATCH_SIZE= 200
class DataGenerator(keras.utils.Sequence):
    'Generates batches of data for keras'
    def __init__(self, texts, summaries, word_embedding_matrix, batch_size= BATCH_SIZE,
                 n_classes= len(vocab_to_int), shuffle= True, EMBEDDING_SIZE= 300, mode= 'train'):
        'Initialization'
        
        self.batch_size = batch_size
        self.max_text_length= max_text_length
        
        self.max_summ_length= max_summary_length
        self.vocab_to_int= vocab_to_int
        self.n_classes = n_classes
        self.shuffle = shuffle
        self.mode= mode
        
        brk_point= int(len(texts)*0.8)
        if self.mode == 'train':
            texts= texts[:brk_point]
            summaries= summaries[:brk_point]
        else:
            texts= texts[brk_point:]
            summaries= summaries[brk_point:]

        self.texts= texts
        self.summaries= summaries
        
        self.word_embedding_matrix= word_embedding_matrix
        self.EMBEDDING_SIZE= EMBEDDING_SIZE
        
        self.on_epoch_end()
        
    def __len__(self):   #Correct
        'Denotes the number of batches per epoch'
        return int(np.floor(len(self.texts) / self.batch_size))
    
    def __getitem__(self, index):
        'Generate one batch of data'
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        [X, Z], y = self.__data_generation(indexes)
        return [X, Z], y
    
    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.indexes = np.arange(len(self.texts))
        if self.shuffle == True:
            np.random.shuffle(self.indexes)
            
    def __data_generation(self, indexes):
        'Generates data containing batch_size samples' 
        # Initialization
        X = np.empty((self.batch_size, self.max_text_length, self.EMBEDDING_SIZE))
        y = np.empty((self.batch_size, self.max_summ_length, self.n_classes), dtype=int)
        Z = np.empty((self.batch_size, self.max_summ_length, self.EMBEDDING_SIZE))

        # Generate data
        for i, ID in enumerate(indexes):
            
            text_data_point= self.texts[ID]
            summary_data_point= self.summaries[ID]
            
            for j, word in enumerate(text_data_point):
                emb= self.word_embedding_matrix[word]
                X[i, j, ]= emb
            
            input_summary_point= [self.vocab_to_int["<GO>"]] + summary_data_point
            output_summary_point= summary_data_point
            
            input_summary_point= input_summary_point[:max_summary_length]
            output_summary_point= output_summary_point[:max_summary_length -1]
            
            output_summary_point= output_summary_point + [self.vocab_to_int['<EOS>']]
            
            
            input_summary_point= pad_text(input_summary_point)
            output_summary_point= pad_text(output_summary_point)
            
            for j, word in enumerate(input_summary_point):
                
                emb= self.word_embedding_matrix[word]
                Z[i, j, ]= emb
                
            for j, word in enumerate(output_summary_point):
                temp= np.zeros((self.n_classes, ))
                temp[word]= 1
                y[i, j, ]= temp  

        return [X, Z], y

Using TensorFlow backend.


In [None]:
import numpy as np
train_gen= DataGenerator(padded_sorted_texts, padded_sorted_summaries, word_embedding_matrix, mode= 'train')
val_gen= DataGenerator(padded_sorted_texts, padded_sorted_summaries, word_embedding_matrix, mode= 'val')

## Paths

In [None]:
import time

model_path= '/home/nikhil/Music/Text-Summarization/Saved_models/Model-{}'.format(int(time.time()))
encoder_path= '/home/nikhil/Music/Text-Summarization/Saved_models/Encoder-{}'.format(int(time.time()))
decoder_path= '/home/nikhil/Music/Text-Summarization/Saved_models/Decoder-{}'.format(int(time.time()))

logs_dir= "/home/nikhil/Music/Text-Summarization/Scripts/On_full_data/logs/Text_Summ-{}".format(int(time.time()))

In [None]:
logs_dir

'/home/nikhil/Music/Text-Summarization/Scripts/On_full_data/logs/Text_Summ-1555016969'

In [None]:
import os

os.mkdir(model_path)
os.mkdir(logs_dir)

## CallBacks

### CYCLical Learning Rate

In [None]:
from CLR_master.clr_callback import *

clr_triangular = CyclicLR(mode='triangular', base_lr= 0.001,max_lr= 0.02, step_size= 3500)

### Tensorboard Callback
* tensorboard --logdir=/Checkpoints/logs

In [None]:
tb_callback= keras.callbacks.TensorBoard(log_dir= logs_dir, histogram_freq=0,
                                         batch_size=BATCH_SIZE, write_graph=True)

## Architecture 1

In [None]:
vocab_size= len(vocab_to_int)
EMBEDDING_SIZE= 300
HIDDEN_UNITS= 600

word_embedding_matrix= word_embedding_matrix
unknown_emb= word_embedding_matrix[vocab_to_int['<UNK>']]

#Architecture
encoder_inputs= Input(shape= (None, EMBEDDING_SIZE), name= 'encoder_inputs')
encoder_lstm = LSTM(units= HIDDEN_UNITS, dropout=0.4, return_state= True, name= 'encoder_lstm')

encoder_outputs, encoder_state_h, encoder_state_c = encoder_lstm(encoder_inputs)
encoder_states = [encoder_state_h, encoder_state_c]

decoder_inputs = Input(shape= (None, EMBEDDING_SIZE), name= 'decoder_inputs')
decoder_lstm = LSTM(units= HIDDEN_UNITS, return_state=True, dropout=0.4, return_sequences=True, name='decoder_lstm')

decoder_outputs, decoder_state_h, decoder_state_c = decoder_lstm(decoder_inputs, initial_state= encoder_states)
decoder_dense = Dense(units= vocab_size, activation= 'softmax', name= 'decoder_dense')
decoder_outputs = decoder_dense(decoder_outputs)

model = Model(inputs= [encoder_inputs, decoder_inputs], outputs= decoder_outputs)

model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_inputs = [Input(shape= (HIDDEN_UNITS,)), Input(shape= (HIDDEN_UNITS,))]
decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state= decoder_state_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)

decoder_model = Model([decoder_inputs] + decoder_state_inputs, [decoder_outputs] + decoder_states)

In [None]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_inputs (InputLayer)     (None, None, 300)    0                                            
__________________________________________________________________________________________________
decoder_inputs (InputLayer)     (None, None, 300)    0                                            
__________________________________________________________________________________________________
encoder_lstm (LSTM)             [(None, 600), (None, 2162400     encoder_inputs[0][0]             
__________________________________________________________________________________________________
decoder_lstm (LSTM)             [(None, None, 600),  2162400     decoder_inputs[0][0]             
                                                                 encoder_lstm[0][1]               
          

In [None]:
encoder_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
encoder_inputs (InputLayer)  (None, None, 300)         0         
_________________________________________________________________
encoder_lstm (LSTM)          [(None, 600), (None, 600) 2162400   
Total params: 2,162,400
Trainable params: 2,162,400
Non-trainable params: 0
_________________________________________________________________


## Architecture 2

In [None]:
"""__encoder___"""
encoder_inputs = Input(shape=en_shape)

encoder_LSTM = LSTM(hidden_units, dropout_U = 0.2, dropout_W = 0.2 ,return_state=True)
encoder_LSTM_rev=LSTM(hidden_units,return_state=True,go_backwards=True)

encoder_outputsR, state_hR, state_cR = encoder_LSTM_rev(encoder_inputs)
encoder_outputs, state_h, state_c = encoder_LSTM(encoder_inputs)

state_hfinal=Add()([state_h,state_hR])
state_cfinal=Add()([state_c,state_cR])

encoder_states = [state_hfinal,state_cfinal]

"""____decoder___"""
decoder_inputs = Input(shape=(None,de_shape[1]))
decoder_LSTM = LSTM(hidden_units,return_sequences=True,return_state=True)
decoder_outputs, _, _ = decoder_LSTM(decoder_inputs,initial_state=encoder_states) 
decoder_dense = Dense(de_shape[1],activation='linear')
decoder_outputs = decoder_dense(decoder_outputs)

model= Model(inputs=[encoder_inputs,decoder_inputs], outputs=decoder_outputs)

In [None]:
from keras.utils.vis_utils import plot_model
plot_model(encoder_model, to_file='encoder.png', show_shapes=True, show_layer_names=True)
plot_model(model, to_file='model.png', show_shapes=True, show_layer_names=True)
plot_model(decoder_model, to_file='decoder.png', show_shapes=True, show_layer_names=True)

___

In [None]:
from keras.models import Model, load_model
from keras.layers import Input, LSTM, Dense
from keras.callbacks import ModelCheckpoint

class Seq2seq:
    def __init__(self, vocab_to_int, int_to_vocab, word_embedding_matrix, load_model_path, encoder_path, decoder_path):
        '''Initialization'''
        self.max_text_length= max_text_length
        self.max_summary_length= max_summary_length
        self.vocab_to_int= vocab_to_int
        self.int_to_vocab= int_to_vocab
        self.vocab_size= len(self.vocab_to_int)
        self.EMBEDDING_SIZE= 300
        self.HIDDEN_UNITS= 600
        
        self.load_model_path= load_model_path
        self.encoder_path= encoder_path
        self.decoder_path= decoder_path
        
        self.word_embedding_matrix= word_embedding_matrix
        self.unknown_emb= self.word_embedding_matrix[self.vocab_to_int['<UNK>']]
      
        #Architecture
        encoder_inputs= Input(shape= (None, self.EMBEDDING_SIZE), name= 'encoder_inputs')
        encoder_lstm = LSTM(units= self.HIDDEN_UNITS, return_state= True, name= 'encoder_lstm', 
                            dropout= 0.2, recurrent_dropout= 0.2)
        
        encoder_outputs, encoder_state_h, encoder_state_c = encoder_lstm(encoder_inputs)
        encoder_states = [encoder_state_h, encoder_state_c]

        decoder_inputs = Input(shape= (None, self.EMBEDDING_SIZE), name= 'decoder_inputs')
        decoder_lstm = LSTM(units= self.HIDDEN_UNITS, return_state=True, return_sequences=True, name='decoder_lstm',
                            dropout= 0.2, recurrent_dropout= 0.2)
        
        decoder_outputs, decoder_state_h, decoder_state_c = decoder_lstm(decoder_inputs,
                                                                         initial_state= encoder_states)
        decoder_dense = Dense(units= self.vocab_size, activation= 'softmax', name= 'decoder_dense')
        decoder_outputs = decoder_dense(decoder_outputs)

        model = Model(inputs= [encoder_inputs, decoder_inputs], outputs= decoder_outputs)

        model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

        self.model = model

        self.encoder_model = Model(encoder_inputs, encoder_states)

        decoder_state_inputs = [Input(shape= (self.HIDDEN_UNITS,)), Input(shape= (self.HIDDEN_UNITS,))]
        decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state= decoder_state_inputs)
        decoder_states = [state_h, state_c]
        decoder_outputs = decoder_dense(decoder_outputs)
        
        self.decoder_model = Model([decoder_inputs] + decoder_state_inputs, [decoder_outputs] + decoder_states)
    
    def fit(self, train_gen, val_gen):
        '''Training'''
        
        checkpoint_name = 'Weights-{epoch:03d}--{val_loss:.5f}.h5'
        checkpoint = ModelCheckpoint(model_path + "/" + checkpoint_name,
                                     monitor='val_loss', verbose=1, save_best_only=True, mode='min')
     
        self.model.fit_generator(generator= train_gen, validation_data= val_gen,
                                 callbacks= [tb_callback, clr_triangular, checkpoint], epochs=100)

#         self.model.save(self.model_path)  # creates a HDF5 file 'my_model.h5'
#         self.encoder_model.save(self.encoder_path)
#         self.decoder_model.save(self.decoder_path)
        
        return self.model, self.encoder_model, self.decoder_model
    
    def load_model(self):
        """Loads the model at the given paths"""
        model= load_model(self.load_model_path)
        encoder= load_model(self.encoder_path)
        decoder= load_model(self.decoder_path)
        
        return model, encoder, decoder
    
    
    def visualize_clr():
        
        plt.xlabel('Training Iterations')
        plt.ylabel('Learning Rate')
        plt.title("CLR - 'triangular' Policy")
        plt.plot(clr_triangular.history['iterations'], clr_triangular.history['lr'])

    def summarize(self, input_text, model, encoder_model, decoder_model):
        
#         self.model= model
#         self.encoder_model= encoder_model
#         self.decoder_model= decoder_model
        
        input_seq = np.zeros((1, self.max_text_length, self.EMBEDDING_SIZE))
        for idx, word in enumerate(input_text.lower().split(' ')):
            if idx >= self.max_text_length:
                break
            emb = self.unknown_emb  # default [UNK]
            if word in self.vocab_to_int:
                emb = self.word_embedding_matrix[self.vocab_to_int[word]]
            input_seq[0, idx, :] = emb
        
        states_value = encoder_model.predict(input_seq)
        target_seq = np.zeros((1, 1, self.EMBEDDING_SIZE))
        target_seq[0, 0, :] = self.word_embedding_matrix[self.vocab_to_int['<GO>']]
        target_text = ''
        target_text_len = 0
        terminated = False
        while not terminated:
            output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

            sample_token_idx = np.argmax(output_tokens[0, -1, :])
            sample_word = self.int_to_vocab[sample_token_idx]
            target_text_len += 1

#             if sample_word != '<GO>' and sample_word != '<EOS>':
            target_text += ' ' + sample_word

            if sample_word == '<EOS>' or target_text_len >= self.max_summary_length:
                terminated = True

            if sample_word in self.vocab_to_int:
                target_seq[0, 0, :] = self.word_embedding_matrix[self.vocab_to_int[sample_word]]

            else:
                target_seq[0, 0, :] = self.unknown_emb

            states_value = [h, c]
        return target_text.strip()

In [None]:
obj= Seq2seq(vocab_to_int, int_to_vocab, word_embedding_matrix,
             load_model_path='/home/nikhil/Music/Text-Summarization/Saved_models/model.h5',
             decoder_path= '/home/nikhil/Music/Text-Summarization/Saved_models/decoder.h5', 
             encoder_path='/home/nikhil/Music/Text-Summarization/Saved_models/encoder.h5')

In [None]:
model, encoder, decoder= obj.fit(train_gen, val_gen)

Epoch 1/100
  91/1454 [>.............................] - ETA: 31:45 - loss: 3.1007 - acc: 0.6069

In [None]:
obj.visualize_clr()

TypeError: visualize_clr() takes 0 positional arguments but 1 was given

# Load the model

In [None]:
HIDDEN_UNITS= 600

model = load_model('/home/nikhil/Music/Text-Summarization/Saved_models/Model-1554991745/Weights-001--2.75836.h5')

encoder_inputs = model.input[0]  # input_1
encoder_outputs, state_h_enc, state_c_enc = model.layers[2].output   # lstm_1
encoder_states = [state_h_enc, state_c_enc]
encoder_model = Model(encoder_inputs, encoder_states)

decoder_inputs = model.input[1]   # input_2
decoder_state_input_h = Input(shape=(HIDDEN_UNITS,), name='input_3')
decoder_state_input_c = Input(shape=(HIDDEN_UNITS,), name='input_4')
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_lstm = model.layers[3]

decoder_outputs, state_h_dec, state_c_dec = decoder_lstm(decoder_inputs, initial_state= decoder_states_inputs)
decoder_states = [state_h_dec, state_c_dec]

decoder_dense = model.layers[4]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)

In [None]:
# from keras.models import Model, load_model
# model, encoder, decoder= obj.load_model()

In [None]:
sample_text= sorted_texts[1500]
# sample_summary= sorted_summaries[105]

sample_sent_text= list()
for word in sample_text:
    sample_sent_text.append(int_to_vocab[word])

In [None]:
sent=""
for i in range(len(sample_sent_text)-1):
    sent+=sample_sent_text[i] + " " 
    
sent

'i bought these petrodex chews for my dogs because i like petrodex toothpaste '

In [None]:
summarize(sent, model, encoder_model, decoder_model)

'anecdotal been <EOS>'

In [None]:
# vocab_size= len(vocab_to_int)
# EMBEDDING_SIZE= 300
# HIDDEN_UNITS= 1024

# model_path= '/home/nikhil/Music/Text-Summarization/Saved_models/model.h5'
# decoder_path= "/home/nikhil/Music/Text-Summarization/Saved_models/decoder.h5"
# encoder_path= "/home/nikhil/Music/Text-Summarization/Saved_models/encoder.h5"


# #         self.word_embedding_matrix= word_embedding_matrix
# unknown_emb= word_embedding_matrix[vocab_to_int['<UNK>']]

# encoder_inputs= Input(shape= (None, EMBEDDING_SIZE), name= 'encoder_inputs')
# encoder_lstm = LSTM(units= HIDDEN_UNITS, return_state= True, name= 'encoder_lstm')

# encoder_outputs, encoder_state_h, encoder_state_c = encoder_lstm(encoder_inputs)
# encoder_states = [encoder_state_h, encoder_state_c]

# encoder_model = Model(encoder_inputs, encoder_states)


# decoder_inputs = Input(shape= (None, EMBEDDING_SIZE), name= 'decoder_inputs')
# decoder_lstm = LSTM(units= HIDDEN_UNITS, return_state=True, return_sequences=True, name='decoder_lstm')

# decoder_outputs, decoder_state_h, decoder_state_c = decoder_lstm(decoder_inputs,
#                                                                  initial_state= encoder_states)
# decoder_dense = Dense(units= vocab_size, activation= 'softmax', name= 'decoder_dense')
# decoder_outputs = decoder_dense(decoder_outputs)

# model = Model(inputs= [encoder_inputs, decoder_inputs], outputs= decoder_outputs)

# model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

# decoder_state_inputs = [Input(shape= (HIDDEN_UNITS,)), Input(shape= (HIDDEN_UNITS,))]
# decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state= decoder_state_inputs)
# decoder_states = [state_h, state_c]
# decoder_outputs = decoder_dense(decoder_outputs)

# decoder_model = Model([decoder_inputs] + decoder_state_inputs, [decoder_outputs] + decoder_states)


# from keras.models import load_model
# from keras.callbacks import ModelCheckpoint

# checkpoint = ModelCheckpoint(model_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
        
# model.fit_generator(generator= train_gen, validation_data= val_gen, epochs=25) #, callbacks= [checkpoint])

# model.save(model_path)  # creates a HDF5 file 'my_model.h5'

# encoder_model.save(encoder_path)
# decoder_model.save(decoder_path)

# model = load_model(model_path)
# encoder_model = load_model(encoder_path)
# decoder_model = load_model(decoder_path)

In [None]:
# obj= Seq2seq(vocab_to_int, int_to_vocab, word_embedding_matrix)
# model, encoder, decoder= obj.fit(train_gen, val_gen)

In [None]:
# encoder_inputs = model.input[0]   # input_1
# encoder_outputs, state_h_enc, state_c_enc = model.layers[2].output   # lstm_1
# encoder_states = [state_h_enc, state_c_enc]
# encoder_model = Model(encoder_inputs, encoder_states)

# decoder_inputs = model.input[1]   # input_2
# decoder_state_input_h = Input(shape=(HIDDEN_UNITS,), name='input_3')
# decoder_state_input_c = Input(shape=(HIDDEN_UNITS,), name='input_4')
# decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
# decoder_lstm = model.layers[3]

# decoder_outputs, state_h_dec, state_c_dec = decoder_lstm(decoder_inputs, 
#                                                          initial_state= decoder_states_inputs)

# decoder_states = [state_h_dec, state_c_dec]
# decoder_dense = model.layers[4]

# decoder_outputs = decoder_dense(decoder_outputs)
# decoder_model = Model(
#     [decoder_inputs] + decoder_states_inputs,
#     [decoder_outputs] + decoder_states)

In [None]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_inputs (InputLayer)     (None, None, 300)    0                                            
__________________________________________________________________________________________________
decoder_inputs (InputLayer)     (None, None, 300)    0                                            
__________________________________________________________________________________________________
encoder_lstm (LSTM)             [(None, 100), (None, 160400      encoder_inputs[0][0]             
__________________________________________________________________________________________________
decoder_lstm (LSTM)             [(None, None, 100),  160400      decoder_inputs[0][0]             
                                                                 encoder_lstm[0][1]               
          

In [None]:
decoder_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
decoder_inputs (InputLayer)     (None, None, 300)    0                                            
__________________________________________________________________________________________________
input_1 (InputLayer)            (None, 100)          0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 100)          0                                            
__________________________________________________________________________________________________
decoder_lstm (LSTM)             [(None, None, 100),  160400      decoder_inputs[0][0]             
                                                                 input_1[0][0]                    
          

## Prediction Time

In [None]:
EMBEDDING_SIZE= 300

def summarize(input_text, model, encoder_model, decoder_model):
    """
    Sumarize
    """
    input_seq = np.zeros((1, max_text_length, EMBEDDING_SIZE))
    for idx, word in enumerate(input_text.lower().split(' ')):
        if idx >= max_text_length:
            break
        emb = word_embedding_matrix[vocab_to_int['<UNK>']]  # default [UNK]
        if word in vocab_to_int:
            emb = word_embedding_matrix[vocab_to_int[word]]
        input_seq[0, idx, :] = emb

    states_value = encoder_model.predict(input_seq)
    target_seq = np.zeros((1, 1, EMBEDDING_SIZE))
    target_seq[0, 0, :] = word_embedding_matrix[vocab_to_int['<GO>']]
    target_text = ''
    target_text_len = 0
    terminated = False
    while not terminated:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        sample_token_idx = np.argmax(output_tokens[0, -1, :])
        sample_word = int_to_vocab[sample_token_idx]
        target_text_len += 1

#         if sample_word != '<GO>' and sample_word != '<EOS>':
        target_text += ' ' + sample_word

        if sample_word == '<EOS>' or target_text_len >= max_summary_length:
            terminated = True

        if sample_word in vocab_to_int:
            target_seq[0, 0, :] = word_embedding_matrix[vocab_to_int[sample_word]]

        else:
            target_seq[0, 0, :] = word_embedding_matrix[vocab_to_int['<UNK>']]

        states_value = [h, c]
    return target_text.strip()

## Summarise 

In [None]:
summarize(sent, model, encoder_model, decoder_model)

'believe not be be ok bitter bother exceptional understand'

In [None]:
from keras.utils.vis_utils import plot_model
import matplotlib.pyplot as plt
plot_model(decoder, show_shapes= True, show_layer_names= True)

___

___

In [None]:
vocab_size = 
src_txt_length = 
sum_txt_length = 
# encoder input model
inputs = Input(shape=(src_txt_length,))
encoder1 = Embedding(vocab_size, 128)(inputs)
encoder2 = LSTM(128)(encoder1)
encoder3 = RepeatVector(sum_txt_length)(encoder2)
# decoder output model
decoder1 = LSTM(128, return_sequences=True)(encoder3)
outputs = TimeDistributed(Dense(vocab_size, activation='softmax'))(decoder1)
# tie it together
model = Model(inputs=inputs, outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam')

## Buffer Implementation

In [None]:
def define_models(n_input= 300, d_input= 300, n_output= len(vocab_to_int), n_units= 150):
	# define training encoder
	encoder_inputs = Input(shape= (None, n_input))
	encoder = LSTM(n_units, return_state= True)
	encoder_outputs, state_h, state_c = encoder(encoder_inputs)
	encoder_states = [state_h, state_c]
	# define training decoder
	decoder_inputs = Input(shape= (None, d_input))
	decoder_lstm = LSTM(n_units, return_sequences=True, return_state=True)
	decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state= encoder_states)
	decoder_dense = Dense(n_output, activation='softmax')
	decoder_outputs = decoder_dense(decoder_outputs)
	model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
	# define inference encoder
	encoder_model = Model(encoder_inputs, encoder_states)
	# define inference decoder
	decoder_state_input_h = Input(shape=(n_units,))
	decoder_state_input_c = Input(shape=(n_units,))
    
	decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
    
	decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state=decoder_states_inputs)
	decoder_states = [state_h, state_c]
	decoder_outputs = decoder_dense(decoder_outputs)
	decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)
	# return all models
	return model, encoder_model, decoder_model

In [None]:
model, encoder_model, decoder_model= define_models()

In [None]:
keras.utils.print_summary(decoder_model)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_6 (InputLayer)            (None, None, 300)    0                                            
__________________________________________________________________________________________________
input_7 (InputLayer)            (None, 150)          0                                            
__________________________________________________________________________________________________
input_8 (InputLayer)            (None, 150)          0                                            
__________________________________________________________________________________________________
lstm_2 (LSTM)                   [(None, None, 150),  270600      input_6[0][0]                    
                                                                 input_7[0][0]                    
          

In [None]:
keras.utils.print_summary(model)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            (None, None, 300)    0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            (None, None, 300)    0                                            
__________________________________________________________________________________________________
lstm_1 (LSTM)                   [(None, 150), (None, 270600      input_5[0][0]                    
__________________________________________________________________________________________________
lstm_2 (LSTM)                   [(None, None, 150),  270600      input_6[0][0]                    
                                                                 lstm_1[0][1]                     
          

In [None]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

In [None]:
import numpy as np

In [None]:
model.fit_generator(generator= gen, epochs= 2)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f854326ab00>

## Prediction