# In [1]:
import numpy as np
import tensorflow as tf
import io
import json
import pandas as pd

# Index label of the dataset row that this script cleans and summarises.
index_index = 1

# Load the news dataset, decoding the CSV file as ISO-8859-1.
raw = pd.read_csv("news_summary.csv", encoding="iso-8859-1")

import re

# Lower-cases one text entry and strips noise characters from it.
def text_strip(row):
    """Yield a single lower-cased, cleaned-up version of ``row``.

    This is a generator that yields exactly one string, so its result can be
    passed to anything that expects an iterable of texts.

    Cleaning steps, applied in this order (the order matters: later patterns
    rely on what earlier ones already removed):

    1. tabs, carriage returns and newlines become spaces;
    2. runs of two or more ``_``, ``-``, ``~``, ``+`` or ``.`` become a space;
    3. each of the characters ``<>()|&©ø[]'",;?~*!`` becomes a space;
    4. ``mailto:`` and a literal backslash followed by ``x9`` and a digit
       become a space;
    5. ``inc<digits>`` becomes ``inc_num``; ``cm<digits>`` and
       ``chg<digits>`` become ``cm_num``;
    6. ``.``, ``-`` and ``:`` directly followed by whitespace become a space;
    7. any single character surrounded by whitespace is dropped;
    8. every URL is replaced by its own host name;
    9. whitespace runs collapse to one space, then step 7 runs once more.

    Args:
        row: Value to clean; it is converted with ``str()`` first.

    Yields:
        The cleaned, lower-case text.
    """
    # Lower-casing once is enough: every substitution below only inserts
    # text that is already lower-case.
    text = str(row).lower()

    # ORDER OF THE SUBSTITUTIONS IS VERY IMPORTANT - do not reorder them.

    # Escape characters.
    text = re.sub(r"(\t)", ' ', text)
    text = re.sub(r"(\r)", ' ', text)
    text = re.sub(r"(\n)", ' ', text)

    # Punctuation that occurs more than once consecutively.
    text = re.sub(r"(__+)", ' ', text)
    text = re.sub(r"(--+)", ' ', text)
    text = re.sub(r"(~~+)", ' ', text)
    text = re.sub(r"(\+\++)", ' ', text)
    text = re.sub(r"(\.\.+)", ' ', text)

    # Remove <>()|&©ø[]'",;?~*!
    text = re.sub(r"[<>()|&©ø\[\]\'\",;?~*!]", ' ', text)

    text = re.sub(r"(mailto:)", ' ', text)
    text = re.sub(r"(\\x9\d)", ' ', text)  # literal "\x9N" left over in the text
    text = re.sub(r"(inc\d+)", 'inc_num', text)  # INC numbers
    text = re.sub(r"(cm\d+)|(chg\d+)", 'cm_num', text)  # CM# and CHG# numbers

    # Full stop, dash and colon at the end of words (not between characters).
    text = re.sub(r"(\.\s+)", ' ', text)
    text = re.sub(r"(\-\s+)", ' ', text)
    text = re.sub(r"(\:\s+)", ' ', text)

    # Any single character hanging between two whitespace runs.
    text = re.sub(r"(\s+.\s+)", ' ', text)

    # Replace URLs such as https://abc.xyz.net/browse/sdf-5327 by abc.xyz.net.
    # A callable replacement keeps each URL's own host (instead of reusing the
    # host of the first URL) and is not subject to backslash-escape handling.
    # The path part is optional so that a URL without a path neither loses
    # the end of its host nor swallows the following word.
    text = re.sub(r"https*:/*([^/\s]+)\S*", lambda match: match.group(1), text)

    # Collapse multiple whitespace characters into one space.
    text = re.sub(r"(\s+)", ' ', text)

    # Should always be last: single characters exposed by the steps above.
    text = re.sub(r"(\s+.\s+)", ' ', text)

    yield text


# Pick the article to summarise and wrap it in the cleaning generator.
# Nothing is cleaned at this point: text_strip is a generator function, so
# the work only happens once brief_cleaning1 is iterated.
a = raw.text[index_index]
brief_cleaning1 = text_strip(a)

from time import time
import spacy
# Load the small English pipeline with the NER and parser components
# disabled for speed.
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])

t = time()

# Run the cleaned text through the pipeline in batches of up to 5000 texts.
# The former ``n_threads=-1`` argument is intentionally not passed any more:
# spaCy deprecated and ignored it from v2.1 on and no longer accepts it in v3,
# so it never parallelised anything and could only warn or raise.
text = [str(doc) for doc in nlp.pipe(brief_cleaning1, batch_size=5000)]

print('Time to clean up everything: {} mins'.format(round((time() - t) / 60, 2)))

from keras.preprocessing.text import Tokenizer 
from keras.preprocessing.sequence import pad_sequences
from keras_preprocessing.text import tokenizer_from_json

# Restore the tokenizer applied to the input text from its JSON dump.
# Whatever json.load returns is handed unchanged to tokenizer_from_json.
with open('xtokenizer.json') as f:
    data = json.load(f)
x_tokenizer = tokenizer_from_json(data)

# Restore the tokenizer of the summary vocabulary the same way.
with open('ytokenizer.json') as f:
    data = json.load(f)
y_tokenizer = tokenizer_from_json(data)

# Lookup tables used while decoding: word -> id for the summary vocabulary,
# and id -> word for both vocabularies.
target_word_index = y_tokenizer.word_index
reverse_source_word_index = x_tokenizer.index_word
reverse_target_word_index = y_tokenizer.index_word

from tensorflow.keras.models import load_model

# Load the saved Keras model from disk and print its layer overview.
model = load_model('model.h5')
model.summary()

def seq2summary(input_seq):
    """Turn a sequence of target-vocabulary ids into a summary string.

    Ids equal to 0 and the ids of the ``sostok`` / ``eostok`` marker words
    are skipped. Every remaining word is followed by one space, so a
    non-empty result ends with a trailing space.
    """
    pieces = []
    for token_id in input_seq:
        if token_id == 0:
            continue
        # Marker ids are looked up only for non-zero ids, one at a time.
        if token_id == target_word_index['sostok']:
            continue
        if token_id == target_word_index['eostok']:
            continue
        pieces.append(reverse_target_word_index[token_id] + ' ')
    return ''.join(pieces)

# Sequence lengths used by this script: the input text is padded or truncated
# to max_text_len token ids, and decoding stops once the generated summary
# reaches max_summary_len - 1 words.
max_text_len = 1000
max_summary_len = 65

# Convert the cleaned text to token ids and post-pad it to the input length.
# The length comes from max_text_len so that padding and the later reshape
# to (1, max_text_len) cannot drift apart.
x = x_tokenizer.texts_to_sequences(text)
x = pad_sequences(x, maxlen=max_text_len, padding='post')

def decode_sequence(input_seq):
    """Greedily decode a summary for one encoded input text.

    The layers of the loaded ``model`` are looked up by name and called
    directly (the deprecated ``Layer.apply`` alias is no longer used). The
    input goes through the ``embedding`` layer and the ``lstm``, ``lstm_1``
    and ``lstm_2`` layers; the two states returned by ``lstm_2`` seed the
    decoder (``embedding_1`` -> ``lstm_3`` -> ``time_distributed``). Starting
    from the ``sostok`` id, the decoder runs one token at a time, always
    taking the highest-scoring id, until ``eostok`` is produced, an id without
    a word is produced, or the summary holds ``max_summary_len - 1`` words.

    Args:
        input_seq: Padded token-id batch for the ``embedding`` layer; this
            script passes an array reshaped to ``(1, max_text_len)``.

    Returns:
        The decoded words, each preceded by one space (a non-empty result
        therefore starts with a space). ``eostok`` itself is not included.
    """
    # Encoder pass. Only the states of the last encoder layer are needed.
    embedded_input = model.get_layer('embedding')(input_seq)
    encoder_output1, _, _ = model.get_layer('lstm')(embedded_input)
    encoder_output2, _, _ = model.get_layer('lstm_1')(encoder_output1)
    _, state_h, state_c = model.get_layer('lstm_2')(encoder_output2)

    dec_emb = model.get_layer('embedding_1')
    decoder_lstm = model.get_layer('lstm_3')
    decoder_dense = model.get_layer('time_distributed')

    # Target sequence of length 1, populated with the start word.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = target_word_index['sostok']

    decoded_sentence = ''
    while True:
        decoder_embedded = dec_emb(target_seq)
        decoder_outputs, next_state_h, next_state_c = decoder_lstm(
            decoder_embedded, initial_state=[state_h, state_c])
        output_tokens = decoder_dense(decoder_outputs)

        # Greedy choice: the id with the highest score at the last time step.
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))

        # An id without a word (for example 0, which seq2summary also treats
        # as "no word") ends decoding instead of raising KeyError.
        sampled_token = reverse_target_word_index.get(sampled_token_index)
        if sampled_token is None or sampled_token == 'eostok':
            break

        decoded_sentence += ' ' + sampled_token

        # Exit condition: the summary reached its maximum length.
        if len(decoded_sentence.split()) >= (max_summary_len - 1):
            break

        # Feed the chosen token back in and carry the decoder states forward.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        state_h, state_c = next_state_h, next_state_c

    return decoded_sentence

# Show the input text, its headline from the dataset and the model's output.
# The first label used to read "Summary:" as well, although it prints the
# full text rather than a summary.
print("Text:",raw.text[index_index])

print("Summary:",raw.headlines[index_index])

print("Predicted summary:",decode_sequence(x.reshape(1,max_text_len)))

