In [1]:
import numpy as np
import tensorflow as tf
import io
import json
import pandas as pd
from time import time
import spacy
from keras.preprocessing.text import Tokenizer 
from keras.preprocessing.sequence import pad_sequences
from keras_preprocessing.text import tokenizer_from_json
from tensorflow.keras.models import load_model

Using TensorFlow backend.


In [2]:
import re

# Clean-up substitutions applied by text_strip, in this exact order.
# ORDER IS IMPORTANT: later patterns rely on what earlier ones removed.
_TEXT_STRIP_STEPS = (
    (r"(\t)", ' '),      # whitespace control characters
    (r"(\r)", ' '),
    (r"(\n)", ' '),

    (r"(__+)", ' '),     # runs of two or more _ - ~ + .
    (r"(--+)", ' '),
    (r"(~~+)", ' '),
    (r"(\+\++)", ' '),
    (r"(\.\.+)", ' '),

    (r"[<>()|&©ø\[\]\'\",;?~*!]", ' '),  # remove <>()|&©ø[]'",;?~*!

    (r"(mailto:)", ' '),
    (r"(\\x9\d)", ' '),  # literal "\x9<digit>" sequences left in the text
    (r"([iI][nN][cC]\d+)", 'INC_NUM'),                  # INC numbers -> inc_num
    (r"([cC][mM]\d+)|([cC][hH][gG]\d+)", 'CM_NUM'),     # CM# / CHG# -> cm_num

    (r"(\.\s+)", ' '),   # '.', '-' and ':' at the end of a word (not inside one)
    (r"(\-\s+)", ' '),
    (r"(\:\s+)", ' '),

    (r"(\s+.\s+)", ' '),  # any single character hanging between whitespace
)

# https://abc.xyz.net/browse/sdf-5327 ====> abc.xyz.net (group 3 is the host).
_URL_PATTERN = re.compile(r'((https*:\/*)([^\/\s]+))(.[^\s]+)')


def text_strip(row):
    """Normalise one piece of raw text and yield the cleaned string.

    The input is converted with ``str()``, lower-cased, and run through the
    ordered substitutions in ``_TEXT_STRIP_STEPS``. Each URL is then reduced to
    its own host name, whitespace runs are collapsed to a single space, and
    single characters left hanging between spaces are dropped.

    This is a generator that yields exactly one string, so its result can be
    passed directly to anything that expects an iterable of texts.

    Args:
        row: The raw text (any object; it is passed through ``str()``).

    Yields:
        The cleaned, lower-cased text.
    """
    text = str(row).lower()

    # Lower-casing after every step keeps the upper-case placeholders
    # (INC_NUM / CM_NUM) lower-case in the output.
    for pattern, replacement in _TEXT_STRIP_STEPS:
        text = re.sub(pattern, replacement, text).lower()

    # Replace each URL with its own host. A callable replacement is used so
    # that every match gets its own group(3) (not the first URL's host) and
    # so the host is never interpreted as a replacement template. Text without
    # a URL simply has no matches, so no exception handling is needed.
    text = _URL_PATTERN.sub(lambda match: match.group(3), text)

    text = re.sub(r"(\s+)", ' ', text).lower()  # collapse multiple spaces

    # Should always be last: the steps above can expose new lone characters.
    text = re.sub(r"(\s+.\s+)", ' ', text).lower()

    yield text

In [3]:
# Source-side tokenizer paired with model15; used later to encode short inputs.
# NOTE(review): json.load's result is handed straight to tokenizer_from_json,
# so the file presumably stores the tokenizer config as a JSON-encoded string
# -- confirm against the code that exported it.
with open('xtokenizer15.json') as tokenizer_file:
    data = json.load(tokenizer_file)
    x_tokenizer15 = tokenizer_from_json(data)

# Target-side tokenizer paired with model15; provides the summary vocabulary.
with open('ytokenizer15.json') as tokenizer_file:
    data = json.load(tokenizer_file)
    y_tokenizer15 = tokenizer_from_json(data)

In [4]:
# Source-side tokenizer paired with model65; used later to encode long inputs.
# NOTE(review): json.load's result is handed straight to tokenizer_from_json,
# so the file presumably stores the tokenizer config as a JSON-encoded string
# -- confirm against the code that exported it.
with open('xtokenizer65.json') as tokenizer_file:
    data = json.load(tokenizer_file)
    x_tokenizer65 = tokenizer_from_json(data)

# Target-side tokenizer paired with model65; provides the summary vocabulary.
with open('ytokenizer65.json') as tokenizer_file:
    data = json.load(tokenizer_file)
    y_tokenizer65 = tokenizer_from_json(data)

In [5]:
# Vocabulary lookups for the model15 pipeline.
# word -> id for the summary vocabulary (used to find 'sostok' / 'eostok').
target_word_index15 = y_tokenizer15.word_index
# id -> word for the summary vocabulary (used when decoding predictions).
reverse_target_word_index15 = y_tokenizer15.index_word
# id -> word for the source vocabulary.
reverse_source_word_index15 = x_tokenizer15.index_word

In [6]:
# Vocabulary lookups for the model65 pipeline.
# word -> id for the summary vocabulary (used to find 'sostok' / 'eostok').
target_word_index65 = y_tokenizer65.word_index
# id -> word for the summary vocabulary (used when decoding predictions).
reverse_target_word_index65 = y_tokenizer65.index_word
# id -> word for the source vocabulary.
reverse_source_word_index65 = x_tokenizer65.index_word

In [7]:
# Saved Keras model whose layers decode_sequence1 reuses for short inputs.
model15 = load_model('model15.h5')

In [8]:
# Saved Keras model whose layers decode_sequence2 reuses for long inputs.
model65 = load_model('model65.h5')

In [9]:
def seq2summary1(input_seq):
    """Turn a sequence of summary token ids into text (model15 vocabulary).

    Ids equal to 0, to the 'sostok' id or to the 'eostok' id are skipped.
    Every remaining id is looked up in ``reverse_target_word_index15``.

    Args:
        input_seq: Iterable of token ids.

    Returns:
        The words in order, each followed by a single space ('' if none).
    """
    pieces = []
    for token_id in input_seq:
        if token_id == 0:
            continue
        # Marker ids are looked up only for non-zero tokens.
        if token_id == target_word_index15['sostok']:
            continue
        if token_id == target_word_index15['eostok']:
            continue
        pieces.append(reverse_target_word_index15[token_id] + ' ')
    return ''.join(pieces)

def seq2summary2(input_seq):
    """Turn a sequence of summary token ids into text (model65 vocabulary).

    Ids equal to 0, to the 'sostok' id or to the 'eostok' id are skipped.
    Every remaining id is looked up in ``reverse_target_word_index65``.

    Args:
        input_seq: Iterable of token ids.

    Returns:
        The words in order, each followed by a single space ('' if none).
    """
    pieces = []
    for token_id in input_seq:
        if token_id == 0:
            continue
        # Marker ids are looked up only for non-zero tokens.
        if token_id == target_word_index65['sostok']:
            continue
        if token_id == target_word_index65['eostok']:
            continue
        pieces.append(reverse_target_word_index65[token_id] + ' ')
    return ''.join(pieces)

In [10]:
# Sequence-length limits for the two pipelines.
# model15: inputs are padded to max_text_len1 tokens before decoding and the
# generated summary is capped using max_summary_len1.
max_text_len1 = 100
max_summary_len1 = 15

# model65: inputs are padded to max_text_len2 tokens before decoding and the
# generated summary is capped using max_summary_len2.
max_text_len2 = 1000
max_summary_len2 = 65

In [11]:
def decode_sequence1(input_seq):
    """Greedily decode a summary for one encoded input using ``model15``.

    The input is pushed through the model's 'embedding' layer and the three
    stacked LSTMs 'lstm', 'lstm_1' and 'lstm_2'; the states returned by
    'lstm_2' seed the decoder. The decoder ('embedding_1' -> 'lstm_3' ->
    'time_distributed') is then stepped one token at a time, starting from
    'sostok' and always taking the arg-max token.

    Decoding stops when 'eostok' is produced, when the sentence holds
    ``max_summary_len1 - 1`` words, or when the predicted id has no entry in
    ``reverse_target_word_index15`` (e.g. the padding id 0, which previously
    raised ``KeyError``).

    Layers are invoked by calling them directly; ``Layer.apply`` is deprecated
    in favour of ``Layer.__call__``.

    Args:
        input_seq: Encoded source text; the caller passes an array reshaped to
            ``(1, max_text_len1)``.

    Returns:
        The decoded words, each preceded by a single space ('' if none).
    """
    # ---- Encoder: embedding followed by three stacked LSTMs ----
    encoder_embedding = model15.get_layer('embedding')
    encoder_hidden = encoder_embedding(input_seq)
    for layer_name in ('lstm', 'lstm_1'):
        # Only the output sequence of the lower LSTMs is needed.
        encoder_hidden, _, _ = model15.get_layer(layer_name)(encoder_hidden)
    _, state_h, state_c = model15.get_layer('lstm_2')(encoder_hidden)

    # ---- Decoder layers ----
    decoder_embedding = model15.get_layer('embedding_1')
    decoder_lstm = model15.get_layer('lstm_3')
    decoder_dense = model15.get_layer('time_distributed')

    # Target sequence of length 1, seeded with the start token.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = target_word_index15['sostok']

    decoded_sentence = ''
    while True:
        embedded_target = decoder_embedding(target_seq)
        # The returned states replace the previous ones for the next step.
        decoder_outputs, state_h, state_c = decoder_lstm(
            embedded_target, initial_state=[state_h, state_c])
        output_tokens = decoder_dense(decoder_outputs)

        # Greedy choice: most probable token at the last time step.
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_token = reverse_target_word_index15.get(sampled_token_index)

        # Stop on the end token, or on an id with no vocabulary entry.
        if sampled_token is None or sampled_token == 'eostok':
            break

        decoded_sentence += ' ' + sampled_token

        # Stop once the length cap is reached.
        if len(decoded_sentence.split()) >= (max_summary_len1 - 1):
            break

        # Feed the sampled token back in as the next decoder input.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

    return decoded_sentence

def decode_sequence2(input_seq):
    """Greedily decode a summary for one encoded input using ``model65``.

    The input is pushed through the model's 'embedding' layer and the three
    stacked LSTMs 'lstm', 'lstm_1' and 'lstm_2'; the states returned by
    'lstm_2' seed the decoder. The decoder ('embedding_1' -> 'lstm_3' ->
    'time_distributed') is then stepped one token at a time, starting from
    'sostok' and always taking the arg-max token.

    Decoding stops when 'eostok' is produced, when the sentence holds
    ``max_summary_len2 - 1`` words, or when the predicted id has no entry in
    ``reverse_target_word_index65`` (e.g. the padding id 0, which previously
    raised ``KeyError``).

    Layers are invoked by calling them directly; ``Layer.apply`` is deprecated
    in favour of ``Layer.__call__``.

    Args:
        input_seq: Encoded source text; the caller passes an array reshaped to
            ``(1, max_text_len2)``.

    Returns:
        The decoded words, each preceded by a single space ('' if none).
    """
    # ---- Encoder: embedding followed by three stacked LSTMs ----
    encoder_embedding = model65.get_layer('embedding')
    encoder_hidden = encoder_embedding(input_seq)
    for layer_name in ('lstm', 'lstm_1'):
        # Only the output sequence of the lower LSTMs is needed.
        encoder_hidden, _, _ = model65.get_layer(layer_name)(encoder_hidden)
    _, state_h, state_c = model65.get_layer('lstm_2')(encoder_hidden)

    # ---- Decoder layers ----
    decoder_embedding = model65.get_layer('embedding_1')
    decoder_lstm = model65.get_layer('lstm_3')
    decoder_dense = model65.get_layer('time_distributed')

    # Target sequence of length 1, seeded with the start token.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = target_word_index65['sostok']

    decoded_sentence = ''
    while True:
        embedded_target = decoder_embedding(target_seq)
        # The returned states replace the previous ones for the next step.
        decoder_outputs, state_h, state_c = decoder_lstm(
            embedded_target, initial_state=[state_h, state_c])
        output_tokens = decoder_dense(decoder_outputs)

        # Greedy choice: most probable token at the last time step.
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_token = reverse_target_word_index65.get(sampled_token_index)

        # Stop on the end token, or on an id with no vocabulary entry.
        if sampled_token is None or sampled_token == 'eostok':
            break

        decoded_sentence += ' ' + sampled_token

        # Stop once the length cap is reached.
        if len(decoded_sentence.split()) >= (max_summary_len2 - 1):
            break

        # Feed the sampled token back in as the next decoder input.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

    return decoded_sentence

In [12]:
# Interactive summarisation loop: read a text from stdin, clean it, pick the
# model that fits its length, and print the predicted summary. Type 'exit'
# to stop.

# Load the spaCy pipeline once instead of on every iteration; NER and the
# parser are disabled for speed.
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])

while True:
    user_text = input()
    if user_text.strip() == 'exit':
        break

    # text_strip is a generator yielding the single cleaned string.
    brief_cleaning1 = text_strip(user_text)

    t = time()

    # n_threads is no longer passed: it is deprecated in spaCy 2.1+ and not
    # accepted by spaCy 3's Language.pipe.
    text = [str(doc) for doc in nlp.pipe(brief_cleaning1, batch_size=5000)]

    print('Time to clean up everything: {} mins'.format(round((time() - t) / 60, 2)))

    # Count words in the cleaned text itself (not in the list's repr), once.
    word_count = sum(len(doc_text.split()) for doc_text in text)
    print(word_count)

    # Route by length using the same limits the inputs are padded to, so a
    # text of exactly max_text_len2 words is accepted rather than rejected.
    if word_count <= max_text_len1:
        x = x_tokenizer15.texts_to_sequences(text)
        x = pad_sequences(x, maxlen=max_text_len1, padding='post')
        print("Predicted summary:", decode_sequence1(x[0].reshape(1, max_text_len1)))
    elif word_count <= max_text_len2:
        x = x_tokenizer65.texts_to_sequences(text)
        x = pad_sequences(x, maxlen=max_text_len2, padding='post')
        print("Predicted summary:", decode_sequence2(x[0].reshape(1, max_text_len2)))
    else:
        print("Very Big Text")


 During a concert, 18-year-old singer Billie Eilish played a video of herself stripping off to criticise body-shaming. Billie, known for covering up her body in oversized clothes, said in a voiceover, "If I wear what's comfortable, I'm not a woman. If I shed the layers, I'm a sl*t." "Though you've never seen my body, you still judge it," she added.


Time to clean up everything: 0.0 mins
52
Instructions for updating:
Please use `layer.__call__` method instead.
Predicted summary:  start singer threatens to shoot her with her body end


 exit
