In [1]:
import pickle
data = pickle.load(open("./data_pik", 'rb'))

In [2]:
import pandas as pd
df = pd.DataFrame(data[:500])
df_train = pd.DataFrame(data[:100])
df_test = pd.DataFrame(data[100:120])

In [3]:
import random
from rouge_score import rouge_scorer

def calc_rouge_scores(pred_summaries, gold_summaries, 
                                 keys=['rouge1', 'rougeL'], use_stemmer=True):
    #Calculate rouge scores
    scorer = rouge_scorer.RougeScorer(keys, use_stemmer= use_stemmer)
    
    n = len(pred_summaries)
    
    scores = [scorer.score(pred_summaries[j], gold_summaries[j]) for 
              j in range(n)] 
    
    dict_scores={}                                                            
    for key in keys:
        dict_scores.update({key: {}})
        
    
    for key in keys:
        
        precision_list = [scores[j][key][0] for j in range(len(scores))]
        recall_list = [scores[j][key][1] for j in range(len(scores))]
        f1_list = [scores[j][key][2] for j in range(len(scores))]

        precision = np.mean(precision_list)
        recall = np.mean(recall_list)
        f1 = np.mean(f1_list)
        
        dict_results = {'recall': recall, 'precision': precision, 'f1': f1}
        
        dict_scores[key] = dict_results
        
    return dict_scores

In [4]:
"""
preprocessing_embed.py
"""
import pickle
import pandas as pd
import numpy as np
import spacy
from sentence_transformers import SentenceTransformer
from datetime import datetime as dt


### Helper function

def text_to_sent_list(text, 
                      nlp = spacy.load("en_core_web_sm"), 
                      embedder = SentenceTransformer('distilbert-base-nli-mean-tokens'),
                      min_len=2):
    
    ''' Returns cleaned article sentences and BERT sentence embeddings'''
    
    #convert to list of sentences
    text = nlp(text)
    sents = list(text.sents)
    #remove short sentences by threshhold                                                                                                
    sents_clean = [sentence.text for sentence in sents if len(sentence)> min_len]
    #remove entries with empty list
    sents_clean = [sentence for sentence in sents_clean if len(sentence)!=0]
    #embed sentences (deafult uses BERT SentenceTransformer)
    sents_embedding= np.array(embedder.encode(sents_clean, convert_to_tensor=True))
    
    return sents_clean, sents_embedding



#load nlp and embedder
nlp = spacy.load("en_core_web_sm")
embedder = SentenceTransformer('distilbert-base-nli-mean-tokens')

t1 = dt.now()
print(t1)

#extract clean sentence list and sentence embedding for each article TEXT
f = lambda text: text_to_sent_list(text, nlp=nlp, embedder=embedder, min_len=2)
s_interim_tuple = df['description'].apply(f)


df['text_clean'] = s_interim_tuple.apply(lambda x: x[0])
df['text_embedding'] = s_interim_tuple.apply(lambda x: x[1])

#extract clean sentence list and sentence embedding for each article SUMMARY
f = lambda summ: text_to_sent_list(summ, nlp=nlp, embedder=embedder, min_len=0)
s_interim_tuple = df['abstract'].apply(f)

df['summary_clean'] = s_interim_tuple.apply(lambda x: x[0])
df['summary_embedding'] = s_interim_tuple.apply(lambda x: x[1])

# with open(output_file, 'wb') as handle:                                     
#     pickle.dump(df, handle)

t2=dt.now()
print(t2)
print(t2-t1)


2021-03-23 20:17:46.228254
2021-03-23 20:51:17.153450
0:33:30.925196


In [5]:
"""
preprocessing label_target.py
"""
import pickle
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

from datetime import datetime as dt

### Helper Functions

def find_sim_single_summary(summary_sentence_embed, doc_emedding):
    '''returns array of indices for max cosine sim per summary sentences'''
    cos_sim_mat = cosine_similarity(doc_emedding, summary_sentence_embed)
    idx_arr = np.argmax(cos_sim_mat, axis=0)
    
    return idx_arr

def label_sent_in_summary(s_text, s_summary):
    '''returns index list and binary target labels in an array'''
    doc_num = s_text.shape[0]
    
    #initialize zeros
    labels = [np.zeros(doc.shape[0]) for doc in s_text.tolist()] 
    
    #calc idx for most similar
    
    idx_list = [np.sort(find_sim_single_summary(s_summary[j], s_text[j])) for j 
                                                            in range(doc_num)]
      
    for j in range(doc_num):
        labels[j][idx_list[j]]= 1 
    
    return idx_list, labels


### Script

t1 = dt.now()
print(t1)

#get index list and target labels
idx_list, labels = label_sent_in_summary(df.text_embedding, df.summary_embedding)

#wrap in dataframe
df['labels'] = labels
df['labels_idx_list'] = idx_list

t2 = dt.now()

print(t2)

print(t2-t1)

2021-03-23 20:56:58.841657
2021-03-23 20:56:59.249785
0:00:00.408128


In [6]:
'''lstm1.py'''
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import TimeDistributed
from sklearn.metrics import classification_report, confusion_matrix
import tensorflow as tf
from keras.layers import Bidirectional
# from sklearn.preprocessing import train_test_split


#step 1: process data for ltsm input

#convert to numpy array
to_array = lambda x: np.array(x)
df.text_embedding = df.text_embedding.apply(to_array)
df.labels= df.labels.apply(to_array)
df.text_embedding = df.text_embedding.apply(lambda x: x.reshape(1, x.shape[0],x.shape[1]))
df.labels = df.labels.apply(lambda x: x.reshape(1, len(x),1))

# #train_test split
# train_test_split()
# train_doc_labels = set(data_dict['train_test_sets'][0][0].flatten())
# mask_train = np.array([x in train_doc_labels for x in df.index]) 

X_train = df.text_embedding[:400].tolist()
y_train = df.labels[:400].tolist()

X_test = df.text_embedding[400:].tolist()
y_test = df.labels[400:].tolist()


# define LSTM
model = Sequential()

model.add(LSTM(25, input_shape=(None, 768), return_sequences=True, dropout=0))
#model.add(LSTM(25, input_shape=(None, 768), return_sequences=True, dropout=0))

#model.add(Bidirectional(LSTM(50, return_sequences=True, dropout=0), input_shape=(None, 768)))
#model.add(Bidirectional(LSTM(50, return_sequences=True, dropout=0), input_shape=(None, 768)))

model.add(TimeDistributed(Dense(1, activation='sigmoid')))
model.compile(loss='binary_crossentropy', optimizer='adam', 
              metrics=[tf.keras.metrics.SensitivityAtSpecificity(0.5, num_thresholds=1)])


# train LSTM
training_loss =[]
training_metric = []
for j in range(len(X_train)):   
    X, y = X_train[j], y_train[j]
    history = model.fit(X, y, epochs=1, batch_size=1)
    training_loss.append(history.history['loss'])
    
# evaluate LSTM
y_pred_list =[]
idx_list=[]
for j in range(len(X_test)):
    X= X_test[j]
    y_pred = model.predict(X, verbose=0)
    idx = np.argsort(y_pred[0].flatten())[-3:]
    idx = sorted(idx)
    y_pred_list.append(y_pred)
    idx_list.append(idx)

    
#retrieve summary pairs
doc_index = df.index[400:]
pred_summaries = [' '.join(np.array(df.text_clean[doc_index].iloc[j])[np.array(idx_list[j])].tolist()) 
                  for j in range(len(idx_list))]
df_gold = df.summary_clean[doc_index]
gold_summaries = [' '.join(df_gold .iloc[j]) for j in range(len(pred_summaries))]
summaries_comp = tuple(zip(pred_summaries, gold_summaries))


#calculate rouge score
scores = calc_rouge_scores(pred_summaries, gold_summaries, 
                                  keys=['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

results_dict ={'summaries_comp': summaries_comp,
               'sent_index_number': idx, 'Rouge': scores, 'mod_summary': model.summary()}













Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, None, 25)          79400     
_________________________________________________________________
time_distributed (TimeDistri (None, None, 1)           26        
Total params: 79,426
Trainable params: 79,426
Non-trainable params: 0
_________________________________________________________________


In [7]:
results_dict

{'summaries_comp': (('beside straight stitching , a commercial - type double overlock which creates a casing over the raw edge of the fabric may be used . a patterned piece of mesh , which pattern is readily devised by one of ordinary skill , is appropriately folded and sewn along one edge . the garment 30 is deliberately made loose fitting to enable the user to comfortably wear the garment over regular clothes , and to produce the blousing effect necessary for maximum insect protection properties .',
   'an insect proof garment for protection against mosquitoes , black flies and other insects is disclosed . the garment is made entirely from a lightweight semi - rigid insect excluding mesh and is comprised of a one - piece fully encloseable head net attached to an upper body portion , with the upper body portion extending from the neck to proximate the hips of the wearer and has sleeves connected thereto . there is a separate lower portion extending from the wearer &# 39 ; s waist and 

In [8]:
df['abstract'][8]

'accurately recognizing from eye - gaze patterns when a user is reading , skimming , or scanning on a display filled with heterogeneous content , and then supplying information tailored to meet individual needs . heterogeneous content includes objects normally encountered on computer monitors , such as text , images , hyperlinks , windows , icons , and menus . three distinct mechanisms are used : coarse or quantized representation of eye - movements , accumulation of pooled numerical evidence based detection , and mode switching . analysis of text the user is reading or skimming may infer user interest and adapt to the user &# 39 ; s needs .'

In [8]:
from tensorflow.keras.models import model_from_yaml
import pickle

model_yaml = model.to_yaml()
with open("model.yaml", "w") as yaml_file:
    yaml_file.write(model_yaml)

model.save_weights("model.h5")

print("Saved model to disk")


Saved model to disk


In [9]:
pickle.dump(results_dict, open("lstm-500-summary.pkl", "wb"))

In [9]:
results_dict

{'summaries_comp': (('after stained with hematoxylin and eosin respectively , the sections were photographed under a microscope and the area and thickness of the tunica intima were calculated . a polylactic acid sustained - release layer containing heparin was coated on the surface of the scaffold to prevent formation of acute thrombus and to control sustained - release of the medicine efficiently . firstly the polymer solution was sprayed on the surface of the scaffold , and then the as 2 o 3 aqueous solution was sprayed thereon . upon drying , as 2 o 3 formed conglomerated particles on the surface of the scaffold , and a solvent was sprayed on the surface of the scaffold to level the polymer coating .',
   'an arsenic trioxide medical elution scaffold and preparation method thereof , wherein the arsenic trioxide medical elution scaffold comprises scaffold , polymer coating covering the scaffold and arsenic trioxide loaded in the polymer coating which is in the form of single particle

In [None]:
('while walking on the treadmill , the operable interface with the user occurs at handgrips which are normally biased up or alternatively fixedly secured , such that the user may reach up to head level or above , and subsequently grasp and pull either of the hand grips and exert downward force at the handgrip in a manner which in some respects may provide for the exercise activity known as lat pull - downs . typically , when the user grasps and manipulates handle bar 470 during treadmill activity , the dual axes of a 400 and b 400 perform in combination to simulate geometry which exhibits characteristics in which the user would sense the presence of qualities of caster . in this instance , the hand grips or handle bar of this treadmill apparatus may exhibit programmable and / or interactive force and motion characteristics with the user .',
   'a treadmill apparatus comprising a rigid member movably supported overhead of a user wherein the rigid member has guiding arrangement for guiding a handle of the rigid member through a path having a vertical component , and wherein said handle has a biasing arrangement for biasing the handle toward an upper end of the path .')

In [10]:
results_dict['Rouge']

{'rouge1': {'recall': 0.3611975854094878,
  'precision': 0.35076253585759276,
  'f1': 0.3364340529724948},
 'rouge2': {'recall': 0.09700098645419626,
  'precision': 0.09235974166337434,
  'f1': 0.08958483806754741},
 'rougeL': {'recall': 0.21208936338085496,
  'precision': 0.20483224959605237,
  'f1': 0.196869888835738}}