In [None]:
import re
import string
import pandas as pd
from sklearn.metrics import roc_auc_score,auc
from sklearn.metrics import classification_report, confusion_matrix
from keras.utils import plot_model
from keras.layers import Layer
from sklearn.metrics import roc_auc_score , f1_score
import keras.backend as K
import pickle
import tensorflow as tf
import numpy as np
from keras.preprocessing.sequence import pad_sequences

### Attention Layer

In [None]:
class attention(Layer):
    """Attention-pooling layer that collapses axis 1 of its input.

    Every position along axis 1 is scored with ``tanh(x . W + b)``; the scores
    are normalised with a softmax and used as weights for a sum over axis 1.
    The shape handling indexes ``input_shape[1]`` and ``input_shape[-1]``, so
    the input is presumably rank-3, e.g. (batch, steps, features) -- TODO confirm.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        """Create the projection weight ``W`` and the per-position bias ``b``."""
        feature_dim = input_shape[-1]
        num_positions = input_shape[1]
        # The bias has one entry per position along axis 1, so that axis must
        # have a known, fixed length when the layer is built.
        self.W = self.add_weight(
            name="att_weight",
            shape=(feature_dim, 1),
            initializer="normal",
        )
        self.b = self.add_weight(
            name="att_bias",
            shape=(num_positions, 1),
            initializer="zeros",
        )
        super().build(input_shape)

    def call(self, x):
        """Return the attention-weighted sum of ``x`` over axis 1."""
        # One scalar score per position; drop the trailing size-1 axis.
        scores = K.tanh(K.dot(x, self.W) + self.b)
        scores = K.squeeze(scores, axis=-1)
        # Softmax over the last remaining axis, then restore the trailing axis
        # so the weights broadcast against ``x``.
        weights = K.expand_dims(K.softmax(scores), axis=-1)
        return K.sum(x * weights, axis=1)

    def compute_output_shape(self, input_shape):
        """Output keeps the first and last input dimensions."""
        first_dim, last_dim = input_shape[0], input_shape[-1]
        return (first_dim, last_dim)

    def get_config(self):
        """Return the base layer config; this layer adds no arguments of its own."""
        return super().get_config()

### Function 1: predict Edit / no-Edit for a raw input sentence

In [None]:
# Lookup table of the characters treated as "ordinary" text: every ASCII
# letter (lower- and upper-case) and every decimal digit, each mapped to 1.
numbers = "0123456789"
vocab = dict.fromkeys(string.ascii_letters + numbers, 1)

In [None]:
# Load the tokenizer object from disk.
# NOTE: pickle.load can execute arbitrary code from the file, so only load a
# 'tokenizer.pkl' that comes from a trusted source.
# The `with` block closes the file handle as soon as the object is read.
with open('tokenizer.pkl', 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)


# Load the saved model. `custom_objects` maps names stored in the .h5 file to
# Python objects: the custom `attention` layer class and `auc`.
# NOTE(review): `auc` here is whatever `auc` is bound to in this notebook
# (sklearn.metrics.auc from the import cell) -- confirm that this is the
# metric the model was compiled with.
bi_lstm_with_attention = tf.keras.models.load_model(
    'bi_lstm_with_attention.h5',
    custom_objects={'attention': attention, 'auc': auc},
)

In [None]:
# Ordered (pattern, replacement) pairs applied one after another by pipeline().
# The order is significant: each substitution sees the output of the previous one.
_PIPELINE_SUBSTITUTIONS = (
    (r'<.*?>', ' '),               # shortest "<...>" spans (markup-style tags)
    (r'http\S+', ' '),             # "http" followed by any non-space characters
    (r'_MATH_', ' math '),         # special placeholder tokens, upper/lower case
    (r'_math_', ' math '),
    (r'_MATHDISP_', ' math '),
    (r'_mathdisp_', ' math '),
    (r'_REF_', ' ref '),
    (r'_ref_', ' ref '),
    (r'_CITE_', ' cite '),
    (r'_cite_', ' cite '),
)


def pipeline(data):
    """Clean one raw sentence, tokenize it and return the model's predicted class.

    Steps:
      1. Apply the regex substitutions in ``_PIPELINE_SUBSTITUTIONS`` in order.
      2. Surround every character that is neither a space nor a key of the
         module-level ``vocab`` with single spaces, then strip and lower-case.
      3. Convert the text to an index sequence with the module-level
         ``tokenizer`` and post-pad it to length 128.
      4. Run ``bi_lstm_with_attention.predict`` and take the argmax over axis 1.

    Args:
        data: the raw input text (a ``str``).

    Returns:
        The ``np.argmax(..., axis=1)`` result for the single input row. The
        demo cells in this notebook treat a value of 1 as "Edit" and anything
        else as "no-Edit".
    """
    # 1 PREPROCESS DATA
    for pattern, replacement in _PIPELINE_SUBSTITUTIONS:
        data = re.sub(pattern, replacement, data)

    # Put a space before and after every special character. Building the
    # result with join avoids the quadratic cost of repeated string "+=".
    data = ''.join(
        char if char == ' ' or char in vocab else ' ' + char + ' '
        for char in data
    )
    data = data.strip().lower()

    # 2 TOKENIZE DATA
    # texts_to_sequences only iterates over its argument, so a one-element
    # list is enough; no intermediate DataFrame is needed.
    X = tokenizer.texts_to_sequences([data])
    X = pad_sequences(X, maxlen=128, padding='post')

    # 3 PREDICT
    pred_label = bi_lstm_with_attention.predict(X)
    pred_label = np.argmax(pred_label, axis=1)

    return pred_label

In [None]:
# Example 1: run one sentence through the pipeline and report its class.
text = "In this section, we will provide two examples."

pred_label = pipeline(text)

# A predicted label of 1 is reported as "Edit"; anything else as "no-Edit".
class_message = ' Class: Edit' if pred_label == 1 else ' Class: no-Edit'
print(text, class_message)

In this section, we will provide two examples.  Class: no-Edit


In [None]:
# Example 2: same check on a variant of the sentence (no comma, "example").
text = "In this section we will provide two example."

pred_label = pipeline(text)

# A predicted label of 1 is reported as "Edit"; anything else as "no-Edit".
class_message = ' Class: Edit' if pred_label == 1 else ' Class: no-Edit'
print(text, class_message)

In this section we will provide two example.  Class: Edit
