In [1]:
import spacy
import random
import re
import numpy as np
from spacy.lang.en import English
from spacy.symbols import ORTH
from spacy.symbols import NORM
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
import sys 
import fasttext
sys.path.append('./luima_sbd')
import luima_sbd.sbd_utils as luima
from spacy.language import Language
random.seed(42)
from joblib import dump, load
from pathlib import Path

In [2]:
# Load the fastText model used for word vectors and the small English spaCy pipeline.
# Both paths/names are resolved relative to the kernel's working directory / environment.
wv_model = fasttext.load_model("wordEmbeddingsModel.bin")
nlp = spacy.load("en_core_web_sm")

# Register 'Vet. App.' and 'Fed. Cir.' as tokenizer special cases, each mapped to a
# single ORTH piece (intended to keep the abbreviation from being split at its periods).
nlp.tokenizer.add_special_case('Vet. App.', [{ORTH: 'Vet. App.'}])
nlp.tokenizer.add_special_case('Fed. Cir.', [{ORTH: 'Fed. Cir.'}])

def spacy_tokenize(txt, skip_merged_particles=False):
    """Convert raw text into a list of normalised token strings.

    The text is processed with the module-level spaCy pipeline ``nlp`` (parser
    disabled). Each token is then mapped by the first rule that matches:

    1. The *following* token is tagged ``PART`` and contains an apostrophe:
       the two token texts are concatenated, stripped of non-word characters
       and lower-cased (e.g. a word plus its contraction/possessive suffix).
    2. ``PUNCT`` tokens are dropped.
    3. ``'Vet. App.'`` / ``'Fed. Cir.'`` are emitted as their lower-cased lemma.
    4. Tokens whose first character is not a letter and which are not all
       digits are dropped, unless ``is_upper`` is true, in which case the
       lower-cased lemma is emitted.
    5. ``NUM`` tokens become the placeholder ``<NUM{len(token)}>``.
    6. Everything else becomes its lemma with non-word characters removed,
       lower-cased.

    Args:
        txt: The text to tokenize.
        skip_merged_particles: When ``True``, the particle consumed by rule 1
            is skipped instead of being visited again by the loop. The default
            ``False`` reproduces the historical output, in which the particle
            is processed a second time on the next iteration (the original
            ``i = i + 1`` inside a ``for`` loop never skipped anything).
            NOTE(review): the persisted classifier was presumably trained on
            the historical token stream -- confirm before switching this on.

    Returns:
        list[str]: The cleaned tokens, in document order.
    """
    # Only disable the parser while it is still active: repeated calls then
    # become cheap no-ops, and we avoid errors on spaCy versions that raise
    # when asked to disable a component that is no longer in the pipeline.
    if 'parser' in nlp.pipe_names:
        nlp.disable_pipes('parser')

    tokens = list(nlp(txt))
    last_index = len(tokens) - 1
    clean_tokens = []
    skip_next = False

    for i, t in enumerate(tokens):
        if skip_next:
            skip_next = False
            continue

        following = tokens[i + 1] if i != last_index else None

        if (following is not None
                and following.pos_ == 'PART'
                and re.search(r'\'', following.text)):
            merged = re.sub(r'\W', '', t.text + following.text).lower()
            clean_tokens.append(merged)
            skip_next = skip_merged_particles
        elif t.pos_ == 'PUNCT':
            continue
        elif t.text in ('Vet. App.', 'Fed. Cir.'):
            clean_tokens.append(t.lemma_.lower())
        elif not t.text[0].isalpha() and not t.is_digit:
            # Symbol-like tokens are kept only when spaCy flags them as upper-case.
            if t.is_upper:
                clean_tokens.append(t.lemma_.lower())
        elif t.pos_ == 'NUM':
            clean_tokens.append(f'<NUM{len(t)}>')
        else:
            clean_tokens.append(re.sub(r'\W', '', t.lemma_).lower())

    return clean_tokens

def spans_add_spacy_tokens(spans):
    """Annotate every span in place with its cleaned tokens and their count.

    For each span, ``span['txt']`` is passed through ``spacy_tokenize``; the
    resulting list is stored under ``'tokens_spacy'`` and its length under
    ``'tokens_number'``. Nothing is returned.
    """
    for span in spans:
        cleaned = spacy_tokenize(span['txt'])
        span['tokens_spacy'], span['tokens_number'] = cleaned, len(cleaned)
        
def add_word_vec(spans):
    """Attach an averaged fastText word vector to every span, in place.

    For each span the vectors of all entries in ``span['tokens_spacy']`` are
    summed (accumulated in a float64 array) and divided by
    ``span['tokens_number']``; the result is stored under ``'word_vec'``.
    Spans without tokens receive an all-zero vector.

    The vector width is read from the loaded fastText model instead of being
    hard-coded, so the zero vector always matches the model's embeddings.

    Args:
        spans: Iterable of span dicts that already carry ``'tokens_spacy'``
            and ``'tokens_number'``.
    """
    dim = wv_model.get_dimension()
    for s in spans:
        tokens = s['tokens_spacy']
        if len(tokens) == 0:
            s['word_vec'] = np.zeros(dim)
            continue

        total = np.zeros(dim)
        for word in tokens:
            total += wv_model.get_word_vector(word)
        s['word_vec'] = total / s['tokens_number']

     



In [3]:
# Base names (without the ".joblib" suffix) of the persisted classifier and vectorizer.
model_file="Radial_Kernel_SVM_WordEmb"
vectorizer_file="spacy_tfidf_vectorizer"
# NOTE(review): joblib's load() unpickles arbitrary objects -- only load trusted files.
clf = load(f'{model_file}.joblib') 
# Loaded here but not referenced by any active code visible in this section.
spacy_tfidf_vectorizer = load(f'{vectorizer_file}.joblib')

# Constants used to standardise each span's token count when building features;
# presumably the mean / std of token counts in the training data -- TODO confirm.
train_mean = 20.592041800643088 
train_std = 15.221984070624842
# Module-level scratch list: make_dict appends span dicts here and analyze clears it.
for_predict=[]
    
def make_dict(sentences, indices, doc_length):
    """Append one span dict per sentence to the module-level ``for_predict`` list.

    Each dict holds the sentence text under ``'txt'`` and, under
    ``'start_normalized'``, the first element of the matching ``indices``
    entry divided by ``doc_length``. Nothing is returned.
    """
    for position, sentence in enumerate(sentences):
        span = {
            'txt': sentence,
            'start_normalized': indices[position][0] / doc_length,
        }
        for_predict.append(span)
    

def make_feature_vectors_and_labels(spans):
    """Assemble the dense feature matrix for a list of span dicts.

    Despite the name, only the feature matrix is returned; no label vector is
    built here. Columns, left to right:

    * the span's ``'word_vec'`` entries,
    * ``'start_normalized'``,
    * ``'tokens_number'`` standardised with ``train_mean`` / ``train_std``.

    Returns:
        numpy.ndarray: One row per span.
    """
    start_column = np.array([span['start_normalized'] for span in spans])
    length_column = np.array(
        [(span['tokens_number'] - train_mean) / train_std for span in spans]
    )
    embeddings = np.array([span['word_vec'] for span in spans])

    # The scalar features are 1-D; give them a trailing axis so they can be
    # joined to the 2-D embedding block column-wise.
    pieces = (
        embeddings,
        start_column[:, np.newaxis],
        length_column[:, np.newaxis],
    )
    return np.concatenate(pieces, axis=1)

def analyze(doc):
    """Split a document into sentences and predict a label for each one.

    The document is segmented with ``luima.text2sentences``; every sentence is
    turned into a span dict (text + normalised start offset), tokenised,
    embedded, converted to a feature row and classified with ``clf``.

    Args:
        doc: The full document text.

    Returns:
        list[list]: ``[sentence, label]`` pairs in document order; an empty
        list when segmentation yields no sentences.
    """
    sentences = luima.text2sentences(doc, offsets=False)
    if len(sentences) == 0:
        # Nothing to classify; building an empty feature matrix would fail.
        return []
    index = luima.text2sentences(doc, offsets=True)

    # ``for_predict`` is shared module state filled by make_dict. Start from a
    # clean list and always empty it again, so an exception part-way through
    # cannot leave stale spans that would misalign the next call's labels.
    for_predict.clear()
    try:
        make_dict(sentences, index, len(doc))
        spans_add_spacy_tokens(for_predict)
        add_word_vec(for_predict)
        predict_X = make_feature_vectors_and_labels(for_predict)
    finally:
        for_predict.clear()

    labels = clf.predict(predict_X)
    return [[sentence, label] for sentence, label in zip(sentences, labels)]

In [4]:
# Smoke test: classify a one-sentence document and display the [sentence, label] pairs.
test_string="TEST STRING"
analyze(test_string)

[['TEST STRING', 'Header']]