# spacy toolkit
> Tools enhancing spacy usage

In [1]:
# default_exp  spacy

In [9]:
# export
from forgebox.html import DOM,JS
import numpy as np
import json

L2 normalization and pairwise cosine distance between the token vectors of two sentences

In [10]:
# export
def l2norm(x):
    """Return the L2 (Euclidean) norm of `x`, reduced over its last axis."""
    squared = np.power(x, 2)
    return np.sqrt(squared.sum(-1))
def normal(x):
    """
    Divide every row of `x` by that row's L2 norm.

    The norms come from `l2norm` and are given a trailing axis so they
    broadcast against the rows of `x`. There is no guard for rows whose
    norm is zero.
    """
    row_norms = l2norm(x)
    return x / row_norms[:, np.newaxis]
def distance(a, b):
    """
    Return the matrix of cosine distances between the rows of `a` and `b`.

    Entry [i, j] is 1 minus the dot product of the normalized row i of `a`
    with the normalized row j of `b`.
    """
    unit_a = normal(a)
    unit_b = normal(b)
    cosine_similarity = unit_a @ unit_b.T
    return 1 - cosine_similarity

In [11]:
# export

# JavaScript source (kept as a plain string) that wires up the hover highlighting.
# Once the document is ready it:
#   * applies margin / padding / border-radius to every ".nlp_tok" element,
#   * parses window.edge_data and window.edge_data2 as JSON,
#   * for every key of those objects, stores the entry's `idx` list on the
#     element "#map_word_src_<key>" / "#map_word_tgt_<key>", makes it bold, and
#     attaches handlers that colour the element and its matched counterparts
#     (#FFCCEE) and reset them to white on mouseleave.
# It relies on a global `$` offering a jQuery-style API being present on the page.
# NOTE(review): `all_b` is declared but never used.
# NOTE(review): `.hover` receives a single handler; in jQuery that form is
# documented to fire on both enter and leave -- confirm this is intended.
highlight = """
$(document).ready(function(){
    const red=(dom)=>{
        $(dom)
            .css("background-color","#FFCCEE")
            .css("box-shadow","0px 0px 10px #FFCCEE")
    }
    const white=(dom)=>{
        $(dom).css("background-color","#FFFFFF")
            .css("box-shadow","0px 0px 0px #FFFFFF")
    }
    $(".nlp_tok")
            .css("margin","1px 2px")
            .css("padding","1px 1px")
            .css("border-radius","5px");
        var edge = JSON.parse(window.edge_data)
        var edge2 = JSON.parse(window.edge_data2)
        var all_b = []
        for(var a in edge)
        {
            $(`#map_word_src_${a}`).data("match",edge[a].idx)
            $(`#map_word_src_${a}`).css("font-weight",900)
           $(`#map_word_src_${a}`)
                   .hover(function(){
                       red(this)
                       var b_match = $(this).data("match")
                       for(var i in b_match){
                           var b = b_match[i]
                           red(document.querySelector(`#map_word_tgt_${b}`))
                       }
                   })
            $(`#map_word_src_${a}`)
                    .mouseleave(function(){
                        white(this)
                        var b_match = $(this).data("match")
                        for(var i in b_match){
                           var b = b_match[i]
                           white(document.querySelector(`#map_word_tgt_${b}`))
                       }
                    })
        }
        for(var a in edge2)
        {
            $(`#map_word_tgt_${a}`).data("match",edge2[a].idx)
            $(`#map_word_tgt_${a}`).css("font-weight",900)
           $(`#map_word_tgt_${a}`)
                   .hover(function(){
                       red(this)
                       var b_match = $(this).data("match")
                       for(var i in b_match){
                           var b = b_match[i]
                           red(document.querySelector(`#map_word_src_${b}`))
                       }
                   })
            $(`#map_word_tgt_${a}`)
                    .mouseleave(function(){
                        white(this)
                        var b_match = $(this).data("match")
                        for(var i in b_match){
                           var b = b_match[i]
                           white(document.querySelector(`#map_word_src_${b}`))
                       }
                    })
        }
        
})
"""
def make_map(distance_map, th=.1):
    """
    Build the hot-spot lookup between similar tokens

    distance_map: 2-D array of distances, one row per source token and
        one column per target token
    th: float, a column counts as a match when its distance is strictly
        below this value

    return dict keyed by the source row index (as str); each value holds
    "dist" (the matching distances, as floats) and "idx" (the matching
    column indices, as str). Rows without any match are left out.
    """
    matches = {}
    for row_no, row in enumerate(distance_map):
        below = row < th
        if not below.any():
            # nothing close enough for this token: no entry at all
            continue
        matches[str(row_no)] = {
            "dist": list(row[below].astype(float)),
            "idx": list(np.flatnonzero(below).astype(str)),
        }
    return matches

def make_span(tok, tok_id, classes):
    """
    Wrap one token in a <span> element

    tok: spacy token, its `.text` becomes the span content
    tok_id: value for the span's html id attribute
    classes: iterable of class names, joined with single spaces

    return forgebox.html.DOM
    """
    attributes = {"class": " ".join(classes), "id": tok_id}
    return DOM(tok.text, "span", attributes)

def make_div(doc, id_list, classes_list):
    """
    Build the <div class="text-block"> holding one span per token

    doc: spacy doc, iterated token by token
    id_list: html ids, one per token
    classes_list: per-token collections of class names for the spans

    The three inputs are walked in lockstep with zip, so the shortest
    one decides how many spans are produced.

    return forgebox.html.DOM
    """
    container = DOM("", "div", {"class": "text-block"})
    for span_args in zip(doc, id_list, classes_list):
        container.append(make_span(*span_args))
    return container

def doc2div(doc,distance_map,th=.1,is_src= True):
    """
    Emit the match data for one sentence and build its token div

    doc: spacy doc of the sentence
    distance_map: distance matrix as produced by `distance`; it is used
        as given for the source side and transposed for the target side
    th: float, threshold forwarded to make_map
    is_src: bool, True for the source sentence (ids "map_word_src_<i>",
        data in window.edge_data), False for the target sentence
        (ids "map_word_tgt_<i>", data in window.edge_data2)

    Side effect: a script DOM assigning the JSON-encoded match map to the
    window variable is created and called right away.

    return forgebox.html.DOM, the div returned by make_div
    """
    n_tokens = len(doc)
    side = "src" if is_src else "tgt"

    if is_src:
        script_text = f"window.edge_data = '{json.dumps(make_map(distance_map,th=th))}'"
    else:
        script_text = f"window.edge_data2 = '{json.dumps(make_map(distance_map.T,th=th))}'"
    DOM(script_text,"script")()

    span_ids = [f"map_word_{side}_{pos}" for pos in range(n_tokens)]
    # every token span carries two classes: its side ("src"/"tgt") and "nlp_tok"
    span_classes = zip([side] * n_tokens, ["nlp_tok"] * n_tokens)
    return make_div(doc, span_ids, span_classes)



def compare_sentences(A,B,nlp,th=.1):
    """
    Display two sentences with interactive highlighting of similar tokens
    A: str, a sentence
    B: str, another sentence
    nlp: loaded spacy pipeline, eg. from spacy import load;nlp = load("some_model_name"),
    see spacy documentation https://spacy.io/usage/spacy-101/

    th:float, threshold on the cosine distance between token vectors; the smaller
    it is, the fewer matching tokens will be found, presumably of higher quality

    Nothing is returned: the headings, the two token blocks and the
    highlighting script are emitted as a side effect.
    """
    src_doc, tgt_doc = nlp(A), nlp(B)
    # rows follow the tokens of A, columns the tokens of B
    pairwise = distance(src_doc.tensor, tgt_doc.tensor)

    panels = (
        ("Sentence A", src_doc, True),
        ("Sentence B", tgt_doc, False),
    )
    for title, sentence_doc, source_side in panels:
        DOM(title,"h3",{"class":"text-primary"})()
        doc2div(sentence_doc,pairwise,th=th,is_src=source_side)()

    JS(highlight)

## Visualizing Sentence Compare

### Example
* Suppose you have 2 sentences, name them A and B

In [14]:
# Example text A: a multi-section abstract (BACKGROUND ... PATIENT SUMMARY) kept as one string.
A = """BACKGROUND: In the COU-AA-302 study (NCT00887198), abiraterone acetate plus prednisone (AAP) 
significantly improved outcomes in patients with metastatic castration-resistant prostate cancer (mCRPC) 
versus prednisone alone. Baseline clinical parameters predicting that treatment response could help inform 
clinical decisions were explored. OBJECTIVE: To identify patients who derive the greatest clinical benefit 
from AAP treatment. DESIGN, SETTING, AND PARTICIPANTS: A total of 1088 mCRPC patients treated with either 
AAP or prednisone in the first-line setting in COU-AA-302 were included in this post hoc analysis. 
INTERVENTION: Abiraterone acetate1000mg daily versus placebo, both plus prednisone 10mg daily. 
OUTCOME MEASUREMENTS AND STATISTICAL ANALYSIS: Univariate and multivariable Cox regression 
analyses were performed, including clinical and pathological parameters for the primary end points overall 
survival (OS) and radiographic progression-free survival (rPFS), and secondary study end points. 
Tumor-associated baseline parameters independently impacting OS were applied to stratify patients according 
to possible treatment effects. RESULTS AND LIMITATIONS: Baseline prostate-specific antigen (PSA), tumor-related 
pain as assessed by the Brief Pain Inventory-Short Form (BPI-SF), and Gleason score (GS) at primary diagnosis
were identified as tumor-associated variables that independently impacted OS. AAP significantly improved 
outcomes versus prednisone in both group 1 (BPI-SF 0-1 and PSA <80 ng/ml and GS <8; p=0.006; hazard ratio 
[HR]: 0.61) and group 2 (BPI-SF 2-3 and/or PSA ≥80 ng/ml and/or GS ≥8; p=0.03; HR: 0.84). 
The differences observed for treatment effects between groups 1 and 2 for OS (HR: 0.61 vs 0.84), rPFS 
(HR: 0.41 vs 0.59), and time to chemotherapy (HR: 0.64 vs 0.71) were not statistically significant. 
CONCLUSIONS: AAP significantly improved outcomes in mCRPC patients compared with prednisone alone regardless 
of baseline pain and PSA level, and GS at primary diagnosis with no significant differences between observed 
treatment effects in groups 1 and 2. PATIENT SUMMARY: Treatment with abiraterone acetate and prednisone 
(compared with treatment with prednisone only) for metastatic castration-resistant prostate cancer 
increased survival in all patients in the study regardless of pain, prostate-specific antigen levels 
at the start of treatment, and Gleason score at primary diagnosis."""

# Example text B: a single passage wrapped in single quotes, with the same wording as the
# PATIENT SUMMARY part of A.
B = """'Treatment with abiraterone acetate and prednisone (compared with treatment with prednisone only) 
for metastatic castration-resistant prostate cancer increased survival in all patients in the study regardless 
of pain, prostate-specific antigen levels at the start of treatment, and Gleason score at primary diagnosis.'
"""

You can run the comparison between A and B

In [15]:
from spacy import load
# Load the spaCy pipeline called "en_core_web_sm"; presumably it has to be installed beforehand.
nlp = load("en_core_web_sm")

# Render both texts with the default threshold (th=.1); output is displayed, nothing is returned.
compare_sentences(A,B,nlp)