In [1]:
import sys
sys.path.append(r"C:\Users\siyam\Desktop\Legal DS\project\venv\Lib\site-packages")
import os
from pathlib import Path
import json
import random
import numpy as np
import spacy


from tqdm import tqdm
import spacy
import json
import random
import re
import pandas as pd
import numpy as np
from copy import deepcopy
from sklearn import model_selection
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
from spacy.symbols import ORTH
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import tree
import matplotlib.pyplot as plt
from spacy.language import Language
from luima_sbd import sbd_utils as luima

## Load Data 

In [2]:
# Location of the curated annotation export (JSON), relative to the notebook's working directory.
CURATED_ANN_PATH = "../Data/ldsi_w2021-20220221T223611Z-001/ldsi_w2021/ldsi_w21_curated_annotations_v2.json"

In [3]:
# Parse the annotation export. The encoding is passed explicitly so that decoding does
# not depend on the platform default (e.g. cp1252 on Windows); annotation offsets are
# later used to slice the decoded document text, so a wrong codec would corrupt them.
with open(CURATED_ANN_PATH, 'r', encoding='utf-8') as j:
    data = json.load(j)

In [4]:
# Show the top-level sections of the export.
data.keys()

dict_keys(['documents', 'annotations', 'types'])

In [5]:
# Flat list of annotation records (each references a document id and a type id).
annotations = data['annotations']
# Documents and annotation types indexed by their `_id`.
documents_by_id = {d['_id']: d for d in data['documents']}
types_by_id = {t['_id']: t for t in data['types']}
# Name <-> id maps for annotation types.
type_ids_by_name = {t['name']: t['_id'] for t in data['types']}
type_names_by_id = {t['_id']: t['name'] for t in data['types']}
# Name <-> id maps for documents.
doc_id_by_name = {d['name']: d['_id'] for d in data['documents']}
doc_name_by_id = {d['_id']: d['name'] for d in data['documents']}

### Common functions

In [6]:
# Every annotation is treated as one sentence.
def make_span_data(documents_by_id, types_by_id, annotations):
    """Turn raw annotations into self-contained span records.

    Args:
        documents_by_id: mapping of document id -> document dict with a 'plainText' entry.
        types_by_id: mapping of type id -> type dict with a 'name' entry.
        annotations: iterable of dicts with 'start', 'end', 'document' and 'type' entries.

    Returns:
        A list with one dict per annotation holding the covered text ('txt'), the
        document id, the type name, the start/end offsets and the start offset
        divided by the document length ('start_normalized').
    """
    def _to_record(ann):
        begin, finish = ann['start'], ann['end']
        doc_key = ann['document']
        full_text = documents_by_id[doc_key]['plainText']
        return {
            'txt': full_text[begin:finish],
            'document': doc_key,
            'type': types_by_id[ann['type']]['name'],
            'start': begin,
            'start_normalized': begin / len(full_text),
            'end': finish,
        }

    return [_to_record(ann) for ann in annotations]

In [7]:
def prepare_ann_span_by_doc(spans, doc_id):
    """Collect the start and end offsets of all spans that belong to one document.

    Args:
        spans: iterable of span dicts with 'document', 'start' and 'end' entries.
        doc_id: id of the document to select.

    Returns:
        A `(starts, ends)` tuple of two parallel lists, in the order the matching
        spans appear in `spans`.
    """
    matching = [record for record in spans if record['document'] == doc_id]
    starts = [record['start'] for record in matching]
    ends = [record['end'] for record in matching]
    return starts, ends

In [8]:
def generate_ann_span_by_doc_with_spacy(train_doc_ids, nlp):
    """Segment documents with a spaCy pipeline and record sentence character offsets.

    Reads each document's text from the notebook-level `documents_by_id`.

    Args:
        train_doc_ids: iterable of document ids to process.
        nlp: callable pipeline; its result must expose `.sents`, whose items have
            `start_char` and `end_char`.

    Returns:
        Dict mapping each document id to {'start': [...], 'end': [...]} with the
        character offsets of the generated sentences.
    """
    spans_by_doc = {}
    # The progress bar is intentionally disabled; callers show their own.
    for doc_key in tqdm(train_doc_ids, disable=True):
        parsed = nlp(documents_by_id[doc_key]['plainText'])
        offsets = [(sentence.start_char, sentence.end_char) for sentence in parsed.sents]
        spans_by_doc[doc_key] = {
            'start': [begin for begin, _ in offsets],
            'end': [finish for _, finish in offsets],
        }
    return spans_by_doc

In [9]:
def generate_ann_span_by_doc_with_luima(train_doc_ids):
    """Segment documents with the LUIMA sentence splitter and record sentence offsets.

    Reads each document's text from the notebook-level `documents_by_id`.

    Args:
        train_doc_ids: iterable of document ids to process.

    Returns:
        Dict mapping each document id to {'start': [...], 'end': [...]}, built from
        the first and second element of every item returned by
        `luima.text2sentences(text, offsets=True)`.
    """
    gen_ann_span_by_doc = {}
    # The progress bar is intentionally disabled; callers show their own.
    for train_id in tqdm(train_doc_ids, disable=True):
        text = documents_by_id[train_id]['plainText']
        # Only offsets are needed here, so the splitter is run once per document
        # (the sentence strings themselves are never used).
        indices = luima.text2sentences(text, offsets=True)
        gen_ann_span_by_doc[train_id] = {
            'start': [ind[0] for ind in indices],
            'end': [ind[1] for ind in indices],
        }
    return gen_ann_span_by_doc

In [10]:
def find_closest_start_point(train_doc_ids, true_ann_span_by_doc, gen_ann_span_by_doc):
    """Pair every true sentence start with the nearest generated sentence start.

    Args:
        train_doc_ids: iterable of document ids to process.
        true_ann_span_by_doc: dict doc id -> {'start': [...], ...} with annotated starts.
        gen_ann_span_by_doc: dict doc id -> {'start': [...], ...} with generated starts.

    Returns:
        Dict doc id -> {'true': [...], 'pred': [...]}; `pred[k]` is the generated
        start closest to `true[k]` (the earliest listed one on ties). When a
        document has no generated starts at all, `pred[k]` is `float('inf')` so that
        it can never fall within a matching tolerance.
    """
    closest_by_id = {}
    for train_id in train_doc_ids:
        true_starts = list(true_ann_span_by_doc[train_id]['start'])
        gen_starts = gen_ann_span_by_doc[train_id]['start']
        if len(gen_starts) > 0:
            # min() keeps the first minimal element, matching a strict "<" scan,
            # and needs no artificial "very large distance" sentinel.
            closest_neighbors = [
                min(gen_starts, key=lambda gen_start, t=true_start: abs(t - gen_start))
                for true_start in true_starts
            ]
        else:
            # Nothing to match against: never reuse a neighbour from another
            # true start or another document.
            closest_neighbors = [float('inf')] * len(true_starts)
        closest_by_id[train_id] = {'true': true_starts, 'pred': closest_neighbors}
    return closest_by_id

In [11]:
def calculate_error_metrics(train_doc_ids, true_ann_span_by_doc, gen_ann_span_by_doc, closest_by_id, tolerance=3):
    """Compute precision, recall and F1 of generated sentence starts over a set of documents.

    A true start counts as a true positive when its paired generated start
    (`closest_by_id[doc]['pred']`) lies within `tolerance` characters of it.
    Per document, false negatives are the remaining true starts and false
    positives are the remaining generated starts; counts are summed over all
    documents before the ratios are taken.

    Args:
        train_doc_ids: iterable of document ids to evaluate.
        true_ann_span_by_doc: dict doc id -> {'start': [...], ...} with annotated starts.
        gen_ann_span_by_doc: dict doc id -> {'start': [...], ...} with generated starts.
        closest_by_id: dict doc id -> {'true': [...], 'pred': [...]} of paired starts.
        tolerance: maximum absolute character distance for a match (default 3).

    Returns:
        `(precision, recall, f1_score)` as floats. A ratio whose denominator is
        zero is reported as 0.0 instead of NaN or raising ZeroDivisionError.
    """
    TP = 0
    FP = 0
    FN = 0

    for train_id in train_doc_ids:
        true_split_len = len(true_ann_span_by_doc[train_id]['start'])
        gen_split_len = len(gen_ann_span_by_doc[train_id]['start'])

        # Float arrays so that an infinite "no candidate" marker is handled too.
        true_starts = np.asarray(closest_by_id[train_id]['true'], dtype=float)
        pred_starts = np.asarray(closest_by_id[train_id]['pred'], dtype=float)
        tp_doc = int((np.abs(true_starts - pred_starts) <= tolerance).sum())

        TP += tp_doc
        FN += true_split_len - tp_doc
        FP += gen_split_len - tp_doc

    precision = TP / (TP + FP) if (TP + FP) else 0.0
    recall = TP / (TP + FN) if (TP + FN) else 0.0
    if precision + recall:
        f1_score = 2 * (precision * recall) / (precision + recall)
    else:
        f1_score = 0.0

    return float(precision), float(recall), float(f1_score)

## Create Corpus

In [12]:
# One record per annotation, plus the parallel list of annotation type names.
spans = make_span_data(documents_by_id, types_by_id, annotations)
span_labels = [s['type'] for s in spans]

In [13]:
# Ids of the training documents, loaded from a saved NumPy array.
train_doc_ids = np.load('../Data/train.npy')

In [14]:
# Number of training documents.
train_doc_ids.shape

(113,)

In [15]:
# Ground truth per training document: start/end offsets of its annotated spans.
true_ann_span_by_doc = {}
for train_id in train_doc_ids:
    # Scans all spans once per document and keeps those belonging to `train_id`.
    ann_span_starts, ann_span_ends = prepare_ann_span_by_doc(spans, train_id)
    true_ann_span_by_doc[train_id] = {'start': ann_span_starts, 'end': ann_span_ends}

### Phase 2.1: Using standard sentence segmenter

#### List spans of annotations of each document

In [16]:
# %time 
# nlp = spacy.load("en_core_web_sm")
# gen_ann_span_by_doc = generate_ann_span_by_doc_with_spacy(train_doc_ids, nlp)

In [17]:
# %time closest_by_id = find_closest_start_point(train_doc_ids, true_ann_span_by_doc, gen_ann_span_by_doc)

In [18]:
# precision, recall, f1_score = calculate_error_metrics(train_doc_ids, true_ann_span_by_doc, gen_ann_span_by_doc, closest_by_id)
# print(f'Precision: {precision:.2f}\nRecall: {recall:.2f}\nF1_score: {f1_score:.2f}')

### Phase 2.2: Extending spacy

In [19]:
# Single-token section headers (and the underscore separator line) that
# `set_custom_boundaries` turns into stand-alone sentences. The commented-out
# entries are first words of multi-word headers.
single_word_headers = ["REPRESENTATION",
#                        "WITNESS",
#                        "ATTORNEY",
#                        "REASONS",
                       "____________________________________________",
                       "ORDER",
                       "INTRODUCTION",
#                        "CONCLUSION",
#                        "FINDINGS"
                      ]


# Multi-word section headers. NOTE(review): this list is not referenced by any code
# visible in this notebook; `set_custom_boundaries` spells these headers out itself.
other_headers = ["THE ISSUE",
                 "WITNESS AT HEARING ON APPEAL",
                 "ATTORNEY FOR THE BOARD",
                 "FINDINGS OF FACT",
                 "CONCLUSION OF LAW",
                 "REASONS AND BASES FOR FINDING AND CONCLUSION",
                ]

In [20]:
# EXTEND SPACY CELL
@Language.component("set_custom_boundaries_original")
def set_custom_boundaries_original(doc):
    """First version of the custom sentence-boundary rules (token-by-token lookup).

    Clears `is_sent_start` on possessive suffixes, closing quotes, short whitespace
    tokens and a few docket-related tokens; clears it on the token that follows an
    opening quote; and sets it on tokens that begin a section header.

    Args:
        doc: the spaCy Doc being processed.

    Returns:
        The same Doc, with `is_sent_start` updated in place.
    """
    possessives = ("’s", "'s")
    opening_quotes = ("“", "‘")
    closing_quotes = ("”", "’")
    never_start = ("\n", "\t", "\r", " ", "  ", "   ", "    ", "DC.", "Archive", "NO.", "DOCKET")
    always_start = (
        "THE", "REPRESENTATION", "WITNESS", "ATTORNEY", "REASONS",
        "____________________________________________",
        "ORDER", "INTRODUCTION", "CONCLUSION", "FINDINGS", "FINDING",
    )

    last_index = len(doc) - 1
    for position, token in enumerate(doc):
        word = token.text
        if word in possessives or word in closing_quotes or word in never_start:
            token.is_sent_start = False
        elif word in opening_quotes:
            # The quoted text continues the current sentence.
            if position < last_index:
                doc[position + 1].is_sent_start = False
        elif word in always_start:
            token.is_sent_start = True
    return doc

In [21]:
# EXTEND SPACY CELL
@Language.component("set_custom_boundaries")
def set_custom_boundaries(doc):
    """Pre-set sentence boundaries around section headers and docket lines.

    Rules, checked in this order for every token (first match wins):

    1. Possessive suffixes never start a sentence.
    2. Single newline/tab/carriage-return tokens and "DC.", "Archive", "NO."
       never start a sentence.
    3. "DOCKET NO." and the token after it stay inside the current sentence.
    4. A token listed in the notebook-level `single_word_headers` starts a
       sentence, and so does the next non-whitespace token after it.
    5. "THE ISSUE" starts a sentence, and so does the next non-whitespace token.
    6. A multi-word header (e.g. "FINDINGS OF FACT") starts a sentence, its
       remaining words and the token right after them are kept inside it, and the
       next non-whitespace token starts a new sentence.
    7. "on appeal from the" (first word matched case-insensitively) starts a
       sentence and its remaining words and the token right after them are kept
       inside it.

    All look-ahead is bounded by the document length, so a header at the very end
    of a document no longer raises IndexError.

    Args:
        doc: the spaCy Doc being processed (the notebook adds this component
            before the parser).

    Returns:
        The same Doc, with `is_sent_start` updated in place.
    """
    n_tokens = len(doc)

    # first token -> (number of tokens in the rest of the header, accepted rest texts)
    multi_word_headers = {
        "WITNESS": (4, ("AT HEARING ON APPEAL",)),
        "ATTORNEY": (3, ("FOR THE BOARD",)),
        "FINDINGS": (2, ("OF FACT",)),
        "FINDING": (2, ("OF FACT",)),
        "CONCLUSION": (2, ("OF LAW",)),
        "REASONS": (6, ("AND BASES FOR FINDING AND CONCLUSION",
                        "AND BASES FOR FINDINGS AND CONCLUSION")),
    }

    def mark_not_start(first, last):
        """Clear is_sent_start on tokens first..last (inclusive), clamped to the doc."""
        for j in range(first, min(last, n_tokens - 1) + 1):
            doc[j].is_sent_start = False

    def start_sentence_at_next_text(j):
        """Set is_sent_start on the first non-whitespace token at or after index j, if any."""
        while j < n_tokens and doc[j].text.isspace():
            j += 1
        if j < n_tokens:
            doc[j].is_sent_start = True

    def header_tail_length(i, first_word):
        """Return the tail length if a multi-word header starts at token i, else None."""
        if first_word not in multi_word_headers:
            return None
        n_tail, accepted = multi_word_headers[first_word]
        if doc[i+1:i+1+n_tail].text in accepted:
            return n_tail
        return None

    for i in range(n_tokens):
        text = doc[i].text

        if text in ("’s", "'s"):
            doc[i].is_sent_start = False

        elif text in ("\n", "\t", "\r", "DC.", "Archive", "NO."):
            doc[i].is_sent_start = False

        elif text == "DOCKET" and doc[i+1:i+3].text == "NO.":
            doc[i].is_sent_start = False
            mark_not_start(i + 1, i + 3)

        elif text in single_word_headers:
            doc[i].is_sent_start = True
            start_sentence_at_next_text(i + 1)

        elif text == "THE" and i + 1 < n_tokens and doc[i+1].text == "ISSUE":
            doc[i].is_sent_start = True
            start_sentence_at_next_text(i + 2)

        elif header_tail_length(i, text) is not None:
            n_tail = header_tail_length(i, text)
            doc[i].is_sent_start = True
            # Also covers the token right after the header (index i + 1 + n_tail);
            # it is re-marked as a start below when it is not whitespace.
            mark_not_start(i + 1, i + 1 + n_tail)
            start_sentence_at_next_text(i + 1 + n_tail)

        elif text.lower() == "on" and doc[i+1:i+4].text == "appeal from the":
            doc[i].is_sent_start = True
            mark_not_start(i + 1, i + 4)

    return doc

### Test the splits for a single document

In [22]:
# Document used for the side-by-side split listings below.
doc_id = '61aea55c97ad59b4cfc4129f'

##### View the original (annotated) sentence splits

In [23]:
# Original (annotated) split of the selected document.
doc_id = doc_id
# Two-column table of (start, end) offsets, ordered by start offset.
df = pd.DataFrame([true_ann_span_by_doc[doc_id]['start'], true_ann_span_by_doc[doc_id]['end']]).T
df.columns = ['start', 'end']
df = df.sort_values('start').reset_index(drop=True)
df_true = np.array(df)

test_doc = documents_by_id[doc_id]['plainText']
print(f"Total true splits: {df_true.shape[0]}")
print(f"True splits for the document: {doc_id}")
print("---------------------------------------------------------")
# Print the text covered by every annotated span.
for i, true_span in enumerate(df_true):
    print(f"Sentence {i+1}: {test_doc[true_span[0]: true_span[1]]}\n\n")
    

Total true splits: 185
True splits for the document: 61aea55c97ad59b4cfc4129f
---------------------------------------------------------
Sentence 1: Citation Nr: 0610579	
Decision Date: 04/13/06    Archive Date: 04/26/06

DOCKET NO.  04-19 577	)	DATE
	)
	)


Sentence 2: On appeal from the
Department of Veterans Affairs Regional Office in Manila, the 
Republic of the Philippines


Sentence 3: THE ISSUE


Sentence 4: Entitlement to a higher initial rating for service-connected 
lumbosacral strain with lumbar disc disease, currently 
evaluated as 10 percent disabling.


Sentence 5: REPRESENTATION


Sentence 6: Appellant represented by:	Veterans of Foreign Wars of 
the United States


Sentence 7: WITNESS AT HEARING ON APPEAL


Sentence 8: Veteran


Sentence 9: ATTORNEY FOR THE BOARD


Sentence 10: Tanya A. Smith, Counsel


Sentence 11: INTRODUCTION


Sentence 12: The veteran had active service from December 1998 to April 
2003.


Sentence 13: This matter comes before the Board of

##### View the generated splits using standard segmenter

In [24]:
# Splits generated by the unmodified en_core_web_sm pipeline for the selected document.
doc_id = doc_id
nlp = spacy.load("en_core_web_sm")
gen_ann_span_std = generate_ann_span_by_doc_with_spacy([doc_id], nlp)

# Two-column table of (start, end) offsets, ordered by start offset.
df = pd.DataFrame([gen_ann_span_std[doc_id]['start'], gen_ann_span_std[doc_id]['end']]).T
df.columns = ['start', 'end']
df = df.sort_values('start').reset_index(drop=True)
df_gen = np.array(df)

test_doc = documents_by_id[doc_id]['plainText']

print(f"Generated splits with standard segmenter for the document: {doc_id}")
print("-------------------------------------------------------------------------------------")

# Print every generated sentence with surrounding whitespace stripped.
for i, gen_span in enumerate(df_gen):
    print(f"Sentence {i+1}: {(test_doc[gen_span[0]: gen_span[1]]).strip()}\n\n")


Generated splits with standard segmenter for the document: 61aea55c97ad59b4cfc4129f
-------------------------------------------------------------------------------------
Sentence 1: Citation Nr: 0610579	
Decision Date: 04/13/06    Archive Date: 04/26/06

DOCKET NO.


Sentence 2: 04-19 577	)	DATE
	)
	)

On appeal from the
Department of Veterans Affairs Regional Office in Manila, the 
Republic of the Philippines


THE ISSUE

Entitlement to a higher initial rating for service-connected 
lumbosacral strain with lumbar disc disease, currently 
evaluated as 10 percent disabling.


Sentence 3: REPRESENTATION

Appellant represented by:	Veterans of Foreign Wars of 
the United States


WITNESS AT HEARING ON APPEAL

Veteran



ATTORNEY FOR THE BOARD

Tanya A. Smith, Counsel


INTRODUCTION


Sentence 4: The veteran had active service from December 1998 to April 
2003.


Sentence 5: This matter comes before the Board of Veterans' Appeals 
(Board) on appeal from a 

##### View the generated splits using the extended segmenter

In [25]:
# Splits generated by the extended segmenter (en_core_web_sm plus the
# `set_custom_boundaries` component placed before the parser) for the selected document.
nlp = spacy.load("en_core_web_sm")
# nlp.add_pipe("set_custom_boundaries_original", before="parser")
nlp.add_pipe("set_custom_boundaries", before="parser")
gen_ann_span_ext = generate_ann_span_by_doc_with_spacy([doc_id], nlp)

# Two-column table of (start, end) offsets, ordered by start offset.
df = pd.DataFrame([gen_ann_span_ext[doc_id]['start'], gen_ann_span_ext[doc_id]['end']]).T
df.columns = ['start', 'end']
df = df.sort_values('start').reset_index(drop=True)
df_gen = np.array(df)

test_doc = documents_by_id[doc_id]['plainText']

# The label names the segmenter that actually produced these splits.
print(f"Generated splits with extended segmenter for the document: {doc_id}")
print("-------------------------------------------------------------------------------------")

# Print every generated sentence with surrounding whitespace stripped.
for i, gen_span in enumerate(df_gen):
    print(f"Sentence {i+1}: {(test_doc[gen_span[0]: gen_span[1]]).strip()}\n\n")


Generated splits with standard segmenter for the document: 61aea55c97ad59b4cfc4129f
-------------------------------------------------------------------------------------
Sentence 1: Citation Nr: 0610579	
Decision Date: 04/13/06    Archive Date: 04/26/06

DOCKET NO.  04-19 577	)	DATE
	)
	)


Sentence 2: On appeal from the
Department of Veterans Affairs Regional Office in Manila, the 
Republic of the Philippines


Sentence 3: THE ISSUE


Sentence 4: Entitlement to a higher initial rating for service-connected 
lumbosacral strain with lumbar disc disease, currently 
evaluated as 10 percent disabling.


Sentence 5: 


Sentence 6: REPRESENTATION


Sentence 7: Appellant represented by:	Veterans of Foreign Wars of 
the United States


Sentence 8: WITNESS AT HEARING ON APPEAL


Sentence 9: Veteran


Sentence 10: ATTORNEY FOR THE BOARD


Sentence 11: Tanya A. Smith, Counsel


Sentence 12: INTRODUCTION


Sentence 13: The veteran had active service from December 1998 to April 
2003.




### Error analysis on an individual level

In [26]:
# Document used for the single-document error analysis below.
doc_id = '61aea55c97ad59b4cfc412af'

##### Standard Spacy

In [27]:
# Single-document test: segment with the unmodified en_core_web_sm pipeline.
nlp = spacy.load("en_core_web_sm")
gen_ann_span_standard = generate_ann_span_by_doc_with_spacy([doc_id], nlp)
# Number of generated sentences (not displayed: it is not the cell's last expression).
len(gen_ann_span_standard[doc_id]['start'])

# Pair every true start with its nearest generated start.
closest_by_id_std = find_closest_start_point(
    [doc_id], 
    true_ann_span_by_doc, 
    gen_ann_span_standard
)


# Score the standard segmenter on this document.
print(f"Error analysis for the document {doc_id} with standard segmenter")
print("----------------------------------------------------------------------------------")
precision, recall, f1_score = calculate_error_metrics(
    [doc_id], 
    true_ann_span_by_doc, 
    gen_ann_span_standard, 
    closest_by_id_std
)

print(f'\nPrecision: {precision:.2f}\nRecall: {recall:.2f}\nF1_score: {f1_score:.2f}')

Error analysis for the document 61aea55c97ad59b4cfc412af with standard segmenter
----------------------------------------------------------------------------------

Precision: 0.62
Recall: 0.56
F1_score: 0.59


##### Extended Spacy

In [28]:
# Single-document test: en_core_web_sm plus the custom boundary component.
nlp = spacy.load("en_core_web_sm")
# nlp.add_pipe("set_custom_boundaries_original", before="parser")
nlp.add_pipe("set_custom_boundaries", before="parser")
# Register the reporter abbreviations as tokenizer special cases (one token each).
nlp.tokenizer.add_special_case('Vet. App.', [{ORTH: 'Vet. App.'}])
nlp.tokenizer.add_special_case('Fed. Cir.', [{ORTH: 'Fed. Cir.'}])

gen_ann_span_ext = generate_ann_span_by_doc_with_spacy([doc_id], nlp)
# Number of generated sentences (not displayed: it is not the cell's last expression).
len(gen_ann_span_ext[doc_id]['start'])

# Pair every true start with its nearest generated start.
closest_by_id_ext = find_closest_start_point(
    [doc_id], 
    true_ann_span_by_doc, 
    gen_ann_span_ext
)


# Score the extended segmenter on this document.
print(f"Error analysis for the document {doc_id} with extended segmenter")
print("----------------------------------------------------------------------------------")
precision, recall, f1_score = calculate_error_metrics(
    [doc_id], 
    true_ann_span_by_doc, 
    gen_ann_span_ext, 
    closest_by_id_ext
)

print(f'\nPrecision: {precision:.2f}\nRecall: {recall:.2f}\nF1_score: {f1_score:.2f}')

Error analysis for the document 61aea55c97ad59b4cfc412af with extended segmenter
----------------------------------------------------------------------------------

Precision: 0.74
Recall: 0.78
F1_score: 0.76


##### LUIMA SBD

In [29]:
# Single-document test: segment with the LUIMA sentence splitter.
gen_ann_span_luima = generate_ann_span_by_doc_with_luima([doc_id])
# Number of generated sentences (not displayed: it is not the cell's last expression).
len(gen_ann_span_luima[doc_id]['start'])

# Pair every true start with its nearest generated start.
closest_by_id_luima = find_closest_start_point(
    [doc_id], 
    true_ann_span_by_doc, 
    gen_ann_span_luima
)


# Score the LUIMA segmenter on this document.

print(f"Error analysis for the document {doc_id} with LUIMA segmenter")
print("----------------------------------------------------------------------------------")

precision, recall, f1_score = calculate_error_metrics(
    [doc_id], 
    true_ann_span_by_doc, 
    gen_ann_span_luima, 
    closest_by_id_luima
)

print(f'\nPrecision: {precision:.2f}\nRecall: {recall:.2f}\nF1_score: {f1_score:.2f}')

Error analysis for the document 61aea55c97ad59b4cfc412af with LUIMA segmenter
----------------------------------------------------------------------------------

Precision: 0.83
Recall: 0.98
F1_score: 0.90


### Overall error metric analysis

##### Error analysis for the standard segmenter

In [30]:
# Evaluate the unmodified en_core_web_sm pipeline on all training documents.
nlp = spacy.load("en_core_web_sm")
print("Splitting the documents")
gen_ann_span_standard = generate_ann_span_by_doc_with_spacy(train_doc_ids, nlp)


# Find closest neighbors
closest_by_id_std = find_closest_start_point(
    train_doc_ids, 
    true_ann_span_by_doc, 
    gen_ann_span_standard
)


# Error metrics (counts are pooled over all documents before the ratios are taken)
precision, recall, f1_score = calculate_error_metrics(
    train_doc_ids, 
    true_ann_span_by_doc, 
    gen_ann_span_standard, 
    closest_by_id_std
)
print("Error metrics using the standard segmenter:")
print(f'Precision: {precision:.2f}\nRecall: {recall:.2f}\nF1_score: {f1_score:.2f}')

Splitting the documents
Error metrics using the standard segmenter:
Precision: 0.60
Recall: 0.62
F1_score: 0.61


##### Error analysis for the extended segmenter

In [31]:
# Evaluate the extended pipeline (custom boundaries + tokenizer special cases)
# on all training documents.
nlp = spacy.load("en_core_web_sm")
# nlp.add_pipe("set_custom_boundaries_original", before="parser")
nlp.add_pipe("set_custom_boundaries", before="parser")
nlp.tokenizer.add_special_case('Vet. App.', [{ORTH: 'Vet. App.'}])
nlp.tokenizer.add_special_case('Fed. Cir.', [{ORTH: 'Fed. Cir.'}])

print("Splitting the documents")
gen_ann_span_ext = generate_ann_span_by_doc_with_spacy(train_doc_ids, nlp)

# Find closest neighbors
closest_by_id_ext = find_closest_start_point(
    train_doc_ids, 
    true_ann_span_by_doc, 
    gen_ann_span_ext
)

# Error metrics (counts are pooled over all documents before the ratios are taken)
precision, recall, f1_score = calculate_error_metrics(
    train_doc_ids, 
    true_ann_span_by_doc, 
    gen_ann_span_ext, 
    closest_by_id_ext
)
print("Error metrics using the extended segmenter:")
print(f'Precision: {precision:.2f}\nRecall: {recall:.2f}\nF1_score: {f1_score:.2f}')

Splitting the documents
Error metrics using the extended segmenter:
Precision: 0.71
Recall: 0.77
F1_score: 0.74


##### Error analysis for the LUIMA segmenter

In [32]:
# Evaluate the LUIMA sentence splitter on all training documents.
print("Splitting the documents")
gen_ann_span_luima = generate_ann_span_by_doc_with_luima(train_doc_ids)

# Find closest neighbors
closest_by_id_luima = find_closest_start_point(
    train_doc_ids, 
    true_ann_span_by_doc, 
    gen_ann_span_luima
)

# Error metrics (counts are pooled over all documents before the ratios are taken)
precision, recall, f1_score = calculate_error_metrics(
    train_doc_ids, 
    true_ann_span_by_doc, 
    gen_ann_span_luima, 
    closest_by_id_luima
)
print("Error metrics using the LUIMA SBD segmenter:")
print(f'Precision: {precision:.2f}\nRecall: {recall:.2f}\nF1_score: {f1_score:.2f}')

Splitting the documents
Error metrics using the LUIMA SBD segmenter:
Precision: 0.83
Recall: 0.99
F1_score: 0.90


### Save the error metrics for all the documents

##### Standard Spacy

In [33]:
# Per-document metrics for the unmodified en_core_web_sm pipeline.
nlp = spacy.load("en_core_web_sm")
error_metric_std_segmenter = []
# NOTE: this loop rebinds the notebook-level `doc_id`.
for doc_id in tqdm(train_doc_ids):
    gen_ann_span_standard = generate_ann_span_by_doc_with_spacy([doc_id], nlp)

    # Pair every true start with its nearest generated start.
    closest_by_id_std = find_closest_start_point(
        [doc_id], 
        true_ann_span_by_doc, 
        gen_ann_span_standard
    )

    precision, recall, f1_score = calculate_error_metrics(
        [doc_id], 
        true_ann_span_by_doc, 
        gen_ann_span_standard, 
        closest_by_id_std
    )
    
    # One summary row per document, rounded to two decimals.
    em_doc = {
        'doc_id': doc_id,
        'precision': round(precision, 2),
        'recall': round(recall, 2),
        'f1_score': round(f1_score, 2)
    }
    error_metric_std_segmenter.append(em_doc)

100%|████████████████████████████████████████████████████████████████████████████████| 113/113 [00:52<00:00,  2.15it/s]


##### Extended Spacy

In [None]:
# Per-document metrics for the extended pipeline (custom boundaries + special cases).
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("set_custom_boundaries", before="parser")
nlp.tokenizer.add_special_case('Vet. App.', [{ORTH: 'Vet. App.'}])
nlp.tokenizer.add_special_case('Fed. Cir.', [{ORTH: 'Fed. Cir.'}])

error_metric_ext_segmenter = []
# NOTE: this loop rebinds the notebook-level `doc_id`.
for doc_id in tqdm(train_doc_ids):
    gen_ann_span_ext = generate_ann_span_by_doc_with_spacy([doc_id], nlp)

    # Pair every true start with its nearest generated start.
    closest_by_id_ext = find_closest_start_point(
        [doc_id], 
        true_ann_span_by_doc, 
        gen_ann_span_ext
    )

    precision, recall, f1_score = calculate_error_metrics(
        [doc_id], 
        true_ann_span_by_doc, 
        gen_ann_span_ext, 
        closest_by_id_ext
    )
    
    # One summary row per document, rounded to two decimals.
    em_doc = {
        'doc_id': doc_id,
        'precision': round(precision, 2),
        'recall': round(recall, 2),
        'f1_score': round(f1_score, 2)
    }
    error_metric_ext_segmenter.append(em_doc)

##### LUIMA SBD

In [None]:
# Per-document metrics for the LUIMA sentence splitter.
error_metric_luima = []
# NOTE: this loop rebinds the notebook-level `doc_id`.
for doc_id in tqdm(train_doc_ids):
    gen_ann_span_luima = generate_ann_span_by_doc_with_luima([doc_id])
    # Pair every true start with its nearest generated start.
    closest_by_id_luima = find_closest_start_point(
        [doc_id], 
        true_ann_span_by_doc, 
        gen_ann_span_luima
    )


    precision, recall, f1_score = calculate_error_metrics(
        [doc_id], 
        true_ann_span_by_doc, 
        gen_ann_span_luima, 
        closest_by_id_luima
    )
    
    # One summary row per document, rounded to two decimals.
    em_doc = {
        'doc_id': doc_id,
        'precision': round(precision, 2),
        'recall': round(recall, 2),
        'f1_score': round(f1_score, 2)
    }
    error_metric_luima.append(em_doc)

In [None]:
# Compare the first document's metrics across the three segmenters.
error_metric_std_segmenter[0], error_metric_ext_segmenter[0], error_metric_luima[0]

In [None]:
# One DataFrame of per-document metrics per segmenter.
df_std = pd.DataFrame(error_metric_std_segmenter)
df_ext = pd.DataFrame(error_metric_ext_segmenter)
# NOTE(review): the name is spelled "lumia" while the rest of the notebook uses "luima".
df_lumia = pd.DataFrame(error_metric_luima)

In [None]:
# Documents on which the standard segmenter has the lowest precision.
df_std.sort_values('precision').head()

In [None]:
# Extended-segmenter metrics for three hand-picked documents.
df_ext[df_ext['doc_id']=='61aea55e97ad59b4cfc412de'], \
df_ext[df_ext['doc_id']=='61aea55e97ad59b4cfc412f0'], \
df_ext[df_ext['doc_id']=='61aea55c97ad59b4cfc41299']

In [None]:
# Extended-segmenter metrics for the row at position 17
# (NOTE(review): hard-coded position; depends on the order of `train_doc_ids`).
df_ext.iloc[17]

#### General observations:
1. spaCy splits a sentence whenever it encounters a full stop, e.g.:
##### Original sentence: 
Sentence 1: Citation Nr: 0721357	
Decision Date: 07/17/07    Archive Date: 08/02/07

DOCKET NO.  05-13 724	)	DATE
	)
	)
##### Spacy segmented sentence:
Sentence 1: Citation Nr: 0721357	
Decision Date: 07/17/07    Archive Date: 08/02/07

DOCKET NO.

2. Citations are split into too many separate sentences.

#### Things to improve
1. Split sentences not only on full stops but also on other cues, such as new lines.
2. Each section header should be treated as a separate sentence.


REPRESENTATION

ATTORNEY FOR THE BOARD

**INTRODUCTION**

FINDING OF FACT

FINDINGS OF FACT

CONCLUSION OF LAW

REASONS AND BASES FOR FINDING AND CONCLUSION

REASONS AND BASES FOR FINDINGS AND CONCLUSION

ORDER

In [None]:
# Hand-picked example strings for probing the tokenizer and segmenter:
# a plain sentence, a citation string, a quoted rule, and a sentence mixing citations with prose.
example_basic_1 = 'In sum, as the preponderance of the evidence is against the Veteran\'s claim, his appeal must be denied.'
example_cit_1 = 'Smith v. Gober, 14 Vet. App. 227 (2000), aff\'d 281 F.3d 1384 (Fed. Cir. 2002); Dela Cruz v. Principi, 15 Vet. App. 143 (2001); see also Quartuccio v. Principi, 16 Vet. App. 183 (2002).'
example_rule_1 = '"To establish a right to compensation for a present disability, a Veteran must show: "(1) the existence of a present disability; (2) in-service incurrence or aggravation of a disease or injury; and (3) a causal relationship between the present disability and the disease or injury incurred or aggravated during service"-the so-called "nexus" requirement."'
# NOTE(review): the replacement characters below look like a mis-encoded section sign;
# confirm against the source text before changing the literal.
example_mixed_1 = 'In Dingess v. Nicholson, 19 Vet. App. 473 (2006), the U.S. Court of Appeals for Veterans Claims held that, upon receipt of an application for a service-connection claim, 38 U.S.C.A. � 5103(a) and 38 C.F.R. � 3.159(b) require VA to provide the claimant with notice that a disability rating and an effective date for the award of benefits will be assigned if service connection is awarded. '

In [None]:
# Show how the unmodified pipeline tokenizes the citation example.
nlp = spacy.load("en_core_web_sm")
doc = nlp(example_cit_1)
for token in doc:
    print(token)

In [None]:
# Segment the citation example with the extended pipeline
# (custom boundary component + tokenizer special cases) and list the sentences.
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("set_custom_boundaries", before="parser")
nlp.tokenizer.add_special_case('Vet. App.', [{ORTH: 'Vet. App.'}])
nlp.tokenizer.add_special_case('Fed. Cir.', [{ORTH: 'Fed. Cir.'}])
doc = nlp(example_cit_1)
list(doc.sents)