In [2]:
import sys
sys.path.append('../code')
import os
from pathlib import Path
import json
import random
import numpy as np
import spacy


from tqdm import tqdm
import pickle
import spacy
import json
import random
import re
import pandas as pd
import numpy as np
from copy import deepcopy
from sklearn import model_selection
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
from spacy.symbols import ORTH
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import tree
import matplotlib.pyplot as plt
from spacy.language import Language
from luima_sbd import sbd_utils as luima
import seaborn as sns
import math
import fasttext

# Seed Python's built-in RNG so that code relying on the `random` module
# (e.g. the random.shuffle-based document split further down) is repeatable.
random.seed(42)
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Phase 4.1: Custom fastText embeddings

In [31]:
# Path to the tokenized-sentences text file that serves as the fastText training corpus
tokened_data = '../Data/tokenized_sentences.txt'

In [32]:
# Train an unsupervised fastText model to learn word embeddings from the tokenized corpus.
#   epoch=10    -> number of passes over the training file
#   minCount=20 -> words occurring fewer than 20 times are left out of the vocabulary
# The %time line magic reports how long the training call takes.
%time model = fasttext.train_unsupervised(tokened_data, epoch=10, minCount=20)

CPU times: user 2.32 s, sys: 238 ms, total: 2.56 s
Wall time: 747 ms


In [33]:
# Vocabulary size of the trained model (number of entries returned by get_words)
len(model.get_words(on_unicode_error='ignore'))

39

In [34]:
# Persist the trained embeddings to disk; a later cell reloads this file with fasttext.load_model
model.save_model("../models/word_embedding_model_notebook.bin")

# Phase 4.2: Evaluating custom embeddings manually

In [36]:
# test_words=["veteran","vet","service","cause","caused","remanded","vietnam","see","board","notice","claim", "judge","records","letter"]
def get_word_neighbors(word, model):
    """Look up the nearest neighbours of ``word`` in an embedding model.

    Parameters
    ----------
    word : str
        The query word.
    model
        Any object exposing ``get_nearest_neighbors(word, on_unicode_error=...)``,
        e.g. a trained or loaded fastText model.

    Returns
    -------
    dict
        ``{"test_word": word, "nearest_neighbours": <value returned by the model>}``.
    """
    return {
        "test_word": word,
        "nearest_neighbours": model.get_nearest_neighbors(
            word, on_unicode_error='ignore'
        ),
    }
#     test_word_track.append(app_dict)

In [37]:
# --- Load the curated annotation file (documents, annotation types, annotations) ---
CURATED_ANN_PATH = "../Data/ldsi_w21_curated_annotations_v2.json"
with open(CURATED_ANN_PATH, 'r') as j:
    data = json.load(j)

annotations = data['annotations']

# --- Lookup tables between ids, names and full records ---
documents_by_id = {doc['_id']: doc for doc in data['documents']}
types_by_id = {ann_type['_id']: ann_type for ann_type in data['types']}
type_ids_by_name = {ann_type['name']: ann_type['_id'] for ann_type in data['types']}
type_names_by_id = {ann_type['_id']: ann_type['name'] for ann_type in data['types']}
doc_id_by_name = {doc['name']: doc['_id'] for doc in data['documents']}
doc_name_by_id = {doc['_id']: doc['name'] for doc in data['documents']}

# --- Document ids grouped by case outcome ---
granted_doc_ids = {doc['_id'] for doc in data['documents'] if doc['outcome'] == 'granted'}
denied_doc_ids = {doc['_id'] for doc in data['documents'] if doc['outcome'] == 'denied'}
print(len(granted_doc_ids), len(denied_doc_ids))

# Only documents that carry at least one annotation take part in the split
ids_annotated_docs = {ann['document'] for ann in data['annotations']}
print(len(ids_annotated_docs))

# Sort before shuffling so the starting order does not depend on set iteration order
granted_ids = sorted(granted_doc_ids & ids_annotated_docs)
denied_ids = sorted(denied_doc_ids & ids_annotated_docs)
print(len(granted_ids), len(denied_ids))

# --- Reproducibility ---
# NOTE(review): PYTHONHASHSEED is read at interpreter start-up, so assigning it
# here does not change string hashing in the already-running kernel.
os.environ['PYTHONHASHSEED'] = str(42)
random.seed(42)
np.random.seed(42)

# --- Shuffle, then cut each outcome group into train / validation / test ---
random.shuffle(granted_ids)
random.shuffle(denied_ids)

# Granted documents: cut at 80% and 90% of the group size
granted_train, granted_val, granted_test = np.split(
    granted_ids,
    [int(len(granted_ids) * 0.8), int(len(granted_ids) * 0.9)],
)
# Denied documents: fixed cut positions (first 57 -> train, next 7 -> validation, rest -> test)
denied_train, denied_val, denied_test = np.split(denied_ids, [57, 64])

# Each final split is the granted part followed by the denied part
train_set = np.concatenate((granted_train, denied_train), axis=0)
dev_set = np.concatenate((granted_val, denied_val), axis=0)
test_set = np.concatenate((granted_test, denied_test), axis=0)

print(train_set.shape, dev_set.shape, test_set.shape)

180 180
141
70 71
(113,) (14,) (14,)


In [38]:
# Reuse the train/dev/test document-id splits computed in the previous cell
# (nothing is loaded from disk here).
train_ids, dev_ids, test_ids = train_set, dev_set, test_set
train_ids.shape, dev_ids.shape, test_ids.shape

((113,), (14,), (14,))

In [39]:
# # Loading the train, dev and test ids saved in phase 1
# train_ids, dev_ids, test_ids = np.load('../Data/train.npy'), np.load('../Data/dev.npy'), np.load('../Data/test.npy')
# train_ids.shape, dev_ids.shape, test_ids.shape

In [40]:
# get all sentences assuming every annotation is a sentence
def make_span_data(documents_by_id, types_by_id, annotations, doc_ids):
    """Build one span record per annotation for the requested documents.

    Parameters
    ----------
    documents_by_id : dict
        Maps a document id to a document record with a ``'plainText'`` entry.
    types_by_id : dict
        Maps an annotation-type id to a type record with a ``'name'`` entry.
    annotations : iterable of dict
        Annotation records with ``'document'``, ``'start'``, ``'end'`` and
        ``'type'`` entries. Consumed exactly once.
    doc_ids : iterable
        Ids of the documents whose annotations should be collected.

    Returns
    -------
    list of dict
        One record per matching annotation with the keys ``'txt'``,
        ``'document'``, ``'type'``, ``'start'``, ``'start_normalized'`` and
        ``'end'``. Records are grouped in ``doc_ids`` order; within a document
        they keep the order in which the annotations were given.
    """
    # Index the annotations by document in a single pass instead of rescanning
    # the full annotation list for every document id.
    annotations_by_doc = {}
    for a in annotations:
        annotations_by_doc.setdefault(a['document'], []).append(a)

    span_data = []
    for doc_id in doc_ids:
        for a in annotations_by_doc.get(doc_id, ()):
            start = a['start']
            end = a['end']
            document_txt = documents_by_id[a['document']]['plainText']
            span_data.append({
                'txt': document_txt[start:end],
                'document': a['document'],
                'type': types_by_id[a['type']]['name'],
                'start': start,
                # Start offset as a fraction of the document length
                'start_normalized': start / len(document_txt),
                'end': end,
            })
    return span_data

In [41]:
# Build the annotation spans for the train, dev and test document ids
train_spans, dev_spans, test_spans = (
    make_span_data(documents_by_id, types_by_id, annotations, split_ids)
    for split_ids in (train_ids, dev_ids, test_ids)
)

# Raw text of every span, per split
train_spans_txt = [span['txt'] for span in train_spans]
dev_spans_txt = [span['txt'] for span in dev_spans]
test_spans_txt = [span['txt'] for span in test_spans]

# Annotation-type name of every span, per split
train_spans_labels = np.array([span['type'] for span in train_spans])
test_spans_labels = np.array([span['type'] for span in test_spans])
dev_spans_labels = np.array([span['type'] for span in dev_spans])

# All span texts in train -> dev -> test order
all_spans_txt = train_spans_txt + dev_spans_txt + test_spans_txt

(
    len(train_spans), len(dev_spans), len(test_spans),
    len(train_spans_txt), len(dev_spans_txt), len(test_spans_txt),
)

(12450, 1376, 1523, 12450, 1376, 1523)

In [42]:
def extract_sentences_with_chosen_word(text_spans, words, no_of_sentences_to_extract):
    """Collect the first sentences that contain every word in ``words``.

    Matching is case-insensitive and substring-based: a word matches wherever
    it occurs inside the sentence text, including inside a longer word
    (e.g. ``"table"`` matches ``"unstable"``).

    Parameters
    ----------
    text_spans : iterable of str
        Candidate sentences, scanned in order.
    words : sequence of str
        Words that must all occur in a sentence for it to be selected.
        An empty sequence selects nothing.
    no_of_sentences_to_extract : int
        Scanning stops as soon as this many sentences have been collected.

    Returns
    -------
    list of str
        The matching sentences (unmodified), in their original order.
    """
    # Lower-case the search words once; every sentence is compared lower-cased too,
    # so capitalised occurrences are found for all words.
    needles = [w.lower() for w in words]
    if not needles:
        return []

    sentences = []
    for s in text_spans:
        haystack = s.lower()
        if all(needle in haystack for needle in needles):
            sentences.append(s)
            if len(sentences) == no_of_sentences_to_extract:
                break
    return sentences

In [43]:
# Drop the in-memory model and reload the copy that was saved to disk
del model
model = fasttext.load_model("../models/word_embedding_model_notebook.bin")

# Probe words whose nearest neighbours are inspected
test_words = [
    "veteran", "v.", "argues", "ptsd",
    "granted", "korea", "holding", "also",
]

# One {"test_word", "nearest_neighbours"} record per probe word
word_to_neigbors = [get_word_neighbors(probe, model) for probe in test_words]

word_to_neigbors



[{'test_word': 'veteran',
  'nearest_neighbours': [(0.9999794363975525, 'veterans'),
   (0.9999740719795227, "veteran's"),
   (0.9999135136604309, 'connection'),
   (0.9999091625213623, 'service'),
   (0.9999045133590698, 'record'),
   (0.9998936057090759, 'evidence'),
   (0.9998903274536133, '<NUM1>'),
   (0.9998826384544373, '<NUM2>'),
   (0.9998824596405029, '<NUM4>'),
   (0.9998706579208374, 'the')]},
 {'test_word': 'v.',
  'nearest_neighbours': [(0.0285479836165905, 'ro'),
   (0.02703719772398472, 'as'),
   (0.025357596576213837, '<NUM1>'),
   (0.025125067681074142, '</s>'),
   (0.025109509006142616, 'his'),
   (0.025029828771948814, 'connection'),
   (0.024953637272119522, 'to'),
   (0.024773629382252693, 'not'),
   (0.024593332782387733, 'board'),
   (0.024123193696141243, "veteran's")]},
 {'test_word': 'argues',
  'nearest_neighbours': [(0.01604788564145565, 'v'),
   (0.015904435887932777, 'be'),
   (0.015416345559060574, 'on'),
   (0.01388480607420206, 'in'),
   (0.01351588126

In [44]:
# get_word_neighbors() returns the neighbours of a chosen word as (score, word) pairs;
# only the neighbouring words are printed here.

word_neighbors = get_word_neighbors("mdd", model)
for _score, neighbor in word_neighbors['nearest_neighbours']:
    print(neighbor, end=", ")

he, </s>, ro, va, ptsd, be, the, not, connection, in, 

In [45]:
# Search settings
word_of_interest = "table"
neighbor_to_search = "connection"
# no_of_sentences = len(all_spans_txt)
no_of_sentences = 100

# Alternative: require both the chosen word and a given neighbor
# sentences = extract_sentences_with_chosen_word(train_spans_txt, [word_of_interest, neighbor_to_search], no_of_sentences)

# Here: sentences from all splits that contain the chosen word.
# The lookup is a substring test, so "table" also hits words such as
# "unstable" or "attributable".
sentences = extract_sentences_with_chosen_word(
    all_spans_txt,
    [word_of_interest],
    no_of_sentences,
)

# Print each hit followed by a blank line
for sentence in sentences:
    print(sentence.strip(), end="\n\n")

With a chronic disease shown 
as such in service, so as to permit a finding of service 
connection, subsequent manifestations of the same chronic 
disease at any later date, however remote, are service 
connected, unless clearly attributable to intercurrent 
causes.

In 
his hearing presentation with the undersigned, the Veteran 
complained that his service-connected knee was often unstable 
when he would turn suddenly, and that he had been prescribed 
medication for increase in pain; but, he had not received a 
knee brace or other stabilizing assistive device.

Where a chronic disease under 3.309(a) is "shown as such in service" ("meaning clearly diagnosed beyond legitimate question" Walker, 708 F.3d at 1339) or in the presumptive period so as to permit a finding of service connection, subsequent manifestations of the same chronic disease at any later date, however remote, are service connected, unless clearly attributable to intercurrent causes.

1.	The veteran has been not

In [None]:
# Open one BVA decision (selected by its document id) and print its text,
# so that words can be picked from it
decision_doc = documents_by_id['61aea55c97ad59b4cfc412ac']
bva_decision = decision_doc['plainText'].strip()
print(bva_decision)