In [None]:
import spacy
import json
import random
import re
import pandas as pd
import numpy as np
from copy import deepcopy
from sklearn import model_selection
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
from spacy.symbols import ORTH
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import tree
import matplotlib.pyplot as plt
import random
random.seed(42)

In [None]:
def plot_confusion_matrix(y_true, y_pred, classes,
                          title=None,
                          cmap=plt.cm.Blues):
    """Draw the confusion matrix of ``y_true`` vs ``y_pred`` as an annotated heat map.

    Parameters
    ----------
    y_true, y_pred : label sequences passed straight to ``confusion_matrix``.
    classes : tick labels applied to both axes.
        NOTE(review): presumably expected in the same order as the rows/columns
        produced by ``confusion_matrix`` -- confirm at the call sites.
    title : optional title for the axes.
    cmap : colormap used for the image (default ``plt.cm.Blues``).

    Returns
    -------
    The matplotlib Axes the matrix was drawn on.
    """
    counts = confusion_matrix(y_true, y_pred)
    n_rows, n_cols = counts.shape

    fig, ax = plt.subplots(figsize=(8, 8))
    image = ax.imshow(counts, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(image, ax=ax)

    # One tick per matrix cell, labelled with the entries of `classes`.
    ax.set(xticks=np.arange(n_cols),
           yticks=np.arange(n_rows),
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Write each count into its cell; cells above half the maximum count get
    # white text, the others black.
    midpoint = counts.max() / 2.
    for row_idx, row_counts in enumerate(counts):
        for col_idx, cell in enumerate(row_counts):
            text_color = "white" if cell > midpoint else "black"
            ax.text(col_idx, row_idx, format(cell, 'd'),
                    ha="center", va="center",
                    color=text_color)
    fig.tight_layout()
    return ax

In [None]:
def top_tfidf_features(row, features, top_n=15):
    ''' Get top n tfidf values in row and return them with their corresponding feature names.

    row      -- 1-D array of tfidf values, indexed like `features`
    features -- sequence of feature names
    top_n    -- maximum number of (feature, value) pairs to return

    Returns a DataFrame with the columns 'feature' and 'tfidf', ordered from the
    largest value to the smallest. The frame always has both columns, even when
    no row is selected (e.g. top_n=0 or an empty `row`).
    '''
    # argsort is ascending, so reverse it before taking the first top_n indices.
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    # Naming the columns in the constructor (instead of assigning df.columns
    # afterwards) also works when top_feats is empty.
    return pd.DataFrame(top_feats, columns=['feature', 'tfidf'])


def top_features_in_doc(Xtr, features, row_id, top_n=15):
    ''' Top tfidf features of one document, i.e. of row `row_id` of the matrix `Xtr`.

    The result is whatever top_tfidf_features returns for that row.
    '''
    doc_row = Xtr[row_id]
    # Anything that is not a plain ndarray is densified through its .toarray()
    # method (presumably a scipy sparse row -- confirm against the callers).
    dense_row = doc_row if type(doc_row) is np.ndarray else doc_row.toarray()
    # squeeze drops the leading length-1 axis a densified row carries.
    return top_tfidf_features(np.squeeze(dense_row), features, top_n)


def top_mean_features(Xtr, features, grp_ids=None, min_tfidf=0.1, top_n=25):
    ''' Return the top n features that on average are most important amongst documents in rows
        identified by indices in grp_ids.

    Xtr       -- document/feature matrix: an ndarray, or an object offering .toarray()
    features  -- sequence of feature names, indexed like the columns of Xtr
    grp_ids   -- row selector (list, index array, or the tuple returned by np.where);
                 None or an empty selector means "use every row"
    min_tfidf -- values below this threshold count as 0 when averaging
    top_n     -- number of features to return

    Xtr itself is never modified.
    '''
    # An explicit None/length test replaces `if grp_ids:`. Truthiness of a
    # multi-element ndarray raises, and np.array([0]) would read as "no selection".
    if grp_ids is not None and len(grp_ids) > 0:
        D = Xtr[grp_ids]
    else:
        D = Xtr
    if type(D) is not np.ndarray:
        D = D.toarray()
    # np.where builds a new array. The previous in-place `D[D < min_tfidf] = 0`
    # zeroed the caller's matrix whenever D was Xtr itself (dense input, no grp_ids).
    D = np.where(D < min_tfidf, 0, D)
    tfidf_means = np.mean(D, axis=0)
    return top_tfidf_features(tfidf_means, features, top_n)


def top_features_by_class(Xtr, y, features, min_tfidf=0.1, top_n=25):
    ''' Return a dict mapping each class label to a df that holds the top_n features and their
        mean tfidf value calculated across documents with the same class label.

    Xtr       -- document/feature matrix, one row per entry of y
    y         -- class label of every row (list or array)
    features  -- sequence of feature names
    min_tfidf -- forwarded to top_mean_features
    top_n     -- forwarded to top_mean_features

    Each returned frame additionally carries its class in a `label` attribute.
    '''
    # With a plain Python list, `y == label` is a single bool rather than an
    # element-wise mask, so every class would select zero rows.
    y = np.asarray(y)
    dfs = {}
    labels = np.unique(y)
    for label in labels:
        # np.where returns a 1-tuple of row indices for a 1-D mask.
        ids = np.where(y == label)
        feats_df = top_mean_features(Xtr, features, ids, min_tfidf=min_tfidf, top_n=top_n)
        feats_df.label = label
        dfs[label] = feats_df
    return dfs


def span_top_tfidf(spans_txt, spans_tfidf, features, index):
    ''' Print the text of span `index` followed by its top tfidf features. '''
    span_header = 'span text:\n' + spans_txt[index] + '\n'
    print(span_header)
    top_feats = top_features_in_doc(spans_tfidf, features, index)
    print(top_feats)

In [None]:
corpus_fpath = './ldsi_s2021/ldsi_bva_sentence_corpus_v1.json'
# The context manager closes the file handle as soon as parsing is done; the
# encoding is spelled out so the result does not depend on the platform default.
with open(corpus_fpath, encoding='utf-8') as corpus_file:
    data = json.load(corpus_file)


In [None]:
def _read_id_list(path):
    """Return the non-empty, whitespace-stripped lines of the text file at `path`.

    Blank lines are dropped. Splitting on "\\n" alone leaves a trailing '' entry
    when the file ends with a newline; that entry would be counted as a document
    and could be drawn by random.sample as if it were a real id.
    """
    with open(path, 'r') as id_file:
        stripped_lines = (line.strip() for line in id_file.read().splitlines())
        return [line for line in stripped_lines if line]


affirmed = _read_id_list('./ldsi_s2021/affirmed_ids.txt')
denied = _read_id_list('./ldsi_s2021/denied_ids.txt')
remanded = _read_id_list('./ldsi_s2021/remanded_ids.txt')
print(len(affirmed), len(denied), len(remanded))

In [None]:
annotations = data['annotations']

# Full records, keyed by their '_id'.
documents_by_id = dict((doc['_id'], doc) for doc in data['documents'])
types_by_id = dict((typ['_id'], typ) for typ in data['types'])

# Name <-> id translation tables for annotation types ...
type_ids_by_name = dict((typ['name'], typ['_id']) for typ in data['types'])
type_names_by_id = dict((typ['_id'], typ['name']) for typ in data['types'])

# ... and for documents.
doc_id_by_name = dict((doc['name'], doc['_id']) for doc in data['documents'])
doc_name_by_id = dict((doc['_id'], doc['name']) for doc in data['documents'])

In [None]:
# Distribution of document sizes, measured as the length of each 'plainText'.
doc_lengths = [len(d['plainText']) for d in documents_by_id.values()]
plt.hist(doc_lengths, bins=50)
# Label the figure so it can be read on its own when the notebook is skimmed.
plt.xlabel('Document length (len of plainText)')
plt.ylabel('Number of documents')
plt.title('Distribution of document lengths')
plt.show()

In [None]:
# get all sentences assuming every annotation is a sentence
def make_span_data(documents_by_id, types_by_id, annotations,
                   affirmed_ids=None, denied_ids=None, remanded_ids=None):
    """Build one record per annotation, combining span text, type and case decision.

    Parameters
    ----------
    documents_by_id : dict mapping a document id to a record with 'plainText' and 'name'.
    types_by_id : dict mapping a type id to a record with 'name'.
    annotations : iterable of records with 'start', 'end', 'document' and 'type'.
    affirmed_ids, denied_ids, remanded_ids : optional collections of document names
        per decision. When left as None, the module-level lists `affirmed`,
        `denied` and `remanded` are used, as before.

    Returns
    -------
    list of dicts with the keys 'txt', 'document', 'type', 'start',
    'start_normalized', 'end', 'name' and 'decisions'. 'decisions' is 'affirmed',
    'denied' or 'remanded' (checked in that order), or None when the document
    name is in none of the three collections.
    """
    # Sets make the per-annotation membership tests O(1) instead of list scans.
    affirmed_names = set(affirmed if affirmed_ids is None else affirmed_ids)
    denied_names = set(denied if denied_ids is None else denied_ids)
    remanded_names = set(remanded if remanded_ids is None else remanded_ids)

    span_data = []
    for a in annotations:
        start = a['start']
        end = a['end']
        document = documents_by_id[a['document']]
        document_txt = document['plainText']
        document_name = document['name']

        # Decide afresh for every annotation. Without the final `else`, an
        # unlisted document either raised UnboundLocalError (first annotation)
        # or silently inherited the previous annotation's decision.
        if document_name in affirmed_names:
            decision = 'affirmed'
        elif document_name in denied_names:
            decision = 'denied'
        elif document_name in remanded_names:
            decision = 'remanded'
        else:
            decision = None

        sd = {'txt': document_txt[start:end],
              'document': a['document'],
              'type': types_by_id[a['type']]['name'],
              'start': start,
              # start offset as a fraction of the document's total length
              'start_normalized': start / len(document_txt),
              'end': end,
              'name': document_name,
              'decisions': decision}
        span_data.append(sd)
    return span_data

# One record per annotation, plus parallel lists of each record's type and decision.
spans = make_span_data(documents_by_id, types_by_id, annotations)
span_labels = [span_record['type'] for span_record in spans]
span_decisions = [span_record['decisions'] for span_record in spans]

In [None]:
# Re-seed so the draw below is the same on every run of this cell.
random.seed(42)

# Draw six documents per decision class ...
aff = random.sample(affirmed, 6)
den = random.sample(denied, 6)
rem = random.sample(remanded, 6)

# ... and give the first three of each draw to test, the last three to dev.
test_affirm, dev_affirm = aff[:3], aff[3:]
test_denied, dev_denied = den[:3], den[3:]
test_remanded, dev_remanded = rem[:3], rem[3:]


In [None]:
# Merge the per-decision picks into one list per split.
test_ids = [*test_affirm, *test_denied, *test_remanded]
dev_ids = [*dev_affirm, *dev_denied, *dev_remanded]

In [None]:
# Route every span to the split its document belongs to; documents that are in
# neither the test nor the dev list go to train.
test_spans, dev_spans, train_spans = [], [], []
for s in spans:
    (test_spans if s['name'] in test_ids
     else dev_spans if s['name'] in dev_ids
     else train_spans).append(s)

In [None]:
# Raw span text of each split, in the same order as the span records.
train_spans_txt = [span_record['txt'] for span_record in train_spans]
test_spans_txt = [span_record['txt'] for span_record in test_spans]
dev_spans_txt = [span_record['txt'] for span_record in dev_spans]


In [None]:
# Distinct document names that occur among the training spans.
train_spans_df = pd.DataFrame(train_spans)
unique_files = train_spans_df['name'].unique()

In [None]:
# Evaluate spaCy's sentence segmentation against the annotated spans of every
# training document, per file and over all files together.

BOUNDARY_TOLERANCE = 3  # a predicted boundary may be off by this many characters


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def _precision_recall_f1(tp, fp, fn):
    """Return (precision, recall, F1) for the given counts without dividing by zero."""
    precision = _safe_ratio(tp, tp + fp)
    recall = _safe_ratio(tp, tp + fn)
    f1 = _safe_ratio(2 * precision * recall, precision + recall)
    return precision, recall, f1


def _count_matched_sentences(predicted_bounds, true_bounds, tolerance=BOUNDARY_TOLERANCE):
    """Count predicted (start, end) pairs that line up with an annotated pair.

    A prediction matches an annotation when its start and its end both lie within
    `tolerance` characters of the annotated offsets and it starts before the
    annotation ends. Every prediction and every annotation is used at most once,
    so the count never exceeds the length of either list and the derived
    false-positive / false-negative counts cannot become negative.
    """
    used_annotations = set()
    matched = 0
    for start, end in predicted_bounds:
        for idx, (ann_start, ann_end) in enumerate(true_bounds):
            if idx in used_annotations or start >= ann_end:
                continue
            if abs(start - ann_start) <= tolerance and abs(end - ann_end) <= tolerance:
                used_annotations.add(idx)
                matched += 1
                break  # one prediction counts as at most one true positive
    return matched


tot_tp = 0
tot_fp = 0
tot_fn = 0
result = []

# Loading the pipeline is expensive and identical for every file, so do it once.
nlp = spacy.load("en_core_web_sm")

for file in unique_files:
    print(file)

    # Annotated (start, end) character offsets of this document.
    true_bounds = [(span['start'], span['end']) for span in spans if span['name'] == file]
    tot_sent = len(true_bounds)

    # First document record carrying this name.
    doc_record = next(d for d in data['documents'] if d['name'] == file)
    scrap = nlp(doc_record['plainText'])

    assert scrap.has_annotation("SENT_START")

    # Character offsets of the sentences spaCy predicted.
    predicted_bounds = [(sent.start_char, sent.end_char) for sent in scrap.sents]
    count = len(predicted_bounds)

    tp_count = _count_matched_sentences(predicted_bounds, true_bounds)
    fp_count = count - tp_count       # predicted sentences without an annotation
    fn_count = tot_sent - tp_count    # annotations without a predicted sentence
    print(f"For File:{file}\n True Positive:{tp_count}\n False Positive:{fp_count}\n False Negative:{fn_count}\n")

    doc_prec, doc_recall, doc_f1 = _precision_recall_f1(tp_count, fp_count, fn_count)

    print(f"For File: {file} Precision: {doc_prec} Recall: {doc_recall} F1 Score: {doc_f1}\n")

    diction = {
        "File": file,
        "Precision": doc_prec,
        "Recall": doc_recall,
        "F1_Score": doc_f1
    }
    result.append(diction)

    tot_tp = tot_tp + tp_count
    tot_fp = tot_fp + fp_count
    tot_fn = tot_fn + fn_count

print(f"Total Stats \n True Positive:{tot_tp}\n False Positive:{tot_fp}\n False Negative:{tot_fn}\n")


# Metrics over the summed counts of all files.
prec, recall, f1_score = _precision_recall_f1(tot_tp, tot_fp, tot_fn)
print(f"Precision: {prec}\n Recall: {recall}\n F1 Score: {f1_score}")


In [None]:
# Order the per-file metrics by ascending precision and show the three lowest.
result = sorted(result, key=lambda file_metrics: file_metrics['Precision'])
result[:3]