In [None]:
import spacy
import json
import random
import re
import pandas as pd
import numpy as np
from copy import deepcopy
from sklearn import model_selection
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
from spacy.symbols import ORTH
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import tree
import matplotlib.pyplot as plt
import plotly.express as px
import random
import sys 
import math
import os
sys.path.append('./luima_sbd')
import luima_sbd.sbd_utils as luima
from spacy.language import Language
random.seed(42)

In [None]:
def plot_confusion_matrix(y_true, y_pred, classes,
                          title=None,
                          cmap=plt.cm.Blues):
    """Render the confusion matrix of `y_true` vs. `y_pred` as an annotated heat map.

    Parameters
    ----------
    y_true, y_pred
        Label sequences handed unchanged to `confusion_matrix`.
    classes
        Tick labels applied to both the x and the y axis.
        NOTE(review): `confusion_matrix` is called without `labels`, so
        `classes` must already be in the row/column order that call
        produces -- confirm at the call sites.
    title
        Optional axes title.
    cmap
        Colormap used for the heat map.

    Returns
    -------
    The matplotlib axes the matrix was drawn on.
    """
    matrix = confusion_matrix(y_true, y_pred)
    n_rows, n_cols = matrix.shape

    fig, ax = plt.subplots(figsize=(8, 8))
    heatmap = ax.imshow(matrix, interpolation='nearest', cmap=cmap)
    fig.colorbar(heatmap, ax=ax)

    # One tick per matrix column/row, labelled with the entries of `classes`.
    ax.set(xticks=np.arange(n_cols),
           yticks=np.arange(n_rows),
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Write each count into its cell; cells above half the maximum count get
    # white text, the rest black.
    half_max = matrix.max() / 2.
    for row, col in np.ndindex(n_rows, n_cols):
        count = matrix[row, col]
        text_color = "white" if count > half_max else "black"
        ax.text(col, row, format(count, 'd'),
                ha="center", va="center",
                color=text_color)

    fig.tight_layout()
    return ax

In [None]:
corpus_fpath = './ldsi_s2021/ldsi_bva_sentence_corpus_v1.json'
# Context managers close the file handles deterministically instead of leaving
# them open until garbage collection.
with open(corpus_fpath) as corpus_file:
    data = json.load(corpus_file)


def _read_id_list(path):
    """Return the contents of the text file at `path`, split on newlines."""
    with open(path, 'r') as id_file:
        # NOTE(review): if the file ends with a newline this yields a trailing
        # '' entry; kept as-is so the seeded sampling further down is unchanged.
        return id_file.read().split("\n")


# Document names grouped by decision outcome.
affirmed = _read_id_list('./ldsi_s2021/affirmed_ids.txt')
denied = _read_id_list('./ldsi_s2021/denied_ids.txt')
remanded = _read_id_list('./ldsi_s2021/remanded_ids.txt')

# Lookup tables over the three top-level collections of the corpus JSON.
annotations = data['annotations']
documents_by_id = {d['_id']: d for d in data['documents']}
types_by_id = {t['_id']: t for t in data['types']}
type_ids_by_name = {t['name']: t['_id'] for t in data['types']}
type_names_by_id = {t['_id']: t['name'] for t in data['types']}
doc_id_by_name = {d['name']: d['_id'] for d in data['documents']}
doc_name_by_id = {d['_id']: d['name'] for d in data['documents']}

def make_span_data(documents_by_id, types_by_id, annotations, decision_ids=None):
    """Flatten annotations into one record per annotated span.

    Parameters
    ----------
    documents_by_id : dict
        Maps a document id to a document dict with 'plainText' and 'name'.
    types_by_id : dict
        Maps a type id to a type dict with 'name'.
    annotations : iterable of dict
        Each annotation has 'document', 'type', 'start' and 'end'.
    decision_ids : dict, optional
        Maps a decision label to a collection of document names. When a name
        appears under several labels the first label in iteration order wins.
        Defaults to the notebook-level `affirmed`, `denied` and `remanded`
        lists, checked in that order.

    Returns
    -------
    list of dict
        One record per annotation with keys 'txt', 'document', 'type',
        'start', 'start_normalized', 'end', 'name' and 'decisions'.
        'decisions' is None when the document name is in none of the lists.
    """
    if decision_ids is None:
        decision_ids = {'affirmed': affirmed,
                        'denied': denied,
                        'remanded': remanded}

    # Build the name -> decision table once so each annotation needs a single
    # lookup; setdefault keeps the first label seen for a name.
    decision_by_name = {}
    for label, names in decision_ids.items():
        for name in names:
            decision_by_name.setdefault(name, label)

    span_data = []
    for a in annotations:
        document = documents_by_id[a['document']]
        document_txt = document['plainText']
        document_name = document['name']
        start = a['start']
        end = a['end']
        span_data.append({
            'txt': document_txt[start:end],
            'document': a['document'],
            'type': types_by_id[a['type']]['name'],
            'start': start,
            # An empty document has no length to normalise by; report 0.0
            # instead of dividing by zero.
            'start_normalized': start / len(document_txt) if document_txt else 0.0,
            'end': end,
            'name': document_name,
            # Looked up per annotation, so a document missing from every list
            # gets None rather than the previous annotation's decision.
            'decisions': decision_by_name.get(document_name),
        })
    return span_data

# Turn every annotation into a span record, then pull the annotation type and
# the document decision of each record into their own parallel lists.
spans = make_span_data(
    documents_by_id=documents_by_id,
    types_by_id=types_by_id,
    annotations=annotations,
)
span_labels = [span['type'] for span in spans]
span_decisions = [span['decisions'] for span in spans]

In [None]:
# Re-seed so the sampled document names are the same on every run of this cell.
random.seed(42)

# Draw six document names per decision class: the first three go to the test
# set, the last three to the dev set.
aff = random.sample(affirmed, 6)
den = random.sample(denied, 6)
rem = random.sample(remanded, 6)
test_affirm, dev_affirm = aff[:3], aff[3:]
test_denied, dev_denied = den[:3], den[3:]
test_remanded, dev_remanded = rem[:3], rem[3:]

test_ids = test_affirm + test_denied + test_remanded
dev_ids = dev_affirm + dev_denied + dev_remanded

# Route each span by its document name: test first, then dev, everything else
# is training data.
test_spans, dev_spans, train_spans = [], [], []
for s in spans:
    if s['name'] in test_ids:
        bucket = test_spans
    elif s['name'] in dev_ids:
        bucket = dev_spans
    else:
        bucket = train_spans
    bucket.append(s)

# Distinct document names that ended up in the training split.
train_frame = pd.DataFrame(train_spans)
unique_files = train_frame.name.unique()

In [None]:
# os.listdir returns entries in arbitrary order; sort them so the files are
# processed in the same order on every run and every machine.
file_list = sorted(os.listdir("./ldsi_s2021/unlabeled/unlabeled/"))

In [None]:
# Counters and result list that are not used anywhere in this cell; presumably
# left over from a per-file precision/recall evaluation -- TODO confirm they
# can be dropped.
tot_tp = 0
tot_fp = 0
tot_fn = 0
result = []

# One record per file in `file_list`: the file name, its sentence count and
# the sentences themselves.
result_dict = []
for file in file_list:
    # The context manager closes each file as soon as it has been read rather
    # than leaking one open handle per document.
    with open(f'./ldsi_s2021/unlabeled/unlabeled/{file}', 'r') as doc_file:
        doc = doc_file.read()

    # Segment once. The former second call with offsets=True only filled an
    # `index` variable that was never read, repeating the segmentation work.
    sentences = luima.text2sentences(doc, offsets=False)
    count_dict = {
        "file_name": file,
        "sent_count": len(sentences),
        "sentences": sentences
    }
    result_dict.append(count_dict)
    

In [None]:
# Collect the per-file records accumulated in `result_dict` into a DataFrame.
df = pd.DataFrame(result_dict)

In [None]:
# On-disk cache of the segmented corpus. When the cache exists it replaces the
# freshly built `df`; otherwise the freshly built `df` is written out so the
# cell also works on a checkout that does not have the pickle yet.
SENTENCE_CACHE_PATH = "./sentence_segmented_dict.pkl"
if os.path.exists(SENTENCE_CACHE_PATH):
    # NOTE: unpickling can execute arbitrary code -- only load a cache file
    # that was produced locally by this notebook.
    df = pd.read_pickle(SENTENCE_CACHE_PATH)
else:
    df.to_pickle(SENTENCE_CACHE_PATH)

In [None]:
# Sum the `sent_count` column of `df`, timed with the IPython `%time` line magic.
%time df.sent_count.sum()

In [None]:
# Bare expression as the cell's last line: the notebook displays `df`.
df

In [None]:
# Histogram of the `sent_count` column of `df`.
bin_width = 0.5

# Number of bins = spread of the sentence counts divided by the bin width,
# rounded up with math.ceil.
sent_counts = df["sent_count"]
count_spread = sent_counts.max() - sent_counts.min()
nbins = math.ceil(count_spread / bin_width)
print(nbins)

# Replaces the default axis label (the column name) with a readable one.
axis_labels = {"sent_count": "Number of Sentences"}
fig = px.histogram(
    df,
    x="sent_count",
    nbins=nbins,
    width=1200,
    height=600,
    labels=axis_labels,
    template="plotly_white",
)
fig.update_yaxes(showgrid=True)
fig.update_layout(
    title_text="Histogram of Number of Sentences in Each Document",
    title_x=0.5,
    font_size=18,
)
fig.show()