In [3]:
import pandas as pd
import json
import numpy as np

In [4]:
# Location of the TRDataChallenge2023 file (the path looks like a Colab
# Google Drive mount). Kept in one named constant so the notebook can be
# pointed at another copy of the file without editing the loading call.
DATA_PATH = '/content/drive/MyDrive/Colab Notebooks/TRDataChallenge2023.txt'

# lines=True: the file is parsed as one JSON object per line.
df = pd.read_json(DATA_PATH, lines=True)

In [5]:
df

Unnamed: 0,documentId,postures,sections
0,Ib4e590e0a55f11e8a5d58a2c8dcb28b5,[On Appeal],"[{'headtext': '', 'paragraphs': ['Plaintiff Dw..."
1,Ib06ab4d056a011e98c7a8e995225dbf9,"[Appellate Review, Sentencing or Penalty Phase...","[{'headtext': '', 'paragraphs': ['After pleadi..."
2,Iaa3e3390b93111e9ba33b03ae9101fb2,"[Motion to Compel Arbitration, On Appeal]","[{'headtext': '', 'paragraphs': ['Frederick Gr..."
3,I0d4dffc381b711e280719c3f0e80bdd0,"[On Appeal, Review of Administrative Decision]","[{'headtext': '', 'paragraphs': ['Appeal from ..."
4,I82c7ef10d6d111e8aec5b23c3317c9c0,[On Appeal],"[{'headtext': '', 'paragraphs': ['Order, Supre..."
...,...,...,...
17995,Ia5743cf0e4b611e99e94fcbef715f24d,[Appellate Review],"[{'headtext': '', 'paragraphs': ['¶1 On Februa..."
17996,I974c18f08f1611e998e8870e22e55653,[Objection to Proof of Claim],[{'headtext': 'ORDER OVERRULING DEBTOR'S OBJEC...
17997,Idaaa92f0886f11e998e8870e22e55653,"[Appellate Review, Trial or Guilt Phase Motion...","[{'headtext': '', 'paragraphs': ['A jury convi..."
17998,I247a8420677e11e9a072efd81f5238d6,"[Appellate Review, Jury Selection Challenge or...","[{'headtext': '', 'paragraphs': ['Defendant Ch..."


In [6]:
def concatenate_from_list_of_dicts(row):
    """Flatten one document's sections into a single space-separated string.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide ``row["sections"]``: an iterable of dicts, each with a
        ``"headtext"`` string and a ``"paragraphs"`` list of strings.

    Returns
    -------
    str
        Each section's header followed by its paragraphs, in document order,
        joined by single spaces. Empty or whitespace-only fragments (such as
        the ``''`` header of an untitled section) are skipped, so the result
        never starts with a space or contains doubled separator spaces.

    Raises
    ------
    KeyError
        If a section dict lacks ``"headtext"`` or ``"paragraphs"``.
    """
    fragments = []

    for section in row["sections"]:
        # Header first, then that section's paragraphs, preserving order
        fragments.append(section["headtext"])
        fragments.extend(section["paragraphs"])

    # Joining a blank header would inject a stray separator (a leading space
    # for the first section, a double space elsewhere), so drop blanks first.
    return " ".join(fragment for fragment in fragments if fragment.strip())

df["concatenated_text"] = df.apply(concatenate_from_list_of_dicts, axis=1)

In [7]:
# This example easily shows the header is being concatenated as well as the paragraph since it starts with the header
df["concatenated_text"].iloc[17996]

"ORDER OVERRULING DEBTOR'S OBJECTION TO CLAIMS On April 17, 2019 the Court held a hearing on Debtor's Renewed Objections to Proofs of Claims of LVNV Funding, LLC (the “Objection”) (doc. 191) in the above captioned case. Appearances were made on the record. Briefly summarized, the Objection poses the question whether three proofs of claim that were filed within the deadline that unquestionably pertained when the case was converted from one under Chapter 7 to one under Chapter 13, and which was confirmed by an official “Notice” sent out by the Clerk's Office, were nonetheless untimely because an intervening change in the pertinent Federal Rule of Bankruptcy Procedure, which shortened by almost two months the deadline for filing claims, should be applied retroactively. Stated slightly differently, the Court must decide whether it would be “just and practicable” to apply the change in the Rule to this already pending case. At the conclusion of the hearing, the Court took the matter under s

# There are 18,000 unique document IDs.

In [8]:
df.documentId.nunique()

18000

# There are 27,659 postures in total. Most documents have one or two postures.

In [9]:
# Number of posture labels attached to each document. Series.map(len) works on
# the 'postures' column directly instead of materialising a row object per
# document the way df.apply(..., axis=1) does.
df['len_postures'] = df['postures'].map(len)
# Total number of label assignments across all documents
df['len_postures'].sum()

27659

In [10]:
df['len_postures'].value_counts()

Unnamed: 0_level_0,count
len_postures,Unnamed: 1_level_1
1,8118
2,7604
3,1129
0,923
4,190
5,32
7,2
6,2


# There are 224 unique class labels.

In [11]:
# Distinct posture labels across all documents.
# explode() emits one row per label; documents with an empty label list yield
# NaN, which dropna() removes. This avoids Series.sum() on lists, which
# repeatedly concatenates lists and scales quadratically with the corpus.
set(df['postures'].explode().dropna())

{'Appellate Review',
 "Application for Attorneys' or Professional Fees and Expenses",
 'Application for Bankruptcy Trustee Fees',
 'Application to Employ Attorney or Other Professional',
 'Application to Vacate Arbitration Award',
 'Bail or Custody Motion',
 'Certified Question',
 'Declinatory Exception of Improper Venue',
 'Declinatory Exception of Insufficiency of Service of Process',
 'Declinatory Exception of Lack of Personal Jurisdiction',
 'Declinatory Exception of Lack of Subject Matter Jurisdiction',
 'Declinatory Exception of Lis Pendens',
 'Dilatory Exception of Unauthorized Use of Summary Proceeding',
 'Joinder',
 'Jury Selection Challenge or Motion',
 'Juvenile Delinquency Proceeding',
 'Juvenile Wardship Petition',
 'Motion Authorizing and Approving Payment of Certain Prepetition Obligations',
 'Motion For Turnover',
 'Motion for Abandonment of Property',
 'Motion for Abatement',
 'Motion for Additional Discovery',
 'Motion for Additur',
 'Motion for Adequate Protection',


In [12]:
# Number of distinct posture labels. nunique() ignores the NaN rows that
# explode() produces for documents with an empty label list, and avoids the
# quadratic list concatenation of Series.sum().
df['postures'].explode().nunique()

224

# Let's multi-hot encode them (one binary column per label; a document can have several 1s)

In [13]:
from sklearn.preprocessing import MultiLabelBinarizer

# Learn the label vocabulary from the posture lists, then encode every
# document as a 0/1 indicator row with one column per learned label.
# `mlb` is kept so the column -> label mapping stays available afterwards.
mlb = MultiLabelBinarizer()
mlb.fit(df['postures'])
binary_labels = mlb.transform(df['postures'])
binary_labels

array([[0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 1],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [14]:
binary_labels.shape

(18000, 224)

In [15]:
# Store each document's indicator row as a plain Python list in its own cell
# (one list per row of `binary_labels`, in the same order as `df`).
df['ohe_postures'] = list(map(list, binary_labels))

In [16]:
df['ohe_postures']

Unnamed: 0,ohe_postures
0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...
17995,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
17996,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
17997,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
17998,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."


In [17]:
df.loc[17997]['postures']

['Appellate Review', 'Trial or Guilt Phase Motion or Objection']

In [18]:
#This list contains two 1s, the first one and one near the end (alphabetical), which is what we want.
print(*df.loc[17997]['ohe_postures'])

1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1


In [19]:
df.loc[17998]['postures']

['Appellate Review',
 'Jury Selection Challenge or Motion',
 'Post-Trial Hearing Motion',
 'Sentencing or Penalty Phase Motion or Objection']

In [20]:
# This list contains four 1s, the first one, one near the beginning and two near
# the end. Looks good. (I also manually checked these were the right ones but
# for the purposes of this notebook this should be enough.)
print(*df.loc[17998]['ohe_postures'])

1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0


In [21]:
df

Unnamed: 0,documentId,postures,sections,concatenated_text,len_postures,ohe_postures
0,Ib4e590e0a55f11e8a5d58a2c8dcb28b5,[On Appeal],"[{'headtext': '', 'paragraphs': ['Plaintiff Dw...",Plaintiff Dwight Watson (“Husband”) appeals f...,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,Ib06ab4d056a011e98c7a8e995225dbf9,"[Appellate Review, Sentencing or Penalty Phase...","[{'headtext': '', 'paragraphs': ['After pleadi...","After pleading guilty, William Jerome Howard,...",2,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,Iaa3e3390b93111e9ba33b03ae9101fb2,"[Motion to Compel Arbitration, On Appeal]","[{'headtext': '', 'paragraphs': ['Frederick Gr...","Frederick Greene, the plaintiff below, deriva...",2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,I0d4dffc381b711e280719c3f0e80bdd0,"[On Appeal, Review of Administrative Decision]","[{'headtext': '', 'paragraphs': ['Appeal from ...",Appeal from an amended judgment of the Suprem...,2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,I82c7ef10d6d111e8aec5b23c3317c9c0,[On Appeal],"[{'headtext': '', 'paragraphs': ['Order, Supre...","Order, Supreme Court, New York County (Arthur...",1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...
17995,Ia5743cf0e4b611e99e94fcbef715f24d,[Appellate Review],"[{'headtext': '', 'paragraphs': ['¶1 On Februa...","¶1 On February 5, 2017, a jury in the Fifth J...",1,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
17996,I974c18f08f1611e998e8870e22e55653,[Objection to Proof of Claim],[{'headtext': 'ORDER OVERRULING DEBTOR'S OBJEC...,ORDER OVERRULING DEBTOR'S OBJECTION TO CLAIMS ...,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
17997,Idaaa92f0886f11e998e8870e22e55653,"[Appellate Review, Trial or Guilt Phase Motion...","[{'headtext': '', 'paragraphs': ['A jury convi...",A jury convicted Antonio Avila Medrano of Con...,2,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
17998,I247a8420677e11e9a072efd81f5238d6,"[Appellate Review, Jury Selection Challenge or...","[{'headtext': '', 'paragraphs': ['Defendant Ch...","Defendant Charles York Walker, Jr., appeals f...",4,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."


In [22]:
df['sections'][0]

[{'headtext': '',
  'paragraphs': ['Plaintiff Dwight Watson (“Husband”) appeals from the trial court’s equitable distribution order entered 28 February 2017. On appeal, plaintiff contends that the trial court erred in its classification, valuation, and distribution of the parties’ property and in granting defendant Gertha\u2009 Watson (“Wife”) an unequal distribution of martial property. Because the trial court’s findings of fact do not support its conclusions of law and because the distributional factors found by the trial court are based upon some of those erroneous findings and conclusions, we reverse the equitable distribution order and remand for entry of a new equitable distribution order.']},
 {'headtext': 'Background',
  'paragraphs': ['Husband and Wife were married in November 1989. Although the trial court’s equitable distribution order found the date of separation as October 2007, the parties stipulated in the final pretrial order to a date of separation of October 2009. Hus

# Let's get a better look at what's in the sections.

In [23]:
# We'll explode the sections to put one section on each row, replicating the
# other columns in the original row when more new rows are produced from an
# original one.
df = df.explode('sections').reset_index(drop=True)

In [24]:
df

Unnamed: 0,documentId,postures,sections,concatenated_text,len_postures,ohe_postures
0,Ib4e590e0a55f11e8a5d58a2c8dcb28b5,[On Appeal],"{'headtext': '', 'paragraphs': ['Plaintiff Dwi...",Plaintiff Dwight Watson (“Husband”) appeals f...,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,Ib4e590e0a55f11e8a5d58a2c8dcb28b5,[On Appeal],"{'headtext': 'Background', 'paragraphs': ['Hus...",Plaintiff Dwight Watson (“Husband”) appeals f...,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,Ib4e590e0a55f11e8a5d58a2c8dcb28b5,[On Appeal],"{'headtext': 'Analysis', 'paragraphs': ['Husba...",Plaintiff Dwight Watson (“Husband”) appeals f...,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,Ib4e590e0a55f11e8a5d58a2c8dcb28b5,[On Appeal],"{'headtext': 'I. Classification issues', 'para...",Plaintiff Dwight Watson (“Husband”) appeals f...,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,Ib4e590e0a55f11e8a5d58a2c8dcb28b5,[On Appeal],"{'headtext': 'A. Cadillac El Dorado', 'paragra...",Plaintiff Dwight Watson (“Husband”) appeals f...,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...
91559,Id5f8b500e68311e78c5db03c58f2bc1d,[On Appeal],"{'headtext': 'II', 'paragraphs': ['We begin wi...","A parent has a fundamental right, protected b...",1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
91560,Id5f8b500e68311e78c5db03c58f2bc1d,[On Appeal],"{'headtext': 'A', 'paragraphs': ['The next que...","A parent has a fundamental right, protected b...",1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
91561,Id5f8b500e68311e78c5db03c58f2bc1d,[On Appeal],"{'headtext': 'B', 'paragraphs': ['Naturally, A...","A parent has a fundamental right, protected b...",1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
91562,Id5f8b500e68311e78c5db03c58f2bc1d,[On Appeal],"{'headtext': 'C', 'paragraphs': ['Balancing is...","A parent has a fundamental right, protected b...",1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [25]:
#This document had 16 sections.
df[df['documentId']=='Ib4e590e0a55f11e8a5d58a2c8dcb28b5']

Unnamed: 0,documentId,postures,sections,concatenated_text,len_postures,ohe_postures
0,Ib4e590e0a55f11e8a5d58a2c8dcb28b5,[On Appeal],"{'headtext': '', 'paragraphs': ['Plaintiff Dwi...",Plaintiff Dwight Watson (“Husband”) appeals f...,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,Ib4e590e0a55f11e8a5d58a2c8dcb28b5,[On Appeal],"{'headtext': 'Background', 'paragraphs': ['Hus...",Plaintiff Dwight Watson (“Husband”) appeals f...,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,Ib4e590e0a55f11e8a5d58a2c8dcb28b5,[On Appeal],"{'headtext': 'Analysis', 'paragraphs': ['Husba...",Plaintiff Dwight Watson (“Husband”) appeals f...,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,Ib4e590e0a55f11e8a5d58a2c8dcb28b5,[On Appeal],"{'headtext': 'I. Classification issues', 'para...",Plaintiff Dwight Watson (“Husband”) appeals f...,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,Ib4e590e0a55f11e8a5d58a2c8dcb28b5,[On Appeal],"{'headtext': 'A. Cadillac El Dorado', 'paragra...",Plaintiff Dwight Watson (“Husband”) appeals f...,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
5,Ib4e590e0a55f11e8a5d58a2c8dcb28b5,[On Appeal],"{'headtext': 'B. Valuation of home equity, HEL...",Plaintiff Dwight Watson (“Husband”) appeals f...,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
6,Ib4e590e0a55f11e8a5d58a2c8dcb28b5,[On Appeal],{'headtext': 'III. Unequal Distribution of the...,Plaintiff Dwight Watson (“Husband”) appeals f...,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
7,Ib4e590e0a55f11e8a5d58a2c8dcb28b5,[On Appeal],"{'headtext': 'IV. Conclusion', 'paragraphs': [...",Plaintiff Dwight Watson (“Husband”) appeals f...,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


A review of the .txt file in Notepad++ reveals that the word `headtext` occurs 91,564 times, so there is exactly one header per list of paragraphs, whether that header is an empty string or not.

In [26]:
# Pull each section's header out of its dict into its own column.
# Mapping over the 'sections' column avoids building a full row object per
# record, which df.apply(..., axis=1) does.
df['headtext'] = df['sections'].map(lambda section: section['headtext'])

In [27]:
# Pull each section's list of paragraphs out of its dict into its own column.
# Mapping over the 'sections' column avoids building a full row object per
# record, which df.apply(..., axis=1) does.
df['paragraphs'] = df['sections'].map(lambda section: section['paragraphs'])

In [28]:
df

Unnamed: 0,documentId,postures,sections,concatenated_text,len_postures,ohe_postures,headtext,paragraphs
0,Ib4e590e0a55f11e8a5d58a2c8dcb28b5,[On Appeal],"{'headtext': '', 'paragraphs': ['Plaintiff Dwi...",Plaintiff Dwight Watson (“Husband”) appeals f...,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",,[Plaintiff Dwight Watson (“Husband”) appeals f...
1,Ib4e590e0a55f11e8a5d58a2c8dcb28b5,[On Appeal],"{'headtext': 'Background', 'paragraphs': ['Hus...",Plaintiff Dwight Watson (“Husband”) appeals f...,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Background,[Husband and Wife were married in November 198...
2,Ib4e590e0a55f11e8a5d58a2c8dcb28b5,[On Appeal],"{'headtext': 'Analysis', 'paragraphs': ['Husba...",Plaintiff Dwight Watson (“Husband”) appeals f...,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Analysis,[Husband argues that the trial court erred in ...
3,Ib4e590e0a55f11e8a5d58a2c8dcb28b5,[On Appeal],"{'headtext': 'I. Classification issues', 'para...",Plaintiff Dwight Watson (“Husband”) appeals f...,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",I. Classification issues,[Although Husband does not clearly identify an...
4,Ib4e590e0a55f11e8a5d58a2c8dcb28b5,[On Appeal],"{'headtext': 'A. Cadillac El Dorado', 'paragra...",Plaintiff Dwight Watson (“Husband”) appeals f...,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",A. Cadillac El Dorado,[Husband contends that the trial court’s findi...
...,...,...,...,...,...,...,...,...
91559,Id5f8b500e68311e78c5db03c58f2bc1d,[On Appeal],"{'headtext': 'II', 'paragraphs': ['We begin wi...","A parent has a fundamental right, protected b...",1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",II,[We begin with some straightforward observatio...
91560,Id5f8b500e68311e78c5db03c58f2bc1d,[On Appeal],"{'headtext': 'A', 'paragraphs': ['The next que...","A parent has a fundamental right, protected b...",1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",A,"[The next question is whether Davis, who was s..."
91561,Id5f8b500e68311e78c5db03c58f2bc1d,[On Appeal],"{'headtext': 'B', 'paragraphs': ['Naturally, A...","A parent has a fundamental right, protected b...",1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",B,"[Naturally, ANCRA is not the final word for Se..."
91562,Id5f8b500e68311e78c5db03c58f2bc1d,[On Appeal],"{'headtext': 'C', 'paragraphs': ['Balancing is...","A parent has a fundamental right, protected b...",1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",C,"[Balancing is notoriously difficult, when (as ..."


In [29]:
# If you explode the df to have one row per paragraph there are 542,915 rows.
df = df.explode('paragraphs').reset_index(drop=True)
len(df)

542915

In [30]:
#This document has 44 paragraphs.
df[df['documentId']=='Ib4e590e0a55f11e8a5d58a2c8dcb28b5']

Unnamed: 0,documentId,postures,sections,concatenated_text,len_postures,ohe_postures,headtext,paragraphs
0,Ib4e590e0a55f11e8a5d58a2c8dcb28b5,[On Appeal],"{'headtext': '', 'paragraphs': ['Plaintiff Dwi...",Plaintiff Dwight Watson (“Husband”) appeals f...,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",,Plaintiff Dwight Watson (“Husband”) appeals fr...
1,Ib4e590e0a55f11e8a5d58a2c8dcb28b5,[On Appeal],"{'headtext': 'Background', 'paragraphs': ['Hus...",Plaintiff Dwight Watson (“Husband”) appeals f...,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Background,Husband and Wife were married in November 1989...
2,Ib4e590e0a55f11e8a5d58a2c8dcb28b5,[On Appeal],"{'headtext': 'Background', 'paragraphs': ['Hus...",Plaintiff Dwight Watson (“Husband”) appeals f...,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Background,A hearing was held on 25 October 2016. Followi...
3,Ib4e590e0a55f11e8a5d58a2c8dcb28b5,[On Appeal],"{'headtext': 'Analysis', 'paragraphs': ['Husba...",Plaintiff Dwight Watson (“Husband”) appeals f...,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Analysis,Husband argues that the trial court erred in v...
4,Ib4e590e0a55f11e8a5d58a2c8dcb28b5,[On Appeal],"{'headtext': 'Analysis', 'paragraphs': ['Husba...",Plaintiff Dwight Watson (“Husband”) appeals f...,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Analysis,Husband challenges some findings of fact as un...
5,Ib4e590e0a55f11e8a5d58a2c8dcb28b5,[On Appeal],"{'headtext': 'Analysis', 'paragraphs': ['Husba...",Plaintiff Dwight Watson (“Husband”) appeals f...,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Analysis,Our review of an equitable distribution order ...
6,Ib4e590e0a55f11e8a5d58a2c8dcb28b5,[On Appeal],"{'headtext': 'Analysis', 'paragraphs': ['Husba...",Plaintiff Dwight Watson (“Husband”) appeals f...,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Analysis,"However, even applying this generous standard ..."
7,Ib4e590e0a55f11e8a5d58a2c8dcb28b5,[On Appeal],"{'headtext': 'Analysis', 'paragraphs': ['Husba...",Plaintiff Dwight Watson (“Husband”) appeals f...,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Analysis,․
8,Ib4e590e0a55f11e8a5d58a2c8dcb28b5,[On Appeal],"{'headtext': 'Analysis', 'paragraphs': ['Husba...",Plaintiff Dwight Watson (“Husband”) appeals f...,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Analysis,"In fact, to enter a proper equitable distribut..."
9,Ib4e590e0a55f11e8a5d58a2c8dcb28b5,[On Appeal],"{'headtext': 'Analysis', 'paragraphs': ['Husba...",Plaintiff Dwight Watson (“Husband”) appeals f...,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Analysis,"Robinson v. Robinson, 210 N.C. App. 319, 322-2..."


In [31]:
# There are 542,915 rows in the dataframe but if you recount just the paragraphs the number is smaller at 542,169.
df.paragraphs.count()

542169

In [32]:
#Several of them are null.
len(df[df.paragraphs.isnull()])

746

In [33]:
#df[df.paragraphs.isnull()]

In [34]:
# Cross-check the null count: total rows minus non-null paragraphs.
# Computed from the frame instead of typed-in literals (542915 - 542169 = 746
# when this was originally run), so it stays correct if the data changes.
int(len(df) - df['paragraphs'].count())

746

In [35]:
df = df.dropna(subset=['paragraphs'])

In [36]:
# How many human-readable sections (text grouped under the same heading) does
# a document typically have? Count the distinct `headtext` values per
# document, then tabulate how often each count occurs. In this data the most
# common counts are 1, 4 and 5 conceptual sections.
num_sections = (
    df.groupby('documentId')['headtext']
      .nunique()
      .rename('unique_section_count')
      .reset_index()
)
num_sections['unique_section_count'].value_counts().head(10)

Unnamed: 0_level_0,count
unique_section_count,Unnamed: 1_level_1
1,7514
4,1537
5,1379
6,1198
7,1052
3,842
8,829
9,631
2,519
10,481


In [37]:
# Paragraphs per document, measured as the number of distinct paragraph texts
# (nunique), so a paragraph repeated verbatim within a document counts once.
# No document shows a count of 0 here because null paragraphs were dropped
# earlier; before that dropna there were 3 such documents.
# After the dropna, the largest count was 1165, reached by a single document.
num_paragraphs = (
    df.groupby('documentId')['paragraphs']
      .nunique()
      .rename('unique_paragraph_count')
      .reset_index()
)
# Frequency of each paragraph count, ordered by the count itself
num_paragraphs_to_plot = (
    num_paragraphs['unique_paragraph_count'].value_counts().sort_index()
)
num_paragraphs_to_plot

Unnamed: 0_level_0,count
unique_paragraph_count,Unnamed: 1_level_1
1,5
2,164
3,488
4,801
5,911
...,...
496,1
519,1
571,1
593,1


In [38]:
#After the dropna, the most common number of paragraphs per document was 6. There were 921 documents with 6 paragraphs.
num_paragraphs.unique_paragraph_count.value_counts().sort_values(ascending=False)

Unnamed: 0_level_0,count
unique_paragraph_count,Unnamed: 1_level_1
6,921
5,911
4,801
7,713
8,594
...,...
280,1
471,1
189,1
519,1


In [39]:
#What's going on here? Someone had a lot to say!
# Select the document(s) with the largest paragraph count by computing the
# maximum rather than hard-coding it (1165 when this was originally run), so
# the cell still finds the longest document if the data or cleaning changes.
longest_paragraph_count = num_paragraphs['unique_paragraph_count'].max()
num_paragraphs[num_paragraphs['unique_paragraph_count'] == longest_paragraph_count]

Unnamed: 0,documentId,unique_paragraph_count
11710,Ia5b6cf70e39e11e99758f497fe5ac24e,1165


In [40]:
#Wow, it's about a volcano! That must've been a well-examined case.
df[df['documentId']=='Ia5b6cf70e39e11e99758f497fe5ac24e']

Unnamed: 0,documentId,postures,sections,concatenated_text,len_postures,ohe_postures,headtext,paragraphs
73556,Ia5b6cf70e39e11e99758f497fe5ac24e,[],"{'headtext': 'MEMORANDUM OPINION, FINDINGS OF ...","MEMORANDUM OPINION, FINDINGS OF FACT, CONCLUSI...",0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","MEMORANDUM OPINION, FINDINGS OF FACT, CONCLUSI...",THIS MATTER comes before the Court on the benc...
73557,Ia5b6cf70e39e11e99758f497fe5ac24e,[],"{'headtext': 'FINDINGS OF FACT', 'paragraphs':...","MEMORANDUM OPINION, FINDINGS OF FACT, CONCLUSI...",0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",FINDINGS OF FACT,All parties have submitted proposed findings o...
73558,Ia5b6cf70e39e11e99758f497fe5ac24e,[],"{'headtext': '1. The Valles Caldera Geology.',...","MEMORANDUM OPINION, FINDINGS OF FACT, CONCLUSI...",0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1. The Valles Caldera Geology.,1. The Valles Caldera is a volcanic crater in ...
73559,Ia5b6cf70e39e11e99758f497fe5ac24e,[],"{'headtext': '1. The Valles Caldera Geology.',...","MEMORANDUM OPINION, FINDINGS OF FACT, CONCLUSI...",0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1. The Valles Caldera Geology.,"2. Approximately 1.2 million years ago, a seri..."
73560,Ia5b6cf70e39e11e99758f497fe5ac24e,[],"{'headtext': '1. The Valles Caldera Geology.',...","MEMORANDUM OPINION, FINDINGS OF FACT, CONCLUSI...",0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1. The Valles Caldera Geology.,3. Volcanologists classify the Valles Caldera ...
...,...,...,...,...,...,...,...,...
74725,Ia5b6cf70e39e11e99758f497fe5ac24e,[],{'headtext': 'c. Jemez Pueblo's Valles Caldera...,"MEMORANDUM OPINION, FINDINGS OF FACT, CONCLUSI...",0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",c. Jemez Pueblo's Valles Caldera Use Does Not ...,"Strong v. United States, 518 F.2d at 572."
74726,Ia5b6cf70e39e11e99758f497fe5ac24e,[],{'headtext': 'c. Jemez Pueblo's Valles Caldera...,"MEMORANDUM OPINION, FINDINGS OF FACT, CONCLUSI...",0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",c. Jemez Pueblo's Valles Caldera Use Does Not ...,462. The record contains no evidence that othe...
74727,Ia5b6cf70e39e11e99758f497fe5ac24e,[],{'headtext': 'c. Jemez Pueblo's Valles Caldera...,"MEMORANDUM OPINION, FINDINGS OF FACT, CONCLUSI...",0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",c. Jemez Pueblo's Valles Caldera Use Does Not ...,463. The record also does not support an infer...
74728,Ia5b6cf70e39e11e99758f497fe5ac24e,[],{'headtext': 'c. Jemez Pueblo's Valles Caldera...,"MEMORANDUM OPINION, FINDINGS OF FACT, CONCLUSI...",0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",c. Jemez Pueblo's Valles Caldera Use Does Not ...,464. The evidence that other Pueblos and Tribe...


In [41]:
#Here are documents of a more common length.
# Use the most frequent paragraph count (the mode; 6 when this was originally
# run) instead of a hard-coded value. If several counts tie for most frequent,
# mode() returns them sorted and the smallest is used.
most_common_paragraph_count = num_paragraphs['unique_paragraph_count'].mode().iloc[0]
num_paragraphs[num_paragraphs['unique_paragraph_count'] == most_common_paragraph_count]

Unnamed: 0,documentId,unique_paragraph_count
52,I00e513503d4211eabed3a1bc09b332eb,6
54,I00f087901ff611e9a174b18b713fc6d4,6
59,I01035af0da4011e9a803cc27e5772c47,6
113,I01ee0a8a4e6111e6a807ad48145ed9f1,6
118,I01fbb2f0324b11eabbc4990d21dc61be,6
...,...,...
17916,Ifeb1565017e311e892c0e944351936c3,6
17919,Ifebf2db06f3211e88808c81b5a222cba,6
17932,Ifef4f41003b911e98f4d8d23fc0d7c2b,6
17951,Iff3c2ef005c211ea99759a7d72d9b23a,6


In [42]:
#This seems to be a much simpler case.
df[df['documentId']=='I00e513503d4211eabed3a1bc09b332eb']

Unnamed: 0,documentId,postures,sections,concatenated_text,len_postures,ohe_postures,headtext,paragraphs
538619,I00e513503d4211eabed3a1bc09b332eb,"[Motion to Dismiss, On Appeal]","{'headtext': 'DECISION & ORDER', 'paragraphs':...",DECISION & ORDER ORDERED that the order is aff...,2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",DECISION & ORDER,"ORDERED that the order is affirmed, with costs."
538620,I00e513503d4211eabed3a1bc09b332eb,"[Motion to Dismiss, On Appeal]","{'headtext': 'DECISION & ORDER', 'paragraphs':...",DECISION & ORDER ORDERED that the order is aff...,2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",DECISION & ORDER,The plaintiff is the owner of a building locat...
538621,I00e513503d4211eabed3a1bc09b332eb,"[Motion to Dismiss, On Appeal]","{'headtext': 'DECISION & ORDER', 'paragraphs':...",DECISION & ORDER ORDERED that the order is aff...,2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",DECISION & ORDER,The role of an RPAPL article 7–A administrator...
538622,I00e513503d4211eabed3a1bc09b332eb,"[Motion to Dismiss, On Appeal]","{'headtext': 'DECISION & ORDER', 'paragraphs':...",DECISION & ORDER ORDERED that the order is aff...,2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",DECISION & ORDER,"Here, the plaintiff did not obtain permission ..."
538623,I00e513503d4211eabed3a1bc09b332eb,"[Motion to Dismiss, On Appeal]","{'headtext': 'DECISION & ORDER', 'paragraphs':...",DECISION & ORDER ORDERED that the order is aff...,2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",DECISION & ORDER,We reject the defendant's contention that this...
538624,I00e513503d4211eabed3a1bc09b332eb,"[Motion to Dismiss, On Appeal]","{'headtext': 'DECISION & ORDER', 'paragraphs':...",DECISION & ORDER ORDERED that the order is aff...,2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",DECISION & ORDER,"In light of our determination, we need not rea..."
