# Data preprocessing

This series of notebooks outlines the steps undertaken to preprocess the sample data in preparation for EDA, label generation, and modeling.

Step 2: Clean up the opinions from the various sources and combine them into one column.

# Import libraries

In [1]:
import numpy as np
import pandas as pd

from bs4 import BeautifulSoup
from eyecite import clean_text

# Helper functions

In [2]:
# Function to recursively extract xml text elements and combine them into one string
def extract_text(tag):
    """Return the text contained in ``tag`` as a single string.

    ``tag.get_text(strip=False)`` is tried first, so whitespace is preserved
    exactly as it appears. Only when that yields an empty string does the
    function descend into the tag's direct children and concatenate whatever
    each of them yields.

    Args:
        tag: Object exposing ``get_text(strip=...)`` and ``find_all(...)``;
            in this notebook, a BeautifulSoup document or tag.

    Returns:
        str: The collected text, or ``''`` when nothing is found.
    """
    text = tag.get_text(strip=False)
    if text:
        return text

    # Visit direct children only: each child recurses into its own subtree,
    # so iterating over every descendant here would count nested text more
    # than once.
    return ''.join(
        extract_text(child) for child in tag.find_all(True, recursive=False)
    )

# Function to clean opinion_text in xml format
def clean_xml(opinion_text):
    """Convert an XML-formatted opinion into cleaned plain text.

    The markup is parsed with BeautifulSoup's ``lxml-xml`` parser, the content
    of every ``<a>`` element is blanked, the remaining text is collected with
    ``extract_text`` and the result is passed through eyecite's ``clean_text``
    using the ``all_whitespace`` and ``underscores`` steps.

    Args:
        opinion_text: The opinion as an XML string.

    Returns:
        The value returned by ``clean_text`` for the extracted text.
    """
    document = BeautifulSoup(opinion_text, 'lxml-xml')

    # Blank out anchor contents so their text is not part of the result.
    for anchor in document.find_all('a'):
        anchor.string = ''

    raw_text = extract_text(document)
    return clean_text(raw_text, ['all_whitespace', 'underscores'])

# Function to clean opinion_text in html format
def clean_html(opinion_text):
    """Clean an HTML-formatted opinion with eyecite's ``clean_text``.

    The ``html``, ``all_whitespace`` and ``underscores`` cleaning steps are
    requested, in that order.

    Args:
        opinion_text: The opinion as an HTML string.

    Returns:
        The value returned by ``clean_text``.
    """
    steps = ['html', 'all_whitespace', 'underscores']
    return clean_text(opinion_text, steps)

# Function to clean opinion_text in plain_text format
def clean_plain(opinion_text):
    """Clean a plain-text opinion with eyecite's ``clean_text``.

    The ``all_whitespace`` and ``underscores`` cleaning steps are requested,
    in that order.

    Args:
        opinion_text: The opinion as a plain string.

    Returns:
        The value returned by ``clean_text``.
    """
    steps = ['all_whitespace', 'underscores']
    return clean_text(opinion_text, steps)

In [3]:
# Function to get the first non-null value from the specified columns
def get_first_non_null(row, columns):
    """Pick the first non-null value of ``row`` among ``columns``.

    Columns are examined in the order given; a value counts as present when
    ``pd.notna`` is true for it.

    Args:
        row: Mapping-like object indexable by column name (e.g. a Series).
        columns: Column names to try, highest priority first.

    Returns:
        tuple: ``(value, column_name)`` for the first present value, or
        ``(None, None)`` when every listed column is null.
    """
    # Lazily pair each value with its column so lookups stop at the first hit.
    candidates = ((row[name], name) for name in columns)
    return next(
        (pair for pair in candidates if pd.notna(pair[0])),
        (None, None),
    )

# Load the data

In [4]:
# Read the input CSV for this step (presumably the output of the preceding
# column-cleaning notebook - confirm). The bare `df` on the last line renders
# the frame as the cell output.
df = pd.read_csv("outputs/1a.columns_cleaned.csv")
df

Unnamed: 0,opinion_id,opinion_date_created,opinion_type,opinion_extracted_by_ocr,opinion_per_curiam,opinion_plain_text,opinion_html,opinion_html_with_citations,opinion_html_anon_2020,opinion_html_columbia,...,cluster_disposition,cluster_syllabus,docket_id,docket_number,docket_view_count,court_id,court_jurisdiction,court_in_use,court_short_name,court_full_name
0,444587,2011-08-23 09:37:07+00,010combined,f,f,,"<p class=""case_cite"">748 F.2d 972</p>\n <p ...","<p class=""case_cite""><span class=""citation no-...",,,...,,,464952,84-1064,1,ca5,F,t,Fifth Circuit,Court of Appeals for the Fifth Circuit
1,9410469,2023-07-21 14:07:23.906127+00,010combined,f,f,Nebraska Supreme Court Online Library\nwww.neb...,,"<pre class=""inline"">Nebraska Supreme Court Onl...",,,...,,,67625502,S-22-272,0,neb,S,t,Nebraska Supreme Court,Nebraska Supreme Court
2,714663,2012-04-17 06:49:10+00,010combined,f,f,,"<p class=""case_cite"">78 F.3d 599</p>\n <p c...","<p class=""case_cite""><span class=""citation no-...",,,...,,,2320406,94-2273,0,ca11,F,t,Eleventh Circuit,Court of Appeals for the Eleventh Circuit
3,2729050,2014-09-08 21:38:34.377891+00,010combined,f,f,"Pursuant to Ind. Appellate Rule 65(D),\nthis M...",,"<pre class=""inline"">Pursuant to Ind. Appellate...",,,...,,,1202177,20A03-1202-CR-90,0,indctapp,SA,t,Indiana Court of Appeals,Indiana Court of Appeals
4,692963,2012-04-17 05:18:08+00,010combined,f,f,,"<p class=""case_cite"">51 F.3d 288</p>\n <p c...","<p class=""case_cite""><span class=""citation no-...",,,...,,,64372,91-3081,1,cadc,F,t,D.C. Circuit,Court of Appeals for the D.C. Circuit
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
948,524265,2011-08-23 10:00:25+00,010combined,f,f,,"<p class=""case_cite"">876 F.2d 266</p>\n <p ...","<p class=""case_cite""><span class=""citation no-...",,,...,,,686659,673,0,ca2,F,t,Second Circuit,Court of Appeals for the Second Circuit
949,839254,2013-03-01 21:46:23.176345+00,010combined,f,f,Order ...,,"<div>\n<center><b><span class=""citation"" data-...",,,...,,,2439863,132002,0,mich,S,t,Michigan Supreme Court,Michigan Supreme Court
950,563586,2011-08-23 10:11:53+00,010combined,f,f,,"<p class=""case_cite"">936 F.2d 585</p>\n <p ...","<p class=""case_cite""><span class=""citation no-...",,,...,,,1521449,90-3023,0,ca11,F,t,Eleventh Circuit,Court of Appeals for the Eleventh Circuit
951,9607832,2023-08-22 03:02:22.627003+00,030concurrence,f,f,,,,,,...,,,774786,"15817, 15818",0,idaho,S,t,Idaho Supreme Court,Idaho Supreme Court


In [5]:
# List the column names available in the loaded frame.
df.columns

Index(['opinion_id', 'opinion_date_created', 'opinion_type',
       'opinion_extracted_by_ocr', 'opinion_per_curiam', 'opinion_plain_text',
       'opinion_html', 'opinion_html_with_citations', 'opinion_html_anon_2020',
       'opinion_html_columbia', 'opinion_html_lawbox', 'opinion_xml_harvard',
       'cluster_id', 'cluster_judges', 'cluster_nature_of_suit',
       'cluster_source', 'cluster_blocked', 'cluster_precedential_status',
       'cluster_citation_count', 'cluster_case_name',
       'cluster_case_name_short', 'cluster_case_name_full', 'cluster_summary',
       'cluster_history', 'cluster_headmatter', 'cluster_headnotes',
       'cluster_posture', 'cluster_arguments', 'cluster_cross_reference',
       'cluster_disposition', 'cluster_syllabus', 'docket_id', 'docket_number',
       'docket_view_count', 'court_id', 'court_jurisdiction', 'court_in_use',
       'court_short_name', 'court_full_name'],
      dtype='object')

### Confirm opinion_id is the unique identifier for each opinion

In [6]:
# Every row must carry its own opinion_id: the number of distinct ids has to
# equal the number of rows.
opinion_ids = df['opinion_id']
assert opinion_ids.nunique() == len(opinion_ids)

In [7]:
# Columns that can hold the text of an opinion, one per source/format.
opinions_columns = [
    'opinion_plain_text',
    'opinion_html',
    'opinion_html_with_citations',
    'opinion_html_anon_2020',
    'opinion_html_columbia',
    'opinion_html_lawbox',
    'opinion_xml_harvard',
]

df[opinions_columns].head()

Unnamed: 0,opinion_plain_text,opinion_html,opinion_html_with_citations,opinion_html_anon_2020,opinion_html_columbia,opinion_html_lawbox,opinion_xml_harvard
0,,"<p class=""case_cite"">748 F.2d 972</p>\n <p ...","<p class=""case_cite""><span class=""citation no-...",,,,"<?xml version=""1.0"" encoding=""utf-8""?>\n<opini..."
1,Nebraska Supreme Court Online Library\nwww.neb...,,"<pre class=""inline"">Nebraska Supreme Court Onl...",,,,
2,,"<p class=""case_cite"">78 F.3d 599</p>\n <p c...","<p class=""case_cite""><span class=""citation no-...",,,,
3,"Pursuant to Ind. Appellate Rule 65(D),\nthis M...",,"<pre class=""inline"">Pursuant to Ind. Appellate...",,,,
4,,"<p class=""case_cite"">51 F.3d 288</p>\n <p c...","<p class=""case_cite""><span class=""citation no-...",,,,"<?xml version=""1.0"" encoding=""utf-8""?>\n<opini..."


In [8]:
# Number of missing values in each opinion source column.
df[opinions_columns].isna().sum()

opinion_plain_text             798
opinion_html                   870
opinion_html_with_citations    549
opinion_html_anon_2020         948
opinion_html_columbia          885
opinion_html_lawbox            809
opinion_xml_harvard            233
dtype: int64

# Extract and clean the opinions from each respective column

In [9]:
# Cleaner to use for each opinion source column, chosen by the column's format.
cleaning_functions = {
    'opinion_plain_text': clean_plain,
    'opinion_html': clean_html,
    'opinion_html_with_citations': clean_html,
    'opinion_html_anon_2020': clean_html,
    'opinion_html_columbia': clean_html,
    'opinion_html_lawbox': clean_html,
    'opinion_xml_harvard': clean_xml,
}

for column, clean_func in cleaning_functions.items():
    # na_action='ignore' leaves missing cells as they are instead of passing
    # them to the cleaner.
    df.loc[:, column] = df[column].map(clean_func, na_action='ignore')

In [10]:
# Preview the opinion source columns after cleaning.
df[opinions_columns].head()

Unnamed: 0,opinion_plain_text,opinion_html,opinion_html_with_citations,opinion_html_anon_2020,opinion_html_columbia,opinion_html_lawbox,opinion_xml_harvard
0,,"748 F.2d 972 UNITED STATES of America, Plainti...","748 F.2d 972 UNITED STATES of America, Plainti...",,,,"REAVLEY, Circuit Judge: Juan Velasquez appeal..."
1,Nebraska Supreme Court Online Library www.nebr...,,Nebraska Supreme Court Online Library www.nebr...,,,,
2,,78 F.3d 599 U.S. v. Johnston ** NO. 94-2273 Un...,78 F.3d 599 U.S. v. Johnston ** NO. 94-2273 Un...,,,,
3,"Pursuant to Ind. Appellate Rule 65(D), this Me...",,"Pursuant to Ind. Appellate Rule 65(D), this Me...",,,,
4,,51 F.3d 288 311 U.S.App.D.C. 145 UNITED STATES...,51 F.3d 288 311 U.S.App.D.C. 145 UNITED STATES...,,,,Opinion of the Court filed by Circuit Judge T...


# Create one column for the opinion to be used for modeling

In accordance with the [documentation](https://www.courtlistener.com/help/api/rest/case-law/), the opinion used for modeling is taken from the first populated source column, in the order below (from best to worst).

In [11]:
# Source columns in priority order: the first populated one is used as the
# opinion text.
opinion_order = [
    "opinion_html_with_citations",
    "opinion_html_columbia",
    "opinion_html_lawbox",
    "opinion_xml_harvard",
    "opinion_html_anon_2020",
    "opinion_html",
    "opinion_plain_text",
]

In [12]:
def _select_opinion(row):
    """Return the chosen opinion text and its source column as a Series."""
    return pd.Series(get_first_non_null(row, opinion_order))


# One (text, source column) pair per row, following the priority in opinion_order.
selected_opinions = df.apply(_select_opinion, axis=1)
df[['opinion', 'opinion_source']] = selected_opinions

df[opinions_columns + ['opinion', 'opinion_source']].head()

Unnamed: 0,opinion_plain_text,opinion_html,opinion_html_with_citations,opinion_html_anon_2020,opinion_html_columbia,opinion_html_lawbox,opinion_xml_harvard,opinion,opinion_source
0,,"748 F.2d 972 UNITED STATES of America, Plainti...","748 F.2d 972 UNITED STATES of America, Plainti...",,,,"REAVLEY, Circuit Judge: Juan Velasquez appeal...","748 F.2d 972 UNITED STATES of America, Plainti...",opinion_html_with_citations
1,Nebraska Supreme Court Online Library www.nebr...,,Nebraska Supreme Court Online Library www.nebr...,,,,,Nebraska Supreme Court Online Library www.nebr...,opinion_html_with_citations
2,,78 F.3d 599 U.S. v. Johnston ** NO. 94-2273 Un...,78 F.3d 599 U.S. v. Johnston ** NO. 94-2273 Un...,,,,,78 F.3d 599 U.S. v. Johnston ** NO. 94-2273 Un...,opinion_html_with_citations
3,"Pursuant to Ind. Appellate Rule 65(D), this Me...",,"Pursuant to Ind. Appellate Rule 65(D), this Me...",,,,,"Pursuant to Ind. Appellate Rule 65(D), this Me...",opinion_html_with_citations
4,,51 F.3d 288 311 U.S.App.D.C. 145 UNITED STATES...,51 F.3d 288 311 U.S.App.D.C. 145 UNITED STATES...,,,,Opinion of the Court filed by Circuit Judge T...,51 F.3d 288 311 U.S.App.D.C. 145 UNITED STATES...,opinion_html_with_citations


### Confirm no blank opinions

In [13]:
# An opinion must be present AND contain visible text. A null check alone
# would let empty or whitespace-only strings through, which cleaning may
# produce when a source holds markup but no text.
opinion_missing = df["opinion"].isnull()
opinion_blank = df["opinion"].astype(str).str.strip().eq("")

assert not opinion_missing.any(), f"{opinion_missing.sum()} rows have no opinion"
assert not opinion_blank.any(), f"{opinion_blank.sum()} rows have a blank opinion"

# Clean up the columns & save the cleaned sample

### Drop the separate opinion columns and keep only the combined opinion column

In [14]:
# Remove the per-source opinion columns; the combined 'opinion' column stays.
df = df.drop(opinions_columns, axis='columns')
df.columns

Index(['opinion_id', 'opinion_date_created', 'opinion_type',
       'opinion_extracted_by_ocr', 'opinion_per_curiam', 'cluster_id',
       'cluster_judges', 'cluster_nature_of_suit', 'cluster_source',
       'cluster_blocked', 'cluster_precedential_status',
       'cluster_citation_count', 'cluster_case_name',
       'cluster_case_name_short', 'cluster_case_name_full', 'cluster_summary',
       'cluster_history', 'cluster_headmatter', 'cluster_headnotes',
       'cluster_posture', 'cluster_arguments', 'cluster_cross_reference',
       'cluster_disposition', 'cluster_syllabus', 'docket_id', 'docket_number',
       'docket_view_count', 'court_id', 'court_jurisdiction', 'court_in_use',
       'court_short_name', 'court_full_name', 'opinion', 'opinion_source'],
      dtype='object')

### Add a column for the opinion word count

In [15]:
# Split each opinion on whitespace and count the resulting tokens.
opinion_tokens = df["opinion"].astype(str).str.split()
df["opinion_word_count"] = opinion_tokens.str.len()

df["opinion_word_count"].describe()

count      953.000000
mean      1633.599161
std       2217.746311
min          1.000000
25%        136.000000
50%        891.000000
75%       2222.000000
max      21186.000000
Name: opinion_word_count, dtype: float64

### Reorder the columns

In [16]:
columns = df.columns.tolist()

# Move the combined-opinion columns to positions 1-3, right after the first
# column, keeping every other column in its existing relative order.
for position, name in enumerate(
    ['opinion', 'opinion_source', 'opinion_word_count'], start=1
):
    columns.remove(name)
    columns.insert(position, name)

df = df[columns]
df.columns

Index(['opinion_id', 'opinion', 'opinion_source', 'opinion_word_count',
       'opinion_date_created', 'opinion_type', 'opinion_extracted_by_ocr',
       'opinion_per_curiam', 'cluster_id', 'cluster_judges',
       'cluster_nature_of_suit', 'cluster_source', 'cluster_blocked',
       'cluster_precedential_status', 'cluster_citation_count',
       'cluster_case_name', 'cluster_case_name_short',
       'cluster_case_name_full', 'cluster_summary', 'cluster_history',
       'cluster_headmatter', 'cluster_headnotes', 'cluster_posture',
       'cluster_arguments', 'cluster_cross_reference', 'cluster_disposition',
       'cluster_syllabus', 'docket_id', 'docket_number', 'docket_view_count',
       'court_id', 'court_jurisdiction', 'court_in_use', 'court_short_name',
       'court_full_name'],
      dtype='object')

### Save the cleaned file

In [17]:
# Write the combined, cleaned frame to disk; index=False leaves the row index
# out of the CSV.
df.to_csv("outputs/1b.opinions_cleaned.csv", index=False)