# Data preprocessing

This series of notebooks outline the steps undertook to preprocess the sampling data in preparation for EDA, label generation, and modeling.

Step 1: Clean up the columns

# Import libraries

In [1]:
import numpy as np
import pandas as pd

# Load the data

In [2]:
df = pd.read_csv("inputs/random_opinion.csv")
df

Unnamed: 0,id,date_created,date_modified,type,sha1,download_url,local_path,plain_text,html,html_lawbox,...,start_date,end_date,jurisdiction,notes,pacer_court_id,fjc_court_id,pacer_has_rss_feed,date_last_pacer_contact,pacer_rss_entry_types,parent_court_id
0,444587,2011-08-23 09:37:07+00,2023-08-05 04:12:40.860263+00,010combined,8079e7d17db825002e3e84d1ca4e95b1b230fe40,http://bulk.resource.org/courts.gov/c/F2/748/7...,,,"<p class=""case_cite"">748 F.2d 972</p>\n <p ...",,...,1891-03-03,,F,,5.0,5,f,,,
1,9410469,2023-07-21 14:07:23.906127+00,2023-07-21 15:10:53.580684+00,010combined,11ceaec7c3e06ead7db7bc2c17c59541427c098f,https://www.nebraska.gov/apps-courts-epub/publ...,pdf/2023/07/21/np_dodge_mgmt._co._v._holcomb.pdf,Nebraska Supreme Court Online Library\nwww.neb...,,,...,1854-01-01,,S,,,,,,,
2,714663,2012-04-17 06:49:10+00,2022-03-08 05:46:05.427551+00,010combined,8ec3e40994a1f967e668898eb39acd3bedd055cd,http://bulk.resource.org/courts.gov/c/F3/78/78...,,,"<p class=""case_cite"">78 F.3d 599</p>\n <p c...",,...,1980-10-14,,F,,11.0,11,t,,opinions,
3,2729050,2014-09-08 21:38:34.377891+00,2022-03-09 00:46:53.474332+00,010combined,3a35b2ef1a18590b49a158949595bb7ba14e42de,http://www.in.gov/judiciary/opinions/pdf/11011...,pdf/2012/11/01/darrius_woods_v._state_of_india...,"Pursuant to Ind. Appellate Rule 65(D),\nthis M...",,,...,1891-01-01,,SA,Created by Lawbox\r\nStart date: http://www.in...,,,,,,
4,692963,2012-04-17 05:18:08+00,2023-08-05 12:31:20.633974+00,010combined,77730da5d3a71908d6e3023a7e44de2a387ec5bd,http://bulk.resource.org/courts.gov/c/F3/51/51...,,,"<p class=""case_cite"">51 F.3d 288</p>\n <p c...",,...,1893-02-09,,F,,12.0,0,t,,"orders,opinions,judgments",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
948,524265,2011-08-23 10:00:25+00,2022-03-08 03:29:34.343665+00,010combined,93883994ea4fede70681f0f6b47fa81f1a703728,http://bulk.resource.org/courts.gov/c/F2/876/8...,,,"<p class=""case_cite"">876 F.2d 266</p>\n <p ...",,...,1891-03-03,,F,,2.0,2,t,,"orders, opinions",
949,839254,2013-03-01 21:46:23.176345+00,2022-03-08 08:42:13.16712+00,010combined,7cb10ff74f2dfdbc107282d053aed279398491ea,http://publicdocs.courts.mi.gov:81/SCT/PUBLIC/...,pdf/2008/06/23/people_of_mi_v._steven_anthony_...,Order ...,,<div>\n<center><b>750 N.W.2d 177 (2008)</b></c...,...,1837-01-01,,S,,,,,,,
950,563586,2011-08-23 10:11:53+00,2022-03-08 03:57:02.230031+00,010combined,2d74672e12e59d9a6b5179b82b4911c92d7d8b62,http://bulk.resource.org/courts.gov/c/F2/936/9...,,,"<p class=""case_cite"">936 F.2d 585</p>\n <p ...",,...,1980-10-14,,F,,11.0,11,t,,opinions,
951,9607832,2023-08-22 03:02:22.627003+00,2023-08-22 03:02:22.627019+00,030concurrence,,,,,,,...,1863-01-01,,S,,,,,,,


# Clean-up the columns

### remove columns that don't have any values

In [3]:
print("number of columns: ", len(df.columns))
df = df.dropna(axis=1, how="all")
print("number of remaining columns: ", len(df.columns))

number of columns:  130
number of remaining columns:  108


### clean-up the opinion columns

In [4]:
columns = ["id", "date_created", "date_modified", "type", "sha1", "download_url", "local_path",
    "plain_text", "html", "html_lawbox", "html_columbia", "html_with_citations",
    "extracted_by_ocr", "author_id", "cluster_id", "per_curiam", "page_count",
    "author_str", "joined_by_str", "xml_harvard", "html_anon_2020", "ordering_key"]
print("number of columns: ", len(columns))

dropped_columns = set(columns) - set(df.columns)
print("columns dropped: ", list(dropped_columns))
remaining_columns = set(columns) - set(dropped_columns)
print("number of columns remaining: ", len(remaining_columns))

number of columns:  22
columns dropped:  ['ordering_key', 'joined_by_str']
number of columns remaining:  20


In [5]:
columns_to_drop = ["date_modified", "sha1", "download_url", "local_path", "author_id", "author_str", "page_count"]
df = df.drop(columns=columns_to_drop)

print("columns dropped: ", list(columns_to_drop))
remaining_columns = set(remaining_columns) - set(columns_to_drop)
print("number of columns remaining: ", len(remaining_columns))

print("opinion columns: ", list(remaining_columns))

columns dropped:  ['date_modified', 'sha1', 'download_url', 'local_path', 'author_id', 'author_str', 'page_count']
number of columns remaining:  13
opinion columns:  ['plain_text', 'date_created', 'xml_harvard', 'cluster_id', 'html_anon_2020', 'per_curiam', 'id', 'html_with_citations', 'html', 'html_lawbox', 'html_columbia', 'extracted_by_ocr', 'type']


### clean-up the cluster columns

In [6]:
columns = ["id.1", "judges", "date_created.1", "date_modified.1", "date_filed", "slug",
    "case_name_short", "case_name", "case_name_full", "scdb_id", "source",
    "procedural_history", "attorneys", "nature_of_suit", "posture", "syllabus",
    "citation_count", "precedential_status", "date_blocked", "blocked", "docket_id",
    "scdb_decision_direction", "scdb_votes_majority", "scdb_votes_minority",
    "date_filed_is_approximate", "correction", "cross_reference", "disposition",
    "filepath_json_harvard", "headnotes", "history", "other_dates", "summary",
    "arguments", "headmatter", "filepath_pdf_harvard"]

dropped_columns = set(columns) - set(df.columns)
print("columns dropped: ", list(dropped_columns))
remaining_columns = set(columns) - set(dropped_columns)
print("number of columns remaining: ", len(remaining_columns))

columns dropped:  ['filepath_pdf_harvard', 'correction', 'procedural_history']
number of columns remaining:  33


In [7]:
columns_to_drop = ["id.1", "date_created.1", "date_modified.1", "date_filed", "slug", "scdb_id", "attorneys",
                  "date_blocked", "scdb_decision_direction", "scdb_votes_majority", "scdb_votes_minority",
                  "date_filed_is_approximate", "filepath_json_harvard", "other_dates"]
df = df.drop(columns=columns_to_drop)

print("columns dropped: ", list(columns_to_drop))
remaining_columns = set(remaining_columns) - set(columns_to_drop)
print("number of columns remaining: ", len(remaining_columns))

print("cluster columns: ", list(remaining_columns))

columns dropped:  ['id.1', 'date_created.1', 'date_modified.1', 'date_filed', 'slug', 'scdb_id', 'attorneys', 'date_blocked', 'scdb_decision_direction', 'scdb_votes_majority', 'scdb_votes_minority', 'date_filed_is_approximate', 'filepath_json_harvard', 'other_dates']
number of columns remaining:  19
cluster columns:  ['docket_id', 'arguments', 'case_name', 'posture', 'headmatter', 'judges', 'headnotes', 'citation_count', 'precedential_status', 'case_name_short', 'history', 'source', 'summary', 'disposition', 'blocked', 'case_name_full', 'nature_of_suit', 'syllabus', 'cross_reference']


### clean-up the docket columns

In [8]:
columns = ["id.2", "date_created.2", "date_modified.2", "date_argued", "date_reargued",
    "date_reargument_denied", "case_name_short.1", "case_name.1", "case_name_full.1",
    "slug.1", "docket_number", "date_blocked.1", "blocked.1", "court_id",
    "date_cert_denied", "date_cert_granted", "assigned_to_id", "cause", "date_filed.1",
    "date_last_filing", "date_terminated", "filepath_ia", "filepath_local",
    "jurisdiction_type", "jury_demand", "nature_of_suit.1", "pacer_case_id",
    "referred_to_id", "source.1", "assigned_to_str", "referred_to_str", "view_count",
    "date_last_index", "appeal_from_id", "appeal_from_str", "appellate_case_type_information",
    "appellate_fee_status", "panel_str", "originating_court_information_id", "mdl_status",
    "filepath_ia_json", "ia_date_first_change", "ia_needs_upload", "ia_upload_failure_count",
    "docket_number_core", "idb_data_id", "federal_defendant_number", "federal_dn_case_type",
    "federal_dn_judge_initials_assigned", "federal_dn_judge_initials_referred",
    "federal_dn_office_code", "parent_docket_id"
]

dropped_columns = set(columns) - set(df.columns)
print("columns dropped: ", list(dropped_columns))
remaining_columns = set(columns) - set(dropped_columns)
print("number of columns remaining: ", len(remaining_columns))

columns dropped:  ['date_reargued', 'jury_demand', 'federal_defendant_number', 'cause', 'panel_str', 'federal_dn_office_code', 'mdl_status', 'filepath_ia', 'federal_dn_case_type', 'parent_docket_id', 'date_cert_granted', 'filepath_local', 'date_cert_denied', 'referred_to_id', 'federal_dn_judge_initials_referred', 'federal_dn_judge_initials_assigned']
number of columns remaining:  36


In [9]:
columns_to_drop = ["id.2", "slug.1", 'blocked.1', 'case_name.1', 'case_name_full.1', 'case_name_short.1',
                  'date_blocked.1','date_created.2','date_filed.1', 'date_modified.2','nature_of_suit.1',
                  'source.1', 'idb_data_id', 'date_argued', 'appeal_from_id','appeal_from_str',
                   'appellate_case_type_information','appellate_fee_status','assigned_to_id','assigned_to_str',
                  'date_last_filing','date_last_index','date_reargument_denied','date_terminated', 
                   'filepath_ia_json','ia_date_first_change', 'ia_needs_upload','ia_upload_failure_count',
                   'originating_court_information_id', 'pacer_case_id', 'docket_number_core', "jurisdiction_type",
                  "referred_to_str"]
df = df.drop(columns=columns_to_drop)

print("columns dropped: ", list(columns_to_drop))
remaining_columns = set(remaining_columns) - set(columns_to_drop)
print("number of columns remaining: ", len(remaining_columns))

print("docket columns: ", list(remaining_columns))

columns dropped:  ['id.2', 'slug.1', 'blocked.1', 'case_name.1', 'case_name_full.1', 'case_name_short.1', 'date_blocked.1', 'date_created.2', 'date_filed.1', 'date_modified.2', 'nature_of_suit.1', 'source.1', 'idb_data_id', 'date_argued', 'appeal_from_id', 'appeal_from_str', 'appellate_case_type_information', 'appellate_fee_status', 'assigned_to_id', 'assigned_to_str', 'date_last_filing', 'date_last_index', 'date_reargument_denied', 'date_terminated', 'filepath_ia_json', 'ia_date_first_change', 'ia_needs_upload', 'ia_upload_failure_count', 'originating_court_information_id', 'pacer_case_id', 'docket_number_core', 'jurisdiction_type', 'referred_to_str']
number of columns remaining:  3
docket columns:  ['view_count', 'court_id', 'docket_number']


### clean-up the court columns

In [10]:
columns = court_columns = ["id.3", "date_modified.3", "in_use", "has_opinion_scraper", "has_oral_argument_scraper",
    "position", "citation_string", "short_name", "full_name", "url", "start_date",
    "end_date", "jurisdiction", "notes", "pacer_court_id", "fjc_court_id",
    "pacer_has_rss_feed", "date_last_pacer_contact", "pacer_rss_entry_types",
    "parent_court_id"
]

dropped_columns = set(columns) - set(df.columns)
print("columns dropped: ", list(dropped_columns))
remaining_columns = set(columns) - set(dropped_columns)
print("number of columns remaining: ", len(remaining_columns))

columns dropped:  ['date_last_pacer_contact']
number of columns remaining:  19


In [11]:
columns_to_drop = ['id.3', 'date_modified.3', 'start_date', 'end_date', 'url','citation_string',
                  'position','fjc_court_id','has_opinion_scraper','has_oral_argument_scraper','notes',
                  'pacer_court_id','pacer_has_rss_feed','pacer_rss_entry_types','parent_court_id']
df = df.drop(columns=columns_to_drop)

print("columns dropped: ", list(columns_to_drop))
remaining_columns = set(remaining_columns) - set(columns_to_drop)
print("number of columns remaining: ", len(remaining_columns))

print("court columns: ", list(remaining_columns))

columns dropped:  ['id.3', 'date_modified.3', 'start_date', 'end_date', 'url', 'citation_string', 'position', 'fjc_court_id', 'has_opinion_scraper', 'has_oral_argument_scraper', 'notes', 'pacer_court_id', 'pacer_has_rss_feed', 'pacer_rss_entry_types', 'parent_court_id']
number of columns remaining:  4
court columns:  ['jurisdiction', 'full_name', 'in_use', 'short_name']


### rename & reorder all columns

In [12]:
opinion_columns = {"id": "opinion_id", 
                   "date_created": "opinion_date_created",
                   "type": "opinion_type",
                   "extracted_by_ocr": "opinion_extracted_by_ocr",
                   "per_curiam": "opinion_per_curiam",
                   "plain_text": "opinion_plain_text",
                   "html": "opinion_html",
                   "html_with_citations": "opinion_html_with_citations",
                   "html_anon_2020": "opinion_html_anon_2020",
                   "html_columbia": "opinion_html_columbia",
                   "html_lawbox": "opinion_html_lawbox",
                   "xml_harvard": "opinion_xml_harvard"}

cluster_columns = {"cluster_id": "cluster_id",
                   "judges": "cluster_judges",
                   "nature_of_suit": "cluster_nature_of_suit",
                   "source": "cluster_source",
                   "blocked": "cluster_blocked",
                   "precedential_status": "cluster_precedential_status",
                   "citation_count": "cluster_citation_count",
                   "case_name": "cluster_case_name",
                   "case_name_short": "cluster_case_name_short",
                   "case_name_full": "cluster_case_name_full",
                   "summary": "cluster_summary",
                   "history": "cluster_history",
                   "headmatter": "cluster_headmatter",
                   "headnotes": "cluster_headnotes",
                   "posture": "cluster_posture",
                   "arguments": "cluster_arguments",
                   "cross_reference": "cluster_cross_reference",
                   "disposition": "cluster_disposition",
                   "syllabus": "cluster_syllabus"}

docket_columns = {'docket_id': 'docket_id',
                  'docket_number': 'docket_number',
                  'view_count': 'docket_view_count'}

court_columns = {'court_id': 'court_id',
                 'jurisdiction': 'court_jurisdiction', 
                 'in_use': 'court_in_use',
                 'short_name': 'court_short_name',
                 'full_name': 'court_full_name'}

combined_columns = {}
for d in [opinion_columns, cluster_columns, docket_columns, court_columns]:
    combined_columns.update(d)

df.rename(columns=combined_columns, inplace=True)
df = df[list(combined_columns.values())]

In [13]:
df.head()

Unnamed: 0,opinion_id,opinion_date_created,opinion_type,opinion_extracted_by_ocr,opinion_per_curiam,opinion_plain_text,opinion_html,opinion_html_with_citations,opinion_html_anon_2020,opinion_html_columbia,...,cluster_disposition,cluster_syllabus,docket_id,docket_number,docket_view_count,court_id,court_jurisdiction,court_in_use,court_short_name,court_full_name
0,444587,2011-08-23 09:37:07+00,010combined,f,f,,"<p class=""case_cite"">748 F.2d 972</p>\n <p ...","<p class=""case_cite""><span class=""citation no-...",,,...,,,464952,84-1064,1,ca5,F,t,Fifth Circuit,Court of Appeals for the Fifth Circuit
1,9410469,2023-07-21 14:07:23.906127+00,010combined,f,f,Nebraska Supreme Court Online Library\nwww.neb...,,"<pre class=""inline"">Nebraska Supreme Court Onl...",,,...,,,67625502,S-22-272,0,neb,S,t,Nebraska Supreme Court,Nebraska Supreme Court
2,714663,2012-04-17 06:49:10+00,010combined,f,f,,"<p class=""case_cite"">78 F.3d 599</p>\n <p c...","<p class=""case_cite""><span class=""citation no-...",,,...,,,2320406,94-2273,0,ca11,F,t,Eleventh Circuit,Court of Appeals for the Eleventh Circuit
3,2729050,2014-09-08 21:38:34.377891+00,010combined,f,f,"Pursuant to Ind. Appellate Rule 65(D),\nthis M...",,"<pre class=""inline"">Pursuant to Ind. Appellate...",,,...,,,1202177,20A03-1202-CR-90,0,indctapp,SA,t,Indiana Court of Appeals,Indiana Court of Appeals
4,692963,2012-04-17 05:18:08+00,010combined,f,f,,"<p class=""case_cite"">51 F.3d 288</p>\n <p c...","<p class=""case_cite""><span class=""citation no-...",,,...,,,64372,91-3081,1,cadc,F,t,D.C. Circuit,Court of Appeals for the D.C. Circuit


In [14]:
print("number of columns: ", len(df.columns))
df.columns

number of columns:  39


Index(['opinion_id', 'opinion_date_created', 'opinion_type',
       'opinion_extracted_by_ocr', 'opinion_per_curiam', 'opinion_plain_text',
       'opinion_html', 'opinion_html_with_citations', 'opinion_html_anon_2020',
       'opinion_html_columbia', 'opinion_html_lawbox', 'opinion_xml_harvard',
       'cluster_id', 'cluster_judges', 'cluster_nature_of_suit',
       'cluster_source', 'cluster_blocked', 'cluster_precedential_status',
       'cluster_citation_count', 'cluster_case_name',
       'cluster_case_name_short', 'cluster_case_name_full', 'cluster_summary',
       'cluster_history', 'cluster_headmatter', 'cluster_headnotes',
       'cluster_posture', 'cluster_arguments', 'cluster_cross_reference',
       'cluster_disposition', 'cluster_syllabus', 'docket_id', 'docket_number',
       'docket_view_count', 'court_id', 'court_jurisdiction', 'court_in_use',
       'court_short_name', 'court_full_name'],
      dtype='object')

### save the sample data with cleaned columns

In [15]:
df.to_csv("outputs/1a.columns_cleaned.csv", index=False)