### Preliminary Analysis

In [1]:
import pandas as pd
import string

In [6]:
cases_text = pd.read_csv("/Users/iphonex/Downloads/Court-Cases-Text-Analytics/Ontario-Court-Cases/data/courtlistener_cases.csv")
cases_meta = pd.read_json('data/courtlistener_cases.json')

In [7]:
# Check first rows
print("Text CSV columns:", cases_text.columns)
print("Metadata JSON columns:", cases_meta.columns)

Text CSV columns: Index(['docket_id', 'docket_number', 'case_name', 'court_id', 'date_filed',
       'date_terminated', 'nature_of_suit', 'cause', 'jurisdiction_type',
       'cluster_id', 'cluster_date_filed', 'cluster_case_name', 'judges',
       'panel_str', 'citation_count', 'opinion_id', 'opinion_type',
       'author_str', 'opinion_text_html', 'opinion_text_plain', 'download_url',
       'opinions_cited_count', 'absolute_url', 'cluster_url'],
      dtype='str')
Metadata JSON columns: Index(['docket_id', 'docket_number', 'case_name', 'court_id', 'date_filed',
       'date_terminated', 'nature_of_suit', 'cause', 'jurisdiction_type',
       'cluster_id', 'cluster_date_filed', 'cluster_case_name', 'judges',
       'panel_str', 'citation_count', 'opinion_id', 'opinion_type',
       'author_str', 'opinion_text_html', 'opinion_text_plain', 'download_url',
       'opinions_cited_count', 'absolute_url', 'cluster_url'],
      dtype='str')


In [None]:

from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Define legal stopwords
legal_stopwords = set([
    "court", "plaintiff", "defendant", "opinion", "slip", "term", "filed", "judge"
])

# Cleaning function
def clean_text(text):
    text = str(text).lower()                   # lowercase
    text = re.sub(r'\d+', '', text)            # remove numbers
    text = re.sub(r'\s+', ' ', text)           # collapse whitespace
    text = re.sub(r'[^\w\s]', '', text)        # remove punctuation
    tokens = text.split()
    tokens = [t for t in tokens if t not in ENGLISH_STOP_WORDS and t not in legal_stopwords]
    return " ".join(tokens)

# Apply cleaning and store in new column
cases_text['clean_text'] = cases_text['case_text'].apply(clean_text)

# Remove empty docs if needed
cases_text = cases_text[cases_text['clean_text'].str.strip() != ""]

In [None]:

# ---- DATE SPAN ----
if 'cluster_date_filed' in cases_meta.columns:
    cases_meta['cluster_date_filed'] = pd.to_datetime(cases_meta['cluster_date_filed'], errors='coerce')
    print("\nCases span from", cases_meta['cluster_date_filed'].min().date(),
          "to", cases_meta['cluster_date_filed'].max().date())

# ---- JURISDICTION ----
if 'jurisdiction' in cases_meta.columns:
    print("\nJurisdictions covered:")
    print(cases_meta['jurisdiction'].value_counts())

# ---- TOTAL NUMBER OF JUDGES ----
if 'judges' in cases_meta.columns:
    all_judges = set()
    for j in cases_meta['judges']:
        if pd.isna(j):
            continue
        try:
          
            judges_list = ast.literal_eval(j) if isinstance(j, str) else j
            all_judges.update(judges_list)
        except:
            all_judges.add(j)  
    print("\nTotal unique judges across all cases:", len(all_judges))



Cases span from 2024-04-15 to 2026-01-26

Total unique judges across all cases: 10


In [19]:
from bertopic import BERTopic
import pandas as pd

# Load CSV
cases_text = pd.read_csv("/Users/iphonex/Downloads/Court-Cases-Text-Analytics/Ontario-Court-Cases/data/courtlistener_cases.csv")

# Inspect columns to find the case text column
print(cases_text.columns)
# Suppose it is 'case_text' (replace with the correct one)
text_col = 'opinion_text_plain'

# Ensure all values are strings and remove empty docs
docs = cases_text[text_col].fillna("").astype(str)
docs = docs[docs.str.strip() != ""]  # remove docs that are just whitespace

print(f"Number of documents to analyze: {len(docs)}")

# Create and fit BERTopic
topic_model = BERTopic(language="english", calculate_probabilities=True)
topics, probs = topic_model.fit_transform(docs.tolist())

# Add predicted topic to the DataFrame
cases_text.loc[docs.index, 'predicted_topic'] = topics

# See top 10 topics
print(topic_model.get_topic_info().head(10))




Index(['docket_id', 'docket_number', 'case_name', 'court_id', 'date_filed',
       'date_terminated', 'nature_of_suit', 'cause', 'jurisdiction_type',
       'cluster_id', 'cluster_date_filed', 'cluster_case_name', 'judges',
       'panel_str', 'citation_count', 'opinion_id', 'opinion_type',
       'author_str', 'opinion_text_html', 'opinion_text_plain', 'download_url',
       'opinions_cited_count', 'absolute_url', 'cluster_url'],
      dtype='str')
Number of documents to analyze: 274


Loading weights: 100%|██████████| 103/103 [00:00<00:00, 1284.83it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


   Topic  Count               Name  \
0     -1    127  -1_the_of_to_that   
1      0     87   0_the_of_to_that   
2      1     50   1_the_of_to_that   
3      2     10    2_the_of_to_and   

                                      Representation  \
0      [the, of, to, that, and, in, for, at, is, as]   
1   [the, of, to, that, and, in, for, at, court, is]   
2      [the, of, to, that, in, and, at, is, for, as]   
3  [the, of, to, and, in, court, that, for, this,...   

                                 Representative_Docs  
0  [(Slip Opinion)              OCTOBER TERM, 202...  
1  [                   PRELIMINARY PRINT\n\n     ...  
2  [(Slip Opinion)              OCTOBER TERM, 202...  
3  [                   Cite as: 606 U. S. ____ (2...  


In [16]:
# See the first few rows and column names
print(cases_text.columns)
print(cases_text.head(1).T)  # Transpose to see all columns clearly


Index(['docket_id', 'docket_number', 'case_name', 'court_id', 'date_filed',
       'date_terminated', 'nature_of_suit', 'cause', 'jurisdiction_type',
       'cluster_id', 'cluster_date_filed', 'cluster_case_name', 'judges',
       'panel_str', 'citation_count', 'opinion_id', 'opinion_type',
       'author_str', 'opinion_text_html', 'opinion_text_plain', 'download_url',
       'opinions_cited_count', 'absolute_url', 'cluster_url',
       'predicted_case_type'],
      dtype='str')
                                                                      0
docket_id                                                      72187113
docket_number                                                     25-51
case_name                                               Klein v. Martin
court_id                                                         scotus
date_filed                                                          NaN
date_terminated                                                     NaN
nature_of_su