In [8]:
#Import libraries
import pandas as pd
from bs4 import BeautifulSoup
import requests
import time
import os
import zipfile
import os
import re


In [10]:
#Import df and their associated .txt files

df = pd.read_csv('df_aligned.csv')

clean_folder = "/Users/iphonex/Downloads/Court-Cases-Text-Analytics/Ontario-Court-Cases/court_case_texts_cleaned"

# Read texts in same order as df
texts = []
for fname in df['file_identifier']:
    file_path = os.path.join(clean_folder, fname)
    with open(file_path, encoding='utf-8') as f:
        texts.append(f.read())

print(f"Loaded {len(df)} cases and {len(texts)} texts")


Loaded 385 cases and 385 texts


### Basic Text Pre-Processing 

In [11]:
#Cleaning function 
def clean_court_text_keep_paragraphs(text):
    # 1. Remove everything above "ONTARIO COURT OF JUSTICE"
    if "ONTARIO COURT OF JUSTICE" in text:
        text = "ONTARIO COURT OF JUSTICE\n" + text.split("ONTARIO COURT OF JUSTICE", 1)[1]

    # 2. Remove bracketed numbers like [16], [1], [2] etc.
    text = re.sub(r'\[\s*\d+\s*\]', '', text)
    
    # 3. Replace multiple line breaks with a single line break to keep paragraphs
    text = re.sub(r'\n{2,}', '\n', text)  # keep single line break between paragraphs
    
    # 4. Replace multiple spaces with a single space (within paragraphs)
    text = re.sub(r'[ ]{2,}', ' ', text)
    
    # 5. Lowercase for NLP (optional)
    text = text.lower()
    
    # 6. Strip leading/trailing spaces on each line
    text = '\n'.join([line.strip() for line in text.split('\n') if line.strip() != ''])
    
    return text

# Create folder for cleaned files
clean_folder = "court_case_texts_cleaned"
os.makedirs(clean_folder, exist_ok=True)

# Clean and save each file
for root, dirs, files in os.walk(extract_path):
    for file in files:
        if file.endswith(".txt"):
            path = os.path.join(root, file)
            with open(path, "r", encoding="utf-8", errors="ignore") as f:
                raw_text = f.read()
            
            cleaned_text = clean_court_text_keep_paragraphs(raw_text)
            
            # Save cleaned text
            cleaned_path = os.path.join(clean_folder, file)
            with open(cleaned_path, "w", encoding="utf-8") as f:
                f.write(cleaned_text)

print(f"All cleaned files saved in: {clean_folder}")

All cleaned files saved in: court_case_texts_cleaned


### TF-ID and KMeans Approach for Topic Clustering

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF representation
# Use texts already loaded from previous cell
vectorizer = TfidfVectorizer(
    stop_words="english",
    max_df=0.85,
    min_df=5,
    ngram_range=(1,2)
)

X = vectorizer.fit_transform(texts)
print(f"TF-IDF matrix shape: {X.shape}")

ValueError: empty vocabulary; perhaps the documents only contain stop words

In [None]:
from sklearn.cluster import KMeans

n_clusters = 8   # you can tune this
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
clusters = kmeans.fit_predict(X)

df["unsupervised_cluster"] = clusters


In [None]:
import numpy as np

terms = vectorizer.get_feature_names_out()

def print_top_terms_per_cluster(kmeans_model, n_terms=15):
    order_centroids = kmeans_model.cluster_centers_.argsort()[:, ::-1]
    
    for i in range(n_clusters):
        print(f"\nCluster {i}:")
        top_terms = [terms[ind] for ind in order_centroids[i, :n_terms]]
        print(", ".join(top_terms))

print_top_terms_per_cluster(kmeans)



Cluster 0:
defendant, evidence, officer, police, reasonable, breath, court, vehicle, doubt, did, alcohol, reasonable doubt, said, testimony, testified

Cluster 1:
cookies, captcha, canlii, performance, site, manage cookies, privacy policy, accept cookies, functionality, website, improve, manage, policy, help, use

Cluster 2:
sentence, mr, offender, sentencing, court, years, conditional, offence, victim, conditional sentence, community, firearm, offences, criminal, custody

Cluster 3:
complainant, evidence, accused, sexual, did, testified, ms, crown, assault, defence, court, consent, doubt, reasonable, trial

Cluster 4:
mr, ms, evidence, crown, did, testified, court, police, reasonable, doubt, officer, accused, reasonable doubt, said, time

Cluster 5:
mr, officer, charter, evidence, demand, police, applicant, accused, pc, counsel, grounds, breath, ito, affiant, reasonable

Cluster 6:
delay, trial, crown, jordan, defence, dates, court, applicant, disclosure, ceiling, days, 11, defence d

Cluster 1 seems to be a groupping of baddly scrapped cases.
After manual checking, all documents appearing in cluster 1 need to be removed from the data, as they do not contain any information regarding criminal cases.

### Removing the files and Renaming them => (clean_files.py)

### Approach 2 for Topic Modeling: LDA

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

legal_stopwords = [
    "mr", "ms", "court", "judge", "justice",
    "evidence", "accused", "crown", "defence",
    "said", "did", "testified", "trial",
    "reasonable", "doubt", "officer",
    "police", "applicant"
]

vectorizer = CountVectorizer(
    stop_words="english",
    max_df=0.85,
    min_df=5,
    ngram_range=(1,2)
)

# Extend stopwords
vectorizer.stop_words_ = set(vectorizer.get_stop_words()).union(legal_stopwords)

X = vectorizer.fit_transform(texts)


In [None]:
from sklearn.decomposition import LatentDirichletAllocation

n_topics = 8

lda = LatentDirichletAllocation(
    n_components=n_topics,
    random_state=42,
    learning_method="batch"
)

lda.fit(X)


0,1,2
,"n_components  n_components: int, default=10 Number of topics. .. versionchanged:: 0.19  ``n_topics`` was renamed to ``n_components``",8
,"doc_topic_prior  doc_topic_prior: float, default=None Prior of document topic distribution `theta`. If the value is None, defaults to `1 / n_components`. In [1]_, this is called `alpha`.",
,"topic_word_prior  topic_word_prior: float, default=None Prior of topic word distribution `beta`. If the value is None, defaults to `1 / n_components`. In [1]_, this is called `eta`.",
,"learning_method  learning_method: {'batch', 'online'}, default='batch' Method used to update `_component`. Only used in :meth:`fit` method. In general, if the data size is large, the online update will be much faster than the batch update. Valid options: - 'batch': Batch variational Bayes method. Use all training data in each EM  update. Old `components_` will be overwritten in each iteration. - 'online': Online variational Bayes method. In each EM update, use mini-batch  of training data to update the ``components_`` variable incrementally. The  learning rate is controlled by the ``learning_decay`` and the  ``learning_offset`` parameters. .. versionchanged:: 0.20  The default learning method is now ``""batch""``.",'batch'
,"learning_decay  learning_decay: float, default=0.7 It is a parameter that control learning rate in the online learning method. The value should be set between (0.5, 1.0] to guarantee asymptotic convergence. When the value is 0.0 and batch_size is ``n_samples``, the update method is same as batch learning. In the literature, this is called kappa.",0.7
,"learning_offset  learning_offset: float, default=10.0 A (positive) parameter that downweights early iterations in online learning. It should be greater than 1.0. In the literature, this is called tau_0.",10.0
,"max_iter  max_iter: int, default=10 The maximum number of passes over the training data (aka epochs). It only impacts the behavior in the :meth:`fit` method, and not the :meth:`partial_fit` method.",10
,"batch_size  batch_size: int, default=128 Number of documents to use in each EM iteration. Only used in online learning.",128
,"evaluate_every  evaluate_every: int, default=-1 How often to evaluate perplexity. Only used in `fit` method. set it to 0 or negative number to not evaluate perplexity in training at all. Evaluating perplexity can help you check convergence in training process, but it will also increase total training time. Evaluating perplexity in every iteration might increase training time up to two-fold.",-1
,"total_samples  total_samples: int, default=1e6 Total number of documents. Only used in the :meth:`partial_fit` method.",1000000.0


In [None]:
import numpy as np

terms = vectorizer.get_feature_names_out()

def print_top_words(model, feature_names, n_top_words=15):
    for topic_idx, topic in enumerate(model.components_):
        print(f"\nTopic {topic_idx}:")
        top_indices = topic.argsort()[::-1][:n_top_words]
        top_terms = [feature_names[i] for i in top_indices]
        print(", ".join(top_terms))

print_top_words(lda, terms)



Topic 0:
sexual, sentence, offender, years, child, victim, children, offences, sentencing, conditional, order, pornography, conditional sentence, child pornography, age

Topic 1:
mr, officer, charter, search, para, right, applicant, reasonable, officers, information, rights, accused, arrest, 10, grounds

Topic 2:
mr, defendant, firearm, accused, possession, drugs, reasonable, para, said, bag, act, ammunition, street, freedom, firearms

Topic 3:
mr, sentence, offender, sentencing, years, offences, para, community, custody, order, days, ms, victim, conditional, months

Topic 4:
mr, reasonable, video, moore, doubt, mr moore, defence, ms, officer, reasonable doubt, force, accused, para, witness, assault

Topic 5:
accused, complainant, ms, mr, defendant, testified, said, sexual, told, doubt, reasonable, asked, reasonable doubt, assault, defence

Topic 6:
delay, defence, applicant, disclosure, 2024, days, dates, application, 11, jordan, matter, para, months, set, mr

Topic 7:
mr, said, murr

In [7]:
#Check number of documents for each topic

# Compute topic distribution for each document
doc_topic_dist = lda.transform(X)

# Assign dominant topic for each document
dominant_topics = np.argmax(doc_topic_dist, axis=1)

# Add to your dataframe
df['dominant_topic'] = dominant_topics

# Count how many cases are in each topic
topic_counts = df['dominant_topic'].value_counts().sort_index()
print("Number of cases per LDA topic:")
print(topic_counts)


NameError: name 'lda' is not defined

These clusters are now vetter defined, some early hypothesis for topic modeling could be:
- Topic 0: sexual offense
- Topic 1: administration of justice
- Topic 2: drug and weapons
- Topic 3: sentencing
- Topic 4: assault
- Topic 5: sexual assault
- Topic 6: trial delay 
- Topic 7: impaired driving

However, because criminal cases are complex, these topics could still be too broad for a final topic classification. A supplemental approach would be to cluster each cases inside these current clusters to obtain finer and more appropriate grouppings

In [None]:
# Keep only the rows in df corresponding to files we have
df = df[df['file_identifier'].isin(files)].reset_index(drop=True)

# Check
print(len(df), len(texts), X.shape[0])


510 385 385


In [None]:
# Strip spaces and lowercase everything
df['file_identifier_norm'] = df['file_identifier'].str.strip().str.lower()
files_norm = [f.strip().lower() for f in files]

# Keep only matching rows
df = df[df['file_identifier_norm'].isin(files_norm)].reset_index(drop=True)

# Check alignment
print(len(df), len(texts), X.shape[0])


510 385 385


In [None]:
import os

clean_folder = "/Users/iphonex/Downloads/Court-Cases-Text-Analytics/Ontario-Court-Cases/court_case_texts_cleaned"

# 1️⃣ Get the actual .txt files present in the folder
existing_files = [f for f in os.listdir(clean_folder) if f.endswith(".txt")]

# 2️⃣ Normalize names
existing_files_norm = [f.strip().lower() for f in existing_files]
df['file_identifier_norm'] = df['file_identifier'].str.strip().str.lower()

# 3️⃣ Keep only rows that have a corresponding file
df = df[df['file_identifier_norm'].isin(existing_files_norm)].reset_index(drop=True)

# 4️⃣ Read texts in the same order as df
texts = []
for fname in df['file_identifier']:
    file_path = os.path.join(clean_folder, fname)
    with open(file_path, encoding='utf-8') as f:
        texts.append(f.read())

# 5️⃣ Now check alignment
print("Rows in df:", len(df))
print("Number of texts:", len(texts))


Rows in df: 510
Number of texts: 510
