In [119]:
#Import libraries
import pandas as pd
from bs4 import BeautifulSoup
import requests
import time
import os
import zipfile
import os
import re


In [120]:
#Unzip the zip files
zip_path = "court_case_texts.zip"
extract_path = "court_case_texts"

with zipfile.ZipFile(zip_path, "r") as zip_ref:
    zip_ref.extractall(extract_path)

In [121]:
#Check number of text files
text_files = []

for root, dirs, files in os.walk("court_case_texts"):
    for file in files:
        if file.endswith(".txt"):
            text_files.append(os.path.join(root, file))

length = len(text_files)
print('Length of text files:', length)


Length of text files: 510


In [122]:
#Create the df for meta data associated
df = pd.read_csv('canlii_final_report_20.csv')
df

Unnamed: 0,Judge,Heard_Date,Released_Date,Case_Title,URL
0,C.A. Brannagan,27 November 2025,2025-12-31,R. v. M.T.,https://www.canlii.org/en/on/oncj/doc/2025/202...
1,Fergus ODonnell,Unknown,2025-12-30,R. v. J.G.,https://www.canlii.org/en/on/oncj/doc/2025/202...
2,S. Robichaud,"October 27 to October 31, November 12, 17-25, ...",2025-12-29,R. v. Laguerre,https://www.canlii.org/en/on/oncj/doc/2025/202...
3,S. Robichaud,"December 29, 2025",2025-12-29,R. v. Khosa,https://www.canlii.org/en/on/oncj/doc/2025/202...
4,S. G. Pratt,"4 November, 17 December, 2025",2025-12-24,R. v. Lachance,https://www.canlii.org/en/on/oncj/doc/2025/202...
...,...,...,...,...,...
505,Brock Jones,"June 13, 2023, and December 10, 2024",2025-01-06,R. v. Aden,https://www.canlii.org/en/on/oncj/doc/2025/202...
506,Brock Jones,"October 15-17, 28, and December 19, 2024",2025-01-06,R. v. A.B.,https://www.canlii.org/en/on/oncj/doc/2025/202...
507,H. Pringle,"December 6, 2024 [1]",2025-01-03,R. v. Williams,https://www.canlii.org/en/on/oncj/doc/2025/202...
508,Unknown,Unknown,2025-01-02,R. v. Wu,https://www.canlii.org/en/on/oncj/doc/2025/202...


In [123]:
#Cleaning function 
def clean_court_text_keep_paragraphs(text):
    # 1. Remove everything above "ONTARIO COURT OF JUSTICE"
    if "ONTARIO COURT OF JUSTICE" in text:
        text = "ONTARIO COURT OF JUSTICE\n" + text.split("ONTARIO COURT OF JUSTICE", 1)[1]

    # 2. Remove bracketed numbers like [16], [1], [2] etc.
    text = re.sub(r'\[\s*\d+\s*\]', '', text)
    
    # 3. Replace multiple line breaks with a single line break to keep paragraphs
    text = re.sub(r'\n{2,}', '\n', text)  # keep single line break between paragraphs
    
    # 4. Replace multiple spaces with a single space (within paragraphs)
    text = re.sub(r'[ ]{2,}', ' ', text)
    
    # 5. Lowercase for NLP (optional)
    text = text.lower()
    
    # 6. Strip leading/trailing spaces on each line
    text = '\n'.join([line.strip() for line in text.split('\n') if line.strip() != ''])
    
    return text

# Create folder for cleaned files
clean_folder = "court_case_texts_cleaned"
os.makedirs(clean_folder, exist_ok=True)

# Clean and save each file
for root, dirs, files in os.walk(extract_path):
    for file in files:
        if file.endswith(".txt"):
            path = os.path.join(root, file)
            with open(path, "r", encoding="utf-8", errors="ignore") as f:
                raw_text = f.read()
            
            cleaned_text = clean_court_text_keep_paragraphs(raw_text)
            
            # Save cleaned text
            cleaned_path = os.path.join(clean_folder, file)
            with open(cleaned_path, "w", encoding="utf-8") as f:
                f.write(cleaned_text)

print(f"All cleaned files saved in: {clean_folder}")

All cleaned files saved in: court_case_texts_cleaned


### TF-ID Approach for Topic Clustering

In [None]:


from sklearn.feature_extraction.text import TfidfVectorizer

texts = []

for file in files:
    with open(os.path.join(text_folder, file), "r", encoding="utf-8") as f:
        texts.append(f.read())

# TF-IDF representation
vectorizer = TfidfVectorizer(
    stop_words="english",
    max_df=0.85,
    min_df=5,
    ngram_range=(1,2)
)

X = vectorizer.fit_transform(texts)


In [None]:
from sklearn.cluster import KMeans

n_clusters = 8   # you can tune this
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
clusters = kmeans.fit_predict(X)

df["unsupervised_cluster"] = clusters


In [None]:
import numpy as np

terms = vectorizer.get_feature_names_out()

def print_top_terms_per_cluster(kmeans_model, n_terms=15):
    order_centroids = kmeans_model.cluster_centers_.argsort()[:, ::-1]
    
    for i in range(n_clusters):
        print(f"\nCluster {i}:")
        top_terms = [terms[ind] for ind in order_centroids[i, :n_terms]]
        print(", ".join(top_terms))

print_top_terms_per_cluster(kmeans)



Cluster 0:
defendant, evidence, officer, police, reasonable, breath, court, vehicle, doubt, did, alcohol, reasonable doubt, said, testimony, testified

Cluster 1:
cookies, captcha, canlii, performance, site, manage cookies, privacy policy, accept cookies, functionality, website, improve, manage, policy, help, use

Cluster 2:
sentence, mr, offender, sentencing, court, years, conditional, offence, victim, conditional sentence, community, firearm, offences, criminal, custody

Cluster 3:
complainant, evidence, accused, sexual, did, testified, ms, crown, assault, defence, court, consent, doubt, reasonable, trial

Cluster 4:
mr, ms, evidence, crown, did, testified, court, police, reasonable, doubt, officer, accused, reasonable doubt, said, time

Cluster 5:
mr, officer, charter, evidence, demand, police, applicant, accused, pc, counsel, grounds, breath, ito, affiant, reasonable

Cluster 6:
delay, trial, crown, jordan, defence, dates, court, applicant, disclosure, ceiling, days, 11, defence d

Cluster 1 seems to be a groupping of baddly scrapped cases.
After manual checking, all documents appearing in cluster 1 need to be removed from the data, as they do not contain any information regarding criminal cases.

### Removing the files and Renaming them

In [None]:
#Removing the files

files_to_remove = [
    "case_33.txt", "case_103.txt", "case_223.txt",
    "case_224.txt", "case_225.txt", "case_226.txt",
    "case_227.txt", "case_228.txt", "case_229.txt", "case_230.txt"
]

# Additional files to remove
additional_files = [
    'case_231.txt', 'case_232.txt', 'case_233.txt', 'case_234.txt', 'case_235.txt',
    'case_236.txt', 'case_237.txt', 'case_238.txt', 'case_239.txt', 'case_240.txt',
    'case_241.txt', 'case_242.txt', 'case_243.txt', 'case_244.txt', 'case_245.txt',
    'case_246.txt', 'case_247.txt', 'case_248.txt', 'case_249.txt', 'case_250.txt',
    'case_251.txt', 'case_252.txt', 'case_253.txt', 'case_254.txt', 'case_255.txt',
    'case_256.txt', 'case_257.txt', 'case_258.txt', 'case_259.txt', 'case_260.txt',
    'case_261.txt', 'case_262.txt', 'case_263.txt', 'case_264.txt', 'case_265.txt',
    'case_266.txt', 'case_267.txt', 'case_268.txt', 'case_269.txt', 'case_270.txt',
    'case_271.txt', 'case_272.txt', 'case_273.txt', 'case_274.txt', 'case_275.txt',
    'case_276.txt', 'case_277.txt', 'case_278.txt', 'case_279.txt', 'case_280.txt',
    'case_281.txt', 'case_282.txt', 'case_283.txt', 'case_284.txt', 'case_285.txt',
    'case_286.txt', 'case_287.txt', 'case_288.txt', 'case_289.txt', 'case_290.txt',
    'case_291.txt', 'case_292.txt', 'case_293.txt', 'case_294.txt', 'case_295.txt',
    'case_296.txt', 'case_297.txt', 'case_298.txt', 'case_299.txt', 'case_300.txt',
    'case_301.txt', 'case_302.txt', 'case_303.txt', 'case_304.txt', 'case_305.txt',
    'case_306.txt', 'case_307.txt', 'case_308.txt', 'case_309.txt', 'case_310.txt',
    'case_311.txt', 'case_312.txt', 'case_313.txt', 'case_314.txt', 'case_315.txt',
    'case_316.txt', 'case_317.txt', 'case_318.txt', 'case_319.txt', 'case_320.txt',
    'case_321.txt', 'case_322.txt', 'case_323.txt', 'case_324.txt', 'case_325.txt',
    'case_326.txt', 'case_327.txt', 'case_328.txt', 'case_329.txt', 'case_330.txt',
    'case_331.txt', 'case_332.txt', 'case_333.txt', 'case_334.txt', 'case_335.txt',
    'case_336.txt', 'case_337.txt', 'case_338.txt', 'case_339.txt', 'case_340.txt',
    'case_341.txt', 'case_342.txt', 'case_379.txt', 'case_388.txt', 'case_483.txt'
]

# Combine both lists
files_to_remove.extend(additional_files)

# Remove from texts and files
texts = [text for text, fname in zip(texts, files) if fname not in files_to_remove]
files = [fname for fname in files if fname not in files_to_remove]

indices_to_remove = [33, 103, 223, 224, 225, 226, 227, 228, 229, 230] + list(range(231, 343)) + [379, 388, 483]
df = df[~df.index.isin(indices_to_remove)].reset_index(drop=True)


In [None]:
#Cheking the shape of the csv
df.shape

(385, 7)

In [None]:
#Renaming the files for better identification 

def make_filename_from_case_title(case_title):
    # Lowercase
    name = case_title.lower()
    # Replace spaces and special characters with underscores
    name = re.sub(r'[^a-z0-9]+', '_', name)
    # Trim underscores at start/end
    name = name.strip('_')
    # Add .txt extension
    return name + ".txt"


clean_folder = "/Users/iphonex/Downloads/Court-Cases-Text-Analytics/Ontario-Court-Cases/court_case_texts_cleaned"

# Make a new column for the filename identifier
df['file_identifier'] = df['Case_Title'].apply(make_filename_from_case_title)

# Rename files on disk
for idx, row in df.iterrows():
    old_filename = f"case_{idx}.txt"  # original filename
    new_filename = row['file_identifier']
    old_path = os.path.join(clean_folder, old_filename)
    new_path = os.path.join(clean_folder, new_filename)
    if os.path.exists(old_path):
        os.rename(old_path, new_path)


files = df['file_identifier'].tolist()


### Approach 2 for Topic Modeling: LDA

In [126]:
from sklearn.feature_extraction.text import CountVectorizer

legal_stopwords = [
    "mr", "ms", "court", "judge", "justice",
    "evidence", "accused", "crown", "defence",
    "said", "did", "testified", "trial",
    "reasonable", "doubt", "officer",
    "police", "applicant"
]

vectorizer = CountVectorizer(
    stop_words="english",
    max_df=0.85,
    min_df=5,
    ngram_range=(1,2)
)

# Extend stopwords
vectorizer.stop_words_ = set(vectorizer.get_stop_words()).union(legal_stopwords)

X = vectorizer.fit_transform(texts)


In [127]:
from sklearn.decomposition import LatentDirichletAllocation

n_topics = 8

lda = LatentDirichletAllocation(
    n_components=n_topics,
    random_state=42,
    learning_method="batch"
)

lda.fit(X)


0,1,2
,"n_components  n_components: int, default=10 Number of topics. .. versionchanged:: 0.19  ``n_topics`` was renamed to ``n_components``",8
,"doc_topic_prior  doc_topic_prior: float, default=None Prior of document topic distribution `theta`. If the value is None, defaults to `1 / n_components`. In [1]_, this is called `alpha`.",
,"topic_word_prior  topic_word_prior: float, default=None Prior of topic word distribution `beta`. If the value is None, defaults to `1 / n_components`. In [1]_, this is called `eta`.",
,"learning_method  learning_method: {'batch', 'online'}, default='batch' Method used to update `_component`. Only used in :meth:`fit` method. In general, if the data size is large, the online update will be much faster than the batch update. Valid options: - 'batch': Batch variational Bayes method. Use all training data in each EM  update. Old `components_` will be overwritten in each iteration. - 'online': Online variational Bayes method. In each EM update, use mini-batch  of training data to update the ``components_`` variable incrementally. The  learning rate is controlled by the ``learning_decay`` and the  ``learning_offset`` parameters. .. versionchanged:: 0.20  The default learning method is now ``""batch""``.",'batch'
,"learning_decay  learning_decay: float, default=0.7 It is a parameter that control learning rate in the online learning method. The value should be set between (0.5, 1.0] to guarantee asymptotic convergence. When the value is 0.0 and batch_size is ``n_samples``, the update method is same as batch learning. In the literature, this is called kappa.",0.7
,"learning_offset  learning_offset: float, default=10.0 A (positive) parameter that downweights early iterations in online learning. It should be greater than 1.0. In the literature, this is called tau_0.",10.0
,"max_iter  max_iter: int, default=10 The maximum number of passes over the training data (aka epochs). It only impacts the behavior in the :meth:`fit` method, and not the :meth:`partial_fit` method.",10
,"batch_size  batch_size: int, default=128 Number of documents to use in each EM iteration. Only used in online learning.",128
,"evaluate_every  evaluate_every: int, default=-1 How often to evaluate perplexity. Only used in `fit` method. set it to 0 or negative number to not evaluate perplexity in training at all. Evaluating perplexity can help you check convergence in training process, but it will also increase total training time. Evaluating perplexity in every iteration might increase training time up to two-fold.",-1
,"total_samples  total_samples: int, default=1e6 Total number of documents. Only used in the :meth:`partial_fit` method.",1000000.0


In [128]:
import numpy as np

terms = vectorizer.get_feature_names_out()

def print_top_words(model, feature_names, n_top_words=15):
    for topic_idx, topic in enumerate(model.components_):
        print(f"\nTopic {topic_idx}:")
        top_indices = topic.argsort()[::-1][:n_top_words]
        top_terms = [feature_names[i] for i in top_indices]
        print(", ".join(top_terms))

print_top_words(lda, terms)



Topic 0:
sexual, sentence, offender, years, child, victim, children, offences, sentencing, conditional, order, pornography, conditional sentence, child pornography, age

Topic 1:
mr, officer, charter, search, para, right, applicant, reasonable, officers, information, rights, accused, arrest, 10, grounds

Topic 2:
mr, defendant, firearm, accused, possession, drugs, reasonable, para, said, bag, act, ammunition, street, freedom, firearms

Topic 3:
mr, sentence, offender, sentencing, years, offences, para, community, custody, order, days, ms, victim, conditional, months

Topic 4:
mr, reasonable, video, moore, doubt, mr moore, defence, ms, officer, reasonable doubt, force, accused, para, witness, assault

Topic 5:
accused, complainant, ms, mr, defendant, testified, said, sexual, told, doubt, reasonable, asked, reasonable doubt, assault, defence

Topic 6:
delay, defence, applicant, disclosure, 2024, days, dates, application, 11, jordan, matter, para, months, set, mr

Topic 7:
mr, said, murr