In [1]:
#Import libraries
import pandas as pd
from bs4 import BeautifulSoup
import requests
import time
import os
import zipfile
import os
import re


In [2]:
# Load the case metadata CSV (one row per case) straight from GitHub
csv_url = "https://raw.githubusercontent.com/romane-lg/Ontario-Court-Cases/main/data/canlii_final_report_20.csv"
df = pd.read_csv(csv_url)
print(f"CSV loaded: {len(df)} rows")


# List the cleaned case files through the GitHub contents API
api_url = "https://api.github.com/repos/romane-lg/Ontario-Court-Cases/contents/data/data_clean"
# requests applies no timeout by default; without one a stalled connection
# would block this cell indefinitely.
response = requests.get(api_url, timeout=30)
response.raise_for_status()

# Keep only the .txt entries of the directory listing
files = response.json()
txt_files = [f for f in files if f["name"].endswith(".txt")]
print(f"Found {len(txt_files)} txt files on GitHub")


# URL → txt mapping: each downloaded file is keyed by the URL found on its
# "Source URL:" header line, so it can later be matched to a CSV row.
url_to_text = {}

# One session for all downloads so the HTTP connection is reused across files
with requests.Session() as session:
    for file_info in txt_files:
        raw_url = file_info["download_url"]
        file_response = session.get(raw_url, timeout=30)
        # Fail loudly on an HTTP error; otherwise the error body would be scanned
        # like a case file, no "Source URL:" line would be found, and the case
        # would silently end up in the "missing" bucket.
        file_response.raise_for_status()
        text = file_response.text

        for line in text.splitlines():
            if line.lower().startswith("source url:"):
                # Split only on the first colon: the URL itself contains colons
                url = line.split(":", 1)[1].strip()
                url_to_text[url] = text
                break

# Line the downloaded texts up with the CSV rows (same order as df)
case_urls = df["URL"]

# A row whose URL has no downloaded file gets an empty-string placeholder,
# which keeps `texts` positionally aligned with `df`.
texts = [url_to_text.get(case_url, "") for case_url in case_urls]
missing_urls = [case_url for case_url in case_urls if case_url not in url_to_text]

df["full_text"] = texts

matched_count = len(df) - len(missing_urls)
print(f"Matched texts : {matched_count}")
print(f"Missing texts : {len(missing_urls)}")

CSV loaded: 510 rows
Found 509 txt files on GitHub
Matched texts : 509
Missing texts : 1


### Sentence-BERT embeddings - Topic Modeling

In [3]:
from sentence_transformers import SentenceTransformer
import hdbscan
import umap
import matplotlib.pyplot as plt

In [4]:
# Encode every case text into a dense sentence embedding
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'  # lightweight, fast for 500+ cases

model = SentenceTransformer(EMBEDDING_MODEL_NAME)
# `texts` is row-aligned with df and contains "" for cases with no matching file.
# NOTE(review): sentence-transformers models truncate inputs longer than
# model.max_seq_length — confirm long judgments are adequately represented.
embeddings = model.encode(texts, show_progress_bar=True)


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Batches:   0%|          | 0/16 [00:00<?, ?it/s]

In [None]:
#Dimensionality reduction
# UMAP is stochastic: a fixed random_state makes the 2-D layout (and therefore
# the figure discussed in the text below) reproducible across runs.
umap_embeddings = umap.UMAP(
    n_neighbors=15,
    n_components=2,
    metric='cosine',
    random_state=42,
).fit_transform(embeddings)

fig, ax = plt.subplots(figsize=(10, 8))
ax.scatter(umap_embeddings[:, 0], umap_embeddings[:, 1], s=10)
ax.set_title("UMAP projection of legal case embeddings")
ax.set_xlabel("UMAP dimension 1")
ax.set_ylabel("UMAP dimension 2")
plt.show()



OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


The Sentence-BERT embedding approach separated the cases into 3-4 groups. There is one larger cluster on the right, a medium one, a smaller one, and one that seems to consist of outliers. However, since our sample is too small to represent all possible criminal offenses, we choose to go with 4 clusters to represent the different levels of case complexity.

In [None]:
# Density-based clustering on the full-dimensional embeddings (not the 2-D UMAP view)
clusterer = hdbscan.HDBSCAN(
    min_cluster_size=5,
    metric='euclidean',
    cluster_selection_method='eom',
)
cluster_labels = clusterer.fit_predict(embeddings)

# One label per CSV row
df['Cluster'] = cluster_labels

# Label -1 marks noise points and is not counted as a cluster
n_clusters = len(set(cluster_labels) - {-1})
print(f"Number of clusters found: {n_clusters}")
print(df[['Case_Title', 'Cluster']].head(10))

In [None]:
# Colour the 2-D UMAP projection by HDBSCAN cluster label
fig, ax = plt.subplots(figsize=(10, 8))
points = ax.scatter(
    umap_embeddings[:, 0],
    umap_embeddings[:, 1],
    c=cluster_labels,
    cmap='tab20',
    s=20,
)
fig.colorbar(points, ax=ax, label='Cluster')
ax.set_title("HDBSCAN clusters of legal cases")
plt.show()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

#Extract the top keywords per cluster
def get_top_keywords(texts, top_n=10):
    """Return the terms with the highest summed TF-IDF weight across `texts`.

    Parameters
    ----------
    texts : iterable of str
        The documents of one cluster.
    top_n : int, default 10
        Maximum number of terms to return.

    Returns
    -------
    numpy.ndarray
        Terms ordered from highest to lowest summed TF-IDF score. Empty when
        no term survives the vectorizer's filtering.
    """
    # English stop words are removed; a term is kept only if it occurs in at
    # least 2 documents and in at most 80% of them.
    vectorizer = TfidfVectorizer(stop_words='english', max_df=0.8, min_df=2)
    try:
        X = vectorizer.fit_transform(texts)
    except ValueError:
        # TfidfVectorizer raises ValueError when the vocabulary ends up empty
        # (e.g. a tiny cluster, or one made of near-identical documents where
        # every term exceeds max_df). Return no keywords instead of aborting
        # the caller's loop over clusters.
        return np.array([], dtype=object)
    feature_names = np.array(vectorizer.get_feature_names_out())

    # Sum TF-IDF scores per term across all documents
    tfidf_sums = X.sum(axis=0).A1
    # argsort is ascending, so reverse it before taking the first top_n
    top_indices = tfidf_sums.argsort()[::-1][:top_n]

    return feature_names[top_indices]

# Print the top TF-IDF keywords of every real cluster
for cluster_id in df['Cluster'].unique():
    # -1 is noise in HDBSCAN, not a cluster
    if cluster_id == -1:
        continue
    # Positions of the cases assigned to this cluster (texts and labels are row-aligned)
    member_positions = [pos for pos, lbl in enumerate(cluster_labels) if lbl == cluster_id]
    cluster_texts = [texts[pos] for pos in member_positions]
    top_words = get_top_keywords(cluster_texts, top_n=10)
    print(f"Cluster {cluster_id}: {', '.join(top_words)}")

**Cluster Analysis**:
- Cluster 4 (Sexual offenses, possibly involving minors): 
    * This cluster seems to involve sexual offenses, likely sexual assault cases, potentially involving minors (with words like mother, complainant) or family / household context
    * Evidence often comes from testimony, messages, or phone activity

- Cluster 1 (Violent crimes / firearm offenses): 
    * Keywords like firearm and officer suggest weapon-related offenses or violent crimes
    * sentence, years, delay -> procedural / sentencing context, probably serious criminal offenses 

- Cluster 5 (General criminal offenses / mixed cases): 
    * Generic criminal law terms (offender, accused, testified)
    * Seems like general sentencing cases, maybe property crimes or minor assaults

- Cluster 2 (Youth criminal offenses): 
    * YCJA refers to Youth Criminal Justice Act, so this cluster refers to youth criminal cases 
    * custody, probation, search also seem to refer to sentencing and procedural aspects of youth criminal cases 

- Cluster 0 (grouping of files): 
    * seems to be a grouping of boilerplate / numeric references 
    * needs to be investigated manually 

- Cluster 3 (sexual / assault offenses): 
    * Mentions of victims, mother, video, party → sexual or assault-related cases, possibly domestic or youth sexual offenses.
    * Overlaps somewhat with Cluster 4, but maybe more procedural focus or different type of sexual/assault cases.


In [None]:
# Inspect Cluster 0 by hand: select its rows from df
cluster_0_cases = df[df['Cluster'] == 0]

print(f"Total cases in Cluster 0: {len(cluster_0_cases)}\n")

# One line per case: original row index, case title and source URL
for idx, title, case_url in zip(
    cluster_0_cases.index,
    cluster_0_cases['Case_Title'],
    cluster_0_cases['URL'],
):
    print(f"Row {idx}: {title} -> {case_url}")


The cases in Cluster 0 are cases that have not been resolved yet, and they therefore share the same content. It makes sense that they have been grouped together.

In [None]:
# Cluster sizes, ordered by cluster label (the noise label -1 comes first)
cluster_counts = df['Cluster'].value_counts().sort_index()

print("Number of cases per cluster:\n")
for cluster_id in cluster_counts.index:
    print(f"Cluster {cluster_id}: {cluster_counts.loc[cluster_id]} cases")


Cluster 1 is the most populated, containing 336 cases. Other clusters, such as 3 and 4, have fewer cases, which aligns with our interpretation that Clusters 3 and 4 are related to sexual offenses — with Cluster 4 likely focusing more on youth cases and Cluster 3 on general assaults.

Now, we aim to sub-cluster cluster -1, 1 and 5 to identify more precise offense groups within each.

Regarding Cluster -1, these are the noise points identified by HDBSCAN. They represent cases that the algorithm could not confidently assign to any cluster, often because they are outliers or have ambiguous content.

### Topic Modeling 2nd iteration

In [None]:
# The SentenceTransformer `model` and the `get_top_keywords` helper defined in
# the earlier cells are reused here: reloading the same model and redefining an
# identical function would only shadow the existing objects.

# Only sub-cluster these main clusters
clusters_to_process = [-1, 1, 5]

# Keyed by main cluster label:
#   subcluster_results[cluster]  -> subcluster label of each case in that cluster
#   subcluster_keywords[cluster] -> {subcluster label: top TF-IDF keywords}
subcluster_results = {}
subcluster_keywords = {}

# Loop through selected clusters and sub-cluster
for cluster in clusters_to_process:
    cluster_mask = df['Cluster'] == cluster
    cluster_texts = [text for text, keep in zip(texts, cluster_mask) if keep]

    if len(cluster_texts) == 0:
        continue

    print(f"\nProcessing Cluster {cluster} ({len(cluster_texts)} cases)")

    # Generate embeddings for this cluster only.
    # Loop-local names are used on purpose: assigning to `embeddings` /
    # `umap_embeddings` here would overwrite the full-corpus arrays that the
    # first-iteration clustering and plotting cells rely on.
    sub_embeddings = model.encode(cluster_texts, show_progress_bar=True)

    # HDBSCAN sub-clustering
    subclusterer = hdbscan.HDBSCAN(
        min_cluster_size=3,
        metric='euclidean',
        cluster_selection_method='eom'
    )
    subcluster_labels = subclusterer.fit_predict(sub_embeddings)

    # Store subcluster labels (same order as the cluster's rows in df)
    subcluster_results[cluster] = subcluster_labels

    # Store top keywords per subcluster
    subcluster_keywords[cluster] = {}
    unique_subclusters = np.unique(subcluster_labels)

    for sc in unique_subclusters:
        if sc == -1:
            continue  # skip noise
        sc_texts = [text for text, lbl in zip(cluster_texts, subcluster_labels) if lbl == sc]
        top_words = get_top_keywords(sc_texts, top_n=10)
        subcluster_keywords[cluster][sc] = top_words
        print(f"  Subcluster {sc}: {len(sc_texts)} cases | Top words: {', '.join(top_words)}")

    # Optional: UMAP visualization (fixed seed so the layout is reproducible)
    sub_umap_embeddings = umap.UMAP(
        n_neighbors=15,
        n_components=2,
        metric='cosine',
        random_state=42
    ).fit_transform(sub_embeddings)

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.scatter(
        sub_umap_embeddings[:, 0],
        sub_umap_embeddings[:, 1],
        c=subcluster_labels,
        cmap='tab20',
        s=20
    )
    ax.set_title(f"Cluster {cluster} subclusters")
    plt.show()

# Write each main cluster's subcluster labels into its own df column
for parent_cluster, sub_labels in subcluster_results.items():
    column_name = f'Subcluster_{parent_cluster}'
    in_parent = df['Cluster'] == parent_cluster
    # Only the rows belonging to this main cluster are assigned
    df.loc[in_parent, column_name] = sub_labels

print("\nSub-clustering complete! Selected clusters now have subcluster labels and top keywords.")



In [None]:
#print the top keywords per sub-cluster

print("Top keywords per subcluster:\n")

for cluster in subcluster_keywords:
    print(f"Main Cluster {cluster}:")
    # Subcluster label of every case in this main cluster
    labels_in_cluster = subcluster_results[cluster]
    for sc, words in subcluster_keywords[cluster].items():
        # Count only the cases assigned to this subcluster, not the size of
        # the whole main cluster.
        n_cases = sum(1 for lbl in labels_in_cluster if lbl == sc)
        print(f"  Subcluster {sc} ({n_cases} cases): {', '.join(words)}")
    print()


**Sub-Cluster Analysis**

- Cluster -1:
    * Subcluster 0: Likely procedural or traffic/Charter-related cases, including vehicle offenses, driving under influence, or other legal procedural matters.
    * Subcluster 1: Probably general criminal cases, possibly minor assaults, property crimes, or mixed offenses; broadly procedural with no strong thematic signal.
    * Subcluster 2: Likely sexual offenses or threats, some involving minors, possibly online/technology-facilitated offenses.
    * Subcluster 3: Likely assault or sexual consent-related cases with an emphasis on procedural context (delay, records, consent); overlaps with the sexual-offense context.

- Cluster 5:
    * Subcluster 0: Likely child-related general criminal cases, maybe minor abuse or family-related criminal offenses. Procedural and sentencing context is strong.
    * Subcluster 1: Likely sexual offenses in social contexts, possibly assaults at parties or social gatherings, involving victims and witnesses.


In [None]:
# Preview the first rows, now carrying the full_text, Cluster and Subcluster_* columns added above
df.head()

In [None]:
# Save the clustered table (including the full_text column) without the row index.
# utf-8-sig prepends a UTF-8 byte-order mark — presumably so spreadsheet tools detect the encoding; confirm.
df.to_csv("clustered.csv", index=False, encoding="utf-8-sig")