In [4]:
import fitz  # PyMuPDF

def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = []
    for page in doc:
        text.append(page.get_text())
    return '\n'.join(text)

# Extract text from PDF files
criminal_offenses_text = extract_text_from_pdf('../ilcs/720_ILCS_CRIMINAL_OFFENSES.pdf')
criminal_procedure_text = extract_text_from_pdf('../ilcs/725_ILCS_CRIMINAL_PROCEDURE.pdf')
safe_t_act_text = extract_text_from_pdf('../ilcs/Illinois Safe-T Act Full Text.pdf')

In [7]:
import spacy

# Load spacy model
nlp = spacy.load('en_core_web_sm')
nlp.max_length = 2000000

def split_text_into_chunks(text, chunk_size=1000000):
    # Split text into chunks of specified size
    return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]

def split_chunks_into_sentences(chunks):
    sentences = []
    for chunk in chunks:
        doc = nlp(chunk)
        sentences.extend([sent.text for sent in doc.sents])
    return sentences

# Split text into sentences
offenses_prechunks = split_text_into_chunks(criminal_offenses_text)
procedure_prechunks = split_text_into_chunks(criminal_procedure_text)
safe_t_act_prechunks = split_text_into_chunks(safe_t_act_text)

offenses_sentences = split_chunks_into_sentences(offenses_prechunks)
procedure_sentences = split_chunks_into_sentences(procedure_prechunks)
safe_t_act_sentences = split_chunks_into_sentences(safe_t_act_prechunks)

In [8]:
def chunk_sentences(sentences, max_sentences=5):
    chunks = []
    current_chunk = []
    for sentence in sentences:
        current_chunk.append(sentence)
        if len(current_chunk) >= max_sentences:
            chunks.append(' '.join(current_chunk))
            current_chunk = []
    if current_chunk:
        chunks.append(' '.join(current_chunk))
    return chunks

# Chunk sentences
offenses_chunks = chunk_sentences(offenses_sentences, max_sentences=5)
procedure_chunks = chunk_sentences(procedure_sentences, max_sentences=5)
safe_t_act_chunks = chunk_sentences(safe_t_act_sentences, max_sentences=5)

In [None]:
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
####     from chromadb.utils import TextEmbedding  #DefaultEmbeddingFunction

# Load the embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')

def get_embedding(text):
    return model.encode(text).tolist()

# Initialize Chroma
settings = Settings()
chroma_client = chromadb.Client(settings)

# Define your collection schema
collection_name = 'il-legal-data'
chroma_client.create_collection(collection_name)

# Get the collection
collection = chroma_client.get_collection(collection_name)

# Generate embeddings and add documents to the collection
documents = offenses_chunks + procedure_chunks + safe_t_act_chunks
metadata = [{'source': 'IL Criminal Offenses'}] * len(offenses_chunks) + [{'source': 'IL Criminal Procedure'}] * len(procedure_chunks) + [{'source': 'IL Safe-T Act'}] * len(safe_t_act_chunks)

embeddings = [get_embedding(doc) for doc in documents]
# embeddings = model.encode(documents, convert_to_tensor=True)

In [16]:
for doc, emb, meta in zip(documents, embeddings, metadata):
    collection.add(doc, emb, meta)

Add of existing embedding ID: LEXIS 225 (Ill. App. 
 Ct. 1st Dist. Mar. 10, 2004), modified, 347 Ill. App. 3d 587, 283 Ill.
Insert of existing embedding ID: LEXIS 225 (Ill. App. 
 Ct. 1st Dist. Mar. 10, 2004), modified, 347 Ill. App. 3d 587, 283 Ill.
Add of existing embedding ID: People v. Adawi, 231 Ill. App. 3d 896, 
173 Ill. Dec. 310, 596 N.E.2d 1189, 1992 Ill. App.
Insert of existing embedding ID: People v. Adawi, 231 Ill. App. 3d 896, 
173 Ill. Dec. 310, 596 N.E.2d 1189, 1992 Ill. App.
Add of existing embedding ID: App. 3d 863, 294 Ill. Dec. 434, 830 
N.E.2d 846, 2005 Ill. App. LEXIS 587 (Ill. App.
Insert of existing embedding ID: App. 3d 863, 294 Ill. Dec. 434, 830 
N.E.2d 846, 2005 Ill. App. LEXIS 587 (Ill. App.
Add of existing embedding ID: People v. Chenoweth, 2013 IL 
App (4th) 120334, 375 Ill. Dec. 202, 996 N.E.2d 1258, 2013 Ill. App. LEXIS 719 (Ill. App. Ct. 4th Dist. 
2013), rev'd, 2015 IL 116898, 388 Ill.
Insert of existing embedding ID: People v. Chenoweth, 2013 IL 
App 

In [20]:
# Query example
query = "search term or question"
query_embedding = get_embedding(query)
results = collection.query(query_embedding)  ##, k=5)

print(results)

# for result in results:
#     print(result['document'], result['metadata'])

{'ids': [['App. 3d 350, 50 Ill. Dec. 954, 420 \nN.E.2d 271, 1981 Ill. App. LEXIS 2459 (Ill. App.', 'App. 3d 350, 50 Ill. Dec. 954, \n420 N.E.2d 271, 1981 Ill. App. LEXIS 2459 (Ill. App.', 'App. 3d 447, 348 N.E.2d 199, 1976 Ill. App. LEXIS 2388 (Ill. App. Ct. 1st Dist. \n', 'App. 3d 638, 50 Ill. Dec. 130, 418 N.E.2d 1124, 1981 \nIll. App. LEXIS 2321 (Ill. App. Ct. 2d Dist. 1981). \n ', 'App. 3d 525, 47 Ill. Dec. 239, 414 N.E.2d \n1355, 1981 Ill. App. LEXIS 1973 (Ill. App.', 'App. 3d 525, 47 Ill. Dec. 239, 414 N.E.2d 1355, 1981 Ill. App. LEXIS 1973 (Ill. \nApp.', 'App. 3d 365, 183 Ill. Dec. 587, \n611 N.E.2d 1235, 1993 Ill. App. LEXIS 266 (Ill. App.', 'App. 3d 473, 2 Ill. Dec. 253, 357 N.E.2d 230, 1976 Ill. App. \nLEXIS 3316 (Ill. App. Ct. 4th Dist. 1976). \n', 'App. 3d 30, 2 Ill. Dec. 821, 357 N.E.2d \n1264, 1976 Ill. App. LEXIS 3435 (Ill. App.', 'App. 3d 798, 194 Ill. Dec. 102, 627 \nN.E.2d 218, 1993 Ill. App. LEXIS 1508 (Ill. App.']], 'distances': [[1.421753168106079, 1.42175316810607

In [25]:
# Query example
query = "How can I try to get pre-trial release for my friend that is charged with a violent offense?"
query_embedding = get_embedding(query)
results = collection.query(query_embedding)  ##, k=5)

print(results)

{'ids': [["A\npresumption in favor of pretrial release shall by applied by an\nHB3653 Enrolled\nLRB101 05541 RLC 50557 b\nPublic Act 101-0652\n\narresting officer\nin\nthe\nexercise\nof\nhis\nor\nher\ndiscretion\nunder this Section.\n (a-5) A person charged with an offense shall be allowed\ncounsel\nat\nthe\nhearing\nat\nwhich\npretrial\nrelease\nbail\nis\ndetermined under Article 110 of this Code. If the defendant\ndesires counsel for his or her initial appearance but is unable\nto obtain counsel, the court shall appoint a public defender or\nlicensed attorney at law of this State to represent him or her\nfor purposes of that hearing.\n (b) Upon initial appearance of a person before the court,\nthe The judge shall:\n(1) inform Inform the defendant of the charge against\nhim and shall provide him with a copy of the charge;\n(2) advise Advise the defendant of his right to counsel\nand if indigent shall appoint a public defender or licensed\nattorney\nat\nlaw\nof\nthis\nState\nto\nrepres