# GPU Setup

In [1]:
import torch

# Pick the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Report which device was selected (the GPU branch also prints the name of GPU 0).
if device.type == "cuda":
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
else:
    print("Using CPU")

Using GPU: NVIDIA GeForce RTX 3060


# Libraries

In [2]:
import pymongo
import tensorflow as tf
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer

# MongoDB Connection

In [3]:
# MongoDB connection
# The connection string embeds a username and password, so it is read from the
# MONGODB_URI environment variable instead of being stored in the notebook.
# NOTE: credentials that were previously hard-coded in this cell should be
# rotated, because they were saved (and possibly shared) with the notebook.
import os

mongodb_uri = os.environ.get("MONGODB_URI")
if not mongodb_uri:
    raise RuntimeError(
        "Set the MONGODB_URI environment variable to the MongoDB connection "
        "string before running this cell."
    )

client = pymongo.MongoClient(mongodb_uri)
db = client.supreme_court_jurisprudence
collection = db.year_2021_zip

# Model

In [4]:
# Load the MiniLM sentence-embedding model onto the device selected in the
# GPU setup cell (CUDA when available, otherwise CPU).
model = SentenceTransformer(
    model_name_or_path='sentence-transformers/all-MiniLM-L6-v2',
    device=device,
)

# Recursive Chunking Function

In [5]:
# Recursive chunking
# Separators are tried in order: paragraph breaks, line breaks, sentence ends
# (a ". " immediately before the split point), then single spaces.
# The sentence separator is a regex lookbehind, so `is_separator_regex=True`
# is required: without it the splitter escapes the pattern and searches for
# the literal characters "(?<=\. )", which never occur in the text.
# The raw string avoids the invalid "\." escape sequence.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,  # Adjusted chunk size for mini-LM (measured with len, i.e. characters)
    chunk_overlap=50,  # Adding some overlap to maintain context between chunks
    separators=["\n\n", "\n", r"(?<=\. )", " "],
    is_separator_regex=True,
    length_function=len,
)

# Embedding Function

In [6]:
def generate_embedding(text):
    """Encode ``text`` with the notebook-level ``model`` and return the result
    converted via ``.tolist()`` (plain Python values rather than an array)."""
    embedding = model.encode(text)
    return embedding.tolist()

# Embedding all documents in a DB

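**Run this cell only if the database does not yet contain embeddings.** Documents that already have a `text_chunk_embeddings` field are skipped.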

In [10]:
# ONLY RUN IF NO EMBEDDINGS IN DATABASE YET

# Append embeddings to MongoDB
# For every document that has a 'text' field: split the text into chunks,
# embed each chunk, and write the chunks and embeddings back to the document.
# Documents that already carry 'text_chunk_embeddings' are left untouched.
# -------------------------------------------- COMMENT OUT BELOW
counter = 0  # documents newly embedded AND successfully written back
skipped = 0  # documents that already had embeddings
failed = 0   # documents rejected by MongoDB for being too large
try:
    for doc in collection.find({'text': {"$exists": True}}):
        # Fall back to _id so a missing 'identifier' cannot abort the whole run.
        doc_label = str(doc.get('identifier', doc['_id']))

        # Check if embeddings already exist
        if 'text_chunk_embeddings' in doc:
            print(f"Embeddings already exist for document {doc_label}")
            skipped += 1
            continue

        # Chunk the document text
        chunks = text_splitter.split_text(doc['text'])

        # Generate embeddings for each chunk
        chunk_embeddings = [generate_embedding(chunk) for chunk in chunks]

        # Store the chunks and their embeddings in the document
        doc['text_chunks'] = chunks
        doc['text_chunk_embeddings'] = chunk_embeddings

        # Write the enriched document back; a document that is too large to
        # store is reported and counted as failed, and the loop moves on.
        try:
            collection.replace_one({'_id': doc['_id']}, doc)
        except pymongo.errors.DocumentTooLarge as e:
            print(f"Document too large for identifier {doc_label}: {e}")
            failed += 1
            continue

        print(f"Successfully embedded data from {doc_label}")
        counter += 1
except Exception as e:
    # Best-effort run: report the error and still print the summary below.
    print(f"An error occurred: {e}")
finally:
    print(f"Number of documents embedded: {counter}")
    print(f"Number of documents skipped (already embedded): {skipped}")
    print(f"Number of documents failed (too large): {failed}")

Embeddings already exist for document A.C. No. 11959
Embeddings already exist for document A.C. No. 12690
Embeddings already exist for document A.M. No. MTJ-16-1880 [Formerly OCA IPI No. 13-2565-MTJ]
Embeddings already exist for document A.M. No. P-21-015 [Formerly A.M. No. 14-2-24-MTC]
Embeddings already exist for document G.R. No. 192809
Embeddings already exist for document G.R. No. 200642
Embeddings already exist for document G.R. No. 202105
Embeddings already exist for document G.R. No. 203194
Embeddings already exist for document G.R. No. 205261
Embeddings already exist for document G.R. No. 205385
Embeddings already exist for document G.R. No. 205979
Embeddings already exist for document G.R. No. 207522
Embeddings already exist for document G.R. No. 207619
Embeddings already exist for document G.R. No. 208465
Embeddings already exist for document G.R. No. 211239
Embeddings already exist for document G.R. No. 211571
Embeddings already exist for document G.R. No. 211691
Embeddings