# GPU Setup

In [2]:
# import tensorflow as tf
# gpus = tf.config.list_physical_devices("GPU")
# if gpus:
#     for gpu in gpus:
#         print("Found a GPU with the name:", gpu)
# else:
#     print("Failed to detect a GPU.")

In [1]:
import torch

# Check if GPU is available
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
else:
    device = torch.device("cpu")
    print("Using CPU")

Using GPU: NVIDIA GeForce RTX 3060


# Libraries

In [3]:
import pymongo
import tensorflow as tf
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer

# MongoDB Connection

In [4]:
# MongoDB Connection String
client = pymongo.MongoClient("mongodb+srv://priscillalicup:bs62gnFzjoBQnJ03@cluster0.uskxsig.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0")
db = client.supreme_court_jurisprudence
collection = db.year_2012_zip

# Tokens and Models

In [5]:
# Ensure GPU usage for SentenceTransformer
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device=device)

# Recursive Chunking Function

In [6]:
# Recursive Chunking
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
    separators=["\n\n", "\n", "(?<=\. )", " "],
    length_function=len
)

## Using TF (not successful yet)

In [6]:
# # Generate Embedding function using TensorFlow

# Makes nested array embeddings
# def generate_embedding(text: str) -> 'list[float]':
#     # inputs = tokenizer(text, return_tensors="tf", truncation=True, padding=True)
#     # outputs = model(inputs)
#     # return outputs.last_hidden_state[:, 0, :].numpy().tolist()
#     return model.encode(text).tolist()

# Working (?)
# def generate_embedding(text: str) -> list:
#     inputs = tokenizer(text, return_tensors="tf", truncation=True, padding=True)
#     outputs = model(inputs)
#     embeddings = outputs.last_hidden_state[:, 0, :].numpy()
#     return embeddings.flatten().tolist()

# Embedding Function

In [7]:
def generate_embedding(text):
  return model.encode(text).tolist()

## Trial for embedding 1 document in MongoDB

In [22]:
# # TRIAL
# from bson import ObjectId

# # Ensure you import ObjectId from bson to work with MongoDB document IDs
# doc_id = ObjectId('664e4e1e1fd27d4ee67ccb12')


# try:
#     # Find the specific document by its ObjectId
#     doc = collection.find_one({'_id': doc_id})
    
#     if doc:
        
#         # Chunk the document text
#         chunks = text_splitter.split_text(doc['text'])

#         # Generate embeddings for each chunk
#         chunk_embeddings = [generate_embedding(chunk) for chunk in chunks]

#         # Store the chunks and their embeddings in the document
#         doc['text_chunks'] = chunks
#         doc['text_chunk_embeddings'] = chunk_embeddings

#         collection.replace_one({'_id': doc['_id']}, doc)
#         print(f"Successfully embedded data from {str(doc['identifier'])}")
#         counter += 1
#     else:
#         print(f"No document found with ID: {doc_id}")
# except Exception as e:
#     print(f"An error occurred: {e}")
# finally:
#     print(f"Number of documents embedded: {counter}")

Successfully embedded data from A.C. No. 5098
Number of documents embedded: 717


## Embedding all documents in a DB (ONLY RUN IF NO EMBEDDINGS IN DATABASE YET)

In [11]:
# ONLY RUN IF NO EMBEDDINGS IN DATABASE YET

# Append embeddings to MongoDB
# -------------------------------------------- COMMENT OUT BELOW
# counter = 0
# try:
#     for doc in collection.find({'text': {"$exists": True}}):
#         # Check if embeddings already exist
#         if 'text_chunk_embeddings' in doc:
#             print(f"Embeddings already exist for document {str(doc['identifier'])}")
#             counter += 1
#             continue

#         # Chunk the document text
#         chunks = text_splitter.split_text(doc['text'])

#         # Generate embeddings for each chunk
#         chunk_embeddings = [generate_embedding(chunk) for chunk in chunks]

#         # Store the chunks and their embeddings in the document
#         doc['text_chunks'] = chunks
#         doc['text_chunk_embeddings'] = chunk_embeddings

#         collection.replace_one({'_id': doc['_id']}, doc)
#         print(f"Successfully embedded data from {str(doc['identifier'])}")
#         counter += 1
# except Exception as e:
#     print(f"An error occurred: {e}")
# finally:
#     print(f"Number of documents embedded: {counter}")

Successfully embedded data from A.C. No. 5098
Successfully embedded data from A.C. No. 6332
Successfully embedded data from A.C. No. 6903
Successfully embedded data from A.C. No. 7481
Successfully embedded data from A.C. No. 7880
Successfully embedded data from A.C. No. 7940
Successfully embedded data from A.M. No. MTJ-07-1667
Successfully embedded data from A.M. No. MTJ-08-1711
Successfully embedded data from A.M. No. MTJ-11-1781 (Formerly OCA I.P.I. No. 09-2161-MTJ)
Successfully embedded data from A.M. No. P-09-2720 [Formerly OCA I.P.I. No. 09-3259-P]
Successfully embedded data from A.M. No. P-11-2912
Successfully embedded data from A.M. No. P-11-2948 [Formerly OCA I.P.I. No. 09-3049-P]
Successfully embedded data from A.M. No. P-11-3002 (Formerly A.M. No. 11-9-96-MTCC)
Successfully embedded data from A.M. No. P-11-3003 (Formerly A.M. IPI No. 08-2970-P)
Successfully embedded data from A.M. No. P-11-3004 (Formerly OCA I.P.I. No. 10-3483-P)
Successfully embedded data from A.M. No. P-12-

## Trying a new Data Model (chunks.embedding) with only first 10 documents

In [8]:
# Append embeddings to MongoDB  (ONLY RUN IF NO EMBEDDINGS IN DATABASE YET)
# counter = 0
# max_documents = 10  # Limit to embed only 10 documents

# try:
#     for doc in collection.find({'text': {"$exists": True}}).limit(max_documents):
#         # Check if embeddings already exist
#         if 'chunks' in doc:
#             print(f"Embeddings already exist for document {str(doc['identifier'])}")
#             counter += 1
#             continue

#         # Chunk the document text
#         chunks = text_splitter.split_text(doc['text'])

#         # Generate embeddings for each chunk
#         chunk_embeddings = [generate_embedding(chunk) for chunk in chunks]

#         # Create chunks structure
#         chunk_structures = [{"text": chunk, "embedding": embedding} for chunk, embedding in zip(chunks, chunk_embeddings)]

#         # Store the chunks and their embeddings in the document
#         doc['chunks'] = chunk_structures

#         collection.replace_one({'_id': doc['_id']}, doc)
#         print(f"Successfully embedded data from {str(doc['identifier'])}")
#         counter += 1
# except Exception as e:
#     print(f"An error occurred: {e}")
# finally:
#     print(f"Number of documents embedded: {counter}")

In [34]:
# Create an index on the chunks.embedding field
# try:
#     collection.create_index([("chunks.embedding", "knnVector")])  # Adjust "knnVector" to your specific index type if necessary
#     print("Index created successfully.")
# except Exception as e:
#     print(f"An error occurred while creating the index: {e}")

An error occurred while creating the index: Error in specification { name: "chunks.embedding_knnVector", key: { chunks.embedding: "knnVector" } } :: caused by :: Unknown index plugin 'knnVector', full error: {'ok': 0.0, 'errmsg': 'Error in specification { name: "chunks.embedding_knnVector", key: { chunks.embedding: "knnVector" } } :: caused by :: Unknown index plugin \'knnVector\'', 'code': 67, 'codeName': 'CannotCreateIndex', '$clusterTime': {'clusterTime': Timestamp(1719184634, 5), 'signature': {'hash': b'Y\xea\xa4\x11\xbb\xe3\x13\x1f\xa2\x98\xf6\xa4\xe1%\x8d|M\xf6Z\xd4', 'keyId': 7343317336448827400}}, 'operationTime': Timestamp(1719184634, 5)}


In [9]:
# Function to generate embedding for the query string
query = "murder"
query_embedding = generate_embedding(query)

In [13]:
# Define your query vector (new_structure_vector_search)
# query_vector = query_embedding  # Example query vector
query_vector = [1.0] * 384  # Simple vector with all ones

# Define the query
search_query = {
    "$search": {
        "index": "new_structure_vector_search",  # Specify the name of the search index created in MongoDB Atlas
        "knnBeta": {  # Use knnBeta operator
            "path": "chunks.embedding",
            "vector": query_vector,
            "k": 5  # Number of nearest neighbors to return
        }
    }
}

# Execute the query and add logging
try:
    results = collection.aggregate([search_query])
    
    found_results = False
    for document in results:
        pprint.pprint(document)
        found_results = True
    
    if not found_results:
        print("No results found.")
except Exception as e:
    print(f"An error occurred during the search: {e}")

No results found.


### Debug

In [59]:
sample_document = collection.find_one()
if sample_document:
    print(sample_document["chunks"])
else:
    print("No documents found in the collection.")


[{'text': 'We resolve this disbarment complaint against Atty. Clemencio Sabitsana, Jr. who is charged of: (1) violating the lawyer’s duty to preserve confidential information received from his client; [1] and (2) violating the prohibition on representing conflicting interests. [2] In her complaint, Josefina M. Aniñon ( complainant ) related that she previously engaged the legal services of Atty. Sabitsana in the preparation and execution in her favor of a Deed of Sale over a parcel of land owned by her late common-law husband, Brigido Caneja, Jr. Atty. Sabitsana allegedly violated her confidence when he subsequently filed a civil case against her for the annulment of the Deed of Sale in behalf of Zenaida L. Cañete, the legal wife of Brigido Caneja, Jr. The complainant accused Atty. Sabitsana of using the confidential information he obtained from her in filing the civil case. Atty. Sabitsana admitted having advised the complainant in the preparation and execution of the Deed of Sale. Ho

In [60]:
query_embedding

[-0.015248102135956287,
 0.099979467689991,
 -0.0315321609377861,
 0.02460394985973835,
 0.04702599719166756,
 0.014502136968076229,
 0.09060142189264297,
 -0.017771301791071892,
 0.0156710222363472,
 0.03999178484082222,
 0.06094732508063316,
 -0.0033564986661076546,
 -0.023259736597537994,
 -0.019649822264909744,
 0.002269142307341099,
 -0.06077025458216667,
 0.004631070885807276,
 0.03211989998817444,
 -0.09286132454872131,
 0.05317011848092079,
 -0.09205028414726257,
 0.0058141849003732204,
 0.05718009173870087,
 -0.011085468344390392,
 -0.05227742716670036,
 0.050663962960243225,
 0.023286055773496628,
 0.020500484853982925,
 -0.04866144061088562,
 -0.07273395359516144,
 0.019128551706671715,
 -0.006409550551325083,
 0.03783567622303963,
 0.03438939154148102,
 -0.052403077483177185,
 -0.0487772673368454,
 0.08078708499670029,
 0.00227711396291852,
 0.004041902255266905,
 0.03397601842880249,
 -0.03531337156891823,
 -0.043336447328329086,
 -0.040920648723840714,
 0.0139144789427518

In [61]:
len(query_embedding)

384

In [63]:
import pprint

# Define the query
search_query = {
    "$search": {
        "index": "new_structure_vector_search",  # Specify the name of the search index created in MongoDB Atlas
        "knnBeta": {  # Use knnBeta operator
            "path": "chunks.embedding",
            "vector": query_embedding,
            "k": 5,  # Number of nearest neighbors to return
        }
    }
}

# Execute the query and add logging
try:
    results = collection.aggregate([search_query])
    
    found_results = False
    for document in results:
        pprint.pprint(document)  # Pretty-print the document for better readability
        found_results = True
    
    if not found_results:
        print("No results found.")
except Exception as e:
    print(f"An error occurred during the search: {e}")

No results found.


In [73]:
# List indexes in the collection
indexes = collection.list_indexes()
for index in indexes:
    pprint.pprint(index)

In [78]:
collection = db["supreme_court_jurisprudence.year_2012_zip"]

query_vector = query_embedding

# Log the query vector
print(f"Query vector: {query_vector}")

# Define the search query
search_query = {
    "$search": {
      "index": "TextSemanticSearch",  # Ensure this matches exactly with your index name
      "text": {
        "query": "murder",
        "path": {
          "wildcard": "*"
        }
      }
    }
}

# Log the search query
print(f"Search query: {search_query}")

# Execute the query and log results
try:
    results = collection.aggregate([search_query])
    
    found_results = False
    for document in results:
        pprint.pprint(document)  # Pretty-print the document for better readability
        found_results = True
    
    if not found_results:
        print("No results found.")
except Exception as e:
    print(f"An error occurred during the search: {e}")

Query vector: [-0.015248102135956287, 0.099979467689991, -0.0315321609377861, 0.02460394985973835, 0.04702599719166756, 0.014502136968076229, 0.09060142189264297, -0.017771301791071892, 0.0156710222363472, 0.03999178484082222, 0.06094732508063316, -0.0033564986661076546, -0.023259736597537994, -0.019649822264909744, 0.002269142307341099, -0.06077025458216667, 0.004631070885807276, 0.03211989998817444, -0.09286132454872131, 0.05317011848092079, -0.09205028414726257, 0.0058141849003732204, 0.05718009173870087, -0.011085468344390392, -0.05227742716670036, 0.050663962960243225, 0.023286055773496628, 0.020500484853982925, -0.04866144061088562, -0.07273395359516144, 0.019128551706671715, -0.006409550551325083, 0.03783567622303963, 0.03438939154148102, -0.052403077483177185, -0.0487772673368454, 0.08078708499670029, 0.00227711396291852, 0.004041902255266905, 0.03397601842880249, -0.03531337156891823, -0.043336447328329086, -0.040920648723840714, 0.013914478942751884, -0.012908323667943478, -0

In [74]:
explain_result = collection.aggregate([search_query]).explain()
pprint.pprint(explain_result)

AttributeError: 'CommandCursor' object has no attribute 'explain'

## Trying New Flattened Collection

In [21]:
original_collection = db["year_2012_zip"]
flattened_collection = db["flattened_year_2012_zip"]

# Process and restructure documents (only 10)
processed_count = 0
for doc in original_collection.find({}).limit(10):
    if 'chunks' in doc:
        for chunk in doc['chunks']:
            new_doc = {
                "document_id": doc["_id"],
                "title": doc.get("title"),
                "chunk_text": chunk.get("text"),
                "embedding": chunk.get("embedding")
            }
            flattened_collection.insert_one(new_doc)
    processed_count += 1
    if processed_count >= 10:
        break

print(f"Processed {processed_count} documents.")

Processed 10 documents.


In [24]:
query = "murder"

collection_name = "flattened_year_2012_zip"
collection = db[collection_name]

results = collection.aggregate([
  {"$vectorSearch": {
    "queryVector": generate_embedding(query),
    "path": "embedding",
    # This is an optimization parameter that tells MongoDB how many candidate matches to consider internally, before returning the final results.
    # Higher number can improve the accuracy of the results, but may increase the computation size
    "numCandidates": 100,  
    "limit": 5,
    "index": "default",
      }}
]);

for document in results:
    print(f'Title: {document["title"]},\nChunk: {document["chunk_text"]}\n\n')


Title: A.M. No. P-09-2720 [Formerly OCA I.P.I. No. 09-3259-P],
Chunk: referred the same to the Executive Judge of the RTC, Malolos City, Bulacan, for investigation, report and recommendation. [11] On April 14, 2010, in her Report, Executive Judge Herminia V. Pasamba, Malolos City, found respondent to be guilty of assisting litigants in posting of bail bond for a fee. [12] It was also revealed that respondent indeed solicited funds for her trip to Boracay to attend the stenographer's convention. Another subject of investigation of the Executive Judge is the death threat received by complainant judge and his family. There was no proof that respondent instigated the death threat. However, it was established that there was indeed a death threat against complainant and it coincidentally happened right after respondent reported back from her detail, and her and complainant judge's unsettled differences. Likewise, during the investigation, Melody M. Tolentino, Clerk of Court III, MTC, Angat, 

## Checking Mongo Connection

In [80]:
# Initialize MongoDB client
client = pymongo.MongoClient("mongodb+srv://priscillalicup:bs62gnFzjoBQnJ03@cluster0.uskxsig.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0")

# List databases to verify connection
databases = client.list_database_names()
print("Databases:", databases)

# Connect to the specific database
db = client["supreme_court_jurisprudence"]

# List collections in the database to verify existence
collections = db.list_collection_names()
print("Collections:", collections)

# Ensure the collection exists
collection = db["supreme_court_jurisprudence.year_2012_zip"]
if "supreme_court_jurisprudence.year_2012_zip" in collections:
    print("Collection found.")
else:
    print("Collection not found.")

Databases: ['sample_mflix', 'sample_training', 'sample_weatherdata', 'supreme_court_jurisprudence', 'admin', 'local']
Collections: ['year_2012_combined_json', 'year_2012', 'year_2012_zip']
Collection not found.


In [85]:
from pymongo import MongoClient
import pprint

# Initialize MongoDB client and get the collection
client = pymongo.MongoClient("mongodb+srv://priscillalicup:bs62gnFzjoBQnJ03@cluster0.uskxsig.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0")
db = client["supreme_court_jurisprudence"]

# Verify and connect to the correct collection
collections = db.list_collection_names()
print("Collections:", collections)

collection_name = "year_2012_zip"
if collection_name in collections:
    collection = db[collection_name]
    print(f"Collection '{collection_name}' found.")
else:
    print(f"Collection '{collection_name}' not found.")
    raise Exception(f"Collection '{collection_name}' not found in the database.")

# Define the search query
search_query = {
    "$search": {
      "index": "TextSemanticSearch",  # Ensure this matches exactly with your index name
      "text": {
        "query": "murder",
        "path": {
          "wildcard": "*"
        }
      }
    }
}

# Log the search query
print(f"Search query: {search_query}")

# Execute the query and log results
try:
    results = collection.aggregate([search_query])
    
    found_results = False
    for document in results:
        pprint.pprint(document)  # Pretty-print the document for better readability
        found_results = True
    
    if not found_results:
        print("No results found.")
except Exception as e:
    print(f"An error occurred during the search: {e}")


Collections: ['year_2012_combined_json', 'year_2012', 'year_2012_zip']
Collection 'year_2012_zip' found.
Search query: {'$search': {'index': 'TextSemanticSearch', 'text': {'query': 'murder', 'path': {'wildcard': '*'}}}}
{'_id': ObjectId('664e4e1e1fd27d4ee67cccf5'),
 'database': 'sc_jurisprudence',
 'date': '2012-07-30',
 'division': 'FIRST DIVISION',
 'identifier': 'G.R. No. 188612',
 'parties': 'PEOPLE OF THE PHILIPPINES, PLAINTIFF -APPELLEE, VS. DIOSDADO '
            'CAMAT AND MAMERTO DULAY, ACCUSED-APPELLANTS.',
 'presiding_justice': 'LEONARDO-DE CASTRO, J.',
 'text': 'This is an appeal from the Decision [1] dated February 27, 2009 of '
         'the Court of Appeals in CA-G.R. CR.-H.C. No. 02429, entitled People '
         'of the Philippines v. Diosdado Camat and Mamerto Dulay, which '
         'affirmed with modification the Joint Decision [2] dated October 9, '
         '2002 of the Regional Trial Court (RTC) of Urdaneta City, Branch 46 '
         'in Criminal Case Nos. U-1049

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



,
                            -0.00865868292748928,
                            -0.011272682808339596,
                            -0.008761114440858364],
                           [-0.039945088326931,
                            0.0564364418387413,
                            -0.010576611384749413,
                            0.014525269158184528,
                            0.07771705090999603,
                            0.11808127909898758,
                            0.08072792738676071,
                            -0.013938776217401028,
                            0.03921113163232803,
                            0.007933476939797401,
                            0.0695420503616333,
                            0.07060544192790985,
                            0.07289217412471771,
                            -0.05151824653148651,
                            -0.013316776603460312,
                            0.007408550009131432,
                            0.03406997397542,
        

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



# Query (compare each embedding)

## MongoDB Vector Search Index (currently not working)

In [84]:
# Function to generate embedding for the query string
query = "murder"
query_embedding = generate_embedding(query)

### MongoDB Vector Search (currently not working with chunk embeddings)

In [None]:
# results = collection.aggregate([
#     {
#         "$project": {
#             "identifier": 1,
#             "petitioner": 1,
#             "respondent": 1,
#             "text_chunk_embeddings": 1,
#             "chunk_match_scores": {
#                 "$map": {
#                     "input": "$text_chunk_embeddings",
#                     "as": "chunk_embedding",
#                     "in": {
#                         "$let": {
#                             "vars": {
#                                 "search_result": {
#                                     "$meta": {
#                                         "$vectorSearch": {
#                                             "queryVector": query_embedding,
#                                             "path": "$$chunk_embedding"
#                                         }
#                                     }
#                                 }
#                             },
#                             "in": "$$search_result.score"
#                         }
#                     }
#                 }
#             }
#         }
#     },
#     {
#         "$addFields": {
#             "max_score": { "$max": "$chunk_match_scores" }
#         }
#     },
#     {
#         "$sort": { "max_score": -1 }
#     },
#     {
#         "$limit": 5
#     }
# ])

# for document in results:
#     print(f'Identifier: {document["identifier"]},\nPetitioner: {document["petitioner"]},\nRespondent: {document["respondent"]}\n\n')

### This is the code for Vector Search for no chunks yet --> will try for chunking here (multiple arrays of embeddings)

In [93]:
client = pymongo.MongoClient("mongodb+srv://priscillalicup:bs62gnFzjoBQnJ03@cluster0.uskxsig.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0")
db = client["supreme_court_jurisprudence"]
collection_name = "year_2012_zip"
collection = db[collection_name]

In [102]:
''' ORIGINAL '''
# results = collection.aggregate([
#   {"$vectorSearch": {
#     "queryVector": generate_embedding(query),
#     "path": "text_chunk_embeddings",
#     # This is an optimization parameter that tells MongoDB how many candidate matches to consider internally, before returning the final results.
#     # Higher number can improve the accuracy of the results, but may increase the computation size
#     "numCandidates": 100,  
#     "limit": 5,
#     "index": "TextSemanticSearch",
#       }}
# ]);

pipeline = [
  {
    '$vectorSearch': {
      'index': 'new_structure_vector_search', 
      'path': 'chunks.embedding', 
      'queryVector': query_embedding,
      'numCandidates': 150, 
      'limit': 10
    }
  }, {
    '$project': {
      '_id': 0, 
      'plot': 1, 
      'title': 1, 
      'score': {
        '$meta': 'vectorSearchScore'
      }
    }
  }
]

# for document in results:
#     print(f'Identifier: {document["identifier"]}')
# run pipeline
result = collection.aggregate(pipeline)
# print results
for i in result:
    print(i)

In [22]:
# TRIAL: for multiple chunk embeddings

# Verify the query vector generation
query_vector = generate_embedding("your query text")
print("Generated Query Vector:", query_vector)
print("Query Vector Length:", len(query_vector))

# Verify the documents in the collection
sample_docs = collection.find({}, {"text_chunk_embeddings": 1}).limit(5)
for doc in sample_docs:
    print("Sample Document:", doc)

# Simplified pipeline to debug the search stage
# pipeline = [
#     {
#         "$search": {
#             "index": "TextSemanticSearch",
#             "knnBeta": {
#                 "vector": query_vector,
#                 "path": "text_chunk_embeddings",
#                 "k": 50  # Adjust the number of nearest neighbors
#             }
#         }
#     },
#     {
#         "$project": {
#             "text_chunk_embeddings": 1,
#             "score": {"$meta": "searchScore"}
#         }
#     }
# ]

pipeline = [
    {
        "$searchMeta": {
            "index": "TextSemanticSearch",
            "knnBeta": {
                "vector": generate_embedding(query),
                "path": "text_chunk_embeddings",
                "k": 5
            }
        }
    }
]


results = collection.aggregate(pipeline)

# Print the results
found_results = False
for document in results:
    found_results = True
    print("Search Result:", document)

if not found_results:
    print("No results found.")

try:
    results = collection.aggregate(pipeline)

    for document in results:
        print(f'Identifier: {document["identifier"]},\nTitle: {document["title"]},\nParties: {document["parties"]}\n\n')

except Exception as e:
    print(f"An error occurred: {e}")

Generated Query Vector: [0.08394836634397507, 0.060505688190460205, -0.0451180636882782, 0.08733320981264114, -0.09565097838640213, 0.06681075692176819, 0.14764899015426636, 0.07552929222583771, 0.0356975756585598, -0.036975663155317307, 0.07756901532411575, -0.043951213359832764, 0.08388344198465347, -0.08083121478557587, -0.035958219319581985, 0.033973328769207, 0.014858534559607506, -0.08260113000869751, -0.07135482132434845, -0.01550315972417593, -0.003360736882314086, 0.11787575483322144, -0.016425956040620804, 0.05040014907717705, -0.01109984889626503, 0.03489190712571144, -0.04431283101439476, 0.03816388547420502, 0.05693529546260834, -0.006272321101278067, -0.12311681360006332, 0.038386546075344086, 0.06876232475042343, 0.0719674602150917, 0.06783567368984222, -0.05415000393986702, -0.03027142956852913, -0.04614860191941261, 0.01999666728079319, -0.0669698715209961, 0.006086792331188917, -0.07340560853481293, 0.05468913912773132, 0.0018096595304086804, -0.016428882256150246, -0

In [25]:
pipeline = [
    {
        "$search": {
            "index": "TextSemanticSearch",
            "knnBeta": {
                "vector": generate_embedding(query),
                "path": "text_chunk_embeddings",
                "k": 5
            }
        }
    },
    {
        "$unwind": "$text_chunk_embeddings"
    },
    {
        "$limit": 5
    }
]

# Perform the aggregation
try:
    results = collection.aggregate(pipeline)

    # Print the results
    found_results = False
    for document in results:
        found_results = True
        print("Search Result:", document)
    
    if not found_results:
        print("No results found.")

except Exception as e:
    print(f"An error occurred: {e}")


No results found.


In [48]:
# Insert a test document with a single embedding
test_doc = {
    "text_chunk_embeddings": [
        [0.1] * 384
    ]
}

collection.insert_one(test_doc)

# Run the same pipeline
results = collection.aggregate(pipeline)

# Print the results
found_results = False
for document in results:
    found_results = True
    print("Search Result:", document)

if not found_results:
    print("No results found.")

No results found.


In [28]:
# Verify the Embeddings are Stored Correctly

# Retrieve a sample document to inspect the embeddings
sample_document = collection.find_one({}, {"text_chunk_embeddings": 1})
if sample_document and "text_chunk_embeddings" in sample_document:
    print(f"Sample document found. Number of embeddings: {len(sample_document['text_chunk_embeddings'])}")
else:
    print("No embeddings found in the sample document.")


Sample document found. Number of embeddings: 16


In [29]:
# Verify the Embedding Dimensions

# Check the dimensions of a stored embedding
if sample_document and "text_chunk_embeddings" in sample_document:
    sample_embedding = sample_document["text_chunk_embeddings"][0]
    print(f"Sample embedding dimension: {len(sample_embedding)}")
else:
    print("No embeddings found in the sample document.")

# Check the dimensions of the generated embedding
query_embedding = generate_embedding(query)
print(f"Generated embedding dimension: {len(query_embedding)}")


Sample embedding dimension: 384
Generated embedding dimension: 384


In [30]:
# Use a known embedding vector directly for testing
if sample_document and "text_chunk_embeddings" in sample_document:
    known_embedding = sample_document["text_chunk_embeddings"][0]  # Adjust based on the document structure

    pipeline = [
        {
            "$search": {
                "index": "TextSemanticSearch",
                "knnBeta": {
                    "vector": known_embedding,
                    "path": "text_chunk_embeddings",
                    "k": 5
                }
            }
        },
        {
            "$limit": 5
        }
    ]

    # Perform the aggregation
    try:
        results = collection.aggregate(pipeline)

        for document in results:
            print(document)

    except Exception as e:
        print(f"An error occurred: {e}")
else:
    print("No known embeddings found in the sample document to test.")


In [31]:
# Perform the aggregation with detailed debug information
try:
    pipeline = [
        {
            "$searchMeta": {
                "index": "TextSemanticSearch",
                "knnBeta": {
                    "vector": generate_embedding(query),
                    "path": "text_chunk_embeddings",
                    "k": 5
                }
            }
        }
    ]

    results = collection.aggregate(pipeline)

    for document in results:
        print(document)

except Exception as e:
    print(f"An error occurred: {e}")


{'count': {'lowerBound': 0}}


In [27]:
# Use a known embedding vector directly for testing
known_embedding = sample_document["text_chunk_embeddings"][0]  # Adjust based on the document structure

pipeline = [
    {
        "$search": {
            "index": "TextSemanticSearch",
            "knnBeta": {
                "vector": known_embedding,
                "path": "text_chunk_embeddings",
                "k": 5
            }
        }
    },
    {
        "$limit": 5
    }
]

# Perform the aggregation
try:
    results = collection.aggregate(pipeline)

    for document in results:
        print(document)

except Exception as e:
    print(f"An error occurred: {e}")


## FAISS

In [53]:
pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.8.0-cp39-cp39-win_amd64.whl.metadata (3.8 kB)
Downloading faiss_cpu-1.8.0-cp39-cp39-win_amd64.whl (14.5 MB)
   ---------------------------------------- 0.0/14.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/14.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/14.5 MB ? eta -:--:--
    --------------------------------------- 0.3/14.5 MB 2.9 MB/s eta 0:00:05
   --- ------------------------------------ 1.2/14.5 MB 8.1 MB/s eta 0:00:02
   ----- ---------------------------------- 2.1/14.5 MB 11.3 MB/s eta 0:00:02
   -------- ------------------------------- 3.1/14.5 MB 13.3 MB/s eta 0:00:01
   -------- ------------------------------- 3.1/14.5 MB 13.3 MB/s eta 0:00:01
   -------- ------------------------------- 3.1/14.5 MB 13.3 MB/s eta 0:00:01
   -------- ------------------------------- 3.1/14.5 MB 13.3 MB/s eta 0:00:01
   --------- ------------------------------ 3.5/14.5 MB 8.3 MB/s eta 0:00:02
   -----------

In [54]:
import faiss
import numpy as np

In [59]:
query = "murder"
query_embedding = np.array(generate_embedding(query)).astype('float32')

Current time: 11 seconds (900+ documents)

In [61]:
# might be too slow with large dataset because it loads the whole dataset into python 
# maybe needs to load in batches to load incrementally instead (?????????)

# Load embeddings from MongoDB
documents = collection.find({}, {"text_chunk_embeddings": 1, "text_chunks": 1, "_id": 1})
embeddings = []
ids = []
chunks = []
chunk_texts = []

for doc in documents:
    for i, emb in enumerate(doc['text_chunk_embeddings']):
        embeddings.append(emb)
        ids.append(doc['_id'])
        chunks.append(i)
        chunk_texts.append(doc['text_chunks'][i])

In [56]:
embeddings = np.array(embeddings).astype('float32')

In [57]:
# Create FAISS index
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

In [60]:
k = 5  # number of nearest neighbors
D, I = index.search(query_embedding.reshape(1, -1), k)

Using Euclidean distance (L2)

Distances:
- Distances close to 0 indicate high similarity.
- Larger distances indicate lower similarity.

In [62]:
# Print the results
print("Nearest Neighbors:")
for dist, idx in zip(D[0], I[0]):
    print(f"Document ID: {ids[idx]}, Chunk Index: {chunks[idx]}, Distance: {dist}")
    print(f"Chunk Text: {chunk_texts[idx]}")
    print("----")

Nearest Neighbors:
Document ID: 664e4e1e1fd27d4ee67ccc83, Chunk Index: 29, Distance: 0.9719419479370117
Chunk Text: shall be deemed guilty of homicide and be punished by reclusion temporal .
----
Document ID: 664e4e1e1fd27d4ee67ccdb8, Chunk Index: 16, Distance: 1.0149824619293213
Chunk Text: aggravating circumstances to justify the imposition of the death penalty. The Court agrees with the CA's conclusion that the evidence on record proves beyond reasonable doubt that accused-appellants Lolito Aquino, Renato Ramos, and Norberto (Jun) Adviento, together with Francisca Talaro, conspired to kill Atty. Melvin Alipio. Murder under Article 248 of the Revised Penal Code is defined as the unlawful killing of a person, which is not parricide or infanticide, attended by circumstances such as treachery or evident premeditation. The presence of any one of the circumstances enumerated in Article 248 of the Code is sufficient to qualify a killing as murder. [19] In People v. Sanchez, [20] the Court 