In [None]:
import torch
import ollama
import os
import json
import time
from openai import OpenAI
import argparse

# ANSI escape codes for colors
PINK = "\033[95m"
CYAN = "\033[96m"
YELLOW = "\033[93m"
NEON_GREEN = "\033[92m"
RESET_COLOR = "\033[0m"

# Path to save embeddings and file modification time
EMBEDDINGS_FILE = "embeddings.pt"
MOD_TIME_FILE = "vault_mod_time.json"


# Constants
EMBEDDINGS_DIR = "Embeddings"
MOD_TIME_FILE = os.path.join(EMBEDDINGS_DIR, "mod_times.json")


# Function to open a file and return its contents as a string
def open_file(filepath):
    with open(filepath, "r", encoding="utf-8") as infile:
        return infile.read()


# Function to get relevant context from the vault based on user input
def get_relevant_context(rewritten_input, vault_embeddings, vault_content, top_k=3):
    if vault_embeddings.nelement() == 0:  # Check if the tensor has any elements
        return []
    # Encode the rewritten input
    input_embedding = ollama.embeddings(
        model="mxbai-embed-large", prompt=rewritten_input
    )["embedding"]
    # Compute cosine similarity between the input and vault embeddings
    cos_scores = torch.cosine_similarity(
        torch.tensor(input_embedding).unsqueeze(0), vault_embeddings
    )
    # Adjust top_k if it's greater than the number of available scores
    top_k = min(top_k, len(cos_scores))
    # Sort the scores and get the top-k indices
    top_indices = torch.topk(cos_scores, k=top_k)[1].tolist()
    # Get the corresponding context from the vault
    relevant_context = [vault_content[idx].strip() for idx in top_indices]
    return relevant_context


# Function to interact with the Ollama model
def ollama_chat(
    user_input,
    system_message,
    vault_embeddings,
    vault_content,
    ollama_model,
    conversation_history,
):
    # Get relevant context from the vault
    relevant_context = get_relevant_context(
        user_input, vault_embeddings, vault_content, top_k=3
    )
    if relevant_context:
        # Convert list to a single string with newlines between items
        context_str = "\n".join(relevant_context)
        print("Context Pulled from Documents: \n\n" + CYAN + context_str + RESET_COLOR)
    else:
        print(CYAN + "No relevant context found." + RESET_COLOR)

    # Prepare the user's input by concatenating it with the relevant context
    user_input_with_context = user_input
    if relevant_context:
        user_input_with_context = context_str + "\n\n" + user_input

    # Append the user's input to the conversation history
    conversation_history.append({"role": "user", "content": user_input_with_context})

    # Create a message history including the system message and the conversation history
    messages = [{"role": "system", "content": system_message}, *conversation_history]

    # Send the completion request to the Ollama model
    response = client.chat.completions.create(model=ollama_model, messages=messages)

    # Append the model's response to the conversation history
    conversation_history.append(
        {"role": "assistant", "content": response.choices[0].message.content}
    )

    # Return the content of the response from the model
    return response.choices[0].message.content


# Function to save embeddings and file modification time
def save_embeddings(embeddings, mod_time):
    torch.save(embeddings, EMBEDDINGS_FILE)
    with open(MOD_TIME_FILE, "w") as f:
        json.dump({"mod_time": mod_time}, f)



# Function to load embeddings and file modification time
def load_embeddings(vault_name):
    embeddings_file = os.path.join(EMBEDDINGS_DIR, f"{vault_name}_embeddings.pt")
    if os.path.exists("embeddings.pt") and os.path.exists(MOD_TIME_FILE):
        embeddings = torch.load(embeddings_file)
        with open(MOD_TIME_FILE, "r") as f:
            mod_time_data = json.load(f)
        mod_time = mod_time_data.get(vault_name)
        return embeddings, mod_time
    return None, None



In [None]:

# Function to generate embeddings for vault content with checkpointing and logging
def generate_embeddings(vault_data, vault_name, start_idx=0):
    vault_embeddings = []
    progress_log = os.path.join(EMBEDDINGS_DIR, f"{vault_name}_progress.json")

    # Load progress log if exists
    if os.path.exists(progress_log):
        with open(progress_log, "r") as f:
            progress_data = json.load(f)
        start_idx = progress_data.get("last_processed_idx", 0)
        print(f"Resuming from index {start_idx}")
    else:
        start_idx = 0

    for entry in tqdm(vault_data, desc="Generating embeddings"):
        file_path = entry["file_name"]
        modification_time = entry["modification_time"]
        chunks = entry["chunks"]

        for chunk in chunks:
            content = chunk["text"]
            chunk_id = chunk[id]

            # Check GPU temperature before processing each content
            while (
                check_gpu_temperature() > 51
            ):  # Adjust temperature threshold as needed
                print("GPU temperature is too high. Pausing until temperature drops...")
                time.sleep(30)  # Sleep for 30 seconds before rechecking

            response = ollama.embeddings(model="mxbai-embed-large", prompt=content)
            if "embedding" in response:
                vault_embeddings.append(response["embedding"])
            else:
                print(f"Failed to get embedding for content: {content}")

            # Save checkpoint every 10 embeddings or at the end of each entry
            if len(vault_embeddings) % 10 == 0 or entry == vault_data[-1]:
                checkpoint_path = os.path.join(
                    EMBEDDINGS_DIR, f"{vault_name}_embeddings.pt"
                )
                # Check file modification time
                current_mod_time = os.path.getmtime(file_path)
                save_embeddings(
                    torch.tensor(vault_embeddings), current_mod_time, vault_name
                )
                vault_embeddings = []
                # with open(progress_log, "w") as f:
                #    json.dump({"last_processed_idx": id}, f)

    return vault_embeddings

In [None]:
with open("vault.json", "r", encoding="utf-8") as f:
    vault_data = json.load(f)


In [None]:
type(vault_data)

vault_data[1]

In [None]:
vault_name = "vault"
start_idx=0
EMBEDDINGS_DIR = "Embeddings"

In [None]:
vault_embeddings = []
progress_log = os.path.join(EMBEDDINGS_DIR, f"embeddings_generation_of_{vault_name}_progress.json")

In [None]:

# Load progress log if exists
if os.path.exists(progress_log):
    with open(progress_log, "r") as f:
        progress_data = json.load(f)
    last_processed_chunk_id = progress_data.get("last_processed_chunk_id")
    file_path = progress_data.get("file_path")
    print(f"Last processed chunk ID: {last_processed_chunk_id}")
    print(f"File path: {file_path}")
    # Optionally, you can use these values in your further processing
    # For example:
    # start_idx = progress_data.get("last_processed_idx", 0)
    # print(f"Resuming from index {start_idx}")
else:
    print(f"Progress log '{progress_log}' does not exist.")



In [None]:

progress_log ="Embeddings/embeddings_generation_of_vault_progress.json"

# Load progress log if exists
if os.path.exists(progress_log):
    print("exists")
    with open(progress_log, "r") as f:
        progress_data = json.load(f)
    start_chunk_id = progress_data.get("last_processed_chunk_id")
    start_file_name = progress_data.get("file_name")
    print(f"Resuming from index {start_chunk_id}")
    print(f"Resuming from index {start_file_name}")
else:
    start_idx = 0
    print(f"doesnt exist")

In [None]:
embeddings_file = os.path.join(EMBEDDINGS_DIR, "vault_embeddings.pt")
    # Load existing embeddings if they exist
if os.path.exists(embeddings_file):
    existing_embeddings = torch.load(embeddings_file)
    #print(type(existing_embeddings))
    print(len(existing_embeddings))

    chunk_id = "ec483f61ef5354dc35446dc5b75ee9b585a3ec8a175f4419602db36b9a533949"
    for dicts in existing_embeddings:
        if chunk_id in my_dict:
            print(chunk_id, "is in alredy generated embeddings file")
            pass
        else:
            content = chunk["text"]
            chunk_id = chunk["id"]
            print(content, chunk_id)
            """
            # Check GPU temperature before processing each content
            while (
                check_gpu_temperature() > 51
            ):  # Adjust temperature threshold as needed
                print("GPU temperature is too high. Pausing until temperature drops...")
                time.sleep(30)  # Sleep for 30 seconds before rechecking

            """

            response = ollama.embeddings(model="mxbai-embed-large", prompt=content)


Saving in txt format!


In [None]:
def save_embeddings_to_txt(new_embeddings):
    os.makedirs(EMBEDDINGS_DIR, exist_ok=True)
    embeddings_file = os.path.join(EMBEDDINGS_DIR, "vault_embeddings.txt")

    # Extract the embedding values (assuming they are tensors)
    embedding_values = [embedding.tolist() for embedding in new_embeddings]

    # Save the embedding values to a text file
    with open(embeddings_file, "w") as txt_file:
        for values in embedding_values:
            txt_file.write(" ".join(str(val) for val in values) + "\n")

# Usage example:
# new_embeddings = ...  # Your list of embeddings
# save_embeddings_to_txt(new_embeddings)

Saving in pt format!


In [None]:

# Function to save embeddings
def save_embeddings(new_embeddings):
    os.makedirs(EMBEDDINGS_DIR, exist_ok=True)
    embeddings_file = os.path.join(EMBEDDINGS_DIR, "vault_embeddings.pt")
    # Load existing embeddings if they exist
    if os.path.exists(embeddings_file):
        existing_embeddings = torch.load(embeddings_file)
        existing_embeddings.extend(new_embeddings)
        updated_embeddings = existing_embeddings
    else:
        updated_embeddings = new_embeddings

    # Save the updated embeddings
    torch.save(updated_embeddings, embeddings_file)


In [None]:
def save_embeddings_to_txt(new_embeddings):
    os.makedirs(EMBEDDINGS_DIR, exist_ok=True)
    embeddings_file = os.path.join(EMBEDDINGS_DIR, "vault_embeddings.txt")

    # Extract the embedding values (assuming they are tensors)
    embedding_values = [embedding.tolist() for embedding in new_embeddings]

    # Save the embedding values to a text file
    with open(embeddings_file, "w") as txt_file:
        for values in embedding_values:
            txt_file.write(" ".join(str(val) for val in values) + "\n")

# Usage example:
# new_embeddings = ...  # Your list of embeddings
# save_embeddings_to_txt(new_embeddings)

#the following is generate_embeddings function!

vault_embeddings = []
embeddings_file = os.path.join(EMBEDDINGS_DIR, "vault_embeddings.pt")
    # Load existing embeddings if they exist
if os.path.exists(embeddings_file):
    existing_embeddings = torch.load(embeddings_file)

from tqdm import tqdm

for entry in tqdm(vault_data, desc="Generating embeddings"):
    file_path = entry["file_name"]
    modification_time = entry["modification_time"]
    chunks = entry["chunks"]

    for chunk in chunks:
        content = chunk["text"]
        chunk_id = chunk["id"]
        print(content, chunk_id)
        """
        # Check GPU temperature before processing each content
        while (
            check_gpu_temperature() > 51
        ):  # Adjust temperature threshold as needed
            print("GPU temperature is too high. Pausing until temperature drops...")
            time.sleep(30)  # Sleep for 30 seconds before rechecking

        """
        for dicts in existing_embeddings:
            if chunk_id in my_dict:
                print(chunk_id, "is in alredy generated embeddings file")
                pass
            else:
                content = chunk["text"]
                chunk_id = chunk["id"]
                print(content, chunk_id)
                """
                # Check GPU temperature before processing each content
                while (
                    check_gpu_temperature() > 51
                ):  # Adjust temperature threshold as needed
                    print("GPU temperature is too high. Pausing until temperature drops...")
                    time.sleep(30)  # Sleep for 30 seconds before rechecking

                """

    
                response = ollama.embeddings(model="mxbai-embed-large", prompt=content)
                if "embedding" in response:
                    vault_embeddings.append({chunk_id: response["embedding"]})
                else:
                    print(f"Failed to get embedding for content: {content}")
                # Save checkpoint every 10 embeddings or at the end of each entry
                if len(vault_embeddings) % 10 == 0 or entry == vault_data[-1]:
                    checkpoint_path = os.path.join(
                        EMBEDDINGS_DIR, f"{vault_name}_embeddings.pt"
                    )
                    # Check file modification time
                    current_mod_time = os.path.getmtime(file_path)
                    checkpoint_path = os.path.join(EMBEDDINGS_DIR, f"{vault_name}_embeddings.pt")
                    
                    save_embeddings(vault_embeddings)
                    
                    vault_embeddings = []
                    # Construct the data to save
                    data_to_save = {
                        "last_processed_chunk_id": chunk_id,
                        "file_name": file_path  # Include the file_path here
                    }
                    # Save data to JSON file
                    with open(progress_log, "w") as f:
                        json.dump(data_to_save, f)

In [None]:


vault_files = [
    f for f in os.listdir() if f.startswith("vault") and f.endswith(".txt")
]

vault_files

In [None]:

for vault_file in vault_files:
    vault_name = os.path.splitext(vault_file)[0]
    vault_content = open_file(vault_file).splitlines()

    print(len(vault_content))



In [None]:
import torch

saved_embeddings = torch.load(r"Embeddings\vault_embeddings.pt")

In [None]:
saved_embeddings.shape

print("Size of first dimension:", saved_embeddings.size(0))  # Size of the first dimension
print("Size of second dimension:", saved_embeddings.size(1))  # Size of the second dimensionsaved_embeddings.size(2))  # Size of the third dimension
# Print the first 10 elements

print(saved_embeddings[1])

print(saved_embeddings[2])


print("Number of dimensions:", saved_embeddings.dim())  # Number of dimensions


In [None]:
if os.path.exists(
    r"C:\Users\deletable\OneDrive\easy-local-rag\Embeddings\vault_embeddings.pt"
) and len(vault_content) == saved_embeddings.size(0):
    print(f"Loaded saved embeddings for {vault_name}")
    vault_embeddings = saved_embeddings

In [None]:
old_saved_embeddings, saved_mod_time = load_embeddings(vault_name)


In [None]:

combined_tensor = None

combined_tensor = torch.cat((old_saved_embeddings, saved_embeddings))



In [None]:



if saved_embeddings.numel() > 0 and current_mod_time == saved_mod_time:
    print(f"Loaded saved embeddings for {vault_name}")
    vault_embeddings = saved_embeddings
else:
    print(f"Generating new embeddings for {vault_name}")
    vault_embeddings = generate_embeddings(vault_content, vault_name)
    vault_embeddings_tensor = torch.tensor(vault_embeddings).to(device)

    save_embeddings(vault_embeddings_tensor, current_mod_time, vault_name)




In [None]:



# Ensure the embeddings are in tensor format
if not isinstance(vault_embeddings, torch.Tensor):
    vault_embeddings_tensor = torch.tensor(vault_embeddings).to(device)
else:
    vault_embeddings_tensor = vault_embeddings




# Clean up NVML
pynvml.nvmlShutdown()


In [None]:

# Parse command-line arguments
parser = argparse.ArgumentParser(description="Ollama Chat")
parser.add_argument(
    "--model", default="llama3", help="Ollama model to use (default: llama3)"
)
args = parser.parse_args()

# Configuration for the Ollama API client
client = OpenAI(base_url="http://localhost:11434/v1", api_key="llama3")

# Load the vault content
vault_content = []
if os.path.exists("vault.txt"):
    with open("vault.txt", "r", encoding="utf-8") as vault_file:
        vault_content = vault_file.readlines()


# Conversation loop
conversation_history = []
system_message = "You are a helpful assistant that is an expert at extracting the most useful information from a given text"

while True:
    user_input = input(
        YELLOW
        + "Ask a question about your documents (or type 'quit' to exit): "
        + RESET_COLOR
    )
    if user_input.lower() == "quit":
        break

    response = ollama_chat(
        user_input,
        system_message,
        vault_embeddings_tensor,
        vault_content,
        args.model,
        conversation_history,
    )
    print(NEON_GREEN + "Response: \n\n" + response + RESET_COLOR)


In [None]:
import ollama

    
stream = ollama.chat(model='phi3', messages=[
    {'role': 'user', 'content': 'Why is the sky blue?'},
], stream=True)

for chunk in stream:
    print(chunk['message']['content'], end='', flush=True)



In [None]:
import json
from tqdm import tqdm

# Function to load vault content from JSON file and create a dictionary
def load_vault_content(json_file):
    vault_content = {}
    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)
        for entry in tqdm(data, desc="Loading vault content"):
            for chunk in entry["chunks"]:
                chunk_id = chunk["id"]
                chunk_text = chunk["text"].strip()
                vault_content[chunk_id] = chunk_text
    return vault_content

# Example usage
json_file = "vault.json"
vault_content = load_vault_content(json_file)

# Print the vault content to verify
type(vault_content)


In [None]:
print(vault_content{5})


In [None]:
import os
import torch

# Function to load embeddings and file modification time
def load_embeddings():
    embeddings_file = r"Embeddings\vault_embeddings.pt"
    if os.path.exists(embeddings_file):
        embeddings = torch.load(embeddings_file)
        return embeddings
    return None, None

vault_embeddings = load_embeddings()

len(vault_embeddings)

In [None]:
vault_content


In [None]:
rewritten_input = " pelvic in ammatory dz PIP proximal interphalangeal PKU phenylketonuria PMB postmenopausal haemorrhage PMS premenstrual syndrome PO per os (Latin for by mouth) PoP progester1-only pill POP plaster of Paris PPH postpartum haemorrhage PR per ?"




In [None]:
input_embedding = ollama.embeddings(
    model="mxbai-embed-large", prompt=rewritten_input
)["embedding"]

print (input_embedding)


In [None]:
# Create a tensor from input_embedding
input_embedding_tensor = torch.tensor(input_embedding).unsqueeze(0)


In [None]:

# Prepare embeddings and ids lists
embeddings_list = []
ids_list = []

for embedding_dict in vault_embeddings:
    for chunk_id, embedding in embedding_dict.items():
        embeddings_list.append(embedding)
        ids_list.append(chunk_id)


In [None]:
# Create a tensor from the embeddings list
vault_embeddings_tensor = torch.tensor(embeddings_list)

# Compute cosine similarity between the input and vault embeddings
cos_scores = torch.cosine_similarity(input_embedding_tensor, vault_embeddings_tensor)

# Adjust top_k if it's greater than the number of available scores
top_k = min(3, len(cos_scores))

# Sort the scores and get the top-k indices
top_indices = torch.topk(cos_scores, k=top_k)[1].tolist()

# Get the corresponding chunk_ids from the top indices
top_chunk_ids = [ids_list[idx] for idx in top_indices]

print(top_chunk_ids)


In [None]:
type(vault_content)

In [None]:
# Assume relevant_context is already a list of strings obtained from the get_relevant_context function
relevant_context = [vault_content[chunk_id].strip() for chunk_id in top_chunk_ids]

# Join all the relevant context strings into a single string with newlines
combined_relevant_context = "\n".join(relevant_context)

print(combined_relevant_context)


In [None]:
import json

# Assuming json_file contains valid JSON data
with open("vault.json", "r", encoding="utf-8") as json_file:
    data = json.load(json_file)

# Create a set to store encountered IDs
seen_ids = set()

# Iterate through the dictionaries
for file_dict in data:
    for chunk in file_dict["chunks"]:
        chunk_id = chunk["id"]
        if chunk_id in seen_ids:
            print(f"Duplicate ID found: {chunk_id}")
        else:
            seen_ids.add(chunk_id)


In [None]:
response = "The dictum"


print(response)
print(len(response))
sentence = response.split(".",maxsplit=1)[0]
response = "".join(response[1:])

print(sentence)

print(response)


In [None]:
from semantic_text_splitter import TextSplitter

# Maximum number of characters in a chunk
max_characters = 1000
# Optionally can also have the splitter not trim whitespace for you
splitter = TextSplitter(max_characters)
# splitter = TextSplitter(max_characters, trim=False)

chunks = splitter.chunks("your document text")

In [None]:
from transformers import pipeline

oracle = pipeline(model="medicalai/ClinicalBERT")
oracle(question="Where do I live?", context="My name is Wolfgang and I live in Berlin")
{'score': 0.9191, 'start': 34, 'end': 40, 'answer': 'Berlin'}

how to find collections in the milvus db


In [None]:
from pymilvus import connections, list_collections

def list_all_collections():
    # Connect to Milvus
    connections.connect("default", host="localhost", port="19530")
    
    # List all collections
    collections = list_collections()
    
    # Print the names of all collections
    print("Collections in Milvus:")
    for collection in collections:
        print(collection)

# Call the function to list all collections
list_all_collections()


In [None]:
from pymilvus import connections, Collection
import pandas as pd

def check_milvus_collection(collection_name):
    # Connect to Milvus
    connections.connect("default", host="localhost", port="19530")
    
    # Load the collection
    collection = Collection(collection_name)

    
    # Print some information about the collection
    print(f"Collection name: {collection.name}")
    print(f"Number of entities: {collection.num_entities}")
    
    # Load the first few entities to inspect
    results = collection.query(expr="id != ''", limit=10)
    
    # Convert the results to a DataFrame for easier viewing
    df = pd.DataFrame(results)
    print(df.head())

# Call the function with the name of your collection
check_milvus_collection("html_chunks")


how to delete a Collection


In [None]:
from pymilvus import utility

# Replace 'your_collection_name' with the name of your collection
utility.drop_collection("html_chunks")


In [None]:
from pymilvus import utility

# Replace 'your_collection_name' with the name of your collection
utility.drop_collection("example_collection")


see what are all the collection presnt

In [None]:
# List all collections
collections = utility.list_collections()

# Drop each collection
for collection_name in collections:
    if collection_name:
        print(f"Collection {collection_name} successfully.")
    else :
        print("no collections in the database")


In [None]:
# Connect to the collection
collection = Collection("html_chunks")
print(collection.schema)


In [None]:
from pymilvus import Collection, Index

# Connect to the collection
collection = Collection("html_chunks")

# Define the index parameters
index_params = {
    "index_type": "IVF_FLAT",
    "params": {"nlist": 128},
    "metric_type": "L2"
}

# Create the index
index = Index(collection, "embedding", index_params)


In [None]:
from pymilvus import connections, utility

connections.connect(
        alias="default",
        host="localhost",
        port="19530",
        max_receive_message_size=2**30,  # Set to 1 GB (adjust as needed)
        max_send_message_size=2**30,  # Set to 1 GB (adjust as needed)
    )

# Connect to the collection
collection = Collection("html_chunks")

# Load the collection into memory
collection.load()

# Define the query expression
expr = "serial == 0"  # Example expression to get entities where serial is 0

# Specify the fields you want to retrieve
output_fields = ["id", "text", "file_name", "modification_time", "serial", "embedding"]

# Execute the query
results = collection.query(expr, output_fields=output_fields)

# Print the results
for result in results:
    print(result)


In [None]:
len(result_chunks)

In [None]:
import json

with open('Semantic_vault.json', 'r') as file:
    data = json.load(file)


In [None]:
from pymilvus import (
    connections,
    Collection,
    FieldSchema,
    CollectionSchema,
    DataType,
    utility,
)

connections.connect("default", host="localhost", port="19530")

collection_name = "html_chunks"

id_field = FieldSchema(
    name="id", dtype=DataType.VARCHAR, is_primary=True, max_length=64
)
text_field = FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=65535)
file_name_field = FieldSchema(
    name="file_name", dtype=DataType.VARCHAR, max_length=65535
)
modification_time_field = FieldSchema(
    name="modification_time", dtype=DataType.FLOAT
)
serial_field = FieldSchema(name="serial", dtype=DataType.INT64)
embedding_field = FieldSchema(
    name="embedding", dtype=DataType.FLOAT_VECTOR, dim=1024
)  # Adjust dim as per your embeddings

schema = CollectionSchema(
    fields=[
        id_field,
        text_field,
        file_name_field,
        modification_time_field,
        serial_field,
        embedding_field,
    ],
    description="HTML text chunks",
)

if utility.has_collection(collection_name):
    existing_collection = Collection(collection_name)
    existing_schema = existing_collection.schema

    if existing_schema != schema:
        raise ValueError(
            "The existing collection schema does not match the provided schema."
        )

else:
    collection = Collection(name=collection_name, schema=schema)

In [None]:
from tqdm import tqdm

def insert_in_batches(collection, entities, batch_size=1000):
    total = len(entities["id"])
    for start in tqdm(range(0, total, batch_size), desc="Inserting batches"):
        end = min(start + batch_size, total)
        batch = {
            "id": entities["id"][start:end],
            "text": entities["text"][start:end],
            "file_name": entities["file_name"][start:end],
            "modification_time": entities["modification_time"][start:end],
            "serial": entities["serial"][start:end],
            "embedding": entities["embedding"][start:end]
        }
        collection.insert([
            batch["id"],
            batch["text"],
            batch["file_name"],
            batch["modification_time"],
            batch["serial"],
            batch["embedding"]
        ])
        print(f"Inserted batch {start // batch_size + 1} of {total // batch_size + 1}")



# Connect to Milvus
connections.connect("default", host="localhost", port="19530")

# Connect to the collection
collection = Collection("html_chunks")

# Prepare the data
entities = {
    "id": [],
    "text": [],
    "file_name": [],
    "modification_time": [],
    "serial": [],
    "embedding": []
}

dummy_embedding = [0.0] * 1024  # Dummy embedding with dimension 1024

for item in data:
    file_name = item["file_name"]
    modification_time = item["modification_time"]
    for chunk in item["chunks"]:
        entities["id"].append(chunk["id"])
        entities["text"].append(chunk["text"])
        entities["file_name"].append(file_name)
        entities["modification_time"].append(modification_time)
        entities["serial"].append(0)  # Assuming serial is 0 for all
        entities["embedding"].append(dummy_embedding)

# Insert data into Milvus in batches
insert_in_batches(collection, entities)


*updateingembeddings*

In [None]:
import torch

# Load the tensor from the .pt file
embeddings_tensor = torch.load('Embeddings\\mxbai-embed-large_vault_embeddings.pt')

# Convert the tensor back to a dictionary
embeddings_data = {f'embedding{i+1}': embedding.tolist() for i, embedding in enumerate(embeddings_tensor)}

print(embeddings_data)


In [None]:
import torch

# Load the tensor from the .pt file
embeddings_tensor = torch.load('Embeddings\\mxbai-embed-large_vault_embeddings.pt')

# Specify the number of top embeddings to load
top_n = 5

# Slice the tensor to get the top N embeddings
top_embeddings_tensor = embeddings_tensor[1]


print(top_embeddings_data)


In [None]:

# Slice the tensor to get the top N embeddings
top_embeddings_tensor = embeddings_tensor[1]


print(top_embeddings_tensor)


In [None]:
from pymilvus import Collection, Index

# Connect to the collection
collection = Collection("html_chunks")

# Define the index parameters
index_params = {
    "index_type": "IVF_FLAT",
    "params": {"nlist": 128},
    "metric_type": "L2"
}

# Create the index
index = Index(collection, "embedding", index_params)


In [None]:
from pymilvus import connections, Collection

connections.connect("default", host="localhost", port="19530")


In [None]:
from tqdm import tqdm
from pymilvus import connections, Collection

# Connect to Milvus
connections.connect("default", host="localhost", port="19530")

# Connect to the collection
collection = Collection("html_chunks")

# Load the collection into memory
collection.load()

# Prepare the update data
update_data = [
    {"id":"0089abb82785319d86a1f289b195ff11aec07d9af5b7689a43770e63490c20dc", "embedding":[-0.7828123569488525, -0.27709338068962097, 0.41511330008506775, 0.9022812247276306, -0.6322506070137024, 0.25069254636764526, -0.2767322361469269, 0.36378464102745056, 0.867172360420227, 0.29119518399238586, -0.15393337607383728, 0.25213611125946045, -0.030258454382419586, -0.27532655000686646, -0.8751985430717468, -0.3354091942310333, 0.08982079476118088, 0.40445676445961, -0.05755322426557541, -0.20795336365699768, 1.0453933477401733, 0.06438960880041122, 0.14159871637821198, -0.44276347756385803, -0.7332879900932312, 0.44683006405830383, -0.6011638641357422, -0.43703967332839966, 0.9407795071601868, 1.2600488662719727, -0.9180915951728821, -0.19145409762859344, 0.3684351146221161, -0.6113595962524414, -0.6186680793762207, -0.06702829897403717, 0.44037437438964844, 0.0577254593372345, -0.6090666055679321, -0.67686527967453, -0.14073005318641663, 0.791888952255249, 1.2983211278915405, -0.6797413229942322, -0.9377477169036865, 0.026104621589183807, 0.002500586211681366, 0.6734619736671448, -0.3600749373435974, -0.4859426021575928, 0.28935375809669495, -0.25887036323547363, -0.20125725865364075, -1.1320416927337646, 0.12654437124729156, -0.41371965408325195, 0.19836944341659546, 0.8286101818084717, 0.4089277386665344, -0.6084116697311401, 0.305874228477478, 0.8362943530082703, 1.3196208477020264, -0.4272621273994446, 0.14155367016792297, 1.1993752717971802, -0.7608587741851807, -0.24938003718852997, 0.4166712164878845, -0.34164080023765564, -0.7164101600646973, 1.3966069221496582, 0.1842719316482544, -0.3932875990867615, -0.08720587193965912, -0.0024885423481464386, -0.0983353704214096, 0.07610379159450531, 0.10145093500614166, 0.8245826959609985, 0.2206858992576599, 0.46613121032714844, 0.5964999794960022, 0.8553804755210876, 0.047342654317617416, -0.6342425346374512, 0.44146043062210083, -0.07817327231168747, 0.08509775251150131, -0.4073849320411682, -0.570923924446106, 0.4592619240283966, 0.059260960668325424, 0.8091621398925781, 0.16043901443481445, 0.6404651999473572, -0.7015365362167358, -0.4782419800758362, -0.09655443578958511, -0.11392519623041153, 0.3022889196872711, 1.1102445125579834, -0.3002280294895172, 0.5474172830581665, -0.6330676674842834, -0.5207372307777405, 0.35849103331565857, -0.6403241753578186, -0.6694198846817017, -0.2248014658689499, 0.6102932095527649, 0.6094659566879272, -0.2199753224849701, -0.12408298254013062, -0.25588691234588623, 0.7326384782791138, 0.11910589039325714, 0.010322792455554008, 0.17390039563179016, 0.39949628710746765, 0.2885652780532837, -0.4026263654232025, 0.6853739023208618, -0.8465296626091003, -1.062707781791687, -0.8048808574676514, 0.14866429567337036, -0.0200975239276886, 0.33638861775398254, -1.0627965927124023, -0.20597469806671143, -0.6585509181022644, 0.3964637517929077, -0.42271727323532104, 0.3422508239746094, -0.33586516976356506, 0.5504766702651978, 0.11341515183448792, 0.43062320351600647, 0.38355928659439087, 0.1631673276424408, 0.028494618833065033, 0.09033548831939697, 1.0734225511550903, 0.29145586490631104, 0.3386431932449341, 0.05573185533285141, 0.38668256998062134, -0.21481506526470184, 0.7326793670654297, 0.46962404251098633, 0.6254472136497498, 0.4054642915725708, -0.9464871287345886, -0.3126782178878784, -0.7804150581359863, -0.6361397504806519, 1.1802926063537598, 0.2841724455356598, 0.2932494282722473, -0.7587368488311768, 0.14325524866580963, -0.21557894349098206, -0.2519146502017975, 0.2892794609069824, 0.4353052079677582, 0.05374610424041748, -0.4044302999973297, 0.2772212028503418, 0.44893187284469604, 0.8168901801109314, -0.5159703493118286, 0.05127984285354614, -0.07906079292297363, 0.07874259352684021, -0.19184830784797668, 0.024883270263671875, -0.2994977831840515, -0.8908796310424805, 0.2894689738750458, -1.033141851425171, -1.1443676948547363, 0.027115348726511, -0.4413950443267822, -0.8843006491661072, 0.2633334994316101, 0.16155856847763062, -0.18143001198768616, -0.4340452253818512, -0.13534517586231232, 0.6675559878349304, 0.8297298550605774, -1.620424509048462, -0.0478750541806221, -0.7924292683601379, 0.10492290556430817, -1.1588443517684937, 0.5396803021430969, -0.23328852653503418, -0.5367871522903442, -0.8641131520271301, 0.04577129706740379, -0.9081952571868896, -0.6130248308181763, 0.21835562586784363, 0.00754740834236145, 0.5138764977455139, 0.385618656873703, 0.7304354310035706, -0.24746915698051453, 1.4227275848388672, 0.4114985466003418, -0.9272089004516602, -0.20446808636188507, 0.7575680613517761, -0.04947645217180252, 0.36690831184387207, -0.0753171369433403, -0.48955339193344116, 0.16400061547756195, 0.6950822472572327, 0.796556830406189, 0.10567907989025116, 1.092645287513733, -0.11592575162649155, 0.864288866519928, -0.13690586388111115, 0.5976158380508423, 0.15143530070781708, -0.10309546440839767, -0.28697264194488525, 1.5222625732421875, -0.029410772025585175, -0.27701815962791443, -0.2898039221763611, 0.4311669170856476, 0.2761283814907074, 0.01433992013335228, 0.4125245213508606, 0.6000123023986816, 0.5534193515777588, 0.31543514132499695, 0.8913508057594299, 0.711327850818634, 0.44659700989723206, -0.47488290071487427, -0.5959708094596863, -1.0232748985290527, -0.9673599004745483, -0.1320626437664032, -0.9237433075904846, 0.4658123254776001, -0.4616546034812927, -0.5537431240081787, -0.756135880947113, -0.2055044174194336, 0.8977864384651184, 0.9900720119476318, -1.073256015777588, -0.05154156684875488, -0.13122674822807312, 0.14755132794380188, 0.17332904040813446, 0.15981528162956238, -0.3830607533454895, 0.10646172612905502, 0.32112863659858704, -0.2840203642845154, -0.17590004205703735, -0.07977893203496933, -0.9956254959106445, -1.0929687023162842, 0.351550817489624, -0.3931729197502136, -0.45285850763320923, 0.6257066130638123, -0.15066084265708923, -0.5823119878768921, 1.1720454692840576, -0.6299132108688354, -0.6330057382583618, -0.7318710684776306, 0.6776159405708313, -0.1377057433128357, -0.07928688079118729, 0.574948787689209, -0.9032381176948547, -0.3774559199810028, -0.05741565674543381, -0.23508578538894653, -0.7008081674575806, -0.8436916470527649, -0.40787258744239807, -0.738305926322937, -0.20799748599529266, 0.06622184813022614, 0.4232769310474396, -0.5065373182296753, 0.19286414980888367, -0.6509308815002441, -0.24057413637638092, -0.04327694699168205, -1.1149649620056152, 0.32449042797088623, 0.04027672857046127, -0.009925730526447296, -0.00860348716378212, 0.035241805016994476, 0.961402416229248, 1.6239004135131836, -1.3194670677185059, -0.0843912810087204, 0.26747560501098633, 0.11685553193092346, -1.0494153499603271, 0.2685088515281677, 0.4690292179584503, -0.3495185673236847, -0.2709217667579651, -0.02148091048002243, -0.9055293798446655, 0.667119562625885, -0.4028824269771576, -0.058205414563417435, -0.6161108613014221, -0.08350023627281189, -0.055240094661712646, -0.7080701589584351, 0.9326834678649902, -0.7189999222755432, 0.015568023547530174, -0.6325771808624268, -0.48769688606262207, 0.4339579939842224, 0.06462158262729645, 1.0564589500427246, 0.012237174436450005, -0.28099945187568665, -0.09105395525693893, 1.1428085565567017, 0.003152664750814438, -0.365811288356781, 0.24031777679920197, 0.46469855308532715, 0.06441424041986465, -0.4390372335910797, 1.1711890697479248, -0.21651363372802734, -0.26394930481910706, 0.3631729483604431, -0.10780630260705948, 0.09698575735092163, -0.2340589463710785, -0.8349785804748535, 0.4507956802845001, 0.7355408668518066, -1.2916113138198853, 0.3771459460258484, -0.13844043016433716, 0.061914242804050446, 1.2471542358398438, 0.9479784965515137, 0.48663467168807983, 0.7191925644874573, 0.25303512811660767, -1.073193073272705, 0.16893619298934937, 0.835830569267273, 0.32706326246261597, 0.2556898891925812, 0.5823195576667786, -0.5997410416603088, 0.08555889874696732, 0.36556774377822876, -1.1713024377822876, -0.6900826096534729, 0.8452309370040894, 0.16897399723529816, 0.3335384726524353, -0.8841701745986938, 0.6400192379951477, -0.38393110036849976, 0.10511837154626846, 0.08530358970165253, -0.8921894431114197, -0.7649389505386353, -0.26284167170524597, 0.8040898442268372, -0.9934847354888916, 0.13527844846248627, -0.00034137204056605697, -0.13388687372207642, 0.7602734565734863, 0.19115203619003296, -0.5571087598800659, -1.1135334968566895, 1.2016669511795044, 1.0383418798446655, -0.05373189598321915, 0.0072942450642585754, 0.9476677179336548, -0.12187536060810089, 0.005340047180652618, 0.6960304379463196, -0.4471643567085266, 0.22554655373096466, 0.022479910403490067, -0.1371382176876068, 0.045324329286813736, 0.19908663630485535, -0.9888361692428589, -0.1708393692970276, -0.7932950258255005, 0.8040523529052734, 0.18941457569599152, 0.38939759135246277, 0.35048311948776245, -0.46439129114151, -0.6405993700027466, -0.44786250591278076, -0.8720367550849915, -0.4702659845352173, -1.059401273727417, 0.169228196144104, 0.6573431491851807, -0.15779396891593933, 0.41328224539756775, 0.4089913070201874, 0.32797449827194214, 0.2709249258041382, 0.5829678177833557, -0.7375612258911133, -0.8174483180046082, 0.48164117336273193, -0.28550973534584045, 0.438120573759079, -0.5372886657714844, 0.6216517686843872, 0.3498506546020508, -0.6326940059661865, 0.3916647136211395, 0.5389883518218994, -0.29212018847465515, -0.13139373064041138, 0.37030261754989624, 0.8290108442306519, 1.1654198169708252, 0.7834715843200684, 0.8289585709571838, 0.281147301197052, -0.10776208341121674, -0.22225645184516907, 1.0595126152038574, -0.47823986411094666, 0.558658242225647, 0.5952408313751221, 0.6904051303863525, -0.3643483817577362, 0.23490102589130402, -0.7168416976928711, 0.08201944082975388, -0.21190235018730164, 1.0949116945266724, -0.4312162697315216, -0.21731089055538177, 0.8046319484710693, -0.34188300371170044, -0.3342248201370239, 0.34150010347366333, 0.5644885897636414, 0.22329670190811157, 0.5374613404273987, 0.6638902425765991, -0.04241419583559036, 0.6314306855201721, -1.264382243156433, 0.17592641711235046, -0.08156540989875793, 0.585129976272583, -0.2954152226448059, 0.7137546539306641, 1.0163776874542236, 0.3597449064254761, -0.010748045518994331, -0.06771355122327805, -0.41270333528518677, 0.04926379770040512, 0.32313305139541626, -0.22411608695983887, -0.11568377912044525, -0.18440884351730347, -0.044718608260154724, -0.41990965604782104, 0.43218889832496643, -1.1676831245422363, -0.1465362012386322, -0.9497988820075989, -0.4961434602737427, 0.48876118659973145, 0.01912865787744522, -0.20438435673713684, -0.9661914110183716, -0.8275837898254395, 0.5886679291725159, -0.24904929101467133, -0.8241468071937561, -1.2563287019729614, 0.42271888256073, -0.21833771467208862, -0.18834301829338074, -0.35647517442703247, 0.21790724992752075, -0.160165473818779, 0.28961777687072754, 1.2050076723098755, 0.5882022380828857, 0.6677361130714417, -0.7755512595176697, -0.6698856949806213, 0.3085930347442627, 0.4423472285270691, -0.8612925410270691, -0.7086820006370544, 0.9501177072525024, -0.07202187180519104, 0.2954002618789673, -0.3254663944244385, 0.026675090193748474, 0.025175977498292923, -0.11680497229099274, -0.16134566068649292, -1.1736458539962769, -0.17515845596790314, -0.22735998034477234, -1.1782408952713013, 0.23448622226715088, 0.555097222328186, 1.0272732973098755, -0.12091849744319916, -0.49476057291030884, -0.6139771342277527, -0.3111691474914551, -0.6111134886741638, -0.1311991959810257, 0.08840072154998779, -0.7535379528999329, -0.423729807138443, 0.9652791619300842, -0.9326862096786499, -0.030271410942077637, -0.2672901749610901, 0.572633683681488, 0.5541442632675171, 0.06106690689921379, -0.25328925251960754, 0.10050307214260101, 0.2205144464969635, -0.0029218830168247223, 0.37970659136772156, 1.0893585681915283, -0.48965054750442505, -0.28187936544418335, -0.45352500677108765, 0.05686216801404953, 0.23815053701400757, 0.9108790755271912, -0.8947011232376099, -0.3918737471103668, 0.6078469753265381, -0.46113941073417664, -0.25768133997917175, 0.028962820768356323, 0.5540531873703003, 0.7688850164413452, 1.1604217290878296, -0.2545185089111328, -0.3496750295162201, -0.29876038432121277, -0.6075044870376587, 0.10560159385204315, -0.5743539929389954, -0.37509840726852417, 0.6476843953132629, -0.7451499700546265, 0.909752368927002, -0.4916507303714752, 1.3429774045944214, 0.8164942264556885, 0.5551773309707642, -1.136173963546753, -0.88737952709198, -0.18391817808151245, 0.7150078415870667, -1.311277985572815, -0.646000862121582, -0.28940948843955994, -0.8243460655212402, -0.6619224548339844, -0.6285117268562317, -0.4861736595630646, -0.544216513633728, -0.7019585371017456, 1.073026180267334, -0.3136158287525177, 0.46049872040748596, -0.30617138743400574, 0.011663413606584072, 0.03099004179239273, 0.46234264969825745, -0.17282962799072266, -0.22173409163951874, 0.1355520337820053, 0.05259473994374275, 0.35731786489486694, 1.1285725831985474, -0.5342696309089661, 0.36401888728141785, 0.19066904485225677, 0.5359245538711548, -0.1151067465543747, -0.7907774448394775, 0.34285256266593933, 0.03209856152534485, -0.18068496882915497, -0.7659915089607239, 0.37322714924812317, -0.5482969284057617, -0.9801268577575684, -0.7808899879455566, -0.13044746220111847, -0.03961855173110962, -0.12683773040771484, -0.46861815452575684, 0.16096775233745575, 0.48979854583740234, 0.07882393151521683, 0.18732787668704987, 1.2765270471572876, -0.3927789628505707, 0.32367679476737976, 1.0622624158859253, -0.48433879017829895, 0.14840534329414368, -1.0559195280075073, -0.5786721706390381, -0.19665919244289398, -0.23280677199363708, 0.07061266154050827, -0.6196882724761963, -0.20694640278816223, -0.21130990982055664, 0.14239314198493958, 0.7344328165054321, 0.21972158551216125, 0.2129114270210266, 0.15087349712848663, -0.0742127075791359, -0.41161251068115234, 0.4481009840965271, -0.6592992544174194, 0.3445277512073517, -0.06725998222827911, -1.0093241930007935, -0.39873993396759033, 1.0417283773422241, -0.2408204823732376, 0.1389636993408203, -1.0223289728164673, -0.5833547711372375, -0.39379027485847473, -0.45204979181289673, -0.6208251714706421, -0.8309588432312012, -0.4838869571685791, -0.8137210011482239, -0.02119668386876583, 0.38687852025032043, 0.07786019891500473, -0.3280502259731293, -0.47039806842803955, 0.05917166918516159, 0.006624951958656311, 0.47081729769706726, -0.653079628944397, 0.11899271607398987, -0.6001037955284119, -0.4698660373687744, 0.30799829959869385, 0.7207427620887756, -0.46462497115135193, -0.8700082302093506, -0.693430483341217, 1.330476999282837, -0.24625979363918304, 0.5697460174560547, -0.4665035307407379, -0.17081290483474731, 0.4641020596027374, 0.23030589520931244, -0.8305099606513977, 0.9053868055343628, 1.9754174947738647, -0.4045619070529938, 0.1424213945865631, 0.09013987332582474, -0.7641696929931641, -0.43348047137260437, 0.254970520734787, 0.39023301005363464, -0.6019237041473389, -0.11761455237865448, -0.10652146488428116, -0.619891345500946, -0.5585184097290039, -0.39253541827201843, -0.14359170198440552, -0.02045685052871704, 0.08113674819469452, -0.47777172923088074, -0.7695105075836182, 0.7146340608596802, -0.2978198528289795, -0.026410579681396484, -0.679672360420227, 0.5441126823425293, 0.4229040741920471, -0.4900919198989868, 0.10135972499847412, -0.16382941603660583, -0.4112054705619812, 0.5304985642433167, -0.422262579202652, 0.16837505996227264, -0.21960686147212982, 0.07616384327411652, -0.43301618099212646, -0.2341102510690689, 0.1110062301158905, 0.1406770646572113, 0.05324694514274597, 1.0372394323349, 0.43421730399131775, 0.20339930057525635, -0.0714939683675766, 0.542420506477356, -0.6587614417076111, -0.09479944407939911, -0.37277328968048096, -0.24222326278686523, 0.332794189453125, -0.21185892820358276, -0.34475618600845337, -0.8007310628890991, -0.6896945834159851, -0.027323976159095764, -0.37570860981941223, 0.10788998007774353, -0.5305667519569397, 0.30824315547943115, 0.7836369276046753, 0.5023688673973083, -0.6462723016738892, 0.8511403203010559, -0.39739662408828735, 0.6937305927276611, -0.08644947409629822, -0.4637433588504791, 0.18260690569877625, 0.33808445930480957, -0.17987027764320374, -0.006979070603847504, -0.12938007712364197, 0.20369721949100494, 0.20433661341667175, 0.36284542083740234, 0.5256175994873047, 0.339283287525177, -0.052791714668273926, 1.0536131858825684, -0.8461920022964478, 0.6701087951660156, 0.003691643476486206, 0.35625624656677246, 0.12332391738891602, -0.6573437452316284, -0.05978787690401077, -0.18037474155426025, 1.1996794939041138, 0.4259011745452881, 0.1476624608039856, 0.8975593447685242, 0.0557866096496582, -0.6699909567832947, 0.22699102759361267, 0.5546096563339233, 0.5099072456359863, 0.8300193548202515, 0.9974905252456665, -1.2952442169189453, 0.13909275829792023, 0.5345084071159363, 0.1415347009897232, 0.04033251851797104, 0.8311087489128113, 0.3860299587249756, -0.3479769825935364, 0.09967685490846634, 0.49587464332580566, -0.5085391998291016, 0.34436240792274475, -0.8622831702232361, 0.6187289953231812, -0.022166062146425247, -0.1321491301059723, 0.0720633715391159, -0.5104812383651733, 0.5399727821350098, 0.012869700789451599, -0.23510035872459412, 0.7465575933456421, 0.39066028594970703, 0.6935822367668152, -0.7358700037002563, -1.0268124341964722, -0.41557225584983826, 0.5272613763809204, 0.36830899119377136, -0.08851656317710876, -0.17620517313480377, 0.617353081703186, -0.6875759363174438, -0.9804831743240356, -0.699802577495575, 0.01282181590795517, 0.35673341155052185, 0.2078530192375183, 0.25989022850990295, -0.693341851234436, 0.328828901052475, 0.5572656989097595, -0.23232977092266083, 1.2812203168869019, -1.2430477142333984, -0.564706027507782, -0.6317421197891235, -0.05283181369304657, -1.0940709114074707, -0.08549457788467407, 0.42516979575157166, -0.4948086142539978, 0.7227563261985779, 0.4729738235473633, -0.29119032621383667, 0.08366723358631134, -0.35979485511779785, 0.8140606880187988, 0.657080352306366, 0.05383536219596863, 0.514124870300293, -1.2432138919830322, -0.7750939726829529, 0.4888150095939636, -0.3858971893787384, -0.8024272918701172, -0.44721364974975586, 0.5933371186256409, -0.46949437260627747, -0.1451990157365799, 0.4638333320617676, 0.709322988986969, 0.12979909777641296, 0.2850637435913086, -0.7411597371101379, 0.4654315710067749, 1.1913286447525024, -0.15056781470775604, 0.2919842600822449, -0.3896973133087158, 0.5137627124786377, 0.48908865451812744, -0.24162420630455017, -0.3973892629146576, -0.9040055274963379, 0.031022492796182632, -0.3406449854373932, 1.2701733112335205, 0.0048257410526275635, -0.053963929414749146, -0.09831936657428741, -0.2770848870277405, -0.5249350666999817, -0.6511154770851135, -0.5731566548347473, 0.04406149685382843, 0.9729418158531189, 1.0695326328277588, -0.1138397604227066, 0.743656575679779, -0.4605046510696411, -0.02104860544204712, 1.3837884664535522, 0.5222346186637878, -0.08241817355155945, 1.0239756107330322, 0.09018297493457794, 0.3138029873371124, -0.06273296475410461, 0.08198478817939758, -0.28149712085723877, -0.5058165788650513, -0.3699208199977875, -0.1168360561132431, -1.2314711809158325, -0.8921575546264648, -0.15453292429447174, 0.3605525493621826, 1.106641173362732, -0.3346380293369293, -1.2032334804534912, -0.27449116110801697, -0.10209019482135773, -1.43972909450531, -0.1842588484287262, -0.803627073764801, 0.5417440533638, -0.09008854627609253, -0.6216168999671936, 0.5643336176872253, -0.8007320165634155, 3.5021815299987793, 1.016852855682373, 0.5111916065216064, -0.06203165650367737, 0.3394480347633362, 1.3156750202178955, 0.7115001082420349, -0.6001495718955994, -0.583597719669342, -1.116575002670288, -0.23857945203781128, -0.5605703592300415, 0.20842598378658295, -0.12406795471906662, -0.07421883940696716, 0.3580460846424103, -0.47767090797424316, 1.2727248668670654, 1.255099892616272, -1.141005277633667, -1.0805120468139648, 0.3944804072380066, 0.24160261452198029, 0.4390304684638977, -0.42183157801628113, 0.5386890172958374, 0.5698067545890808, 0.2089328169822693, 0.3864547908306122, -0.28130751848220825, -0.1943625807762146, -1.0028913021087646, 0.1828201562166214, -0.2802167534828186, -0.21930807828903198, 0.9968680739402771, -0.17843198776245117, -0.8042709827423096, 0.026393946260213852, -0.45423999428749084, -0.060831256210803986, 0.674354076385498, 0.2047150731086731, 0.20027579367160797, 0.03743268549442291, 0.6377115249633789, -0.19428640604019165, -0.5843294262886047, 1.161407232284546, 0.003731667995452881, 0.9762888550758362, -0.1507333666086197, -0.3480280637741089, 0.3626687228679657, -0.5249403715133667, 0.0167134590446949, -0.1862075924873352, 0.03579982370138168, 0.012735847383737564, -0.37850436568260193, 0.5220426917076111, 0.8285608887672424, -0.9965909719467163, 0.003847181797027588, 0.07117022573947906, 0.2738375663757324, 0.2219880223274231, 1.2206134796142578, 0.09095965325832367, -0.43485915660858154, 0.24298234283924103, -0.3875679075717926, -0.4092029333114624, 0.014655247330665588, -0.14599615335464478, 0.21616807579994202, 0.4090018570423126, -0.20996913313865662, -0.08457768708467484, -0.0279887355864048, 0.48800235986709595, 0.002125512808561325, -0.7597452402114868, 0.16583509743213654, 0.5458806157112122, 0.28260183334350586, -0.4618920683860779, -0.4035499691963196, -0.7964058518409729, 0.48726287484169006, -0.016892068088054657, 0.7664936780929565, -0.039836250245571136, 0.3039447069168091, 0.6841843724250793]}
]

"""

# Specify the number of top embeddings to load
top_n = 5

# Iterate through the top N embeddings
for idx, embedding_dict in enumerate(tqdm(embeddings_tensor[:top_n])):
    for id, embedding in embedding_dict.items():
        update_data["id"] = id
        update_data["embedding"] = embedding

"""
print(update_data)

# Update the embeddings in Milvus with tqdm progress bar
collection.insert(data=update_data)


print("Embeddings updated successfully.")


# Example data matching the schema
update_data = [
    {"id": 1,  "embedding": [0.1, 0.2, 0.3, ...]},
    {"id": 2,  "vector": [0.4, 0.5, 0.6, ...]}
]


In [None]:


{'0089abb82785319d86a1f289b195ff11aec07d9af5b7689a43770e63490c20dc': [-0.7214671969413757, -0.16116780042648315], 
'ac3a3817216183abf66ef7904b63de7d9e2590a477c3c94b4a1a741f2591ac19': [-0.7828123569488525, -0.5159667730331421], 
'b7ad834cca68dd2d26ce5a3bba4d424a9b4c2216a1be407a648e5549e73a1320': [0.7405392527580261, -0.9613822102546692]}




In [None]:
from pymilvus import Collection

# Assuming you have already created a collection
collection = Collection(name="html_chunks")

# Get the schema of the collection
schema = collection.schema

# Extract field names
field_names = [field.name for field in schema.fields]

print("Field names:", field_names)


In [None]:
from pymilvus import connections, Collection, FieldSchema, CollectionSchema, DataType

# Connect to Milvus
connections.connect(host='localhost', port='19530')

# Define fields
fields = [
    FieldSchema(name="id", dtype=DataType.INT64, is_primary=True ),
    FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR, dim=2)
]

# Enable dynamic fields in the schema definition
schema = CollectionSchema(fields, enable_dynamic_field=True)

# Create the collection with the schema
collection = Collection(name="example_collection", schema=schema)

# Now you can insert data with dynamic fields
data = [
    {"id": 1, "vector": [0.1,0.1]},
    {"id": 2, "vector": [0.4,0.1]}
]

# Insert data into the collection
collection.insert(data)


In [None]:
# Now you can insert data with dynamic fields
data = [
    {"id": 1, "vector": [0.1,0.1]},
    {"id": 2, "vector": [0.4,0.1]}
]

# Insert data into the collection
collection.upsert(data)



# Extract field names
field_names = [field.name for field in schema.fields]
print("Field names:", field_names)

# Get collection statistics
stats = collection.num_entities
print("Number of entities:", stats)

In [None]:
from pymilvus import connections, Collection

# Connect to Milvus
connections.connect(host='localhost', port='19530')

# Get the collection you want to delete
collection = Collection(name="html_chunks")

# Drop the collection
collection.drop()

print("Collection deleted successfully.")


In [None]:
from pymilvus import connections, Collection

# Connect to Milvus
connections.connect(host='localhost', port='19530')

# Get the collection
collection = Collection(name="example_collection")

collection.load()

# Define the query vector
query_vector = [[0.1, 0.1]]

# Perform the search
search_params = {"metric_type": "L2", "params": {"nprobe": 10}}
results = collection.search(data=query_vector, anns_field="vector", param=search_params, limit=10)

# Print the results
for result in results:
    print(result)


In [None]:
from pymilvus import MilvusClient

client = MilvusClient("default.db")


In [5]:
import os
from groq import Groq

client = Groq(
    api_key = "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",
)

chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": "Explain the importance of fast language models",
        }
    ],
    model="llama3-70b-8192",
)

print(chat_completion.choices[0].message.content)

Fast language models are artificial intelligence (AI) models that can process and generate human-like language quickly and efficiently. The importance of fast language models lies in their ability to revolutionize various industries and aspects of our lives. Here are some reasons why fast language models are crucial:

1. **Improved User Experience**: Fast language models enable applications to respond rapidly to user input, providing a seamless and interactive experience. This is particularly important for applications like chatbots, virtual assistants, and language translation systems, where quick responses are essential for effective communication.
2. **Real-time Processing**: Fast language models can process and analyze large amounts of text data in real-time, making them ideal for applications that require immediate insights, such as sentiment analysis, entity recognition, and topic modeling.
3. **Enhanced Customer Service**: Fast language models can help companies respond quickly 

In [17]:
import io
import asyncio
import edge_tts
from pydub import AudioSegment
from pydub.playback import play
import nest_asyncio

# Apply the nest_asyncio patch
nest_asyncio.apply()

async def text_to_speech_and_play(text, voice="en-GB-MiaNeural", speed=1):
    try:
        rate = "+" + str(int((speed - 1) * 100)) + "%"
        communicate = edge_tts.Communicate(text, voice, rate=rate)
        await communicate.save("output.mp3")

        # Load the saved audio file
        sound = AudioSegment.from_file("output.mp3", format="mp3")

        # Play the audio
        play(sound)

    except Exception as e:
        print(f"Error occurred during playback: {e}")

# Example usage
text = "Hello, this is a test of the edge-tts library for text-to-speech conversion."
asyncio.run(text_to_speech_and_play(text, speed=1.2))  # Increase speed to 1.5x


In [3]:
import torch
print(torch.cuda.is_available())


False


In [11]:
from pymilvus import connections, utility

# Specify the collection name you want to check
collection_name = "html_chunks"

# Set up a Milvus client
connections.connect("default", host="localhost", port="19530")

# Get the load state of the collection
collection_load_state = utility.loading_progress(collection_name)

# Handle different load states
if collection_load_state["num_loaded_entities"] == 0:
    print(f"Collection '{collection_name}' state: NotLoad")
elif collection_load_state["num_loaded_entities"] < collection_load_state["num_total_entities"]:
    print(f"Collection '{collection_name}' state: Loading")
elif collection_load_state["num_loaded_entities"] == collection_load_state["num_total_entities"]:
    print(f"Collection '{collection_name}' state: Loaded")
else:
    print(f"Collection '{collection_name}' state: NotExist")


KeyError: 'num_loaded_entities'

In [1]:
import subprocess

def is_docker_running():
    try:
        # Run the `docker info` command to check if Docker is running
        result = subprocess.run(['docker', 'info'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        if result.returncode == 0:
            return True
        else:
            return False
    except FileNotFoundError:
        # Docker command not found
        return False

# Example usage
if is_docker_running():
    print("Docker is running")
else:
    print("Docker is not running")


Docker is running


In [13]:
import openwakeword as oww
import sounddevice as sd
import numpy as np

# Initialize OpenWakeWord
model = oww.Model(model_path=None)  # Default path loads available models

# Add or configure the wake word if it's not pre-loaded
wake_word = "hey llama"
model.add_phrase(wake_word)

# Define a callback function to handle audio stream
def audio_callback(indata, frames, time, status):
    if status:
        print(status)

    # Convert the audio input to the format expected by OpenWakeWord
    audio_data = np.frombuffer(indata, dtype=np.float32)
    
    # Check if the wake word is detected
    if model(audio_data):
        print("Wake word detected!")
        # Insert the action to be performed after wake word detection
        interact_with_ollama()

# Function to interact with Ollama after wake word detection
def interact_with_ollama():
    print("Listening for commands...")

    # Your existing logic to interact with the Ollama model goes here
    user_input = input("Enter your message: ")
    if user_input.lower() == "exit":
        return
    # Call your existing function to interact with the model
    # chat_with_model(user_input, ...)

# Start streaming from the microphone
def start_audio_stream():
    with sd.InputStream(callback=audio_callback, channels=1, samplerate=16000):
        print("Listening for wake word...")
        while True:
            sd.sleep(1000)

# Start the wake word detection
if __name__ == "__main__":
    start_audio_stream()




ValueError: Tried to import the tflite runtime for provided tflite models, but it was not found. Please install it using `pip install tflite-runtime`

In [65]:
import chromadb

# Initialize the client
client = chromadb.PersistentClient(path=r"C:\Users\deletable\OneDrive\easy-local-rag\chroma")

# List all collections
collections = client.list_collections()

# Print the names of all collections
for collection in collections:
    print(collection.name)


cosine_HTML_chunks


Delete the collection

In [64]:
import chromadb

# Initialize the client
client = chromadb.PersistentClient(path=r"C:\Users\deletable\OneDrive\easy-local-rag\chroma")

# Delete the collection
client.delete_collection("Pubmed_cosine_HTML_chunks")


the bottom script will give you filenames:

In [68]:

import chromadb
from chromadb.config import DEFAULT_TENANT, DEFAULT_DATABASE, Settings
import pprint

top_k = 3

client = chromadb.PersistentClient(
    settings=Settings(),
    tenant=DEFAULT_TENANT,
    database=DEFAULT_DATABASE,
)

# Get or create the collection
collection_name = "Pubmed_cosine_HTML_chunks"
collection = client.get_or_create_collection(name=collection_name)


# Perform similarity search
search_result = collection.query(
    query_embeddings=[[-0.08516432344913483, 0.08014687895774841, -0.07012069970369339, -0.10898280143737793, -0.09050536900758743, -0.08526218682527542, 0.10030454397201538, 0.15522067248821259, 0.012114186771214008, 0.10639500617980957, 0.008805494755506516, 0.05442481115460396, 0.03465191647410393, 0.028559671714901924, -0.134894460439682, -0.18552149832248688, 0.026617251336574554, 0.019334273412823677, 0.10644889622926712, 0.05985170602798462, -0.03649519383907318, 0.015249935910105705, -0.03107881173491478, -0.010054516606032848, -0.46007677912712097, 0.10861436277627945, -0.05994759500026703, 0.04516821727156639, -0.011839311569929123, -0.347425639629364, 0.020754985511302948, 0.025376571342349052, 0.10062119364738464, 0.8449876308441162, -0.10185477882623672, -0.1434156596660614, -0.054548583924770355, 0.05233529210090637, 0.2184046357870102, -0.0813790038228035, 0.05146090313792229, -0.05304325744509697, 0.11248011142015457, -0.13038092851638794, 0.10295883566141129, -0.004762962460517883, -0.15500953793525696, 1.1274689313722774e-05, -0.08904366940259933, 0.06143191456794739, -0.018922479823231697, -0.053873658180236816, 0.169210284948349, 0.14936034381389618, -0.09829618781805038, -0.22343048453330994, 0.15945053100585938, 0.0070803300477564335, 0.07069210708141327, 0.010721355676651001, 0.02663564868271351, -0.004208119586110115, -0.1399766206741333, -0.0977683961391449, 0.06209952384233475, -0.0556347630918026, 0.08135731518268585, -0.061552803963422775, -0.09494484961032867, -0.022429365664720535, 3.7706660805270076e-05, 0.043919917196035385, 0.02659017965197563, -0.21362948417663574, -0.04170606657862663, 0.14489993453025818, -0.0059790355153381824, -0.011542046442627907, -0.0752001628279686, 0.11277741938829422, 0.046247635036706924, -0.0909157395362854, -0.04400252550840378, -0.06202460452914238, -0.0011787934927269816, -0.014479072764515877, 0.04113598167896271, 0.03770507127046585, -0.13029788434505463, 0.025773290544748306, -0.15488074719905853, 0.11903590708971024, 0.049613092094659805, 0.007639687042683363, 0.01567675545811653, 0.04785308986902237, 0.06721355020999908, -0.046504415571689606, -0.045879293233156204, -0.059470582753419876, 0.06168476864695549, -0.10027970373630524, -0.0674644261598587, -0.14495448768138885, 0.020851541310548782, 0.12003814429044724, 0.20522953569889069, 0.08428669720888138, -0.05892927199602127, -0.11536426842212677, -0.23416019976139069, 0.0018085071351379156, -0.07070806622505188, 0.010645189322531223, -0.004336973186582327, -0.020631389692425728, 0.03065110743045807, 0.07985195517539978, -0.1321975141763687, -0.041398320347070694, -0.1763782799243927, -0.03564571589231491, 0.0235776174813509, -0.006365486886352301, -0.006042829714715481, -0.017963794991374016, 0.06799042969942093, 0.08076076954603195, 0.0032358269672840834, 0.13598176836967468, 0.18494445085525513, -0.13559812307357788, -0.009217245504260063, 0.006775752641260624, -0.05495721846818924, -0.04928441718220711, 0.026240170001983643, 0.06865938752889633, 0.038902461528778076, 0.08401056379079819, 0.0770847350358963, -0.07557597011327744, -0.1437402069568634, -0.15276378393173218, 0.03405749797821045, -0.04480227082967758, 0.07376246154308319, -0.0685897171497345, -0.10108672827482224, 0.21495544910430908, 0.08362596482038498, -0.13870815932750702, -0.1295190006494522, 0.023982951417565346, 0.06405029445886612, 0.006037629209458828, -0.007884057238698006, 0.08326654881238937, 0.013188078068196774, 0.08377881348133087, -0.08425391465425491, -0.0108269564807415, -0.09302978217601776, 0.03650178760290146, -0.020379116758704185, -0.0333290658891201, 0.17171123623847961, -0.03758701682090759, -0.1286001354455948, 0.06436650454998016, 0.05689525604248047, -0.05462693050503731, 0.08488926291465759, -0.06198824569582939, 0.05298083275556564, -0.06664018332958221, -0.07788290828466415, -0.012998498976230621, -0.049315884709358215, 0.014422333799302578, -0.3581048548221588, -0.22451913356781006, 0.0625971332192421, -0.14038829505443573, 0.06566029042005539, -0.08482363075017929, 0.0022657401859760284, 0.07498940825462341, 0.09196881204843521, 0.006403515115380287, -0.06539736688137054, 0.17259357869625092, -0.09097731113433838, 0.04770367220044136, -0.15324190258979797, -0.024541500955820084, -0.024288836866617203, 0.10357131063938141, 0.03912923485040665, 0.16567009687423706, 0.18381023406982422, 0.06878244876861572, 0.00738576939329505, -0.11536312848329544, -0.05019015073776245, 0.05222740024328232, 0.15369150042533875, 0.01827181689441204, -0.1902027577161789, 0.13950492441654205, 0.036528222262859344, 0.04520252346992493, 0.0027360222302377224, 0.06001190468668938, 0.07423355430364609, 0.11115796864032745, -0.11186251789331436, -0.08247027546167374, 0.05890597030520439, 0.05308829993009567, -0.011163468472659588, -0.03608299419283867, 0.012121322564780712, -0.0008161090663634241, 0.09022685140371323, 0.12506060302257538, -0.05633295699954033, -0.09302274882793427, -0.057294853031635284, -0.23456181585788727, 0.006746804807335138, -0.5065657496452332, 0.09578386694192886, -0.12967583537101746, 0.011693300679326057, 0.051802828907966614, 0.05914287641644478, 0.6072143912315369, 0.07127012312412262, -0.020410198718309402, -0.03489945828914642, 0.04361039400100708, -0.04665124788880348, 0.15553252398967743, 0.16537770628929138, 0.05246272310614586, 0.1877719610929489, 0.02050972357392311, 0.10738631337881088, 0.1469716876745224, 0.037049371749162674, -0.14762315154075623, -0.09966325759887695, -0.0365191288292408, 0.14608824253082275, 0.02446305938065052, 0.054565444588661194, -0.2562551200389862, 0.009446523152291775, -0.11061955243349075, -0.13682974874973297, -0.0933014377951622, 0.11133664846420288, -0.12674570083618164, 0.00010131915769306943, -0.05157821625471115, -0.13210667669773102, 0.058024194091558456, 0.03807764872908592, -0.037507303059101105, 0.0571536161005497, -0.0036925848107784986, 0.16140064597129822, -0.051283303648233414, -0.09363976866006851, 0.2224537581205368, -0.04274001345038414, 0.06070416048169136, 0.10055593401193619, -0.09755372256040573, 0.0814170241355896, 0.01872110366821289, -0.010833565145730972, -0.10512725263834, -0.15411126613616943, 0.07903236895799637, -0.13297833502292633, -0.13459710776805878, -0.10537867993116379, -0.14940211176872253, -0.09323669224977493, 0.016325145959854126, -0.014117883518338203, -0.00378767354413867, -0.0701071098446846, -0.25391480326652527, 0.08236627280712128, -0.0007901711505837739, 0.022089453414082527, 0.05172843486070633, -0.034119218587875366, -0.048584919422864914, -0.059167202562093735, -0.253111332654953, 0.0625821202993393, 0.012629946693778038, 0.06776551902294159, 0.16014696657657623, 0.04319840669631958, 0.12698620557785034, -0.02920190431177616, 0.0630149319767952, -0.0025288185570389032, -0.04462449625134468, 0.03889352083206177, -0.20589697360992432, 0.05584607273340225, -0.013392653316259384, -0.03847014904022217, -0.08238102495670319, -0.05995403602719307, 0.10177063196897507, 0.001174315926618874, 0.010950235649943352, -0.044170524924993515, -0.019200902432203293, 0.26440417766571045, 0.03493909910321236, -0.057181790471076965, 0.06351670622825623, 0.07400666177272797, -0.025124123319983482, -0.15494215488433838, -0.021451737731695175, -0.4534289836883545, 0.012067782692611217, 0.0621144101023674, 0.21160103380680084, 0.11134129017591476, 0.1082671731710434, -0.0463523231446743, -0.05608126521110535, -0.19433577358722687, -0.012829700484871864, -0.015744434669613838, 0.0911993458867073, -0.02424774505198002, 0.049095675349235535, -0.09813039004802704, 0.03580354526638985, 0.0695047378540039, 0.12124886363744736, -0.008460629731416702, -0.08389204740524292, 0.23917070031166077, 0.046481065452098846, -0.04244881123304367, 0.14896991848945618, -0.021434321999549866, -0.01340445689857006, -0.012449564412236214, 0.08457997441291809, -0.0808805450797081, 0.09895525127649307, -0.018134159967303276, 0.05155997723340988, -0.18139728903770447, -0.11717859655618668, -0.1516074538230896, 0.05345912650227547, -0.018325889483094215, -0.21223783493041992, 0.03504456952214241, -0.07194051146507263, 0.22237731516361237, -0.014699237421154976, 0.004607155919075012, -0.11539514362812042, -0.18897756934165955, -0.08300289511680603, 0.09956438839435577, -0.08646820485591888, 0.01783558912575245, -0.10573551058769226, 0.061314161866903305, -0.039543598890304565, -0.05980094522237778, 0.0326862595975399, 0.08478645235300064, -0.00326493545435369, -0.12469702214002609, 0.09936073422431946, -0.05430418998003006, -0.0691470131278038, -0.027656666934490204, -0.01953360252082348, 0.5362934470176697, 0.09254541248083115, -0.004330520983785391, -0.031267616897821426, -0.060905858874320984, 0.0640263557434082, -0.009710782207548618, 0.111631840467453, 0.16338874399662018, 0.055665161460638046, 0.09687992930412292, 0.1281154602766037, -0.13049761950969696, -0.03607385233044624, 0.3053611218929291, -0.013016009703278542, 0.10927309840917587, -0.03379428759217262, 0.3288969397544861, -0.0020465394482016563, -0.07236519455909729, -0.08383750915527344, -0.02694801613688469, -0.18820618093013763, -0.011265580542385578, 0.07832682877779007, 0.05027671530842781, -0.0859573483467102, -13.222697257995605, -0.07487444579601288, 0.031408946961164474, -0.01147894561290741, 0.03556213527917862, -0.04903572425246239, -0.13901250064373016, 0.07206080853939056, 0.027002518996596336, -0.19686684012413025, 0.024814842268824577, -0.016392625868320465, -0.07458216696977615, 0.041850555688142776, 0.00010878340981435031, 0.12716178596019745, 0.006737975869327784, -0.01019476167857647, -0.030887970700860023, -0.04285993427038193, 0.07794985920190811, -0.04220212623476982, -0.04086093604564667, 0.08307884633541107, 0.14243018627166748, 0.041468191891908646, 0.174406036734581, 0.03215872123837471, 0.30663198232650757, 0.019826974719762802, 0.023045651614665985, 0.055014777928590775, -0.16390317678451538, -0.12218941748142242, -0.06322915107011795, -0.1069844514131546, 0.018192971125245094, -0.1352004110813141, -0.04394420608878136, -0.03894772380590439, -0.06848055869340897, 0.04126720502972603, -0.127117320895195, -0.05957122519612312, 0.012195548042654991, -0.1003977358341217, -1.523532748222351, -0.3970016837120056, -0.05050989240407944, -0.03827812150120735, 0.24339771270751953, -0.011906655505299568, 0.047624148428440094, -0.3140111565589905, -0.05335318297147751, 0.04418649151921272, 0.05666784569621086, -0.035893600434064865, 0.0839119404554367, -0.03559822589159012, -0.09226492792367935, -0.08940377086400986, 0.028889384120702744, -0.06184031814336777, -0.0261959470808506, 0.05819745734333992, 0.270313560962677, -0.15130431950092316, -0.03375469893217087, 0.0116941649466753, -0.07246489822864532, -0.006499141920357943, 0.20694848895072937, 0.004197890870273113, 0.171929731965065, -0.004924568813294172, 0.06303209811449051, -0.052851706743240356, 0.47219815850257874, -0.053185075521469116, 0.031586870551109314, -0.020945684984326363, 0.06357955932617188, 0.0188713651150465, -0.08179040998220444, 0.04007582738995552, -0.08294269442558289, 0.05936206877231598, 0.04595489054918289, 0.01629425399005413, -0.04653998836874962, 0.08335790783166885, -0.058757465332746506, -0.05139465630054474, -0.01358104683458805, -0.023513659834861755, 0.057811155915260315, -0.13859158754348755, 0.01796201802790165, 0.12293550372123718, -0.2364118993282318, 0.01733630895614624, 0.10337008535861969, -0.10650324821472168, 0.11293976753950119, -0.06416315585374832, -0.06942905485630035, -0.13072232902050018, 0.08175444602966309, -0.043886423110961914, -0.10394897311925888, -0.16042537987232208, -0.049101363867521286, -0.0013463685754686594, -0.029396839439868927, -0.09348718076944351, 0.005376279819756746, 0.01603480614721775, -0.06578020751476288, 0.07253937423229218, -0.13316868245601654, 0.02583414502441883, -0.07272601127624512, 0.024255450814962387, -0.19793717563152313, 0.07015810906887054, -0.10554513335227966, -0.10172480344772339, 0.06321966648101807, 0.03713037073612213, -0.025589115917682648, -0.03321894258260727, -0.22063469886779785, -0.0553801953792572, -0.1668863594532013, 0.003320684190839529, 0.008914255537092686, -0.11531995236873627, -0.08435747027397156, -0.028689289465546608, -0.04053815081715584, 0.11762531846761703, -0.050939008593559265, 0.10287512838840485, -0.07583077996969223, -0.06786437332630157, 0.04367407038807869, -0.0330645851790905, 0.1293914020061493, -0.007046611979603767, 0.023822613060474396, -0.1176624447107315, 0.17427171766757965, -0.06007726117968559, -0.016713930293917656, -0.1859760582447052, -0.23808704316616058, -0.13169190287590027, -0.05822927504777908, 0.003847337793558836, 0.08431995660066605, 0.19931739568710327, 0.044779639691114426, 0.04467805474996567, -0.0704176053404808, 0.09116455167531967, -0.026608020067214966, -0.08298879861831665, -0.09093616157770157, 0.0410446934401989, 0.05824669823050499, 0.14212733507156372, -0.021388262510299683, -0.10280431807041168, 0.154693603515625, -0.03354785591363907, -0.10577930510044098, -0.04479876905679703, -0.07051508128643036, -0.08105042576789856, -0.039242953062057495, 0.1742008924484253, 0.03574031591415405, 0.09542287141084671, 0.036342721432447433, -0.1240854486823082, -0.006594838108867407, -0.03109796531498432, 0.09213853627443314, 0.015898359939455986, 0.06304597854614258, -0.1590043604373932, -0.09437056630849838, -0.19430123269557953, 0.09887678176164627, -0.02100072242319584, -0.11715584248304367, 0.17514146864414215, -0.05755104497075081, -0.13072486221790314, -0.061966195702552795, -0.23693647980690002, 0.07889887690544128, -0.08172629773616791, -0.22889253497123718, -0.07272980362176895, -0.07777056843042374, 0.5667501091957092, 0.05001988261938095, 0.1594378650188446, -0.0006323873531073332, -0.05302990972995758, 0.11753305047750473, 0.17767153680324554, -0.13607871532440186, -0.016408566385507584, 0.0183921679854393, -0.08027352392673492, -0.034110430628061295, -0.10124687850475311, -0.059493858367204666, -0.032247550785541534, 0.10873839259147644, 0.0359850712120533, 0.14040154218673706, 0.09402698278427124, -0.07765302807092667, 0.10686232149600983, -0.05263320729136467, -0.013859380967915058, 0.1977034956216812, 0.09915334731340408, 0.12856373190879822, -0.0975089892745018, 0.15175457298755646, 0.03799976781010628, -0.10414876788854599, -0.053358692675828934, -0.017724720761179924, -0.13658247888088226, 0.05156998708844185, 0.060714464634656906, -0.2958482503890991, 0.14701175689697266, -0.019301477819681168, 0.06767327338457108, -0.04174285754561424, -0.11534395813941956, 0.11032777279615402, -0.0034342780709266663, -0.045157697051763535, -0.018150636926293373, 0.08220121264457703, -0.052283357828855515, 0.06628261506557465, 0.04676384851336479, 0.08939169347286224, 0.05565325543284416, 0.10443636029958725, 0.13788653910160065, -0.014642789028584957, 0.06557450443506241, -0.03621593490242958, 0.012770853005349636, 0.0767999142408371, -0.19034376740455627, -0.17295952141284943, -0.0150951212272048, 0.09666310995817184, 0.002650681883096695, 0.003561436664313078, -0.0020962085109204054, 0.0345112569630146, -0.03587958961725235, 0.11408645659685135, -0.09822522848844528, -0.1507755070924759, 0.014490500092506409, -0.11060557514429092, -0.050653666257858276, -0.04607270285487175, -0.10440875589847565, 0.10458727180957794, 0.09721674025058746, -0.06524527072906494, 0.060959216207265854, 0.008710566908121109, 0.1170988380908966, -0.1956748217344284, 0.03395863622426987, -0.006324555724859238, -0.16066694259643555, 0.04735308885574341, -0.110935740172863, 0.09415572881698608, 0.02771943435072899, 0.02165941894054413, 0.10124284029006958, -0.003187768626958132, 0.09361710399389267, -0.06815502047538757, -0.16614201664924622, -0.16295160353183746, -0.1169387474656105, 0.08040954917669296, -0.04879547655582428, -0.1863919496536255, 0.08588913083076477, -0.003713657148182392, 0.0442575179040432, 0.21752677857875824, 0.01338858064264059, -0.092305026948452, 0.1030898168683052, -0.010862583294510841, -0.054023291915655136, -0.07451532036066055, -0.0951089933514595, -0.14062674343585968, 0.040747229009866714, -0.019221147522330284, -0.08748071640729904, 0.07636114954948425, -0.026665089651942253, -0.06597712635993958, 0.05750902742147446, 0.1649509221315384, -0.043297093361616135, 0.08522365242242813, -0.063418909907341, 0.08035138249397278, -0.06429285556077957, 0.03779247775673866, 0.17609268426895142, -0.15335755050182343, -0.09287185966968536, -0.04853308945894241, 0.036872752010822296, 0.1802975982427597, 0.05691390857100487, 0.05843531712889671, -0.03814857080578804, -0.08523312956094742, 0.2508593499660492]],
    n_results=top_k,
    include=["metadatas","distances"],  # Fields to return in the search results
    #where={"distances": {"$lt": 0.1}},
)

# Extracting the text fields and joining them with a double newline separator
texts = [item['file_name'] for item in search_result['metadatas'][0]]
result = "\n\n".join(texts)


print(search_result)


# Extract relevant context
if search_result:
    all_titles = [item["text"] for item in search_result["metadatas"][0]]
    relevant_context = "\n\n".join(all_titles)

"""
# Create clickable links (formatting may vary based on your terminal)
clickable_texts = [f"\033]8;;file://{item['file_name']}\033\\{os.path.basename(item['file_name'])}\033]8;;\033\\" for item in search_result['metadatas'][0]]

result = "\n\n".join(clickable_texts)

print(result)

"""

{'ids': [['3baecdda7f61716704a40ca571ffc653f9a95704eb64de12e14501e09c65ef92', 'c2cfd3ef0415d886e8786e59d73c69608f108aff5d9780e1b08ec8be62664d2a', '331f98d47227382f6c58a0ec4ccecf28a8243dfb9ea0b0456ec39a3640cabd7c']], 'distances': [[8.344650268554688e-07, 0.004223346710205078, 0.004545390605926514]], 'metadatas': [[{'file_name': 'C:\\Users\\deletable\\Google Drive\\goodman Gilman 12e\\I. General Principles\\2..htm', 'modification_time': 1719016913.2649157, 'text': 'However, in general, ion trapping ass. c transmembrane pH gradients is not large `.`pH difference b/w tissue & blood (~7.0 vs 7.4) is small. more imp determinant of blood-tissue partitioning is relative binding of drug to plasma proteins & tissue macromolecules that limits Conc. of free drug. Plasma Proteins Many drugs circulate in bloodstream bound to plasma proteins. Albumin is a major carrier for acidic drugs; 1 -acid glycoprotein binds basic drugs. Nonspecific binding to other plasma proteins generally occurs to a much sma

'\n# Create clickable links (formatting may vary based on your terminal)\nclickable_texts = [f"\x1b]8;;file://{item[\'file_name\']}\x1b\\{os.path.basename(item[\'file_name\'])}\x1b]8;;\x1b\\" for item in search_result[\'metadatas\'][0]]\n\nresult = "\n\n".join(clickable_texts)\n\nprint(result)\n\n'

 Simplifying database

In [15]:
import chromadb
from chromadb.config import DEFAULT_TENANT, DEFAULT_DATABASE, Settings
import json

# Initialize the PersistentClient with a specified path
client = chromadb.PersistentClient(
    path="/path/to/save/to",  # Specify the path where data will be stored
    settings=Settings(),
    tenant=DEFAULT_TENANT,
    database=DEFAULT_DATABASE,
)

# Get or create the collection
collection_name = "deletable_chunks"
collection = client.get_or_create_collection(name=collection_name)

# Function to insert batches into ChromaDB
def insert_batches_to_chromadb(collection, csv_path, batch_size=10000):
    for batch in read_csv_in_batches(csv_path, batch_size):
        ids = []
        texts = []
        file_names =[]
        modification_times =[]
        embeddings = []
        unique_ids = set()

        for row in batch:
            chunk_id = row[0]
            if chunk_id in unique_ids:
                print(f"Duplicate ID found: {chunk_id}, skipping this entry.")
                continue

            unique_ids.add(chunk_id)
            text = row[1]
            file_name = row[2]
            modification_time = float(row[3])  # Ensure it's a float
            embedding_str = row[4]
            embedding = json.loads(embedding_str)  # Ensure it's a list of floats

            ids.append(chunk_id)
            texts.append(text)
            file_names.append(file_name)
            modification_times.append(modification_time)
            embeddings.append(embedding)

        try:
            collection.add(embeddings=embeddings, texts=texts,file_names=file_names,modification_times=modification_times,ids=ids)
            print(f"Inserted {len(ids)} records to ChromaDB")
        except Exception as e:
            print(f"An error occurred while adding documents: {e}")

# Perform a test query to check if data is inserted
test_query_result = collection.get(include=["metadatas"])
print("Test Query Result:", test_query_result)

# Perform similarity search for a specific word in the text field of metadata
search_word = "your_search_word"  # Replace with the word you want to search for
search_result = collection.get(
    include=["documents"],
    where_document={"$contains": search_word}
)

# Print the search results
if search_result['metadatas']:
    for result in search_result['metadatas']:
        print(result["text"])
else:
    print("No results found for the search word.")


{'ids': [], 'embeddings': None, 'metadatas': [], 'documents': None, 'uris': None, 'data': None, 'included': ['metadatas']}


Nomalizing embeddings

In [37]:
import chromadb
import numpy as np

# Function to normalize embeddings
def normalize_embeddings(embeddings):
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    return embeddings / norms

# Initialize the client
client = chromadb.PersistentClient(path=r"C:\Users\deletable\OneDrive\easy-local-rag\chroma")

# Retrieve the collection
collection = client.get_collection("Pubmed_cosine_HTML_chunks")

# Retrieve all embeddings and metadata from the collection
items = collection.get(ids=None, include=["embeddings", "metadatas", "documents"])

# Extract embeddings and ids
embeddings = items['embeddings']
ids = items['ids']

# Normalize the embeddings
normalized_embeddings = normalize_embeddings(np.array(embeddings))

# Update the collection with normalized embeddings using upsert
for i, item_id in enumerate(ids):
    collection.upsert(ids=[item_id], embeddings=[normalized_embeddings[i].tolist()])
