In [None]:
import sqlite3
import numpy as np
import openai
import torch
import transformers
from transformers import BertTokenizer, BertModel
from concurrent.futures import ThreadPoolExecutor
from transformers import AutoModel, AutoTokenizer
from concurrent.futures import ProcessPoolExecutor, as_completed
import chromadb
import llm
from transformers import pipeline
import langchain
from langchain.chains import RetrievalQA
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain.embeddings import SentenceTransformerEmbeddings
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
#get document IDs from SQLite DB
def fetch_document_ids(db_path):
    """Return every ``goid`` value stored in ``subset_table``.

    Args:
        db_path: Path to the SQLite database file.

    Returns:
        A list with the first column (``goid``) of each row, in the order
        SQLite returns them.

    Raises:
        sqlite3.Error: If the query fails (for example, the table is missing).
    """
    conn = sqlite3.connect(db_path)
    try:
        # Adjust the table/column names to match your schema.
        cursor = conn.execute("SELECT goid FROM subset_table")
        return [row[0] for row in cursor.fetchall()]
    finally:
        # Close even when the query raises, so the file handle is not leaked.
        conn.close()
# SQLite file that holds subset_table; reused by the later loading cell.
db_path = 'subset_data.db'
# All goid values found in that database.
document_ids = fetch_document_ids(db_path=db_path)

In [None]:
#get from SQLite DB using the retrieved document IDs, store as an array
def fetch_documents_by_ids(db_path, document_ids):
    """Look up the ``text`` column of ``subset_table`` for each given ``goid``.

    Args:
        db_path: Path to the SQLite database file.
        document_ids: Iterable of ``goid`` values to look up.

    Returns:
        A list aligned with ``document_ids``: the ``text`` value of the first
        matching row, or ``None`` when no row matches (a stored NULL also
        comes back as ``None``).

    Raises:
        sqlite3.Error: If a query fails.
    """
    documents = []
    # ``with sqlite3.connect(...)`` only manages the transaction and does not
    # close the connection, so it is closed explicitly here.
    conn = sqlite3.connect(db_path)
    try:
        # One cursor is enough; there is no need to create one per lookup.
        cursor = conn.cursor()
        for doc_id in document_ids:
            cursor.execute("SELECT text FROM subset_table WHERE goid = ?", (doc_id,))
            result = cursor.fetchone()
            documents.append(result[0] if result is not None else None)
    finally:
        conn.close()
    return documents

# Text of every document, in the same order as document_ids (None where no row matched).
documents = fetch_documents_by_ids(db_path=db_path, document_ids=document_ids)

In [None]:
# Drop the None placeholders so only real text values remain.
documents = list(filter(lambda text: text is not None, documents))

In [None]:
# Path to the local model directory.
model_path = "./multi-qa-MiniLM-L6-cos-v1/"
# Prefer the GPU, but fall back to the CPU so the notebook still runs on
# machines where CUDA is not available.
model_kwargs = {"device": "cuda" if torch.cuda.is_available() else "cpu"}

# Load the embedding model from model_path (one source of truth for the location).
embeddings = SentenceTransformerEmbeddings(model_name=model_path, model_kwargs=model_kwargs)

In [None]:
# Work on the first 100000 documents only.
documents_array = documents[:100000]

# Module-level splitter: chunk_size 500 with chunk_overlap 50, lengths
# measured with len(), separators not treated as regular expressions.
text_splitter = RecursiveCharacterTextSplitter(
    is_separator_regex=False,
    length_function=len,
    chunk_overlap=50,
    chunk_size=500,
)

#chunk the documents
def split_docs(documents, chunk_size=1000, chunk_overlap=20):
    """Split each document string into chunks and return them as one flat list.

    Args:
        documents: Iterable of document strings.
        chunk_size: Passed to RecursiveCharacterTextSplitter as ``chunk_size``.
        chunk_overlap: Passed to RecursiveCharacterTextSplitter as ``chunk_overlap``.

    Returns:
        A single list holding the chunks of every document, in input order.
    """
    # A splitter local to this call; it does not touch the module-level text_splitter.
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return [piece for text in documents for piece in splitter.split_text(text)]

# Flat list of chunk strings, produced with split_docs' default settings.
# NOTE(review): docs_split is not consumed by any cell visible here, and it uses
# different chunk settings than text_splitter — confirm it is still needed.
docs_split = split_docs(documents=documents_array)

# Document objects built by the module-level text_splitter; these are what the
# vector store cells below ingest.
docs = text_splitter.create_documents(documents_array)

In [None]:
# Directory where the Chroma collection is written.
persist_directory = "chroma_db"

# Embed every chunk in docs and build the vector store in one call.
vectordb = Chroma.from_documents(
    persist_directory=persist_directory,
    embedding=embeddings,
    documents=docs,
)

# Write the collection to persist_directory.
vectordb.persist()

In [None]:
# Re-open the persisted store with the same embedding function.
vectordb = Chroma(embedding_function=embeddings, persist_directory="./chroma_db")
# Trailing expression so the notebook displays the number of stored entries.
vectordb._collection.count()

In [None]:
def batch_process(documents_arr, batch_size, process_function):
    """Call ``process_function`` on consecutive slices of ``documents_arr``.

    Args:
        documents_arr: Sliceable sequence to process.
        batch_size: Maximum number of items per slice; must be at least 1.
        process_function: Callable invoked once per slice, in order. Its
            return value is ignored.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1. Without this check a
            negative size would silently process nothing.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")
    for start in range(0, len(documents_arr), batch_size):
        # The final slice may be shorter than batch_size.
        process_function(documents_arr[start:start + batch_size])

def add_to_chroma_database(batch):
    """Add one batch of documents to the module-level ``vectordb`` store."""
    # Relies on the global vectordb created in an earlier cell.
    vectordb.add_documents(batch)

# Number of documents sent to the store per call.
# NOTE(review): presumably chosen to stay under Chroma's per-call batch limit —
# confirm against the installed chromadb version.
batch_size = 41000

# NOTE(review): docs was already passed to Chroma.from_documents in an earlier
# cell, so this appears to add the same chunks a second time — confirm intent.
batch_process(
    documents_arr=docs,
    batch_size=batch_size,
    process_function=add_to_chroma_database,
)