In [None]:
import pandas as pd
import torch
from langchain.document_loaders import DataFrameLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
crypto_wiki_articles = "WQ/WikiQuery/data/Wikipedia_Crypto_Articles.csv"

# DataFrameLoader needs a pandas DataFrame, not a file path, so read the CSV first.
articles_df = pd.read_csv(crypto_wiki_articles)

# Load the DataFrame content into documents, using the "article" column as the page content.
articles = DataFrameLoader(articles_df,
                           page_content_column="article")
document = articles.load()

In [None]:
def split_documents_into_chunks(data, column_name="article", chunk_size=500, chunk_overlap=50, separators=['\n\n', '\n', '.']):
    """
    Load the articles of a DataFrame and split them into overlapping chunks.

    Args:
    data (DataFrame): Input data containing articles.
    column_name (str): Name of the column containing the articles.
    chunk_size (int): Maximum size of each chunk, measured with ``len``.
    chunk_overlap (int): Overlap between consecutive chunks.
    separators (list): Separators handed to the recursive splitter. The
        sequence is copied, so neither the shared default list nor the
        caller's list is ever held by the splitter.

    Returns:
    list: List of document chunks.
    """
    # Copy the separators: the default is a single list object shared by every
    # call, and handing it out directly would let later mutation leak across calls.
    splitter_separators = list(separators) if separators is not None else None

    loader = DataFrameLoader(data, page_content_column=column_name)
    loaded_documents = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=splitter_separators,
        length_function=len,
    )

    return text_splitter.split_documents(loaded_documents)


In [None]:
# split_documents_into_chunks builds its own DataFrameLoader, so it must receive
# the raw DataFrame rather than a list of already-loaded documents.
document_chunks = split_documents_into_chunks(
    pd.read_csv(crypto_wiki_articles),
    column_name="article",
    chunk_size=500,
    chunk_overlap=50,
    separators=['\n\n', '\n', '.'],
)

In [None]:
def create_vector_database(doc_chunks, embedding_model, persist_directory):
    """
    Create a vector database from document chunks.

    Args:
    doc_chunks (list): List of document chunks.
    embedding_model: Embedding model for vectorization.
    persist_directory (str): Directory to persist the vector database.

    Returns:
    dict: ``{"message": ...}`` confirming the database was created.

    Raises:
    RuntimeError: If building the database fails; the original exception
        is chained as the cause.
    """
    try:
        # Only the creation side effect is needed here; the store itself is
        # not returned, so it is not bound to a local name.
        Chroma.from_documents(
            doc_chunks,
            embedding_model,
            persist_directory=persist_directory
        )
    except Exception as e:
        error_message = f"Error occurred: {str(e)}"
        # HTTPException is never imported in this notebook, so raising it would
        # mask the real failure with a NameError; use a built-in exception.
        raise RuntimeError(error_message) from e

    return {"message": "Vector database created successfully"}

In [None]:
# Creating an embedding model
%%capture
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

In [None]:
# Build the vector database and persist it under the project directory
persist_directory = "WQ/WikiQuery/vector_db"
create_vector_database(document_chunks, embedding_model, persist_directory=persist_directory)

In [None]:
# Display the number of chunks returned by split_documents_into_chunks

len(document_chunks)

In [None]:
# Build a Chroma instance from the chunk embeddings, persisted under "chroma_db"
chroma_db = Chroma.from_documents(
    document_chunks,
    embedding_model,
    persist_directory="chroma_db",
)

# Expose the store through its retriever interface
retriever = chroma_db.as_retriever()

In [None]:
# Testing the retriever functionality with a sample query
docs = retriever.get_relevant_documents("What is cryptocurrency")

In [None]:
# Number of documents the retriever returned for the test query
len(docs)