In [None]:
#### pip installs ####
!pip install langchain-experimental langchain-huggingface qdrant-client

In [None]:
#### Imports ####
# Reading Files #
from langchain_experimental.text_splitter import SemanticChunker
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_core.documents import Document

# Embeddings #
import torch
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel

# Database #
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams

In [None]:
#### Initializations ####

# Run on the GPU when one is available, otherwise fall back to the CPU
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("Using " + DEVICE)

# Text splitter
# Use a chunker to chunk text
# Different options here: length chunking, character break chunking, semantic chunking
# Using semantic chunking for best separation of different information to help retrieval
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
text_splitter = SemanticChunker(embeddings=embeddings)

# Dense embedder
# Placed on DEVICE so it matches the sparse model instead of always running on the CPU
DENSE_MODEL_NAME = "intfloat/e5-base"
dense_model = SentenceTransformer(
    DENSE_MODEL_NAME,
    device=DEVICE,
)
DENSE_VECTOR_SIZE = dense_model.get_sentence_embedding_dimension()
print("Dense model " + DENSE_MODEL_NAME + " initialized")

# Sparse embedder
# Choices:
#    TF-IDF: Term frequency based
#    BM25: Probabilistic-based
#    SPLADE: Hybrid dense-sparse
SPARSE_MODEL_NAME = "naver/splade_v2_max"
# Tokenizers have no device placement, so no `device` argument is passed here;
# only the model below is moved to DEVICE.
sparse_tokenizer = AutoTokenizer.from_pretrained(SPARSE_MODEL_NAME)
# NOTE(review): AutoModel loads the base encoder only. SPLADE term weights are
# usually derived from masked-LM logits -- confirm the code that consumes
# sparse_model does not expect a `.logits` output.
sparse_model = AutoModel.from_pretrained(SPARSE_MODEL_NAME).to(DEVICE)
# NOTE(review): hand-picked upper bound; presumably meant to cover the
# tokenizer vocabulary -- verify against sparse_tokenizer.vocab_size.
SPARSE_VECTOR_SIZE = 50000  # Sparse embeddings can get very large
print("Sparse model " + SPARSE_MODEL_NAME + " initialized")

# Database
COLLECTION_NAME = "class_materials"

In [None]:
#### Reading Files ####
def read_file(filepath, verbose=True):
    """
    Reads the file at filepath and splits it into chunks.

    The whole file is read as UTF-8 text, wrapped in a single Document and
    passed to the module-level `text_splitter`. Each resulting chunk gets a
    `title` metadata entry of the form "<filepath>_<index>".

    Args:
        filepath (String): path to file to be read
        verbose (bool): when True (the default), print every chunk's text
            followed by a '---' separator line
    Returns:
        list of Document objects, each a chunk of file read
    """
    # The with-block closes the file on exit, so no explicit close() is needed
    with open(filepath, "r", encoding="utf-8") as file:
        content = file.read()

    docs = [Document(page_content=content)]
    chunks = text_splitter.split_documents(docs)

    # Add title to each chunk to use as id in database
    for i, chunk in enumerate(chunks):
        chunk.metadata["title"] = f"{filepath}_{i}"
        if verbose:
            print(chunk.page_content)
            print('---')

    return chunks