In [1]:
data_path = 'Hindi-Aesthetics-Corpus/Corpus'
chunk_size = 500
chunk_overlap = 50
batch_size = 4000
host = 'localhost'
port = 6333

In [2]:
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    length_function=len,
    is_separator_regex=False,
)

loader = DirectoryLoader(data_path, loader_cls=TextLoader)
docs = loader.load_and_split(text_splitter=text_splitter)

In [None]:
import fasttext as ft
embed_model = ft.load_model('wiki.hi.bin')

In [3]:
import pandas as pd

data = []
for doc in docs:
    row_data = {
        "page_content": doc.page_content,
        "metadata": doc.metadata
    }
    data.append(row_data)

df = pd.DataFrame(data)

In [4]:
df['page_content'] = df['page_content'].replace('\\n', ' ', regex=True)

In [6]:
df['id'] = range(1, len(df) + 1)
df['payload'] = df[['page_content', 'metadata']].to_dict(orient='records')
df['embeddings'] = df['page_content'].apply(lambda x: (embed_model.get_sentence_vector(x)).tolist())

In [7]:
from qdrant_client import QdrantClient
client = QdrantClient(host=host, port=port)

In [8]:
from qdrant_client.models import VectorParams, Distance
client.delete_collection(collection_name="my_collection")
client.recreate_collection(
   collection_name="my_collection",
   vectors_config=VectorParams(size=300, distance=Distance.COSINE),
)

True

In [9]:
from qdrant_client.models import Batch

In [11]:
client.upsert(
 collection_name="my_collection",
 points=Batch(
     ids=df['id'].to_list()[:batch_size],
     payloads=df['payload'][:batch_size],
     vectors=df['embeddings'].to_list()[:batch_size],
 ),
)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [12]:
client.close()