In [1]:
# !pip install langchain transformers qdrant-client accelerate torch bitsandbytes

In [2]:
from langchain_community.llms import Ollama

# Create the LangChain Ollama wrapper for the "llama2" model; `llm` is reused by later cells.
# NOTE(review): presumably requires a running Ollama server with the model already pulled — confirm.
llm = Ollama(model="llama2")


In [3]:
# Smoke test: send one prompt to the LLM; the returned text is displayed as the cell output.
llm.invoke("Tell me a joke")

"\nSure, here's one:\n\nWhy don't scientists trust atoms?\n\nBecause they make up everything!\n\nI hope you found that amusing! Do you want to hear another one?"

In [4]:
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter configuration: chunks of up to 1000 characters (measured with
# `len`), 20 characters of overlap between neighbouring chunks, and
# separators treated as literal strings rather than regular expressions.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=20,
    length_function=len,
    is_separator_regex=False,
)

# Load every file under the corpus directory with TextLoader.
# The encoding is passed explicitly: without it TextLoader decodes with the
# platform default, which breaks on Devanagari text wherever that default
# is not UTF-8 (e.g. Windows).
# NOTE(review): assumes the corpus files are UTF-8 encoded — confirm.
loader = DirectoryLoader(
    'Hindi-Aesthetics-Corpus/Corpus',
    loader_cls=TextLoader,
    loader_kwargs={"encoding": "utf-8"},
)

# `docs` holds the chunked documents used by the cells below.
docs = loader.load_and_split(text_splitter=text_splitter)

In [44]:
import fasttext as ft
# Loading the pre-trained fastText model for Hindi from 'wiki.hi.bin'
# (a path relative to the working directory).
# NOTE: `embed_model` is rebound several times further down the notebook.
embed_model = ft.load_model('wiki.hi.bin')



In [46]:
import fasttext
from huggingface_hub import hf_hub_download

# Fetch "model.bin" from the "facebook/fasttext-hi-vectors" Hub repository and
# load it with fastText; this rebinds `embed_model`.
# NOTE(review): the recorded progress bar reports a 6.96G file stuck at 0%, so
# the download may not have completed when this notebook was saved.
model_path = hf_hub_download(repo_id="facebook/fasttext-hi-vectors", filename="model.bin")
embed_model = fasttext.load_model(model_path)

model.bin:   0%|          | 0.00/6.96G [00:00<?, ?B/s]



In [13]:
from transformers import AutoModel, AutoTokenizer
import torch

# Hugging Face model identifier shared by the tokenizer and the model below
# (and read again by later cells through `path`).
path = 'BAAI/bge-large-en'

# Load the matching tokenizer, then the model itself with hidden states
# enabled in its outputs. This rebinds `embed_model`.
tokenizer = AutoTokenizer.from_pretrained(path)
embed_model = AutoModel.from_pretrained(
    path,
    output_hidden_states=True,
)

In [29]:
from langchain.embeddings import HuggingFaceBgeEmbeddings

In [48]:
# Rebind `embed_model` to LangChain's BGE embeddings wrapper for the model named by `path`.
# NOTE(review): `path` is set to 'BAAI/bge-large-en' earlier in the notebook; the "-en" suffix
# suggests an English model while the corpus is Hindi — confirm this is intended.
embed_model = HuggingFaceBgeEmbeddings(model_name=path)

In [49]:
# Qdrant endpoint and collection name used when the documents are indexed below.
# NOTE(review): port 63333 differs from Qdrant's usual REST port 6333 — confirm this is a
# deliberate port mapping and not a typo.
url = "http://localhost:63333"
collections_name = "hindi_aesthetics"

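**TODO:** We still need a way to plug the fastText embedding model into Qdrant (for example, by wrapping it so that it can be passed wherever the vector store expects an embedding model). Once that is in place, the Hindi corpus can be indexed with fastText vectors.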

In [15]:
from langchain.vectorstores import Qdrant

In [22]:
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import Qdrant
# from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import CharacterTextSplitter

In [25]:
# Load a single text file and split it into chunks.
# NOTE: this rebinds `loader`, `text_splitter` and `docs`, replacing the corpus
# chunks produced by the earlier DirectoryLoader cell; the indexing cell below
# therefore operates on the chunks of "test.txt".
loader = TextLoader("test.txt")
documents = loader.load()
# Requested chunk size is 500 with an overlap of 250. The recorded output warns
# that chunks of size 1309 and 874 were created, i.e. some chunks exceed the
# requested size.
text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=250)
docs = text_splitter.split_documents(documents)


Created a chunk of size 1309, which is longer than the specified 500
Created a chunk of size 874, which is longer than the specified 500


In [50]:
# Embed `docs` with `embed_model` and store them in the `collections_name` collection of the
# Qdrant instance at `url`, with gRPC preference disabled (prefer_grpc=False).
qdrant = Qdrant.from_documents(docs, embed_model, url=url, collection_name=collections_name, prefer_grpc=False)

In [None]:
import pandas as pd

# Flatten the chunked documents into one record per chunk: its text and its
# metadata.
data = [
    {"page_content": chunk.page_content, "metadata": chunk.metadata}
    for chunk in docs
]
df = pd.DataFrame(data)

# Replace every newline in the chunk text with a single space (the pattern is
# applied as a regular expression).
df['page_content'] = df['page_content'].replace('\\n', ' ', regex=True)

In [None]:
# Compute one embedding (a plain list of floats) per chunk and store it in the
# 'embeddings' column.
#
# `embed_model` is rebound several times in this notebook, so dispatch on the
# interface it actually offers instead of assuming a fastText model:
#   * fastText models expose `get_sentence_vector`;
#   * LangChain embedding wrappers expose `embed_documents`.
if hasattr(embed_model, "get_sentence_vector"):
    # fastText processes one line at a time and raises on text containing a
    # newline, so newlines are replaced with spaces before embedding.
    df['embeddings'] = df['page_content'].apply(
        lambda x: embed_model.get_sentence_vector(x.replace("\n", " ")).tolist()
    )
else:
    # Embed all chunks in one batched call; wrapping the result in a Series
    # keeps each vector as a single cell aligned with the frame's index.
    df['embeddings'] = pd.Series(
        embed_model.embed_documents(df['page_content'].tolist()),
        index=df.index,
    )

In [None]:
# Assign sequential integer ids starting at 1, one per row.
df['id'] = range(1, len(df) + 1)

In [None]:
# Build one dict per row containing only the 'page_content' and 'metadata' values.
payload = df[['page_content', 'metadata']].to_dict(orient='records')

In [None]:
from qdrant_client import QdrantClient
# Create a Qdrant client with location=':memory:' (an in-memory instance rather than a remote
# server). NOTE(review): data stored this way presumably does not outlive the kernel — confirm.
client = QdrantClient(location=':memory:')

In [None]:
from langchain.vectorstores import Qdrant

In [None]:
from qdrant_client.models import Distance, PointStruct, VectorParams

# Upload the precomputed vectors straight through the Qdrant client.
#
# The previous call passed the 'embeddings' column as `embedding=`, but
# LangChain's `Qdrant.from_documents` expects an embeddings *model* there, has
# no `index_name` parameter, and was given no server/location. Since the ids,
# vectors and payloads are already prepared in `df` / `payload`, they are
# written with the `client` created above instead.
collection = "hindi_aesthetics_corpus"

if df.empty:
    raise ValueError("No rows to index: `df` is empty.")

# Create the collection once; the vector size is taken from the first row.
# NOTE: `collection_exists` requires a reasonably recent qdrant-client.
if not client.collection_exists(collection):
    client.create_collection(
        collection_name=collection,
        vectors_config=VectorParams(
            size=len(df['embeddings'].iloc[0]),
            distance=Distance.COSINE,
        ),
    )

# Upserting with fixed ids keeps the cell idempotent: re-running overwrites the
# same points instead of duplicating them. The payload keys ('page_content',
# 'metadata') are the ones LangChain's Qdrant wrapper reads by default.
client.upsert(
    collection_name=collection,
    points=[
        # Cast the id to a built-in int; the DataFrame column holds numpy integers.
        PointStruct(id=int(point_id), vector=vector, payload=point_payload)
        for point_id, vector, point_payload in zip(df['id'], df['embeddings'], payload)
    ],
)

In [None]:

import os

from langchain.embeddings import HuggingFaceEmbeddings
from langchain.embeddings.base import Embeddings

import fasttext as ft


class FastTextEmbeddings(Embeddings):
    """Adapter that lets a loaded fastText model act as a LangChain embeddings model.

    LangChain vector stores call `embed_documents` / `embed_query`; fastText
    offers `get_sentence_vector`. This class bridges the two.
    """

    def __init__(self, model):
        """Store the already-loaded fastText model."""
        self.model = model

    def _embed(self, text):
        """Return the sentence vector of `text` as a plain list of floats."""
        # fastText processes one line at a time and raises on newlines.
        return self.model.get_sentence_vector(text.replace("\n", " ")).tolist()

    def embed_documents(self, texts):
        """Embed each text in `texts`; returns one vector per input text."""
        return [self._embed(text) for text in texts]

    def embed_query(self, text):
        """Embed a single query string."""
        return self._embed(text)


# Loading the pre-trained fastText model for Hindi.
embed_model = ft.load_model('wiki.hi.bin')

# `texts` and `embeddings` were previously undefined; derive them from the
# chunked documents and the fastText model.
texts = [doc.page_content for doc in docs]
embeddings = FastTextEmbeddings(embed_model)

# Connection settings come from the environment instead of placeholder strings,
# so no credentials are stored in the notebook:
#   QDRANT_URL     - required, e.g. "http://localhost:6333"
#   QDRANT_API_KEY - optional, only needed for secured deployments
doc_store = Qdrant.from_texts(
    texts,
    embeddings,
    metadatas=[doc.metadata for doc in docs],
    url=os.environ["QDRANT_URL"],
    api_key=os.environ.get("QDRANT_API_KEY"),
    collection_name="texts",
)

In [None]:
# !pip install fasttext
# !wget https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.hi.zip
# !unzip wiki.hi.zip

In [None]:
# import fasttext.util
# fasttext.util.download_model('hi', if_exists='ignore')  # Hindi
# ft = fasttext.load_model('cc.hi.300.bin')