In [1]:
from typing import List
from decouple import config
from langchain_nomic.embeddings import NomicEmbeddings
from langchain_community.vectorstores import UpstashVectorStore
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import TokenTextSplitter
from langchain_core.documents import Document
from upstash_vector import Index, Vector
from langchain_community.embeddings.yandex import YandexGPTEmbeddings
import wikipedia

In [2]:
# Point the `wikipedia` client at the "ru" language edition for all later calls.
wikipedia.set_lang("ru")

In [3]:
# Settings for the Nomic embedding client; the API key is read through
# decouple's `config` so it never appears in the notebook.
NOMIC_API_KEY = config('NOMIC_API_KEY')
EMBED_CONFIG = {
    "nomic_api_key": NOMIC_API_KEY,
    "model": 'nomic-embed-text-v1.5',
    'dimensionality': 768,
}
# The original name was misspelled; keep it as an alias so cells that still
# reference `EMBED_CONDIG` continue to work.
EMBED_CONDIG = EMBED_CONFIG

In [3]:
# Connection settings for the Upstash Vector index, read through decouple's
# `config`; the resulting mapping is unpacked into the vector store constructor.
UPSTASH_VECTOR_REST_URL = config("UPSTASH_VECTOR_REST_URL")
UPSTASH_VECTOR_REST_TOKEN = config("UPSTASH_VECTOR_REST_TOKEN")
STORE_CONFIG = dict(
    index_url=UPSTASH_VECTOR_REST_URL,
    index_token=UPSTASH_VECTOR_REST_TOKEN,
)

In [4]:
# Yandex Cloud credentials and embedding options, read through decouple's `config`.
YC_FOLDER_ID = config('YC_FOLDER_ID')
YC_API_KEY = config('YC_API_KEY')
# NOTE(review): "doc_model_name" is set to the *query* model name here —
# confirm this is intended for document embeddings.
YA_EMB_CONFIG = dict(
    api_key=YC_API_KEY,
    folder_id=YC_FOLDER_ID,
    doc_model_name="text-search-query",
)

In [5]:
# Yandex embedding client used by the vector store.
# - Credentials are passed explicitly: they were loaded with decouple's
#   `config`, and a bare constructor call would depend on the library finding
#   them in the process environment on its own.
# - `sleep_interval` pauses between embedding requests; the API reported a
#   quota of 10 requests per second when the documents were uploaded.
# - Model names are left at the library defaults, as in the bare call.
embeddings = YandexGPTEmbeddings(
    api_key=YC_API_KEY,
    folder_id=YC_FOLDER_ID,
    sleep_interval=0.1,
)

In [5]:
# Build the Nomic embedding client from the settings in EMBED_CONDIG.
# NOTE(review): no cell visible here uses `embed_model` (the vector store is
# built with `embeddings`) — confirm whether this client is still needed.
embed_model = NomicEmbeddings(**EMBED_CONDIG)

In [6]:
# Vector store backed by the Upstash index described in STORE_CONFIG; texts are
# embedded with the `embeddings` client.
store = UpstashVectorStore(embedding=embeddings, **STORE_CONFIG)

In [25]:
# Exploratory lookup: list the article titles Wikipedia returns for this query.
wikipedia.search('Нью ЙОрк')

['Нью-Йорк',
 'Нью-Йорк (штат)',
 'Манхэттен',
 'Нью-Йорк (Украина)',
 'Нью-Йорк Янкис',
 'Нью-Йорк Рейнджерс',
 'Банды Нью-Йорка',
 'Нью-Йорк, я люблю тебя',
 'Нью-Йорк Айлендерс',
 'Нью-Йорк Никс']

In [7]:
# Fetch the Wikipedia article for every city and wrap it in a LangChain
# `Document`: the article text is the content, and the metadata records the
# article URL plus the city name that was requested.
cities = ["Чита", "Нью-Йорк"]
documents = []
for city in cities:
    wikipedia_page_result = wikipedia.page(title=city)
    documents.append(
        Document(
            page_content=wikipedia_page_result.content,
            metadata={"source": f"{wikipedia_page_result.url}", "title": city},
        )
    )

In [8]:
# Token-based splitter: chunks of 400 tokens with 100 tokens of overlap, with
# the tiktoken encoding selected from the model name below.
OPENAI_LLM_MODEL = "gpt-4o"
text_splitter = TokenTextSplitter.from_tiktoken_encoder(
    chunk_size=400,
    chunk_overlap=100,
    model_name=OPENAI_LLM_MODEL,
)

In [9]:
# Split the fetched articles into chunks using `text_splitter`.
docs = text_splitter.split_documents(documents)

In [13]:
import time

In [14]:
def request_with_delay(func, *args, **kwargs):
    """Call ``func`` after a fixed 0.1-second pause.

    Args:
        func: Callable to invoke.
        *args: Positional arguments forwarded to ``func``.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        Whatever ``func(*args, **kwargs)`` returns.
    """
    # Sleep first so that back-to-back calls are spaced out.
    time.sleep(0.1)
    outcome = func(*args, **kwargs)
    return outcome

In [17]:
# Upload the chunks to the vector store in fixed-size batches and collect what
# `add_documents` returns for each batch (the inserted ids, per LangChain's
# VectorStore API).
BATCH_SIZE = 10
inserted_vectors = []
for i in range(0, len(docs), BATCH_SIZE):
    docs_batch = docs[i:i + BATCH_SIZE]
    inserted_batch = store.add_documents(docs_batch)
    # Fix: the original appended `inserted_vectors` to itself, which discarded
    # the batch result and made the list self-referential.
    inserted_vectors.extend(inserted_batch)

Retrying langchain_community.embeddings.yandex._embed_with_retry.<locals>._completion_with_retry in 1.0 seconds as it raised _InactiveRpcError: <_InactiveRpcError of RPC that terminated with:
	status = StatusCode.RESOURCE_EXHAUSTED
	details = "ai.embeddingsTextEmbeddingRequestsPerSecond.rate rate quota limit exceed: allowed 10 requests"
	debug_error_string = "UNKNOWN:Error received from peer ipv4:158.160.54.160:443 {created_time:"2024-12-05T18:05:38.5022347+00:00", grpc_status:8, grpc_message:"ai.embeddingsTextEmbeddingRequestsPerSecond.rate rate quota limit exceed: allowed 10 requests"}"
>.


In [24]:
# Retrieve the 5 stored chunks most similar to the query.
result = store.similarity_search("Небоскребы", k=5)

In [30]:
# Retrieve the 5 closest chunks together with their similarity scores and print
# one "<title> - <score>" line per hit.
result = store.similarity_search_with_score("Буряты", k=5)
for pair in result:
    doc, score = pair
    print(f"{doc.metadata['title']} - {score}")

Чита - 0.6985642
Чита - 0.6720789
Чита - 0.65302455
Чита - 0.64623934
Чита - 0.6457484
