In [22]:
!pip install --upgrade pandas pyarrow datasets
!pip install pinecone-client sentence-transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [23]:
import os
import json
from dotenv import load_dotenv
from pinecone import Pinecone
from sentence_transformers import SentenceTransformer

In [24]:
load_dotenv("../.env")

True

In [25]:
# === GENERAL CONFIG ===
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

## For Spanish to Quechua

In [26]:
FIRST_PINECONE_INDEX_NAME = "spa-quz-translation-index"
FIRST_SRC_LANGUAGE_KEY= "spanish"
FIRST_TGT_LANGUAGE_KEY = "cuzco quechua"

In [27]:
model = SentenceTransformer(EMBEDDING_MODEL)
pc = Pinecone(api_key=PINECONE_API_KEY)

In [28]:
first_index = pc.Index(FIRST_PINECONE_INDEX_NAME)
initial_stats = first_index.describe_index_stats()
print(f"Initial index stats: {initial_stats}")

Initial index stats: {'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 106432}},
 'total_vector_count': 106432}


In [42]:
id = 0
with open('./words.json', 'r') as f:
    words = json.load(f)

    for word in words:
        source_sentence = word[FIRST_SRC_LANGUAGE_KEY]
        target_sentence = word[FIRST_TGT_LANGUAGE_KEY]
        source_sentence_embedding = model.encode(source_sentence).tolist()

        print(f"Source sentence: {source_sentence}")
        print(f"Target sentence: {target_sentence}")

        response = first_index.upsert(vectors=[(str(id), source_sentence_embedding, {"source_language": FIRST_SRC_LANGUAGE_KEY, 
                                                              "source_sentence": source_sentence, 
                                                              "target_language": FIRST_TGT_LANGUAGE_KEY, 
                                                              "target_sentence": target_sentence})])
        
        id+=1
        print(f"Upsert response for ID {id}: {response}")

print("Pinecone index created and populated successfully.")

Source sentence: Nombre de los colores en quechua
Target sentence: Llimpikunapa sutin: Puka, Q'ello, Anqas, Q'omer, Yana, Yuraq
Upsert response for ID 1: {'upserted_count': 1}
Pinecone index created and populated successfully.


In [43]:
stats = first_index.describe_index_stats()
print(f"Final index stats: {stats}")

Final index stats: {'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 106432}},
 'total_vector_count': 106432}


In [44]:
query_embedding = model.encode("Nombre de colores en quechua").tolist()
query_response = first_index.query(
    vector=query_embedding,
    top_k=4,
    include_metadata=True
)
print(f"Query response: {query_response}")

Query response: {'matches': [{'id': '0',
              'metadata': {'source_language': 'spanish',
                           'source_sentence': 'Nombre de los colores en '
                                              'quechua',
                           'target_language': 'cuzco quechua',
                           'target_sentence': 'Llimpikunapa sutin: Puka, '
                                              "Q'ello, Anqas, Q'omer, Yana, "
                                              'Yuraq'},
              'score': 0.98862958,
              'values': []},
             {'id': '93349',
              'metadata': {'source_language': 'spanish',
                           'source_sentence': 'Cómo nos influyen los colores, '
                                              '1 / 10',
                           'target_language': 'cuzco quechua',
                           'target_sentence': 'Makilla kaychis Diosmanta '
                                              'mañakunaykichispaq ”, 15 / 1

## For the other way around, Quechua to Spanish

In [45]:
NEW_PINECONE_INDEX_NAME = "quz-spa-translation-index"
NEW_SRC_LANGUAGE_KEY= "cuzco quechua"
NEW_TGT_LANGUAGE_KEY = "spanish"

In [46]:
new_index = pc.Index(NEW_PINECONE_INDEX_NAME)

In [47]:
id = 0
with open('./words.json', 'r') as f:
    words = json.load(f)

    for word in words:
        source_sentence = word[NEW_SRC_LANGUAGE_KEY]
        target_sentence = word[NEW_TGT_LANGUAGE_KEY]
        source_sentence_embedding = model.encode(source_sentence).tolist()

        print(f"Source sentence: {source_sentence}")
        print(f"Target sentence: {target_sentence}")

        response = new_index.upsert(vectors=[(str(id), source_sentence_embedding, {"source_language": NEW_SRC_LANGUAGE_KEY, 
                                                              "source_sentence": source_sentence, 
                                                              "target_language": NEW_TGT_LANGUAGE_KEY, 
                                                              "target_sentence": target_sentence})])
        
        id+=1
        print(f"Upsert response for ID {id}: {response}")

print("Pinecone index created and populated successfully.")

Source sentence: Llimpikunapa sutin: Puka, Q'ello, Anqas, Q'omer, Yana, Yuraq
Target sentence: Nombre de los colores en quechua
Upsert response for ID 1: {'upserted_count': 1}
Pinecone index created and populated successfully.


In [48]:
query_embedding = model.encode("Llinphikunata: Puka, Q'ello, Anqas, Q'omer, Yana, Yuraq").tolist()
query_response = new_index.query(
    vector=query_embedding,
    top_k=4,
    include_metadata=True
)
print(f"Query response: {query_response}")

Query response: {'matches': [{'id': '2',
              'metadata': {'source_language': 'cuzco quechua',
                           'source_sentence': "Llinphikunata: Puka, Q'ello, "
                                              "Anqas, Q'omer, Yana, Yuraq",
                           'target_language': 'spanish',
                           'target_sentence': 'Los colores son: rojo, '
                                              'amarillo, azul, verde, negro y '
                                              'blanco'},
              'score': 1.00000012,
              'values': []},
             {'id': '539',
              'metadata': {'source_language': 'cuzco quechua',
                           'source_sentence': 'LLINPHIKUNA  Manan wakin '
                                              "llinphikunata mich'uspa "
                                              "horqonchismanchu q'ello, puka, "
                                              'anqas    Iskay ñeqe llinphikuna '
              