# Install the requirements and import them

In [2]:
!pip install pandas
!pip install huggingface_hub
!pip install datasets
!pip install sentence-transformers
!pip install pinecone-client
!pip install openai

Collecting sentence-transformers
  Downloading sentence_transformers-3.2.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.2.1-py3-none-any.whl (255 kB)
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-3.2.1
Collecting pinecone-client
  Using cached pinecone_client-5.0.1-py3-none-any.whl.metadata (19 kB)
Collecting pinecone-plugin-inference<2.0.0,>=1.0.3 (from pinecone-client)
  Using cached pinecone_plugin_inference-1.1.0-py3-none-any.whl.metadata (2.2 kB)
Collecting pinecone-plugin-interface<0.0.8,>=0.0.7 (from pinecone-client)
  Using cached pinecone_plugin_interface-0.0.7-py3-none-any.whl.metadata (1.2 kB)
Using cached pinecone_client-5.0.1-py3-none-any.whl (244 kB)
Using cached pinecone_plugin_inference-1.1.0-py3-none-any.whl (85 kB)
Using cached pinecone_plugin_interface-0.0.7-py3-none-any.whl (6.2 kB)
Installing collected packages: pinecone-plugin-interface, pinecone-plugin-inference, pinecone-client
Successf

In [3]:
import os
import datasets
from dotenv import load_dotenv
from openai import AzureOpenAI
from pinecone import Pinecone
from pinecone import ServerlessSpec
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


# Load general data

In [4]:
load_dotenv()

SOURCE_LANGUAGE_CODE = "en"
TARGET_LANGUAGE_CODE = "qu"

DATASET_NAME = "pollitoconpapass/eng-quz-translation-dataset"
DATASET_SOURCE_COLUMN_NAME = "English"
DATASET_TARGET_COLUMN_NAME = "Quechua"

EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
PINECONE_INDEX_NAME = os.getenv('PINECONE_INDEX_NAME')

# Generate RAG

In [5]:
dataset = datasets.load_dataset(DATASET_NAME)
model = SentenceTransformer(EMBEDDING_MODEL)
pc = Pinecone(api_key=PINECONE_API_KEY)



In [6]:
'''To create the index in Pinecone, if you already have one created with the same name, it will give you an error. '''

index_name = PINECONE_INDEX_NAME
if index_name not in pc.list_indexes():
    pc.create_index(index_name, dimension=384, metric="cosine", spec=ServerlessSpec(
    cloud="aws",
    region="us-east-1"
  ))

PineconeApiException: (409)
Reason: Conflict
HTTP response headers: HTTPHeaderDict({'content-type': 'text/plain; charset=utf-8', 'access-control-allow-origin': '*', 'vary': 'origin,access-control-request-method,access-control-request-headers', 'access-control-expose-headers': '*', 'x-pinecone-api-version': '2024-07', 'X-Cloud-Trace-Context': 'b0320d534ae6a589e5d99daa7d705b1a', 'Date': 'Fri, 25 Oct 2024 00:14:14 GMT', 'Server': 'Google Frontend', 'Content-Length': '85', 'Via': '1.1 google', 'Alt-Svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000'})
HTTP response body: {"error":{"code":"ALREADY_EXISTS","message":"Resource  already exists"},"status":409}


In [12]:
index = pc.Index(index_name)

### Script above is for making the actual ingestion. 

🤬 Only run it once. ONCE!

In [None]:
import json

json_data = []

id = 0
for entry in dataset["train"]:
  source_sentence = entry[DATASET_SOURCE_COLUMN_NAME]
  source_sentence_embedding = model.encode(source_sentence).tolist() 

  json_entry = {
      "source_language": SOURCE_LANGUAGE_CODE,
      "source_sentence": source_sentence,
      "target_language": TARGET_LANGUAGE_CODE,
      "target_sentence": entry[DATASET_TARGET_COLUMN_NAME],
      "source_sentence_embedding": source_sentence_embedding
  }

  json_data.append(json_entry)
  index.upsert(vectors=[(str(id), source_sentence_embedding, {"source_language": SOURCE_LANGUAGE_CODE, 
                                                              "source_sentence": source_sentence, 
                                                              "target_language": TARGET_LANGUAGE_CODE, 
                                                              "target_sentence": entry[DATASET_TARGET_COLUMN_NAME]})])
  id += 1


with open("json_data_4_quechua_RAG.json", "w", encoding="utf-8") as f: # -> this will generate a JSON file for you to check 
  json.dump(json_data, f, ensure_ascii=False, indent=4)


print("Pinecone index created and populated successfully.")

# Translation Implementation

In [13]:
# === USAGE EXAMPLE ===
def retrieve_similar_sentence(query_sentence):
    query_embedding = model.encode(query_sentence).tolist()

    response = index.query(
        vector=query_embedding,
        top_k=4,
        include_metadata=True
    )
    
    results = []
    for match in response['matches']:
      metadata = match['metadata']
      score = match['score']

      results.append({
          "source_sentence": metadata["source_sentence"],
          "target_sentence": metadata["target_sentence"],
          "score": score
      })
    
    return results

In [None]:
# Test query index function
results = retrieve_similar_sentence("What are your symptoms?")
print(results)

print(results[0]["source_sentence"])
print(results[0]["target_sentence"])

[{'source_sentence': 'What are some of those signs?', 'target_sentence': '¿Imaynatan chayta rikuchiwaqchis?', 'score': 0.620543718}, {'source_sentence': 'Besides pain, what else do you feel?', 'target_sentence': '¿Nanaymanta imatawantaq sintinky?', 'score': 0.525890827}, {'source_sentence': 'Do you have a headache and is it accompanied by nausea?', 'target_sentence': '¿Umaykichu mansunkiy imatawantaq sintinky umayqkichu muyun?', 'score': 0.501809835}, {'source_sentence': 'Do you have fever or chills?', 'target_sentence': "¿Rupaychu jap'isunki o chirichu?", 'score': 0.500708342}]
What are some of those signs?
¿Imaynatan chayta rikuchiwaqchis?


In [15]:
client = AzureOpenAI(
    api_key=os.getenv('AZURE_ASSISTANT_API_KEY'),
    api_version=os.getenv('AZURE_ASSISTANT_API_VERSION'),
    azure_endpoint=os.getenv('AZURE_ASSISTANT_DOMAIN'),
    azure_deployment=os.getenv('AZURE_ASSISTANT_DEPLOYMENT_ID')
)

def translate_sentence(sentence, source_language="english", target_language="quechua"):
  results = retrieve_similar_sentence(sentence)

  prompt = f""" Your task is to translate text from source_language 
                    {source_language} to target_language {target_language} 
                    using provided context details.

                    context:
                    ```
                    source_sentence: {results[0]["source_sentence"]}
                    target_sentence: {results[0]["target_sentence"]}

                    source_sentence: {results[1]["source_sentence"]}
                    target_sentence: {results[1]["target_sentence"]}

                    source_sentence: {results[2]["source_sentence"]}
                    target_sentence: {results[2]["target_sentence"]}
                    ```

                    text:
                    ```
                    {sentence}
                    ```
  """

  response = client.chat.completions.create(
      model="gpt4-o",
      messages=[{"role": "system", "content": prompt},
                {"role": "user", "content": sentence}]
  )

  return response.choices[0].message.content

In [16]:
answer = "Open your mouth. Now take off your clothes, I'm going to examine you"
translation = translate_sentence(answer)
print(translation)

Simiykita kichay. Ch'utikuy p'achaykita cawasayki imaynas cashanky.
