In [2]:
import chromadb
import boto3
import os
from botocore.config import Config

In [3]:
chroma_client = chromadb.PersistentClient(path='/app/storage/db')

collection = chroma_client.get_collection(name='default')


#print(os.getenv('AWS_PROFILE'))
#os.environ["AWS_PROFILE"] = "SANDBOX_AI-Developer"

# Configuración del cliente de Amazon Bedrock
bedrock_client = boto3.client('bedrock-runtime', region_name='us-east-1', )
model_id = 'cohere.embed-multilingual-v3'

In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter


def extract_keywords_nltk(text, num_keywords=5):
    # Tokenizar el texto
    tokens = word_tokenize(text.lower())
    

    # Eliminar stop words
    stop_words = set(stopwords.words('spanish'))
    filtered_tokens = [word for word in tokens if word.isalpha() and word not in stop_words]

    # Contar la frecuencia de las palabras
    word_freq = Counter(filtered_tokens)

    # Obtener las palabras más comunes como palabras clave
    keywords = [word for word, _ in word_freq.most_common(num_keywords)]
    return keywords

In [5]:
import json



def get_embeddings( texts):
        
        
        if not isinstance(texts, list) or not all(isinstance(text, str) for text in texts):
            raise ValueError("texts debe ser una lista de cadenas.")

        try:
            # Serializar el cuerpo de la solicitud a JSON
            payload = json.dumps({'texts': texts, 'input_type':'search_query'})

            # Enviar la solicitud al modelo
            response = bedrock_client.invoke_model(
                modelId=model_id,
                body=payload,
                contentType='application/json'
            )

            # Leer y procesar la respuesta
            response_body = response['body'].read().decode('utf-8')
            result = json.loads(response_body)

            embeddings = result.get('embeddings', [])
            if embeddings:
                return (embeddings)
            else:
                print("No se encontraron embeddings en la respuesta.")
                return None

        except json.JSONDecodeError as json_error:
            print(f"Error al procesar JSON: {json_error}")
            return None
        except KeyError as key_error:
            print(f"Error en la respuesta de la API: {key_error}")
            return None
        except Exception as e:
            print(f"Error al obtener embeddings: {e}")
            return None

In [20]:

import numpy as np


aws_embeddings =get_embeddings(texts=['aws'])
print(get_embeddings(texts=['aws']))

pascal_embed = get_embeddings(texts=['nuclear'])
print(get_embeddings(texts=['pascal']))

nparray = np.array(pascal_embed)
print(type(nparray))

results = collection.query(
                query_embeddings=nparray,
                n_results=10,
                include=["metadatas", "documents", "distances","embeddings"]
            )

print(results)
# results = collection.query(
#     query_texts=['aws'],
#     n_results=10
# )

# print(collection.get())


# # for result in results['documents'][0]:
# #     print(extract_keywords_nltk(result))


# #print([ chunk for chunk in results['documents'][0]])

[[0.01979065, 0.020263672, -0.009628296, 0.0317688, -0.012825012, -0.008110046, 0.00015330315, -0.03213501, -0.058502197, -0.009681702, 0.0007920265, -0.057159424, 0.032562256, -0.029708862, 0.009979248, -0.03100586, 0.02758789, -0.012321472, 0.027740479, -0.011642456, -0.030136108, 0.017089844, 0.047821045, -0.022521973, -0.010719299, 0.04055786, 0.04498291, -0.04385376, 0.016220093, -0.016311646, -0.03286743, -0.010269165, 0.023910522, 0.02305603, 0.004245758, 0.014053345, 0.029159546, -0.010719299, 0.034484863, -0.0038700104, 0.03050232, -0.021759033, -0.016143799, 0.008834839, -0.057739258, 0.0038070679, 0.02166748, 0.01184845, 0.030197144, 0.018478394, -0.022247314, 0.0236969, 0.0019512177, -0.0046806335, 0.026779175, 0.0006966591, 0.016830444, -0.036956787, 0.005771637, 0.00995636, -0.017974854, -0.08984375, -0.08282471, 0.0046806335, 0.036712646, 0.02822876, 0.026123047, 0.05731201, 0.021591187, 0.034698486, 0.03024292, 0.046661377, 0.046539307, 0.0231781, 0.04244995, -0.0156860

Number of requested results 10 is greater than number of elements in index 2, updating n_results = 2


[[0.019317627, -0.015960693, -0.03857422, 0.03515625, -0.0076293945, 0.023971558, 0.016357422, -0.008705139, -0.032592773, 0.008796692, -0.016113281, -0.037872314, 0.03463745, -0.015960693, 0.011199951, -0.013961792, -0.011627197, 0.0011329651, -0.03201294, -0.014923096, -0.0012893677, 0.022033691, 0.015792847, 0.019561768, 0.024642944, -0.012336731, 0.026550293, 0.027114868, -0.009857178, -0.035247803, -0.051849365, 0.0058403015, 0.010429382, 0.013877869, 0.003320694, -0.003917694, 0.04147339, -0.01776123, 0.014350891, 0.012016296, 0.02760315, -0.017501831, -0.018173218, 0.037353516, -0.031021118, 0.011100769, 0.013931274, 0.022354126, 0.028152466, 0.018432617, 0.002922058, -0.020202637, 0.004512787, 0.015510559, 0.038146973, 0.0043525696, -0.014228821, -0.05166626, 0.021728516, -0.021347046, -0.006061554, -0.01776123, -0.058258057, 0.020889282, -0.02255249, -0.0046463013, 0.017318726, 0.03656006, 0.012084961, 0.018997192, 0.022140503, 0.08648682, -0.01259613, 0.010276794, -0.02729797