In [31]:
import requests
url = 'https://text.npr.org/nx-s1-5227172'
response = requests.get(url)

In [32]:
from bs4 import BeautifulSoup

# Supongamos que `response.text` contiene el HTML de la página
html_content = response.text

# Crear un objeto BeautifulSoup
soup = BeautifulSoup(html_content, 'html.parser')

# Extraer todas las etiquetas <p>
paragraphs = soup.find_all('p')

# Obtener solo el texto de las etiquetas <p>
paragraph_texts = [p.get_text(strip=True) for p in paragraphs]

In [26]:
from langchain_huggingface import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

In [37]:
from langchain_chroma import Chroma


vector_store = Chroma.from_texts(
    texts=paragraph_texts,
    collection_name="microplastics",
    embedding=embeddings,
    persist_directory="/tmp/chroma_microplastics",
)

In [38]:
results = vector_store.similarity_search(
    "What is a microplastic",
    k=3  #number of results
)
for res in results:
    print(res)

page_content='As they push to solve the puzzle of microplastics, here are six questions scientists are trying to answer.'
page_content='Our soil, drinking water and food supply, the air we breathe, all carry microplastics,defined asany plastic particle as small as 1 nanometer and as large as 5 millimeters. Some have built up in the environment over many years, while others arrive daily, as they shed from tires, our clothing, food packaging, personal care products and more.'
page_content='Humans encounter many pollutants over our lifetime. And given that researchers are still sorting out the best models for analyzing microplastics, many are cautious not to get ahead of the data.'


In [45]:
retriever = vector_store.as_retriever(
    search_type="similarity",  search_kwargs={"k": 3}
)

def search_with_retriever(query, top_k):
    try:
        retriever.search_kwargs["k"] = top_k  # Dynamically set the number of results
        results = retriever.get_relevant_documents(query)
        return "\n\n".join(
            [f"**Result {i+1}:**\n{doc.page_content}" for i, doc in enumerate(results)]
        )
    except Exception as e:
        return f"Error: {e}"

In [58]:
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import ChatPromptTemplate
from langchain_ollama.chat_models import ChatOllama
from langchain_core.runnables import RunnableLambda, RunnablePassthrough

# Prompt
template = """Answer the question based only on the following context:
{context}

Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

# Local LLM
ollama_llm = "tinyllama"
model_local = ChatOllama(model=ollama_llm)

# Chain
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | model_local
    | StrOutputParser()
)

In [59]:
chain.invoke("Who is Brander?")

'The question asks for information about a character in the given context, Suzanne Brander.'

In [39]:
import gradio as gr

In [62]:
with gr.Blocks() as demo:
    gr.Markdown("### Chroma Database Search")
    
    with gr.Row():
        query_input = gr.Textbox(label="Enter Your Query", placeholder="Type your question here...")

    search_button = gr.Button("Search")
    output_box = gr.Textbox(label="Search Results", lines=15)

    # Bind the function to the Gradio UI
    search_button.click(fn=chain.invoke, inputs=[query_input], outputs=output_box)

# Launch the App
demo.launch()

* Running on local URL:  http://127.0.0.1:7863

To create a public link, set `share=True` in `launch()`.


