In [3]:
import os
from dotenv import load_dotenv
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import chromadb
import chromadb.utils.embedding_functions as embedding_functions
import bs4
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [5]:
load_dotenv()

True

In [6]:
os.environ["LANGCHAIN_TRACING_V2"] = os.getenv("LANGCHAIN_TRACING_V2")
os.environ["LANGCHAIN_PROJECT"] = os.getenv("LANGCHAIN_PROJECT")
os.environ["LANGCHAIN_API_KEY"] = os.getenv("LANGCHAIN_API_KEY")
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
os.environ["TAVILY_API_KEY"] = os.getenv("TAVILY_API_KEY")
os.environ["GOOGLE_API_KEY"] = os.getenv("GOOGLE_API_KEY")

In [7]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Miguel\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Miguel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
# Load, chunk and index the contents of the blog.
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2024-02-05-human-data-quality/",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)
docs = loader.load()

In [9]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

In [10]:
splits = text_splitter.split_documents(docs)

In [11]:
splits[0].metadata['source']

'https://lilianweng.github.io/posts/2024-02-05-human-data-quality/'

In [12]:
#genera dataframes con el page content de cada split
metadata = [split.page_content for split in splits]
source = [split.metadata['source'] for split in splits]

In [13]:
df_texto = pd.DataFrame(metadata, columns=["page_content"])
df_metadata = pd.DataFrame(source, columns=["metadata"])

In [14]:
df_texto["metadata"] = df_metadata["metadata"]
df_texto.sample(5)

Unnamed: 0,page_content,metadata
33,"Pleiss, et al. (2020) developed a method named...",https://lilianweng.github.io/posts/2024-02-05-...
13,"Later, Rottger et al. (2021) formulated the di...",https://lilianweng.github.io/posts/2024-02-05-...
3,"Almost 100 years later, Callison-Burch (2009) ...",https://lilianweng.github.io/posts/2024-02-05-...
20,Baseline: Directly predict the majority vote $...,https://lilianweng.github.io/posts/2024-02-05-...
40,[13] Koh & Liang. “Understanding Black-box Pre...,https://lilianweng.github.io/posts/2024-02-05-...


In [15]:
def limpiar_y_tokenizar(texto):
    # Convertir a minúsculas
    texto = texto.lower()
    
    # Eliminar URLs
    texto = re.sub(r'http\S+|www\S+|https\S+', '', texto, flags=re.MULTILINE)
    
    # Eliminar saltos de línea
    texto = texto.replace('\n', ' ')
    
    # Eliminar caracteres especiales y números
    texto = re.sub(r'[^a-z\s]', ' ', texto)
    
    # Tokenizar el texto
    tokens = word_tokenize(texto)
    
    # Eliminar stopwords
    stop_words = set(stopwords.words('spanish'))
    tokens_limpios = [token for token in tokens if token not in stop_words]
    
    #une el texto tokenizado
    tokens_limpios = ' '.join(tokens_limpios)
    
    return tokens_limpios

In [16]:
df_texto['tokens'] = df_texto['page_content'].apply(limpiar_y_tokenizar)

In [17]:
df_texto.sample(2)

Unnamed: 0,page_content,metadata,tokens
13,"Later, Rottger et al. (2021) formulated the di...",https://lilianweng.github.io/posts/2024-02-05-...,later rottger et formulated the difference int...
29,"Fig. 11. Data map for SNLI training set, based...",https://lilianweng.github.io/posts/2024-02-05-...,fig data map for snli training set based on ro...


In [18]:
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
    api_key=os.getenv("OPENAI_API_KEY"),
    model_name="text-embedding-3-small"
)

In [19]:
chroma_client = chromadb.HttpClient(host='localhost', port=8000)

In [20]:
collection = chroma_client.get_or_create_collection('blog', embedding_function=openai_ef)

In [21]:
df_texto = df_texto.assign(id=range(1, len(df_texto) + 1))
df_texto.sample(2)

Unnamed: 0,page_content,metadata,tokens,id
35,Create a subset of threshold samples $\mathcal...,https://lilianweng.github.io/posts/2024-02-05-...,create subset of threshold samples mathcaldtex...,36
16,Agreement rate differs a lot across different ...,https://lilianweng.github.io/posts/2024-02-05-...,agreement rate differs lot across different to...,17


In [22]:
collection.upsert(
    documents=df_texto['tokens'].to_list(),
    metadatas=df_texto.to_dict(orient='records'),
    ids=df_texto['id'].astype(str).to_list()
    )