In [1]:
import os
from dotenv import load_dotenv
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import chromadb
import chromadb.utils.embedding_functions as embedding_functions
import bs4
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [2]:
load_dotenv()

True

In [3]:
os.environ["LANGCHAIN_TRACING_V2"] = os.getenv("LANGCHAIN_TRACING_V2")
os.environ["LANGCHAIN_PROJECT"] = os.getenv("LANGCHAIN_PROJECT")
os.environ["LANGCHAIN_API_KEY"] = os.getenv("LANGCHAIN_API_KEY")
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
os.environ["TAVILY_API_KEY"] = os.getenv("TAVILY_API_KEY")
os.environ["GOOGLE_API_KEY"] = os.getenv("GOOGLE_API_KEY")

In [4]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Miguel\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Miguel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
# Load, chunk and index the contents of the blog.
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2024-02-05-human-data-quality/",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)
docs = loader.load()

In [6]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

In [7]:
splits = text_splitter.split_documents(docs)

In [8]:
splits[0].metadata['source']

'https://lilianweng.github.io/posts/2024-02-05-human-data-quality/'

In [9]:
#genera dataframes con el page content de cada split
metadata = [split.page_content for split in splits]
source = [split.metadata['source'] for split in splits]

In [10]:
df_texto = pd.DataFrame(metadata, columns=["page_content"])
df_metadata = pd.DataFrame(source, columns=["metadata"])

In [11]:
df_texto["metadata"] = df_metadata["metadata"]
df_texto.sample(5)

Unnamed: 0,page_content,metadata
5,Fig. 3. (Left) The agreement rate is measured ...,https://lilianweng.github.io/posts/2024-02-05-...
22,Fig. 7. Illustration of how jury learning work...,https://lilianweng.github.io/posts/2024-02-05-...
35,Create a subset of threshold samples $\mathcal...,https://lilianweng.github.io/posts/2024-02-05-...
40,[13] Koh & Liang. “Understanding Black-box Pre...,https://lilianweng.github.io/posts/2024-02-05-...
10,Either EM (Expectation–maximization) or VB (Va...,https://lilianweng.github.io/posts/2024-02-05-...


In [12]:
def limpiar_y_tokenizar(texto):
    # Convertir a minúsculas
    texto = texto.lower()
    
    # Eliminar URLs
    texto = re.sub(r'http\S+|www\S+|https\S+', '', texto, flags=re.MULTILINE)
    
    # Eliminar saltos de línea
    texto = texto.replace('\n', ' ')
    
    # Eliminar caracteres especiales y números
    texto = re.sub(r'[^a-z\s]', ' ', texto)
    
    # Tokenizar el texto
    tokens = word_tokenize(texto)
    
    # Eliminar stopwords
    stop_words = set(stopwords.words('spanish'))
    tokens_limpios = [token for token in tokens if token not in stop_words]
    
    #une el texto tokenizado
    tokens_limpios = ' '.join(tokens_limpios)
    
    return tokens_limpios

In [13]:
df_texto['tokens'] = df_texto['page_content'].apply(limpiar_y_tokenizar)

In [14]:
df_texto.sample(2)

Unnamed: 0,page_content,metadata,tokens
14,Pros\n- Can help to identify which entries are...,https://lilianweng.github.io/posts/2024-02-05-...,pros can help to identify which entries are mo...
1,Fig. 1. Two directions to approach high data q...,https://lilianweng.github.io/posts/2024-02-05-...,fig two directions to approach high data quali...


In [15]:
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
    api_key=os.getenv("OPENAI_API_KEY"),
    model_name="text-embedding-3-small"
)

In [16]:
chroma_client = chromadb.HttpClient(host='localhost', port=8000)

In [17]:
collection = chroma_client.get_or_create_collection('blog', embedding_function=openai_ef)

In [18]:
df_texto = df_texto.assign(id=range(1, len(df_texto) + 1))
df_texto.sample(2)

Unnamed: 0,page_content,metadata,tokens,id
3,"Almost 100 years later, Callison-Burch (2009) ...",https://lilianweng.github.io/posts/2024-02-05-...,almost years later callison burch did an early...,4
8,MACE (Multi-Annotator Competence Estimation; H...,https://lilianweng.github.io/posts/2024-02-05-...,mace multi annotator competence estimation hov...,9


In [19]:
collection.upsert(
    documents=df_texto['tokens'].to_list(),
    metadatas=df_texto.to_dict(orient='records'),
    ids=df_texto['id'].astype(str).to_list()
    )