In [2]:
pip install pandas langchain langchain-community sentence-transformers chromadb transformers torch accelerate

Collecting pandas
  Using cached pandas-2.3.0-cp310-cp310-win_amd64.whl.metadata (19 kB)
Collecting langchain
  Using cached langchain-0.3.26-py3-none-any.whl.metadata (7.8 kB)
Collecting langchain-community
  Using cached langchain_community-0.3.26-py3-none-any.whl.metadata (2.9 kB)
Collecting sentence-transformers
  Using cached sentence_transformers-4.1.0-py3-none-any.whl.metadata (13 kB)
Collecting chromadb
  Using cached chromadb-1.0.13-cp39-abi3-win_amd64.whl.metadata (7.1 kB)
Collecting transformers
  Using cached transformers-4.53.0-py3-none-any.whl.metadata (39 kB)
Collecting torch
  Downloading torch-2.7.1-cp310-cp310-win_amd64.whl.metadata (28 kB)
Collecting accelerate
  Using cached accelerate-1.8.1-py3-none-any.whl.metadata (19 kB)
Collecting numpy>=1.22.4 (from pandas)
  Downloading numpy-2.2.6-cp310-cp310-win_amd64.whl.metadata (60 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pa

In [6]:
pip install python-dotenv

Note: you may need to restart the kernel to use updated packages.


In [8]:
import pandas as pd
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain.chains import RetrievalQA
from langchain_community.llms import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from dotenv import load_dotenv
import os

In [9]:
# 1. Cargar CSV
df = pd.read_csv("data/processed/merged_df.csv")

load_dotenv()

hf_token = os.getenv("HUGGINGFACE_HUB_TOKEN")

# 2. Convertir cada fila en un documento
docs = [Document(page_content="\n".join([f"{col}: {val}" for col, val in row.items()])) for _, row in df.iterrows()]

# 3. Dividir documentos en chunks
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
split_docs = splitter.split_documents(docs)

# 4. Embeddings con modelo local
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# 5. Indexar con Chroma
vectordb = Chroma.from_documents(split_docs, embedding=embedding_model, persist_directory="./chroma_index")
vectordb.persist()

# 6. Cargar modelo LLM local (ej. Mistral)
model_name = "TheBloke/Mistral-7B-Instruct-v0.1-GGUF"

tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
model = AutoModelForCausalLM.from_pretrained(model_name, token=hf_token, device_map="auto", torch_dtype="auto")

# 7. Crear pipeline de generación
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    do_sample=True,
    temperature=0.7,
    top_k=50,
    top_p=0.95,
)

llm = HuggingFacePipeline(pipeline=pipe)

# 8. Crear cadena RAG
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=vectordb.as_retriever(),
    return_source_documents=True
)

Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


ValueError: Converting from SentencePiece and Tiktoken failed, if a converter for SentencePiece is available, provide a model path with a SentencePiece tokenizer.model file.Currently available slow->fast converters: ['AlbertTokenizer', 'BartTokenizer', 'BarthezTokenizer', 'BertTokenizer', 'BigBirdTokenizer', 'BlenderbotTokenizer', 'CamembertTokenizer', 'CLIPTokenizer', 'CodeGenTokenizer', 'ConvBertTokenizer', 'DebertaTokenizer', 'DebertaV2Tokenizer', 'DistilBertTokenizer', 'DPRReaderTokenizer', 'DPRQuestionEncoderTokenizer', 'DPRContextEncoderTokenizer', 'ElectraTokenizer', 'FNetTokenizer', 'FunnelTokenizer', 'GPT2Tokenizer', 'HerbertTokenizer', 'LayoutLMTokenizer', 'LayoutLMv2Tokenizer', 'LayoutLMv3Tokenizer', 'LayoutXLMTokenizer', 'LongformerTokenizer', 'LEDTokenizer', 'LxmertTokenizer', 'MarkupLMTokenizer', 'MBartTokenizer', 'MBart50Tokenizer', 'MPNetTokenizer', 'MobileBertTokenizer', 'MvpTokenizer', 'NllbTokenizer', 'OpenAIGPTTokenizer', 'PegasusTokenizer', 'Qwen2Tokenizer', 'RealmTokenizer', 'ReformerTokenizer', 'RemBertTokenizer', 'RetriBertTokenizer', 'RobertaTokenizer', 'RoFormerTokenizer', 'SeamlessM4TTokenizer', 'SqueezeBertTokenizer', 'T5Tokenizer', 'UdopTokenizer', 'WhisperTokenizer', 'XLMRobertaTokenizer', 'XLNetTokenizer', 'SplinterTokenizer', 'XGLMTokenizer', 'LlamaTokenizer', 'CodeLlamaTokenizer', 'GemmaTokenizer', 'Phi3Tokenizer']

In [None]:

# 9. Prueba de consulta
query = "¿Qué día el jugador alcanzó su mayor velocidad pico?"
respuesta = qa_chain.invoke(query)

print("Respuesta:\n", respuesta["result"])