In [None]:
!pip install -q pinecone openai PyMuPDF
!pip install -q sentence-transformers
!pip install -q pdfminer.six
!pip install -q transformers accelerate

In [None]:
import os
import time
from getpass import getpass
from pathlib import Path
from typing import List

import fitz  # PyMuPDF
import pinecone
from google.colab import files
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from openai import OpenAI
from pdfminer.high_level import extract_text
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [None]:
# Credentials are read from the environment (falling back to an interactive
# prompt) so that no secret is stored in the notebook or its history.
# SECURITY: the key that used to be hardcoded in this cell has been exposed and
# should be revoked and replaced in the Pinecone console.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY") or getpass("Pinecone API key: ")
PINECONE_ENV = os.environ.get("PINECONE_ENV", "gcp-starter")

In [None]:
# Sentence encoder used for both the CV chunks and the questions. Per the
# original notes it outputs 384-dimensional vectors, the same size as the
# previously tried 'all-MiniLM-L6-v2' model.
embedding_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")


def get_embedding(t):
    """Encode ``t`` with ``embedding_model`` and return the result as a plain list.

    The encoder output is cast to float32 before being converted with
    ``tolist()``.
    """
    encoded = embedding_model.encode(t)
    as_float32 = encoded.astype("float32")
    return as_float32.tolist()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
pc = Pinecone(api_key=PINECONE_API_KEY)

# Start from a clean slate: if the index already exists, delete it so that it
# is recreated below.
existing_index_names = pc.list_indexes().names()
if "cv-index" in existing_index_names:
    pc.delete_index("cv-index")

# dimension=384 matches the "dim=384" noted for the embedding model.
serverless_spec = ServerlessSpec(cloud='aws', region='us-east-1')
pc.create_index(name='cv-index', dimension=384, metric='cosine', spec=serverless_spec)

index = pc.Index("cv-index")

In [None]:
def extract_text_from_pdf(file_path: str) -> str:
    """Return the text that pdfminer.six's ``extract_text`` reads from ``file_path``."""
    pdf_text = extract_text(file_path)
    return pdf_text

def create_document(text: str, metadata=None) -> Document:
    """Wrap ``text`` in a ``Document``.

    A missing or empty ``metadata`` is replaced by a new empty dict.
    """
    if not metadata:
        metadata = {}
    return Document(page_content=text, metadata=metadata)

def chunk_text(document: Document, chunk_size=500, chunk_overlap=100) -> List[Document]:
    """Split ``document`` into chunks with a ``RecursiveCharacterTextSplitter``.

    ``chunk_size`` and ``chunk_overlap`` are forwarded to the splitter
    unchanged. Returns whatever ``split_documents`` produces for the single
    input document.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    return RecursiveCharacterTextSplitter(**splitter_options).split_documents([document])

# --- MULTI-PDF ---

def chunk_pdfs(
    pdf_paths: List[str],
    chunk_size: int = 1000,
    chunk_overlap: int = 200
) -> List[Document]:
    """Process several PDFs and return one flat list of chunks (Documents).

    Each PDF is read with ``extract_text_from_pdf``; files whose text is empty
    or whitespace-only are reported with a printed warning and skipped. Every
    chunk carries the ``source`` path of its PDF and a ``chunk_id`` giving its
    position within that PDF.
    """
    flat_chunks: List[Document] = []
    for p in pdf_paths:
        raw_text = extract_text_from_pdf(p) or ""
        if raw_text.strip():
            source_doc = create_document(raw_text, metadata={"source": str(p)})
            pieces = chunk_text(source_doc, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
            # Number the chunks within their own PDF, starting at 0.
            for position in range(len(pieces)):
                pieces[position].metadata["chunk_id"] = position
            flat_chunks += pieces
        else:
            print(f"⚠️ Vacío o ilegible: {p}")
    return flat_chunks

def chunk_pdfs_in_dir(
    folder: str,
    pattern: str = "*.pdf",
    **kwargs
) -> List[Document]:
    """Find the PDFs in ``folder`` that match ``pattern`` and chunk all of them.

    Args:
        folder: Directory to search.
        pattern: Glob pattern passed to ``Path.glob``; use ``'**/*.pdf'`` to
            search recursively.
        **kwargs: Forwarded unchanged to ``chunk_pdfs``.

    Returns:
        The flat list of chunks returned by ``chunk_pdfs``.
    """
    # Sorted so the chunk order does not depend on the order in which the
    # filesystem lists the files. The local name also avoids shadowing the
    # `files` module imported from google.colab.
    pdf_paths = sorted(str(p) for p in Path(folder).glob(pattern))
    return chunk_pdfs(pdf_paths, **kwargs)

# --- USAGE ---

# Option A: explicit list of files
# pdfs = ["cv1.pdf", "cv2.pdf", "cv3.pdf"]
# chunks = chunk_pdfs(pdfs, chunk_size=1000, chunk_overlap=200)

# Option B: a whole folder (non-recursive)
chunks = chunk_pdfs_in_dir("./cvs", pattern="*.pdf", chunk_size=200, chunk_overlap=50)

# Option C: a folder, searched recursively
# chunks = chunk_pdfs_in_dir("./cvs", pattern="**/*.pdf", chunk_size=1000, chunk_overlap=200)

print(f"Total de chunks: {len(chunks)}")
if not chunks:
    # Stop here with a clear message instead of an opaque IndexError on
    # chunks[0] when the folder is missing, empty, or has no readable PDF.
    raise ValueError("No chunks were produced: check that ./cvs contains readable PDF files.")
print(chunks[0].metadata, chunks[0].page_content[:120], "...")


Total de chunks: 9
{'source': 'cvs/cv.pdf', 'chunk_id': 0} Personal Information

ra.feichu@gmail.com

rfeichubuinm@itba.edu.ar

Capital Federal, Argentina

Ramiro Feichubuinm

Exp ...


In [None]:
# Preview only the first few chunks: displaying the whole list bloats the saved
# notebook and writes all the text extracted from the CVs into its output.
chunks[:3]

[Document(metadata={'source': 'cvs/cv.pdf', 'chunk_id': 0}, page_content='Personal Information\n\nra.feichu@gmail.com\n\nrfeichubuinm@itba.edu.ar\n\nCapital Federal, Argentina\n\nRamiro Feichubuinm\n\nExperience'),
 Document(metadata={'source': 'cvs/cv.pdf', 'chunk_id': 1}, page_content='Ramiro Feichubuinm\n\nExperience\n\n Transactional Engineer & API Developer\nRed Link S.A.| August 2021 - Present\nContact: redlink.com.ar'),
 Document(metadata={'source': 'cvs/cv.pdf', 'chunk_id': 2}, page_content='API Design & Development in C++\nHPE NonStop Systems\nProduction Support\nPython Scripting\n\nMachine Learning Engineer & Python Developer\nZoomAgri | 2020 - August 2021\nContact: zoomagri.com'),
 Document(metadata={'source': 'cvs/cv.pdf', 'chunk_id': 3}, page_content='Machine/Deep Learning Models\nImage Processing\nExploratory Data Analysis\nPython Scripting\n\nJavascript Developer\nFreelance Development | 2018 - 2020\n\nVanilla Javascript'),
 Document(metadata={'source': 'cvs/cv.pdf', 'ch

In [None]:
# Number of vectors sent per upsert request, so that a large corpus is not
# pushed to Pinecone in one oversized call.
UPSERT_BATCH_SIZE = 100
# Maximum time to wait for the upserted vectors to show up in the index stats.
UPSERT_VISIBILITY_TIMEOUT_S = 60

# One record per chunk: id "<pdf stem>-<chunk_id>", the chunk embedding, and the
# chunk metadata plus its text (the text is what the chatbot later reads back).
# NOTE(review): the id uses only the file stem, so two PDFs with the same name
# in different folders (possible with the "**/*.pdf" pattern) would overwrite
# each other - confirm whether that case matters.
vectors = [
    {
        "id": f"{Path(chunk.metadata['source']).stem}-{chunk.metadata['chunk_id']}",
        "values": get_embedding(chunk.page_content),
        "metadata": {
            **chunk.metadata,          # source, chunk_id, etc.
            "text": chunk.page_content # <- key used by the chatbot
        }
    }
    for chunk in chunks
]

for batch_start in range(0, len(vectors), UPSERT_BATCH_SIZE):
    index.upsert(vectors=vectors[batch_start:batch_start + UPSERT_BATCH_SIZE])

# Upserted vectors may not be visible immediately (reading the stats right
# after the upsert reported total_vector_count == 0), so poll the stats until
# the expected count appears or the timeout expires.
visibility_deadline = time.monotonic() + UPSERT_VISIBILITY_TIMEOUT_S
index_stats = index.describe_index_stats()
while index_stats.total_vector_count < len(vectors) and time.monotonic() < visibility_deadline:
    time.sleep(1)
    index_stats = index.describe_index_stats()
print(index_stats)

{'dimension': 384,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {},
 'total_vector_count': 0,
 'vector_type': 'dense'}


In [None]:
# Safety check: wait until Pinecone reports the index as ready, but give up
# after a bounded time instead of looping forever if it never becomes ready.
INDEX_READY_TIMEOUT_S = 120
index_ready_deadline = time.monotonic() + INDEX_READY_TIMEOUT_S
while True:
    d = pc.describe_index("cv-index")
    if getattr(d, "status", {}).get("ready"):
        break
    if time.monotonic() >= index_ready_deadline:
        raise TimeoutError(
            f"Pinecone index 'cv-index' was not ready after {INDEX_READY_TIMEOUT_S} seconds"
        )
    time.sleep(1)

In [None]:
# Embed the question with the same model used for the chunks and retrieve the
# five nearest chunks, including their stored metadata.
pregunta = "What languages can the candidate speak?"
embedding_pregunta = get_embedding(pregunta)

res = index.query(vector=embedding_pregunta, top_k=5, include_metadata=True, namespace="")

matches = res.get('matches', [])
if not matches:
    print("⚠️ No se encontraron resultados")
else:
    best_match = matches[0]
    best_metadata = best_match['metadata']
    print("CV más cercano:", best_match['id'])
    # The "Texto" line used to print the source path; show the source under its
    # own label and the retrieved chunk text under "Texto".
    print("Fuente:", best_metadata['source'])
    print("Texto:", best_metadata.get('text', ''))

CV más cercano: cv-7
Texto: cvs/cv.pdf


In [None]:
# Display the raw matches returned by the query above for manual inspection.
matches

[{'id': 'cv-7',
  'metadata': {'chunk_id': 7.0,
               'source': 'cvs/cv.pdf',
               'text': 'Escuela Superior de Comercio Carlos Pellegrini (ESCCP)\n'
                       'High School Degree with Accounting Specialization | '
                       '2008 - 2012\n'
                       '\n'
                       'Languages\n'
                       'Spanish\n'
                       'Native.\n'
                       '\n'
                       'English: \n'
                       'Proficient.\n'
                       '\n'
                       'Skills'},
  'score': 0.271029711,
  'values': []},
 {'id': 'cv-0',
  'metadata': {'chunk_id': 0.0,
               'source': 'cvs/cv.pdf',
               'text': 'Personal Information\n'
                       '\n'
                       'ra.feichu@gmail.com\n'
                       '\n'
                       'rfeichubuinm@itba.edu.ar\n'
                       '\n'
                       'Capital Federal, Argentina\n'


In [None]:
# Load the tokenizer and the seq2seq model from the same checkpoint.
flan_t5_checkpoint = "google/flan-t5-large"
tokenizer = AutoTokenizer.from_pretrained(flan_t5_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(flan_t5_checkpoint)


In [None]:
# Maximum number of tokens passed to the model for the whole prompt.
MAX_INPUT_TOKENS = 1024
# Slack for small differences between tokenizing the context on its own and
# tokenizing it embedded in the full prompt.
CONTEXT_TOKEN_MARGIN = 8

PROMPT_TEMPLATE = """
You are an assistant that must answer ONLY using the CONTEXT provided.
Never invent information. If the answer is not explicitly in the context, reply exactly:
"Not enough information".

Now use ONLY the following CONTEXT and QUESTION.

CONTEXT:
{context}

QUESTION: {question}

ANSWER:
"""

# Collect the non-empty chunk texts. `matches` is accepted either as a dict
# with a 'matches' key or as the list of matches itself.
context_chunks = []
for m in (matches['matches'] if isinstance(matches, dict) and 'matches' in matches else matches):
    meta = m.get("metadata", {}) or {}
    txt = (meta.get("text") or "").strip()
    if txt:
        context_chunks.append(txt)

if not context_chunks:
    raise ValueError("No text found in metadata['text'] from matches.")

context = "\n\n".join(context_chunks)[:3500]

# The character cap above does not bound the token count, and by default the
# tokenizer truncates from the right: an over-long context would push the
# QUESTION and the ANSWER cue out of the model input. Budget the context in
# tokens so that only the context is ever shortened.
prompt_overhead = len(
    tokenizer(PROMPT_TEMPLATE.format(context="", question=pregunta).strip())["input_ids"]
)
context_budget = MAX_INPUT_TOKENS - prompt_overhead - CONTEXT_TOKEN_MARGIN
if context_budget <= 0:
    raise ValueError("The question leaves no room for context within the model input limit.")

# Ask for one token more than the budget: receiving it means the context is too long.
context_ids = tokenizer(
    context,
    add_special_tokens=False,
    truncation=True,
    max_length=context_budget + 1,
)["input_ids"]
if len(context_ids) > context_budget:
    context = tokenizer.decode(context_ids[:context_budget], skip_special_tokens=True)

prompt = PROMPT_TEMPLATE.format(context=context, question=pregunta).strip()

# truncation/max_length stay as a final safety net on the assembled prompt.
inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=MAX_INPUT_TOKENS)
outputs = model.generate(
    **inputs,
    max_new_tokens=200,
    do_sample=False,  # deterministic decoding (beam search, no sampling)
    num_beams=4,
    early_stopping=True
)
answer = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

print("\n=== Chatbot answer ===\n", answer)


=== Chatbot answer ===
 Spanish Native. English
