<a href="https://colab.research.google.com/github/sofiaanzolao/Parcial-3/blob/main/codigo_Parcial3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Parcial 3**

In [None]:
# ============================================
# IMPORTS BÁSICOS
# ============================================
!pip install -q langchain langchain-community sentence-transformers faiss-cpu pypdf transformers accelerate bitsandbytes

from sentence_transformers import SentenceTransformer
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

from pypdf import PdfReader
import os, textwrap, shutil, numpy as np, torch

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoModelForCausalLM,
    BitsAndBytesConfig
)

try:
    from google.colab import files
    COLAB = True
except ImportError:
    COLAB = False

# ============================================
# SUBIDA DE PDFs
# ============================================
# Inside Colab, open the upload widget so the user can provide the PDFs.
if COLAB:
    print("Sube tus PDFs...")
    uploaded = files.upload()

# Folder that holds every PDF used to build the corpus.
BASE_FOLDER = "/content/BASES_DE_DATOS"
os.makedirs(BASE_FOLDER, exist_ok=True)

# Move each PDF found directly under /content into the corpus folder;
# sub-directories and non-PDF files are left where they are.
for f in os.listdir("/content"):
    full = os.path.join("/content", f)
    if not os.path.isfile(full):
        continue
    if not f.lower().endswith(".pdf"):
        continue
    shutil.move(full, os.path.join(BASE_FOLDER, f))

print("PDFs cargados:", os.listdir(BASE_FOLDER))
# ============================================
# LECTURA Y CHUNKING
# ============================================

def load_pdfs_and_chunk(folder_path, chunk_size=500, chunk_overlap=100):
    """Read every PDF in a folder and split its text into overlapping chunks.

    Each page is extracted separately and cut into character windows of
    ``chunk_size`` characters, where consecutive windows share
    ``chunk_overlap`` characters. Chunks never span two pages.

    Args:
        folder_path: Directory to scan. Only files whose name ends in
            ``.pdf`` (case-insensitive) are read.
        chunk_size: Maximum number of characters per chunk. Must be > 0.
        chunk_overlap: Number of characters shared by consecutive chunks.
            Must be smaller than ``chunk_size``.

    Returns:
        A list of dicts with keys ``"text"`` (the chunk) and ``"source"``
        (``"<filename> - página <1-based page number>"``).

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``chunk_overlap``
            is not smaller than ``chunk_size`` (the window would never
            advance and the loop would not terminate).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if chunk_overlap >= chunk_size:
        raise ValueError("chunk_overlap must be smaller than chunk_size")

    step = chunk_size - chunk_overlap
    corpus = []

    # os.listdir order is arbitrary; sorting keeps the chunk order reproducible.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.lower().endswith(".pdf"):
            continue

        reader = PdfReader(os.path.join(folder_path, filename))

        for page_num, page in enumerate(reader.pages):
            try:
                page_text = page.extract_text() or ""
            except Exception as exc:
                # Best effort: one unreadable page must not abort the whole
                # corpus, but the failure is reported instead of hidden.
                print(
                    f"Aviso: no se pudo extraer texto de {filename} "
                    f"página {page_num + 1}: {exc}"
                )
                page_text = ""

            page_text = page_text.strip()
            if not page_text:
                continue

            source = f"{filename} - página {page_num + 1}"
            text_len = len(page_text)
            for start in range(0, text_len, step):
                end = start + chunk_size
                corpus.append({"text": page_text[start:end], "source": source})
                # Once a window reaches the end of the page, any further
                # window would only repeat text already inside this chunk.
                if end >= text_len:
                    break

    return corpus


# Build the chunked corpus from every PDF in the corpus folder.
corpus = load_pdfs_and_chunk(BASE_FOLDER)
print("Chunks generados:", len(corpus))

# Stop with an explicit message instead of an IndexError on corpus[0] when
# no text could be extracted (no PDFs uploaded, or image-only PDFs).
if not corpus:
    raise RuntimeError(
        "No se generó ningún chunk: verifica que haya PDFs con texto en "
        + BASE_FOLDER
    )

# Show the first chunk as a quick sanity check.
print(corpus[0])
# ============================================
# VECTOREAR Y CREAR VECTORSTORE
# ============================================

# Sentence-embedding model used to vectorize every chunk.
EMB_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
embeddings_lc = HuggingFaceEmbeddings(model_name=EMB_MODEL)

# Parallel lists: the chunk text and, per chunk, a metadata dict whose
# "title" entry carries the "<file> - página <n>" source label.
docs_texts = [entry["text"] for entry in corpus]
docs_meta = [{"title": entry["source"]} for entry in corpus]

# Embed all chunks and index them in FAISS.
vectorstore = FAISS.from_texts(docs_texts, embeddings_lc, metadatas=docs_meta)

# Persist the index to disk.
vectorstore.save_local("/content/faiss_index_creditos_agro")
print("Vectorstore creado correctamente.")
# ============================================
# RERANK
# ============================================
# Toggle for the reranking stage; ask() only calls rerank() when this is True.
USE_RERANK = True

if USE_RERANK:
    RERANK_MODEL = "mixedbread-ai/mxbai-rerank-base-v1"
    rr_tok = AutoTokenizer.from_pretrained(RERANK_MODEL)
    rr_model = AutoModelForSequenceClassification.from_pretrained(RERANK_MODEL)
    # Run on GPU when available; eval() puts the model in inference mode.
    rr_model = rr_model.to("cuda" if torch.cuda.is_available() else "cpu").eval()

    def rerank(query, candidates, top_n=5):
        """Reorder retrieved documents by the reranker's relevance score.

        Every candidate is scored jointly with the query by the
        sequence-classification model, and the highest-scoring ones are kept.

        Args:
            query: The user question.
            candidates: Retrieved documents; each must expose a
                ``page_content`` attribute holding its text.
            top_n: Maximum number of documents to return.

        Returns:
            Up to ``top_n`` items of ``candidates``, best score first.
            An empty list when ``candidates`` is empty.
        """
        # Nothing to score: skip the tokenizer/model call on an empty batch.
        if not candidates:
            return []

        passages = [c.page_content for c in candidates]
        enc = rr_tok(
            [query] * len(passages),
            passages,
            truncation=True,
            padding=True,
            return_tensors="pt"
        )
        # Tensors must live on the same device as the model.
        enc = {k: v.to(rr_model.device) for k, v in enc.items()}
        with torch.no_grad():
            scores = rr_model(**enc).logits.squeeze(-1).float().cpu().numpy()
        # argsort is ascending, so negate the scores to rank best-first.
        order = np.argsort(-scores)
        return [candidates[i] for i in order[:top_n]]
# ============================================
# LLM GENERATIVO
# ============================================

# Generative model that writes the final answer.
LLM_MODEL = "mistralai/Mistral-7B-Instruct-v0.3"

# Quantization settings: 4-bit NF4 weights with double quantization and
# bfloat16 as the compute dtype.
bnb_cfg = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

tok = AutoTokenizer.from_pretrained(LLM_MODEL)
# device_map="auto" delegates device placement of the weights to the library.
llm = AutoModelForCausalLM.from_pretrained(
    LLM_MODEL, quantization_config=bnb_cfg, device_map="auto"
)

print("LLM cargado.")
# ============================================
# FUNCIONES RAG
# ============================================
from textwrap import shorten

def retrieve(query, k=10, search_type="similarity", lambda_mult=0.1):
    """Fetch the ``k`` chunks most relevant to ``query`` from the vectorstore.

    Args:
        query: The user question.
        k: Number of documents to return.
        search_type: ``"mmr"`` selects Max Marginal Relevance search (results
            balanced between relevance and diversity). Any other value,
            including the default ``"similarity"``, runs a plain similarity
            search.
        lambda_mult: Forwarded to the MMR search, ignored otherwise. In
            LangChain's documented convention, values near 0 favor diversity
            and values near 1 favor relevance.

    Returns:
        The list of documents returned by the vectorstore.
    """
    if search_type == "mmr":
        return vectorstore.max_marginal_relevance_search(
            query, k=k, lambda_mult=lambda_mult
        )
    # "similarity" and any unrecognized value share the same fallback.
    return vectorstore.similarity_search(query, k=k)

def build_prompt(query, contexts):
    """Assemble the LLM prompt from the instructions, the context and the question.

    Args:
        query: The user question.
        contexts: Documents exposing ``page_content`` (text) and ``metadata``
            (a dict whose optional ``"title"`` entry is used as the citation).

    Returns:
        A single string: the fixed instructions, a ``Contexto:`` section with
        one numbered entry per document, and the question with the citation
        request.
    """
    instructions = (
        "Responde en español colombiano usando SOLO el contexto.\n"
        "Si no encuentras la respuesta en el contexto, dilo explícitamente.\n"
        "Sé claro y explica como si hablaras con un caficultor.\n\n"
    )
    # One numbered entry per document, numbered from 1 to match the [n] cites.
    context_entries = [
        f"[{n}] {doc.page_content}\n(Cita: {doc.metadata.get('title', '')})\n\n"
        for n, doc in enumerate(contexts, start=1)
    ]
    question = f"Pregunta: {query}\n\nCita las fuentes como [n] cuando corresponda."
    return "".join([instructions, "Contexto:\n", *context_entries, question])

def generate_answer(prompt, max_new_tokens=400):
    """Generate the model's answer to ``prompt`` with greedy decoding.

    Args:
        prompt: Full prompt text (instructions, context and question).
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        Only the newly generated text, with special tokens removed and
        surrounding whitespace stripped. The prompt is not echoed back.
    """
    inputs = tok(prompt, return_tensors="pt").to(llm.device)
    prompt_len = inputs["input_ids"].shape[1]

    # Greedy decoding (do_sample=False). No temperature is passed: it only
    # applies when sampling and otherwise just triggers a warning.
    # NOTE(review): the prompt is sent as raw text; the Instruct model may
    # expect its chat template to be applied - confirm.
    out = llm.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=False
    )

    # A causal LM returns prompt + continuation; keep only the continuation
    # so the answer does not repeat the whole context.
    new_tokens = out[0][prompt_len:]
    return tok.decode(new_tokens, skip_special_tokens=True).strip()

def ask(query, k=10, rerank_top=5):
    """Answer ``query`` end to end: retrieve, optionally rerank, then generate.

    Args:
        query: The user question.
        k: Number of chunks to retrieve from the vectorstore.
        rerank_top: Number of chunks kept after reranking (only used when
            ``USE_RERANK`` is true).

    Returns:
        The generated answer followed by a ``Fuentes:`` list of the chunks
        used, or a fixed message when nothing was retrieved.
    """
    hits = retrieve(query, k=k)
    if not hits:
        return "No encontré ninguna información relevante en los documentos."

    if USE_RERANK:
        hits = rerank(query, hits, top_n=rerank_top)

    respuesta = generate_answer(build_prompt(query, hits))

    # Numbered source list, each title cut to at most 90 characters.
    source_lines = []
    for n, hit in enumerate(hits, start=1):
        source_lines.append(f"[{n}] {shorten(hit.metadata['title'], 90)}")

    return respuesta + "\n\nFuentes:\n" + "\n".join(source_lines)
# Sample question used to exercise the whole pipeline.
query = "".join((
    "Soy pequeño caficultor en Pitalito y quiero comprar un dron de aspersión. ",
    "¿Qué líneas de crédito de inversión agropecuaria existen y qué plazos y tasas manejan?",
))

# Retrieve 10 chunks, keep the 5 best after reranking, print the answer.
print(ask(query, k=10, rerank_top=5))