In [1]:
import pdfplumber
import pandas as pd
import re
import json
from rapidfuzz import process, fuzz

In [2]:
# cargar diccionario maestro
df_raw = pd.read_csv('../data/diccionario_maestro.csv', sep=',', encoding='utf-8')
df_maestro = df_raw.copy()

In [None]:
def buscar_en_diccionario(texto_pdf):
    texto_pdf = str(texto_pdf).strip()
    if not texto_pdf or texto_pdf.lower() in ['none', 'nan', '']:
        return None, "SIN_DESCRIPCION", 0
    match = process.extractOne(
        texto_pdf, 
        df_maestro['nombre'], 
        scorer=fuzz.token_sort_ratio
    )
    
    if match and match[1] > 70:
        idx = match[2]
        return int(df_maestro.iloc[idx]['id']), df_maestro.iloc[idx]['nombre'], int(match[1])
    
    return None, "NUEVA_PIEZA", 0

In [None]:
def procesar_presupuesto(path_pdf, modo="inspeccion", taller_nombre="TREOS", observaciones="", demoras=None):
    config = CONFIG_TALLERES.get(taller_nombre)
    if not config:
        raise ValueError(f"No hay config para {taller_nombre}")
    
    with pdfplumber.open(path_pdf) as pdf:
        table = pdf.pages[0].extract_table()
        if not table: return []
        df_raw = pd.DataFrame(table[1:], columns=table[0])
    
    df_raw.columns = [str(c).replace('\n', '') if c is not None else f"col_{i}" for i, c in enumerate(df_raw.columns)]
    df_raw = df_raw.rename(columns=config["col_map"])
    
    def clean_num(x):
        if x is None or str(x).strip() == "" or str(x).lower() == 'none': return 0.0
        s = str(x).replace(config['thousands'], '').replace(config['decimal'], '.')
        s = "".join(c for c in s if c.isdigit() or c == '.')
        try: return float(s)
        except: return 0.0

    items_procesados = []
    
    for _, row in df_raw.iterrows():
        desc_raw = str(row.get('desc_original', '')).replace('\n', ' ').strip()
        
        if desc_raw == "" or any(x in desc_raw.upper() for x in ["TOTAL", "IVA", "SUBTOTAL"]):
            continue
            
        id_maestro, nombre_maestro, score = buscar_en_diccionario(desc_raw)
        precio_val = clean_num(row.get('precio', 0))
        cant_val = clean_num(row.get('cantidad', 0))
        
        if modo == "inspeccion":
            # Si el PDF es de Treos, el campo 'tipo' ayuda mucho
            tipo_pdf = str(row.get('tipo', '')).upper()
            es_repuesto = "REPUESTO" in tipo_pdf
            
            item = {
                "id_plataforma": id_maestro,
                "descripcion_maestra": nombre_maestro,
                "descripcion_taller": desc_raw,
                "tipo_item": "REPU" if es_repuesto else "MO",
                "sustitucion": es_repuesto,
                "horas_chapa": cant_val if "CHAPA" in desc_raw.upper() else 0,
                "panos_pintura": cant_val if "PINTURA" in desc_raw.upper() else 0,
                "costo": 0.0 if es_repuesto else precio_val,
                "confianza_match": f"{score}%"
            }
        else: # MODO COTIZACION
            item = {
                "id_plataforma": id_maestro,
                "descripcion_maestra": nombre_maestro,
                "oem": row.get('oem', 'N/A'),
                "precio_original": precio_val,
                "demora_original": demoras.get('original', 0) if demoras else 0,
                "costo": precio_val,
                "observaciones": observaciones
            }
        items_procesados.append(item)
    
    return items_procesados

In [None]:
# Simular inputs del usuario
MODO_ACTUAL = "inspeccion"
TALLER_ACTUAL = "TREOS"
MIS_DEMORAS = {"original": 5, "homologado": 10}
CONFIG_TALLERES = {
    "TREOS": {
        "col_map": {"TIPO": "tipo", "DESCRIPCION": "desc_original", "CANT.": "cantidad", "PRECIO": "precio"},
        "decimal": ",", "thousands": "."
    },
    "WAGEN": {
        "col_map": {"C贸digo": "oem", "Descripci贸n": "desc_original", "Cant.": "cantidad", "Precio ($)": "precio"},
        "decimal": ".", "thousands": ","
    }
}

resultado = procesar_presupuesto(
    "../data/presupuesto_A.pdf", 
    modo=MODO_ACTUAL, 
    taller_nombre=TALLER_ACTUAL,
    demoras=MIS_DEMORAS
)

print(json.dumps(resultado, indent=2, ensure_ascii=False))

pd.DataFrame(resultado)

[
  {
    "id_plataforma": null,
    "descripcion_maestra": "NUEVA_PIEZA",
    "descripcion_taller": "CHAPA CARGO C.SEG",
    "tipo_item": "MO",
    "sustitucion": false,
    "horas_chapa": 400.0,
    "panos_pintura": 0,
    "costo": 15454545.0,
    "confianza_match": "0%"
  },
  {
    "id_plataforma": null,
    "descripcion_maestra": "NUEVA_PIEZA",
    "descripcion_taller": "PINTURA CARGO C.SEG",
    "tipo_item": "MO",
    "sustitucion": false,
    "horas_chapa": 0,
    "panos_pintura": 800.0,
    "costo": 15867769.0,
    "confianza_match": "0%"
  },
  {
    "id_plataforma": null,
    "descripcion_maestra": "NUEVA_PIEZA",
    "descripcion_taller": "GUARDABARRO DEL.DER.",
    "tipo_item": "REPU",
    "sustitucion": true,
    "horas_chapa": 0,
    "panos_pintura": 0,
    "costo": 0.0,
    "confianza_match": "0%"
  },
  {
    "id_plataforma": null,
    "descripcion_maestra": "NUEVA_PIEZA",
    "descripcion_taller": "UNIT HEADLAMP W GAS",
    "tipo_item": "REPU",
    "sustitucion": true,


Unnamed: 0,id_plataforma,descripcion_maestra,descripcion_taller,tipo_item,sustitucion,horas_chapa,panos_pintura,costo,confianza_match
0,,NUEVA_PIEZA,CHAPA CARGO C.SEG,MO,False,400.0,0.0,15454545.0,0%
1,,NUEVA_PIEZA,PINTURA CARGO C.SEG,MO,False,0.0,800.0,15867769.0,0%
2,,NUEVA_PIEZA,GUARDABARRO DEL.DER.,REPU,True,0.0,0.0,0.0,0%
3,,NUEVA_PIEZA,UNIT HEADLAMP W GAS,REPU,True,0.0,0.0,0.0,0%
4,,NUEVA_PIEZA,PUERTA DELANTERA,REPU,True,0.0,0.0,0.0,0%
5,,NUEVA_PIEZA,ESPEJO DERECHO,REPU,True,0.0,0.0,0.0,0%
6,,NUEVA_PIEZA,MOLDURA MARCO VENTANA,REPU,True,0.0,0.0,0.0,0%
7,,SIN_DESCRIPCION,,MO,False,0.0,0.0,0.0,0%
8,,SIN_DESCRIPCION,,MO,False,0.0,0.0,0.0,0%
9,,SIN_DESCRIPCION,,MO,False,0.0,0.0,0.0,0%


In [None]:
# Simular inputs del usuario
MODO_ACTUAL = "cotizacion"
TALLER_ACTUAL = "TREOS"
MIS_DEMORAS = {"original": 5, "homologado": 10}
CONFIG_TALLERES = {
    "TREOS": {
        "col_map": {"TIPO": "tipo", "DESCRIPCION": "desc_original", "CANT.": "cantidad", "PRECIO": "precio"},
        "decimal": ",", "thousands": "."
    },
    "WAGEN": {
        "col_map": {"C贸digo": "oem", "Descripci贸n": "desc_original", "Cant.": "cantidad", "Precio ($)": "precio"},
        "decimal": ".", "thousands": ","
    }
}

resultado = procesar_presupuesto(
    "../data/presupuesto_A.pdf", 
    modo=MODO_ACTUAL, 
    taller_nombre=TALLER_ACTUAL,
    demoras=MIS_DEMORAS
)

print(json.dumps(resultado, indent=2, ensure_ascii=False))

pd.DataFrame(resultado)

[
  {
    "id_plataforma": null,
    "descripcion_maestra": "NUEVA_PIEZA",
    "oem": "N/A",
    "precio_original": 15454545.0,
    "demora_original": 5,
    "costo": 15454545.0,
    "observaciones": ""
  },
  {
    "id_plataforma": null,
    "descripcion_maestra": "NUEVA_PIEZA",
    "oem": "N/A",
    "precio_original": 15867769.0,
    "demora_original": 5,
    "costo": 15867769.0,
    "observaciones": ""
  },
  {
    "id_plataforma": null,
    "descripcion_maestra": "NUEVA_PIEZA",
    "oem": "N/A",
    "precio_original": 36348701.0,
    "demora_original": 5,
    "costo": 36348701.0,
    "observaciones": ""
  },
  {
    "id_plataforma": null,
    "descripcion_maestra": "NUEVA_PIEZA",
    "oem": "N/A",
    "precio_original": 233355822.0,
    "demora_original": 5,
    "costo": 233355822.0,
    "observaciones": ""
  },
  {
    "id_plataforma": null,
    "descripcion_maestra": "NUEVA_PIEZA",
    "oem": "N/A",
    "precio_original": 127736651.0,
    "demora_original": 5,
    "costo": 127736

Unnamed: 0,id_plataforma,descripcion_maestra,oem,precio_original,demora_original,costo,observaciones
0,,NUEVA_PIEZA,,15454545.0,5,15454545.0,
1,,NUEVA_PIEZA,,15867769.0,5,15867769.0,
2,,NUEVA_PIEZA,,36348701.0,5,36348701.0,
3,,NUEVA_PIEZA,,233355822.0,5,233355822.0,
4,,NUEVA_PIEZA,,127736651.0,5,127736651.0,
5,,NUEVA_PIEZA,,81953768.0,5,81953768.0,
6,,NUEVA_PIEZA,,5736545.0,5,5736545.0,
7,,SIN_DESCRIPCION,,0.0,5,0.0,
8,,SIN_DESCRIPCION,,0.0,5,0.0,
9,,SIN_DESCRIPCION,,0.0,5,0.0,
