<a href="https://colab.research.google.com/github/pugapatricia/gestion-documentaria-para-pymes/blob/main/Etiquetado.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -qq PyPDF2 python-docx openpyxl python-pptx xlrd transformers office365-rest-python-client msal requests

In [None]:
import os
import io
import re
import json
from pathlib import Path
from PyPDF2 import PdfReader
import docx
import openpyxl
from pptx import Presentation
import xlrd
from transformers import pipeline
from office365.sharepoint.client_context import ClientContext
from office365.runtime.auth.user_credential import UserCredential
import os
import requests
import msal


In [None]:
# =========================
# Configuration
# =========================
CLIENT_ID = "e3f2393e-7348-47d1-9c64-8d8efe6a5e95"   # your new Client ID
AUTHORITY = "https://login.microsoftonline.com/consumers"
SCOPE = ["User.Read", "Files.ReadWrite"]

# =========================
# Authentication with Device Code Flow
# =========================
app = msal.PublicClientApplication(CLIENT_ID, authority=AUTHORITY)

flow = app.initiate_device_flow(scopes=SCOPE)
if "user_code" not in flow:
    raise Exception("No se pudo iniciar el device flow. Revisa tu configuración en Azure (cliente público).")

# The message contains the verification URL and the code the user must enter.
print(flow["message"])

# Waits for the user to complete the sign-in started above.
result = app.acquire_token_by_device_flow(flow)

if "access_token" not in result:
    raise Exception(f"Error autenticación: {result.get('error_description')}")

access_token = result["access_token"]

# =========================
# OneDrive call
# =========================
headers = {"Authorization": f"Bearer {access_token}"}
url = "https://graph.microsoft.com/v1.0/me/drive/root/children"
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors instead of printing an empty listing.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()
data = response.json()

print("Archivos en tu OneDrive:")
for item in data.get("value", []):
    print(f"- {item['name']} → {item['webUrl']}")

In [None]:
# Custom label list: the candidate labels handed to the zero-shot classifier.
etiquetas = [
    "Finanzas", "Contabilidad", "FacturasEmitidas", "FacturasRecibidas",
    "Nóminas", "Bancos", "RecursosHumanos", "Contratos", "CVsCandidatos",
    "Formación", "PolíticasInternas", "Legal", "Clientes", "Proveedores",
    "LicenciasPermisos", "Operaciones", "Proyectos", "Procesos", "Calidad"
]

# File extensions (without the leading dot) that the labelling loop will process.
ext_permitidas = {"pdf", "docx", "xlsx", "xls", "pptx", "txt", "csv"}

# ============================
# 3. Authentication (Device Code Flow)
# ============================
# Repeats the device-code sign-in; CLIENT_ID, AUTHORITY and SCOPE must already
# be defined by the configuration cell.
app = msal.PublicClientApplication(CLIENT_ID, authority=AUTHORITY)
flow = app.initiate_device_flow(scopes=SCOPE)
if "user_code" not in flow:
    raise Exception("No se pudo iniciar el device flow. Revisa configuración.")

print(flow["message"])  # 👈 prints a link and a code that you have to open and enter

result = app.acquire_token_by_device_flow(flow)
if "access_token" not in result:
    raise Exception(f"Error autenticación: {result.get('error_description')}")

access_token = result["access_token"]
# Bearer header reused by the Microsoft Graph requests below.
headers = {"Authorization": f"Bearer {access_token}"}

# ============================
# 4. Hugging Face classifier
# ============================
# NOTE(review): the candidate labels are Spanish while bart-large-mnli is an
# English NLI model — confirm the classification quality is acceptable.
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

def limpiar_texto(texto):
    """Return `texto` lower-cased, with whitespace runs collapsed to one space and trimmed."""
    return re.sub(r"\s+", " ", texto.lower()).strip()

# ============================
# 5. Lectores de documentos
# ============================
def _juntar_hasta(fragmentos, limite):
    """Join text fragments, one per line, stopping once `limite` characters are collected.

    `fragmentos` is consumed lazily, so nothing after the fragment that crosses
    the limit is extracted from the underlying document.
    """
    partes = []
    longitud = 0
    for fragmento in fragmentos:
        partes.append(fragmento)
        longitud += len(fragmento) + 1  # +1 accounts for the newline separator
        if longitud >= limite:
            break
    return "\n".join(partes)


def _fila_a_texto(fila):
    """Render one spreadsheet row as its non-empty cell values separated by spaces."""
    return " ".join(str(celda) for celda in fila if celda)


def leer_archivo(nombre, contenido, limite=2000):
    """Extract up to `limite` characters of cleaned text from a document held in memory.

    The format is chosen from the extension of `nombre` (pdf, docx, xlsx, xls,
    pptx, txt, csv). Everything is parsed from `contenido` directly; no
    temporary files are written.

    Args:
        nombre: File name; only its extension is used, plus the name in warnings.
        contenido: Raw file bytes.
        limite: Maximum number of characters to extract before cleaning.

    Returns:
        The extracted text after `limpiar_texto`, or "" when the extension is
        not supported or the file cannot be parsed (a warning is printed).
    """
    ext = nombre.split(".")[-1].lower()
    texto = ""
    try:
        if ext == "pdf":
            reader = PdfReader(io.BytesIO(contenido))
            # extract_text() is costly, so call it once per page.
            paginas = (page.extract_text() for page in reader.pages)
            texto = _juntar_hasta((pagina for pagina in paginas if pagina), limite)
        elif ext == "docx":
            doc = docx.Document(io.BytesIO(contenido))
            texto = _juntar_hasta(
                (p.text for p in doc.paragraphs if p.text.strip()), limite
            )
        elif ext == "xlsx":
            wb = openpyxl.load_workbook(io.BytesIO(contenido), data_only=True, read_only=True)
            try:
                filas = (
                    fila
                    for sheet in wb.worksheets
                    for fila in sheet.iter_rows(values_only=True)
                )
                texto = _juntar_hasta((_fila_a_texto(fila) for fila in filas), limite)
            finally:
                # Read-only workbooks keep their source open until closed explicitly.
                wb.close()
        elif ext == "xls":
            # xlrd parses straight from bytes; no temp.xls left on disk.
            wb = xlrd.open_workbook(file_contents=contenido)
            filas = (
                sheet.row_values(row_idx)
                for sheet in wb.sheets()
                for row_idx in range(sheet.nrows)
            )
            texto = _juntar_hasta((_fila_a_texto(fila) for fila in filas), limite)
        elif ext == "pptx":
            # python-pptx accepts a file-like object; no temp.pptx left on disk.
            prs = Presentation(io.BytesIO(contenido))
            formas = (shape for slide in prs.slides for shape in slide.shapes)
            texto = _juntar_hasta(
                (
                    shape.text
                    for shape in formas
                    if hasattr(shape, "text") and shape.text.strip()
                ),
                limite,
            )
        elif ext in ["txt", "csv"]:
            # utf-8-sig drops a leading BOM so it does not end up in the text.
            texto = contenido.decode("utf-8-sig", errors="ignore")[:limite]
    except Exception as e:
        # Deliberately best-effort: one unreadable file must not stop the batch.
        print(f"⚠️ Error leyendo {nombre}: {e}")
        return ""
    return limpiar_texto(texto[:limite])

# ============================
# 6. Función de etiquetado
# ============================
def etiquetar_texto(texto, umbral=0.3):
    """Return the candidate labels whose zero-shot score for `texto` exceeds `umbral`.

    Args:
        texto: Text to classify.
        umbral: Minimum score (exclusive) a label needs to be kept.

    Returns:
        List of label names in the order the classifier returned them; an empty
        list when `texto` is empty or only whitespace.
    """
    # The classifier cannot score an empty sequence, so skip the call entirely.
    if not texto or not texto.strip():
        return []
    res = classifier(texto, candidate_labels=etiquetas, multi_label=True)
    return [label for label, score in zip(res["labels"], res["scores"]) if score > umbral]

# ============================
# 7. Read files from OneDrive
# ============================
url = "https://graph.microsoft.com/v1.0/me/drive/root/children"

resultados = {}
pagina_url = url
while pagina_url:
    response = requests.get(pagina_url, headers=headers, timeout=30)
    response.raise_for_status()
    data = response.json()

    for item in data.get("value", []):
        nombre = item["name"]
        # Compare the real extension; a bare endswith("pdf") would also match
        # names such as "notaspdf" or folders.
        ext = nombre.rsplit(".", 1)[-1].lower() if "." in nombre else ""
        if ext not in ext_permitidas:
            continue
        # Folders and other non-file items carry no download URL.
        download_url = item.get("@microsoft.graph.downloadUrl")
        if not download_url:
            continue
        print(f"📂 Procesando {nombre}...")

        # Download the file
        descarga = requests.get(download_url, timeout=60)
        if not descarga.ok:
            # Do not feed an HTTP error page to the document parsers.
            resultados[nombre] = []
            print(f"⚠️ No se pudo descargar {nombre}: HTTP {descarga.status_code}")
            continue
        file_bytes = descarga.content

        # Extract text
        texto = leer_archivo(nombre, file_bytes)
        if texto:
            etiquetas_detectadas = etiquetar_texto(texto)
            resultados[nombre] = etiquetas_detectadas
            print(f"✅ {nombre} → {etiquetas_detectadas}")
        else:
            resultados[nombre] = []
            print(f"⚠️ No se pudo leer {nombre}")

    # Graph pages large folders; follow the continuation link until it is absent.
    pagina_url = data.get("@odata.nextLink")

# ============================
# 8. Save results
# ============================
with open("etiquetas_onedrive.json", "w", encoding="utf-8") as f:
    json.dump(resultados, f, ensure_ascii=False, indent=4)

print("📌 Resultados guardados en etiquetas_onedrive.json")

In [None]:
import json
import requests
from urllib.parse import quote

# ============================
# Configuration
# ============================
json_path = "etiquetas_onedrive.json"
headers = {"Authorization": f"Bearer {access_token}"}

# Target folder in OneDrive
# NOTE(review): the labelling step lists files from the drive root, while this
# lookup happens inside `carpeta_objetivo` — confirm the files are expected there.
carpeta_objetivo = "Etiquetados"

# ============================
# Load the labels JSON
# ============================
with open(json_path, "r", encoding="utf-8") as f:
    etiquetas_data = json.load(f)

# ============================
# Apply labels in OneDrive
# ============================
# The loop variable is deliberately NOT called `etiquetas`: that name holds the
# global candidate-label list used by the classifier and must not be overwritten.
for archivo, etiquetas_archivo in etiquetas_data.items():
    if not etiquetas_archivo:
        continue

    # Look up the file inside the target folder. The path is URL-encoded so
    # names containing characters such as '#', '?' or '%' resolve correctly.
    ruta = quote(f"{carpeta_objetivo}/{archivo}")
    url_search = f"https://graph.microsoft.com/v1.0/me/drive/root:/{ruta}"
    response = requests.get(url_search, headers=headers, timeout=30)

    if response.status_code != 200:
        print(f"⚠️ No se encontró {archivo} en OneDrive")
        continue

    file_id = response.json()["id"]

    # Store the labels as custom properties (SharePoint listItem)
    url_update = f"https://graph.microsoft.com/v1.0/me/drive/items/{file_id}/listItem/fields"
    payload = {"EtiquetasIA": ", ".join(etiquetas_archivo)}  # 👈 SharePoint column named "EtiquetasIA"
    response_update = requests.patch(
        url_update,
        headers={**headers, "Content-Type": "application/json"},
        json=payload,
        timeout=30,
    )

    if response_update.status_code in [200, 204]:
        print(f"✅ Etiquetas {etiquetas_archivo} aplicadas a {archivo}")
    else:
        print(f"⚠️ Error al etiquetar {archivo}: {response_update.text}")
