[![Abrir en Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/pugapatricia/gestion-documentaria-para-pymes/blob/main/etiquetado/Etiquetado.ipynb)

[![Ver en GitHub](https://img.shields.io/badge/GitHub-Repo-black?logo=github)](https://github.com/pugapatricia/gestion-documentaria-para-pymes/tree/main/etiquetado)

In [None]:
!pip install -qq requests pdfplumber python-docx openai scikit-learn sentence-transformers numpy

In [None]:
import os
from io import BytesIO

import msal
import numpy as np
import openai
import pdfplumber
import requests
from docx import Document
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Configuración

In [None]:
# --- Local scratch directory -------------------------------------------
# Created eagerly so it exists before any later cell runs.
# NOTE(review): no other cell visible in this notebook reads this folder.
local_temp_folder = "temp_download"
os.makedirs(local_temp_folder, exist_ok=True)

# --- OneDrive folder whose files are analysed ---------------------------
folder_name = "Etiquetados"

# --- Microsoft identity settings ----------------------------------------
# Client ID handed to msal.PublicClientApplication (replace with your own).
CLIENT_ID = "e3f2393e-7348-47d1-9c64-8d8efe6a5e95"
# Authority URL used for sign-in (the "consumers" endpoint).
AUTHORITY = "https://login.microsoftonline.com/consumers"
# Permissions requested when the device flow is started.
SCOPE = ["User.Read", "Files.ReadWrite"]

# Autenticación con Device Code Flow

In [None]:
# Sign in with the device code flow: MSAL returns a message telling the user
# which URL to open and which code to enter, then the token is acquired once
# the user has completed that step.
app = msal.PublicClientApplication(CLIENT_ID, authority=AUTHORITY)

flow = app.initiate_device_flow(scopes=SCOPE)
if "user_code" not in flow:
    # Without a user code there is nothing to show the user; stop early.
    raise RuntimeError("No se pudo iniciar el device flow. Revisa tu configuración en Azure.")

print(flow["message"])
result = app.acquire_token_by_device_flow(flow)

if "access_token" not in result:
    raise RuntimeError(f"Error autenticación: {result.get('error_description')}")

access_token = result["access_token"]

# =========================
# Headers for Graph API calls (reused by the later cells)
# =========================
headers = {"Authorization": f"Bearer {access_token}"}

# Verify the authenticated user. The status is checked before the success
# message is printed so that a rejected token is reported as an HTTP error
# instead of "authenticated: None"; the timeout keeps the cell from hanging.
me_response = requests.get(
    "https://graph.microsoft.com/v1.0/me",
    headers=headers,
    timeout=30,
)
me_response.raise_for_status()
me = me_response.json()
print("✅ Usuario autenticado:", me.get("userPrincipalName"))


# Llamada a OneDrive

In [None]:

# List the files (not sub-folders) inside the configured OneDrive folder.
# The folder comes from `folder_name` and the auth header from `headers`, so
# this cell stays in sync with the download cell further down.
url = f"https://graph.microsoft.com/v1.0/me/drive/root:/{folder_name}:/children"

# Graph returns children in pages; follow "@odata.nextLink" until it is
# absent so that large folders are listed completely.
items = []
next_url = url
while next_url:
    resp = requests.get(next_url, headers=headers, timeout=30)
    resp.raise_for_status()  # surface HTTP errors instead of printing an empty list
    data = resp.json()
    items.extend(data.get("value", []))
    next_url = data.get("@odata.nextLink")

print(f"\nArchivos en la carpeta {folder_name}:")
for item in items:
    if "folder" not in item:  # entries carrying a "folder" key are directories
        print("-", item.get("name"), "→", item.get("webUrl", "sin URL"))

In [None]:
# Embedding model used to compare document texts (the author notes that either
# OpenAI or sentence-transformers could be used here). Per the original note,
# this checkpoint was chosen as fast and efficient for text similarity.
embedding_model_name = 'all-MiniLM-L6-v2'
model = SentenceTransformer(embedding_model_name)

# Funciones de extracción de texto

In [None]:
def extract_text(file_name, file_bytes):
    """Return the plain text of a PDF or DOCX file held in memory.

    The format is chosen from the file-name extension, compared
    case-insensitively.

    Args:
        file_name: Name of the file; only its extension is inspected.
        file_bytes: Raw content of the file.

    Returns:
        For ``.pdf``: the text of every page joined with newlines (a page
        with no extractable text contributes an empty string).
        For ``.docx``: the text of every paragraph joined with newlines.
        For any other extension: ``None``.
    """
    lowered_name = file_name.lower()

    if lowered_name.endswith(".pdf"):
        with pdfplumber.open(BytesIO(file_bytes)) as pdf:
            # Pages are read while the PDF is still open.
            page_texts = [page.extract_text() or "" for page in pdf.pages]
            return "\n".join(page_texts)

    if lowered_name.endswith(".docx"):
        paragraphs = Document(BytesIO(file_bytes)).paragraphs
        return "\n".join(paragraph.text for paragraph in paragraphs)

    # Unsupported file type.
    return None

# Descargar archivos y extraer texto

In [None]:
# Download every supported file in the OneDrive folder and keep its
# normalised text, keyed by file name.
archivos_texto = {}
url = f"https://graph.microsoft.com/v1.0/me/drive/root:/{folder_name}:/children"

# Graph returns children in pages; follow "@odata.nextLink" until it is
# absent so that no file is missed in large folders.
while url:
    resp = requests.get(url, headers=headers, timeout=30)
    resp.raise_for_status()
    data = resp.json()

    for item in data.get("value", []):
        if "folder" in item:
            continue
        file_name = item["name"]
        if file_name.startswith("."):
            continue  # hidden files

        # Entries returned without a download URL cannot be fetched; skip
        # them instead of failing the whole run with a KeyError.
        download_url = item.get("@microsoft.graph.downloadUrl")
        if not download_url:
            continue

        file_resp = requests.get(download_url, timeout=60)
        file_resp.raise_for_status()  # do not hand an error page to the parsers
        file_bytes = file_resp.content

        texto = extract_text(file_name, file_bytes)
        if not texto:
            continue  # unsupported type or no text at all

        # Normalise BEFORE deciding whether there is content: a multi-page PDF
        # with no extractable text yields only newlines, which is truthy but
        # collapses to "" here. Storing "" would make every such file look
        # like a duplicate of the others.
        texto = texto.lower().replace("\n", " ").strip()
        if texto:
            archivos_texto[file_name] = texto

    url = data.get("@odata.nextLink")

# Calcular embeddings y similitud

In [None]:
# Embed every document and flag pairs whose cosine similarity exceeds the
# threshold as duplicates.
SIMILARITY_THRESHOLD = 0.95  # pairs strictly above this value are reported

nombres = list(archivos_texto)
textos = [archivos_texto[n] for n in nombres]
embeddings = model.encode(textos)

duplicados = set()
# With fewer than two documents there is no pair to compare, and the
# similarity call is skipped entirely for that case.
if len(nombres) > 1:
    # One call computes the full similarity matrix instead of invoking
    # scikit-learn once per pair.
    sim_matrix = cosine_similarity(embeddings)
    # Indices of the strict upper triangle: every pair (i, j) with i < j once.
    rows, cols = np.triu_indices(len(nombres), k=1)
    duplicados = {
        (nombres[i], nombres[j])
        for i, j in zip(rows, cols)
        if sim_matrix[i, j] > SIMILARITY_THRESHOLD
    }

# Resultado final

In [None]:
# Print the number of detected duplicate pairs followed by each pair.
# `duplicados` is a set, whose iteration order is arbitrary; sorting makes the
# report identical from run to run.
print(f"⚠️ Total de duplicados detectados: {len(duplicados)}")
for a, b in sorted(duplicados):
    print(f"   - {a} ≈ {b}")