<a href="https://colab.research.google.com/github/rgbb/proyecto001/blob/testing_rg/script_mapeo_keywords.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# script_mapeo_keywords_colab_local.py

# Autor: Machine Learning & AI Solutions Team
# Descripción: Comparación de modelos multilingües para mapeo semántico de keywords con procesamiento incremental (versión local Colab)

# ================================
# 1. Install dependencies
# ================================
# The leading `!` is an IPython/Colab shell escape, so these two lines only
# work when executed inside a notebook cell (they are not valid plain Python).
!pip install transformers sentence-transformers tqdm --quiet
!pip install accelerate bitsandbytes --quiet  # For large models such as BGE or Granite

# ================================
# 2. Importaciones
# ================================
import os
import pandas as pd
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel
from torch.nn.functional import cosine_similarity, normalize
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

# ================================
# 3. Mount Google Drive
# ================================
from google.colab import drive

drive.mount('/content/drive')

# Persistent working folder on Google Drive; it is created here when missing.
carpeta_drive = '/content/drive/MyDrive/keywords'
os.makedirs(carpeta_drive, exist_ok=True)

# Input CSV, per-row results CSV and per-model summary CSV, all stored in the
# Drive folder above.
archivo_entrada, archivo_resultado, archivo_resumen = (
    os.path.join(carpeta_drive, nombre_csv)
    for nombre_csv in (
        'keywords.csv',
        'resultados_comparacion_final.csv',
        'resumen_modelos_final.csv',
    )
)

# ================================
# 4. Load source data
# ================================
columnas_keyword = ['keyword_get', 'keyword_let']

# Keep only rows where both keyword columns are present, then renumber rows.
df = pd.read_csv(archivo_entrada).dropna(subset=columnas_keyword).reset_index(drop=True)

# Force both keyword columns to text.
df[columnas_keyword] = df[columnas_keyword].astype(str)

# Distinct target keywords, in order of first appearance.
unique_keyword_let = df['keyword_let'].unique().tolist()

# ================================
# 5. Load models
# ================================
# Registry keyed by a short model alias.
#   - 'st_miniLM' holds an already-loaded SentenceTransformer instance.
#   - every other alias starts as a (hub model id, None) placeholder tuple.
model_infos = {
    'st_miniLM': SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'),
    **{
        alias: (hub_id, None)
        for alias, hub_id in (
            ('bge_m3', 'BAAI/bge-m3'),
            ('gte', 'Alibaba-NLP/gte-multilingual-base'),
            ('e5', 'intfloat/multilingual-e5-base'),
            ('granite', 'ibm-granite/granite-embedding-278m-multilingual'),
        )
    },
}

def load_transformer_model(model_name):
    """Load the tokenizer and the model registered under ``model_name``.

    Both objects are created with ``from_pretrained`` and
    ``trust_remote_code=True``, i.e. custom code shipped with the model
    repository is allowed to run.

    Args:
        model_name: model identifier passed unchanged to ``from_pretrained``.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    hf_kwargs = {'trust_remote_code': True}
    return (
        AutoTokenizer.from_pretrained(model_name, **hf_kwargs),
        AutoModel.from_pretrained(model_name, **hf_kwargs),
    )

# ================================
# 6. Compute reference embeddings
# ================================
# For every model, build one tensor whose row i is the embedding of
# unique_keyword_let[i].

# Number of keywords sent through a transformer model per forward pass.
# Encoding one keyword at a time made this step take tens of minutes per
# model; padded mini-batches give the same CLS vectors (up to floating-point
# noise) with far fewer forward passes.
EMBED_BATCH_SIZE = 32

reference_embeddings = {}

# SentenceTransformer encodes the whole list itself; the NumPy result is
# wrapped as a torch tensor so all entries share one type.
reference_embeddings['st_miniLM'] = torch.tensor(
    model_infos['st_miniLM'].encode(unique_keyword_let, convert_to_numpy=True)
)

for key in ['bge_m3', 'gte', 'e5', 'granite']:
    # Replace the (model id, None) placeholder with the loaded objects so the
    # comparison step can reuse them.
    tokenizer, model = load_transformer_model(model_infos[key][0])
    model_infos[key] = (tokenizer, model)

    embs = []
    for start in tqdm(range(0, len(unique_keyword_let), EMBED_BATCH_SIZE),
                      desc=f'Embeddings {key}'):
        batch = unique_keyword_let[start:start + EMBED_BATCH_SIZE]
        # padding=True pads to the longest keyword in the batch.
        # NOTE(review): taking position 0 below assumes the tokenizer pads on
        # the right, so the first token is never padding — confirm if a model
        # with a left-padding tokenizer is added.
        encoded = tokenizer(batch, return_tensors='pt', truncation=True, padding=True)
        with torch.no_grad():
            output = model(**encoded)
        # First-token hidden state of each sequence, L2-normalised per row.
        cls_embedding = output.last_hidden_state[:, 0]
        embs.append(normalize(cls_embedding, p=2, dim=1))

    reference_embeddings[key] = torch.cat(embs, dim=0)

# ================================
# 7. Resume if previous results exist
# ================================
hay_resultados_previos = os.path.exists(archivo_resultado)

# Start from the saved results file when present, otherwise from scratch.
df_resultado = pd.read_csv(archivo_resultado) if hay_resultados_previos else pd.DataFrame()
procesados = df_resultado['keyword_get'].tolist() if hay_resultados_previos else []

if hay_resultados_previos:
    print(f"🟡 Retomando desde registro {len(procesados)}...")

# Rows whose keyword_get is not yet present in the saved results.
ya_procesado = df['keyword_get'].isin(procesados)
df_pendientes = df[~ya_procesado].reset_index(drop=True)

# ================================
# 8. Función de comparación
# ================================
def comparar_modelos_full(text):
    """Find, for every model, the reference keyword most similar to ``text``.

    ``text`` is embedded with each entry of ``model_infos`` and scored by
    cosine similarity against that model's rows in ``reference_embeddings``;
    the highest-scoring position selects an entry of ``unique_keyword_let``.

    Args:
        text: the keyword to map.

    Returns:
        A pandas Series holding, per model alias, ``match_<alias>`` (the
        selected reference keyword) and ``sim_<alias>`` (its cosine
        similarity multiplied by 100).
    """
    salida = {}

    def _anotar(alias, puntajes):
        # Record the best-scoring reference keyword and its score for `alias`.
        mejor = torch.argmax(puntajes).item()
        salida[f'match_{alias}'] = unique_keyword_let[mejor]
        salida[f'sim_{alias}'] = puntajes[mejor].item() * 100

    # SentenceTransformer model: encode() returns the vector directly.
    vector_st = torch.tensor(model_infos['st_miniLM'].encode([text])[0])
    _anotar(
        'st_miniLM',
        cosine_similarity(vector_st.unsqueeze(0), reference_embeddings['st_miniLM']).squeeze(0),
    )

    # Plain transformer models: first-token hidden state, L2-normalised.
    for alias in ('bge_m3', 'gte', 'e5', 'granite'):
        tok, encoder = model_infos[alias]
        lote = tok(text, return_tensors='pt', truncation=True, padding=True)
        with torch.no_grad():
            estados = encoder(**lote).last_hidden_state
        vector = normalize(estados[:, 0], p=2, dim=1)
        _anotar(alias, cosine_similarity(vector, reference_embeddings[alias]).squeeze(0))

    return pd.Series(salida)

# ================================
# 9. Batch processing
# ================================
# Every pending row is compared against all models. Finished rows are
# buffered in a plain list and merged into df_resultado only at checkpoints:
# concatenating a DataFrame once per row copies the whole accumulated table
# each time, which is quadratic in the number of rows.
batch_size = 100
total = len(df_pendientes)
filas_nuevas = []  # rows computed since the last checkpoint

# df_pendientes was re-indexed from 0, so `i` is the row's position.
for i, row in df_pendientes.iterrows():
    text = row['keyword_get']
    result = comparar_modelos_full(text)
    filas_nuevas.append({
        'keyword_get': row['keyword_get'],
        'keyword_let': row['keyword_let'],
        **result.to_dict(),
    })

    # Checkpoint after every full batch and after the very last row, so a
    # runtime restart loses at most one batch of work.
    if (i + 1) % batch_size == 0 or (i + 1) == total:
        df_resultado = pd.concat(
            [df_resultado, pd.DataFrame(filas_nuevas)], ignore_index=True
        )
        filas_nuevas = []
        df_resultado.to_csv(archivo_resultado, index=False)
        print(f"✅ Guardado: {i+1} procesados de {total} | Pendientes: {total - (i+1)}")

# ================================
# 10. Build summary
# ================================
# One summary row per `match_*` column: how many predicted keywords equal the
# expected `keyword_let`, and that count as a percentage of all result rows.
columnas_match = [nombre for nombre in df_resultado.columns if nombre.startswith('match_')]

resumen = {}
for nombre in columnas_match:
    aciertos = df_resultado[nombre].eq(df_resultado['keyword_let']).sum()
    porcentaje = round(aciertos / len(df_resultado) * 100, 2)
    resumen[nombre] = {
        'Coincidencias': aciertos,
        'Porcentaje de aciertos (%)': porcentaje,
    }

tabla_resumen = pd.DataFrame.from_dict(resumen, orient='index')
tabla_resumen.to_csv(archivo_resumen)
print('🎉 Proceso finalizado y resumen guardado.')

Mounted at /content/drive


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.89k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/444 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.27G [00:00<?, ?B/s]


Embeddings bge_m3:   0%|          | 0/4195 [00:00<?, ?it/s][A
Embeddings bge_m3:   0%|          | 1/4195 [00:00<1:04:32,  1.08it/s][A
Embeddings bge_m3:   0%|          | 2/4195 [00:01<49:45,  1.40it/s]  [A
Embeddings bge_m3:   0%|          | 3/4195 [00:01<39:39,  1.76it/s][A
Embeddings bge_m3:   0%|          | 4/4195 [00:02<31:30,  2.22it/s][A
Embeddings bge_m3:   0%|          | 5/4195 [00:02<26:46,  2.61it/s][A
Embeddings bge_m3:   0%|          | 6/4195 [00:02<24:22,  2.86it/s][A
Embeddings bge_m3:   0%|          | 7/4195 [00:03<26:30,  2.63it/s][A
Embeddings bge_m3:   0%|          | 8/4195 [00:03<32:57,  2.12it/s][A
Embeddings bge_m3:   0%|          | 9/4195 [00:04<36:28,  1.91it/s][A
Embeddings bge_m3:   0%|          | 10/4195 [00:05<37:06,  1.88it/s][A
Embeddings bge_m3:   0%|          | 11/4195 [00:05<38:47,  1.80it/s][A
Embeddings bge_m3:   0%|          | 12/4195 [00:06<39:31,  1.76it/s][A
Embeddings bge_m3:   0%|          | 13/4195 [00:06<37:13,  1.87it/s][A
Embed

tokenizer_config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

configuration.py:   0%|          | 0.00/7.13k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Alibaba-NLP/new-impl:
- configuration.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling.py:   0%|          | 0.00/59.0k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Alibaba-NLP/new-impl:
- modeling.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/611M [00:00<?, ?B/s]

Some weights of the model checkpoint at Alibaba-NLP/gte-multilingual-base were not used when initializing NewModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Embeddings gte: 100%|██████████| 4195/4195 [07:48<00:00,  8.95it/s]


tokenizer_config.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Embeddings e5: 100%|██████████| 4195/4195 [06:02<00:00, 11.56it/s]


tokenizer_config.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/698 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/556M [00:00<?, ?B/s]

Embeddings granite: 100%|██████████| 4195/4195 [05:52<00:00, 11.88it/s]


✅ Guardado: 100 procesados de 53033 | Pendientes: 52933
✅ Guardado: 200 procesados de 53033 | Pendientes: 52833
✅ Guardado: 300 procesados de 53033 | Pendientes: 52733
✅ Guardado: 400 procesados de 53033 | Pendientes: 52633
✅ Guardado: 500 procesados de 53033 | Pendientes: 52533
✅ Guardado: 600 procesados de 53033 | Pendientes: 52433
✅ Guardado: 700 procesados de 53033 | Pendientes: 52333
✅ Guardado: 800 procesados de 53033 | Pendientes: 52233
✅ Guardado: 900 procesados de 53033 | Pendientes: 52133
✅ Guardado: 1000 procesados de 53033 | Pendientes: 52033
✅ Guardado: 1100 procesados de 53033 | Pendientes: 51933
✅ Guardado: 1200 procesados de 53033 | Pendientes: 51833
✅ Guardado: 1300 procesados de 53033 | Pendientes: 51733
✅ Guardado: 1400 procesados de 53033 | Pendientes: 51633
✅ Guardado: 1500 procesados de 53033 | Pendientes: 51533
✅ Guardado: 1600 procesados de 53033 | Pendientes: 51433
✅ Guardado: 1700 procesados de 53033 | Pendientes: 51333
✅ Guardado: 1800 procesados de 53033 | P