In [2]:
from sentence_transformers import SentenceTransformer, losses, InputExample, models, util, quantization
from sentence_transformers.evaluation import InformationRetrievalEvaluator
from torch.utils.data import DataLoader
import torch
from torch import nn

import os

import pandas as pd
import numpy as np

from custom_adapter_module import AdapterModule

import utils
import importlib
importlib.reload(utils)

<module 'utils' from '/home/estudiante/mlt_project/utils/__init__.py'>

In [3]:
# Seed torch's global RNG so that randomly initialised weights (the adapter's
# layers) and any later torch-driven randomness in this kernel are repeatable
# across "Restart & Run All".
SEED = 42
torch.manual_seed(SEED)

# Token-level encoder: the pretrained MiniLM checkpoint.
word_embedding_model = models.Transformer(
    model_name_or_path="sentence-transformers/all-MiniLM-L12-v2",  # base Sentence Transformers checkpoint
    max_seq_length=128,  # maximum sequence length
    do_lower_case=False  # do not lowercase the input text
)

# Read the embedding width from the loaded checkpoint instead of hard-coding
# it, so the pooling layer and the adapter stay in sync with the base model.
embedding_dim = word_embedding_model.get_word_embedding_dimension()

# Pooling layer: mean over token embeddings; every other pooling mode is off.
pooling_model = models.Pooling(
    word_embedding_dimension=embedding_dim,  # width of the token embeddings
    pooling_mode_cls_token=False,  # do not pool with the CLS token
    pooling_mode_mean_tokens=True,  # pool with the mean of the tokens
    pooling_mode_max_tokens=False,  # do not pool with the token-wise max
    pooling_mode_mean_sqrt_len_tokens=False,  # do not pool with the sqrt-length mean
    pooling_mode_weightedmean_tokens=False,  # do not pool with the weighted mean
    pooling_mode_lasttoken=False,  # do not pool with the last token
    include_prompt=True  # include the prompt in the pooling
)

# Normalisation layer applied as the last module of both models.
normalize = models.Normalize()

# Freeze the encoder weights so they are not trained; the adapter is not
# frozen here.
for param in word_embedding_model.parameters():
    param.requires_grad = False

# Use the GPU when available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Adapter module with matching input and output width, placed on `device`.
adapter = AdapterModule(embedding_dim, embedding_dim).to(device)

# Baseline: encoder -> pooling -> normalisation.
base_model = SentenceTransformer(modules=[word_embedding_model, pooling_model, normalize], device=device)

# Domain model: same encoder and pooling instances, with the adapter inserted
# before normalisation.
custom_domain_model = SentenceTransformer(
    modules=[word_embedding_model, pooling_model, adapter, normalize], device=device
)

custom_domain_model  # display the architecture of the custom model



SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): AdapterModule(
    (dense1): Linear(in_features=384, out_features=1024, bias=True)
    (dense2): Linear(in_features=1024, out_features=512, bias=True)
    (output): Linear(in_features=512, out_features=384, bias=True)
    (activation): ReLU()
    (dropout): Dropout(p=0.3, inplace=False)
  )
  (3): Normalize()
)

In [4]:
# Load the pickled training pairs and the evaluation data (indexed later by
# 'queries', 'corpus' and 'relevant_docs').
# NOTE: unpickling executes arbitrary code; only load files you trust.
training_pickle, evaluation_pickle = 'data/qa_training.pkl', 'data/qa_evaluation.pkl'
qa = pd.read_pickle(training_pickle)
qa_eval = pd.read_pickle(evaluation_pickle)

In [5]:
# Wrap every training item in an InputExample built from its first two
# elements (presumably question and answer — confirm against the pickle).
train_examples = []
for pair in qa:
    train_examples.append(InputExample(texts=[pair[0], pair[1]]))

In [5]:
# Report the number of training items and of evaluation queries.
n_training_items = len(qa)
n_validation_queries = len(qa_eval['queries'])
print("Training lenght: ", n_training_items)
print("Validation lenght: ", n_validation_queries)

Training lenght:  12105
Validation lenght:  1216


In [6]:
# Training set: one InputExample per item, holding a pair of texts
# (question and answer).
train_dataset = []
for pair in qa:
    train_dataset.append(InputExample(texts=[pair[0], pair[1]]))

# DataLoader over the training set; shuffle=True reshuffles the data on
# every epoch.
loader = DataLoader(train_dataset, batch_size=256, shuffle=True)

# Loss: MultipleNegativesSymmetricRankingLoss, suited to information-retrieval
# tasks trained from positive pairs.
train_loss = losses.MultipleNegativesSymmetricRankingLoss(custom_domain_model)

# Evaluator: InformationRetrievalEvaluator scores a model on a set of queries
# against a corpus, using the dot product as the main score.
eval_queries = qa_eval['queries']
eval_corpus = qa_eval['corpus']
eval_relevant_docs = qa_eval['relevant_docs']
evaluator = InformationRetrievalEvaluator(
    eval_queries,
    eval_corpus,
    eval_relevant_docs,
    name='qa_eval',
    main_score_function='dot_score'
)

# Schedule: number of epochs, with warm-up over the first 10% of all
# optimisation steps.
epochs = 500
steps_per_epoch = len(loader)
warmup_steps = int(steps_per_epoch * epochs * 0.1)

In [None]:
# Train the custom model with the loader/loss objective and the evaluator.
# NOTE(review): the recorded progress bar shows 48 iterations per epoch, which
# is fewer than evaluation_steps=50 — confirm the intended evaluation cadence.
fit_options = dict(
    train_objectives=[(loader, train_loss)],  # (DataLoader, loss) training objective
    epochs=epochs,  # number of epochs
    warmup_steps=warmup_steps,  # number of warm-up steps
    output_path='results/domain_adaptation_model',  # where the trained model is saved
    show_progress_bar=True,  # show a progress bar while training
    save_best_model=True,  # keep the best model according to the evaluator
    use_amp=True,  # enable mixed precision
    evaluator=evaluator,  # evaluator used during training
    evaluation_steps=50,  # evaluate every 50 steps
)
custom_domain_model.fit(**fit_options)


Epoch:   0%|                                                                                                                            | 0/500 [00:00<?, ?it/s]
Iteration:   0%|                                                                                                                         | 0/48 [00:00<?, ?it/s][A
Iteration:   2%|██▎                                                                                                              | 1/48 [00:00<00:31,  1.48it/s][A
Iteration:   6%|███████                                                                                                          | 3/48 [00:00<00:11,  3.89it/s][A
Iteration:  10%|███████████▊                                                                                                     | 5/48 [00:00<00:07,  5.75it/s][A
Iteration:  15%|████████████████▍                                                                                                | 7/48 [00:01<00:05,  7.15it/s][A
Iteration:  19%|███

## Evaluating the base model & the custom model

In [8]:
# Rebind custom_domain_model to the model stored on disk at the training
# output path, replacing the in-memory instance.
trained_model_dir = './results/domain_adaptation_model'
custom_domain_model = SentenceTransformer(trained_model_dir)

In [9]:
# Hit rate at k=10 on the evaluation set, first for the base model and then
# for the custom model; both calls share the same queries/corpus/relevance.
hit_rate_inputs = (qa_eval['queries'], qa_eval['corpus'], qa_eval['relevant_docs'])

hit_rate = utils.hit_rate_at_k(*hit_rate_inputs, k=10, model=base_model)
print(f"Hit Rate @ 10 (base model): {round(hit_rate, 2)}")

hit_rate = utils.hit_rate_at_k(*hit_rate_inputs, k=10, model=custom_domain_model)
print(f"Hit Rate @ 10 (custom model): {round(hit_rate, 2)}")

Hit Rate @ 10 (base model): 0.1
Hit Rate @ 10 (custom model): 0.94


In [12]:
# Score both models with the shared evaluator and print the returned main
# score, labelled MAP@100.
# The evaluator is handed an output directory for its results file; create
# both directories up front so this cell also works on a fresh checkout
# instead of depending on folders made by hand.
for results_dir in ('results/base_model/', 'results/custom_model/'):
    os.makedirs(results_dir, exist_ok=True)

eva_base_model = evaluator(base_model, output_path='results/base_model/')
print("MAP @ 100 (base model): ", round(eva_base_model, 3))

eva_custom_model = evaluator(custom_domain_model, output_path='results/custom_model/')
print("MAP @ 100 (custom model): ", round(eva_custom_model, 3))

MAP @ 100 (base model):  0.002
MAP @ 100 (custom model):  0.037


In [13]:
# Read each model's evaluation CSV and tag its rows with the model name in a
# 'tipo' column.
base_model_eval = pd.read_csv(
    'results/base_model/Information-Retrieval_evaluation_qa_eval_results.csv'
).assign(tipo='base_model')
custom_model_eval = pd.read_csv(
    'results/custom_model/Information-Retrieval_evaluation_qa_eval_results.csv'
).assign(tipo='custom_model')

# Stack both result tables once, save the comparison, then display it.
eval_comparison = pd.concat([base_model_eval, custom_model_eval])
eval_comparison.to_csv('results/eval_comparation.csv', index=False)

eval_comparison


Unnamed: 0,epoch,steps,cos_sim-Accuracy@1,cos_sim-Accuracy@3,cos_sim-Accuracy@5,cos_sim-Accuracy@10,cos_sim-Precision@1,cos_sim-Recall@1,cos_sim-Precision@3,cos_sim-Recall@3,...,dot_score-Precision@3,dot_score-Recall@3,dot_score-Precision@5,dot_score-Recall@5,dot_score-Precision@10,dot_score-Recall@10,dot_score-MRR@10,dot_score-NDCG@10,dot_score-MAP@100,tipo
0,-1,-1,0.035362,0.055921,0.069901,0.100329,0.035362,0.001052,0.01864,0.001566,...,0.01864,0.001566,0.01398,0.001918,0.010033,0.002831,0.0517,0.013919,0.00189,base_model
0,-1,-1,0.514803,0.754934,0.860197,0.943257,0.514803,0.028315,0.251645,0.042509,...,0.251645,0.042509,0.172039,0.04757,0.094326,0.05231,0.65786,0.167424,0.036689,custom_model


### Comparing author representations

In [28]:
# Assumes the embeddings are normalised, so the dot products below can be
# read as similarity scores.
question1 = "Which authors are working in Artificial Intelligence?"
question2 =  "Which authors are working in Biology?"
ai_author = "Kudenko D."
bio_author = "Susan H. Fisher"

# Encode the two questions and the two author names with the custom model.
emb_q1 = custom_domain_model.encode(question1)
emb_q2 = custom_domain_model.encode(question2)
autai_emb = custom_domain_model.encode(ai_author)
autbio_emb = custom_domain_model.encode(bio_author)

# Score each author against each question.
bio_vs_q1 = autbio_emb @ emb_q1
ai_vs_q1 = autai_emb @ emb_q1
bio_vs_q2 = autbio_emb @ emb_q2
ai_vs_q2 = autai_emb @ emb_q2

print("Which authors are working in Artificial Intelligence?", bio_vs_q1,"(Susan) --", ai_vs_q1, "(Kudenko)")
print("Which authors are working in Biology?", bio_vs_q2, "(Susan) --", ai_vs_q2, "(Kudenko)")

Which authors are working in Artificial Intelligence? 0.031228611 (Susan) -- 0.17037265 (Kudenko)
Which authors are working in Biology? 0.20633332 (Susan) -- 0.09790592 (Kudenko)


### The custom model maintains its original capabilities

In [17]:
# Probe texts for the general-knowledge check: one paper title and three
# everyday concepts.
paper = "Composable Lightweight Processors"
concept1, concept2, concept3 = "shark", "ocean", "strawberry"

In [18]:
# Encode the paper title and the three concepts with the custom model.
custom_paper = custom_domain_model.encode(paper)

custom_concept1 = custom_domain_model.encode(concept1)
custom_concept2 = custom_domain_model.encode(concept2)
custom_concept3 = custom_domain_model.encode(concept3)

# Pairwise dot products: concept vs. concept, and paper vs. concept.
custom_shark_ocean = np.dot(custom_concept1, custom_concept2)
custom_shark_strawberry = np.dot(custom_concept1, custom_concept3)
custom_paper_ocean = np.dot(custom_paper, custom_concept2)

# Print the results.
print(f"Producto punto entre dos conceptos (shark y ocean): {custom_shark_ocean}")
print(f"Producto punto entre dos conceptos (shark y strawberry): {custom_shark_strawberry}")
print(f"Producto punto entre el documento y un concepto (ocean): {custom_paper_ocean}")

Producto punto entre dos conceptos (shark y ocean): 0.5841832160949707
Producto punto entre dos conceptos (shark y strawberry): 0.39272424578666687
Producto punto entre el documento y un concepto (ocean): 0.007279553450644016


In [19]:
# Encode the same paper title and concepts with the base model for comparison.
base_paper = base_model.encode(paper)

base_concept1 = base_model.encode(concept1)
base_concept2 = base_model.encode(concept2)
base_concept3 = base_model.encode(concept3)

# Pairwise dot products: concept vs. concept, and paper vs. concept.
base_shark_ocean = np.dot(base_concept1, base_concept2)
base_shark_strawberry = np.dot(base_concept1, base_concept3)
base_paper_ocean = np.dot(base_paper, base_concept2)

# Print the results.
print(f"Producto punto entre dos conceptos (shark y ocean): {base_shark_ocean}")
print(f"Producto punto entre dos conceptos (shark y strawberry): {base_shark_strawberry}")
print(f"Producto punto entre el documento y un concepto (ocean): {base_paper_ocean}")

Producto punto entre dos conceptos (shark y ocean): 0.5232284069061279
Producto punto entre dos conceptos (shark y strawberry): 0.23324596881866455
Producto punto entre el documento y un concepto (ocean): -0.08226228505373001
