In [1]:
from sentence_transformers import SentenceTransformer, losses, InputExample, models, util
from sentence_transformers.evaluation import InformationRetrievalEvaluator
from torch.utils.data import DataLoader
import torch
from torch import nn

import os

import pandas as pd
import numpy as np

import utils
import importlib
importlib.reload(utils)

  from .autonotebook import tqdm as notebook_tqdm


<module 'utils' from '/home/estudiante/mlt_project/utils/__init__.py'>

In [2]:
class AdapterModule(nn.Module):
    """Small feed-forward adapter applied to a pooled sentence embedding.

    The module reads ``features['sentence_embedding']``, passes it through
    ``Linear(input_dim, 1024) -> ReLU -> Dropout -> Linear(1024, 512) -> ReLU
    -> Dropout -> Linear(512, output_dim)`` and writes the result back under
    the same key. With ``add_residual=True`` a learnable scalar times the
    original embedding is added to the output.

    Args:
        input_dim: Size of the last dimension of the incoming embedding.
        output_dim: Size of the last dimension of the produced embedding.
        dropout_rate: Dropout probability used after each hidden activation.
        add_residual: Whether to add ``residual_weight * input`` to the output.

    Raises:
        ValueError: If ``add_residual`` is true and ``input_dim`` differs from
            ``output_dim`` (the residual sum would not be shape-compatible).
    """

    # File names written by save() and read back by load().
    WEIGHTS_NAME = 'adapter_module.pt'
    CONFIG_NAME = 'adapter_config.json'

    def __init__(self, input_dim, output_dim, dropout_rate=0.3, add_residual=True):
        super().__init__()
        if add_residual and input_dim != output_dim:
            # Fail early instead of with an opaque shape error inside forward().
            raise ValueError(
                "add_residual=True requires input_dim == output_dim, "
                f"got input_dim={input_dim} and output_dim={output_dim}"
            )

        # Constructor arguments are kept so that save()/load() can rebuild the module.
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.dropout_rate = dropout_rate

        self.dense1 = nn.Linear(in_features=input_dim, out_features=1024, bias=True)
        self.dense2 = nn.Linear(in_features=1024, out_features=512, bias=True)
        self.output = nn.Linear(in_features=512, out_features=output_dim)
        self.activation = nn.ReLU()
        self.dropout = nn.Dropout(dropout_rate)
        self.add_residual = add_residual
        if add_residual:
            # Small initialization: the residual scale is drawn uniformly from [0, 0.1).
            self.residual_weight = nn.Parameter(nn.init.uniform_(torch.empty(1), 0, 0.1))

    def get_config_dict(self):
        """Return the constructor arguments as a JSON-serializable dict."""
        return {
            'input_dim': int(self.input_dim),
            'output_dim': int(self.output_dim),
            'dropout_rate': float(self.dropout_rate),
            'add_residual': bool(self.add_residual),
        }

    def forward(self, input_data):
        """Transform ``input_data['sentence_embedding']`` and return the same dict.

        Args:
            input_data: Mapping that holds the embedding tensor under the key
                ``'sentence_embedding'``; its last dimension must be ``input_dim``.

        Returns:
            The same ``input_data`` object, with ``'sentence_embedding'``
            replaced by the adapted tensor.

        Raises:
            KeyError: If ``'sentence_embedding'`` is missing from ``input_data``.
        """
        original_x = input_data['sentence_embedding']
        x = self.dropout(self.activation(self.dense1(original_x)))
        x = self.dropout(self.activation(self.dense2(x)))
        x = self.output(x)
        if self.add_residual:
            # Out-of-place add: builds a new tensor rather than mutating the
            # output of the last linear layer.
            x = x + self.residual_weight * original_x

        input_data['sentence_embedding'] = x

        return input_data

    def save(self, output_path):
        """Write the weights and the constructor config into ``output_path``.

        The weights file keeps its original name (``adapter_module.pt``) and
        format (a plain ``state_dict``); the config is stored next to it as JSON.
        """
        import json  # local import: the notebook's import cell does not bring in json

        os.makedirs(output_path, exist_ok=True)
        torch.save(self.state_dict(), os.path.join(output_path, self.WEIGHTS_NAME))
        with open(os.path.join(output_path, self.CONFIG_NAME), 'w') as config_file:
            json.dump(self.get_config_dict(), config_file, indent=2)

    @classmethod
    def load(cls, input_path):
        """Rebuild a module from a directory previously written by ``save``.

        If the JSON config is absent (directory written before the config was
        introduced), the dimensions and the residual flag are inferred from the
        stored tensors and ``dropout_rate`` falls back to its default.

        NOTE(review): sentence-transformers is expected to call ``load`` on
        custom modules when a saved model is re-opened -- confirm against the
        installed version.
        """
        import json  # local import: the notebook's import cell does not bring in json

        state = torch.load(os.path.join(input_path, cls.WEIGHTS_NAME), map_location='cpu')
        config_path = os.path.join(input_path, cls.CONFIG_NAME)
        if os.path.isfile(config_path):
            with open(config_path) as config_file:
                config = json.load(config_file)
        else:
            # nn.Linear stores its weight as (out_features, in_features).
            config = {
                'input_dim': state['dense1.weight'].shape[1],
                'output_dim': state['output.weight'].shape[0],
                'add_residual': 'residual_weight' in state,
            }
        module = cls(**config)
        module.load_state_dict(state)
        return module
    
# Pretrained transformer that produces the token embeddings.
word_embedding_model = models.Transformer(
    model_name_or_path="sentence-transformers/all-MiniLM-L12-v2",
    max_seq_length=128,
    do_lower_case=False,
)

# Freeze the transformer: after this loop the adapter is the only part of the
# custom model whose parameters still require gradients.
for param in word_embedding_model.parameters():
    param.requires_grad = False

# Take the embedding width from the loaded model instead of hard-coding it, so
# the pooling layer and the adapter stay consistent with the base checkpoint.
embedding_dim = word_embedding_model.get_word_embedding_dimension()

# Default pooling parameters of the base model being used (mean pooling only).
pooling_model = models.Pooling(
    word_embedding_dimension=embedding_dim,
    pooling_mode_cls_token=False,
    pooling_mode_mean_tokens=True,
    pooling_mode_max_tokens=False,
    pooling_mode_mean_sqrt_len_tokens=False,
    pooling_mode_weightedmean_tokens=False,
    pooling_mode_lasttoken=False,
    include_prompt=True,
)
normalize = models.Normalize()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The adapter maps the pooled embedding back to the same width, which the
# residual connection needs.
adapter = AdapterModule(embedding_dim, embedding_dim).to(device)

# Reference pipeline without the adapter.
base_model = SentenceTransformer(
    modules=[word_embedding_model, pooling_model, normalize],
    device=device,
)

# Same pipeline with the adapter inserted between pooling and normalization.
# Both pipelines share the transformer and pooling module objects.
custom_domain_model = SentenceTransformer(
    modules=[word_embedding_model, pooling_model, adapter, normalize],
    device=device,
)

custom_domain_model



SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): AdapterModule(
    (dense1): Linear(in_features=384, out_features=1024, bias=True)
    (dense2): Linear(in_features=1024, out_features=512, bias=True)
    (output): Linear(in_features=512, out_features=384, bias=True)
    (activation): ReLU()
    (dropout): Dropout(p=0.3, inplace=False)
  )
  (3): Normalize()
)

In [3]:
# NOTE(review): unpickling can execute arbitrary code -- only load pickles that
# come from a trusted source.
# Later cells iterate `qa` and index each item with [0] and [1]; `qa_eval` is
# indexed with 'queries', 'corpus' and 'relevant_docs'.
qa, qa_eval = (
    pd.read_pickle(pickle_path)
    for pickle_path in ('data/qa_training.pkl', 'data/qa_evaluation.pkl')
)

Training lenght:  4200
Validation lenght:  3


In [4]:
# One InputExample per training record; items 0 and 1 of a record are the two
# texts of the pair. The loop variable is deliberately not named `qa` so it
# does not shadow the dataset it iterates over.
train_examples = [InputExample(texts=[pair[0], pair[1]]) for pair in qa]

In [15]:
# Report the number of training records and of evaluation queries.
n_training = len(qa)
n_validation = len(qa_eval['queries'])
print("Training lenght: ", n_training)
print("Validation lenght: ", n_validation)

Training lenght:  4200
Validation lenght:  876


In [5]:
# Shuffled mini-batches over the training examples.
loader = DataLoader(train_examples, batch_size=128, shuffle=True)

# Loss object built around the model being trained, with the library defaults.
train_loss = losses.MultipleNegativesSymmetricRankingLoss(custom_domain_model)

# Retrieval evaluator over the evaluation split; 'dot_score' is selected as the
# main score function.
evaluator = InformationRetrievalEvaluator(
    queries=qa_eval['queries'],
    corpus=qa_eval['corpus'],
    relevant_docs=qa_eval['relevant_docs'],
    name='qa_eval',
    main_score_function='dot_score',
)

epochs = 300
# Warm-up covers 10% of all training steps (batches per epoch * epochs).
total_steps = len(loader) * epochs
warmup_steps = int(total_steps * 0.1)

# use_amp is not passed, so mixed precision stays at the library default.
custom_domain_model.fit(
    train_objectives=[(loader, train_loss)],
    evaluator=evaluator,
    evaluation_steps=50,
    epochs=epochs,
    warmup_steps=warmup_steps,
    output_path='domain_adaptation_model',
    save_best_model=True,
    show_progress_bar=True,
)

Epoch:   0%|                                                                                                                                         | 0/300 [00:00<?, ?it/s]
Iteration:   0%|                                                                                                                                      | 0/33 [00:00<?, ?it/s][A
Iteration:   3%|███▊                                                                                                                          | 1/33 [00:00<00:20,  1.57it/s][A
Iteration:   9%|███████████▍                                                                                                                  | 3/33 [00:00<00:07,  4.03it/s][A
Iteration:  15%|███████████████████                                                                                                           | 5/33 [00:00<00:04,  5.92it/s][A
Iteration:  21%|██████████████████████████▋                                                                           

## Evaluating the base model & the custom model

In [6]:
# Both models are scored on the same evaluation split with the same cut-off.
eval_split = (qa_eval['queries'], qa_eval['corpus'], qa_eval['relevant_docs'])

# Baseline: pipeline without the adapter.
hit_rate = utils.hit_rate_at_k(*eval_split, k=10, model=base_model)
print(f"Hit Rate @ 10 (base model): {round(hit_rate, 2)}")

# Pipeline with the adapter.
hit_rate = utils.hit_rate_at_k(*eval_split, k=10, model=custom_domain_model)
print(f"Hit Rate @ 10 (custom model): {round(hit_rate, 2)}")

Hit Rate @ 10 (base model): 0.32
Hit Rate @ 10 (custom model): 0.92


In [16]:
# The evaluator is given an output directory for its results CSV (read back in
# the next cell). Create both directories up front so this cell also runs on a
# fresh checkout.
# NOTE(review): the evaluator does not appear to create output_path itself --
# confirm against the installed sentence-transformers version.
os.makedirs('results/base_model/', exist_ok=True)
os.makedirs('results/custom_model/', exist_ok=True)

# The value returned by the evaluator is reported here as MAP @ 100.
eva_base_model = evaluator(base_model, output_path='results/base_model/')
print("MAP @ 100 (base model): ", round(eva_base_model, 3))

eva_custom_model = evaluator(custom_domain_model, output_path='results/custom_model/')
print("MAP @ 100 (custom model): ", round(eva_custom_model, 3))

MAP @ 100 (base model):  0.006
MAP @ 100 (custom model):  0.044


In [17]:
# Load the CSV written for each model and tag every row with the model it
# belongs to ('tipo'), then stack the two frames for a side-by-side view.
base_model_eval = pd.read_csv(
    'results/base_model/Information-Retrieval_evaluation_qa_eval_results.csv'
).assign(tipo='base_model')
custom_model_eval = pd.read_csv(
    'results/custom_model/Information-Retrieval_evaluation_qa_eval_results.csv'
).assign(tipo='custom_model')

pd.concat([base_model_eval, custom_model_eval])


Unnamed: 0,epoch,steps,cos_sim-Accuracy@1,cos_sim-Accuracy@3,cos_sim-Accuracy@5,cos_sim-Accuracy@10,cos_sim-Precision@1,cos_sim-Recall@1,cos_sim-Precision@3,cos_sim-Recall@3,...,dot_score-Precision@3,dot_score-Recall@3,dot_score-Precision@5,dot_score-Recall@5,dot_score-Precision@10,dot_score-Recall@10,dot_score-MRR@10,dot_score-NDCG@10,dot_score-MAP@100,tipo
0,-1,-1,0.167808,0.242009,0.281963,0.324201,0.167808,0.004195,0.08067,0.00605,...,0.08067,0.00605,0.056393,0.007269,0.03242,0.009019,0.214125,0.053013,0.005827,base_model
0,-1,-1,0.534247,0.784247,0.853881,0.91895,0.534247,0.032326,0.261416,0.053084,...,0.261416,0.053084,0.170776,0.059294,0.091895,0.065997,0.672219,0.171889,0.044243,custom_model


### El modelo sigue preservando las propiedades del conocimiento base

In [9]:
# Probe texts named after a paper title and two authors.
paper, author, author2 = (
    "Composable Lightweight Processors",
    "Changkyu Kim",
    "Venturelli Guilherme Cavalheiro",
)

# Everyday concepts used to compare similarities between the two models.
concept1, concept2, concept3 = "shark", "ocean", "strawberry"

In [12]:
# Encode every probe text with the adapted model, one text per call.
custom_paper, custom_author, custom_author2 = (
    custom_domain_model.encode(text) for text in (paper, author, author2)
)
custom_concept1, custom_concept2, custom_concept3 = (
    custom_domain_model.encode(text) for text in (concept1, concept2, concept3)
)

# Print the dot products between selected pairs of embeddings.
print(f"Producto punto entre dos conceptos (shark y ocean): {np.dot(custom_concept1, custom_concept2)}")
print(f"Producto punto entre dos conceptos (shark y strawberry): {np.dot(custom_concept1, custom_concept3)}")
print(f"Producto punto entre el documento y un concepto (ocean): {np.dot(custom_paper, custom_concept2)}")

Producto punto entre dos conceptos (shark y ocean): 0.5274841785430908
Producto punto entre dos conceptos (shark y strawberry): 0.3383411169052124
Producto punto entre el documento y un concepto (ocean): -0.05817791819572449


In [13]:
# Encode the same probe texts with the base model (no adapter), one text per call.
base_paper, base_author, base_author2 = (
    base_model.encode(text) for text in (paper, author, author2)
)
base_concept1, base_concept2, base_concept3 = (
    base_model.encode(text) for text in (concept1, concept2, concept3)
)

# Print the dot products between selected pairs of embeddings.
print(f"Producto punto entre dos conceptos (shark y ocean): {np.dot(base_concept1, base_concept2)}")
print(f"Producto punto entre dos conceptos (shark y strawberry): {np.dot(base_concept1, base_concept3)}")
print(f"Producto punto entre el documento y un concepto (ocean): {np.dot(base_paper, base_concept2)}")

Producto punto entre dos conceptos (shark y ocean): 0.5232284069061279
Producto punto entre dos conceptos (shark y strawberry): 0.23324596881866455
Producto punto entre el documento y un concepto (ocean): -0.08226228505373001


In [None]:
# Persist the adapted model under results/custom_model, the directory that also
# receives the custom model's evaluation CSV.
custom_domain_model.save('results/custom_model')