In [1]:
from sentence_transformers import SentenceTransformer, losses, InputExample
from sentence_transformers.evaluation import InformationRetrievalEvaluator
from torch.utils.data import DataLoader
import torch
from torch import nn

import re
import pandas as pd

In [2]:
class AdapterModule(nn.Module):
    def __init__(self, input_dim, output_dim, dropout_rate=0.2, add_residual=True):
        super(AdapterModule, self).__init__()
        self.dense1 = nn.Linear(in_features=input_dim, out_features=1024, bias=True)
        self.dense2 = nn.Linear(in_features=1024, out_features=512, bias=True)
        self.output = nn.Linear(in_features=512, out_features=output_dim)
        self.activation = nn.ReLU()
        self.dropout = nn.Dropout(dropout_rate)
        self.add_residual = add_residual
        if add_residual:
            self.residual_weight = nn.Parameter(torch.zeros(1))

    def forward(self, x):
        original_x = x if self.add_residual else None
        x = self.dropout(self.activation(self.dense1(x)))
        x = self.dropout(self.activation(self.dense2(x)))
        x = self.output(x)
        if self.add_residual:
            x += self.residual_weight * original_x
        return x

# Clase que combina el modelo base y el adaptador
class CustomDomainModel(SentenceTransformer):
    def __init__(self, sentence_transformer, adapter):
        super(CustomDomainModel, self).__init__()
        self.modules = sentence_transformer
        self.adapter = adapter

    def forward(self, input_query):
        embedding = self.modules.encode(input_query, normalize_embeddings=True, convert_to_tensor=True)
        output = self.adapter(embedding)
        return output
    
    
    
    
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    
sentence_model = SentenceTransformer("sentence-transformers/all-MiniLM-L12-v2", 
                                     device=device)

for param in sentence_model.parameters():
    param.requires_grad = False

    
# Determinar la dimensión de entrada de acuerdo con la última capa del modelo base
dim = sentence_model.get_sentence_embedding_dimension()

# Instancia del módulo adaptador
adapter = AdapterModule(dim, dim).to(device)

# Crear el modelo completo
custom_domain_model = CustomDomainModel(sentence_model, adapter).to(device)



In [3]:
for param in sentence_model.parameters():
    print(param)

Parameter containing:
tensor([[-0.0200, -0.0034, -0.0147,  ...,  0.0381, -0.0054,  0.0311],
        [-0.0164, -0.0306,  0.0487,  ..., -0.0158,  0.0132, -0.0076],
        [-0.0179, -0.0110,  0.0321,  ..., -0.0232,  0.0219, -0.0079],
        ...,
        [-0.0268, -0.0060,  0.0316,  ..., -0.0249,  0.0128, -0.0103],
        [-0.0040, -0.0477,  0.0333,  ..., -0.0235,  0.0242, -0.0132],
        [-0.0340,  0.0029,  0.0035,  ..., -0.0198,  0.0258,  0.0087]],
       device='cuda:0')
Parameter containing:
tensor([[-0.0818, -0.0318, -0.0171,  ...,  0.0914,  0.0950,  0.0268],
        [-0.0311,  0.0021,  0.0114,  ...,  0.0275,  0.0498, -0.0103],
        [-0.0122, -0.0142,  0.0262,  ...,  0.0043,  0.0450, -0.0162],
        ...,
        [ 0.0050,  0.0131, -0.0085,  ..., -0.0356,  0.0383, -0.0005],
        [ 0.0155,  0.0149, -0.0155,  ...,  0.0017,  0.0265, -0.0240],
        [-0.0231, -0.0088, -0.0210,  ..., -0.0408,  0.0536, -0.0063]],
       device='cuda:0')
Parameter containing:
tensor([[ 1.4139e-

In [43]:
sentence_model.modules()

<generator object Module.modules at 0x7978980d1310>

In [36]:
train_examples = [
    InputExample(texts=['Anchor 1', 'Positive 1']),
    InputExample(texts=['Anchor 2', 'Positive 2']),
]
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=32)
train_loss = losses.MultipleNegativesRankingLoss(custom_domain_model)


custom_domain_model.fit(
    [(train_dataloader, train_loss)],
    epochs=10,
)

AttributeError: 'CustomDomainModel' object has no attribute 'fit'

In [25]:
sentence_model._first_module()

Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 

In [None]:
# define evaluator
# define over validation dataset
...
evaluator = InformationRetrievalEvaluator(queries, corpus, relevant_docs)

# run training
...
custom_domain_model.fit(
    train_objectives=[(loader, loss)],
    epochs=10,
    warmup_steps=warmup_steps,
    output_path='exp_finetune',
    show_progress_bar=True,
    evaluator=evaluator, 
    evaluation_steps=50,
)

In [9]:
print(custom_model)

CustomDomainModel(
  (0): None
  (sentence_transformer): SentenceTransformer(
    (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 
    (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
    (2): Normalize()
  )
  (adapter): AdapterModule(
    (dense): Linear(in_features=384, out_features=1024, bias=True)
    (output): Linear(in_features=1024, out_features=384, bias=True)
    (activation): ReLU()
  )
)


In [5]:
custom_model.fit()

TypeError: SentenceTransformer.fit() missing 1 required positional argument: 'train_objectives'

tensor([-8.5604e-03,  4.7063e-03,  2.9338e-02, -1.6920e-03,  7.8644e-03,
         6.0411e-03, -5.9473e-03,  2.1570e-02, -3.0717e-03, -2.5239e-03,
        -2.3898e-02,  2.4159e-02, -4.0308e-02, -1.9330e-02,  1.4258e-02,
         3.6247e-02, -1.8038e-02, -1.8899e-02,  4.3695e-02,  2.6684e-03,
        -2.7440e-02, -2.0725e-02,  1.1475e-03, -4.6859e-02,  6.2541e-02,
        -3.2953e-02, -1.2535e-03,  9.1481e-03,  7.9562e-03, -1.7621e-02,
         6.7545e-03, -4.0406e-03,  2.9831e-02,  5.0202e-03, -1.5387e-02,
        -4.6414e-03, -1.5906e-03,  2.1663e-02, -3.0840e-02, -5.1280e-04,
         1.7715e-02, -1.7980e-02,  1.6104e-02, -9.2438e-03, -9.8847e-04,
        -7.4001e-05, -4.0515e-03,  7.3917e-03, -1.3626e-02,  1.7298e-03,
         4.5187e-03,  3.5513e-02,  1.3701e-02,  2.2093e-02, -5.2322e-03,
        -1.6128e-02, -1.7908e-02, -2.3397e-02, -5.6054e-02, -9.1612e-03,
        -8.2587e-03,  3.5715e-02, -3.5842e-02,  5.0965e-03, -7.5257e-03,
        -5.9634e-03, -1.9318e-02,  1.3596e-02, -4.7

In [7]:
sentence_model.encode("hola", normalize_embeddings=True, convert_to_tensor=True)

tensor([-7.1916e-02,  1.0171e-01,  1.9020e-02, -2.7333e-02, -9.5139e-02,
        -5.5759e-02,  2.5668e-02,  4.0099e-02, -3.4066e-02, -3.0349e-02,
        -5.8926e-02, -9.3682e-02, -5.0599e-02,  9.1100e-03, -8.5150e-02,
        -1.7012e-03, -3.2957e-02,  1.4803e-02, -8.0482e-02, -4.6386e-02,
         1.4728e-02,  5.8762e-02, -7.7058e-02,  1.2527e-01,  8.2589e-03,
        -1.1486e-01, -3.2913e-02, -5.2499e-03, -4.0697e-02, -1.2445e-02,
        -4.8113e-02,  7.6518e-03,  2.1850e-02, -2.3466e-02, -8.3040e-02,
         4.6369e-03,  1.0829e-02,  4.2880e-02,  5.0805e-02,  7.3297e-02,
        -1.9183e-02, -1.1900e-01, -7.9420e-03, -2.1730e-02,  3.1332e-02,
        -1.0437e-01, -4.3945e-02,  6.5122e-03,  2.7526e-02,  5.6662e-02,
         3.6123e-02,  7.4572e-03, -5.6750e-02,  6.7070e-02,  4.4845e-03,
        -1.6215e-02,  7.7607e-04,  1.8151e-02, -5.5292e-02,  2.3256e-02,
        -2.6742e-02,  1.7868e-02, -3.2128e-03, -7.7392e-03,  3.1251e-02,
        -7.8285e-02,  3.9264e-02, -2.0109e-02,  4.9

CustomDomainModel(
  (0): None
  (sentence_transformer): SentenceTransformer(
    (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 
    (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
    (2): Normalize()
  )
  (adapter): AdapterModule(
    (dense): Linear(in_features=384, out_features=1024, bias=True)
    (output): Linear(in_features=1024, out_features=384, bias=True)
    (activation): ReLU()
  )
)


torch.Size([384])

In [3]:
# define evaluator
#from sentence_transformers.evaluation import InformationRetrievalEvaluator
# define over validation dataset

#evaluator = InformationRetrievalEvaluator(queries, corpus, relevant_docs)


In [49]:
data = pd.read_parquet("data/mlt_data_publications.parquet")

In [50]:
authors = []
for row in data.itertuples():
    for author in row.authors:
        names = [author.get('name')]
        
        try:
            aliases = author.get('aliases').tolist()
        except AttributeError:
            aliases = []
        
        names += aliases
        
        names = [' '.join(set(re.sub(r'\s+', ' ', name).split(' ')) )for name in names]

        for name in list(set(names)):
            authors.append({'author': name, 'publication': row.title, 
                            'paperId': row.paperId, 'authorId': author.get('authorId')})

In [51]:
authors = pd.DataFrame(authors)

In [52]:
authors

Unnamed: 0,author,publication,paperId,authorId
0,Paramvir Victor Bahl,RADAR: an in-building RF-based user location a...,bb01353f818ca226b53433163893efc56c3df32d,2292948
1,Paramvir Bahl,RADAR: an in-building RF-based user location a...,bb01353f818ca226b53433163893efc56c3df32d,2292948
2,P. Bahl,RADAR: an in-building RF-based user location a...,bb01353f818ca226b53433163893efc56c3df32d,2292948
3,Padmanabhan Venkat,RADAR: an in-building RF-based user location a...,bb01353f818ca226b53433163893efc56c3df32d,1799406
4,N. Padmanabhan Venkat,RADAR: an in-building RF-based user location a...,bb01353f818ca226b53433163893efc56c3df32d,1799406
...,...,...,...,...
76502,Dangendorf F.,Microbial Load of Drinking Water Reservoir Tri...,f4f9319a7056d134d900e872f0ad76b5cf250afe,4140745
76503,Fischeder R.,Microbial Load of Drinking Water Reservoir Tri...,f4f9319a7056d134d900e872f0ad76b5cf250afe,13172189
76504,Gebel J.,Microbial Load of Drinking Water Reservoir Tri...,f4f9319a7056d134d900e872f0ad76b5cf250afe,2730109
76505,Vacata V.,Microbial Load of Drinking Water Reservoir Tri...,f4f9319a7056d134d900e872f0ad76b5cf250afe,5882296


In [54]:
data.authors.iloc[0]#[0].get('aliases')

array([{'affiliations': array([], dtype=object), 'aliases': array(['P. Bahl', 'Paramvir Victor Bahl', 'Paramvir Bahl'], dtype=object), 'authorId': '2292948', 'citationCount': 30051.0, 'externalIds': {'DBLP': array(['Paramvir Bahl', 'Paramvir Victor Bahl', 'Victor Bahl'],
             dtype=object), 'ORCID': None}, 'name': 'P. Bahl', 'paperCount': 147.0, 'url': 'https://www.semanticscholar.org/author/2292948'}                                                                                                                                   ,
       {'affiliations': array([], dtype=object), 'aliases': array(['Venkata N. Padmanabhan', 'V.n. Padmanabhan', 'V N Padmanabhan',
              'V. N. Padmanabhan', 'Venkat Padmanabhan', 'Venkata N Padmanabhan',
              'Venkat N. Padmanabhan'], dtype=object), 'authorId': '1799406', 'citationCount': 29854.0, 'externalIds': {'DBLP': array(['Venkat N. Padmanabhan', 'Venkata N. Padmanabhan'], dtype=object), 'ORCID': None}, 'name': 'V. Padmanabhan

In [12]:
titles_map = {row.paperId:row.title for row in data.itertuples()}

In [13]:
related_pubs = [(titles_map[row.paperId], 'relatedWith', titles_map[row.source]) 
                for row in data.itertuples() if row.con_type!='base']

In [18]:
question_answer_pairs = [
    (f'Which paper is cited or referenced in the paper titled "{triple[0]}"?', triple[2])
    for triple in related_pubs
]


In [19]:
len(question_answer_pairs)

10184