# Train

In [1]:
# Experiment configuration: the values the rest of the notebook is parameterised on.
model_name, dataset = "distilbert-base-uncased", "scifact"  # base checkpoint, dataset folder under ../datasets
max_seq_length = 350  # forwarded to models.Transformer as max_seq_length

In [2]:

from sentence_transformers import losses, models, SentenceTransformer
from beir import util, LoggingHandler
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.search.lexical import BM25Search as BM25
from beir.retrieval.evaluation import EvaluateRetrieval
from beir.retrieval.train import TrainRetriever
import pathlib, os, tqdm
import logging

#### Configure the root logger: timestamped INFO-level messages, emitted
#### through BEIR's LoggingHandler so debug information is printed to stdout.
_log_settings = {
    "format": '%(asctime)s - %(message)s',
    "datefmt": '%Y-%m-%d %H:%M:%S',
    "level": logging.INFO,
    "handlers": [LoggingHandler()],
}
logging.basicConfig(**_log_settings)
#### /print debug information to stdout

  from tqdm.autonotebook import tqdm, trange


In [3]:
# Folder holding the dataset, relative to this notebook.
data_path = "../datasets/{}".format(dataset)

# Load the "train" split; the loader returns corpus, queries and qrels.
train_loader = GenericDataLoader(data_path)
corpus, queries, qrels = train_loader.load(split="train")

2024-06-03 13:48:01 - Loading Corpus...


100%|██████████| 5183/5183 [00:00<00:00, 24874.62it/s]


2024-06-03 13:48:01 - Loaded 5183 TRAIN Documents.
2024-06-03 13:48:01 - Doc Example: {'text': 'Alterations of the architecture of cerebral white matter in the developing human brain can affect cortical development and result in functional disabilities. A line scan diffusion-weighted magnetic resonance imaging (MRI) sequence with diffusion tensor analysis was applied to measure the apparent diffusion coefficient, to calculate relative anisotropy, and to delineate three-dimensional fiber architecture in cerebral white matter in preterm (n = 17) and full-term infants (n = 7). To assess effects of prematurity on cerebral white matter development, early gestation preterm infants (n = 10) were studied a second time at term. In the central white matter the mean apparent diffusion coefficient at 28 wk was high, 1.8 microm2/ms, and decreased toward term to 1.2 microm2/ms. In the posterior limb of the internal capsule, the mean apparent diffusion coefficients at both times were similar (1.2 ver

In [4]:
#### Build the SentenceTransformer from any sentence-transformers or HF model:
#### a Transformer module followed by a Pooling module sized to its embedding dimension.
word_embedding_model = models.Transformer(
    model_name,
    max_seq_length=max_seq_length,
)
embedding_dim = word_embedding_model.get_word_embedding_dimension()
pooling_model = models.Pooling(embedding_dim)
model = SentenceTransformer(
    modules=[word_embedding_model, pooling_model],
)



2024-06-03 13:48:31 - Use pytorch device_name: cuda


In [5]:
#### Training hyper-parameters (plain constants, set up front)
num_epochs = 10
evaluation_steps = 5000

#### Wrap the model in BEIR's training helper
retriever = TrainRetriever(model=model, batch_size=16)

#### Turn the loaded split into training samples and a shuffled dataloader
train_samples = retriever.load_train(corpus, queries, qrels)
train_dataloader = retriever.prepare_train(train_samples, shuffle=True)

#### Loss: MultipleNegativesRankingLoss over the retriever's model
train_loss = losses.MultipleNegativesRankingLoss(model=retriever.model)

#### No dev set is used here, so a dummy evaluator is passed to fit()
ir_evaluator = retriever.load_dummy_evaluator()

#### Output directory for the trained model (created if missing)
model_save_path = os.path.join("../", "output", "{}-v1-{}".format(model_name, dataset))
os.makedirs(model_save_path, exist_ok=True)

#### Warm-up: 10% of (samples * epochs / batch size), truncated to an int
total_steps = len(train_samples) * num_epochs / retriever.batch_size
warmup_steps = int(total_steps * 0.1)


Adding Input Examples: 100%|██████████| 51/51 [00:00<00:00, 645.16it/s]


2024-05-29 16:02:52 - Loaded 919 training pairs.


In [9]:
#### Run training with the objective, evaluator and schedule prepared above;
#### model_save_path is handed to fit() as the output path.
train_objectives = [(train_dataloader, train_loss)]
retriever.fit(
    train_objectives=train_objectives,
    evaluator=ir_evaluator,
    epochs=num_epochs,
    output_path=model_save_path,
    warmup_steps=warmup_steps,
    evaluation_steps=evaluation_steps,
    use_amp=True,
)

2024-05-29 01:50:49 - Starting to Train...




Step,Training Loss,Validation Loss,Sequential Score
58,No log,No log,1716972665.225394
116,No log,No log,1716972681.964723
174,No log,No log,1716972700.378657
232,No log,No log,1716972719.137342
290,No log,No log,1716972737.55563
348,No log,No log,1716972756.236504
406,No log,No log,1716972774.700456
464,No log,No log,1716972792.930798
522,0.053900,No log,1716972811.220915
580,0.053900,No log,1716972829.536385


2024-05-29 01:51:05 - Save model to ../output/distilbert-base-uncased-v1-scifact
2024-05-29 01:51:21 - Save model to ../output/distilbert-base-uncased-v1-scifact


                                                                             

2024-05-29 01:51:40 - Save model to ../output/distilbert-base-uncased-v1-scifact


                                                                             

2024-05-29 01:51:59 - Save model to ../output/distilbert-base-uncased-v1-scifact


                                                                             

2024-05-29 01:52:17 - Save model to ../output/distilbert-base-uncased-v1-scifact


                                                                             

2024-05-29 01:52:36 - Save model to ../output/distilbert-base-uncased-v1-scifact


                                                                             

2024-05-29 01:52:54 - Save model to ../output/distilbert-base-uncased-v1-scifact


                                                                             

2024-05-29 01:53:12 - Save model to ../output/distilbert-base-uncased-v1-scifact


                                                                             

2024-05-29 01:53:31 - Save model to ../output/distilbert-base-uncased-v1-scifact


                                                                             

2024-05-29 01:53:49 - Save model to ../output/distilbert-base-uncased-v1-scifact


                                                                             

# Evaluate


In [5]:
# Load the "test" split from the same folder.
# Note: this rebinds corpus, queries and qrels, replacing the training split.
test_loader = GenericDataLoader(data_path)
corpus, queries, qrels = test_loader.load(split="test")

2024-06-03 13:51:31 - Loading Corpus...


100%|██████████| 5183/5183 [00:00<00:00, 15994.33it/s]

2024-06-03 13:51:33 - Loaded 5183 TEST Documents.
2024-06-03 13:51:33 - Doc Example: {'text': 'Alterations of the architecture of cerebral white matter in the developing human brain can affect cortical development and result in functional disabilities. A line scan diffusion-weighted magnetic resonance imaging (MRI) sequence with diffusion tensor analysis was applied to measure the apparent diffusion coefficient, to calculate relative anisotropy, and to delineate three-dimensional fiber architecture in cerebral white matter in preterm (n = 17) and full-term infants (n = 7). To assess effects of prematurity on cerebral white matter development, early gestation preterm infants (n = 10) were studied a second time at term. In the central white matter the mean apparent diffusion coefficient at 28 wk was high, 1.8 microm2/ms, and decreased toward term to 1.2 microm2/ms. In the posterior limb of the internal capsule, the mean apparent diffusion coefficients at both times were similar (1.2 vers




In [6]:
from beir.retrieval.evaluation import EvaluateRetrieval
# Import SentenceBERT by name instead of `from beir.retrieval import models`.
# That form rebinds the notebook-level name `models`, which the model-building
# cell uses for sentence_transformers.models (models.Transformer / models.Pooling),
# so re-running that cell after this one would fail.
from beir.retrieval.models import SentenceBERT
from beir.retrieval.search.dense import DenseRetrievalExactSearch as DRES

## Load retriever from saved model
# Same path expression the training section uses for its output directory.
model_save_path = os.path.join("../", "output", "{}-v1-{}".format(model_name, dataset))
# Note: `model` is rebound here from the SentenceTransformer to the DRES search wrapper.
model = DRES(SentenceBERT(model_save_path), batch_size=128)
retriever = EvaluateRetrieval(model, score_function="cos_sim")

#### Retrieve dense results (format of results is identical to qrels)
results = retriever.retrieve(corpus, queries)

2024-06-03 13:51:33 - Loading faiss with AVX2 support.
2024-06-03 13:51:34 - Successfully loaded faiss with AVX2 support.
2024-06-03 13:51:34 - Use pytorch device_name: cuda
2024-06-03 13:51:34 - Load pretrained SentenceTransformer: ../output/distilbert-base-uncased-v1-scifact
2024-06-03 13:51:41 - Encoding Queries...


Batches: 100%|██████████| 3/3 [00:10<00:00,  3.52s/it]


2024-06-03 13:51:51 - Sorting Corpus by document length (Longest first)...
2024-06-03 13:51:51 - Scoring Function: Cosine Similarity (cos_sim)
2024-06-03 13:51:51 - Encoding Batch 1/1...


Batches: 100%|██████████| 41/41 [00:33<00:00,  1.22it/s]


In [12]:
# Sanity check: size of the results mapping, and of the entries for query ids '1' and '3'.
tuple(len(entry) for entry in (results, results['1'], results['3']))

(300, 1000, 1000)

In [14]:
#### Evaluate your retrieval using NDCG@k, MAP@K ...
logging.info("Retriever evaluation for k in: {}".format(retriever.k_values))
ndcg, _map, recall, precision = retriever.evaluate(qrels, results, retriever.k_values)
ndcg, _map, recall, precision

2024-05-29 01:56:23 - Retriever evaluation for k in: [1, 3, 5, 10, 100, 1000]
2024-05-29 01:56:23 - For evaluation, we ignore identical query and document ids (default), please explicitly set ``ignore_identical_ids=False`` to ignore this.
2024-05-29 01:56:23 - 

2024-05-29 01:56:23 - NDCG@1: 0.5000
2024-05-29 01:56:23 - NDCG@3: 0.5845
2024-05-29 01:56:23 - NDCG@5: 0.6121
2024-05-29 01:56:23 - NDCG@10: 0.6354
2024-05-29 01:56:23 - NDCG@100: 0.6641
2024-05-29 01:56:23 - NDCG@1000: 0.6725
2024-05-29 01:56:23 - 

2024-05-29 01:56:23 - MAP@1: 0.4772
2024-05-29 01:56:23 - MAP@3: 0.5551
2024-05-29 01:56:23 - MAP@5: 0.5741
2024-05-29 01:56:23 - MAP@10: 0.5847
2024-05-29 01:56:23 - MAP@100: 0.5912
2024-05-29 01:56:23 - MAP@1000: 0.5915
2024-05-29 01:56:23 - 

2024-05-29 01:56:23 - Recall@1: 0.4772
2024-05-29 01:56:23 - Recall@3: 0.6464
2024-05-29 01:56:23 - Recall@5: 0.7123
2024-05-29 01:56:23 - Recall@10: 0.7797
2024-05-29 01:56:23 - Recall@100: 0.9073
2024-05-29 01:56:23 - Recall@1000: 0.9767

({'NDCG@1': 0.5,
  'NDCG@3': 0.58448,
  'NDCG@5': 0.61214,
  'NDCG@10': 0.63538,
  'NDCG@100': 0.66406,
  'NDCG@1000': 0.67247},
 {'MAP@1': 0.47722,
  'MAP@3': 0.55514,
  'MAP@5': 0.57411,
  'MAP@10': 0.58474,
  'MAP@100': 0.59124,
  'MAP@1000': 0.59148},
 {'Recall@1': 0.47722,
  'Recall@3': 0.64639,
  'Recall@5': 0.71233,
  'Recall@10': 0.77967,
  'Recall@100': 0.90733,
  'Recall@1000': 0.97667},
 {'P@1': 0.5,
  'P@3': 0.23222,
  'P@5': 0.15733,
  'P@10': 0.08733,
  'P@100': 0.01023,
  'P@1000': 0.00111})