In [1]:
from torch.utils.data import DataLoader
import math
from sentence_transformers import SentenceTransformer,  LoggingHandler, losses, models, util
from sentence_transformers.evaluation import TripletEvaluator
from sentence_transformers.readers import InputExample
import logging
from datetime import datetime
import sys
import os
import gzip
import csv

#### Logging setup: print INFO-level (and above) records with a timestamp prefix
logging.basicConfig(
    level=logging.INFO,
    handlers=[LoggingHandler()],
    format='%(asctime)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)
#### /logging setup


# Path of the training TSV file. Nothing in this notebook checks for or downloads it,
# so the file has to be in place before the loading cell runs.
data_path = './dureader/train/dual.train.tsv'


# Any Hugging Face / transformers pre-trained checkpoint can be named here,
# for example bert-base-uncased, roberta-base or xlm-roberta-base
model_name = 'hfl/rbtl3'

# Training hyper-parameters
train_batch_size = 64
num_epochs = 6

# Output directory for the trained model, suffixed with today's date (YYYY-MM-DD)
model_save_path = '-'.join(('output/bi-encoder-sup_hlf-rbtl3', datetime.now().strftime("%Y-%m-%d")))

In [2]:
# Token-level encoder: a Hugging Face / transformers checkpoint (BERT, RoBERTa, XLNet, XLM-R, ...)
# that maps tokens to embeddings
word_embedding_model = models.Transformer(model_name)

# Sentence vector = mean over the token embeddings; CLS-token and max pooling are switched off
embedding_dim = word_embedding_model.get_word_embedding_dimension()
pooling_model = models.Pooling(
    embedding_dim,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False,
    pooling_mode_mean_tokens=True,
)

# Chain the two modules into a single sentence encoder
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

Some weights of the model checkpoint at hfl/rbtl3 were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


2022-06-09 00:15:39 - Use pytorch device: cuda


In [3]:
logging.info("Read train dataset")

# Every text is truncated to this many characters before it is stored.
max_text_chars = 256

train_samples = []
with open(data_path, 'r', encoding='utf-8') as f:
    # Iterate over the file object directly so the whole file is never held in memory at once
    # (readlines() would materialise every line before the loop starts).
    for line_no, line in enumerate(f, start=1):
        data = line.rstrip().split('\t')
        if data == ['']:
            # Blank line (e.g. a trailing empty line at end of file): nothing to read.
            continue
        if len(data) < 5:
            # Fail with the file position instead of a bare IndexError from data[4].
            raise ValueError(
                "{}:{}: expected at least 5 tab-separated columns, found {}".format(
                    data_path, line_no, len(data)))
        # Columns 0, 2 and 4 become the three texts of one example.
        # NOTE(review): presumably (query, positive passage, negative passage) - confirm
        # against the layout of dual.train.tsv.
        texts = [data[column][:max_text_chars] for column in (0, 2, 4)]
        train_samples.append(InputExample(texts=texts))

2022-06-09 00:15:39 - Read train dataset


In [4]:
# cross_train_samples = []
# with open('./dureader/train/cross.train.demo.tsv', 'r', encoding='utf-8') as f:
#     for line in f.readlines():
#         data =line.rstrip().split('\t')
#         inp_example = InputExample(texts=[data[0][:256], data[2][:256]], label=float(data[3]))
#         cross_train_samples.append(inp_example)

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 1% of all examples as the test set, then 1% of what is left as the dev set.
# Both calls share the same options, including the fixed random_state.
# NOTE: train_samples is overwritten here, so re-running this cell on its own splits the
# already-reduced list again; re-run the loading cell first.
split_options = dict(test_size=0.01, random_state=13)
train_samples, test_samples = train_test_split(train_samples, **split_options)
train_samples, dev_samples = train_test_split(train_samples, **split_options)

# Report the size of each split
for label, split in (("Train sentences:", train_samples),
                     ("Dev sentences:", dev_samples),
                     ("Test sentences:", test_samples)):
    print(label, len(split))

Train sentences: 871877
Dev sentences: 8807
Test sentences: 8896


In [None]:
# Batches of training examples, reshuffled every epoch
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)
train_loss = losses.MultipleNegativesRankingLoss(model=model)

# Dev-set evaluator; it is handed to fit() below, so the model IS evaluated during training
# (every `evaluation_steps` steps and after each epoch).
evaluator = TripletEvaluator.from_input_examples(dev_samples, name='dev')

# Warm the learning rate up over the first 10% of all optimisation steps
# (batches per epoch * number of epochs)
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)
logging.info("Warmup-steps: {}".format(warmup_steps))

# Train the model. Because an evaluator and an output_path are both given, fit() stores the
# checkpoint with the best dev score in model_save_path (save_best_model defaults to True).
model.fit(train_objectives=[(train_dataloader, train_loss)],
          optimizer_params={'lr': 5e-05},
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=15000,
          warmup_steps=warmup_steps,
          output_path=model_save_path,
          use_amp=True)

# Keep the weights from the last training step as well, but in a separate directory:
# saving them to model_save_path would overwrite the best dev checkpoint that fit() wrote there.
final_model_save_path = model_save_path + '-final'
model.save(final_model_save_path)

2022-06-09 00:15:51 - Warmup-steps: 8175




Epoch:   0%|          | 0/6 [00:00<?, ?it/s]

Iteration:   0%|          | 0/13624 [00:00<?, ?it/s]

  torch.nn.utils.clip_grad_norm_(loss_model.parameters(), max_grad_norm)


2022-06-09 01:23:52 - TripletEvaluator: Evaluating the model on dev dataset after epoch 0:


In [None]:
# Load the model stored under model_save_path; this replaces the in-memory `model`
# that was just trained.
model = SentenceTransformer(model_save_path)

In [None]:
# Build a triplet evaluator from the held-out test_samples and run it on `model`.
# output_path points the evaluator at model_save_path
# (presumably where it writes its results file - confirm).
test_evaluator = TripletEvaluator.from_input_examples(test_samples, name='test')
test_evaluator(model, output_path=model_save_path)

In [None]:
# WARNING: runs the operating system's `shutdown` command, so "Run All" ends by asking the
# host to shut down.
# NOTE(review): presumably meant to power off a rented training machine once the run has
# finished - confirm it is intended before executing this cell.
os.system("shutdown")