In [12]:
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, LoggingHandler, losses, models, util, InputExample
from sentence_transformers.evaluation import TripletEvaluator
import pandas as pd
from datetime import datetime
import logging
import math
import os
import gzip
import csv

#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

#Check if dataset exsist. If not, download and extract  it
model_name = './output/hfl_chinese-roberta-wwm-ext-2022-05-12'
train_file = './train/train.json'

# Read the dataset
train_batch_size = 8
num_epochs = 1
model_save_path = 'output/bi-encoder-chinese-roberta-wwm-ext'+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

In [6]:
word_embedding_model = models.Transformer(model_name)

# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

Some weights of the model checkpoint at ./output/hfl_chinese-roberta-wwm-ext-2022-05-12 were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at ./output/hfl_chinese-roberta-wwm-ext-2022-05-12 and are newly initialized: ['

2022-05-12 17:07:49 - Use pytorch device: cuda


In [7]:
train_generator = pd.read_json(train_file, lines=True, chunksize=1000, encoding='utf-8')

# Convert the dataset to a DataLoader ready for training
logging.info("Read training dataset")

train_samples = []
dev_samples = []
test_samples = []

for train_data in train_generator:
    for _index, row in train_data.iterrows():
        for paragraph in row.answer_paragraphs:
            inp_example = InputExample(texts=[row['question'], paragraph['paragraph_text']])

        if _index < 5000:
            dev_samples.append(inp_example)
        elif _index < 10000:
            test_samples.append(inp_example)
        else:
            train_samples.append(inp_example)

train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)
train_loss = losses.MultipleNegativesRankingLoss(model=model)

2022-05-12 17:07:49 - Read training dataset


In [8]:
# Development set: Measure correlation between cosine score and gold labels
evaluator = TripletEvaluator.from_input_examples(dev_samples, name='dev')

# Configure the training. We skip evaluation in this example
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) #10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

# Train the model
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          output_path=model_save_path)

2022-05-12 17:08:00 - Warmup-steps: 955




Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/9550 [00:00<?, ?it/s]

2022-05-12 17:12:20 - EmbeddingSimilarityEvaluator: Evaluating the model on dev dataset in epoch 0 after 1000 steps:




2022-05-12 17:12:56 - Cosine-Similarity :	Pearson: nan	Spearman: nan
2022-05-12 17:12:56 - Manhattan-Distance:	Pearson: nan	Spearman: nan
2022-05-12 17:12:56 - Euclidean-Distance:	Pearson: nan	Spearman: nan
2022-05-12 17:12:56 - Dot-Product-Similarity:	Pearson: nan	Spearman: nan
2022-05-12 17:17:18 - EmbeddingSimilarityEvaluator: Evaluating the model on dev dataset in epoch 0 after 2000 steps:




2022-05-12 17:17:53 - Cosine-Similarity :	Pearson: nan	Spearman: nan
2022-05-12 17:17:53 - Manhattan-Distance:	Pearson: nan	Spearman: nan
2022-05-12 17:17:53 - Euclidean-Distance:	Pearson: nan	Spearman: nan
2022-05-12 17:17:53 - Dot-Product-Similarity:	Pearson: nan	Spearman: nan
2022-05-12 17:22:11 - EmbeddingSimilarityEvaluator: Evaluating the model on dev dataset in epoch 0 after 3000 steps:




2022-05-12 17:22:46 - Cosine-Similarity :	Pearson: nan	Spearman: nan
2022-05-12 17:22:46 - Manhattan-Distance:	Pearson: nan	Spearman: nan
2022-05-12 17:22:46 - Euclidean-Distance:	Pearson: nan	Spearman: nan
2022-05-12 17:22:46 - Dot-Product-Similarity:	Pearson: nan	Spearman: nan
2022-05-12 17:27:09 - EmbeddingSimilarityEvaluator: Evaluating the model on dev dataset in epoch 0 after 4000 steps:




2022-05-12 17:27:45 - Cosine-Similarity :	Pearson: nan	Spearman: nan
2022-05-12 17:27:45 - Manhattan-Distance:	Pearson: nan	Spearman: nan
2022-05-12 17:27:45 - Euclidean-Distance:	Pearson: nan	Spearman: nan
2022-05-12 17:27:45 - Dot-Product-Similarity:	Pearson: nan	Spearman: nan
2022-05-12 17:32:04 - EmbeddingSimilarityEvaluator: Evaluating the model on dev dataset in epoch 0 after 5000 steps:




2022-05-12 17:32:39 - Cosine-Similarity :	Pearson: nan	Spearman: nan
2022-05-12 17:32:39 - Manhattan-Distance:	Pearson: nan	Spearman: nan
2022-05-12 17:32:39 - Euclidean-Distance:	Pearson: nan	Spearman: nan
2022-05-12 17:32:39 - Dot-Product-Similarity:	Pearson: nan	Spearman: nan
2022-05-12 17:37:19 - EmbeddingSimilarityEvaluator: Evaluating the model on dev dataset in epoch 0 after 6000 steps:




2022-05-12 17:38:27 - Cosine-Similarity :	Pearson: nan	Spearman: nan
2022-05-12 17:38:27 - Manhattan-Distance:	Pearson: nan	Spearman: nan
2022-05-12 17:38:27 - Euclidean-Distance:	Pearson: nan	Spearman: nan
2022-05-12 17:38:27 - Dot-Product-Similarity:	Pearson: nan	Spearman: nan
2022-05-12 17:43:15 - EmbeddingSimilarityEvaluator: Evaluating the model on dev dataset in epoch 0 after 7000 steps:




2022-05-12 17:43:50 - Cosine-Similarity :	Pearson: nan	Spearman: nan
2022-05-12 17:43:50 - Manhattan-Distance:	Pearson: nan	Spearman: nan
2022-05-12 17:43:50 - Euclidean-Distance:	Pearson: nan	Spearman: nan
2022-05-12 17:43:50 - Dot-Product-Similarity:	Pearson: nan	Spearman: nan
2022-05-12 17:48:10 - EmbeddingSimilarityEvaluator: Evaluating the model on dev dataset in epoch 0 after 8000 steps:




2022-05-12 17:48:45 - Cosine-Similarity :	Pearson: nan	Spearman: nan
2022-05-12 17:48:45 - Manhattan-Distance:	Pearson: nan	Spearman: nan
2022-05-12 17:48:45 - Euclidean-Distance:	Pearson: nan	Spearman: nan
2022-05-12 17:48:45 - Dot-Product-Similarity:	Pearson: nan	Spearman: nan
2022-05-12 17:53:04 - EmbeddingSimilarityEvaluator: Evaluating the model on dev dataset in epoch 0 after 9000 steps:




2022-05-12 17:53:40 - Cosine-Similarity :	Pearson: nan	Spearman: nan
2022-05-12 17:53:40 - Manhattan-Distance:	Pearson: nan	Spearman: nan
2022-05-12 17:53:40 - Euclidean-Distance:	Pearson: nan	Spearman: nan
2022-05-12 17:53:40 - Dot-Product-Similarity:	Pearson: nan	Spearman: nan
2022-05-12 17:56:02 - EmbeddingSimilarityEvaluator: Evaluating the model on dev dataset after epoch 0:
2022-05-12 17:56:37 - Cosine-Similarity :	Pearson: nan	Spearman: nan
2022-05-12 17:56:37 - Manhattan-Distance:	Pearson: nan	Spearman: nan
2022-05-12 17:56:37 - Euclidean-Distance:	Pearson: nan	Spearman: nan
2022-05-12 17:56:37 - Dot-Product-Similarity:	Pearson: nan	Spearman: nan




In [13]:
model.save(model_save_path)

2022-05-12 18:01:24 - Save model to output/bi-encoder-chinese-roberta-wwm-ext-2022-05-12_18-01-19


In [16]:
model = SentenceTransformer(model_save_path)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name='test')
test_evaluator(model, output_path=model_save_path)

2022-05-12 18:06:03 - Load pretrained SentenceTransformer: output/bi-encoder-chinese-roberta-wwm-ext-2022-05-12_18-01-19
2022-05-12 18:06:04 - Use pytorch device: cuda
2022-05-12 18:06:04 - EmbeddingSimilarityEvaluator: Evaluating the model on test dataset:
2022-05-12 18:07:11 - Cosine-Similarity :	Pearson: nan	Spearman: nan
2022-05-12 18:07:11 - Manhattan-Distance:	Pearson: nan	Spearman: nan
2022-05-12 18:07:11 - Euclidean-Distance:	Pearson: nan	Spearman: nan
2022-05-12 18:07:11 - Dot-Product-Similarity:	Pearson: nan	Spearman: nan




nan