In [1]:
from sentence_transformers import SentenceTransformer, InputExample, models, losses
from torch.utils.data import DataLoader
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

# Define Input and Output Paths
training_data_path = 'input_data/SemEval_train.txt'
testing_data_path = 'input_data/SemEval_test.txt'
validation_data_path = 'input_data/SemEval_test.txt'

trained_model_path = 'output_model/SemEval_STSModel_BinaryEncoder'

# Use DistillBert Encoder
word_embedding_model = models.Transformer('distilbert-base-uncased')
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Read Input Data
training_data = []
with open(training_data_path, encoding='utf-8') as in_file_train:
    line_list = in_file_train.read().splitlines()
    for line in line_list:
        data_line_train = line.split('\t')
        training_data.append(InputExample(texts=[data_line_train[1], data_line_train[2]], label=float(data_line_train[0])/5.0))

testing_data = []
with open(testing_data_path, encoding='utf-8') as in_file_test:
    line_list = in_file_test.read().splitlines()
    for line in line_list:
        data_line_test = line.split('\t')
        testing_data.append(InputExample(texts=[data_line_test[1], data_line_test[2]], label=float(data_line_test[0])/5.0))

validation_data = []
with open(validation_data_path, encoding='utf-8') as in_file_valid:
    line_list = in_file_valid.read().splitlines()
    for line in line_list:    
        data_line_valid = line.split('\t')
        validation_data.append(InputExample(texts=[data_line_valid[1], data_line_valid[2]], label=float(data_line_valid[0])/5.0))

# Pytorch Dataloader
training_batch_size = 16
print(training_data[0].texts, training_data[0].label)
training_dataloader = DataLoader(training_data, shuffle=True, batch_size=training_batch_size)

# Training Parameters
num_epochs = 4
training_loss = losses.CosineSimilarityLoss(model)
warmup_steps = round(len(training_dataloader) * num_epochs * 0.1) 
print('warmup steps: ', warmup_steps)
# Define Performance Evaluator
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(validation_data, name='valid')

# Train the model
model.fit(train_objectives=[(training_dataloader, training_loss)],
          epochs=5,
          warmup_steps=100,
          evaluator=evaluator,
          evaluation_steps=500,
          output_path=trained_model_path)

# Evaluate Performance Metric
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(testing_data, name='test')
print ("Model Evaluation Metric : ", evaluator(model))

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


['Drug lord captured by marines in Mexico', 'Suspected drug lord known as ‘El Taliban’ held in Mexico'] 0.52
warmup steps:  132


Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/330 [00:00<?, ?it/s]

Iteration:   0%|          | 0/330 [00:00<?, ?it/s]

Iteration:   0%|          | 0/330 [00:00<?, ?it/s]

Iteration:   0%|          | 0/330 [00:00<?, ?it/s]

Iteration:   0%|          | 0/330 [00:00<?, ?it/s]

0.8634703763924765