In [None]:
# Mount Google Drive inside the Colab VM so the project folder is reachable.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
cd /content/drive/MyDrive/AIBootCamp/NLP/

/content/drive/MyDrive/AIBootCamp/NLP


In [None]:
! pip install sentence_transformers datasets

# Setup

In [None]:
import math
import logging
from datetime import datetime

import torch
from torch.utils.data import DataLoader
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, models, LoggingHandler, losses, util
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import InputExample

In [None]:
# Logger setup.
# logging.basicConfig() does nothing when the root logger already has handlers
# (for example when this cell has been run before, or the hosting environment
# installed its own handler), so the format below would silently not apply.
# Remove any existing root handlers first to make this cell idempotent.
_root_logger = logging.getLogger()
for _existing_handler in list(_root_logger.handlers):
    _root_logger.removeHandler(_existing_handler)
    _existing_handler.close()

logging.basicConfig(format="%(asctime)s - %(message)s",
                    datefmt="%Y/%m/%d %H:%M",
                    level=logging.INFO,
                    handlers=[LoggingHandler()]
                    )

In [None]:
# Experiment configuration
pretrained_model = 'klue/roberta-base'  # model id of the base encoder
sts_num_epochs = 3                      # passes over the STS training examples
train_batch_size = 32

# Output directory for this training run.
# The timestamp must not contain '/', ':' or spaces: a '/' is interpreted as a
# path separator (splitting the run into nested directories) and ':' is not a
# valid file-name character on every filesystem.
sts_model_save_path = (
    'output/train_sts_'
    + pretrained_model.replace('/', '_')
    + '_'
    + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
)

# Load Data

In [None]:
# Load the KLUE-STS dataset.
#   train : first 90% of the official train split
#   valid : last 10% of the official train split (used as the validation set)
#   test  : the official validation split
klue_sts_train, klue_sts_valid, klue_sts_test = (
    load_dataset("klue", "sts", split=split_spec)
    for split_spec in ('train[:90%]', 'train[-10%:]', 'validation')
)

# Report the size of each split
print('Length of Train : ', len(klue_sts_train))
print('Length of Valid : ', len(klue_sts_valid))
print('Length of Test : ', len(klue_sts_test))

Downloading builder script:   0%|          | 0.00/5.21k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.93k [00:00<?, ?B/s]

Downloading and preparing dataset klue/sts (download: 1.29 MiB, generated: 2.82 MiB, post-processed: Unknown size, total: 4.11 MiB) to /root/.cache/huggingface/datasets/klue/sts/1.0.0/e0fc3bc3de3eb03be2c92d72fd04a60ecc71903f821619cb28ca0e1e29e4233e...


Downloading data:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/11668 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/519 [00:00<?, ? examples/s]

Dataset klue downloaded and prepared to /root/.cache/huggingface/datasets/klue/sts/1.0.0/e0fc3bc3de3eb03be2c92d72fd04a60ecc71903f821619cb28ca0e1e29e4233e. Subsequent calls will reuse this data.
2022-05-25 05:47:36,898 - Reusing dataset klue (/root/.cache/huggingface/datasets/klue/sts/1.0.0/e0fc3bc3de3eb03be2c92d72fd04a60ecc71903f821619cb28ca0e1e29e4233e)
2022-05-25 05:47:37,351 - Reusing dataset klue (/root/.cache/huggingface/datasets/klue/sts/1.0.0/e0fc3bc3de3eb03be2c92d72fd04a60ecc71903f821619cb28ca0e1e29e4233e)
Length of Train :  10501
Length of Valid :  1167
Length of Test :  519


label >= 3 : 1 <br>
label < 3 : 0

In [None]:
def make_input_ex_sts(dataset):
    """Convert STS records into a list of InputExample objects.

    Args:
        dataset: Iterable of records. Each record is read via
            record['sentence1'], record['sentence2'] and
            record['labels']['label'].

    Returns:
        A list with one InputExample per record, in input order. Each example
        holds texts=[sentence1, sentence2] and label=raw label / 5.0
        (this maps to the 0..1 range assuming raw labels lie in 0..5).
    """
    return [
        InputExample(
            texts=[record['sentence1'], record['sentence2']],
            label=record['labels']['label'] / 5.0,  # rescale the raw score
        )
        for record in dataset
    ]

In [None]:
# Convert each split (train, validation, test - in that order) into InputExample lists
sts_train_examples, sts_valid_examples, sts_test_examples = (
    make_input_ex_sts(split)
    for split in (klue_sts_train, klue_sts_valid, klue_sts_test)
)

In [None]:
# Shuffled mini-batches of the training examples
train_dataloader = DataLoader(
    dataset=sts_train_examples,
    batch_size=train_batch_size,
    shuffle=True,
)

# Evaluator built from the validation examples; used during training
dev_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    examples=sts_valid_examples, name='sts-dev'
)

# Load Embedding Model

In [None]:
# Baseline encoder: the pretrained transformer that produces token embeddings
base_model = models.Transformer(
    model_name_or_path=pretrained_model,
    do_lower_case=True,
    max_seq_length=256,
)

# Sentence embedding = mean over all token embeddings of the sentence
# (CLS-token and max pooling are switched off).
pooling_model = models.Pooling(
    word_embedding_dimension=base_model.get_word_embedding_dimension(),
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False,
    pooling_mode_mean_tokens=True,
)

# Chain encoder -> pooling into one sentence-embedding model
model = SentenceTransformer(modules=[base_model, pooling_model])

Some weights of the model checkpoint at klue/roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for

2022-05-25 06:45:30,191 - Use pytorch device: cpu


In [None]:
# Use CosineSimilarityLoss: it regresses the cosine similarity of the two sentence
# embeddings onto the float label of each InputExample (here: the normalised STS score).
# MultipleNegativesRankingLoss is not suitable for this data: it ignores the labels
# and treats every (sentence1, sentence2) pair as a positive pair, so pairs with a
# low similarity score would be pulled together as well.
train_loss = losses.CosineSimilarityLoss(model=model)

# Warm-up steps: about 10% of the total number of training steps
# (examples * epochs / batch size), rounded up.
warmup_steps = math.ceil(len(sts_train_examples) * sts_num_epochs / train_batch_size * 0.1)
logging.info("Warmup-steps: {}".format(warmup_steps))

# Training
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=dev_evaluator,                              # scored on the validation examples
    epochs=sts_num_epochs,
    evaluation_steps=int(len(train_dataloader) * 0.1),    # evaluate roughly every 10% of an epoch
    warmup_steps=warmup_steps,
    output_path=sts_model_save_path,                      # directory the model is saved to
    use_amp=False       # Set to True if your GPU supports FP16 operations
)

2022-05-25 06:45:40,239 - Warmup-steps: 99




Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Iteration:   0%|          | 0/329 [00:00<?, ?it/s]

2022-05-25 07:01:09,768 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 0 after 32 steps:
2022-05-25 07:04:11,537 - Cosine-Similarity :	Pearson: 0.8996	Spearman: 0.8997
2022-05-25 07:04:11,543 - Manhattan-Distance:	Pearson: 0.8938	Spearman: 0.8929
2022-05-25 07:04:11,545 - Euclidean-Distance:	Pearson: 0.8919	Spearman: 0.8917
2022-05-25 07:04:11,546 - Dot-Product-Similarity:	Pearson: 0.8357	Spearman: 0.8323
2022-05-25 07:04:11,577 - Save model to output/train_sts_klue_roberta-base05/25 06:41:29
2022-05-25 07:19:17,268 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 0 after 64 steps:
2022-05-25 07:22:13,050 - Cosine-Similarity :	Pearson: 0.8806	Spearman: 0.8830
2022-05-25 07:22:13,052 - Manhattan-Distance:	Pearson: 0.8606	Spearman: 0.8705
2022-05-25 07:22:13,052 - Euclidean-Distance:	Pearson: 0.8599	Spearman: 0.8700
2022-05-25 07:22:13,053 - Dot-Product-Similarity:	Pearson: 0.7605	Spearman: 0.7673
2022-05-25 07:37:00,243 -

Iteration:   0%|          | 0/329 [00:00<?, ?it/s]

2022-05-25 10:02:08,601 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 1 after 32 steps:
2022-05-25 10:05:06,743 - Cosine-Similarity :	Pearson: 0.7545	Spearman: 0.7505
2022-05-25 10:05:06,745 - Manhattan-Distance:	Pearson: 0.8449	Spearman: 0.8455
2022-05-25 10:05:06,745 - Euclidean-Distance:	Pearson: 0.8452	Spearman: 0.8459
2022-05-25 10:05:06,746 - Dot-Product-Similarity:	Pearson: 0.3480	Spearman: 0.3356
2022-05-25 10:20:06,746 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 1 after 64 steps:
2022-05-25 10:23:06,130 - Cosine-Similarity :	Pearson: 0.7787	Spearman: 0.7691
2022-05-25 10:23:06,133 - Manhattan-Distance:	Pearson: 0.8434	Spearman: 0.8431
2022-05-25 10:23:06,134 - Euclidean-Distance:	Pearson: 0.8435	Spearman: 0.8431
2022-05-25 10:23:06,134 - Dot-Product-Similarity:	Pearson: 0.3903	Spearman: 0.3614
2022-05-25 10:37:26,429 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 1 after 9

Iteration:   0%|          | 0/329 [00:00<?, ?it/s]

2022-05-25 13:04:46,812 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 2 after 32 steps:
2022-05-25 13:07:46,952 - Cosine-Similarity :	Pearson: 0.7633	Spearman: 0.7536
2022-05-25 13:07:46,958 - Manhattan-Distance:	Pearson: 0.8483	Spearman: 0.8492
2022-05-25 13:07:46,959 - Euclidean-Distance:	Pearson: 0.8481	Spearman: 0.8486
2022-05-25 13:07:46,960 - Dot-Product-Similarity:	Pearson: 0.3551	Spearman: 0.3337
2022-05-25 13:22:18,711 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 2 after 64 steps:
2022-05-25 13:25:36,223 - Cosine-Similarity :	Pearson: 0.7640	Spearman: 0.7512
2022-05-25 13:25:36,227 - Manhattan-Distance:	Pearson: 0.8524	Spearman: 0.8496
2022-05-25 13:25:36,228 - Euclidean-Distance:	Pearson: 0.8521	Spearman: 0.8490
2022-05-25 13:25:36,228 - Dot-Product-Similarity:	Pearson: 0.3614	Spearman: 0.3372
2022-05-25 13:41:01,009 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 2 after 9

In [None]:
# Evaluator built from the held-out sts-test examples
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    examples=sts_test_examples, name="sts-test"
)

In [None]:
# Score the model on sts-test; the returned score is displayed as the cell output.
# NOTE(review): this evaluates the in-memory `model` as left by fit(). If the
# checkpoint saved under sts_model_save_path is the one that should be reported,
# reload it before evaluating - confirm which is intended.
test_evaluator(model=model, output_path=sts_model_save_path)

2022-05-25 16:03:00,175 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-test dataset:
2022-05-25 16:04:23,081 - Cosine-Similarity :	Pearson: 0.3397	Spearman: 0.3409
2022-05-25 16:04:23,083 - Manhattan-Distance:	Pearson: 0.4193	Spearman: 0.3919
2022-05-25 16:04:23,084 - Euclidean-Distance:	Pearson: 0.4179	Spearman: 0.3896
2022-05-25 16:04:23,084 - Dot-Product-Similarity:	Pearson: 0.1782	Spearman: 0.1695


0.39185234414549436