In [None]:
# Colab only: mount Google Drive into the runtime's filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Change this to your own working directory.
%cd /content/drive/MyDrive/NLP_project/NLP_project/data

/content/drive/MyDrive/NLP_project/NLP_project/data


In [1]:
! pip install sentence_transformers datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence_transformers
  Downloading sentence-transformers-2.2.1.tar.gz (84 kB)
[K     |████████████████████████████████| 84 kB 2.2 MB/s 
[?25hCollecting datasets
  Downloading datasets-2.3.2-py3-none-any.whl (362 kB)
[K     |████████████████████████████████| 362 kB 11.3 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 44.6 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 36.7 MB/s 
[?25hCollecting huggingface-hub>=0.8.1
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 13.0 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86

In [2]:
import logging
import math
import os
import re
from datetime import datetime

import torch
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, models, LoggingHandler, losses, util
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import InputExample
from torch.utils.data import DataLoader

In [3]:
# Reset GPU cache: run Python garbage collection first, then ask PyTorch to
# release its cached CUDA memory.
import gc
gc.collect()
torch.cuda.empty_cache()

In [4]:
# Reproducibility: fix the PyTorch RNG seed on the CPU and on every CUDA device.
seed = 42
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# Compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    # Record the visible hardware in the cell output.
    print(f"# available GPUs : {torch.cuda.device_count()}")
    print(f"GPU name : {torch.cuda.get_device_name()}")
print(device)


cpu


In [5]:
# Logger: configure root logging at INFO level with "timestamp - message" lines,
# emitted through the LoggingHandler imported from sentence_transformers.
logging.basicConfig(
    format="%(asctime)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO,
    handlers=[LoggingHandler()],
)

In [6]:
# Hugging Face checkpoint used as the sentence-encoder backbone.
pretrained_model_name = 'klue/roberta-base'
# Fine-tuning schedule for the STS objective.
sts_num_epochs = 4
train_batch_size = 32

# Output directory: one folder per run, named after the checkpoint and the
# start time. "/" in the checkpoint name becomes "-" so it stays one path component.
model_tag = pretrained_model_name.replace("/", "-")
run_stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
sts_model_save_path = 'output/training_sts-' + '-'.join((model_tag, run_stamp))

# 1. Load Dataset & Preprocessing

## 1.1. KLUE-STS

In [7]:
# Load KLUE-STS. The first 90% of the train split is used for training and the
# last 10% of train is held out as the validation set; the dataset's own
# "validation" split is used as the test set.
klue_sts_train = load_dataset("klue", "sts", split='train[:90%]')
klue_sts_valid = load_dataset("klue", "sts", split='train[-10%:]')
klue_sts_test = load_dataset("klue", "sts", split='validation')

# Report the size of each split.
for split_label, split_data in (
    ('Train', klue_sts_train),
    ('Valid', klue_sts_valid),
    ('Test', klue_sts_test),
):
    print(f'Length of {split_label} : ', len(split_data))

Downloading builder script:   0%|          | 0.00/5.21k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.93k [00:00<?, ?B/s]

Downloading and preparing dataset klue/sts (download: 1.29 MiB, generated: 2.82 MiB, post-processed: Unknown size, total: 4.11 MiB) to /root/.cache/huggingface/datasets/klue/sts/1.0.0/e0fc3bc3de3eb03be2c92d72fd04a60ecc71903f821619cb28ca0e1e29e4233e...


Downloading data:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/11668 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/519 [00:00<?, ? examples/s]

Dataset klue downloaded and prepared to /root/.cache/huggingface/datasets/klue/sts/1.0.0/e0fc3bc3de3eb03be2c92d72fd04a60ecc71903f821619cb28ca0e1e29e4233e. Subsequent calls will reuse this data.
2022-06-24 06:24:00 - Reusing dataset klue (/root/.cache/huggingface/datasets/klue/sts/1.0.0/e0fc3bc3de3eb03be2c92d72fd04a60ecc71903f821619cb28ca0e1e29e4233e)
2022-06-24 06:24:00 - Reusing dataset klue (/root/.cache/huggingface/datasets/klue/sts/1.0.0/e0fc3bc3de3eb03be2c92d72fd04a60ecc71903f821619cb28ca0e1e29e4233e)
Length of Train :  10501
Length of Valid :  1167
Length of Test :  519


In [8]:
# Display the first training record to show which fields a record carries.
klue_sts_train[0]

{'guid': 'klue-sts-v1_train_00000',
 'labels': {'binary-label': 1, 'label': 3.7, 'real-label': 3.714285714285714},
 'sentence1': '숙소 위치는 찾기 쉽고 일반적인 한국의 반지하 숙소입니다.',
 'sentence2': '숙박시설의 위치는 쉽게 찾을 수 있고 한국의 대표적인 반지하 숙박시설입니다.',
 'source': 'airbnb-rtt'}

In [9]:
import re

In [10]:
def check_st(text):
    """Delete every character that is not in ``ㄱ-힣``, an ASCII digit, or whitespace.

    Args:
        text: String to clean.

    Returns:
        ``text`` with all disallowed characters removed. Nothing is inserted in
        their place, so a symbol sitting between two spaces leaves both spaces.

    Note:
        ``ㄱ-힣`` is a code-point range (U+3131..U+D7A3). It keeps Hangul jamo
        and syllables, plus anything else that falls inside that range. Latin
        letters and punctuation lie outside it and are dropped.
    """
    disallowed = re.compile(r"[^ㄱ-힣0-9\s]")
    return disallowed.sub("", text)

In [11]:
def make_sts_input_example(dataset):
    """Convert STS records into a list of ``InputExample`` sentence pairs.

    Args:
        dataset: Iterable of records exposing ``sentence1``, ``sentence2`` and
            ``labels['label']``.

    Returns:
        A list with one ``InputExample`` per record, in input order. Both
        sentences are cleaned with ``check_st`` and the label is divided by 5.0.
    """
    return [
        InputExample(
            texts=[check_st(record['sentence1']), check_st(record['sentence2'])],
            # Rescale the gold score by 1/5 (the original comment gives the
            # raw scale as 0 to 5, which would make this 0 to 1).
            label=record['labels']['label'] / 5.0,
        )
        for record in dataset
    ]

In [12]:
# Convert each split into a list of InputExample objects.
sts_train_examples = make_sts_input_example(klue_sts_train)
sts_valid_examples = make_sts_input_example(klue_sts_valid)
sts_test_examples = make_sts_input_example(klue_sts_test)

In [13]:
# Train Dataloader
train_dataloader = DataLoader(
    sts_train_examples,
    shuffle=True,
    batch_size=train_batch_size,
)


# Evaluator by sts-validation
dev_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    sts_valid_examples,
    name="sts-dev",
)

# Evaluator by sts-test
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    sts_test_examples,
    name="sts-test",
)

# print(f"Train dataloader # steps: {len(train_dataloader)}")
# print(f"Valid dataloader # steps: {len(dev_evaluator)}")
# print(f"Test dataloader # steps: {len(test_dataloader)}")


# 2. Load Embedding Model

In [14]:
# Token-level encoder: the pretrained transformer, with inputs capped at 256 tokens.
embedding_model = models.Transformer(
    model_name_or_path=pretrained_model_name,
    max_seq_length=256,
    do_lower_case=True,
)

# Sentence vector = mean over the token embeddings; CLS and max pooling are off.
token_dim = embedding_model.get_word_embedding_dimension()
pooling_model = models.Pooling(
    word_embedding_dimension=token_dim,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False,
    pooling_mode_mean_tokens=True,
)

# Chain encoder and pooling into a single SentenceTransformer pipeline.
model = SentenceTransformer(modules=[embedding_model, pooling_model])

Downloading:   0%|          | 0.00/546 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/422M [00:00<?, ?B/s]

Some weights of the model checkpoint at klue/roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for

Downloading:   0%|          | 0.00/375 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/243k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/734k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/173 [00:00<?, ?B/s]

2022-06-24 06:24:25 - Use pytorch device: cpu


# 3. STS training

In [15]:
# Training objective: CosineSimilarityLoss applied to the model built above.
train_loss = losses.CosineSimilarityLoss(model=model)

# Warm-up length: 10% of (examples * epochs / batch size), rounded up.
total_train_steps = len(sts_train_examples) * sts_num_epochs / train_batch_size
warmup_steps = math.ceil(total_train_steps * 0.1)
logging.info("Warmup-steps: {}".format(warmup_steps))

# Run the dev evaluator every 10% of an epoch, truncated to whole steps.
steps_between_evals = int(len(train_dataloader) * 0.1)

# Fine-tune; output_path is where fit() is told to write its results.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=dev_evaluator,
    epochs=sts_num_epochs,
    evaluation_steps=steps_between_evals,
    warmup_steps=warmup_steps,
    output_path=sts_model_save_path,
)

2022-06-24 06:24:25 - Warmup-steps: 132


Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Iteration:   0%|          | 0/329 [00:00<?, ?it/s]

2022-06-24 06:38:06 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 0 after 32 steps:
2022-06-24 06:40:48 - Cosine-Similarity :	Pearson: 0.9058	Spearman: 0.8987
2022-06-24 06:40:48 - Manhattan-Distance:	Pearson: 0.8920	Spearman: 0.8903
2022-06-24 06:40:48 - Euclidean-Distance:	Pearson: 0.8908	Spearman: 0.8891
2022-06-24 06:40:48 - Dot-Product-Similarity:	Pearson: 0.8595	Spearman: 0.8552
2022-06-24 06:40:48 - Save model to output/training_sts-klue-roberta-base-2022-06-24_06-23-54
2022-06-24 06:53:56 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 0 after 64 steps:
2022-06-24 06:56:37 - Cosine-Similarity :	Pearson: 0.9170	Spearman: 0.8875
2022-06-24 06:56:37 - Manhattan-Distance:	Pearson: 0.9091	Spearman: 0.8873
2022-06-24 06:56:37 - Euclidean-Distance:	Pearson: 0.9090	Spearman: 0.8874
2022-06-24 06:56:37 - Dot-Product-Similarity:	Pearson: 0.9016	Spearman: 0.8699
2022-06-24 07:09:54 - EmbeddingSimilarityEvaluator: Evaluati

KeyboardInterrupt: ignored

# 4. Evaluation

In [18]:
# Score the in-memory model with the test evaluator.
# The evaluator is handed sts_model_save_path as its output location, so create
# that directory first: it may not exist yet if training was stopped before
# anything was written there.
os.makedirs(sts_model_save_path, exist_ok=True)
test_evaluator(model, output_path=sts_model_save_path)

2022-06-24 07:35:14 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-test dataset:
2022-06-24 07:36:30 - Cosine-Similarity :	Pearson: 0.8163	Spearman: 0.8108
2022-06-24 07:36:30 - Manhattan-Distance:	Pearson: 0.8190	Spearman: 0.8122
2022-06-24 07:36:30 - Euclidean-Distance:	Pearson: 0.8194	Spearman: 0.8133
2022-06-24 07:36:30 - Dot-Product-Similarity:	Pearson: 0.7933	Spearman: 0.7807


0.8132812971089792

In [17]:
# Re-wrap the existing embedding_model and pooling_model objects in a new
# SentenceTransformer. The same module objects are reused, so no weights are
# loaded here, neither from the pretrained checkpoint nor from sts_model_save_path.
model = SentenceTransformer(modules=[embedding_model, pooling_model])

2022-06-24 07:35:11 - Use pytorch device: cpu


In [16]:
# Read two sentences from standard input. Each one is cleaned with check_st and
# encoded immediately after it is read.
emb1 = model.encode(check_st(input()))
emb2 = model.encode(check_st(input()))

# Cosine similarity of the two embeddings. Below 0.6 is reported as '다름'
# ("different"); otherwise as '비슷하거나 같음' ("similar or the same").
cos_sim = util.cos_sim(emb1, emb2)
if cos_sim < 0.6:
    verdict = '다름'
else:
    verdict = '비슷하거나 같음'
print("Cosine-Similarity:", float(cos_sim), "=>", verdict)


숙소 위치는 찾기 쉽고 일반적인 한국의 반지하 숙소입니다.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

숙박시설의 위치는 쉽게 찾을 수 있고 한국의 대표적인 반지하 숙박시설입니다.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine-Similarity: 0.8871272802352905 => 비슷하거나 같음
