In [62]:
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Fri Feb 10 14:06:35 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.73.05    Driver Version: 510.73.05    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Quadro P5000        Off  | 00000000:00:05.0 Off |                  Off |
| 28%   40C    P8     7W / 180W |  10313MiB / 16384MiB |      0%      Default |
|                               |            

# Fine-tuning Sentence Transformer

Reference: [Train and Fine-Tune Sentence Transformers Models](https://huggingface.co/blog/how-to-train-sentence-transformers) (Hugging Face blog)

In [69]:
class CFG:
    """Notebook-wide run switches, read directly as class attributes."""

    # Smoke-test mode: the data-building loops stop early and training runs a single epoch.
    debug: bool = True
    # When true, the final cell publishes the saved model directory as a Kaggle dataset.
    upload_data: bool = True

In [41]:
from datasets import load_dataset

In [42]:
!pip -qqq install sentence-transformers
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from sentence_transformers import SentenceTransformer, models, InputExample, losses
from datasets import Dataset
from torch.utils.data import DataLoader

In [43]:
# Folder holding the competition CSVs; the trailing slash lets file names be appended directly.
DATA_PATH = "kaggle_lecr/data/learning-equality-curriculum-recommendations" + "/"

# Read the three competition tables (topics, content, and the topic_id -> content_ids
# correlations table) from DATA_PATH.
topics, content, correlations = (
    pd.read_csv(f"{DATA_PATH}{csv_name}")
    for csv_name in ("topics.csv", "content.csv", "correlations.csv")
)

In [44]:
# Replace missing titles with a placeholder in both tables so that later title lookups
# never return NaN.
for frame in (topics, content):
    frame["title"] = frame["title"].fillna("No Title")

In [45]:
correlations.head()

Unnamed: 0,topic_id,content_ids
0,t_00004da3a1b2,c_1108dd0c7a5d c_376c5a8eb028 c_5bc0e1e2cba0 c...
1,t_00068291e9a4,c_639ea2ef9c95 c_89ce9367be10 c_ac1672cdcd2c c...
2,t_00069b63a70a,c_11a1dc0bfb99
3,t_0006d41a73a8,c_0c6473c3480d c_1c57a1316568 c_5e375cf14c47 c...
4,t_0008768bdee6,c_34e1424229b4 c_7d1a964d66d5 c_aab93ee667f4


In [46]:
train = pd.read_csv("kaggle_lecr/data/uns_data/train_lecr_uns_top_n_50.csv")

In [47]:
# Per topic, gather the content ids listed for it in `train` into a set, ordered by topic id.
candidate_ids_by_topic = train.groupby("topics_ids")["content_ids"]
train_topic_ids_content_ids = candidate_ids_by_topic.apply(set).sort_index()

In [48]:
def _as_content_id_set(value):
    """Return the content ids held in `value` as a set.

    Accepts either the raw space-separated string read from the CSV or a collection
    that has already been converted, so running this cell a second time does not fail
    with "'set' object has no attribute 'split'".
    """
    if isinstance(value, str):
        return set(value.split())
    return set(value)


# The column is still converted in place (as before), but the conversion is now idempotent.
correlations["content_ids"] = correlations["content_ids"].apply(_as_content_id_set)

# Index by topic id and sort so rows can be looked up / compared by topic.
corr_topic_ids_content_ids = correlations.set_index("topic_id").sort_index()

In [49]:
train_topic_ids_content_ids.head()

topics_ids
t_00004da3a1b2    {c_0b4a3ea959ba, c_48c2aa56909a, c_a6029b48a44...
t_00068291e9a4    {c_b228ae247379, c_88f804268f39, c_035baf9425e...
t_00069b63a70a    {c_c25696bc9352, c_f99c320132eb, c_362366142e3...
t_0006d41a73a8    {c_0263cb12601d, c_e2d33adb1c78, c_9cb25047417...
t_0008768bdee6    {c_a3c74443d8d3, c_91c6c2b32396, c_bf882e1890d...
Name: content_ids, dtype: object

In [50]:
corr_topic_ids_content_ids.head()

Unnamed: 0_level_0,content_ids
topic_id,Unnamed: 1_level_1
t_00004da3a1b2,"{c_1108dd0c7a5d, c_376c5a8eb028, c_5bc0e1e2cba..."
t_00068291e9a4,"{c_ebb7fdf10a7e, c_89ce9367be10, c_ac1672cdcd2..."
t_00069b63a70a,{c_11a1dc0bfb99}
t_0006d41a73a8,"{c_5e375cf14c47, c_b972646631cb, c_d7a0d7eaf79..."
t_0008768bdee6,"{c_7d1a964d66d5, c_aab93ee667f4, c_34e1424229b4}"


In [51]:
# Ground-truth content ids keyed by topic id. Looking them up by id (instead of zipping the
# two sorted tables row by row) keeps labels correct even when `train` and `correlations`
# do not contain exactly the same topics; a positional zip would silently pair a topic with
# another topic's ground truth, or stop early at the shorter table.
corr_content_ids_by_topic = corr_topic_ids_content_ids["content_ids"].to_dict()

output_id = {}
for topic_id, train_content_id in tqdm(
    train_topic_ids_content_ids.items(),
    total=len(train_topic_ids_content_ids),
):
    # A topic with no correlations row has no known positives: every candidate is a negative.
    pos = corr_content_ids_by_topic.get(topic_id, set())
    # Negatives are the candidates from `train` that are not ground-truth matches.
    neg = train_content_id - pos
    output_id[topic_id] = {"pos": pos, "neg": neg}

  0%|          | 0/61517 [00:00<?, ?it/s]

In [52]:
# Build the topic id -> title lookup once instead of scanning `topics` with a boolean mask on
# every iteration. `drop_duplicates` keeps the first row per id, which is the row the previous
# `.values[0]` lookup returned.
topic_title_by_id = topics.drop_duplicates("id").set_index("id")["title"].to_dict()

# Loop-invariant column handles for the content title selection below.
content_id_column = content["id"]
content_title_column = content["title"]

output_list = []
for topic_id, labelled_ids in tqdm(output_id.items()):
    # Raises KeyError if the topic id is absent from `topics`.
    topic_title = topic_title_by_id[topic_id]
    pos_content_ids = labelled_ids["pos"]
    neg_content_ids = labelled_ids["neg"]

    # Mask-based selection returns titles in `content` row order and drops ids that do not
    # appear in `content`.
    pos_content_titles = content_title_column.loc[content_id_column.isin(list(pos_content_ids))].tolist()
    neg_content_titles = content_title_column.loc[content_id_column.isin(list(neg_content_ids))].tolist()

    # Each record is wrapped in a one-element list so the DataFrame built from `output_list`
    # gets a single column holding the whole dict.
    output_list.append([{"query": topic_title, "pos": pos_content_titles, "neg": neg_content_titles}])

    # Debug runs only need the first 100 topics.
    if CFG.debug and len(output_list) == 100:
        break

  0%|          | 0/61517 [00:00<?, ?it/s]

In [53]:
df_dataset = pd.DataFrame(output_list, columns=["set"])

In [54]:
import itertools
from sentence_transformers import InputExample

train_data = df_dataset['set']
n_examples = df_dataset.shape[0]

# Stop collecting once this many triplets exist: 100 in debug mode, otherwise 100,000
# (positive samples alone come to roughly 60,000, so the cap is chosen to exceed that).
triplet_limit = 100 if CFG.debug else 100_000

train_examples = []
cnt = 0
for example in tqdm(train_data, total=n_examples):
    query = example['query']
    negatives = example['neg'][:10]  # use at most the first 10 negative titles per topic
    # One (query, positive, negative) triplet for every positive/negative title pair.
    for pos in example['pos']:
        for neg in negatives:
            train_examples.append(InputExample(texts=[query, pos, neg]))
            cnt += 1
    # The cap is checked once per topic, so the final count may overshoot it.
    if cnt >= triplet_limit:
        break

  0%|          | 0/100 [00:00<?, ?it/s]

In [55]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=64)

In [56]:
from sentence_transformers import SentenceTransformer

model_id = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
model = SentenceTransformer(model_id)

Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.77k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/723 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/402 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [57]:
from sentence_transformers import losses

train_loss = losses.TripletLoss(model=model)

In [58]:
# Debug runs train for a single epoch; full runs train for 10.
num_epochs = 1 if CFG.debug else 10

# Warm up over 10% of the total number of training steps (batches per epoch x epochs).
total_steps = len(train_dataloader) * num_epochs
warmup_steps = int(total_steps * 0.1)

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=num_epochs,
    warmup_steps=warmup_steps,
)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3 [00:00<?, ?it/s]

# Save

In [106]:
import ipynb_path
import re

NOTEBOOK_NAME = ipynb_path.get().split("/")[-1].split(".")[0]
NOTEBOOK_NAME

'finetuning-mpnet-tripletloss-100000'

In [115]:
OUTPUT_DIR = f"kaggle_lecr/output/{NOTEBOOK_NAME}/"
OUTPUT_DIR

'kaggle_lecr/output/finetuning-mpnet-tripletloss-100/'

In [116]:
model.save(OUTPUT_DIR)

# Upload

In [119]:
import json
import os
import subprocess
import sys

# Install the Kaggle client BEFORE importing it: with the install placed after the import, a
# kernel without the package fails on the import and never reaches the install.
# `sys.executable -m pip` targets the interpreter this kernel is running on; `check=False`
# keeps the previous best-effort behaviour (a failed install does not raise here).
subprocess.run([sys.executable, "-m", "pip", "install", "kaggle"], check=False)

from kaggle.api.kaggle_api_extended import KaggleApi

def dataset_create_new(dataset_name: str, upload_dir: str):
    """Publish the contents of `upload_dir` as a new Kaggle dataset.

    Writes `dataset-metadata.json` into `upload_dir`, then authenticates a `KaggleApi`
    client and asks it to create the dataset from that folder.

    Args:
        dataset_name: Used both in the dataset id (`sinchir0/<dataset_name>`) and as its
            title. Must not contain an underscore.
        upload_dir: Folder with the files to upload; the metadata file is written here too.

    Raises:
        ValueError: If `dataset_name` contains "_". Raised before anything is written.
    """
    if "_" in dataset_name:
        raise ValueError("datasetの名称に_の使用は禁止です")

    metadata = {
        'id': f'sinchir0/{dataset_name}',
        'licenses': [{'name': 'CC0-1.0'}],
        'title': dataset_name,
    }
    metadata_path = os.path.join(upload_dir, 'dataset-metadata.json')
    with open(metadata_path, 'w') as fh:
        json.dump(metadata, fh, indent=4)

    client = KaggleApi()
    client.authenticate()
    # NOTE(review): with dir_mode='tar' the run log shows a sub-folder uploaded as
    # `1_Pooling.tar` - presumably sub-directories are archived; confirm against the Kaggle API docs.
    client.dataset_create_new(folder=upload_dir, convert_to_csv=False, dir_mode='tar')

# Publish the saved model folder as a Kaggle dataset named after the notebook; skipped
# entirely when CFG.upload_data is false.
# NOTE(review): NOTEBOOK_NAME and OUTPUT_DIR are assigned in separate cells, so they can
# disagree if only one of those cells was re-run after the notebook was renamed - confirm
# the printed name and directory match before relying on the upload.
if CFG.upload_data:
    print(f"Create Dataset name:{NOTEBOOK_NAME}, output_dir:{OUTPUT_DIR}")
    dataset_create_new(dataset_name=NOTEBOOK_NAME, upload_dir=OUTPUT_DIR)





Create Dataset name:finetuning-mpnet-tripletloss-10, output_dir:kaggle_lecr/output/finetuning-mpnet-tripletloss-100/
Starting upload for file pytorch_model.bin


100%|██████████| 1.04G/1.04G [00:20<00:00, 53.7MB/s]


Upload successful: pytorch_model.bin (1GB)
Starting upload for file sentencepiece.bpe.model


100%|██████████| 4.83M/4.83M [00:00<00:00, 7.01MB/s]


Upload successful: sentencepiece.bpe.model (5MB)
Starting upload for file README.md


100%|██████████| 3.70k/3.70k [00:00<00:00, 7.63kB/s]


Upload successful: README.md (4KB)
Starting upload for file tokenizer.json


100%|██████████| 16.3M/16.3M [00:00<00:00, 18.3MB/s]


Upload successful: tokenizer.json (16MB)
Starting upload for file 1_Pooling.tar


100%|██████████| 10.0k/10.0k [00:00<00:00, 21.3kB/s]


Upload successful: 1_Pooling.tar (10KB)
Starting upload for file config.json


100%|██████████| 821/821 [00:00<00:00, 1.18kB/s]


Upload successful: config.json (821B)
Starting upload for file modules.json


100%|██████████| 229/229 [00:00<00:00, 346B/s]  


Upload successful: modules.json (229B)
Starting upload for file tokenizer_config.json


100%|██████████| 536/536 [00:00<00:00, 1.17kB/s]


Upload successful: tokenizer_config.json (536B)
Starting upload for file config_sentence_transformers.json


100%|██████████| 122/122 [00:00<00:00, 249B/s]


Upload successful: config_sentence_transformers.json (122B)
Starting upload for file sentence_bert_config.json


100%|██████████| 53.0/53.0 [00:00<00:00, 103B/s]


Upload successful: sentence_bert_config.json (53B)
Starting upload for file special_tokens_map.json


100%|██████████| 280/280 [00:00<00:00, 332B/s]  


Upload successful: special_tokens_map.json (280B)
