In [1]:
# Identifier for this run: used to build OUTPUT_DIR and passed as the Kaggle
# dataset name on upload (dataset_create_new rejects names containing "_").
NOTEBOOK_NAME = "finetuning-mpnet-tripletloss-10man"

In [2]:
!nvidia-smi

Sat Feb 11 01:09:42 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.73.05    Driver Version: 510.73.05    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100 80G...  Off  | 00000000:00:05.0 Off |                    0 |
| N/A   61C    P0    51W / 300W |      0MiB / 81920MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Fine-tuning Sentence Transformer

https://huggingface.co/blog/how-to-train-sentence-transformers

In [3]:
class CFG:
    """Notebook-wide settings read by the cells below."""

    # When True, later cells stop after ~100 items and train for a single epoch.
    debug: bool = False
    # When True, the saved model directory is uploaded as a Kaggle dataset.
    upload_data: bool = True
    # Triplet building stops once at least this many examples were collected.
    sample_num: int = 400000
    # DataLoader batch size (the original inline note mentions 64 as an alternative).
    batch_size: int = 384

In [4]:
from datasets import load_dataset

In [5]:
!pip -qqq install sentence-transformers
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from sentence_transformers import SentenceTransformer, models, InputExample, losses
from datasets import Dataset
from torch.utils.data import DataLoader

In [6]:
# Directory holding the competition CSV files (trailing slash included so file
# names can simply be appended).
DATA_PATH = "/notebooks/kaggle_lecr/data/learning-equality-curriculum-recommendations" + "/"

# Read the three tables in one pass; the order of the file names matches the
# order of the target variables.
topics, content, correlations = (
    pd.read_csv(DATA_PATH + csv_name)
    for csv_name in ("topics.csv", "content.csv", "correlations.csv")
)

In [7]:
# Replace missing titles in both tables with the placeholder "No Title".
for frame in (topics, content):
    frame["title"] = frame["title"].fillna("No Title")

In [8]:
# Peek at the raw ground-truth table; content_ids is still a whitespace-separated
# string here (it is parsed into sets in a later cell).
correlations.head()

Unnamed: 0,topic_id,content_ids
0,t_00004da3a1b2,c_1108dd0c7a5d c_376c5a8eb028 c_5bc0e1e2cba0 c...
1,t_00068291e9a4,c_639ea2ef9c95 c_89ce9367be10 c_ac1672cdcd2c c...
2,t_00069b63a70a,c_11a1dc0bfb99
3,t_0006d41a73a8,c_0c6473c3480d c_1c57a1316568 c_5e375cf14c47 c...
4,t_0008768bdee6,c_34e1424229b4 c_7d1a964d66d5 c_aab93ee667f4


In [9]:
# Candidate (topic, content) pairs; the next cell groups them by "topics_ids".
# NOTE(review): the file name suggests top-50 retrieved candidates per topic — confirm.
train = pd.read_csv("/notebooks/kaggle_lecr/data/uns_data/train_lecr_uns_top_n_50.csv")

In [10]:
# Collapse the candidate rows into one set of content ids per topic id,
# ordered by topic id.
train_topic_ids_content_ids = train.groupby("topics_ids")["content_ids"].apply(set).sort_index()

In [11]:
# Parse each topic's whitespace-separated content ids into a set.  The isinstance
# guard keeps this cell re-runnable: after the first run the column already holds
# sets, and calling .split() on a set would raise AttributeError.
correlations["content_ids"] = correlations["content_ids"].apply(
    lambda x: x if isinstance(x, set) else set(x.split())
)
# Ground-truth sets indexed by topic id, sorted by that id.
corr_topic_ids_content_ids = correlations.set_index("topic_id")
corr_topic_ids_content_ids = corr_topic_ids_content_ids.sort_index()

In [12]:
# Sanity check: one set of candidate content ids per topic id.
train_topic_ids_content_ids.head()

topics_ids
t_00004da3a1b2    {c_cbbf192e3fb1, c_9b02d916cbf1, c_72abfa1c21a...
t_00068291e9a4    {c_cebc5d74ee8c, c_88f804268f39, c_ed8f0a74752...
t_00069b63a70a    {c_e38971955cb3, c_e9b71cbb320b, c_27c76064bae...
t_0006d41a73a8    {c_f49b9d887c06, c_79ccad92ad9e, c_4d35bf46f02...
t_0008768bdee6    {c_a6b536e1dce3, c_a6d421c4b978, c_aa59a5da51f...
Name: content_ids, dtype: object

In [13]:
# Sanity check: ground-truth content-id sets indexed by topic id.
corr_topic_ids_content_ids.head()

Unnamed: 0_level_0,content_ids
topic_id,Unnamed: 1_level_1
t_00004da3a1b2,"{c_5bc0e1e2cba0, c_1108dd0c7a5d, c_376c5a8eb02..."
t_00068291e9a4,"{c_89ce9367be10, c_ac1672cdcd2c, c_ebb7fdf10a7..."
t_00069b63a70a,{c_11a1dc0bfb99}
t_0006d41a73a8,"{c_b972646631cb, c_d7a0d7eaf799, c_0c6473c3480..."
t_0008768bdee6,"{c_34e1424229b4, c_7d1a964d66d5, c_aab93ee667f4}"


In [14]:
# For every topic collect:
#   pos - the ground-truth content ids from the correlations table
#   neg - candidate content ids that are NOT ground truth
output_id = {}
corr_content_ids = corr_topic_ids_content_ids["content_ids"]
for topic_id, train_content_id in tqdm(
    train_topic_ids_content_ids.items(),
    total=len(train_topic_ids_content_ids)
):
    # Look the positives up by topic id instead of relying on both tables
    # having identical, identically ordered indexes: a positional zip would
    # silently pair a topic with another topic's labels if they ever differ.
    # A topic missing from the correlations table raises KeyError here.
    pos = corr_content_ids[topic_id]
    neg = train_content_id - pos
    output_id[topic_id] = {"pos": pos, "neg": neg}

  0%|          | 0/61517 [00:00<?, ?it/s]

In [15]:
def get_output_list(output_id: dict, topics: pd.DataFrame, content: pd.DataFrame, cfg) -> list:
    """Turn per-topic positive/negative content ids into title-based records.

    Args:
        output_id: Maps topic id -> {"pos": iterable of content ids,
            "neg": iterable of content ids}.
        topics: Table with "id" and "title" columns for topics.
        content: Table with "id" and "title" columns for content items.
        cfg: Settings object; only ``cfg.debug`` is read. When it is truthy,
            the result is cut off after the first 100 topics.

    Returns:
        A list with one entry per topic. Each entry is a single-element list
        ``[{"query": topic_title, "pos": [titles], "neg": [titles]}]``.

    Raises:
        KeyError: If a topic id or content id has no entry in the title tables.
    """
    topics_id_title = dict(zip(topics["id"], topics["title"]))
    content_id_title = dict(zip(content["id"], content["title"]))

    output_list = []
    for topic_id, content_ids in tqdm(output_id.items()):
        record = {
            "query": topics_id_title[topic_id],
            "pos": [content_id_title[content_id] for content_id in content_ids["pos"]],
            "neg": [content_id_title[content_id] for content_id in content_ids["neg"]],
        }
        # Each record is wrapped in its own list, i.e. one row with one cell.
        output_list.append([record])

        # Honor the cfg argument rather than the notebook-global CFG, so the
        # function behaves according to what the caller passed in.
        if cfg.debug and len(output_list) == 100:
            break
    return output_list

# Build the title-level records for every topic, passing the notebook settings
# object CFG as cfg.
output_list = get_output_list(output_id, topics, content, cfg=CFG)

  0%|          | 0/61517 [00:00<?, ?it/s]

In [16]:
# One row per topic; the single "set" column holds the record dict
# ({"query", "pos", "neg"}) unwrapped from its one-element list.
df_dataset = pd.DataFrame({"set": [row[0] for row in output_list]})

In [17]:
import itertools
from sentence_transformers import InputExample

train_examples = []
train_data = df_dataset['set']
n_examples = df_dataset.shape[0]

cnt = 0
for example in tqdm(train_data, total=n_examples):
    query = example['query']
    # Pair every positive title with at most the first 10 negatives of the topic.
    negatives = example['neg'][:10]
    triplets = [
        InputExample(texts=[query, pos, neg])
        for pos, neg in itertools.product(example['pos'], negatives)
    ]
    train_examples.extend(triplets)
    cnt += len(triplets)

    # The cap is checked once per topic, so the final count may overshoot it.
    if CFG.debug and cnt >= 100:
        break
    elif cnt >= CFG.sample_num:
        # Positive samples alone amount to roughly 60k, so the cap is meant
        # to exceed that number.
        print(f"Data Num: {cnt}")
        break

  0%|          | 0/61517 [00:00<?, ?it/s]

Data Num: 400028


In [18]:
from torch.utils.data import DataLoader

# Shuffled batches of CFG.batch_size triplet examples for training.
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=CFG.batch_size)

In [19]:
from sentence_transformers import SentenceTransformer

# Start from the pretrained multilingual MPNet sentence-embedding checkpoint
# identified by model_id; this is the model that gets fine-tuned below.
model_id = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
model = SentenceTransformer(model_id)

In [20]:
from sentence_transformers import losses

# Triplet objective bound to the model; every training example built above
# carries three texts in the order [query, positive, negative].
train_loss = losses.TripletLoss(model=model)

In [21]:
# Full runs train for 10 epochs; debug runs use a single epoch.
num_epochs = 1 if CFG.debug else 10
# Warm-up length: 10% of all training steps (batches per epoch x epochs).
warmup_steps = int(len(train_dataloader) * num_epochs * 0.1)

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    warmup_steps=warmup_steps,
    epochs=num_epochs,
)

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1042 [00:00<?, ?it/s]

KeyboardInterrupt: 

# Save

In [27]:
# Directory the fine-tuned model is saved to and later uploaded from.
OUTPUT_DIR = f"/notebooks/kaggle_lecr/output/{NOTEBOOK_NAME}/"
# Bare expression: echoes the resolved path as the cell output.
OUTPUT_DIR

'/notebooks/kaggle_lecr/output/finetuning-mpnet-tripletloss-10man/'

In [28]:
# Write the model in its current state to OUTPUT_DIR.
model.save(OUTPUT_DIR)

# Upload

In [24]:
import os
import shutil
import subprocess
import sys

# Install the Kaggle client into the interpreter that runs this kernel.
# check=True raises CalledProcessError on failure; the previous os.system calls
# discarded every exit status, so a failed step went unnoticed.
subprocess.run([sys.executable, "-m", "pip", "install", "kaggle"], check=True)

# Copy the API token into ~/.kaggle/ and restrict it to owner read/write
# (mode 600).  Each call raises OSError on failure instead of failing silently.
kaggle_dir = os.path.expanduser("~/.kaggle")
os.makedirs(kaggle_dir, exist_ok=True)
kaggle_token = shutil.copy("/notebooks/kaggle_lecr/kaggle.json", kaggle_dir)
os.chmod(kaggle_token, 0o600)





0

In [25]:
from kaggle.api.kaggle_api_extended import KaggleApi
import json

def dataset_create_new(dataset_name: str, upload_dir: str, owner: str = "sinchir0"):
    """Write Kaggle dataset metadata into ``upload_dir`` and create the dataset.

    Args:
        dataset_name: Used both as the dataset slug (``<owner>/<dataset_name>``)
            and as its title. Must not contain an underscore.
        upload_dir: Existing folder whose contents are uploaded; a
            ``dataset-metadata.json`` file is (over)written inside it.
        owner: Kaggle account the dataset is created under. Defaults to the
            account that was previously hard-coded, so existing calls behave
            the same.

    Raises:
        ValueError: If ``dataset_name`` contains "_".
    """
    if "_" in dataset_name:
        # Message reads: "using _ in the dataset name is prohibited".
        raise ValueError("datasetの名称に_の使用は禁止です")

    dataset_metadata = {
        'id': f'{owner}/{dataset_name}',
        'licenses': [{'name': 'CC0-1.0'}],
        'title': dataset_name,
    }
    with open(os.path.join(upload_dir, 'dataset-metadata.json'), 'w') as f:
        json.dump(dataset_metadata, f, indent=4)

    api = KaggleApi()
    api.authenticate()
    # dir_mode='tar' matches the recorded upload log, where the 1_Pooling
    # sub-directory appears as 1_Pooling.tar.
    api.dataset_create_new(folder=upload_dir, convert_to_csv=False, dir_mode='tar')

# Upload only when enabled in CFG: NOTEBOOK_NAME doubles as the dataset name
# and OUTPUT_DIR is the folder whose contents get uploaded.
if CFG.upload_data:
    print(f"Create Dataset name:{NOTEBOOK_NAME}, output_dir:{OUTPUT_DIR}")
    dataset_create_new(dataset_name=NOTEBOOK_NAME, upload_dir=OUTPUT_DIR)

Create Dataset name:finetuning-mpnet-tripletloss-10man, output_dir:kaggle_lecr/output/finetuning-mpnet-tripletloss-10man/
Starting upload for file pytorch_model.bin


100%|██████████| 1.04G/1.04G [00:21<00:00, 51.1MB/s]


Upload successful: pytorch_model.bin (1GB)
Starting upload for file sentencepiece.bpe.model


100%|██████████| 4.83M/4.83M [00:00<00:00, 7.95MB/s]


Upload successful: sentencepiece.bpe.model (5MB)
Starting upload for file README.md


100%|██████████| 3.71k/3.71k [00:00<00:00, 6.44kB/s]


Upload successful: README.md (4KB)
Starting upload for file tokenizer.json


100%|██████████| 16.3M/16.3M [00:00<00:00, 22.6MB/s]


Upload successful: tokenizer.json (16MB)
Starting upload for file 1_Pooling.tar


100%|██████████| 10.0k/10.0k [00:00<00:00, 17.4kB/s]


Upload successful: 1_Pooling.tar (10KB)
Starting upload for file config.json


100%|██████████| 821/821 [00:00<00:00, 1.61kB/s]


Upload successful: config.json (821B)
Starting upload for file modules.json


100%|██████████| 229/229 [00:00<00:00, 413B/s]


Upload successful: modules.json (229B)
Starting upload for file tokenizer_config.json


100%|██████████| 536/536 [00:00<00:00, 972B/s]


Upload successful: tokenizer_config.json (536B)
Starting upload for file config_sentence_transformers.json


100%|██████████| 122/122 [00:00<00:00, 188B/s]


Upload successful: config_sentence_transformers.json (122B)
Starting upload for file sentence_bert_config.json


100%|██████████| 53.0/53.0 [00:00<00:00, 97.8B/s]


Upload successful: sentence_bert_config.json (53B)
Starting upload for file special_tokens_map.json


100%|██████████| 280/280 [00:00<00:00, 577B/s]


Upload successful: special_tokens_map.json (280B)
