In [1]:
# Run identifier: names the output directory and the Kaggle dataset created at the end.
# dataset_create_new() rejects names containing "_", so keep this hyphen-separated.
NOTEBOOK_NAME = "ex1-finetuning-mpnet-mnrloss-ep10-fold"

In [2]:
import os
# Every artifact this notebook writes (fold mapping, fine-tuned model) goes under OUTPUT_DIR.
OUTPUT_DIR = f"/notebooks/kaggle_lecr/output/{NOTEBOOK_NAME}"
# exist_ok=True keeps the cell re-runnable when the directory is already there.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [3]:
# Show the GPU(s) visible to this session before training starts.
!nvidia-smi

In [4]:
class CFG:
    """Run configuration shared by the cells of this notebook."""

    # --- run switches ---
    debug = False       # True: train on 1000 pairs for a single epoch
    upload_data = True  # True: publish OUTPUT_DIR as a Kaggle dataset at the end

    # --- reproducibility / cross-validation ---
    seed = 42
    n_folds = 5

    # --- training ---
    num_epochs = 10
    batch_size = 384

    # --- inputs ---
    uns_model = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
    data_url = "/notebooks/kaggle_lecr/data/learning-equality-curriculum-recommendations/"

In [5]:
!pip -qqq install sentence-transformers
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from sentence_transformers import SentenceTransformer, models, InputExample, losses
from datasets import Dataset
from torch.utils.data import DataLoader
from sklearn.model_selection import StratifiedGroupKFold, KFold, StratifiedKFold, GroupKFold


In [6]:
DATA_PATH = "/kaggle/input/learning-equality-curriculum-recommendations/"
# Read the three competition tables from the configured data directory.
topics, content, correlations = (
    pd.read_csv(CFG.data_url + csv_name)
    for csv_name in ("topics.csv", "content.csv", "correlations.csv")
)
# Row/column count of the topic -> content ground truth.
correlations.shape

(61517, 2)

In [7]:
# Replace missing values with '' so that root topics carry parent == '' —
# the sentinel add_parent_text() checks to stop climbing the topic tree.
topics = topics.fillna('')

In [8]:
def add_parent_text(topics: pd.DataFrame) -> pd.DataFrame:
    """Replace each topic title with its breadcrumb up to the root topic.

    The new title has the form ``"<title> > <parent title> > ... > <root title>"``;
    a topic without a parent keeps its title unchanged.

    Args:
        topics: Frame with ``id``, ``title`` and ``parent`` columns, where root
            topics have ``parent == ''``.

    Returns:
        The same ``topics`` object; its ``title`` column is overwritten in place.

    Notes:
        * Ancestors are resolved with an explicit stack instead of recursion, so
          deep hierarchies cannot hit Python's recursion limit.
        * A parent id that does not occur in ``topics`` (including a missing/NaN
          parent) is treated as "no parent" instead of raising ``KeyError``; a
          parent cycle is cut at the point where it would close.
    """
    # id -> (title, parent). zip over the columns avoids the per-row Series
    # construction of iterrows(); for duplicate ids the last row wins.
    id_to_node = dict(zip(topics["id"], zip(topics["title"], topics["parent"])))
    full_text_by_id = {}  # memo: id -> finished breadcrumb

    def get_full_text(topic_id):
        pending = []       # (id, title) of nodes still waiting for their ancestor text
        visiting = set()   # ids on the current upward walk, used to detect cycles
        suffix = None      # breadcrumb of the nearest already-resolved ancestor
        has_suffix = False
        node_id = topic_id
        # Climb until an already-resolved ancestor or a node without a usable parent.
        while True:
            if node_id in full_text_by_id:
                suffix = full_text_by_id[node_id]
                has_suffix = True
                break
            title, parent = id_to_node[node_id]
            pending.append((node_id, title))
            visiting.add(node_id)
            if parent == '' or parent not in id_to_node or parent in visiting:
                break
            node_id = parent
        # Unwind from the top-most unresolved node back down to topic_id.
        for pending_id, title in reversed(pending):
            text = f'{title} > {suffix}' if has_suffix else title
            full_text_by_id[pending_id] = text
            suffix = text
            has_suffix = True
        return suffix

    # Memoised lookups make this a single linear pass, so no progress bar is needed.
    topics['title'] = topics["id"].map(get_full_text)
    return topics

# Overwrite topics['title'] with the "title > parent > ... > root" breadcrumb.
topics = add_parent_text(topics)

  0%|          | 0/76972 [00:00<?, ?it/s]

In [9]:
# Attach each topic's channel and category to its correlation row
# (left join: every correlation row is kept).
correlations_with_channel_category = correlations.merge(
    topics[["id", "channel", "category"]],
    how="left",
    left_on="topic_id",
    right_on="id",
)

In [10]:
# Preview the joined frame (topic_id, content_ids, id, channel, category).
correlations_with_channel_category.head()

Unnamed: 0,topic_id,content_ids,id,channel,category
0,t_00004da3a1b2,c_1108dd0c7a5d c_376c5a8eb028 c_5bc0e1e2cba0 c...,t_00004da3a1b2,000cf7,source
1,t_00068291e9a4,c_639ea2ef9c95 c_89ce9367be10 c_ac1672cdcd2c c...,t_00068291e9a4,8e286a,source
2,t_00069b63a70a,c_11a1dc0bfb99,t_00069b63a70a,6e3ba4,source
3,t_0006d41a73a8,c_0c6473c3480d c_1c57a1316568 c_5e375cf14c47 c...,t_0006d41a73a8,000cf7,source
4,t_0008768bdee6,c_34e1424229b4 c_7d1a964d66d5 c_aab93ee667f4,t_0008768bdee6,5223e0,supplemental


In [11]:
def cv_split(data, cfg):
    """Assign every row of ``data`` to a cross-validation fold.

    Folds come from ``StratifiedGroupKFold``, stratified on ``data['category']``
    and grouped by ``data['channel']``.

    Args:
        data: Frame with ``category`` and ``channel`` columns.
        cfg: Object exposing ``n_folds`` and ``seed``.

    Returns:
        The same ``data`` object with an integer ``fold`` column added in place.
    """
    kfold = StratifiedGroupKFold(n_splits=cfg.n_folds, shuffle=True, random_state=cfg.seed)
    # split() yields *positional* row indices. Writing them through label-based
    # .loc is only correct for a default RangeIndex, so collect the fold ids in a
    # positional array and attach it as a column afterwards.
    fold_of_row = np.full(len(data), -1, dtype=int)
    for num, (_, val_index) in enumerate(kfold.split(data, data['category'], data['channel'])):
        fold_of_row[val_index] = num
    data['fold'] = fold_of_row
    return data

# Add the 'fold' column (CFG.n_folds folds, seeded with CFG.seed).
correlations_with_channel_category = cv_split(correlations_with_channel_category, cfg=CFG)

In [12]:
import pickle

# topic_id -> fold lookup, written to OUTPUT_DIR so the split can be reused later.
topic_id_fold = {
    topic_id: fold
    for topic_id, fold in zip(
        correlations_with_channel_category["topic_id"],
        correlations_with_channel_category["fold"],
    )
}

with open(f"{OUTPUT_DIR}/fine_tunining_topic_id_fold.pkl", "wb") as fold_file:
    pickle.dump(topic_id_fold, fold_file)

In [13]:
# Keep only the columns needed to build training pairs; `correlations` is rebound here.
correlations = correlations_with_channel_category[["topic_id", "content_ids", "fold"]]

In [14]:
# Hold out fold 0: only rows from the remaining folds are used for fine-tuning.
correlations = correlations[correlations["fold"] != 0]

### Note: Only 4 of the 5 folds are used for fine-tuning; fold 0 is held out for validation in the next step. The fold assignment therefore has to be saved to a file for the next steps (this notebook pickles the topic_id → fold mapping rather than writing a '.csv').

In [15]:
# Prefix every column with its table name so the columns stay distinguishable
# once both tables are merged into the pair frame. Names that already carry the
# prefix are left alone, so re-running this cell cannot produce
# "topic_topic_id" / "content_content_id" and break the merges on
# topic_id / content_id.
topics = topics.rename(
    columns=lambda name: name if name.startswith("topic_") else "topic_" + name
)
content = content.rename(
    columns=lambda name: name if name.startswith("content_") else "content_" + name
)

In [16]:
# One row per (topic, content) pair: split the space-separated id list and
# explode it. .assign() builds a new frame, so the fold-filtered `correlations`
# slice is never written to (no SettingWithCopyWarning) and the cell can be
# re-run safely.
corr = (
    correlations
    .assign(content_id=correlations["content_ids"].str.split(" "))
    .explode("content_id")
    .drop(columns=["content_ids"])
)
corr

Unnamed: 0,topic_id,fold,content_id
0,t_00004da3a1b2,1,c_1108dd0c7a5d
0,t_00004da3a1b2,1,c_376c5a8eb028
0,t_00004da3a1b2,1,c_5bc0e1e2cba0
0,t_00004da3a1b2,1,c_76231f9d0b5e
1,t_00068291e9a4,4,c_639ea2ef9c95
...,...,...,...
61513,t_fff9e5407d13,1,c_9c20cf705bc1
61513,t_fff9e5407d13,1,c_d1635b5d7097
61513,t_fff9e5407d13,1,c_d64037a72376
61515,t_fffe14f1be1e,3,c_cece166bad6a


In [17]:
# Pull in the full topic row and content row for every pair (left joins keep all pairs).
corr = (
    corr
    .merge(topics, on="topic_id", how="left")
    .merge(content, on="content_id", how="left")
)
corr.head()

Unnamed: 0,topic_id,fold,content_id,topic_title,topic_description,topic_channel,topic_category,topic_level,topic_language,topic_parent,topic_has_content,content_title,content_description,content_kind,content_text,content_language,content_copyright_holder,content_license
0,t_00004da3a1b2,1,c_1108dd0c7a5d,Откриването на резисторите > Открития и проект...,"Изследване на материали, които предизвикват на...",000cf7,source,4,bg,t_16e29365b50d,True,Молив като резистор,"Моливът причинява промяна в отклонението, подо...",video,,bg,,
1,t_00004da3a1b2,1,c_376c5a8eb028,Откриването на резисторите > Открития и проект...,"Изследване на материали, които предизвикват на...",000cf7,source,4,bg,t_16e29365b50d,True,Да чуем променливото съпротивление,Тук чертаем линия на лист хартия и я използвам...,video,,bg,,
2,t_00004da3a1b2,1,c_5bc0e1e2cba0,Откриването на резисторите > Открития и проект...,"Изследване на материали, които предизвикват на...",000cf7,source,4,bg,t_16e29365b50d,True,Променлив резистор (реостат) с графит от молив,Използваме сърцевината на молива (неговия граф...,video,,bg,,
3,t_00004da3a1b2,1,c_76231f9d0b5e,Откриването на резисторите > Открития и проект...,"Изследване на материали, които предизвикват на...",000cf7,source,4,bg,t_16e29365b50d,True,Последователно свързване на галваничен елемент...,"Защо отклонението се променя, когато се свърже...",video,,bg,,
4,t_00068291e9a4,4,c_639ea2ef9c95,Entradas e saídas de uma função > Álgebra: fun...,Entenda um pouco mais sobre funções.,8e286a,source,4,pt,t_d14b6c2a2b70,True,Dados e resultados de funções: gráficos,Encontre todas as entradas que correspondem a ...,exercise,,pt,,


In [18]:
# Each training pair is stored as a two-element list [topic_title, content_title].
corr["set"] = corr[["topic_title", "content_title"]].to_numpy().tolist()
train_df = corr["set"].to_frame()

In [19]:
# Display the pair frame (single 'set' column).
train_df

Unnamed: 0,set
0,[Откриването на резисторите > Открития и проек...
1,[Откриването на резисторите > Открития и проек...
2,[Откриването на резисторите > Открития и проек...
3,[Откриването на резисторите > Открития и проек...
4,[Entradas e saídas de uma função > Álgebra: fu...
...,...
210984,[NA_U06 - El periódico > Lengua española > PF ...
210985,[NA_U06 - El periódico > Lengua española > PF ...
210986,[NA_U06 - El periódico > Lengua española > PF ...
210987,[Lección 7 > Unidad 4 > Español Actividades 1r...


In [20]:
# Debug runs keep only the first 1000 pairs.
if CFG.debug:
    train_df = train_df.iloc[:1000]

In [21]:
# Convert the pair frame into a `datasets.Dataset`.
dataset = Dataset.from_pandas(train_df)

In [22]:
# Show the dataset's features and row count.
dataset

Dataset({
    features: ['set', '__index_level_0__'],
    num_rows: 210989
})

In [23]:
train_examples = []
train_data = dataset["set"]
n_examples = dataset.num_rows

for i in range(n_examples):
    topic_text, content_text = train_data[i]
    # Skip pairs with a missing side: str() would otherwise turn None/NaN into
    # the literal text "None"/"nan" and train on it as if it were a real title.
    if pd.isna(topic_text) or pd.isna(content_text):
        print(train_data[i])
        continue
    train_examples.append(InputExample(texts=[str(topic_text), str(content_text)]))

In [24]:
# Load the pretrained sentence encoder named in CFG.uns_model; it is fine-tuned below.
model = SentenceTransformer(CFG.uns_model)

In [25]:
# NOTE(review): the loader batch size is hard-coded to 64; CFG.batch_size (384) is not used here.
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=64)
# The loss is built around the same model instance that is trained below.
train_loss = losses.MultipleNegativesRankingLoss(model=model)
# Debug runs are shortened to a single epoch (this mutates the shared CFG class).
if CFG.debug:
    CFG.num_epochs = 1
warmup_steps = int(len(train_dataloader) * CFG.num_epochs * 0.1) # warm up over the first 10% of all training steps

In [26]:
# Fine-tune the encoder on the (topic, content) pairs and write the result to OUTPUT_DIR.
# NOTE(review): no evaluator is passed, so save_best_model presumably has nothing
# to compare against — confirm with the sentence-transformers fit() documentation.
model.fit(train_objectives=[(train_dataloader, train_loss)],
          epochs=CFG.num_epochs,
          save_best_model = True,
          output_path=OUTPUT_DIR,
          warmup_steps=warmup_steps)

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3297 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3297 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3297 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3297 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3297 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3297 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3297 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3297 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3297 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3297 [00:00<?, ?it/s]

# Upload

In [27]:
import os
import shutil
import subprocess
import sys

# Install the Kaggle client into the interpreter that runs this kernel.
# check=True raises on failure instead of silently returning a non-zero code.
subprocess.run([sys.executable, "-m", "pip", "install", "kaggle"], check=True)

# Put the API credentials where the Kaggle client looks for them; a missing
# source file now raises here rather than surfacing later as an auth error.
kaggle_dir = os.path.expanduser("~/.kaggle")
os.makedirs(kaggle_dir, exist_ok=True)
kaggle_json = shutil.copy("/notebooks/kaggle_lecr/kaggle.json", kaggle_dir)
# Owner-only permissions on the credentials file (same as `chmod 600`).
os.chmod(kaggle_json, 0o600)

0

In [28]:
from kaggle.api.kaggle_api_extended import KaggleApi
import json

def dataset_create_new(dataset_name: str, upload_dir: str, owner: str = "sinchir0"):
    """Publish ``upload_dir`` as a new Kaggle dataset.

    Writes ``dataset-metadata.json`` into ``upload_dir`` and then creates the
    dataset ``<owner>/<dataset_name>`` through the Kaggle API.

    Args:
        dataset_name: Used for both the dataset id and its title; must not
            contain "_".
        upload_dir: Directory that is uploaded (the metadata file is written
            into it first).
        owner: Kaggle account that will own the dataset. The default is the
            account name that was previously hard-coded.

    Raises:
        ValueError: If ``dataset_name`` contains an underscore.
    """
    if "_" in dataset_name:
        # Message (Japanese): "underscores are not allowed in dataset names".
        raise ValueError("datasetの名称に_の使用は禁止です")
    dataset_metadata = {
        'id': f'{owner}/{dataset_name}',
        'licenses': [{'name': 'CC0-1.0'}],
        'title': dataset_name,
    }
    with open(os.path.join(upload_dir, 'dataset-metadata.json'), 'w') as f:
        json.dump(dataset_metadata, f, indent=4)
    api = KaggleApi()
    api.authenticate()
    api.dataset_create_new(folder=upload_dir, convert_to_csv=False, dir_mode='tar')

# Publish everything in OUTPUT_DIR as a Kaggle dataset named after this notebook.
if CFG.upload_data:
    print(f"Create Dataset name:{NOTEBOOK_NAME}, output_dir:{OUTPUT_DIR}")
    dataset_create_new(dataset_name=NOTEBOOK_NAME, upload_dir=OUTPUT_DIR)