In [1]:
NOTEBOOK_NAME = "ex7-uns-add-desc-len100"

In [2]:
import os
OUTPUT_DIR = f"/notebooks/kaggle_lecr/output/{NOTEBOOK_NAME}/"
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [3]:
os.system("pip3 install torch==1.12.0 --extra-index-url https://download.pytorch.org/whl/cu116")
os.system("pip install tokenizers==0.12.1")
os.system("pip install transformers==4.20.1")

0

In [4]:
# =========================================================================================
# Libraries
# =========================================================================================
import os
import gc
import time
import math
import random
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from torch.utils.checkpoint import checkpoint
import tokenizers
import transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_cosine_schedule_with_warmup, DataCollatorWithPadding
import cupy as cp
from cuml.metrics import pairwise_distances
from cuml.neighbors import NearestNeighbors
%env TOKENIZERS_PARALLELISM=false
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
import os
os.system('pip install python-dotenv')

from dotenv import load_dotenv
load_dotenv()

True

In [6]:
# =========================================================================================
# Configurations
# =========================================================================================
class CFG:
    """Notebook-wide settings, accessed as class attributes (e.g. ``CFG.top_n``)."""

    # --- run switches ---
    seed = 42
    debug = False        # True: only the two rarest languages are processed
    upload_data = True   # True: the output directory is uploaded as a Kaggle dataset
    wandb = True         # True: the run is tracked with Weights & Biases

    # --- paths ---
    data_url = "/notebooks/kaggle_lecr/data/learning-equality-curriculum-recommendations/"
    model = "/notebooks/kaggle_lecr/output/ex6-finetune-add-desc-text-ep9-10/"

    # --- encoder / tokenisation ---
    # The tokenizer is loaded once, when the class body is executed.
    tokenizer = AutoTokenizer.from_pretrained(model)
    # Token budget per text; reference given by the author:
    # https://huggingface.co/sentence-transformers/paraphrase-multilingual-mpnet-base-v2
    max_len = 128

    # --- embedding / retrieval ---
    batch_size = 64      # previously 32
    num_workers = 4      # DataLoader worker processes
    top_n = 10           # nearest contents kept per topic

In [7]:
# ====================================================
# wandb
# ====================================================
if CFG.wandb:
    os.system('pip install wandb')
    import wandb

    try:
        # On Kaggle the key would be fetched through kaggle_secrets.UserSecretsClient
        # (secret label "wandb_api"); on Paperspace it is read from the environment.
        secret_value_0 = os.getenv('WANDB_API_KEY')
        wandb.login(key=secret_value_0)
    except Exception:
        # Login failed: fall back to an anonymous run. Catching `Exception` instead of
        # using a bare `except` lets KeyboardInterrupt / SystemExit propagate.
        anony = "must"
        print('If you want to use your W&B account, go to Add-ons -> Secrets and provide your W&B access token. Use the Label name as wandb_api. \nGet your W&B access token from here: https://wandb.ai/authorize')
    else:
        anony = None


    def class2dict(f):
        """Return the attributes of `f` whose names do not start with '__' as a dict."""
        return dict((name, getattr(f, name)) for name in dir(f) if not name.startswith('__'))

    run = wandb.init(project='LECR',
                     entity="sinchir0",
                     name=NOTEBOOK_NAME,
                     config=class2dict(CFG),
                     group="uns",
                     job_type="train",
                     anonymous=anony)

In [8]:
import torch
# Record which GPU produced this run. Guarded because `wandb` is only imported
# when CFG.wandb is True, and because get_device_name() raises without a CUDA device.
if CFG.wandb and torch.cuda.is_available():
    wandb.log({"GPU": torch.cuda.get_device_name()})

In [9]:
# =========================================================================================
# Data Loading
# =========================================================================================
def read_data(cfg):
    """Load the competition CSVs and return cleaned, length-sorted frames.

    Args:
        cfg: object whose `data_url` attribute is the directory prefix (ending with a
            path separator) that holds topics.csv, content.csv and correlations.csv.

    Returns:
        (topics, content, correlations). In `topics` and `content` missing titles and
        descriptions are replaced by "", the rows are sorted by the character length
        of "title description", the helper/unused columns are dropped and the index
        is reset. `correlations` is returned as read.
    """
    topics = pd.read_csv(cfg.data_url + 'topics.csv')
    content = pd.read_csv(cfg.data_url + 'content.csv')
    correlations = pd.read_csv(cfg.data_url + 'correlations.csv')
    # Fill missing text by assigning the filled column back. The former
    # `df[col].fillna("", inplace=True)` mutates an intermediate Series, which is
    # deprecated and has no effect on `df` under pandas Copy-on-Write.
    for col in ('title', 'description'):
        topics[col] = topics[col].fillna("")
    for col in ('title', 'description', 'text'):
        content[col] = content[col].fillna("")
    # Sort by text length so that neighbouring rows have similar lengths
    topics['length'] = (topics['title'] + " " + topics['description']).apply(len)
    content['length'] = (content['title'] + " " + content['description']).apply(len)
    topics = topics.sort_values('length')
    content = content.sort_values('length')
    # Drop cols that are not needed downstream (including the sort helper)
    topics = topics.drop(['channel', 'category', 'level', 'has_content', 'length'], axis = 1)
    content = content.drop(['kind', 'text', 'copyright_holder', 'license', 'length'], axis = 1)
    # Reset index so labels equal positions after the sort
    topics = topics.reset_index(drop = True)
    content = content.reset_index(drop = True)
    print(' ')
    print('-' * 50)
    print(f"topics.shape: {topics.shape}")
    print(f"content.shape: {content.shape}")
    print(f"correlations.shape: {correlations.shape}")
    return topics, content, correlations

# =========================================================================================
# Prepare input, tokenize
# =========================================================================================
def prepare_input(text, cfg):
    """Tokenize one text into fixed-length `torch.long` tensors.

    Args:
        text: the string to encode.
        cfg: object providing `tokenizer` (must expose `encode_plus`) and `max_len`.

    Returns:
        The tokenizer output with every value converted to a `torch.long` tensor.
        Sequences are truncated and padded to `cfg.max_len`.
    """
    inputs = cfg.tokenizer.encode_plus(
        text,
        return_tensors = None,
        add_special_tokens = True,
        max_length = cfg.max_len,
        # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`;
        # both pad every sequence up to `max_length`.
        padding = 'max_length',
        truncation = True
    )
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype = torch.long)
    return inputs

# =========================================================================================
# Unsupervised dataset
# =========================================================================================
class uns_dataset(Dataset):
    """Map-style dataset that tokenizes the `use_text` column of `df` on access."""

    def __init__(self, df, cfg):
        # Only the raw strings are stored; tokenisation is deferred to __getitem__.
        self.texts = df['use_text'].values
        self.cfg = cfg

    def __len__(self):
        """Number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, item):
        """Return the tokenized form of the text at position `item`."""
        return prepare_input(self.texts[item], self.cfg)
    
# =========================================================================================
# Mean pooling class
# =========================================================================================
class MeanPooling(nn.Module):
    """Average the token vectors of each sequence, counting only unmasked positions."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        """Return the attention-mask-weighted mean of `last_hidden_state` over dim 1."""
        # Broadcast the mask to the shape of the hidden states
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        masked_sum = (last_hidden_state * mask).sum(1)
        # Clamp the token count so a fully masked row divides by 1e-9 instead of 0
        token_count = torch.clamp(mask.sum(1), min=1e-9)
        return masked_sum / token_count

# =========================================================================================
# Unsupervised model
# =========================================================================================
class uns_model(nn.Module):
    """Pretrained transformer loaded from `cfg.model`, followed by masked mean pooling."""

    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        self.config = AutoConfig.from_pretrained(cfg.model)
        self.model = AutoModel.from_pretrained(cfg.model, config = self.config)
        self.pool = MeanPooling()

    def feature(self, inputs):
        """Encode `inputs` and pool the last hidden state with the attention mask."""
        hidden = self.model(**inputs).last_hidden_state
        return self.pool(hidden, inputs['attention_mask'])

    def forward(self, inputs):
        """Return one pooled embedding per input sequence."""
        return self.feature(inputs)
    
# =========================================================================================
# Get embeddings
# =========================================================================================
def get_embeddings(loader, model, device):
    """Run `model` over every batch of `loader` and return the stacked outputs.

    The model is put in eval mode and gradients are disabled. Each batch is moved
    to `device` key by key; the outputs are collected on the CPU as NumPy arrays
    and concatenated along the first axis.
    """
    model.eval()
    chunks = []
    with torch.no_grad():
        for inputs in tqdm(loader):
            for key in list(inputs.keys()):
                inputs[key] = inputs[key].to(device)
            chunks.append(model(inputs).cpu().numpy())
    return np.concatenate(chunks)
    
# =========================================================================================
# Get neighbors
# =========================================================================================
def get_neighbors(topics, content, cfg):
    """Add to `topics` the ids of the contents nearest to each topic.

    Both frames are embedded from their `use_text` column with `uns_model(cfg)` on
    the module-level `device`; a cosine-distance KNN index is then fitted on the
    content embeddings and queried with the topic embeddings.

    Args:
        topics: frame with a `use_text` column; receives the new `predict_ids` column.
        content: non-empty frame with `use_text` and `id` columns.
        cfg: settings object providing `tokenizer`, `batch_size`, `num_workers`,
            `top_n` and whatever `uns_model` / `prepare_input` read.

    Returns:
        `topics` with `predict_ids`: the neighbour content ids joined by single
        spaces, in the order returned by `kneighbors`.
    """
    # Both loaders share the same settings; only the dataset differs
    loader_kwargs = dict(
        batch_size = cfg.batch_size,
        shuffle = False,
        collate_fn = DataCollatorWithPadding(tokenizer = cfg.tokenizer, padding = 'longest'),
        num_workers = cfg.num_workers,
        pin_memory = True,
        drop_last = False,
    )
    topics_loader = DataLoader(uns_dataset(topics, cfg), **loader_kwargs)
    content_loader = DataLoader(uns_dataset(content, cfg), **loader_kwargs)
    # Create unsupervised model to extract embeddings
    model = uns_model(cfg)
    model.to(device)
    topics_preds = get_embeddings(topics_loader, model, device)
    content_preds = get_embeddings(content_loader, model, device)
    # Drop the encoder before the KNN step; empty_cache() can only return memory
    # whose tensors are no longer referenced, so it has to run after the `del`.
    del model, topics_loader, content_loader
    gc.collect()
    torch.cuda.empty_cache()
    # Transfer predictions to gpu
    topics_preds_gpu = cp.array(topics_preds)
    content_preds_gpu = cp.array(content_preds)
    del topics_preds, content_preds
    gc.collect()
    # KNN model
    print(' ')
    print('Training KNN model...')
    # A language can have fewer than cfg.top_n contents; never ask for more
    # neighbours than there are indexed rows.
    n_neighbors = min(cfg.top_n, len(content))
    neighbors_model = NearestNeighbors(n_neighbors = n_neighbors, metric = 'cosine')
    neighbors_model.fit(content_preds_gpu)
    # One row of neighbour indices per topic, closest first
    indices = neighbors_model.kneighbors(topics_preds_gpu, return_distance = False)
    # Single device-to-host copy instead of one per topic
    indices_host = indices.get()
    # KNN indices are row positions in `content`, so look the ids up by position
    # rather than by index label.
    content_id_array = content['id'].values
    predictions = [' '.join(content_id_array[row]) for row in tqdm(indices_host)]
    # Store the nearest content ids on the topics frame
    topics['predict_ids'] = predictions
    # Release memory
    del topics_preds_gpu, content_preds_gpu, neighbors_model, predictions, indices, indices_host
    gc.collect()
    return topics

In [10]:
def add_parent_text(topics: pd.DataFrame):
    """Replace each topic title by its breadcrumb "title > parent > grandparent ...".

    Missing values are filled with '' first, so a topic whose `parent` is '' is
    treated as a root. The returned frame is sorted by the length of the new
    title and keeps that length in a `length` column.
    """
    topics = topics.fillna('')
    # topic id -> [own title, parent id]
    title_and_parent = {
        topic_id: [title, parent]
        for topic_id, title, parent in zip(topics['id'], topics['title'], topics['parent'])
    }
    print('done')
    # topic id -> finished breadcrumb, shared by all recursive calls
    breadcrumbs = {}

    def get_full_text(topic_id):
        if topic_id not in breadcrumbs:
            title, parent_id = title_and_parent[topic_id]
            if parent_id != '':
                breadcrumbs[topic_id] = f'{title} > {get_full_text(parent_id)}'
            else:
                breadcrumbs[topic_id] = title
        return breadcrumbs[topic_id]

    tqdm.pandas()
    topics['title'] = topics.id.progress_apply(get_full_text)
    # Sort by title length to make inference faster
    topics['length'] = topics['title'].apply(len)
    topics = topics.sort_values('length')
    return topics

In [11]:
# Read data
topics, content, correlations = read_data(CFG)

In [12]:
# add parent text
topics = add_parent_text(topics)

  0%|          | 0/76972 [00:00<?, ?it/s]

In [13]:
topics

Unnamed: 0,id,title,description,language,parent,length
230,t_9c500effd8a7,Zim,,en,,3
468,t_292b1d2b7cf2,Math,,fr,,4
462,t_51ba6ffbcc43,K-12,,en,,4
439,t_0325dce6b388,CBSE,,en,,4
331,t_fe5ced2643b3,CREE,,es,,4
...,...,...,...,...,...,...
71910,t_6bd894f141c5,10.1.2 Test for conduction of electricity by: ...,Materials in this folder have been reviewed by...,en,t_78a57f8f898c,498
42420,t_f3bc364aa1fd,El equilibrio y la agilidad como base de la gi...,,es,t_9ae84a50e636,507
49835,t_185abf7d7fb8,Conocimiento adecuado de las capacidades del s...,,es,t_e57421bc3164,512
72931,t_dcb8ebf6be97,9.2.4 Neuro-endocrine system and homeostasis: ...,Materials in this folder have been reviewed by...,en,t_939d8c56ad92,515


In [14]:
content#["length"].max()

Unnamed: 0,id,title,description,language
0,c_3c070b63a944,,,es
1,c_87e171afe50b,,,es
2,c_77105b4b84cc,,,es
3,c_db7818729577,,,es
4,c_a04562126266,,,es
...,...,...,...,...
154042,c_a21c7882270d,Documento 1,¡Cuéntalo otra vez! Read-Aloud Anthology for I...,es
154043,c_6d5cda60831d,Documento 1,¡Cuéntalo otra vez! Read-Aloud Anthology for E...,es
154044,c_9204c4b1e8f4,Documento 1,¡Cuéntalo otra vez! Read-Aloud Anthology for T...,es
154045,c_521df3a0f541,Documento 1,¡Cuéntalo otra vez! Read-Aloud Anthology for F...,es


In [15]:
content

Unnamed: 0,id,title,description,language
0,c_3c070b63a944,,,es
1,c_87e171afe50b,,,es
2,c_77105b4b84cc,,,es
3,c_db7818729577,,,es
4,c_a04562126266,,,es
...,...,...,...,...
154042,c_a21c7882270d,Documento 1,¡Cuéntalo otra vez! Read-Aloud Anthology for I...,es
154043,c_6d5cda60831d,Documento 1,¡Cuéntalo otra vez! Read-Aloud Anthology for E...,es
154044,c_9204c4b1e8f4,Documento 1,¡Cuéntalo otra vez! Read-Aloud Anthology for T...,es
154045,c_521df3a0f541,Documento 1,¡Cuéntalo otra vez! Read-Aloud Anthology for F...,es


In [16]:
# Text given to the encoder: the title, a space, then the first 100 characters
# of the description.
topics["use_text"] = topics["title"] + " " + topics["description"].str[:100]
content["use_text"] = content["title"] + " " + content["description"].str[:100]

In [17]:
order_lang = topics["language"].value_counts().index.tolist()

In [18]:
topics["language"].value_counts()

en     36161
es     13910
pt      4177
fr      3701
ar      3701
bg      2867
sw      2860
gu      2320
bn      2176
hi      1786
it       866
zh       862
mr       300
fil      247
as       167
my       135
km       121
kn       119
te        93
or        70
ur        66
ta        60
pnb       51
pl        43
tr        40
swa       35
ru        34
mul        4
Name: language, dtype: int64

In [19]:
if CFG.debug:
    order_lang = order_lang[:-3:-1]

In [20]:
# Run nearest neighbors one language at a time, in the order given by `order_lang`,
# so a topic is only matched against content of the same language.
# Texts are embedded with the model configured in CFG.model; for every topic the
# CFG.top_n contents closest in cosine distance are stored in `predict_ids`.
for lang in order_lang:
    print(f"Now:{lang}")
    topics_lang = topics.loc[topics["language"] == lang]
    content_lang = content.loc[content["language"] == lang].reset_index(drop=True)
    if not content_lang.empty:
        topics_lang = get_neighbors(topics_lang, content_lang, CFG)
        # Write the candidates back through the original index labels
        topics.loc[topics_lang.index, "predict_ids"] = topics_lang["predict_ids"]
    else:
        print(f"{lang}'s content is empty")

  0%|          | 0/566 [00:00<?, ?it/s]

  0%|          | 0/1031 [00:00<?, ?it/s]

  0%|          | 0/36161 [00:00<?, ?it/s]

  0%|          | 0/218 [00:00<?, ?it/s]

  0%|          | 0/482 [00:00<?, ?it/s]

  0%|          | 0/13910 [00:00<?, ?it/s]

  0%|          | 0/66 [00:00<?, ?it/s]

  0%|          | 0/164 [00:00<?, ?it/s]

  0%|          | 0/4177 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

  0%|          | 0/167 [00:00<?, ?it/s]

  0%|          | 0/3701 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

  0%|          | 0/116 [00:00<?, ?it/s]

  0%|          | 0/3701 [00:00<?, ?it/s]

  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/95 [00:00<?, ?it/s]

  0%|          | 0/2867 [00:00<?, ?it/s]

  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/23 [00:00<?, ?it/s]

  0%|          | 0/2860 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

  0%|          | 0/2320 [00:00<?, ?it/s]

  0%|          | 0/34 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/2176 [00:00<?, ?it/s]

  0%|          | 0/28 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/1786 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/21 [00:00<?, ?it/s]

  0%|          | 0/866 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/61 [00:00<?, ?it/s]

  0%|          | 0/862 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/247 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/167 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/135 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/121 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/119 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/93 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/70 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/66 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/60 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/51 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/43 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/34 [00:00<?, ?it/s]

In [21]:
# TODO: re-run everything from this cell downward

In [22]:
# =========================================================================================
# Get the amount of positive classes based on the total
# =========================================================================================
def get_pos_score(y_true: pd.Series, y_pred: pd.Series):
    """Mean share of each row's true ids that also appear in its predicted ids.

    Args:
        y_true: whitespace-separated ground-truth ids, one string per row.
        y_pred: whitespace-separated predicted ids, aligned with `y_true` by position.

    Returns:
        The mean per-row recall rounded to 5 decimals, or 0.0 for empty input.
        A non-string entry (e.g. NaN for a topic that never received predictions)
        counts as an empty id set, and a row without ground-truth ids scores 0.0
        instead of raising ZeroDivisionError.
    """
    def to_id_set(value):
        # NaN / None mean "no ids"; only real strings can be split
        return set(value.split()) if isinstance(value, str) else set()

    recalls = [
        len(truth & pred) / len(truth) if truth else 0.0
        for truth, pred in zip(y_true.apply(to_id_set), y_pred.apply(to_id_set))
    ]
    if not recalls:
        return 0.0
    return round(np.mean(recalls), 5)

In [23]:
if CFG.debug:
    topics["predict_ids"] = topics["predict_ids"].fillna("c_77105b4b84cc")

In [24]:
# Join the ground truth onto the topics (inner join: topics without a row in
# `correlations` are dropped), then measure which share of each topic's truly
# correlated contents made it into its candidate list.
topics = topics.merge(correlations, how="inner", left_on="id", right_on="topic_id")
pos_score = get_pos_score(topics["content_ids"], topics["predict_ids"])
print(f'Our max positive score is {pos_score}')

In [25]:
# We can delete correlations
del correlations
gc.collect()

0

In [26]:
# Set id as index for content
content.set_index('id', inplace = True)

In [27]:
# =========================================================================================
# Build our training set
# =========================================================================================
def build_training_set(topics, content, cfg):
    """Expand every (topic, candidate content) pair into one labelled training row.

    Args:
        topics: frame with `id`, `use_text`, `predict_ids` (space-separated candidate
            content ids) and `content_ids` (space-separated ground-truth ids).
        content: frame indexed by content id, with a `use_text` column.
        cfg: unused; kept so existing calls keep working.

    Returns:
        DataFrame with columns `topics_ids`, `content_ids`, `topics_texts`,
        `content_texts` and `target` (1 when the candidate is in the topic's
        ground truth, else 0). Topics whose `predict_ids` is not a string (NaN,
        i.e. no candidates were generated for them) contribute no rows instead of
        raising AttributeError.
    """
    # Create lists for training
    topics_ids = []
    content_ids = []
    topics_texts = []
    content_texts = []
    targets = []
    content_use_text = content['use_text']
    # Iterate column-wise instead of building one Series per row with iloc
    rows = zip(topics['id'], topics['use_text'], topics['predict_ids'], topics['content_ids'])
    for topics_id, topics_text, predict_ids, true_ids in tqdm(rows, total = len(topics)):
        if not isinstance(predict_ids, str):
            continue
        # Set membership keeps the label check O(1) per candidate
        ground_truth = set(true_ids.split(' '))
        for pred in predict_ids.split(' '):
            topics_ids.append(topics_id)
            content_ids.append(pred)
            topics_texts.append(topics_text)
            content_texts.append(content_use_text.loc[pred])
            # If pred is in ground truth, 1 else 0
            targets.append(1 if pred in ground_truth else 0)
    # Build training dataset
    train = pd.DataFrame(
        {'topics_ids': topics_ids,
         'content_ids': content_ids,
         'topics_texts': topics_texts,
         'content_texts': content_texts,
         'target': targets}
    )
    # Release memory
    del topics_ids, content_ids, topics_texts, content_texts, targets
    gc.collect()
    return train


In [28]:
# Build training set
train = build_training_set(topics, content, CFG)
print(f'Our training set has {len(train)} rows')

  0%|          | 0/61517 [00:00<?, ?it/s]

In [29]:
train.head()

Unnamed: 0,topics_ids,content_ids,topics_texts,content_texts,target
0,t_30dd476279c8,c_a7926808742b,Medicine,Medicine,0
1,t_30dd476279c8,c_1a30551d75b7,Medicine,Misuse of Medicines,0
2,t_30dd476279c8,c_7f13d437e8a9,Medicine,Medicine Practice,0
3,t_30dd476279c8,c_3fbfb6458578,Medicine,First Aid Treatment and its Uses,0
4,t_30dd476279c8,c_35755e8a3883,Medicine,Health and its significance,0


In [30]:
train.shape

(615170, 5)

In [31]:
# Save train set to disk to train on another notebook
train.to_csv(OUTPUT_DIR + 'train.csv', index = False)

# Upload

In [32]:
import os
os.system("pip install kaggle")
os.system("mkdir -p ~/.kaggle/")
os.system("cp /notebooks/kaggle_lecr/kaggle.json ~/.kaggle/")
os.system("chmod 600 ~/.kaggle/kaggle.json")

0

In [33]:
from kaggle.api.kaggle_api_extended import KaggleApi
import json

def dataset_create_new(dataset_name: str, upload_dir: str):
    """Publish `upload_dir` as a new Kaggle dataset named `dataset_name`.

    Writes `dataset-metadata.json` into `upload_dir`, authenticates with the
    Kaggle API and creates the dataset, uploading sub-directories as tar archives.

    Raises:
        ValueError: if `dataset_name` contains an underscore.
    """
    if dataset_name.find("_") != -1:
        raise ValueError("datasetの名称に_の使用は禁止です")
    dataset_metadata = {
        'id': f'sinchir0/{dataset_name}',
        'licenses': [{'name': 'CC0-1.0'}],
        'title': dataset_name,
    }
    metadata_path = os.path.join(upload_dir, 'dataset-metadata.json')
    with open(metadata_path, 'w') as f:
        json.dump(dataset_metadata, f, indent=4)
    api = KaggleApi()
    api.authenticate()
    api.dataset_create_new(folder=upload_dir, convert_to_csv=False, dir_mode='tar')

if CFG.upload_data:
    print(f"Create Dataset name:{NOTEBOOK_NAME}, output_dir:{OUTPUT_DIR}")
    dataset_create_new(dataset_name=NOTEBOOK_NAME, upload_dir=OUTPUT_DIR)

In [34]:
# `wandb` is only imported (and a run only started) when CFG.wandb is True
if CFG.wandb:
    wandb.finish()

VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
GPU,NVIDIA RTX A5000
