In [33]:
# Experiment name: used to build the output directory and, on upload, as the Kaggle dataset name.
NOTEBOOK_NAME = "ex12-uns-top50"

In [34]:
import os
# Directory for this experiment's outputs; created if it does not exist yet.
OUTPUT_DIR = f"/notebooks/kaggle_lecr/output/{NOTEBOOK_NAME}/"
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [35]:
# Install the pinned dependencies into the environment of the running kernel.
# `sys.executable -m pip` targets the interpreter that executes this notebook
# (a bare `pip`/`pip3` on PATH may belong to another environment), and
# `check=True` stops the notebook if an install fails instead of continuing
# with whatever versions happen to be present.
import subprocess
import sys

for pip_args in (
    ["torch==1.12.0", "--extra-index-url", "https://download.pytorch.org/whl/cu116"],
    ["tokenizers==0.12.1"],
    ["transformers==4.20.1"],
):
    subprocess.run([sys.executable, "-m", "pip", "install", *pip_args], check=True)

Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cu116












0

In [36]:
# NOTE(review): repeats the exact torch install command already issued in the previous cell.
os.system("pip3 install torch==1.12.0 --extra-index-url https://download.pytorch.org/whl/cu116")

Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cu116




0

In [37]:
# =========================================================================================
# Libraries
# =========================================================================================
import os
import gc
import time
import math
import random
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from torch.utils.checkpoint import checkpoint
import tokenizers
import transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_cosine_schedule_with_warmup, DataCollatorWithPadding
import cupy as cp
from cuml.metrics import pairwise_distances
from cuml.neighbors import NearestNeighbors
# Set TOKENIZERS_PARALLELISM=false for this kernel
# (presumably to avoid tokenizers' fork warning when DataLoader workers are used — TODO confirm).
%env TOKENIZERS_PARALLELISM=false
# Torch device used for the embedding model: CUDA when available, otherwise CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

env: TOKENIZERS_PARALLELISM=false


In [38]:
# =========================================================================================
# Configurations
# =========================================================================================
class CFG:
    """Static configuration for candidate retrieval; read as class attributes (never instantiated here)."""
    # Worker processes used by the DataLoaders that feed the embedding model.
    num_workers = 4
    # Local directory of the encoder; also used to load the tokenizer and the model config.
    model = "/notebooks/kaggle_lecr/data/lecr-finetune-para-mpnet-parent-text-data/paraphrase-multilingual-mpnet-base-v2-exp/"
    # Loaded once, when the class body is executed.
    tokenizer = AutoTokenizer.from_pretrained(model)
    # Batch size for embedding extraction.
    batch_size = 32
    # Number of nearest content items retrieved per topic (capped by the available content).
    top_n = 50
    # NOTE(review): not referenced anywhere in the visible notebook code.
    seed = 42
    # Directory prefix of topics.csv / content.csv / correlations.csv.
    data_url = "/notebooks/kaggle_lecr/data/learning-equality-curriculum-recommendations/"
    # When True, only the three languages with the fewest topics are processed.
    debug = False
    # When True, the output directory is uploaded as a new Kaggle dataset at the end.
    upload_data = True

In [39]:
# =========================================================================================
# Data Loading
# =========================================================================================
def read_data(cfg):
    """Load topics, content and correlations and prepare them for embedding.

    Missing titles/descriptions are replaced by empty strings, both frames are
    sorted by title length (so that batches contain similarly sized titles and
    need little padding), unused columns are dropped and the index is reset.

    Args:
        cfg: configuration object; only ``cfg.data_url`` (directory prefix of
            the three CSV files, including the trailing separator) is read.

    Returns:
        Tuple ``(topics, content, correlations)`` of DataFrames. ``topics``
        keeps id/title/language/parent, ``content`` keeps id/title/language,
        ``correlations`` is returned as read.
    """
    topics = pd.read_csv(cfg.data_url + 'topics.csv')
    content = pd.read_csv(cfg.data_url + 'content.csv')
    correlations = pd.read_csv(cfg.data_url + 'correlations.csv')
    # Fill missing text by assigning the result back to the column:
    # `df[col].fillna(..., inplace=True)` acts on a temporary object and does
    # not update the frame once pandas copy-on-write is in effect.
    for frame in (topics, content):
        for col in ('title', 'description'):
            frame[col] = frame[col].fillna("")
    # Sort by title length to make inference faster
    topics = (
        topics
        .assign(length = topics['title'].str.len())
        .sort_values('length')
        .drop(['description', 'channel', 'category', 'level', 'has_content', 'length'], axis = 1)
        .reset_index(drop = True)
    )
    content = (
        content
        .assign(length = content['title'].str.len())
        .sort_values('length')
        .drop(['description', 'kind', 'text', 'copyright_holder', 'license', 'length'], axis = 1)
        .reset_index(drop = True)
    )
    print(' ')
    print('-' * 50)
    print(f"topics.shape: {topics.shape}")
    print(f"content.shape: {content.shape}")
    print(f"correlations.shape: {correlations.shape}")
    return topics, content, correlations

# =========================================================================================
# Prepare input, tokenize
# =========================================================================================
def prepare_input(text, cfg):
    """Tokenize one text and return its encoding with every field as a LongTensor.

    Args:
        text: the string to encode.
        cfg: configuration object; ``cfg.tokenizer.encode_plus`` is used.

    Returns:
        The encoding produced by the tokenizer, with each value replaced by a
        ``torch.long`` tensor. No padding is applied here.
    """
    # NOTE(review): no truncation is requested — confirm that titles can never
    # exceed the maximum sequence length supported by the model.
    encoded = cfg.tokenizer.encode_plus(
        text,
        add_special_tokens = True,
        return_tensors = None,
    )
    for field in list(encoded.keys()):
        encoded[field] = torch.tensor(encoded[field], dtype = torch.long)
    return encoded

# =========================================================================================
# Unsupervised dataset
# =========================================================================================
class uns_dataset(Dataset):
    """Dataset that yields the tokenized ``title`` of each row of a DataFrame."""

    def __init__(self, df, cfg):
        # Keep only the raw titles; tokenization happens lazily per item.
        self.texts = df['title'].values
        self.cfg = cfg

    def __len__(self):
        """Number of titles in the dataset."""
        return len(self.texts)

    def __getitem__(self, item):
        """Return the tokenized title at position ``item``."""
        return prepare_input(self.texts[item], self.cfg)
    
# =========================================================================================
# Mean pooling class
# =========================================================================================
class MeanPooling(nn.Module):
    """Average token embeddings over the sequence axis, ignoring masked positions."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        """Return the mask-weighted mean of ``last_hidden_state`` along dim 1.

        ``attention_mask`` is broadcast to the shape of ``last_hidden_state``;
        positions with mask 0 contribute nothing to the sum or the count.
        """
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        summed = torch.sum(last_hidden_state * mask, 1)
        # Clamp the denominator so fully masked rows do not divide by zero.
        counts = torch.clamp(mask.sum(1), min=1e-9)
        return summed / counts

# =========================================================================================
# Unsupervised model
# =========================================================================================
class uns_model(nn.Module):
    """Pretrained transformer encoder followed by mask-aware mean pooling."""

    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        # Both the config and the weights are loaded from cfg.model.
        self.config = AutoConfig.from_pretrained(cfg.model)
        self.model = AutoModel.from_pretrained(cfg.model, config = self.config)
        self.pool = MeanPooling()

    def feature(self, inputs):
        """Encode a batch and pool its last hidden state with the attention mask."""
        hidden_states = self.model(**inputs).last_hidden_state
        return self.pool(hidden_states, inputs['attention_mask'])

    def forward(self, inputs):
        """Return the pooled embedding for ``inputs`` (same as ``feature``)."""
        return self.feature(inputs)
    
# =========================================================================================
# Get embeddings
# =========================================================================================
def get_embeddings(loader, model, device):
    """Run ``model`` over every batch of ``loader`` and stack the outputs.

    Args:
        loader: iterable of batches; each batch is a mapping of tensors.
        model: module called as ``model(batch)``; put in eval mode here.
        device: device the batch tensors are moved to before the forward pass.

    Returns:
        A numpy array with the per-batch outputs concatenated along axis 0.

    Raises:
        ValueError: if ``loader`` yields no batch (there is nothing to
            concatenate). Any other error propagates unchanged instead of
            dropping into a debugger.
    """
    model.eval()
    preds = []
    # Inference only: no autograd graph is needed for any batch.
    with torch.no_grad():
        for inputs in tqdm(loader):
            for k, v in inputs.items():
                inputs[k] = v.to(device)
            preds.append(model(inputs).to('cpu').numpy())
    if not preds:
        raise ValueError("get_embeddings received an empty loader; nothing to embed")
    return np.concatenate(preds)
    
# =========================================================================================
# Get neighbors
# =========================================================================================
def _build_title_loader(df, cfg):
    """Wrap the titles of ``df`` in an ordered DataLoader padded to the longest item per batch."""
    return DataLoader(
        uns_dataset(df, cfg),
        batch_size = cfg.batch_size,
        shuffle = False,
        collate_fn = DataCollatorWithPadding(tokenizer = cfg.tokenizer, padding = 'longest'),
        num_workers = cfg.num_workers,
        pin_memory = True,
        drop_last = False
    )


def get_neighbors(topics, content, cfg):
    """Attach to every topic the ids of its nearest content items.

    Titles of ``topics`` and ``content`` are embedded with ``uns_model`` and a
    cosine nearest-neighbour search is run from topics to content.

    Args:
        topics: DataFrame with a ``title`` column. A ``predict_ids`` column is
            assigned on this frame.
        content: DataFrame with ``title`` and ``id`` columns (ids are strings).
        cfg: configuration providing ``tokenizer``, ``batch_size``,
            ``num_workers``, ``model`` and ``top_n``.

    Returns:
        ``topics`` with ``predict_ids``: the space-separated ids of
        ``min(len(content), cfg.top_n)`` neighbours per topic, in the order
        returned by ``kneighbors``.
    """
    topics_loader = _build_title_loader(topics, cfg)
    content_loader = _build_title_loader(content, cfg)
    # Create unsupervised model to extract embeddings
    model = uns_model(cfg)
    model.to(device)
    # Predict topics and content
    topics_preds = get_embeddings(topics_loader, model, device)
    content_preds = get_embeddings(content_loader, model, device)
    # Transfer predictions to gpu
    topics_preds_gpu = cp.array(topics_preds)
    content_preds_gpu = cp.array(content_preds)
    # Release memory: drop the references first so that empty_cache() can
    # actually hand the freed blocks back before the KNN search allocates.
    del model, topics_loader, content_loader, topics_preds, content_preds
    gc.collect()
    torch.cuda.empty_cache()
    # KNN model
    print(' ')
    print('Training KNN model...')
    # A language may have fewer content rows than top_n; never ask for more
    # neighbours than there are rows to search.
    neighbors_model = NearestNeighbors(n_neighbors = min(content.shape[0], cfg.top_n), metric = 'cosine')
    neighbors_model.fit(content_preds_gpu)
    # Neighbour indices per topic, closest first
    indices = neighbors_model.kneighbors(topics_preds_gpu, return_distance = False)
    # The indices are row positions in the fitted matrix, so resolve them
    # positionally (independent of content's index labels) and with a single
    # device-to-host copy instead of one per topic.
    indices_host = indices.get()
    content_ids = content['id'].to_numpy()
    predictions = [' '.join(neighbor_ids) for neighbor_ids in content_ids[indices_host]]
    # Store the retrieved content ids on the topics frame
    topics['predict_ids'] = predictions
    # Release memory
    del topics_preds_gpu, content_preds_gpu, neighbors_model, predictions, indices, indices_host
    gc.collect()
    return topics

In [40]:
def add_parent_text(topics: pd.DataFrame):
    """Replace each topic title by the chain ``title > parent title > ... > root title``.

    Args:
        topics: DataFrame with ``id``, ``title`` and ``parent`` columns; a
            missing ``parent`` marks a root topic. The input is not modified.

    Returns:
        A NaN-filled copy of ``topics`` with the expanded ``title``, an extra
        ``length`` column (length of the new title) and rows sorted by it.
        The index is not reset.

    Raises:
        KeyError: if a ``parent`` refers to an id that is not in ``topics``.
        ValueError: if the parent links form a cycle.
    """
    topics = topics.fillna('')
    id_full_text = {}
    # id -> (own title, parent id); built column-wise, which avoids the
    # per-row Series construction of iterrows.
    id_to_text = {
        topic_id: (title, parent)
        for topic_id, title, parent in zip(topics['id'], topics['title'], topics['parent'])
    }
    print('done')
    def get_full_text(topic_id):
        # Walk up to the first ancestor whose text is already known (or past
        # the root), then fill the cache on the way back down. Iterative, so
        # deep hierarchies cannot hit the recursion limit.
        chain = []
        visiting = set()
        current = topic_id
        while current != '' and current not in id_full_text:
            if current in visiting:
                raise ValueError(f'cycle detected in topic parents at {current!r}')
            visiting.add(current)
            chain.append(current)
            current = id_to_text[current][1]
        suffix = id_full_text[current] if current != '' else None
        for node in reversed(chain):
            title = id_to_text[node][0]
            suffix = title if suffix is None else f'{title} > {suffix}'
            id_full_text[node] = suffix
        return id_full_text[topic_id]
    tqdm.pandas()
    topics['title'] = topics.id.progress_apply(get_full_text)
    # Sort by title length to make inference faster
    topics['length'] = topics['title'].apply(len)
    topics = topics.sort_values('length')
    return topics

In [41]:
# Read the three CSV files; titles/descriptions are NaN-filled and rows sorted by title length
topics, content, correlations = read_data(CFG)

 
--------------------------------------------------
topics.shape: (76972, 4)
content.shape: (154047, 3)
correlations.shape: (61517, 2)


In [42]:
# Expand every topic title with the titles of its ancestors ("title > parent > ...")
topics = add_parent_text(topics)

done


  0%|          | 0/76972 [00:00<?, ?it/s]

In [43]:
# Languages ordered by ascending topic count, so the smallest languages are processed first.
order_lang = topics["language"].value_counts().sort_values().index.tolist()

In [44]:
# Number of topics per language, fewest first.
topics["language"].value_counts().sort_values()

mul        4
ru        34
swa       35
tr        40
pl        43
pnb       51
ta        60
ur        66
or        70
te        93
kn       119
km       121
my       135
as       167
fil      247
mr       300
zh       862
it       866
hi      1786
bn      2176
gu      2320
sw      2860
bg      2867
ar      3701
fr      3701
pt      4177
es     13910
en     36161
Name: language, dtype: int64

In [45]:
# Debug mode: keep only the three languages with the fewest topics for a quick run.
if CFG.debug:
    order_lang = order_lang[:3]

In [46]:
# Run nearest neighbors, one language at a time: topics are only matched
# against content written in the same language, by cosine similarity of the
# title embeddings produced by CFG.model (at most CFG.top_n candidates each).
for lang in order_lang:
    lang_content = content.loc[content["language"] == lang].reset_index(drop=True)
    if lang_content.empty:
        # Nothing to retrieve from: topics of this language get no predict_ids.
        print(f"{lang}'s content is empty")
        continue
    lang_topics = get_neighbors(topics.loc[topics["language"] == lang], lang_content, CFG)
    # Write the candidates back onto the full frame, aligned on the index.
    topics.loc[lang_topics.index, "predict_ids"] = lang_topics["predict_ids"]

mul's content is empty


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

Exception in thread QueueFeederThread:
Traceback (most recent call last):
  File "/opt/conda/envs/rapids/lib/python3.9/multiprocessing/queues.py", line 239, in _feed
    reader_close()
  File "/opt/conda/envs/rapids/lib/python3.9/multiprocessing/connection.py", line 182, in close
    self._close()
  File "/opt/conda/envs/rapids/lib/python3.9/multiprocessing/connection.py", line 366, in _close
    _close(self._handle)
OSError: [Errno 9] Bad file descriptor

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/conda/envs/rapids/lib/python3.9/threading.py", line 980, in _bootstrap_inner
    self.run()
  File "/opt/conda/envs/rapids/lib/python3.9/threading.py", line 917, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/conda/envs/rapids/lib/python3.9/multiprocessing/queues.py", line 271, in _feed
    queue_sem.release()
ValueError: semaphore or lock released too many times


 
Training KNN model...
> [0;32m/tmp/ipykernel_91/4237422885.py[0m(156)[0;36mget_neighbors[0;34m()[0m
[0;32m    154 [0;31m    [0mprint[0m[0;34m([0m[0;34m'Training KNN model...'[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    155 [0;31m    [0;32mimport[0m [0mpdb[0m[0;34m;[0m [0mpdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 156 [0;31m    [0mneighbors_model[0m [0;34m=[0m [0mNearestNeighbors[0m[0;34m([0m[0mn_neighbors[0m [0;34m=[0m [0mcfg[0m[0;34m.[0m[0mtop_n[0m[0;34m,[0m [0mmetric[0m [0;34m=[0m [0;34m'cosine'[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    157 [0;31m    [0mneighbors_model[0m[0;34m.[0m[0mfit[0m[0;34m([0m[0mcontent_preds_gpu[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    158 [0;31m    [0;31m# 近い順にcfg.top_n個のindexが返ってくる[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  content


                 id                                              title  \
0    c_b3dc2ac51e6f                                               Вода   
1    c_8a869c785873                                               Вода   
2    c_8d0a9e88e75a                                            Лягушка   
3    c_11026b702f6a                                           На кухне   
4    c_95058967f183                                           На кухне   
..              ...                                                ...   
183  c_8d51f793661d  Арт-переработка: Как сделать изголовье кровати...   
184  c_779f51c11bc6  Арт-переработка: Как сделать журнальный столик...   
185  c_682369aa6231  Арт-переработка: Как сделать изголовье кровати...   
186  c_2ede7cdd55d2  Арт-переработка: Создание эффекта старины при ...   
187  c_270887f26948  Арт-переработка: Создание эффекта старины при ...   

    language  
0         ru  
1         ru  
2         ru  
3         ru  
4         ru  
..       ...  
183   

ipdb>  content.shape[0]


188


ipdb>  q


BdbQuit: 

In [None]:
# =========================================================================================
# Get the amount of positive classes based on the total
# =========================================================================================
def get_pos_score(y_true: pd.Series, y_pred: pd.Series):
    """Mean share of each row's true ids that also appear among its predicted ids.

    Args:
        y_true: whitespace-separated ground-truth ids per row.
        y_pred: whitespace-separated predicted ids per row. A non-string value
            (e.g. NaN for a topic that received no candidates) counts as an
            empty prediction.

    Returns:
        The mean of ``|true & pred| / |true|`` over all rows, rounded to 5
        decimals. A row with no true ids contributes 0.0.
    """
    def _as_id_set(value):
        # NaN / None mean "no ids" rather than a crash on .split().
        return set(value.split()) if isinstance(value, str) else set()

    scores = []
    for truth, pred in zip(y_true, y_pred):
        truth_ids = _as_id_set(truth)
        pred_ids = _as_id_set(pred)
        scores.append(len(truth_ids & pred_ids) / len(truth_ids) if truth_ids else 0.0)
    return round(np.mean(np.array(scores)), 5)

In [None]:
# Debug mode: only three languages were processed, so the remaining topics have
# no predict_ids; fill them with a single fixed content id so later cells can run.
if CFG.debug:
    topics["predict_ids"] = topics["predict_ids"].fillna("c_77105b4b84cc")

In [None]:
# Merge with target and compute max positive score:
# the average share of a topic's truly correlated content that appears among its retrieved candidates.
# The inner merge keeps only topics that have a row in correlations.
# NOTE: `topics` is overwritten by the merge result, so this cell is not meant to be run twice.
topics = topics.merge(correlations, how = 'inner', left_on = ['id'], right_on = ['topic_id'])
pos_score = get_pos_score(topics['content_ids'], topics['predict_ids'])
print(f'Our max positive score is {pos_score}')

In [None]:
# We can delete correlations: its content_ids column is now part of topics (merged above)
del correlations
gc.collect()

In [None]:
# Set id as index for content, so that build_training_set can look titles up by content id
# NOTE: in-place; `id` stops being a column after this cell.
content.set_index('id', inplace = True)

In [None]:
# =========================================================================================
# Build our training set
# =========================================================================================
def build_training_set(topics, content, cfg):
    """Expand topics into one row per (topic, candidate content) pair with a binary target.

    Args:
        topics: DataFrame with ``id``, ``title``, ``predict_ids`` (candidate
            content ids separated by single spaces) and ``content_ids``
            (ground-truth ids separated by single spaces).
        content: DataFrame indexed by content id with a ``title`` column.
        cfg: unused; kept so existing callers keep working.

    Returns:
        DataFrame with columns ``topics_ids``, ``content_ids``,
        ``topics_titles``, ``content_titles`` and ``target`` (1 when the
        candidate is in the topic's ground truth, else 0). Topics without
        candidates (missing or empty ``predict_ids``) contribute no rows.

    Raises:
        KeyError: if a candidate id is not present in ``content``.
    """
    topics_ids = []
    content_ids = []
    topics_titles = []
    content_titles = []
    targets = []
    # One dictionary lookup per pair instead of a label-based .loc per pair.
    title_by_content_id = content['title'].to_dict()
    rows = zip(topics['id'], topics['title'], topics['predict_ids'], topics['content_ids'])
    # Iterate over each topic
    for topics_id, topics_title, predict_ids, truth_ids in tqdm(rows, total = len(topics)):
        # Topics whose language had no content carry NaN here: nothing to pair.
        if not isinstance(predict_ids, str) or not predict_ids:
            continue
        ground_truth = set(truth_ids.split(' '))
        for pred in predict_ids.split(' '):
            topics_ids.append(topics_id)
            content_ids.append(pred)
            topics_titles.append(topics_title)
            content_titles.append(title_by_content_id[pred])
            # If pred is in ground truth, 1 else 0
            targets.append(1 if pred in ground_truth else 0)
    # Build training dataset
    train = pd.DataFrame(
        {'topics_ids': topics_ids,
         'content_ids': content_ids,
         'topics_titles': topics_titles,
         'content_titles': content_titles,
         'target': targets}
    )
    return train


In [None]:
# Build training set: one row per (topic, candidate content) pair with a 0/1 target
train = build_training_set(topics, content, CFG)
print(f'Our training set has {len(train)} rows')

In [None]:
# Preview the first rows of the pair table
train.head()

In [None]:
# (number of pairs, number of columns)
train.shape

In [None]:
# Save train set to disk to train on another notebook (file name carries the top_n used for retrieval)
train.to_csv(OUTPUT_DIR + f'train_{CFG.top_n}.csv', index = False)

# Upload

In [None]:
import os
import shutil
import subprocess
import sys

# Install the Kaggle client into the environment of the running kernel and
# stop here if the install fails, rather than failing later on the import.
subprocess.run([sys.executable, "-m", "pip", "install", "kaggle"], check=True)

# Copy the API token into ~/.kaggle/ and make it readable by the owner only.
# Standard-library calls raise on failure (e.g. missing token file), whereas
# the exit codes of the equivalent shell commands would be ignored.
kaggle_dir = os.path.expanduser("~/.kaggle")
os.makedirs(kaggle_dir, exist_ok=True)
kaggle_token = shutil.copy("/notebooks/kaggle_lecr/kaggle.json", kaggle_dir)
os.chmod(kaggle_token, 0o600)

In [None]:
from kaggle.api.kaggle_api_extended import KaggleApi
import json

def dataset_create_new(dataset_name: str, upload_dir: str, owner: str = 'sinchir0'):
    """Write ``dataset-metadata.json`` into ``upload_dir`` and create a new Kaggle dataset from it.

    Args:
        dataset_name: used as both the dataset slug and its title; must not
            contain an underscore.
        upload_dir: directory whose contents are uploaded; the metadata file is
            written into it.
        owner: Kaggle account under which the dataset is created. Defaults to
            the account that was previously hard-coded.

    Raises:
        ValueError: if ``dataset_name`` contains ``_``.
    """
    if "_" in dataset_name:
        raise ValueError("datasetの名称に_の使用は禁止です")
    dataset_metadata = {
        'id': f'{owner}/{dataset_name}',
        'licenses': [{'name': 'CC0-1.0'}],
        'title': dataset_name,
    }
    metadata_path = os.path.join(upload_dir, 'dataset-metadata.json')
    with open(metadata_path, 'w', encoding='utf-8') as f:
        json.dump(dataset_metadata, f, indent=4)
    api = KaggleApi()
    api.authenticate()
    # Upload the directory as a tar archive, without converting files to CSV.
    api.dataset_create_new(folder=upload_dir, convert_to_csv=False, dir_mode='tar')

# Publish the output directory as a new Kaggle dataset named after this notebook (only when enabled in CFG).
if CFG.upload_data:
    print(f"Create Dataset name:{NOTEBOOK_NAME}, output_dir:{OUTPUT_DIR}")
    dataset_create_new(dataset_name=NOTEBOOK_NAME, upload_dir=OUTPUT_DIR)