In [1]:
NOTEBOOK_NAME = "ex12-trn-top50"

In [2]:
import os
OUTPUT_DIR = f"/notebooks/kaggle_lecr/output/{NOTEBOOK_NAME}"
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [3]:
os.system("pip3 install torch==1.12.0 --extra-index-url https://download.pytorch.org/whl/cu116")
os.system("pip install tokenizers==0.12.1")
os.system("pip install transformers==4.20.1")

Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cu116












0

In [4]:
!nvidia-smi

Tue Feb 28 14:47:24 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.73.05    Driver Version: 510.73.05    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA RTX A6000    Off  | 00000000:00:05.0 Off |                  Off |
| 30%   36C    P8    20W / 300W |  26439MiB / 49140MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [5]:
import os
os.system('pip install python-dotenv')

from dotenv import load_dotenv
load_dotenv()





True

In [6]:
os.system("pip install scikit-learn==1.2.1")





0

In [7]:
# =========================================================================================
# Libraries
# =========================================================================================
import os
import gc
import time
import pickle
import math
import random
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from torch.utils.checkpoint import checkpoint
import tokenizers
import transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_cosine_schedule_with_warmup, DataCollatorWithPadding
from sklearn.model_selection import StratifiedGroupKFold
%env TOKENIZERS_PARALLELISM=true
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

env: TOKENIZERS_PARALLELISM=true


In [8]:
# =========================================================================================
# Configurations
# =========================================================================================
class CFG:
    # Static settings for this run; every value is read as a class attribute.

    # --- run control / logging ---
    debug = False
    upload_data = True
    wandb = True
    print_freq = 500
    num_workers = 4

    # --- reproducibility / cross-validation ---
    seed = 42
    n_folds = 5

    # --- model / tokenisation ---
    model = "xlm-roberta-base"
    gradient_checkpointing = False
    max_len = 512

    # --- optimisation ---
    epochs = 5
    batch_size = 128
    encoder_lr = 1e-5
    decoder_lr = 1e-4
    weight_decay = 0.01
    eps = 1e-6
    betas = (0.9, 0.999)
    max_grad_norm = 0.012

    # --- learning-rate schedule (cosine with warm-up) ---
    warmup_ratio = 0.1
    num_cycles = 0.5

    # --- data locations ---
    data_url = "/notebooks/kaggle_lecr/data/learning-equality-curriculum-recommendations"
    train_set_url = "/notebooks/kaggle_lecr/output/ex12-uns-top50"

In [9]:
# ====================================================
# wandb
# ====================================================
if CFG.wandb:
    # Install the pinned wandb client at run time, then import it.
    os.system('pip install wandb==0.13.3')
    import wandb

    try:
        # for kaggle
        # from kaggle_secrets import UserSecretsClient
        # user_secrets = UserSecretsClient()
        # secret_value_0 = user_secrets.get_secret("wandb_api")

        # for paperspace
        secret_value_0 = os.getenv('WANDB_API_KEY')
        wandb.login(key=secret_value_0)

        anony = None
    except Exception as exc:
        # Catch only ordinary errors: a bare `except:` would also swallow
        # KeyboardInterrupt / SystemExit and hide why the login failed.
        anony = "must"
        print(f'wandb login failed: {exc!r}')
        print('If you want to use your W&B account, go to Add-ons -> Secrets and provide your W&B access token. Use the Label name as wandb_api. \nGet your W&B access token from here: https://wandb.ai/authorize')


    def class2dict(f):
        """Return the non-dunder attributes of ``f`` as a ``{name: value}`` dict."""
        return {name: getattr(f, name) for name in dir(f) if not name.startswith('__')}

    # Start the run; the whole CFG class is logged as the run configuration.
    run = wandb.init(project='LECR',
                     entity="sinchir0",
                     name=NOTEBOOK_NAME,
                     config=class2dict(CFG),
                     group="trn",
                     job_type="train",
                     anonymous=anony)



[34m[1mwandb[0m: Currently logged in as: [33msinchir0[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [10]:
# =========================================================================================
# Seed everything for deterministic results
# =========================================================================================
def seed_everything(cfg):
    """Seed the Python, NumPy and PyTorch (CPU + current CUDA device) RNGs.

    Also exports ``PYTHONHASHSEED`` and switches cuDNN to deterministic mode.

    Args:
        cfg: object exposing an integer ``seed`` attribute.
    """
    seed = cfg.seed
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)
    torch.backends.cudnn.deterministic = True

In [23]:
# =========================================================================================
# Data Loading
# =========================================================================================
def read_data(cfg):
    """Load the candidate-pair training table and the competition CSVs.

    Missing values in the text columns used later are replaced by empty strings.

    Args:
        cfg: object with ``train_set_url`` (directory holding ``train_50.csv``)
            and ``data_url`` (directory holding ``topics.csv``, ``content.csv``
            and ``correlations.csv``).

    Returns:
        Tuple ``(train, topics, content, correlations)`` of DataFrames.
    """
    train = pd.read_csv(f"{cfg.train_set_url}/train_50.csv")
    train["content_titles"] = train["content_titles"].fillna("")

    topics, content, correlations = (
        pd.read_csv(cfg.data_url + "/" + file_name)
        for file_name in ("topics.csv", "content.csv", "correlations.csv")
    )

    # Text columns that must not contain NaN, per table.
    text_columns = (
        (topics, ("title", "description")),
        (content, ("title", "description", "text")),
    )
    for frame, columns in text_columns:
        for column in columns:
            frame[column] = frame[column].fillna("")

    print(' ')
    print('-' * 50)
    print(f"train.shape: {train.shape}")
    print(f"correlations.shape: {correlations.shape}")
    return train, topics, content, correlations

In [12]:
def preprocess(train: pd.DataFrame):
    """Build the model input column ``text`` as ``topics_texts + '[SEP]' + content_texts``.

    The column is added to ``train`` in place and the same frame is returned.
    """
    # NOTE(review): '[SEP]' is inserted as literal text; confirm the tokenizer in
    # use actually treats it as a separator token.
    separator = '[SEP]'
    train['text'] = train['topics_texts'].add(separator).add(train['content_texts'])
    return train

In [13]:
def merge_cat_info(train: pd.DataFrame, topics: pd.DataFrame, content:pd.DataFrame):
    """Prefix topic / content texts with level, category and kind tags.

    Topic ``level`` and ``category`` and content ``kind`` are looked up by id and
    turned into tags such as ``[LEVEL4]``, ``[CATEGORY_SOURCE]`` and
    ``[KIND_VIDEO]``. The tags are prepended to ``train['topics_texts']`` and
    ``train['content_texts']`` (``train`` is modified in place).

    Args:
        train: frame with ``topics_ids``, ``content_ids``, ``topics_texts`` and
            ``content_texts`` columns.
        topics: frame with ``id``, ``level`` and ``category`` columns.
        content: frame with ``id`` and ``kind`` columns.

    Returns:
        ``(train, level_tag_list, category_list, kind_list)`` where the three
        lists hold the sorted distinct tags that were generated.
    """
    merge_train = pd.merge(train, topics[["id", "level", "category"]], left_on="topics_ids", right_on="id", how="left")
    merge_train = merge_train.drop("id", axis=1)
    merge_train = pd.merge(merge_train, content[["id", "kind"]], left_on="content_ids", right_on="id", how="left")
    merge_train = merge_train.drop("id", axis=1)

    merge_train["level_tag"] = merge_train["level"].apply(lambda x: f"[LEVEL{x}]")
    merge_train["category_tag"] = merge_train["category"].apply(lambda x: f"[CATEGORY_{x.upper()}]")
    merge_train["kind_tag"] = merge_train["kind"].apply(lambda x: f"[KIND_{x.upper()}]")

    level_tag_list = sorted(merge_train["level_tag"].unique())
    category_list = sorted(merge_train["category_tag"].unique())
    kind_list = sorted(merge_train["kind_tag"].unique())

    # The merged frame carries a fresh 0..n-1 index while ``train`` may not, so
    # write the tagged texts back by position rather than by index alignment
    # (alignment would silently yield NaN / shifted rows for a non-default index).
    tagged_topics = merge_train['level_tag'] + merge_train["category_tag"] + merge_train['topics_texts']
    tagged_content = merge_train['kind_tag'] + merge_train['content_texts']
    train['topics_texts'] = tagged_topics.to_numpy()
    train['content_texts'] = tagged_content.to_numpy()

    return train, level_tag_list, category_list, kind_list

In [14]:
# =========================================================================================
# CV split
# =========================================================================================
def cv_split(train, cfg):
    """Assign a validation fold to every row, stratified by target and grouped by topic.

    Args:
        train: frame with ``target`` and ``topics_ids`` columns.
        cfg: object with ``n_folds`` and ``seed``.

    Returns:
        ``train`` itself, with an integer ``fold`` column added in place.
    """
    kfold = StratifiedGroupKFold(n_splits = cfg.n_folds, shuffle = True, random_state = cfg.seed)
    # ``split`` yields positional indices, so collect folds in a positional array
    # instead of writing through label-based ``.loc`` (which is only correct when
    # the frame happens to carry a 0..n-1 index).
    fold_of_row = np.full(len(train), -1, dtype = int)
    for num, (_, val_index) in enumerate(kfold.split(train, train['target'], train['topics_ids'])):
        fold_of_row[val_index] = num
    train['fold'] = fold_of_row
    return train

In [15]:
# =========================================================================================
# F2 score metric
# =========================================================================================
def f2_score(y_true, y_pred):
    """Mean per-row F2 score between two Series of space-separated id strings.

    Each row is scored as ``tp / (tp + 0.2 * fp + 0.8 * fn)`` on the sets of ids.
    A row whose truth and prediction are both empty scores 0.0 instead of
    producing a 0/0 NaN.

    Args:
        y_true: pandas Series of ground-truth id strings.
        y_pred: pandas Series of predicted id strings, in the same row order.

    Returns:
        The mean score rounded to 4 decimals (0.0 for empty input).
    """
    y_true = y_true.apply(lambda x: set(x.split()))
    y_pred = y_pred.apply(lambda x: set(x.split()))
    pairs = list(zip(y_true, y_pred))
    tp = np.array([len(truth & pred) for truth, pred in pairs], dtype = float)
    fp = np.array([len(pred - truth) for truth, pred in pairs], dtype = float)
    fn = np.array([len(truth - pred) for truth, pred in pairs], dtype = float)
    if tp.size == 0:
        return 0.0
    denominator = tp + 0.2 * fp + 0.8 * fn
    # Divide only where the denominator is non-zero; the remaining rows keep 0.0.
    f2 = np.divide(tp, denominator, out = np.zeros_like(denominator), where = denominator > 0)
    return round(f2.mean(), 4)

In [16]:
# =========================================================================================
# Get max length
# =========================================================================================
def get_max_length(train, cfg):
    """Shrink ``cfg.max_len`` to the longest tokenised text (+5), if that is smaller.

    Every entry of ``train['text']`` (NaN treated as an empty string) is tokenised
    with ``cfg.tokenizer`` without special tokens. ``cfg.max_len`` is updated in
    place and printed; nothing is returned.
    """
    texts = train['text'].fillna("").values
    token_counts = [
        len(cfg.tokenizer(text, add_special_tokens = False)['input_ids'])
        for text in tqdm(texts, total = len(train))
    ]
    longest = max(token_counts)
    # +5 head-room (original note: cls + sep + level + category + kind).
    cfg.max_len = min(longest + 5, cfg.max_len)
    print(f"max_len: {cfg.max_len}")

In [17]:
# =========================================================================================
# Prepare input, tokenize
# =========================================================================================
def prepare_input(text, cfg):
    """Tokenise one text into fixed-length ``torch.long`` tensors.

    The text is encoded with special tokens, truncated and padded to
    ``cfg.max_len`` by ``cfg.tokenizer.encode_plus``.

    Args:
        text: the string to encode.
        cfg: object with ``tokenizer`` and ``max_len``.

    Returns:
        The tokenizer output mapping with every value converted to a
        ``torch.long`` tensor.
    """
    inputs = cfg.tokenizer.encode_plus(
        text,
        return_tensors = None,
        add_special_tokens = True,
        max_length = cfg.max_len,
        # `pad_to_max_length=True` is deprecated; this is its documented replacement.
        padding = "max_length",
        truncation = True
    )
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype = torch.long)
    return inputs

# =========================================================================================
# Custom dataset
# =========================================================================================
class custom_dataset(Dataset):
    """Dataset yielding ``(tokenised inputs, float label)`` pairs.

    Texts come from ``df['text']`` and labels from ``df['target']``; tokenisation
    happens lazily in ``__getitem__`` via ``prepare_input``.
    """

    def __init__(self, df, cfg):
        self.cfg = cfg
        self.texts, self.labels = df['text'].values, df['target'].values

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text, target = self.texts[item], self.labels[item]
        encoded = prepare_input(text, self.cfg)
        return encoded, torch.tensor(target, dtype = torch.float)
    
# =========================================================================================
# Collate function for training
# =========================================================================================
def collate(inputs):
    """Trim every tensor in ``inputs`` to the longest attended sequence in the batch.

    The length is the largest row sum of ``inputs['attention_mask']``; all entries
    are sliced to that many columns. The mapping is updated in place and returned.
    """
    longest = int(inputs["attention_mask"].sum(dim=1).max())
    for name in list(inputs.keys()):
        inputs[name] = inputs[name][:, :longest]
    return inputs

# =========================================================================================
# Mean pooling class
# =========================================================================================
class MeanPooling(nn.Module):
    """Average token embeddings over the positions selected by the attention mask."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        # Broadcast the mask over the hidden dimension so masked-out tokens add nothing.
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        masked_sum = (last_hidden_state * mask).sum(dim=1)
        # Clamp the token count so an all-zero mask cannot divide by zero.
        token_count = mask.sum(dim=1).clamp(min=1e-9)
        return masked_sum / token_count
    
# =========================================================================================
# Model
# =========================================================================================
class custom_model(nn.Module):
    """Pretrained transformer encoder + mean pooling + one-logit linear head."""

    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states = True)
        # Switch off every dropout setting on the config.
        for dropout_attr in (
            "hidden_dropout",
            "hidden_dropout_prob",
            "attention_dropout",
            "attention_probs_dropout_prob",
        ):
            setattr(self.config, dropout_attr, 0.0)
        self.model = AutoModel.from_pretrained(cfg.model, config = self.config)
        # When additional_special_tokens are passed to AutoTokenizer.from_pretrained,
        # the added tokens apparently have to be registered as new vocabulary entries.
        # https://cocoinit23.com/pytorch-runtimeerror-cuda-error-device-side-assert-triggered/
        self.model.resize_token_embeddings(len(cfg.tokenizer))
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()
        self.pool = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, 1)
        self._init_weights(self.fc)

    def _init_weights(self, module):
        """Re-initialise a Linear, Embedding or LayerNorm module in place."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
            return
        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
            return
        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs):
        """Return the mask-weighted mean of the encoder's last hidden state."""
        encoder_out = self.model(**inputs)
        return self.pool(encoder_out.last_hidden_state, inputs['attention_mask'])

    def forward(self, inputs):
        """Return the linear head's output for the pooled features."""
        return self.fc(self.feature(inputs))
    
# =========================================================================================
# Helper functions
# =========================================================================================
class AverageMeter(object):
    """Keeps the latest value and the running weighted average of a metric."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Zero the latest value, average, weighted sum and weight count."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` observed with weight ``n`` and refresh the average."""
        self.val = val
        self.sum = self.sum + val * n
        self.count = self.count + n
        self.avg = self.sum / self.count

def asMinutes(s):
    """Format a duration in seconds as ``'<minutes>m <seconds>s'``."""
    minutes = math.floor(s / 60)
    seconds = s - minutes * 60
    return '%dm %ds' % (minutes, seconds)

def timeSince(since, percent):
    """Return ``'<elapsed> (remain <estimated remaining>)'`` for a running task.

    Args:
        since: start timestamp as returned by ``time.time()``.
        percent: fraction of the work completed so far (must be non-zero).
    """
    elapsed = time.time() - since
    # Extrapolate the total duration from the completed fraction.
    remaining = elapsed / (percent) - elapsed
    return '%s (remain %s)' % (asMinutes(elapsed), asMinutes(remaining))

# =========================================================================================
# Train function loop
# =========================================================================================
def train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, device, cfg):
    """Run one training epoch with mixed precision and gradient clipping.

    For every batch the inputs are trimmed with ``collate``, moved to ``device``
    and passed through ``model`` under autocast. The scaled loss is
    back-propagated, gradients are unscaled and clipped to ``cfg.max_grad_norm``,
    then the optimizer and scheduler are stepped. Progress is printed every
    ``cfg.print_freq`` steps and on the last step.

    Args:
        train_loader: iterable of ``(inputs, target)`` batches with a length.
        model: module called as ``model(inputs)``.
        criterion: loss called as ``criterion(flat_predictions, target)``.
        optimizer: optimizer stepped once per batch.
        epoch: zero-based epoch index (used for logging only).
        scheduler: LR scheduler stepped once per batch.
        device: device the batch tensors are moved to.
        cfg: object with ``max_grad_norm`` and ``print_freq``.

    Returns:
        The sample-weighted average loss over the epoch.
    """
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled = True)
    losses = AverageMeter()
    start = time.time()
    num_steps = len(train_loader)
    for step, (inputs, target) in enumerate(train_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        target = target.to(device)
        batch_size = target.size(0)
        with torch.cuda.amp.autocast(enabled = True):
            y_preds = model(inputs)
            loss = criterion(y_preds.view(-1), target)
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        # Unscale before clipping so the threshold applies to the true gradients.
        scaler.unscale_(optimizer)
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), cfg.max_grad_norm)
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()
        scheduler.step()
        if step % cfg.print_freq == 0 or step == (num_steps - 1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch + 1,
                          step,
                          num_steps,
                          remain = timeSince(start, float(step + 1) / num_steps),
                          loss = losses,
                          grad_norm = grad_norm,
                          # get_last_lr() is the supported accessor; get_lr() is
                          # only meant to be called from inside scheduler.step().
                          lr = scheduler.get_last_lr()[0]))
    return losses.avg

# =========================================================================================
# Valid function loop
# =========================================================================================
def valid_fn(valid_loader, model, criterion, device, cfg):
    """Evaluate ``model`` on ``valid_loader`` and collect sigmoid probabilities.

    Args:
        valid_loader: iterable of ``(inputs, target)`` batches with a length.
        model: module called as ``model(inputs)``; switched to eval mode here.
        criterion: loss called as ``criterion(flat_predictions, target)``.
        device: device the batch tensors are moved to.
        cfg: object with ``print_freq``.

    Returns:
        ``(average loss, predictions)`` where ``predictions`` is a 1-D NumPy array
        of probabilities in loader order.
    """
    model.eval()
    losses = AverageMeter()
    batch_probs = []
    started = time.time()
    total_batches = len(valid_loader)
    for step, (inputs, target) in enumerate(valid_loader):
        inputs = collate(inputs)
        for name in list(inputs.keys()):
            inputs[name] = inputs[name].to(device)
        target = target.to(device)
        with torch.no_grad():
            y_preds = model(inputs)
        loss = criterion(y_preds.view(-1), target)
        losses.update(loss.item(), target.size(0))
        # Flatten to 1-D so a single-row batch concatenates like any other.
        batch_probs.append(y_preds.sigmoid().reshape(-1).cpu().numpy())
        is_last = step == (total_batches - 1)
        if step % cfg.print_freq == 0 or is_last:
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step,
                          total_batches,
                          loss = losses,
                          remain = timeSince(started, float(step + 1) / total_batches)))
    return losses.avg, np.concatenate(batch_probs, axis = 0)

# =========================================================================================
# Get best threshold
# =========================================================================================
def get_best_threshold(x_val, val_predictions, correlations, thresholds = None):
    """Find the probability threshold that maximises the topic-level F2 score.

    For each candidate threshold, rows of ``x_val`` whose prediction exceeds it
    are grouped into a space-separated content list per topic (topics with no
    selected content get an empty prediction) and scored against
    ``correlations`` with ``f2_score``.

    Args:
        x_val: frame with ``topics_ids`` and ``content_ids`` columns; it is not
            modified.
        val_predictions: one probability per row of ``x_val``, in row order.
        correlations: frame with ``topic_id`` and ``content_ids`` (space-separated
            ground-truth content ids).
        thresholds: optional iterable of thresholds to try; defaults to
            ``np.arange(0.001, 0.1, 0.001)``.

    Returns:
        ``(best_score, best_threshold)``. When no threshold scores above 0 the
        score is 0 and the first threshold tried is returned, so callers can
        always format the threshold as a number; ``best_threshold`` is ``None``
        only if ``thresholds`` is empty.
    """
    if thresholds is None:
        thresholds = np.arange(0.001, 0.1, 0.001)
    val_predictions = np.asarray(val_predictions)
    topic_ids = x_val['topics_ids'].to_numpy()
    content_ids = x_val['content_ids'].to_numpy()

    # Loop-invariant: one row per validation topic, joined to its ground truth.
    scored = pd.DataFrame({'topic_id': pd.unique(topic_ids)})
    scored = scored.merge(correlations, how = 'left', on = 'topic_id')

    best_score = 0
    best_threshold = None
    for thres in thresholds:
        selected = val_predictions > thres
        if selected.any():
            picked = pd.DataFrame({
                'topic_id': topic_ids[selected],
                'content_id': content_ids[selected],
            }).drop_duplicates()
            joined = picked.groupby('topic_id')['content_id'].agg(lambda ids: ' '.join(ids))
            # Topics without any selected content map to NaN -> empty prediction.
            predicted = scored['topic_id'].map(joined).fillna("")
        else:
            predicted = pd.Series("", index = scored.index)
        score = f2_score(scored['content_ids'], predicted)
        if best_threshold is None:
            # Fallback so an all-zero search still yields a numeric threshold.
            best_threshold = thres
        if score > best_score:
            best_score = score
            best_threshold = thres
    return best_score, best_threshold
    
# =========================================================================================
# Train & Evaluate
# =========================================================================================
def train_and_evaluate_one_fold(train, correlations, fold, cfg, add_topic_content: list):
    """Train on every fold except ``fold`` and evaluate on ``fold``.

    After each epoch the validation predictions are scored with
    ``get_best_threshold``; a checkpoint is written per epoch, and a separate
    "best" checkpoint whenever the score improves (the first epoch always counts
    as best so far). The final score and threshold are pickled to
    ``{OUTPUT_DIR}/score.pkl``.

    Args:
        train: frame with ``fold``, ``topics_ids``, ``content_ids``,
            ``topics_texts``, ``text`` and ``target`` columns.
        correlations: ground-truth frame passed to ``get_best_threshold``.
        fold: fold number used as the validation split.
        cfg: configuration object (model name, tokenizer, batch size, learning
            rates, epochs, ...).
        add_topic_content: ``topics_ids + content_ids`` keys of positive pairs
            that were added to the training set; they are excluded from validation.

    Raises:
        ValueError: if ``cfg.epochs`` is smaller than 1.
    """
    if cfg.epochs < 1:
        raise ValueError(f"cfg.epochs must be >= 1, got {cfg.epochs}")

    # Speeds training up, but computation is no longer guaranteed reproducible.
    # https://qiita.com/sugulu_Ogawa_ISID/items/62f5f7adee083d96a587
    torch.backends.cudnn.benchmark = True

    print(' ')
    print(f"========== fold: {fold} training ==========")
    # Split train & validation
    x_train = train[train['fold'] != fold]
    x_val = train[train['fold'] == fold]

    # Topics whose category is "source" are not used for evaluation.
    x_val = x_val[~x_val["topics_texts"].str.contains("CATEGORY_SOURCE")]

    # Positive topic/content pairs that were added are not used for evaluation.
    x_val = x_val[~(x_val["topics_ids"] + x_val["content_ids"]).isin(add_topic_content)]

    # Own the validation frame so helpers that add columns do not write to a slice.
    x_val = x_val.copy()

    train_dataset = custom_dataset(x_train, cfg)
    valid_dataset = custom_dataset(x_val, cfg)
    train_loader = DataLoader(
        train_dataset,
        batch_size = cfg.batch_size,
        shuffle = True,
        num_workers = cfg.num_workers,
        pin_memory = True,
        drop_last = True
    )
    valid_loader = DataLoader(
        valid_dataset,
        batch_size = cfg.batch_size,
        shuffle = False,
        num_workers = cfg.num_workers,
        pin_memory = True,
        drop_last = False
    )
    # Get model
    model = custom_model(cfg)
    model.to(device)
    # Optimizer
    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay = 0.0):
        """Build parameter groups: encoder (decayed / not decayed) and the head."""
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
            'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
            'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
            'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters
    optimizer_parameters = get_optimizer_params(
        model,
        encoder_lr = cfg.encoder_lr,
        decoder_lr = cfg.decoder_lr,
        weight_decay = cfg.weight_decay
    )
    optimizer = AdamW(
        optimizer_parameters,
        lr = cfg.encoder_lr,
        eps = cfg.eps,
        betas = cfg.betas
    )
    # The scheduler is stepped once per batch and the loader drops the last
    # partial batch, so count the steps from the loader itself.
    num_train_steps = len(train_loader) * cfg.epochs
    num_warmup_steps = int(num_train_steps * cfg.warmup_ratio)
    # Scheduler
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps = num_warmup_steps,
        num_training_steps = num_train_steps,
        num_cycles = cfg.num_cycles
        )
    # Training & Validation loop
    criterion = nn.BCEWithLogitsLoss(reduction = "mean")
    best_score = 0
    val_predictions = None
    checkpoint_prefix = f"{OUTPUT_DIR}/{cfg.model.replace('/', '-')}_fold{fold}_{cfg.seed}"

    for epoch in range(cfg.epochs):
        start_time = time.time()
        # Train
        avg_loss = train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, device, cfg)
        # Validation
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device, cfg)
        # Compute f2_score
        score, threshold = get_best_threshold(x_val, predictions, correlations)
        elapsed = time.time() - start_time
        # The threshold can be None when no candidate scored above zero.
        threshold_text = "None" if threshold is None else f"{threshold:.5f}"
        print(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        print(f'Epoch {epoch+1} - Score: {score:.4f} - Threshold: {threshold_text}')
        torch.save(
            {'model': model.state_dict(), 'predictions': predictions},
            f"{checkpoint_prefix}_epoch{epoch}.pth"
        )
        improved = score > best_score
        if improved:
            best_score = score
        # Always keep the first epoch so ``val_predictions`` is defined even when
        # no epoch ever scores above zero.
        if improved or val_predictions is None:
            print(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save(
                {'model': model.state_dict(), 'predictions': predictions},
                f"{checkpoint_prefix}_best_model.pth"
                )
            val_predictions = predictions
    torch.cuda.empty_cache()
    gc.collect()
    # Get best threshold
    best_score, best_threshold = get_best_threshold(x_val, val_predictions, correlations)
    # Save Score, Threshold
    score = {"best_score": best_score, "best_threshold": best_threshold}
    with open(f"{OUTPUT_DIR}/score.pkl", "wb") as f:
        pickle.dump(score, f)
    print(f'Our CV score is {best_score} using a threshold of {best_threshold}')

In [18]:
# Seed everything
seed_everything(CFG)

In [35]:
# Read data
train, topics, content, correlations = read_data(CFG)

 
--------------------------------------------------
train.shape: (3075850, 5)
correlations.shape: (61517, 2)


In [38]:
train = train.rename(columns={"topics_titles":"topics_texts","content_titles":"content_texts"})

In [39]:
train.isnull().sum()

topics_ids       0
content_ids      0
topics_texts     0
content_texts    0
target           0
dtype: int64

In [40]:
# Add the positive samples (every topic-content pair listed in correlations) to the training set
def add_positive_sample(train: pd.DataFrame, correlations: pd.DataFrame, topics: pd.DataFrame, content: pd.DataFrame):
    """Append every ground-truth topic/content pair that ``train`` lacks.

    All pairs from ``correlations`` are expanded to one row each, given the topic
    and content titles as texts and ``target = 1``, and concatenated after
    ``train``. Duplicate ``(topics_ids, content_ids)`` pairs keep the row that was
    already in ``train``. The result is sorted by ``topics_ids`` and re-indexed.

    Side effect: a ``content_ids_list`` column is added to ``correlations``.

    Returns:
        ``(train, add_topic_content)`` where ``add_topic_content`` lists the
        ``topics_ids + content_ids`` keys of positive pairs that were not already
        positive rows of ``train``.
    """
    topic_title_by_id = dict(zip(topics["id"], topics['title']))
    content_title_by_id = dict(zip(content["id"], content['title']))

    correlations["content_ids_list"] = correlations["content_ids"].map(lambda joined: joined.split())

    # One row per (topic, content) ground-truth pair.
    positives = (
        correlations.explode("content_ids_list")[["topic_id", "content_ids_list"]]
        .rename(columns={"topic_id": "topics_ids", "content_ids_list": "content_ids"})
    )
    positives["topics_texts"] = positives["topics_ids"].map(topic_title_by_id)
    positives["content_texts"] = positives["content_ids"].map(content_title_by_id)
    positives["target"] = 1
    positives = positives.reset_index(drop=True)

    def pair_keys(frame):
        # Concatenated topic id + content id identifies a pair.
        return (frame["topics_ids"] + frame["content_ids"]).tolist()

    # Keys of the pairs that are new to train's positive rows.
    existing_positive_keys = pair_keys(train[train["target"] == 1])
    add_topic_content = list(set(pair_keys(positives)) - set(existing_positive_keys))

    # Append the positives; on duplicates the original train row wins.
    train = (
        pd.concat([train, positives])
        .drop_duplicates(subset=["topics_ids", "content_ids"], keep='first')
        .sort_values("topics_ids")
        .reset_index(drop=True)
    )

    return train, add_topic_content

train, add_topic_content = add_positive_sample(train, correlations, topics, content)

In [None]:
# train["topics_texts"] = train["topics_texts"].apply(lambda x: x[:300])
# train["content_texts"] = train["content_texts"].apply(lambda x: x[:300])

In [42]:
train.head()

Unnamed: 0,topics_ids,content_ids,topics_texts,content_texts,target
0,t_00004da3a1b2,c_6cd1bd6f1e49,Откриването на резисторите > Открития и проект...,Диелектрици в кондензатори,0
1,t_00004da3a1b2,c_c8184b4bba5d,Откриването на резисторите > Открития и проект...,Електричен ток,0
2,t_00004da3a1b2,c_678145c4cfe4,Откриването на резисторите > Открития и проект...,Кондензатори и капацитет,0
3,t_00004da3a1b2,c_ded49059e260,Откриването на резисторите > Открития и проект...,Задача за събиране на съпротивления,0
4,t_00004da3a1b2,c_0c885859d4fa,Откриването на резисторите > Открития и проект...,Последователно свързани кондензатори,0


In [41]:
len(add_topic_content)

51859

In [43]:
# Debug mode: keep only the first 1000 rows and train for a single epoch.
if CFG.debug:
    train = train[:1000]
    # The training loop reads `CFG.epochs`; `CFG.epoch` only created an unused attribute.
    CFG.epochs = 1

In [44]:
train, level_tag_list, category_list, kind_list = merge_cat_info(train, topics, content)

In [45]:
train = preprocess(train)

In [46]:
train.head()

Unnamed: 0,topics_ids,content_ids,topics_texts,content_texts,target,text
0,t_00004da3a1b2,c_6cd1bd6f1e49,[LEVEL4][CATEGORY_SOURCE]Откриването на резист...,[KIND_VIDEO]Диелектрици в кондензатори,0,[LEVEL4][CATEGORY_SOURCE]Откриването на резист...
1,t_00004da3a1b2,c_c8184b4bba5d,[LEVEL4][CATEGORY_SOURCE]Откриването на резист...,[KIND_VIDEO]Електричен ток,0,[LEVEL4][CATEGORY_SOURCE]Откриването на резист...
2,t_00004da3a1b2,c_678145c4cfe4,[LEVEL4][CATEGORY_SOURCE]Откриването на резист...,[KIND_VIDEO]Кондензатори и капацитет,0,[LEVEL4][CATEGORY_SOURCE]Откриването на резист...
3,t_00004da3a1b2,c_ded49059e260,[LEVEL4][CATEGORY_SOURCE]Откриването на резист...,[KIND_VIDEO]Задача за събиране на съпротивления,0,[LEVEL4][CATEGORY_SOURCE]Откриването на резист...
4,t_00004da3a1b2,c_0c885859d4fa,[LEVEL4][CATEGORY_SOURCE]Откриването на резист...,[KIND_VIDEO]Последователно свързани кондензатори,0,[LEVEL4][CATEGORY_SOURCE]Откриването на резист...


In [47]:
CFG.model

'xlm-roberta-base'

In [48]:
CFG.tokenizer = AutoTokenizer.from_pretrained(
    CFG.model,
    additional_special_tokens = level_tag_list + category_list + kind_list
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [49]:
CFG.tokenizer.additional_special_tokens

['[LEVEL0]',
 '[LEVEL10]',
 '[LEVEL1]',
 '[LEVEL2]',
 '[LEVEL3]',
 '[LEVEL4]',
 '[LEVEL5]',
 '[LEVEL6]',
 '[LEVEL7]',
 '[LEVEL8]',
 '[LEVEL9]',
 '[CATEGORY_ALIGNED]',
 '[CATEGORY_SOURCE]',
 '[CATEGORY_SUPPLEMENTAL]',
 '[KIND_AUDIO]',
 '[KIND_DOCUMENT]',
 '[KIND_EXERCISE]',
 '[KIND_HTML5]',
 '[KIND_VIDEO]']

In [None]:
# Drop the negative rows whose topic category is "source" to shorten the computation time
# train["is_category_source"] = train["topics_texts"].str.contains("CATEGORY_SOURCE")
# train = train[~(train["is_category_source"] & (train["target"] == 0))].reset_index(drop=True)

In [None]:
# Reduce the number of rows so that the computation can finish
# if not CFG.debug:
#     train = train.sample(200000).reset_index(drop=True)

In [50]:
# CV split
train = cv_split(train, CFG)

In [51]:
tst = train[["topics_ids","fold"]]

In [52]:
tst.drop_duplicates().to_csv("topics_ids_fold.csv", index=False)

In [53]:
fold0_list = train[train["fold"] == 0]["topics_ids"].unique().tolist()
with open(f"{OUTPUT_DIR}/fold_0_topics_ids.pkl", "wb") as f:
    pickle.dump(fold0_list, f)

In [54]:
print(train["fold"].value_counts())

4    625884
0    625596
1    625534
3    625402
2    625293
Name: fold, dtype: int64


In [55]:
train = train.reset_index(drop=True)

In [56]:
train["target"].value_counts()

0    2847790
1     279919
Name: target, dtype: int64

In [58]:
# Get max length
get_max_length(train, CFG)

  0%|          | 0/3127709 [00:00<?, ?it/s]

max_len: 233


In [59]:
if CFG.debug:
    CFG.epochs = 1

In [60]:
# Train and evaluate one fold
train_and_evaluate_one_fold(train, correlations, 0, CFG, add_topic_content)

 


Downloading:   0%|          | 0.00/1.04G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/19547] Elapsed 0m 1s (remain 632m 22s) Loss: 0.8759(0.8759) Grad: inf  LR: 0.00000000  


KeyboardInterrupt: 

# Upload

In [None]:
import os
os.system("pip install kaggle")
os.system("mkdir -p ~/.kaggle/")
os.system("cp /notebooks/kaggle_lecr/kaggle.json ~/.kaggle/")
os.system("chmod 600 ~/.kaggle/kaggle.json")

In [None]:
from kaggle.api.kaggle_api_extended import KaggleApi
import json

def dataset_create_new(dataset_name: str, upload_dir: str):
    """Write ``dataset-metadata.json`` into ``upload_dir`` and upload it as a new Kaggle dataset.

    Args:
        dataset_name: dataset slug and title; must not contain an underscore.
        upload_dir: directory whose contents are uploaded.

    Raises:
        ValueError: if ``dataset_name`` contains ``_``.
    """
    if "_" in dataset_name:
        raise ValueError("datasetの名称に_の使用は禁止です")

    dataset_metadata = {
        'id': f'sinchir0/{dataset_name}',
        'licenses': [{'name': 'CC0-1.0'}],
        'title': dataset_name,
    }
    metadata_path = os.path.join(upload_dir, 'dataset-metadata.json')
    with open(metadata_path, 'w') as f:
        json.dump(dataset_metadata, f, indent=4)

    api = KaggleApi()
    api.authenticate()
    api.dataset_create_new(folder=upload_dir, convert_to_csv=False, dir_mode='tar')

if CFG.upload_data:
    print(f"Create Dataset name:{NOTEBOOK_NAME}, output_dir:{OUTPUT_DIR}")
    dataset_create_new(dataset_name=NOTEBOOK_NAME, upload_dir=OUTPUT_DIR)