In [1]:
# Experiment identifier: names the output directory and, when uploading, the Kaggle
# dataset (dataset_create_new rejects names containing "_", hence the hyphens).
NOTEBOOK_NAME = "ex2-trn-valid-without-source"

In [2]:
import os
# Every artifact of this run (model checkpoints, score.pkl, dataset-metadata.json)
# is written below this directory.
OUTPUT_DIR = f"/notebooks/kaggle_lecr/output/{NOTEBOOK_NAME}"
os.makedirs(OUTPUT_DIR, exist_ok=True)  # exist_ok: re-running the cell is harmless

In [3]:
# Install the pinned training stack into the interpreter that runs this kernel.
# `sys.executable -m pip` avoids picking up a `pip` that belongs to another Python,
# and check_call raises if an install fails instead of silently carrying on with
# whatever versions happen to be present.
import subprocess
import sys

for pip_args in (
    ["torch==1.12.0", "--extra-index-url", "https://download.pytorch.org/whl/cu116"],
    ["tokenizers==0.12.1"],
    ["transformers==4.20.1"],
):
    subprocess.check_call([sys.executable, "-m", "pip", "install", *pip_args])

0

In [4]:
!nvidia-smi

In [5]:
# scikit-learn is pinned separately; install it into this kernel's interpreter and
# fail loudly if the install does not succeed.
import subprocess
import sys

subprocess.check_call([sys.executable, "-m", "pip", "install", "scikit-learn==1.2.1"])

0

In [6]:
# =========================================================================================
# Libraries
# =========================================================================================
import os
import gc
import time
import pickle
import math
import random
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from torch.utils.checkpoint import checkpoint
import tokenizers
import transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_cosine_schedule_with_warmup, DataCollatorWithPadding
from sklearn.model_selection import StratifiedGroupKFold
%env TOKENIZERS_PARALLELISM=true
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [7]:
# =========================================================================================
# Configurations
# =========================================================================================
class CFG:
    """Static configuration of this experiment, read everywhere as ``cfg.<name>``.

    ``tokenizer`` is attached to the class later in the notebook, and ``max_len``
    is overwritten by ``get_max_length`` once the training texts are tokenized.
    """
    # ---- run control ----
    debug = False          # True: smoke test on a small slice of the data
    upload_data = True     # True: publish OUTPUT_DIR as a Kaggle dataset at the end
    seed = 42
    n_folds = 5
    print_freq = 500       # log every `print_freq` steps in the train / valid loops
    num_workers = 4        # DataLoader worker processes

    # ---- model ----
    model = "xlm-roberta-base"
    gradient_checkpointing = False
    max_len = 512

    # ---- optimisation ----
    epochs = 5
    batch_size = 128 # 32
    encoder_lr = 1e-5      # learning rate of the pretrained encoder
    decoder_lr = 1e-4      # learning rate of the classification head
    weight_decay = 0.01
    eps = 1e-6             # AdamW epsilon
    betas = (0.9, 0.999)   # AdamW betas
    max_grad_norm = 0.012  # threshold handed to clip_grad_norm_
    warmup_ratio = 0.1     # fraction of the training steps used for LR warm-up
    num_cycles = 0.5       # cosine schedule: half a cycle, i.e. a single decay

    # ---- data locations ----
    data_url = "/notebooks/kaggle_lecr/data/learning-equality-curriculum-recommendations"
    train_set_url = "/notebooks/kaggle_lecr/output/ex1-uns-top-n-10-para-mpnet-fold"
    finetune_url = "/notebooks/kaggle_lecr/data/ex1-finetuning-mpnet-mnrloss-ep10-fold"

In [8]:
# =========================================================================================
# Seed everything for deterministic results
# =========================================================================================
def seed_everything(cfg):
    """Seed every random number generator the notebook relies on.

    Args:
        cfg: configuration object exposing an integer ``seed`` attribute.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices) and
    puts cuDNN into deterministic mode.
    """
    seed = cfg.seed
    random.seed(seed)
    # PYTHONHASHSEED is read at interpreter start-up, so this only influences
    # Python processes launched after this point.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # cover every visible GPU, not just the current one
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner may pick different kernels from run to run; turn it off,
    # otherwise `deterministic = True` alone does not make runs repeatable.
    torch.backends.cudnn.benchmark = False

In [9]:
# =========================================================================================
# Data Loading
# =========================================================================================
def read_data(cfg):
    """Load the candidate-pair training table and the competition CSVs.

    Args:
        cfg: configuration exposing ``train_set_url`` (directory holding
            ``train.csv``) and ``data_url`` (directory holding ``topics.csv``,
            ``content.csv`` and ``correlations.csv``).

    Returns:
        Tuple ``(train, topics, content, correlations)`` of DataFrames. Missing
        values in ``train['topics_titles']`` and ``train['content_titles']`` are
        replaced with the placeholder "Title does not exist".
    """
    train = pd.read_csv(os.path.join(cfg.train_set_url, "train.csv"))
    # Assign the filled column back instead of `fillna(inplace=True)` on a column
    # selection: that is chained assignment, which pandas deprecates and which
    # does nothing once copy-on-write is active.
    for title_col in ('topics_titles', 'content_titles'):
        train[title_col] = train[title_col].fillna("Title does not exist")

    topics = pd.read_csv(os.path.join(cfg.data_url, "topics.csv"))
    content = pd.read_csv(os.path.join(cfg.data_url, "content.csv"))
    correlations = pd.read_csv(os.path.join(cfg.data_url, "correlations.csv"))

    print(' ')
    print('-' * 50)
    print(f"train.shape: {train.shape}")
    print(f"correlations.shape: {correlations.shape}")
    return train, topics, content, correlations

In [10]:
def preprocess(train: pd.DataFrame):
    """Add the model input column ``text`` to ``train`` (in place) and return it.

    ``text`` is the topic title and the content title joined by the literal
    string ``[SEP]``.
    """
    separator = '[SEP]'
    topic_part = train['topics_titles']
    content_part = train['content_titles']
    train['text'] = topic_part + separator + content_part
    return train

In [11]:
def merge_cat_info(train: pd.DataFrame, topics: pd.DataFrame, content:pd.DataFrame):
    """Prefix the titles in ``train`` with level / category / kind tag tokens.

    Args:
        train: pair table with ``topics_ids``, ``content_ids``, ``topics_titles``
            and ``content_titles`` columns. Modified in place.
        topics: table with ``id``, ``level`` and ``category`` columns.
        content: table with ``id`` and ``kind`` columns.

    Returns:
        ``(train, level_tag_list, category_list, kind_list)`` where the three
        lists hold the sorted distinct tags that were generated, e.g.
        ``[LEVEL0]``, ``[CATEGORY_SOURCE]``, ``[KIND_VIDEO]``.

    Raises:
        AttributeError: if a row has no matching category / kind (the missing
            value is a float and has no ``upper``).
        ValueError: if a merge changed the number of rows (duplicate ``id`` in
            ``topics`` or ``content``).
    """
    merged = pd.merge(train, topics[["id", "level", "category"]], left_on="topics_ids", right_on="id", how="left")
    merged = merged.drop("id", axis=1)
    merged = pd.merge(merged, content[["id", "kind"]], left_on="content_ids", right_on="id", how="left")
    merged = merged.drop("id", axis=1)

    level_tag = merged["level"].apply(lambda x: f"[LEVEL{x}]")
    category_tag = merged["category"].apply(lambda x: f"[CATEGORY_{x.upper()}]")
    kind_tag = merged["kind"].apply(lambda x: f"[KIND_{x.upper()}]")

    level_tag_list = sorted(level_tag.unique())
    category_list = sorted(category_tag.unique())
    kind_list = sorted(kind_tag.unique())

    # `merged` carries a fresh 0..n-1 index while `train` keeps its own. Assign
    # positionally (a left merge preserves row order) so that a sliced or
    # filtered `train` does not end up with NaN titles through index alignment.
    train['topics_titles'] = (level_tag + category_tag + merged['topics_titles']).to_numpy()
    train['content_titles'] = (kind_tag + merged['content_titles']).to_numpy()

    return train, level_tag_list, category_list, kind_list

In [12]:
# =========================================================================================
# CV split
# =========================================================================================
# def cv_split(train, cfg):
#     import pdb; pdb.set_trace()
#     kfold = StratifiedGroupKFold(n_splits = cfg.n_folds, shuffle = True, random_state = cfg.seed)
#     for num, (train_index, val_index) in enumerate(kfold.split(train, train['target'], train['topics_ids'])):
#         train.loc[val_index, 'fold'] = int(num)
#     train['fold'] = train['fold'].astype(int)
#     return train

In [13]:
# =========================================================================================
# F2 score metric
# =========================================================================================
def f2_score(y_true, y_pred):
    """Mean per-row F2 score between two collections of space-separated id strings.

    Args:
        y_true: iterable (e.g. a pandas Series) of strings; each string lists
            the ground-truth ids of one row, separated by whitespace.
        y_pred: iterable of strings with the predicted ids, row-aligned with
            ``y_true``.

    Returns:
        The mean of the row-wise F2 scores, rounded to 4 decimals. A row in
        which both the truth and the prediction are empty scores 0.
    """
    true_sets = [set(ids.split()) for ids in y_true]
    pred_sets = [set(ids.split()) for ids in y_pred]
    pairs = list(zip(true_sets, pred_sets))
    tp = np.array([len(t & p) for t, p in pairs], dtype=float)
    fp = np.array([len(p - t) for t, p in pairs], dtype=float)
    fn = np.array([len(t - p) for t, p in pairs], dtype=float)
    # F-beta with beta = 2: (1 + b^2) tp / ((1 + b^2) tp + b^2 fn + fp), divided through by 5.
    denominator = tp + 0.2 * fp + 0.8 * fn
    # The denominator is zero only when truth and prediction are both empty;
    # score that row 0 rather than letting 0/0 = NaN poison the mean.
    f2 = np.divide(tp, denominator, out=np.zeros_like(denominator), where=denominator > 0)
    return round(f2.mean(), 4)

In [14]:
# =========================================================================================
# Get max length
# =========================================================================================
def get_max_length(train, cfg):
    """Set ``cfg.max_len`` from the longest tokenized ``train['text']`` entry.

    Every text (missing values count as the empty string) is tokenized with
    ``cfg.tokenizer`` without special tokens; ``cfg.max_len`` becomes the
    largest token count plus 5 and is printed.
    """
    texts = train['text'].fillna("").values
    token_counts = [
        len(cfg.tokenizer(text, add_special_tokens = False)['input_ids'])
        for text in tqdm(texts, total = len(train))
    ]
    longest = max(token_counts)
    cfg.max_len = longest + 5 # cls + sep + level + category + kind
    print(f"max_len: {cfg.max_len}")

In [15]:
# =========================================================================================
# Prepare input, tokenize
# =========================================================================================
def prepare_input(text, cfg):
    """Tokenize one text into fixed-length ``torch.long`` tensors.

    Args:
        text: the string to encode.
        cfg: configuration exposing ``tokenizer`` and ``max_len``.

    Returns:
        The tokenizer output with every value converted to a ``torch.long``
        tensor. Sequences are truncated / padded to ``cfg.max_len``.
    """
    encoded = cfg.tokenizer.encode_plus(
        text,
        return_tensors = None,
        add_special_tokens = True,
        max_length = cfg.max_len,
        # `padding="max_length"` is the supported spelling of the deprecated
        # `pad_to_max_length=True`.
        padding = "max_length",
        truncation = True
    )
    for key, values in encoded.items():
        encoded[key] = torch.tensor(values, dtype = torch.long)
    return encoded

# =========================================================================================
# Custom dataset
# =========================================================================================
class custom_dataset(Dataset):
    """Dataset over the ``text`` / ``target`` columns of a DataFrame.

    Each item is ``(inputs, label)``: ``inputs`` is whatever ``prepare_input``
    returns for the row's text and ``label`` is the row's target as a float tensor.
    """

    def __init__(self, df, cfg):
        self.cfg = cfg
        self.texts = df['text'].values
        self.labels = df['target'].values

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        encoded = prepare_input(self.texts[item], self.cfg)
        target = torch.tensor(self.labels[item], dtype = torch.float)
        return encoded, target
    
# =========================================================================================
# Collate function for training
# =========================================================================================
def collate(inputs):
    """Trim padded batch tensors to the longest real sequence in the batch.

    The largest row sum of ``inputs["attention_mask"]`` gives the number of
    columns to keep; every tensor in ``inputs`` is cut to that many columns.
    ``inputs`` is updated in place and returned.
    """
    longest = int(inputs["attention_mask"].sum(axis=1).max())
    for key in list(inputs.keys()):
        inputs[key] = inputs[key][:, :longest]
    return inputs

# =========================================================================================
# Mean pooling class
# =========================================================================================
class MeanPooling(nn.Module):
    """Average the token embeddings of each sequence, ignoring masked positions."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        # Broadcast the mask over the hidden dimension so it can weight the embeddings.
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        masked_total = (last_hidden_state * mask).sum(1)
        # Clamp so that a fully masked row divides by a tiny number instead of zero.
        token_count = mask.sum(1).clamp(min=1e-9)
        return masked_total / token_count
    
# =========================================================================================
# Model
# =========================================================================================
class custom_model(nn.Module):
    """Pretrained encoder + masked mean pooling + a single-logit linear head.

    Args:
        cfg: configuration exposing ``model`` (name or path of the pretrained
            checkpoint), ``tokenizer`` (its length sets the embedding table
            size) and ``gradient_checkpointing``.
    """
    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states = True)
        # Disable dropout in the encoder. Several attribute spellings are set,
        # presumably because config classes differ in which name they read.
        self.config.hidden_dropout = 0.0
        self.config.hidden_dropout_prob = 0.0
        self.config.attention_dropout = 0.0
        self.config.attention_probs_dropout_prob = 0.0
        self.model = AutoModel.from_pretrained(cfg.model, config = self.config)
        # When the tokenizer was created with additional_special_tokens, the added
        # tokens apparently have to be registered as new vocabulary entries, i.e.
        # the embedding table must be resized to the tokenizer's length.
        # https://cocoinit23.com/pytorch-runtimeerror-cuda-error-device-side-assert-triggered/
        self.model.resize_token_embeddings(len(cfg.tokenizer))
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()
        self.pool = MeanPooling()
        # Head: pooled embedding -> one logit.
        self.fc = nn.Linear(self.config.hidden_size, 1)
        self._init_weights(self.fc)
    def _init_weights(self, module):
        """Initialise ``module`` in place according to its layer type.

        Within this class it is only called on ``self.fc``, so only the
        ``nn.Linear`` branch is exercised.
        """
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
    def feature(self, inputs):
        """Return the mean-pooled last hidden state for a batch of tokenizer outputs.

        ``inputs`` is unpacked into the encoder call and must contain
        ``attention_mask``, which is also used as the pooling mask.
        """
        outputs = self.model(**inputs)
        last_hidden_state = outputs.last_hidden_state
        feature = self.pool(last_hidden_state, inputs['attention_mask'])
        return feature
    def forward(self, inputs):
        """Return the head's raw logits (no sigmoid applied) for ``inputs``."""
        feature = self.feature(inputs)
        output = self.fc(feature)
        return output
    
# =========================================================================================
# Helper functions
# =========================================================================================
class AverageMeter:
    """Tracks the most recent value and the weighted running average of a metric."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear all statistics."""
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as the latest value, weighted by ``n`` observations."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count

def asMinutes(s):
    """Format a duration given in seconds as ``'<minutes>m <seconds>s'``."""
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)

def timeSince(since, percent):
    """Describe the elapsed time and the estimated time remaining.

    Args:
        since: start timestamp as returned by ``time.time()``.
        percent: fraction of the work completed so far (must be non-zero).

    Returns:
        A string of the form ``'<elapsed> (remain <remaining>)'``.
    """
    elapsed = time.time() - since
    projected_total = elapsed / (percent)
    return '%s (remain %s)' % (asMinutes(elapsed), asMinutes(projected_total - elapsed))

# =========================================================================================
# Train function loop
# =========================================================================================
def train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, device, cfg):
    """Run one training epoch with mixed precision and gradient clipping.

    Args:
        train_loader: yields ``(inputs, target)`` batches.
        model: network called as ``model(inputs)``; its output is flattened
            before being passed to ``criterion``.
        criterion: loss called as ``criterion(predictions, target)``.
        optimizer: optimizer stepped once per batch.
        epoch: zero-based epoch index (used for logging only).
        scheduler: learning-rate scheduler stepped once per batch.
        device: device the batch tensors are moved to.
        cfg: configuration exposing ``max_grad_norm`` and ``print_freq``.

    Returns:
        The sample-weighted average training loss of the epoch.
    """
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled = True)
    losses = AverageMeter()
    start = time.time()
    n_steps = len(train_loader)
    for step, (inputs, target) in enumerate(train_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        target = target.to(device)
        batch_size = target.size(0)
        with torch.cuda.amp.autocast(enabled = True):
            y_preds = model(inputs)
            loss = criterion(y_preds.view(-1), target)
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        # Unscale before clipping so the threshold applies to the true gradient norm.
        scaler.unscale_(optimizer)
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), cfg.max_grad_norm)
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()
        scheduler.step()
        if step % cfg.print_freq == 0 or step == (n_steps - 1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch + 1,
                          step,
                          n_steps,
                          remain = timeSince(start, float(step + 1) / n_steps),
                          loss = losses,
                          grad_norm = grad_norm,
                          # get_last_lr() is the public accessor; get_lr() is meant
                          # to be called only from inside scheduler.step().
                          lr = scheduler.get_last_lr()[0]))
    return losses.avg

# =========================================================================================
# Valid function loop
# =========================================================================================
def valid_fn(valid_loader, model, criterion, device, cfg):
    """Evaluate ``model`` on ``valid_loader`` without updating it.

    Args:
        valid_loader: yields ``(inputs, target)`` batches.
        model: network called as ``model(inputs)``; returns logits.
        criterion: loss called as ``criterion(predictions, target)``.
        device: device the batch tensors are moved to.
        cfg: configuration exposing ``print_freq``.

    Returns:
        ``(avg_loss, predictions)``: the sample-weighted average loss and a 1-D
        NumPy array with the sigmoid of every logit, in loader order.
    """
    model.eval()
    loss_meter = AverageMeter()
    batch_probs = []
    n_batches = len(valid_loader)
    started = time.time()
    for step, (inputs, target) in enumerate(valid_loader):
        inputs = collate(inputs)
        for key, tensor in inputs.items():
            inputs[key] = tensor.to(device)
        target = target.to(device)
        with torch.no_grad():
            logits = model(inputs)
        loss = criterion(logits.view(-1), target)
        loss_meter.update(loss.item(), target.size(0))
        probs = logits.sigmoid().squeeze().to('cpu').numpy().reshape(-1)
        batch_probs.append(probs)
        is_last_batch = step == (n_batches - 1)
        if step % cfg.print_freq == 0 or is_last_batch:
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step,
                          n_batches,
                          loss = loss_meter,
                          remain = timeSince(started, float(step + 1) / n_batches)))
    return loss_meter.avg, np.concatenate(batch_probs, axis = 0)

# =========================================================================================
# Get best threshold
# =========================================================================================
def get_best_threshold(x_val, val_predictions, correlations):
    """Grid-search the probability cut-off that maximises the validation F2 score.

    Args:
        x_val: validation pair table with ``topics_ids`` and ``content_ids``
            columns. It is not modified.
        val_predictions: one predicted probability per row of ``x_val``.
        correlations: ground truth with ``topic_id`` and ``content_ids``
            (space-separated ids) columns.

    Returns:
        ``(best_score, best_threshold)``. Thresholds run from 0.001 towards 0.1
        in steps of 0.001. If no threshold scores above 0, the first threshold
        is returned together with a score of 0.
    """
    val_predictions = np.asarray(val_predictions)
    # Loop-invariant pieces: the id columns and the full list of validation topics.
    pairs = x_val[['topics_ids', 'content_ids']]
    all_topics = pd.Series(x_val['topics_ids'].unique())

    best_score = 0
    best_threshold = None
    for thres in np.arange(0.001, 0.1, 0.001):
        if best_threshold is None:
            # Always hand back a usable threshold, even if every score is 0.
            best_threshold = thres
        # Pairs predicted positive at this threshold, collapsed to one row per topic.
        kept = pairs[val_predictions > thres]
        predicted = kept.groupby(['topics_ids'])['content_ids'].unique().reset_index()
        predicted['content_ids'] = predicted['content_ids'].apply(lambda x: ' '.join(x))
        predicted.columns = ['topic_id', 'predictions']
        # Topics without any positive pair get an empty prediction string.
        missing = all_topics[~all_topics.isin(predicted['topic_id'])]
        unpredicted = pd.DataFrame({'topic_id': missing.values, 'predictions': ""})
        result = pd.concat([predicted, unpredicted], axis = 0, ignore_index = True)
        result = result.merge(correlations, how = 'left', on = 'topic_id')
        score = f2_score(result['content_ids'], result['predictions'])
        if score > best_score:
            best_score = score
            best_threshold = thres
    return best_score, best_threshold
    
# =========================================================================================
# Train & Evaluate
# =========================================================================================
def train_and_evaluate_one_fold(train, correlations, fold, cfg):
    """Train on every fold except ``fold`` and evaluate on ``fold``.

    Args:
        train: pair table with ``fold``, ``text``, ``target``, ``topics_ids``,
            ``content_ids`` and ``topics_titles`` columns.
        correlations: ground truth handed to ``get_best_threshold``.
        fold: fold number used for validation.
        cfg: experiment configuration (model, optimiser and schedule settings).

    Side effects:
        Saves the best epoch's weights and predictions under ``OUTPUT_DIR`` and
        writes the best score / threshold to ``OUTPUT_DIR/score.pkl``.
        NOTE(review): ``score.pkl`` does not include the fold number, so running
        several folds overwrites it.

    Raises:
        RuntimeError: if ``cfg.epochs`` is 0, so that no predictions exist.
    """
    print(' ')
    print(f"========== fold: {fold} training ==========")
    # Split train & validation
    x_train = train[train['fold'] != fold]
    x_val = train[train['fold'] == fold]

    # Topics whose category is "source" are excluded from evaluation.
    # .copy() detaches the validation frame from `train`, so later column writes
    # can neither leak into the parent frame nor trigger chained-assignment issues.
    x_val = x_val[~x_val["topics_titles"].str.contains("CATEGORY_SOURCE")].copy()

    train_dataset = custom_dataset(x_train, cfg)
    valid_dataset = custom_dataset(x_val, cfg)
    train_loader = DataLoader(
        train_dataset,
        batch_size = cfg.batch_size,
        shuffle = True,
        num_workers = cfg.num_workers,
        pin_memory = True,
        drop_last = True
    )
    valid_loader = DataLoader(
        valid_dataset,
        batch_size = cfg.batch_size,
        shuffle = False,
        num_workers = cfg.num_workers,
        pin_memory = True,
        drop_last = False
    )
    # Get model
    model = custom_model(cfg)
    model.to(device)
    # Optimizer
    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay = 0.0):
        """Build the parameter groups: encoder with / without weight decay, then the head."""
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
            'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
            'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
            'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters
    optimizer_parameters = get_optimizer_params(
        model,
        encoder_lr = cfg.encoder_lr,
        decoder_lr = cfg.decoder_lr,
        weight_decay = cfg.weight_decay
    )
    optimizer = AdamW(
        optimizer_parameters,
        lr = cfg.encoder_lr,
        eps = cfg.eps,
        betas = cfg.betas
    )
    # The scheduler is stepped once per batch. Count the batches the loader really
    # yields (drop_last=True discards the final partial batch) so the cosine
    # schedule ends exactly at the last optimisation step.
    num_train_steps = len(train_loader) * cfg.epochs
    num_warmup_steps = int(num_train_steps * cfg.warmup_ratio)
    # Scheduler
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps = num_warmup_steps,
        num_training_steps = num_train_steps,
        num_cycles = cfg.num_cycles
        )
    # Training & Validation loop
    criterion = nn.BCEWithLogitsLoss(reduction = "mean")
    best_score = 0
    val_predictions = None
    for epoch in range(cfg.epochs):
        start_time = time.time()
        # Train
        avg_loss = train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, device, cfg)
        # Validation
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device, cfg)
        # Compute f2_score
        score, threshold = get_best_threshold(x_val, predictions, correlations)
        elapsed = time.time() - start_time
        print(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        # The threshold search may report None when nothing scored above 0.
        threshold_text = f'{threshold:.5f}' if threshold is not None else 'n/a'
        print(f'Epoch {epoch+1} - Score: {score:.4f} - Threshold: {threshold_text}')
        # Always keep the first epoch, so a checkpoint and predictions exist even
        # when no epoch manages to score above 0.
        if val_predictions is None or score > best_score:
            best_score = score
            print(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save(
                {'model': model.state_dict(), 'predictions': predictions},
                f"{OUTPUT_DIR}/{cfg.model.replace('/', '-')}_fold{fold}_{cfg.seed}.pth"
                )
            val_predictions = predictions
    torch.cuda.empty_cache()
    gc.collect()
    if val_predictions is None:
        raise RuntimeError("No validation predictions were produced; cfg.epochs must be at least 1.")
    # Get best threshold
    best_score, best_threshold = get_best_threshold(x_val, val_predictions, correlations)
    # Save Score, Threshold
    score = {"best_score": best_score, "best_threshold": best_threshold}
    with open(f"{OUTPUT_DIR}/score.pkl", "wb") as f:
        pickle.dump(score, f)
    print(f'Our CV score is {best_score} using a threshold of {best_threshold}')

In [16]:
# Seed everything
seed_everything(CFG)

In [17]:
# Read data
train, topics, content, correlations = read_data(CFG)

In [18]:
if CFG.debug:
    # Smoke-test mode: a small slice of the data and a single epoch.
    train = train[:1000]
    # The training loop reads `cfg.epochs`; assigning `CFG.epoch` would only
    # create an unused attribute and leave the full epoch count in place.
    CFG.epochs = 1

In [19]:
train, level_tag_list, category_list, kind_list = merge_cat_info(train, topics, content)

In [20]:
level_tag_list

['[LEVEL0]',
 '[LEVEL10]',
 '[LEVEL1]',
 '[LEVEL2]',
 '[LEVEL3]',
 '[LEVEL4]',
 '[LEVEL5]',
 '[LEVEL6]',
 '[LEVEL7]',
 '[LEVEL8]',
 '[LEVEL9]']

In [21]:
category_list

['[CATEGORY_ALIGNED]', '[CATEGORY_SOURCE]', '[CATEGORY_SUPPLEMENTAL]']

In [22]:
kind_list

['[KIND_AUDIO]',
 '[KIND_DOCUMENT]',
 '[KIND_EXERCISE]',
 '[KIND_HTML5]',
 '[KIND_VIDEO]']

In [23]:
train = preprocess(train)

In [24]:
train.head()

Unnamed: 0,topics_ids,content_ids,topics_titles,content_titles,target,text
0,t_30dd476279c8,c_a7926808742b,[LEVEL0][CATEGORY_SUPPLEMENTAL]Medicine,[KIND_HTML5]Medicine,0,[LEVEL0][CATEGORY_SUPPLEMENTAL]Medicine[SEP][K...
1,t_30dd476279c8,c_7f13d437e8a9,[LEVEL0][CATEGORY_SUPPLEMENTAL]Medicine,[KIND_EXERCISE]Medicine Practice,0,[LEVEL0][CATEGORY_SUPPLEMENTAL]Medicine[SEP][K...
2,t_30dd476279c8,c_9535009492ec,[LEVEL0][CATEGORY_SUPPLEMENTAL]Medicine,[KIND_VIDEO]The Science of Healthy Living,0,[LEVEL0][CATEGORY_SUPPLEMENTAL]Medicine[SEP][K...
3,t_30dd476279c8,c_1a30551d75b7,[LEVEL0][CATEGORY_SUPPLEMENTAL]Medicine,[KIND_VIDEO]Misuse of Medicines,0,[LEVEL0][CATEGORY_SUPPLEMENTAL]Medicine[SEP][K...
4,t_30dd476279c8,c_6dfde5fa8570,[LEVEL0][CATEGORY_SUPPLEMENTAL]Medicine,[KIND_DOCUMENT]Garden of Medicines,0,[LEVEL0][CATEGORY_SUPPLEMENTAL]Medicine[SEP][K...


In [25]:
train["text"][0]

'[LEVEL0][CATEGORY_SUPPLEMENTAL]Medicine[SEP][KIND_HTML5]Medicine'

In [26]:
# Register every generated tag (level / category / kind) as an additional special
# token of the tokenizer. The tokenizer is stored on CFG because the dataset and
# the model read it from there (the model resizes its embeddings to len(tokenizer)).
CFG.tokenizer = AutoTokenizer.from_pretrained(
    CFG.model,
    additional_special_tokens = level_tag_list + category_list + kind_list
)

In [27]:
CFG.tokenizer.additional_special_tokens

['[LEVEL0]',
 '[LEVEL10]',
 '[LEVEL1]',
 '[LEVEL2]',
 '[LEVEL3]',
 '[LEVEL4]',
 '[LEVEL5]',
 '[LEVEL6]',
 '[LEVEL7]',
 '[LEVEL8]',
 '[LEVEL9]',
 '[CATEGORY_ALIGNED]',
 '[CATEGORY_SOURCE]',
 '[CATEGORY_SUPPLEMENTAL]',
 '[KIND_AUDIO]',
 '[KIND_DOCUMENT]',
 '[KIND_EXERCISE]',
 '[KIND_HTML5]',
 '[KIND_VIDEO]']

In [28]:
# CV split
# train = cv_split(train, CFG)

# Use Same Fold By FineTuning
# NOTE(review): pickle.load can execute arbitrary code from the file; only load
# artifacts produced by your own earlier notebooks.
with open(CFG.finetune_url + "/fine_tunining_topic_id_fold.pkl", "rb") as f:
    fine_tunining_topic_id_fold = pickle.load(f)
    
# Presumably a mapping from topic id to fold number -- verify against the notebook
# that wrote it. Topic ids missing from the mapping become NaN here; because
# NaN != fold is True, such rows would end up in the training split of every fold.
train["fold"] = train["topics_ids"].map(fine_tunining_topic_id_fold)

In [29]:
print(train["fold"].value_counts())

In [30]:
train[(train["fold"] == 0)]

Unnamed: 0,topics_ids,content_ids,topics_titles,content_titles,target,text,fold
10,t_dcb302962e2f,c_50645593cc08,[LEVEL0][CATEGORY_SUPPLEMENTAL]Laboratoria,[KIND_HTML5]Aula Prática,0,[LEVEL0][CATEGORY_SUPPLEMENTAL]Laboratoria[SEP...,0
11,t_dcb302962e2f,c_1a0c00d75837,[LEVEL0][CATEGORY_SUPPLEMENTAL]Laboratoria,[KIND_HTML5]Processos,0,[LEVEL0][CATEGORY_SUPPLEMENTAL]Laboratoria[SEP...,0
12,t_dcb302962e2f,c_91e1f12935b9,[LEVEL0][CATEGORY_SUPPLEMENTAL]Laboratoria,[KIND_HTML5]Herramientas de testing,0,[LEVEL0][CATEGORY_SUPPLEMENTAL]Laboratoria[SEP...,0
13,t_dcb302962e2f,c_758f12ca153b,[LEVEL0][CATEGORY_SUPPLEMENTAL]Laboratoria,[KIND_HTML5]ACTIVIDADES DE EXPERIMENTACIÓN,0,[LEVEL0][CATEGORY_SUPPLEMENTAL]Laboratoria[SEP...,0
14,t_dcb302962e2f,c_56b8eaaeba9d,[LEVEL0][CATEGORY_SUPPLEMENTAL]Laboratoria,[KIND_HTML5]Experimentos Científicos,0,[LEVEL0][CATEGORY_SUPPLEMENTAL]Laboratoria[SEP...,0
...,...,...,...,...,...,...,...
613225,t_7f847077b21d,c_f61f8c4c1a6b,[LEVEL5][CATEGORY_SOURCE]Topic F: Distributive...,[KIND_EXERCISE]Distributive property,1,[LEVEL5][CATEGORY_SOURCE]Topic F: Distributive...,0
613226,t_7f847077b21d,c_04098335d55f,[LEVEL5][CATEGORY_SOURCE]Topic F: Distributive...,[KIND_VIDEO]Identifying values in scale copies,0,[LEVEL5][CATEGORY_SOURCE]Topic F: Distributive...,0
613227,t_7f847077b21d,c_d7af3ab1f656,[LEVEL5][CATEGORY_SOURCE]Topic F: Distributive...,[KIND_VIDEO]Multiplying by tens word problem,0,[LEVEL5][CATEGORY_SOURCE]Topic F: Distributive...,0
613228,t_7f847077b21d,c_a84b0c82371d,[LEVEL5][CATEGORY_SOURCE]Topic F: Distributive...,[KIND_EXERCISE]Convert to smaller units (mL an...,0,[LEVEL5][CATEGORY_SOURCE]Topic F: Distributive...,0


In [31]:
# Get max length
get_max_length(train, CFG)

  0%|          | 0/615170 [00:00<?, ?it/s]

In [32]:
# Train and evaluate one fold
train_and_evaluate_one_fold(train, correlations, 0, CFG)

# Upload

In [33]:
import os
import shutil
import subprocess
import sys

# Install the Kaggle client into this kernel's interpreter; check_call raises if
# the install fails instead of silently continuing.
subprocess.check_call([sys.executable, "-m", "pip", "install", "kaggle"])

# Copy the API token into ~/.kaggle and restrict it to the owner (mode 600),
# using the standard library so that a failing step raises instead of being ignored.
kaggle_dir = os.path.expanduser("~/.kaggle")
os.makedirs(kaggle_dir, exist_ok=True)
kaggle_token = shutil.copy("/notebooks/kaggle_lecr/kaggle.json", kaggle_dir)
os.chmod(kaggle_token, 0o600)

0

In [34]:
from kaggle.api.kaggle_api_extended import KaggleApi
import json

def dataset_create_new(dataset_name: str, upload_dir: str, owner: str = "sinchir0"):
    """Publish ``upload_dir`` as a new Kaggle dataset.

    Writes ``dataset-metadata.json`` into ``upload_dir`` and then creates the
    dataset through the Kaggle API.

    Args:
        dataset_name: used both as the dataset slug and as its title; must not
            contain an underscore.
        upload_dir: directory whose contents are uploaded.
        owner: Kaggle account that will own the dataset. Defaults to the
            account this notebook was written for.

    Raises:
        ValueError: if ``dataset_name`` contains ``_``.
    """
    if "_" in dataset_name:
        # The message reads: "using _ in the dataset name is prohibited".
        raise ValueError("datasetの名称に_の使用は禁止です")
    dataset_metadata = {
        'id': f'{owner}/{dataset_name}',
        'licenses': [{'name': 'CC0-1.0'}],
        'title': dataset_name,
    }
    metadata_path = os.path.join(upload_dir, 'dataset-metadata.json')
    with open(metadata_path, 'w', encoding='utf-8') as f:
        json.dump(dataset_metadata, f, indent=4)
    api = KaggleApi()
    api.authenticate()
    api.dataset_create_new(folder=upload_dir, convert_to_csv=False, dir_mode='tar')

if CFG.upload_data:
    print(f"Create Dataset name:{NOTEBOOK_NAME}, output_dir:{OUTPUT_DIR}")
    dataset_create_new(dataset_name=NOTEBOOK_NAME, upload_dir=OUTPUT_DIR)