In [1]:
NOTEBOOK_NAME = "ex15-trn-lightgbm-without-valid-pos-sample"

In [2]:
import os
OUTPUT_DIR = f"/notebooks/kaggle_lecr/output/{NOTEBOOK_NAME}"
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [3]:
os.system("pip3 install torch==1.12.0 --extra-index-url https://download.pytorch.org/whl/cu116")
os.system("pip install tokenizers==0.12.1")
os.system("pip install transformers==4.20.1")

0

In [4]:
!nvidia-smi

In [5]:
import os
os.system('pip install python-dotenv')

from dotenv import load_dotenv
load_dotenv()

True

In [6]:
os.system("pip install scikit-learn==1.2.1")

0

In [7]:
!pip install lightgbm

In [8]:
# =========================================================================================
# Libraries
# =========================================================================================
import os
import gc
import time
import pickle
import math
import random
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from torch.utils.checkpoint import checkpoint
import lightgbm as lgb
import tokenizers
import transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_cosine_schedule_with_warmup, DataCollatorWithPadding
from sklearn.model_selection import StratifiedGroupKFold
%env TOKENIZERS_PARALLELISM=true
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [9]:
# =========================================================================================
# Configurations
# =========================================================================================
class CFG:
    # Notebook-wide configuration. The class itself (not an instance) is passed
    # around as `cfg`, and its attributes are read directly.

    # --- run switches ---
    debug = False  # when True, `train` is cut to its first 1000 rows after the CV split
    upload_data = True  # when True, OUTPUT_DIR is uploaded as a Kaggle dataset at the end
    wandb = True  # when True, a Weights & Biases run is initialised
    print_freq = 500  # not referenced by any cell visible in this notebook
    num_workers = 4  # DataLoader worker count used by get_vector

    # --- embedding model ---
    # model = "xlm-roberta-base"
    model = "/notebooks/kaggle_lecr/data/lecr-finetune-para-mpnet-parent-text-data/paraphrase-multilingual-mpnet-base-v2-exp"
    # Loaded eagerly, at the moment this class body is executed.
    tokenizer = AutoTokenizer.from_pretrained(model)

    # --- neural-training hyperparameters ---
    # None of the settings in this group are referenced by the cells visible in
    # this notebook; only batch_size is used, by get_vector.
    gradient_checkpointing = False
    num_cycles = 0.5
    warmup_ratio = 0.1
    encoder_lr = 1e-5
    decoder_lr = 1e-4
    eps = 1e-6
    betas = (0.9, 0.999)
    batch_size = 128#368# 32#128#64#32
    weight_decay = 0.01
    max_grad_norm = 0.012
    max_len = 512  # not applied by prepare_input, which tokenizes without truncation

    # --- cross-validation / reproducibility ---
    n_folds = 5  # number of StratifiedGroupKFold splits in cv_split
    seed = 42  # used by seed_everything and as the CV random_state
    epochs = 5  # not referenced by any cell visible in this notebook

    # --- data locations (absolute paths on the training machine) ---
    data_url = "/notebooks/kaggle_lecr/data/learning-equality-curriculum-recommendations"
    train_set_url = "/notebooks/kaggle_lecr/output/ex12-uns-top50"

    # --- feature column names ---
    vector_name = [f"feat_{i}" for i in range(768)]  # not referenced in the visible cells
    svd_n_components = 100
    # Column names under which the SVD-reduced vectors are stored in `train`.
    svd_vector_name = [f"svd_feat_{i}" for i in range(svd_n_components)]

In [10]:
# ====================================================
# wandb
# ====================================================
if CFG.wandb:
    os.system('pip install wandb==0.13.3')
    import wandb

    try:
        # for kaggle
        # from kaggle_secrets import UserSecretsClient
        # user_secrets = UserSecretsClient()
        # secret_value_0 = user_secrets.get_secret("wandb_api")

        # for paperspace
        secret_value_0 = os.getenv('WANDB_API_KEY')
        wandb.login(key=secret_value_0)

        anony = None
    except Exception:
        # Any login failure falls back to an anonymous run. `Exception` (rather
        # than a bare `except`) lets KeyboardInterrupt / SystemExit propagate,
        # so the cell can still be interrupted.
        anony = "must"
        print('If you want to use your W&B account, go to Add-ons -> Secrets and provide your W&B access token. Use the Label name as wandb_api. \nGet your W&B access token from here: https://wandb.ai/authorize')


    def class2dict(f):
        """Return every attribute of ``f`` whose name does not start with ``__`` as a dict."""
        return dict((name, getattr(f, name)) for name in dir(f) if not name.startswith('__'))

    # Start the run; the full CFG attribute set is logged as the run config.
    run = wandb.init(project='LECR',
                     entity="sinchir0",
                     name=NOTEBOOK_NAME,
                     config=class2dict(CFG),
                     group="trn",
                     job_type="train",
                     anonymous=anony)

In [11]:
# =========================================================================================
# Seed everything for deterministic results
# =========================================================================================
def seed_everything(cfg):
    """Seed the Python, NumPy and PyTorch random generators from ``cfg.seed``.

    Also exports the seed as ``PYTHONHASHSEED`` and switches cuDNN to its
    deterministic mode.

    Args:
        cfg: Any object exposing an integer ``seed`` attribute.
    """
    seed = cfg.seed
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)
    torch.backends.cudnn.deterministic = True

In [12]:
# =========================================================================================
# Data Loading
# =========================================================================================
def read_data(cfg):
    """Load the candidate-pair table and the competition CSV files.

    Reads ``train_50.csv`` from ``cfg.train_set_url`` and ``topics.csv``,
    ``content.csv`` and ``correlations.csv`` from ``cfg.data_url``. Missing
    values in the text columns are replaced by empty strings, and the shapes of
    ``train`` and ``correlations`` are printed.

    Args:
        cfg: Object exposing ``train_set_url`` and ``data_url`` path strings.

    Returns:
        Tuple ``(train, topics, content, correlations)`` of DataFrames.
    """
    train = pd.read_csv(f"{cfg.train_set_url}/train_50.csv")
    train["content_titles"] = train["content_titles"].fillna("")

    topics, content, correlations = (
        pd.read_csv(f"{cfg.data_url}/{name}.csv")
        for name in ("topics", "content", "correlations")
    )

    # Text columns that must not contain NaN, per frame.
    text_columns = (
        (topics, ("title", "description")),
        (content, ("title", "description", "text")),
    )
    for frame, columns in text_columns:
        for column in columns:
            frame[column] = frame[column].fillna("")

    print(' ')
    print('-' * 50)
    print(f"train.shape: {train.shape}")
    print(f"correlations.shape: {correlations.shape}")
    return train, topics, content, correlations

In [13]:
# =========================================================================================
# F2 score metric
# =========================================================================================
def f2_score(y_true, y_pred):
    """Mean per-row F2 score between two Series of space-separated id strings.

    Each element is split on whitespace into a set of ids. For every row the
    score is ``tp / (tp + 0.2 * fp + 0.8 * fn)``, and the mean over rows is
    returned rounded to four decimals.

    Precision and recall are intentionally not computed: they were unused and
    produced 0/0 warnings for rows with an empty prediction.

    Args:
        y_true: pandas Series of strings holding the ground-truth ids.
        y_pred: pandas Series of strings holding the predicted ids
            (an empty string means "no prediction").

    Returns:
        The mean F2 score, rounded to 4 decimal places.
    """
    true_sets = y_true.apply(lambda x: set(x.split()))
    pred_sets = y_pred.apply(lambda x: set(x.split()))
    pairs = list(zip(true_sets, pred_sets))
    tp = np.array([len(truth & pred) for truth, pred in pairs])
    fp = np.array([len(pred - truth) for truth, pred in pairs])
    fn = np.array([len(truth - pred) for truth, pred in pairs])
    f2 = tp / (tp + 0.2 * fp + 0.8 * fn)
    return round(f2.mean(), 4)

In [14]:
# =========================================================================================
# Get best threshold
# =========================================================================================
def get_best_threshold(x_val, val_predictions, correlations):
    """Scan probability thresholds and return the one with the best F2 score.

    For each threshold in ``np.arange(0.001, 0.1, 0.001)`` the pairs whose
    prediction exceeds the threshold are grouped per topic into a
    space-separated prediction string. Topics without any selected pair get an
    empty prediction. The result is left-joined with ``correlations`` on
    ``topic_id`` and scored with ``f2_score``.

    The caller's ``x_val`` is not modified: the scan runs on a private copy of
    the two id columns.

    Args:
        x_val: DataFrame with ``topics_ids`` and ``content_ids`` columns.
        val_predictions: Scores for the rows of ``x_val``, in the same order.
        correlations: DataFrame with ``topic_id`` and ``content_ids`` columns.

    Returns:
        ``(best_score, best_threshold)``. If no threshold scores above 0 the
        result is ``(0, None)``.
    """
    # Only the id columns are needed, so copy just those instead of writing a
    # scratch column into the caller's frame on every iteration.
    pairs = x_val[['topics_ids', 'content_ids']].copy()
    # Loop-invariant: the full set of topics being evaluated.
    all_topics = pd.Series(pairs['topics_ids'].unique())

    best_score = 0
    best_threshold = None
    for thres in np.arange(0.001, 0.1, 0.001):
        pairs['predictions'] = np.where(val_predictions > thres, 1, 0)
        selected = pairs[pairs['predictions'] == 1]
        selected = selected.groupby(['topics_ids'])['content_ids'].unique().reset_index()
        selected['content_ids'] = selected['content_ids'].apply(lambda x: ' '.join(x))
        selected.columns = ['topic_id', 'predictions']
        # Topics for which nothing passed the threshold get an empty prediction.
        unmatched = all_topics[~all_topics.isin(selected['topic_id'])]
        unmatched = pd.DataFrame({'topic_id': unmatched.values, 'predictions': ""})
        scored = pd.concat([selected, unmatched], axis = 0, ignore_index = True)
        scored = scored.merge(correlations, how = 'left', on = 'topic_id')
        score = f2_score(scored['content_ids'], scored['predictions'])
        if score > best_score:
            best_score = score
            best_threshold = thres
    return best_score, best_threshold


In [15]:
# Seed everything
seed_everything(CFG)

In [16]:
# Read data
train, topics, content, correlations = read_data(CFG)

In [17]:
train = train.rename(columns={"topics_titles":"topics_texts","content_titles":"content_texts"})

In [18]:
train.isnull().sum()

topics_ids       0
content_ids      0
topics_texts     0
content_texts    0
target           0
dtype: int64

In [19]:
# Add positive samples
def add_positive_sample(train: pd.DataFrame, correlations: pd.DataFrame, topics: pd.DataFrame, content: pd.DataFrame):
    """Append every ground-truth (topic, content) pair that ``train`` is missing.

    ``correlations`` is expanded to one row per (topic, content) pair, labelled
    ``target = 1`` and given topic/content titles. These rows are concatenated
    after ``train`` and duplicates on (``topics_ids``, ``content_ids``) are
    dropped keeping the first occurrence, so rows already present in ``train``
    win. The result is sorted by ``topics_ids`` with a fresh index.

    ``correlations`` is left untouched (no helper column is added to it).

    Args:
        train: Candidate pairs with ``topics_ids``, ``content_ids``,
            ``topics_texts``, ``content_texts`` and ``target`` columns.
        correlations: Ground truth with ``topic_id`` and space-separated
            ``content_ids`` columns.
        topics: Topic table with ``id`` and ``title`` columns.
        content: Content table with ``id`` and ``title`` columns.

    Returns:
        ``(train, add_topic_content)`` where ``add_topic_content`` is a list of
        ``topic_id + content_id`` key strings for the ground-truth pairs that
        were not already positive rows of the input ``train``.
    """
    # topic_text_dict = dict(zip(topics["id"], topics['title'] + " " + topics['description']))
    # content_text_dict = dict(zip(content["id"], content['title'] + " " + content['description']))
    topic_text_dict = dict(zip(topics["id"], topics['title']))
    content_text_dict = dict(zip(content["id"], content['title']))

    # One row per ground-truth pair, built on a new frame so the caller's
    # `correlations` does not gain an extra column.
    all_positive_sample = (
        correlations[["topic_id"]]
        .assign(content_ids=correlations["content_ids"].apply(lambda x: x.split()))
        .explode("content_ids")
        .rename(columns={"topic_id": "topics_ids"})
        .reset_index(drop=True)
    )
    all_positive_sample["topics_texts"] = all_positive_sample["topics_ids"].map(topic_text_dict)
    all_positive_sample["content_texts"] = all_positive_sample["content_ids"].map(content_text_dict)
    all_positive_sample["target"] = 1

    # Keys of the pairs that are positive in the ground truth but were not
    # already positive rows of `train`.
    all_positive_keys = set(all_positive_sample["topics_ids"] + all_positive_sample["content_ids"])
    train_positive = train[train["target"] == 1]
    train_positive_keys = set(train_positive["topics_ids"] + train_positive["content_ids"])
    add_topic_content = list(all_positive_keys - train_positive_keys)

    # Append the positive samples to train
    train = pd.concat([train, all_positive_sample]).drop_duplicates(subset=["topics_ids","content_ids"], keep='first')
    train = train.sort_values("topics_ids")
    train = train.reset_index(drop=True)

    return train, add_topic_content

train, add_topic_content = add_positive_sample(train, correlations, topics, content)

In [20]:
# train["topics_texts"] = train["topics_texts"].apply(lambda x: x[:300])
# train["content_texts"] = train["content_texts"].apply(lambda x: x[:300])

In [21]:
train.head()

Unnamed: 0,topics_ids,content_ids,topics_texts,content_texts,target
0,t_00004da3a1b2,c_6cd1bd6f1e49,Откриването на резисторите > Открития и проект...,Диелектрици в кондензатори,0
1,t_00004da3a1b2,c_c8184b4bba5d,Откриването на резисторите > Открития и проект...,Електричен ток,0
2,t_00004da3a1b2,c_678145c4cfe4,Откриването на резисторите > Открития и проект...,Кондензатори и капацитет,0
3,t_00004da3a1b2,c_ded49059e260,Откриването на резисторите > Открития и проект...,Задача за събиране на съпротивления,0
4,t_00004da3a1b2,c_0c885859d4fa,Откриването на резисторите > Открития и проект...,Последователно свързани кондензатори,0


In [22]:
# Join topic and content texts into a single input string
def preprocess(train: pd.DataFrame):
    """Add a ``text`` column of the form ``<topics_texts>[SEP]<content_texts>``.

    The column is written onto ``train`` itself, and the same frame is returned.
    """
    topic_part = train['topics_texts']
    content_part = train['content_texts']
    train['text'] = topic_part + '[SEP]' + content_part
    return train

train = preprocess(train)

In [23]:
# Assign cross-validation folds
def cv_split(train, cfg):
    """Add an integer ``fold`` column using StratifiedGroupKFold.

    Rows are stratified on ``target`` and grouped by ``topics_ids``, with
    ``cfg.n_folds`` splits, shuffling, and ``cfg.seed`` as random state. Each
    row receives the number of the split in which it is a validation row.

    ``split()`` yields positional indices, so folds are written by position.
    This keeps the assignment correct even when ``train`` does not carry a
    default 0..n-1 index.

    Args:
        train: DataFrame with ``target`` and ``topics_ids`` columns.
        cfg: Object exposing ``n_folds`` and ``seed``.

    Returns:
        The same ``train`` frame, with the ``fold`` column added.
    """
    kfold = StratifiedGroupKFold(n_splits = cfg.n_folds, shuffle = True, random_state = cfg.seed)
    # -1 marks "not yet assigned"; every row is overwritten by exactly one split.
    folds = np.full(len(train), -1, dtype = int)
    for num, (_, val_index) in enumerate(kfold.split(train, train['target'], train['topics_ids'])):
        folds[val_index] = num
    train['fold'] = folds
    return train

train = cv_split(train, CFG)

In [24]:
if CFG.debug:
    train = train[:1000]

In [25]:
# Attach each topic's category and level to its pairs (left join on the topic id).
train = train.merge(
    topics[["id", "category", "level"]],
    how="left",
    left_on="topics_ids",
    right_on="id",
).drop(columns="id")

In [26]:
# Attach each content item's kind to its pairs (left join on the content id).
train = train.merge(
    content[["id", "kind"]],
    how="left",
    left_on="content_ids",
    right_on="id",
).drop(columns="id")

In [27]:
# Drop negative pairs (target == 0) whose topic category is "source";
# positive pairs are kept for every category. The index is rebuilt afterwards.
train = train[~((train["category"] == "source") & (train["target"] == 0))].reset_index(drop=True)

In [28]:
# Obtain embedding vectors from the fine-tuned mpnet model
# =========================================================================================
# Prepare input, tokenize
# =========================================================================================
def prepare_input(text, cfg):
    """Tokenize ``text`` with ``cfg.tokenizer`` and convert every field to a long tensor.

    Special tokens are added. No ``max_length`` or truncation is passed to the
    tokenizer here.

    Args:
        text: The string to encode.
        cfg: Object exposing a ``tokenizer`` with an ``encode_plus`` method.

    Returns:
        The tokenizer's output mapping, with each value replaced in place by a
        ``torch.long`` tensor.
    """
    encoded = cfg.tokenizer.encode_plus(
        text,
        return_tensors = None,
        add_special_tokens = True,
    )
    for field in list(encoded.keys()):
        encoded[field] = torch.tensor(encoded[field], dtype = torch.long)
    return encoded

# =========================================================================================
# Unsupervised dataset
# =========================================================================================
class uns_dataset(Dataset):
    """Dataset over the ``text`` column of a DataFrame; items are tokenized on access."""

    def __init__(self, df, cfg):
        """Keep ``cfg`` and the raw values of ``df['text']``."""
        self.texts = df['text'].values
        self.cfg = cfg

    def __len__(self):
        """Number of texts held by the dataset."""
        return len(self.texts)

    def __getitem__(self, item):
        """Return the tokenized form of the text at position ``item``."""
        return prepare_input(self.texts[item], self.cfg)
    
# =========================================================================================
# Mean pooling class
# =========================================================================================
class MeanPooling(nn.Module):
    """Attention-mask-weighted mean over dimension 1 of the hidden states."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        """Average ``last_hidden_state`` over dim 1, counting only masked-in positions.

        The mask is expanded to the shape of ``last_hidden_state`` and cast to
        float. The divisor is clamped to at least 1e-9 so an all-zero mask does
        not divide by zero.
        """
        weights = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        weighted_sum = (last_hidden_state * weights).sum(1)
        weight_total = weights.sum(1).clamp(min=1e-9)
        return weighted_sum / weight_total

# =========================================================================================
# Unsupervised model
# =========================================================================================
class uns_model(nn.Module):
    """Encoder loaded from ``cfg.model`` followed by mean pooling; no extra head."""

    def __init__(self, cfg):
        """Load the config and weights from ``cfg.model`` and create the pooling layer."""
        super().__init__()
        self.cfg = cfg
        self.config = AutoConfig.from_pretrained(cfg.model)
        self.model = AutoModel.from_pretrained(cfg.model, config = self.config)
        self.pool = MeanPooling()

    def feature(self, inputs):
        """Run the encoder on ``inputs`` and mean-pool its ``last_hidden_state``.

        ``inputs`` is unpacked as keyword arguments to the encoder and must
        contain an ``attention_mask`` entry, which is used for pooling.
        """
        hidden = self.model(**inputs).last_hidden_state
        return self.pool(hidden, inputs['attention_mask'])

    def forward(self, inputs):
        """Return the pooled feature for ``inputs`` (same as :meth:`feature`)."""
        return self.feature(inputs)
    
# =========================================================================================
# Get embeddings
# =========================================================================================
def get_embeddings(loader, model, device):
    """Run ``model`` over every batch of ``loader`` and stack the outputs.

    The model is put in eval mode and called without gradient tracking. Each
    batch is a mapping whose values are moved to ``device`` in place before the
    call.

    Args:
        loader: Iterable of batches (mappings of tensors).
        model: Callable module taking one batch mapping and returning a tensor.
        device: Device the batch tensors are moved to.

    Returns:
        A NumPy array: the per-batch outputs concatenated along the first axis.
    """
    model.eval()
    chunks = []
    with torch.no_grad():
        for batch in tqdm(loader):
            for name in list(batch.keys()):
                batch[name] = batch[name].to(device)
            chunks.append(model(batch).to('cpu').numpy())
    return np.concatenate(chunks)
    
# =========================================================================================
# Get Vector
# =========================================================================================
def get_vector(train, cfg):
    """Embed the ``text`` column of ``train`` with the model configured in ``cfg``.

    Builds an ``uns_dataset`` / ``DataLoader`` over ``train`` (no shuffling,
    dynamic padding to the longest item of each batch), loads ``uns_model`` onto
    the module-level ``device`` and returns the embeddings for all rows.

    Args:
        train: DataFrame with a ``text`` column.
        cfg: Configuration exposing ``batch_size``, ``num_workers``,
            ``tokenizer`` and ``model``.

    Returns:
        NumPy array of embeddings, one row per row of ``train``, in order.
    """
    # Dataset and loader over the pair texts
    train_dataset = uns_dataset(train, cfg)
    train_loader = DataLoader(
        train_dataset,
        batch_size = cfg.batch_size,
        shuffle = False,
        collate_fn = DataCollatorWithPadding(tokenizer = cfg.tokenizer, padding = 'longest'),
        num_workers = cfg.num_workers,
        pin_memory = True,
        drop_last = False
    )
    # Encoder used to extract the embeddings
    model = uns_model(cfg)
    model.to(device)
    # Embed every row
    train_preds = get_embeddings(train_loader, model, device)

    # Release memory. The references (including the model) have to be dropped
    # and collected first; empty_cache() can only hand back blocks that are no
    # longer held by a live tensor.
    del model, train_loader, train_dataset
    gc.collect()
    torch.cuda.empty_cache()

    return train_preds

#train_preds = get_vector(train, CFG)

In [29]:
# np.save(f"{OUTPUT_DIR}/train_preds", train_preds)

In [30]:
# train_preds = np.load(f"{OUTPUT_DIR}/train_preds.npy")

In [31]:
# # Perform dimensionality reduction
# from sklearn.decomposition import TruncatedSVD
# from scipy.sparse import csr_matrix

# X = csr_matrix(train_preds)
# svd = TruncatedSVD(n_components=CFG.svd_n_components, n_iter=7, random_state=42)
# svd_vector = svd.fit_transform(X)

In [32]:
# np.save(f"{OUTPUT_DIR}/svd_vector", svd_vector)
# Load SVD-reduced vectors from the output directory of another notebook (ex14)
# instead of recomputing them in this run.
# NOTE(review): presumably these rows line up one-to-one, in order, with the
# `train` frame built in this notebook — confirm before reusing the file.
svd_vector = np.load("/notebooks/kaggle_lecr/output/ex14-trn-lightgbm-for-save/svd_vector.npy")

In [33]:
# del train_preds
# gc.collect()

In [34]:
train[CFG.svd_vector_name] = svd_vector

In [35]:
del svd_vector
gc.collect()

106

In [36]:
# train.to_csv(f"{OUTPUT_DIR}/train_with_vector.csv", index=False)
# train = pd.read_csv(f"{OUTPUT_DIR}/train_with_vector.csv")

In [37]:
train.head()

Unnamed: 0,topics_ids,content_ids,topics_texts,content_texts,target,text,fold,category,level,kind,...,svd_feat_90,svd_feat_91,svd_feat_92,svd_feat_93,svd_feat_94,svd_feat_95,svd_feat_96,svd_feat_97,svd_feat_98,svd_feat_99
0,t_00004da3a1b2,c_76231f9d0b5e,Откриването на резисторите > Открития и проект...,Последователно свързване на галваничен елемент...,1,Откриването на резисторите > Открития и проект...,1,source,4,video,...,-0.206004,0.210298,0.045369,-0.138827,-0.053636,-0.112049,0.02323,-0.11224,-0.008163,0.225495
1,t_00004da3a1b2,c_1108dd0c7a5d,Откриването на резисторите > Открития и проект...,Молив като резистор,1,Откриването на резисторите > Открития и проект...,1,source,4,video,...,-0.169751,0.162981,0.065976,-0.116974,-0.023236,-0.098758,0.038024,-0.093514,-0.046875,0.260005
2,t_00004da3a1b2,c_5bc0e1e2cba0,Откриването на резисторите > Открития и проект...,Променлив резистор (реостат) с графит от молив,1,Откриването на резисторите > Открития и проект...,1,source,4,video,...,-0.167677,0.143924,0.046197,-0.156766,-0.00885,-0.102681,0.063325,-0.061339,-0.049079,0.214675
3,t_00004da3a1b2,c_376c5a8eb028,Откриването на резисторите > Открития и проект...,Да чуем променливото съпротивление,1,Откриването на резисторите > Открития и проект...,1,source,4,video,...,-0.161866,0.149925,0.052445,-0.114741,-0.043544,-0.079053,0.063789,-0.042917,-0.035958,0.225816
4,t_00068291e9a4,c_ac1672cdcd2c,Entradas e saídas de uma função > Álgebra: fun...,Resolução de exemplo: como relacionar uma entr...,1,Entradas e saídas de uma função > Álgebra: fun...,3,source,4,video,...,0.258849,0.256854,0.077855,0.313349,-0.060605,-0.443884,-0.00249,-0.204502,0.112875,-0.086771


In [38]:
data = train.copy()

In [39]:
del train
gc.collect()

0

In [40]:
use_col = ["category", "level", "kind"] + CFG.svd_vector_name

In [41]:
data["category"] = data["category"].astype("category")
data["kind"] = data["kind"].astype("category")

In [42]:
# LightGBM settings for the binary "is this content relevant to this topic" model.
params = {
    'objective': 'binary',
    'verbosity': 1,
    'boosting_type': 'gbdt',
    'num_leaves': 63,
    'learning_rate': 0.01,
    'feature_fraction': 0.8,
    'bagging_freq': 1,
    'bagging_fraction': 0.8,
    'random_state': 0
}

valid_scores = []
models = []

for fold in range(CFG.n_folds):
    train = data[data["fold"] != fold]
    valid = data[data["fold"] == fold]

    # The positive (topic, content) pairs that were added on top of the
    # retrieved candidates are not used for evaluation.
    valid = valid[~(valid["topics_ids"] + valid["content_ids"]).isin(add_topic_content)]

    X_train, X_valid = train[use_col], valid[use_col]
    y_train, y_valid = train["target"], valid["target"]

    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_valid, y_valid)

    # Per-iteration logging is configured through a callback: the
    # `verbose_eval` argument is not accepted by LightGBM 4.x, and lightgbm is
    # installed without a version pin. period=0 keeps the per-round log off.
    model = lgb.train(
        params,
        lgb_train,
        valid_sets=lgb_eval,
        num_boost_round=10000,
        callbacks=[
            lgb.early_stopping(stopping_rounds=100),
            lgb.log_evaluation(period=0),
        ]
    )

    y_valid_pred = model.predict(X_valid)

    score, threshold = get_best_threshold(valid, y_valid_pred, correlations)
    # The metric returned by get_best_threshold is F2, not F1.
    print(f'fold{fold} f2 score: {score}')
    valid_scores.append(score)

    models.append(model)

    # Save valid pred, model, Score, Threshold
    # The prediction column is attached to a small id-only frame, so nothing is
    # written onto the filtered slice of `data`.
    valid[["topics_ids","content_ids"]].assign(pred=y_valid_pred).to_csv(f"{OUTPUT_DIR}/pred_fold{fold}.csv", index=False)

    with open(f"{OUTPUT_DIR}/model_fold{fold}.pkl", 'wb') as f:
        pickle.dump(model, f)

    fold_result = {"score": score, "best_threshold": threshold}

    with open(f"{OUTPUT_DIR}/score_fold{fold}.pkl", "wb") as f:
        pickle.dump(fold_result, f)

    # Only the first fold is trained; remove this `break` to run every fold.
    break

# Upload

In [43]:
import os
os.system("pip install kaggle")
os.system("mkdir -p ~/.kaggle/")
os.system("cp /notebooks/kaggle_lecr/kaggle.json ~/.kaggle/")
os.system("chmod 600 ~/.kaggle/kaggle.json")

0

In [44]:
from kaggle.api.kaggle_api_extended import KaggleApi
import json

def dataset_create_new(dataset_name: str, upload_dir: str):
    """Create a new Kaggle dataset from the contents of ``upload_dir``.

    Writes ``dataset-metadata.json`` (id ``sinchir0/<dataset_name>``, CC0-1.0
    licence, title ``dataset_name``) into ``upload_dir``, authenticates with the
    Kaggle API and uploads the folder in ``tar`` directory mode without CSV
    conversion.

    Args:
        dataset_name: Dataset slug and title; must not contain ``_``.
        upload_dir: Folder to upload; the metadata file is written into it.

    Raises:
        ValueError: If ``dataset_name`` contains an underscore.
    """
    if "_" in dataset_name:
        raise ValueError("datasetの名称に_の使用は禁止です")

    dataset_metadata = {
        'id': f'sinchir0/{dataset_name}',
        'licenses': [{'name': 'CC0-1.0'}],
        'title': dataset_name,
    }
    metadata_path = os.path.join(upload_dir, 'dataset-metadata.json')
    with open(metadata_path, 'w') as f:
        json.dump(dataset_metadata, f, indent=4)

    api = KaggleApi()
    api.authenticate()
    api.dataset_create_new(folder=upload_dir, convert_to_csv=False, dir_mode='tar')

if CFG.upload_data:
    print(f"Create Dataset name:{NOTEBOOK_NAME}, output_dir:{OUTPUT_DIR}")
    dataset_create_new(dataset_name=NOTEBOOK_NAME, upload_dir=OUTPUT_DIR)