In [1]:
import gc
import os
import shutil

import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, StratifiedKFold

import datasets, transformers
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import TrainingArguments, Trainer
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Turn off Weights & Biases logging for the Hugging Face Trainer.
# NOTE(review): the captured training output further down warns that this environment
# variable is deprecated (removal planned for v5) and recommends `report_to="none"`.
os.environ["WANDB_DISABLED"] = "true"

In [2]:
class CFG:
    """Notebook-wide configuration: data/model locations and fine-tuning settings."""

    # --- locations ---
    # Directory holding the competition files (train.csv is read from here).
    input_path = '../input/us-patent-phrase-to-phrase-matching/'
    # Directory of the pretrained checkpoint used for both tokenizer and model.
    model_path = '../input/deberta-v3-large/'

    # --- cross-validation and schedule ---
    num_fold = 5      # number of stratified folds
    epochs = 5        # training epochs per fold
    batch_size = 4    # per-device batch size, used for both train and eval

    # --- optimizer ---
    learning_rate = 2e-5
    weight_decay = 0.01

In [3]:
# Load the training pairs and the CPC code -> title lookup table, then attach the
# title columns to every training row by matching `context` against `code`.
# The join is pandas' default inner join, so rows whose context has no matching
# code are dropped.
train_df = pd.read_csv(f"{CFG.input_path}train.csv")
titles = pd.read_csv('../input/cpc-codes/titles.csv')
train_df = pd.merge(train_df, titles, left_on='context', right_on='code')

# https://www.kaggle.com/code/abhishek/phrase-matching-folds
def create_folds(data, num_splits, seed=42, num_bins=5):
    """Assign a stratified cross-validation fold id to every row of ``data``.

    The continuous ``score`` column is cut into ``num_bins`` equal-width bins and
    the bin index (not the raw score) is used as the stratification target, so
    each fold receives a similar score distribution.

    Args:
        data: DataFrame containing a numeric ``score`` column.
        num_splits: Number of folds to create.
        seed: Seed used for both the row shuffle and the StratifiedKFold shuffle,
            so repeated calls yield identical folds.
        num_bins: Number of equal-width score bins used for stratification
            (a fixed count is used here rather than Sturges' rule).

    Returns:
        A shuffled copy of ``data`` with a fresh 0..n-1 index and an integer
        ``fold`` column in ``[0, num_splits)``. The input frame is not modified.
    """
    # `sample` returns a new frame, so the caller's DataFrame is left untouched.
    # Seeding the shuffle is what makes the fold assignment reproducible.
    data = data.sample(frac=1, random_state=seed).reset_index(drop=True)

    # -1 marks "not yet assigned"; every row is overwritten in the loop below.
    data["fold"] = -1

    # Stratification target: integer bin index of each score.
    score_bins = pd.cut(data["score"], bins=num_bins, labels=False)

    kf = StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=seed)

    # The index was reset above, so the positional indices returned by `split`
    # are also valid labels for `.loc`.
    for fold_id, (_, valid_idx) in enumerate(kf.split(X=data, y=score_bins.values)):
        data.loc[valid_idx, "fold"] = fold_id

    return data

In [4]:
# First text segment fed to the model: CPC title followed by the anchor phrase.
train_df = train_df.assign(input=train_df['title'] + ' ' + train_df['anchor'])
# Attach the stratified fold id used by the cross-validation loop.
train_df = create_folds(train_df, CFG.num_fold)

In [5]:
# Module-level tokenizer loaded from the same checkpoint directory as the model.
# It is read as a global by TrainDataset.__getitem__ and is also handed to the Trainer.
tokenizer = AutoTokenizer.from_pretrained(CFG.model_path)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
class TrainDataset(Dataset):
    """Map-style dataset of (input text, target text, score) rows.

    Reads the ``input``, ``target`` and ``score`` columns of ``df``. Each item is
    tokenized on access, as a text pair, with the module-level ``tokenizer``.
    """

    def __init__(self, df):
        # Both text columns are coerced to str arrays; scores are kept as-is.
        self.inputs = df['input'].values.astype(str)
        self.targets = df['target'].values.astype(str)
        self.label = df['score'].values

    def __len__(self):
        """Number of rows in the dataset."""
        return len(self.inputs)

    def __getitem__(self, item):
        """Return the tokenizer output for row ``item`` plus a float32 ``label``."""
        first_text, second_text = self.inputs[item], self.targets[item]
        score = self.label[item]

        # Copy the tokenizer's fields into a plain dict, then attach the label.
        example = dict(tokenizer(first_text, second_text))
        example['label'] = score.astype(np.float32)
        return example

In [7]:
def compute_metrics(eval_pred):
    """Compute the Pearson correlation between predictions and labels.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` is reshaped
            to one value per row before the correlation is taken.

    Returns:
        A dict with the single key ``'pearson'``.
    """
    raw_preds, labels = eval_pred
    # Collapse e.g. an (N, 1) array to (N,) so it lines up with the labels.
    flat_preds = raw_preds.reshape(len(raw_preds))
    corr_matrix = np.corrcoef(flat_preds, labels)
    # The off-diagonal entry of the 2x2 matrix is the correlation of the two inputs.
    return {'pearson': corr_matrix[0][1]}

In [8]:
# Cross-validated fine-tuning. For every fold: train on the remaining folds,
# reload the epoch with the best validation Pearson, save those weights to
# `uspppm_{fold}`, and record out-of-fold (OOF) predictions for the held-out rows.
CHECKPOINT_DIR = "/tmp/uspppm"  # scratch directory for per-epoch checkpoints

oof_frames = []
oof_df = pd.DataFrame()
for fold in range(CFG.num_fold):

    tr_data = train_df[train_df['fold'] != fold].reset_index(drop=True)
    va_data = train_df[train_df['fold'] == fold].reset_index(drop=True)
    tr_dataset = TrainDataset(tr_data)
    va_dataset = TrainDataset(va_data)

    args = TrainingArguments(
        output_dir=CHECKPOINT_DIR,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        # Cap the number of checkpoints kept on disk; the best checkpoint is
        # retained so it can be reloaded at the end of training.
        save_total_limit=2,
        learning_rate=CFG.learning_rate,
        per_device_train_batch_size=CFG.batch_size,
        per_device_eval_batch_size=CFG.batch_size,
        num_train_epochs=CFG.epochs,
        weight_decay=CFG.weight_decay,
        metric_for_best_model="pearson",
        greater_is_better=True,  # higher Pearson is better
        load_best_model_at_end=True,
        # Explicitly disable logging integrations instead of relying on the
        # deprecated WANDB_DISABLED environment variable.
        report_to="none",
    )

    model = AutoModelForSequenceClassification.from_pretrained(CFG.model_path, num_labels=1)
    trainer = Trainer(
        model,
        args,
        train_dataset=tr_dataset,
        eval_dataset=va_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    trainer.train()

    # Persist the best weights first, and only then drop the scratch checkpoints.
    trainer.save_model(f"uspppm_{fold}")
    shutil.rmtree(CHECKPOINT_DIR, ignore_errors=True)

    outputs = trainer.predict(va_dataset)
    predictions = outputs.predictions.reshape(-1)
    va_data['preds'] = predictions

    # Rebuild the OOF frame from the per-fold list so partial results stay
    # available if a later fold is interrupted.
    oof_frames.append(va_data)
    oof_df = pd.concat(oof_frames)

    # Release this fold's model and trainer before the next model is created, so
    # two large models (plus optimizer state) never sit on the GPU together.
    # The trained weights remain available on disk in `uspppm_{fold}`.
    del trainer, model
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Some weights of the model checkpoint at ../input/deberta-v3-large/ were not used when initializing DebertaV2ForSequenceClassification: ['mask_predictions.classifer.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.classifer.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT 

{'loss': 0.0585, 'learning_rate': 1.9725839616175463e-05, 'epoch': 0.07}


  3%|▎         | 1000/36475 [05:13<3:00:12,  3.28it/s]

{'loss': 0.0412, 'learning_rate': 1.9451679232350927e-05, 'epoch': 0.14}


  4%|▍         | 1500/36475 [07:44<2:56:29,  3.30it/s]

{'loss': 0.0387, 'learning_rate': 1.9177518848526388e-05, 'epoch': 0.21}


  5%|▌         | 2000/36475 [10:15<2:55:37,  3.27it/s]

{'loss': 0.0393, 'learning_rate': 1.8903358464701853e-05, 'epoch': 0.27}


  7%|▋         | 2500/36475 [12:46<2:55:09,  3.23it/s]

{'loss': 0.0324, 'learning_rate': 1.8629198080877317e-05, 'epoch': 0.34}


  8%|▊         | 3000/36475 [15:17<2:54:14,  3.20it/s]

{'loss': 0.0336, 'learning_rate': 1.8355037697052778e-05, 'epoch': 0.41}


 10%|▉         | 3500/36475 [17:49<2:51:28,  3.21it/s]

{'loss': 0.0318, 'learning_rate': 1.808087731322824e-05, 'epoch': 0.48}


 11%|█         | 4000/36475 [20:21<2:40:47,  3.37it/s]

{'loss': 0.0307, 'learning_rate': 1.7806716929403704e-05, 'epoch': 0.55}


 12%|█▏        | 4500/36475 [22:53<2:36:00,  3.42it/s]

{'loss': 0.0307, 'learning_rate': 1.7532556545579165e-05, 'epoch': 0.62}


 14%|█▎        | 5000/36475 [25:26<2:34:18,  3.40it/s]

{'loss': 0.0303, 'learning_rate': 1.7258396161754626e-05, 'epoch': 0.69}


 15%|█▌        | 5500/36475 [27:58<2:27:35,  3.50it/s]

{'loss': 0.0319, 'learning_rate': 1.698423577793009e-05, 'epoch': 0.75}


 16%|█▋        | 6000/36475 [30:30<2:37:09,  3.23it/s]

{'loss': 0.0296, 'learning_rate': 1.6710075394105554e-05, 'epoch': 0.82}


 17%|█▋        | 6231/36475 [31:40<2:26:56,  3.43it/s]

KeyboardInterrupt: 

In [None]:
# Overall out-of-fold score: Pearson correlation between the collected OOF
# predictions and the true scores across all folds.
predictions, label = (oof_df[column].values for column in ('preds', 'score'))
eval_pred = (predictions, label)
compute_metrics(eval_pred)

In [None]:
# Save the out-of-fold frame (validation rows plus their `preds` column) for later analysis.
# With pandas' default settings the DataFrame index is written as the first CSV column.
oof_df.to_csv('oof_df.csv')