# 0. Imports and setup

In [1]:
# Git branch of the project repository that is checked out after cloning.
BRANCH = "feat/custom_datasets"

In [2]:
import os
import shlex

from kaggle_secrets import UserSecretsClient
import wandb

# Directory that `git clone` creates for the repository (last URL segment).
REPO_DIR = "paraphrase-detection"

# Credentials are read from Kaggle secrets so nothing sensitive is hardcoded.
user_secrets = UserSecretsClient()
wandb_api = user_secrets.get_secret("WANDB_KEY")
wandb.login(key=wandb_api)

# NOTE(review): despite its name, GITHUB_HOST is used as the owner segment of
# the GitHub URL (github.com/<GITHUB_HOST>/<repo>) — confirm the secret's content.
GITHUB_HOST = user_secrets.get_secret("GITHUB_HOST")
CLONE_URL = f"https://github.com/{GITHUB_HOST}/{REPO_DIR}"

# Skip the clone when the checkout already exists so the cell can be re-run,
# and quote the URL because it is built from an external value and passed to a shell.
if not os.path.isdir(REPO_DIR):
    get_ipython().system(f"git clone {shlex.quote(CLONE_URL)}")

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Cloning into 'nlp_project_2023'...
remote: Enumerating objects: 180, done.[K
remote: Counting objects: 100% (180/180), done.[K
remote: Compressing objects: 100% (117/117), done.[K
remote: Total 180 (delta 96), reused 140 (delta 60), pack-reused 0[K
Receiving objects: 100% (180/180), 63.24 KiB | 9.03 MiB/s, done.
Resolving deltas: 100% (96/96), done.


In [3]:
cd paraphrase-detection/

/kaggle/working/nlp_project_2023


In [4]:
!git checkout $BRANCH

Branch 'feat/custom_datasets' set up to track remote branch 'feat/custom_datasets' from 'origin'.
Switched to a new branch 'feat/custom_datasets'


In [5]:
!pip install -r requirements.txt

[0m

In [6]:
cd ..

/kaggle/working


In [7]:
import os
import sys

# The repository is cloned into "paraphrase-detection" (see the clone URL), so its
# `src` folder is what must be importable. The path is made absolute so that later
# directory changes cannot break imports, and it is only added once on re-runs.
SRC_DIR = os.path.abspath(os.path.join("paraphrase-detection", "src"))
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

In [8]:
import os
import random
from dataclasses import asdict

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import torch
import transformers



In [9]:
# Reproducibility: a single seed shared by every RNG used in this notebook.
SEED = 42

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Dedicated, explicitly seeded generator (passed to the DataLoaders below).
g = torch.Generator().manual_seed(SEED)

# Hardware: use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_workers = 2

def seed_dataloader_worker(worker_id):
    """Re-seed NumPy and Python's `random` from the current torch seed.

    Used as the `worker_init_fn` of the DataLoaders in this notebook.

    Args:
        worker_id: Index supplied by the DataLoader; not used here.
    """
    # np.random.seed only accepts values below 2**32, hence the reduction.
    derived_seed = torch.initial_seed() % (1 << 32)
    for reseed in (np.random.seed, random.seed):
        reseed(derived_seed)

In [10]:
# Switch off parallelism in the tokenizers library via its environment flag.
# NOTE(review): presumably set to avoid fork-related warnings with multi-worker DataLoaders — confirm.
os.environ.update(TOKENIZERS_PARALLELISM="false")

# 1. Prepare data

In [11]:
from data import ParaphraseDataset, PawsParaphraseDataset, PawsQQPParaphraseDataset, DatasetManager

In [12]:
# PAWS: register the labeled files for each split, then build the split frames.
ds_paws = PawsParaphraseDataset("/kaggle/input/the-paws-dataset-for-paraphrase-identification")

# (file name, split) pairs, registered in this order. The unlabeled files
# ("unlabeled_final_train.csv", "unlabeled_final_validation.csv") are deliberately left out.
paws_files = [
    ("labeled_final_train.csv", "train"),
    ("labeled_swap_train.csv", "train"),
    ("labeled_final_validation.csv", "val"),
    ("labeled_final_test.csv", "test"),
]
for csv_name, split_name in paws_files:
    ds_paws.add_set(csv_name, split_name)

ds_paws.compile_dataset()

paws_shapes = (ds_paws.train_df.shape, ds_paws.val_df.shape, ds_paws.test_df.shape)
print(*paws_shapes)

(79798, 4) (8000, 4) (8000, 4)


In [13]:
# PAWS-QQP: only a train and a validation split are registered (no test split).
dsqqp = PawsQQPParaphraseDataset("/kaggle/input/paws-qqp")

qqp_files = {"train.tsv": "train", "dev_and_test.tsv": "val"}
for tsv_name, split_name in qqp_files.items():
    dsqqp.add_set(tsv_name, split_name)

dsqqp.compile_dataset()

qqp_train_shape = dsqqp.train_df.shape
print(qqp_train_shape)

(11988, 4)


In [14]:
# Combine both corpora behind one manager and report the merged split sizes.
ds = DatasetManager([ds_paws, dsqqp])
print(*(frame.shape for frame in (ds.train_df, ds.val_df, ds.test_df)))

(91786, 4) (8677, 4) (8000, 4)


In [15]:
from data import build_tokenizer, PairedSentenceDataset


# Smoke test: the first item's tensors must have the shapes asserted below
# for two different maximum lengths.
tokenizer = build_tokenizer("microsoft/deberta-v3-large")

dataset = PairedSentenceDataset(ds.train_df, tokenizer, 128)

expected_shapes = {"labels": (1,), "input_ids": (1, 128)}
for field_name, field_shape in expected_shapes.items():
    assert dataset[0][field_name].shape == field_shape
assert dataset[0]

# Same check with a much shorter maximum length.
dataset = PairedSentenceDataset(ds.train_df, tokenizer, 20)

assert dataset[0]["input_ids"].shape == (1, 20)

Downloading (…)okenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# 2. Experiment

In [16]:
from train import TrainConfig, Trainer, DummyLogger, WandbLogger


# All settings of this run in one place; the same object is passed to the
# trainer and (as a dict) to the experiment logger further down.
config = TrainConfig(
    # model / output location
    model="microsoft/deberta-v3-large",
    checkpoints_folder="./init_exp",
    device=str(device),
    # optimisation
    lr=6e-6,
    epochs=3,
    batch_size=8,
    max_length=100,
    # data provenance, taken from the dataset manager
    train_sets=ds.train_sets,
    val_sets=ds.val_sets,
    test_sets=ds.test_sets,
    others={},
)

In [17]:
tokenizer = build_tokenizer(config.model)


def _make_loader(frame, shuffle):
    """Wrap `frame` in a PairedSentenceDataset and return a DataLoader over it.

    Every loader shares the batch size and max length from `config`, the
    worker count, the worker seeding hook and the seeded generator `g`;
    only the data frame and the shuffle flag differ.
    """
    return torch.utils.data.DataLoader(
        PairedSentenceDataset(frame, tokenizer, config.max_length),
        batch_size=config.batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
        worker_init_fn=seed_dataloader_worker,
        generator=g,
    )


# Only the training data is shuffled; validation and test keep their order.
train_loader = _make_loader(ds.train_df, shuffle=True)
val_loader = _make_loader(ds.val_df, shuffle=False)
test_loader = _make_loader(ds.test_df, shuffle=False)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [18]:
from models import DebertaV2WithCustomClassifier, ConcatenatePooler

# Pretrained encoder named in the config.
backbone = transformers.DebertaV2Model.from_pretrained(config.model)

# NOTE(review): the list presumably gives the classifier layer widths
# (4 * 1024 inputs down to 2 outputs) and the trailing 4 the number of hidden
# states that are concatenated — confirm against models.ConcatenatePooler.
classifier_head = ConcatenatePooler([4 * 1024, 512, 64, 2], 4)

model = DebertaV2WithCustomClassifier(backbone, classifier_head)
optimizer = torch.optim.Adam(model.parameters(), lr=config.lr)

# Display the assembled architecture.
model

Downloading pytorch_model.bin:   0%|          | 0.00/874M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['mask_predictions.LayerNorm.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.dense.weight', 'mask_predictions.dense.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


DebertaV2WithCustomClassifier(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 1024, padding_idx=0)
      (LayerNorm): LayerNorm((1024,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-23): 24 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (key_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (value_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), eps=1e-07, elementwise_

In [19]:
# Log this run to Weights & Biases, storing the full config with the run.
logger = WandbLogger(project="paraphrase-detection", experiment_config=asdict(config))

trainer = Trainer(model, optimizer, logger)

# Always close the logger, even when training is interrupted or raises;
# otherwise the run stays open and later runs in this kernel may attach to it.
try:
    trainer.train(train_loader, val_loader, config, test_loader)
finally:
    logger.finish()

[34m[1mwandb[0m: Currently logged in as: [33mcrendelyok[0m ([33mcrendelyok_team[0m). Use [1m`wandb login --relogin`[0m to force relogin




  0%|          | 0/11474 [00:00<?, ?it/s]

  0%|          | 0/1085 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/11474 [00:00<?, ?it/s]

  0%|          | 0/1085 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/11474 [00:00<?, ?it/s]

  0%|          | 0/1085 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]



VBox(children=(Label(value='0.001 MB of 0.035 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.030579…

0,1
accuracy,▁█▅
auc_score,▇█▁
current_best_epoch,▁█
f1,▁█▄
precision,▁▅█
recall,▇█▁
test_accuracy,▁█▅
test_auc_score,▁█▂
test_f1,▁█▄
test_precision,▁▅█

0,1
accuracy,0.95644
auc_score,0.98781
current_best_epoch,1.0
f1,0.94975
precision,0.94198
recall,0.95764
test_accuracy,0.95225
test_auc_score,0.9852
test_f1,0.94641
test_precision,0.93903
