In [None]:
!pip show datasets

In [None]:
import os
import random
import warnings
import pandas as pd
import numpy as np
import torch
from transformers import (AutoModelForMaskedLM,AutoTokenizer,DataCollatorForLanguageModeling,
                          Trainer, TrainingArguments)

# Suppress all Python warnings for the remainder of the session.
warnings.filterwarnings('ignore')

# Process-wide switches exported before any tokenizer / Trainer is created.
# NOTE(review): presumably these turn off tokenizers' thread pool and W&B logging — confirm.
os.environ.update(
    TOKENIZERS_PARALLELISM="false",
    WANDB_DISABLED="true",
)

def seed_torch(seed=42):
    """Seed every random number generator this notebook relies on.

    Covers Python's `random`, the `PYTHONHASHSEED` environment variable, NumPy,
    and PyTorch on CPU and on all CUDA devices, and switches cuDNN to
    deterministic behaviour.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # manual_seed() above only touches the *current* GPU; seed the others too so
    # multi-GPU runs are covered. Silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick different kernels from run to run; keep it off.
    torch.backends.cudnn.benchmark = False

seed_torch(42)

In [None]:
class TrainingArgs:
    """Optimizer and schedule hyperparameters forwarded to `TrainingArguments`."""
    weight_decay = 0.01
    learning_rate = 2e-5
    warmup_ratio = 0.1
    gradient_accumulation_steps = 4
    # Mixed precision needs a CUDA device. Config.DEVICE allows a CPU fallback, so
    # only request fp16 when a GPU is actually present instead of hard-coding True.
    fp16 = torch.cuda.is_available()
    lr_scheduler_type = "cosine"
    # Number of checkpoints to save for each model
    save_total_limit = 1
    
class Config:
    """Notebook-wide settings: paths, run switches, and data/model parameters."""

    # --- Paths and runtime -------------------------------------------------
    DATA_PATH = "/kaggle/input/us-patent-phrase-to-phrase-matching/"
    # Directory the Trainer writes checkpoints to (passed as `output_dir`).
    OUT_DIR = "/kaggle/working/model/"
    RUNTIME = "KAGGLE"

    # --- Reproducibility and run shape -------------------------------------
    RANDOM_STATE = 42
    BATCH_SIZE = 32
    NUM_LABELS = 1
    NUM_FOLDS = 5
    RUN_ALL_FOLDS = True
    NUM_EPOCHS = 4
    NUM_WORKERS = 8

    # --- Model and device --------------------------------------------------
    TRANSFORMER_CHECKPOINT = "microsoft/deberta-v3-small"
    # Use the GPU when one is available, otherwise fall back to the CPU.
    DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    # --- Optional sub-sampling ---------------------------------------------
    SUBSET_ROWS_FRAC = 0.1
    TRAIN_ON_SUBSET = False

In [None]:
# Read the abstracts CSV, discard every row that has a missing value,
# and renumber the index so it is contiguous again.
df_abstract = (
    pd.read_csv('../input/pppm-abstract/pppm_abstract.csv')
    .dropna()
    .reset_index(drop=True)
)
df_abstract

In [None]:
def tokenize_text(data_row):
    """Tokenize the "abstract" field with the notebook-level `tokenizer`.

    When the tokenizer reports `is_fast`, a "word_ids" entry is added that holds
    `word_ids(i)` for each encoded item `i`.

    Args:
        data_row: Mapping with an "abstract" entry; a batch of rows when used
            through `Dataset.map(..., batched=True)`.

    Returns:
        The tokenizer output, with "word_ids" added for fast tokenizers.
    """
    encoded = tokenizer(data_row["abstract"])
    if not tokenizer.is_fast:
        return encoded
    n_items = len(encoded["input_ids"])
    encoded["word_ids"] = [encoded.word_ids(idx) for idx in range(n_items)]
    return encoded

In [None]:
from datasets import Dataset

# Tokenizer that matches the checkpoint being fine-tuned.
tokenizer = AutoTokenizer.from_pretrained(Config.TRANSFORMER_CHECKPOINT)

# Wrap the DataFrame as a Dataset and remember its original columns so they
# can be dropped once tokenization has produced the model inputs.
ds_abstract_raw = Dataset.from_pandas(df_abstract)
raw_ds_col_names = ds_abstract_raw.column_names
ds_abstract = ds_abstract_raw.map(
    tokenize_text,
    batched=True,
    batch_size=1000,
    remove_columns=raw_ds_col_names,
)

In [None]:
def group_texts(examples, block_size=None):
    """Concatenate a batch of tokenized rows and cut it into equal-length chunks.

    Every column in `examples` is flattened into one long sequence and then
    sliced into consecutive pieces of `block_size` items. A trailing remainder
    shorter than `block_size` is dropped. A "labels" column is added as a copy
    of the chunked "input_ids".

    Args:
        examples: Mapping of column name -> list of per-row lists. Must contain
            "input_ids".
        block_size: Chunk length. When omitted, the notebook-level `chunk_size`
            variable is used, so existing `map(group_texts, batched=True)` calls
            keep working.

    Returns:
        Dict with the same columns as `examples` plus "labels"; each value is a
        list of chunks of length `block_size`.

    Raises:
        ValueError: If the effective chunk length is not positive.
    """
    if block_size is None:
        block_size = chunk_size
    if block_size <= 0:
        raise ValueError(f"block_size must be positive, got {block_size}")

    # Flatten each column in a single linear pass; sum(list_of_lists, []) copies
    # the growing list on every addition and is quadratic in the batch size.
    concatenated_examples = {
        k: [item for row in examples[k] for item in row] for k in examples.keys()
    }
    # Compute length of concatenated texts
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # Drop the last chunk if it is smaller than block_size
    total_length = (total_length // block_size) * block_size
    # Split every column into chunks of block_size
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    # Create a new labels column
    result["labels"] = result["input_ids"].copy()
    return result

In [None]:
# Length of each fixed-size chunk; group_texts reads this notebook-level name
# at call time.
chunk_size = 128

# Regroup the tokenized abstracts into chunk_size-long pieces, batch by batch.
ds_abstract_mlm = ds_abstract.map(
    group_texts,
    batched=True,
)
ds_abstract_mlm

In [None]:
# 80/20 train/test split with a fixed seed.
# NOTE(review): this rebinds ds_abstract_mlm from the chunked dataset to the
# split result, so the cell is likely not re-runnable on its own — re-run the
# chunking cell first.
ds_abstract_mlm = ds_abstract_mlm.train_test_split(
    train_size=0.8,
    test_size=0.2,
    seed=Config.RANDOM_STATE,
)

In [None]:
# Sanity check: turn the labels of one training chunk back into text.
tokenizer.decode(
    ds_abstract_mlm["train"][1]["labels"]
)

In [None]:
# Collator for masked-language-modelling batches, using a masking
# probability of 0.15.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm_probability=0.15,
)

In [None]:
# Load the pretrained checkpoint through the masked-LM auto class.
model = AutoModelForMaskedLM.from_pretrained(Config.TRANSFORMER_CHECKPOINT)

training_args = TrainingArguments(
    # Checkpointing and evaluation cadence
    output_dir=Config.OUT_DIR,
    save_strategy='epoch',
    save_total_limit=TrainingArgs.save_total_limit,
    evaluation_strategy="epoch",
    # Run length and batch sizes
    num_train_epochs=Config.NUM_EPOCHS,
    per_device_train_batch_size=Config.BATCH_SIZE,
    per_device_eval_batch_size=Config.BATCH_SIZE,
    gradient_accumulation_steps=TrainingArgs.gradient_accumulation_steps,
    # Optimizer, schedule and precision
    learning_rate=TrainingArgs.learning_rate,
    weight_decay=TrainingArgs.weight_decay,
    warmup_ratio=TrainingArgs.warmup_ratio,
    lr_scheduler_type=TrainingArgs.lr_scheduler_type,
    fp16=TrainingArgs.fp16,
)

# Tie the model, the arguments, the two dataset splits and the collator together.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=ds_abstract_mlm["train"],
    eval_dataset=ds_abstract_mlm["test"],
)

In [None]:
# Before fine tuning the model using MLM

import math
import os

# Baseline: evaluate the untouched checkpoint and report exp(eval loss).
eval_results = trainer.evaluate()
perplexity_before = math.exp(eval_results['eval_loss'])
print(f">>> Perplexity before fine tuning: {perplexity_before:.2f}")

In [None]:
# Run the fine-tuning loop with the Trainer built above: Config.NUM_EPOCHS
# epochs, with evaluation and a checkpoint save requested after each epoch.
trainer.train()

In [None]:
# Evaluate again after training and report exp(eval loss) for comparison.
eval_results = trainer.evaluate()
perplexity_after = math.exp(eval_results['eval_loss'])
print(f">>> Perplexity after fine tuning = {perplexity_after:.2f}")

In [None]:
# !pip install GPUtil

# import torch
# from GPUtil import showUtilization as gpu_usage
# from numba import cuda

# def free_gpu_cache():
#     print("Initial GPU Usage")
#     gpu_usage()                             

#     torch.cuda.empty_cache()

#     cuda.select_device(0)
#     cuda.close()
#     cuda.select_device(0)

#     print("GPU Usage after emptying the cache")
#     gpu_usage()

# free_gpu_cache()    