In [None]:
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch

try:
  import datasets
except:
    !pip install -q datasets
    try:
        import datasets
    except:
        print("Can't import datasets.")

In [None]:
class CFG:
    """Notebook-wide settings: data location, checkpoint name and training hyper-parameters."""

    # --- data / model ---
    # Directory that holds train.csv and test.csv.
    PATH = "../input/us-patent-phrase-to-phrase-matching"
    # Checkpoint name handed to the tokenizer and model `from_pretrained` calls.
    MODEL = "microsoft/deberta-v3-small"

    # --- cross-validation ---
    NUM_FOLDS = 4

    # --- optimisation ---
    EPOCHS = 4
    BATCH_SIZE = 128  # per-device train batch size; evaluation uses twice this value
    LR = 8e-5
    WEIGHT_DECAY = 0.01

    # --- hardware ---
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
def lowercase_df(df):
    """Lower-case the ``context``, ``anchor`` and ``target`` columns.

    The frame is modified in place and the same object is returned so the
    call can be used in an assignment. (Original author's note: lower-casing
    gave a better score.)
    """
    for column in ('context', 'anchor', 'target'):
        df[column] = df[column].str.lower()
    return df
# Read the training split and lower-case its text columns in a single step.
train_df = lowercase_df(pd.read_csv(f"{CFG.PATH}/train.csv"))

In [None]:
# credits https://www.kaggle.com/code/hannes82/pppm-deberta-v3-large-closing-the-cv-lb-gap/notebook

!pip install -q iterative-stratification
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

def k_fold(train_df):
    """Assign a cross-validation fold to every row, keeping each anchor in one fold.

    Scores are one-hot encoded and summed per anchor, giving each anchor a
    vector of score counts. ``MultilabelStratifiedKFold`` splits the anchors
    on those vectors, and the resulting fold id is merged back onto the rows
    by ``anchor``.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Must contain ``anchor`` and ``score`` columns. An existing ``fold``
        column is replaced, so the function can be re-applied to its own output.

    Returns
    -------
    pandas.DataFrame
        A new frame with the rows of ``train_df`` plus an integer ``fold``
        column in ``range(CFG.NUM_FOLDS)``.
    """
    # Only `anchor` and `score` take part in the aggregation. Selecting them
    # first avoids summing unrelated (text) columns, whose handling differs
    # between pandas versions.
    score_dummies = pd.get_dummies(train_df[["anchor", "score"]], columns=["score"])
    dfx = score_dummies.groupby(["anchor"], as_index=False).sum()

    labels = [c for c in dfx.columns if c != "anchor"]
    dfx_labels = dfx[labels]
    dfx["fold"] = -1

    mskf = MultilabelStratifiedKFold(n_splits=CFG.NUM_FOLDS, shuffle=True, random_state=42)
    # `as_index=False` leaves a RangeIndex, so the positional indices returned
    # by `split` are also valid labels for `.loc`.
    for fold, (_, val_idx) in enumerate(mskf.split(dfx, dfx_labels)):
        dfx.loc[val_idx, "fold"] = fold

    # Drop a stale `fold` column so the merge never produces fold_x / fold_y.
    base = train_df.drop(columns=["fold"], errors="ignore")
    return base.merge(dfx[["anchor", "fold"]], on="anchor", how="left")

# Attach the anchor-grouped `fold` column that the training loop uses to
# split rows into training and validation sets.
train_df = k_fold(train_df)

In [None]:
# Tokenizer for the checkpoint named in CFG.MODEL; used by `sep`, `tokenize`
# and passed to the Trainer.
tokenizer = AutoTokenizer.from_pretrained(CFG.MODEL)
def sep(df):
    """Add an ``input`` column: context, anchor and target joined by the tokenizer's separator token.

    The frame is modified in place and returned.
    """
    delimiter = tokenizer.sep_token
    combined = df['context']
    # Same left-to-right concatenation: context <sep> anchor <sep> target.
    for column in ('anchor', 'target'):
        combined = combined + delimiter + df[column]
    df['input'] = combined
    return df

# Add the combined `input` text column that `tokenize` reads.
train_df = sep(train_df)

def convert(df, isTest=False):
    """Turn a pandas frame into a ``datasets.Dataset``.

    For non-test data the ``score`` column is renamed to ``label``; test data
    is converted unchanged.
    """
    dataset = datasets.Dataset.from_pandas(df)
    return dataset if isTest else dataset.rename_column('score', 'label')

In [None]:
def tokenize(example):
    """Apply the module-level tokenizer to the ``input`` field of ``example``."""
    text = example["input"]
    return tokenizer(text)

def create_tokenized_ds_from_df(df, isTest=False):
    """Convert a prepared frame to a tokenized ``datasets.Dataset``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that already has the ``input`` column created by ``sep``.
    isTest : bool, default False
        Forwarded to ``convert``; when false the ``score`` column becomes ``label``.

    Returns
    -------
    datasets.Dataset
        The tokenizer output columns plus any column not listed for removal
        (for training data, ``label``).
    """
    ds = convert(df, isTest=isTest)

    # Raw text and bookkeeping columns to drop after tokenization.
    # Not every column exists in every dataset: test data has no `fold`, and
    # `__index_level_0__` is absent for the test frame. Asking `map` to remove
    # a missing column fails with
    # "Column to remove ['__index_level_0__', 'fold'] not in the dataset.
    #  Current columns in the dataset: ['id', 'anchor', 'target', 'context', 'input']",
    # so only the columns that are present are requested.
    removable = ('anchor', 'target', 'context', 'input', 'id', '__index_level_0__', 'fold')
    present = [name for name in removable if name in ds.column_names]

    # Tokenize exactly once.
    return ds.map(tokenize, batched=True, remove_columns=present)

In [None]:
def compute_metrics(eval_pred):
    """Return the Pearson correlation between predictions and labels.

    Parameters
    ----------
    eval_pred : iterable
        Its first two items are the predictions and the labels. Any further
        items are ignored.

    Returns
    -------
    dict
        ``{'pearson': r}`` where ``r`` is the correlation coefficient.
    """
    predictions, labels = tuple(eval_pred)[:2]
    # Flatten both sides so an (N, 1) prediction array is handled the same way
    # as an (N,) one; np.corrcoef cannot combine (N, 1) with (N,).
    predictions = np.asarray(predictions, dtype=float).reshape(-1)
    labels = np.asarray(labels, dtype=float).reshape(-1)
    return {'pearson': np.corrcoef(predictions, labels)[0][1]}

def get_trainer(train_dataset, eval_dataset):
    """Create a ``Trainer`` with a freshly loaded single-output model.

    Hyper-parameters are read from ``CFG``. Evaluation runs once per epoch
    with ``compute_metrics``, and checkpoints go to the ``outputs`` directory.
    """
    # Mixed precision is only requested when a CUDA device is available.
    use_fp16 = torch.cuda.is_available()
    training_args = TrainingArguments(
        'outputs',
        learning_rate=CFG.LR,
        warmup_ratio=0.1,
        lr_scheduler_type='cosine',
        fp16=use_fp16,
        evaluation_strategy="epoch",
        per_device_train_batch_size=CFG.BATCH_SIZE,
        per_device_eval_batch_size=CFG.BATCH_SIZE * 2,
        optim="adamw_torch",
        num_train_epochs=CFG.EPOCHS,
        weight_decay=CFG.WEIGHT_DECAY,
        report_to='none',
    )
    # num_labels=1 gives the model a single output value per example.
    regressor = AutoModelForSequenceClassification.from_pretrained(CFG.MODEL, num_labels=1)
    return Trainer(
        model=regressor,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

In [None]:
# Train one model per fold: rows of the current fold are held out for
# evaluation, all other rows are used for training.
# `trainer` is rebound on every iteration, so after the loop it refers to the
# trainer of the last fold only.
for fold in range(CFG.NUM_FOLDS):
    held_out = train_df.fold == fold
    train_folds = train_df[~held_out]
    eval_folds = train_df[held_out]

    train_ds = create_tokenized_ds_from_df(train_folds)
    eval_ds = create_tokenized_ds_from_df(eval_folds)

    trainer = get_trainer(train_ds, eval_ds)
    trainer.train()

In [None]:
# Raw test rows exactly as read from disk (no lower-casing, no `input` column).
test_df = pd.read_csv(f"{CFG.PATH}/test.csv")

In [None]:
def create_test_ds():
    """Load test.csv, apply the same text preparation as for training, and tokenize it."""
    raw = pd.read_csv(f"{CFG.PATH}/test.csv")
    # Same pipeline as the training frame: lower-case, then build `input`.
    prepared = sep(lowercase_df(raw))
    return create_tokenized_ds_from_df(prepared, isTest=True)
# Tokenized test set that is handed to `trainer.predict`.
test_ds = create_test_ds()

In [None]:
# `trainer` is whatever the cross-validation loop assigned last, so these
# predictions come from the final fold's model only.
pred = trainer.predict(test_ds)
pred_arr = np.array(pred[0])

# Pair every prediction with the id of the *test* row it belongs to.
# (Taking ids from train_df would label test predictions with training ids.)
# `test_ds` is built from the same test.csv in the same row order as `test_df`.
submissions = pd.DataFrame({
    'id': test_df['id'].to_numpy(),
    'score': pred_arr.reshape(-1),  # flatten (N, 1) model output to one score per row
})

In [None]:
# Write only the `id` and `score` columns; without index=False pandas would
# prepend the row index as an extra unnamed column.
submissions.to_csv('submission.csv', index=False)