In [None]:
!pip install -q /kaggle/input/coleridge-packages/seqeval-1.2.2-py3-none-any.whl
!pip install -q simpletransformers==0.63.4 --no-index --find-links=file:///kaggle/input/simpletransformers-pip-download/simpletransformers-0.63.4/

In [None]:
from simpletransformers.classification import (
    ClassificationModel, ClassificationArgs
)

import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, KFold

import logging
# Root logger reports INFO and above; the "transformers" logger is limited
# to WARNING and above. Levels are given by name, which the logging module
# resolves to the same numeric constants.
logging.basicConfig(level="INFO")
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel("WARNING")

In [None]:
# Read the training CSV, then show its (rows, columns) shape and first rows.
train_csv_path = '../input/us-patent-phrase-to-phrase-matching/train.csv'
train_df = pd.read_csv(train_csv_path)
print(train_df.shape)
display(train_df.head(5))

In [None]:
# Read the test CSV, then show its (rows, columns) shape and first rows.
test_csv_path = '../input/us-patent-phrase-to-phrase-matching/test.csv'
test_df = pd.read_csv(test_csv_path)
print(test_df.shape)
display(test_df.head(5))

In [None]:
# Keep only the columns used for modelling and rename them to the
# text_a / text_b / labels names that the training and prediction cells use.
# NOTE: this cell rebinds train_df / test_df, so re-running it without first
# re-running the loading cells raises KeyError ('anchor' no longer exists).
train_df = train_df[['anchor', 'target', 'score']].copy()
train_df.columns = ["text_a", "text_b", "labels"]

test_df = test_df[['anchor', 'target']].copy()
test_df.columns = ["text_a", "text_b"]

# Score -> class index. Class k corresponds to score k / 4, which is how the
# predictions are converted back into scores for the submission.
label_map = {
    0.00: 0,
    0.25: 1,
    0.50: 2,
    0.75: 3,
    1.00: 4
}

mapped_labels = train_df['labels'].map(label_map)

# Series.map silently yields NaN for any value that is not a key of
# label_map (which would also turn the column into floats). Fail loudly
# here instead of training on corrupted labels.
unmapped = mapped_labels.isna()
if unmapped.any():
    unexpected = sorted(train_df.loc[unmapped, 'labels'].dropna().unique().tolist())
    raise ValueError(
        f"{int(unmapped.sum())} training rows have a score that label_map "
        f"does not cover (missing scores count too); unexpected values: {unexpected}"
    )

train_df['labels'] = mapped_labels.astype(int)

In [None]:
# Training configuration shared by every model built from `model_args`.
model_args = ClassificationArgs()

# Fixed seed so that repeated runs of the notebook are comparable; without
# it every run trains from a different random state.
model_args.manual_seed = 42

# Sequence length, number of epochs and batch sizes.
model_args.max_seq_length = 32
model_args.num_train_epochs = 3
model_args.train_batch_size = 128
model_args.eval_batch_size = 256

# Saving-related options: nothing is meant to be kept on disk, and an
# existing output directory may be overwritten.
model_args.no_save = True
model_args.save_model_every_epoch = False
model_args.save_steps = -1
model_args.overwrite_output_dir = True

In [None]:
# Stratified K-fold training: one model per fold, each evaluated on its
# held-out part of train_df, with the test-set outputs averaged over folds.
N_FOLDS = 5
CV_SEED = 42  # fixes the fold assignment so runs are reproducible

# One row per test example, one column per class (width follows label_map
# so it cannot drift from num_labels below).
test_probs = np.zeros((len(test_df), len(label_map)))

folds = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=CV_SEED)
for fold, (trn_ind, val_ind) in enumerate(folds.split(train_df, train_df['labels'])):
    # split() yields positional indices, so select rows with .iloc; .loc
    # only gives the same rows while train_df has a default RangeIndex.
    trn_df = train_df.iloc[trn_ind].reset_index(drop=True)
    val_df = train_df.iloc[val_ind].reset_index(drop=True)

    # A fresh model is created for every fold from the same base weights.
    model = ClassificationModel("deberta", "../input/deberta/base",
                                num_labels=len(label_map), args=model_args)
    model.train_model(trn_df)
    result, _, _ = model.eval_model(val_df)
    print(result)

    # NOTE(review): the second value returned by predict() is documented by
    # simpletransformers as the raw model outputs, not softmax probabilities;
    # confirm before treating test_probs as calibrated probabilities.
    _, probs = model.predict(test_df[['text_a', 'text_b']].values.tolist())
    test_probs += probs / folds.n_splits

In [None]:
# Start from the sample submission file and fill in the predicted scores.
sample_submission_path = '../input/us-patent-phrase-to-phrase-matching/sample_submission.csv'
sub = pd.read_csv(sample_submission_path)

# Highest-scoring class per test row, converted back to a score:
# class k maps to k / 4 (the inverse of label_map).
predicted_class = test_probs.argmax(axis=1)
sub['score'] = predicted_class / 4
sub

In [None]:
# Write the submission file without the DataFrame index column.
submission_path = 'submission.csv'
sub.to_csv(submission_path, index=False)