In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold
import shutil
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import datasets, transformers
from transformers import TrainingArguments, Trainer
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig, AutoModel

# Set WANDB_DISABLED before any model code runs. NOTE(review): Trainer is
# imported above but no visible cell calls it — confirm this is still needed.
os.environ["WANDB_DISABLED"] = "true"

# Config

In [None]:
class CFG_DEB_SIMPLE:
    """Static settings for the DeBERTa-v3-large "simple" model.

    The class is used as a plain namespace: attributes are read directly
    (``CFG_DEB_SIMPLE.batch_size``) and it is never instantiated in the
    visible cells.
    """

    # Data / checkpoint locations.
    input_path = '../input/us-patent-phrase-to-phrase-matching/'
    model_path = '../input/deberta-v3-large/deberta-v3-large'

    # Optimisation settings; no visible cell reads these three values.
    learning_rate = 2e-5
    weight_decay = 0.01
    epochs = 5

    # Settings read by the inference cells.
    num_fold = 4            # number of per-fold checkpoints to load
    batch_size = 64         # defined once (was previously assigned twice)
    max_input_length = 130  # tokenizer max_length / padding length
    num_workers = 2         # DataLoader worker processes


# Preprocessing

In [None]:
# Load the test pairs and the CPC title table.
test_df = pd.read_csv(f"{CFG_DEB_SIMPLE.input_path}test.csv")
titles = pd.read_csv('../input/cpc-codes/titles.csv')
# how='left' keeps every test row, in its original order, even when a context
# code has no match in titles.csv. The default inner join would silently drop
# such rows, and their ids would then be missing from the submission.
test_df = test_df.merge(titles, how='left', left_on='context', right_on='code')
# ====================================================
# CPC Data
# ====================================================
# NOTE(review): torch.load unpickles the file — only load trusted artefacts.
# cpc_texts is used below as a mapping from context code to descriptive text.
cpc_texts = torch.load("../input/folds-dump-the-two-paths-fix/cpc_texts.pth")
test_df['context_text'] = test_df['context'].map(cpc_texts)
# Model input: anchor, target and context text joined by the literal '[SEP]'.
test_df['text'] = test_df['anchor'] + '[SEP]' + test_df['target'] + '[SEP]'  + test_df['context_text']
# NOTE(review): lower-casing also turns the '[SEP]' markers into '[sep]';
# presumably this mirrors the training-time preprocessing — confirm before
# changing, since the checkpoints depend on identical input text.
test_df['text'] = test_df['text'].apply(str.lower)
display(test_df.head())

# Tokenizer

In [None]:
# Load the tokenizer from the same path that the config names as the model path.
tokenizer_deberta_v3 = AutoTokenizer.from_pretrained(CFG_DEB_SIMPLE.model_path)

# Dataset

In [None]:
class TestDataset(Dataset):
    """Map-style dataset that tokenizes one entry of ``df['text']`` per lookup.

    Args:
        df: frame providing a ``'text'`` column; its values are cast to ``str``.
        tokenizer: callable invoked as
            ``tokenizer(text, max_length=..., padding='max_length', truncation=True)``.
            Its result must be indexable by ``'input_ids'``,
            ``'token_type_ids'`` and ``'attention_mask'``.
        max_input_length: value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, df, tokenizer, max_input_length):
        self.max_input_length = max_input_length
        self.tokenizer = tokenizer
        # Cast up front so every lookup hands the tokenizer a string.
        self.text = df['text'].values.astype(str)

    def __len__(self):
        return len(self.text)

    def __getitem__(self, item):
        """Return ``(input_ids, token_type_ids, attention_mask)`` as long tensors."""
        encoded = self.tokenizer(
            self.text[item],
            max_length=self.max_input_length,
            padding='max_length',
            truncation=True,
        )
        # The field order here is the order valid_fn-style consumers unpack.
        return tuple(
            torch.as_tensor(encoded[field], dtype=torch.long)
            for field in ('input_ids', 'token_type_ids', 'attention_mask')
        )

## Model

In [None]:
class Custom_Bert_Simple(nn.Module):
    """Single-output regression wrapper around a sequence-classification model.

    Args:
        model_path: location passed to ``AutoConfig.from_pretrained``; the
            backbone is then built from that config with ``num_labels = 1``.
    """

    def __init__(self, model_path):
        super().__init__()

        cfg = AutoConfig.from_pretrained(model_path)
        cfg.num_labels = 1
        # Built from the config only; weights are expected to come from a
        # later load_state_dict call.
        self.base = AutoModelForSequenceClassification.from_config(config=cfg)
        # `dropout` and `cls` are not used by forward(). They are kept because
        # `cls` adds entries to the state dict — presumably the saved
        # checkpoints contain those keys; verify before removing.
        self.dropout = nn.Dropout(p=0)
        self.cls = nn.Linear(cfg.hidden_size, 1)

    def forward(self, input_ids, attention_mask, token_type_ids, labels=None):
        """Return the backbone's first output, plus an MSE loss when labelled.

        Returns:
            ``output`` when ``labels`` is None, otherwise the tuple
            ``(mse_loss, output)`` where the loss compares ``output`` squeezed
            on dim 1 against ``labels``.
        """
        output = self.base(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )[0]

        if labels is None:
            return output

        criterion = nn.MSELoss()
        return criterion(output.squeeze(1), labels), output

# Inference

In [None]:
def valid_fn(valid_loader, model, device):
    """Run ``model`` over every batch of ``valid_loader`` and stack the outputs.

    Args:
        valid_loader: iterable of batches; each batch holds three tensors in
            the order ``(input_ids, token_type_ids, attention_mask)``.
        model: module called as ``model(input_ids, attention_mask,
            token_type_ids)`` and returning a tensor. It is switched to eval
            mode and left that way.
        device: device the batch tensors are moved to before the forward pass.

    Returns:
        numpy.ndarray: per-batch outputs concatenated along the first axis.

    Raises:
        ValueError: from ``np.concatenate`` when the loader yields no batches.
    """
    model.eval()
    preds = []
    # Inference only: a single no_grad context covers the whole pass.
    with torch.no_grad():
        for batch in valid_loader:
            # Batch order differs from the model's positional order, so the
            # tensors are unpacked by name and re-ordered in the call below.
            input_ids, token_type_ids, attention_mask = [t.to(device) for t in batch]
            y_preds = model(input_ids, attention_mask, token_type_ids)
            preds.append(y_preds.to('cpu').numpy())
    return np.concatenate(preds)

## DeBERTa simple

In [None]:
# Collect one min-max-scaled prediction vector per fold checkpoint.
predictions = []
MMscaler = MinMaxScaler()
# Choose the device once; fall back to CPU when no GPU is visible.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
te_dataset = TestDataset(test_df, tokenizer_deberta_v3, CFG_DEB_SIMPLE.max_input_length)
# shuffle=False and drop_last=False keep predictions aligned with test_df rows.
te_dataloader = DataLoader(te_dataset,
                           batch_size=CFG_DEB_SIMPLE.batch_size * 2,
                           shuffle=False,
                           num_workers=CFG_DEB_SIMPLE.num_workers,
                           pin_memory=(device == 'cuda'),
                           drop_last=False)
for fold in tqdm(range(CFG_DEB_SIMPLE.num_fold)):

    model = Custom_Bert_Simple(CFG_DEB_SIMPLE.model_path)
    # map_location='cpu' restores the tensors on CPU regardless of the device
    # they were saved from; the model is moved to `device` afterwards.
    checkpoint = torch.load(
        '../input/us-patent-deberta-simple/microsoft_deberta-v3-large_best{}.pth'.format(fold),
        map_location='cpu')
    model.load_state_dict(checkpoint['model'])
    model.to(device)

    outputs = valid_fn(te_dataloader, model, device)
    prediction = outputs.reshape(-1)
    # The scaler is re-fitted on each fold's own predictions, so every fold is
    # rescaled by its own min and max before averaging.
    predictions.append(MMscaler.fit_transform(prediction.reshape(-1,1)).reshape(-1))
    

In [None]:
# Sanity check: number of per-fold prediction vectors collected by the loop above.
len(predictions)

## Post-processing

In [None]:
# Average the per-fold vectors element-wise, then clamp every score into [0, 1].
fold_mean = np.mean(predictions, axis=0)
predictions = np.where(fold_mean >= 1, 1, np.where(fold_mean <= 0, 0, fold_mean))

# Pair each test id with its final score.
submission_columns = {'id': test_df['id'], 'score': predictions}
submission = pd.DataFrame(submission_columns)

submission

In [None]:
# Write the id/score table to submission.csv, omitting the DataFrame index.
submission.to_csv('submission.csv', index=False)