# About this notebook
- CoCo-LM large inference notebook, adapted from the DeBERTa-v3-large starter code
- The pip wheels are [here](https://www.kaggle.com/code/yasufuminakama/pppm-pip-wheels)
- Training notebook is [here](https://www.kaggle.com/code/yasufuminakama/pppm-deberta-v3-large-baseline-w-w-b-train)

If this notebook is helpful, feel free to upvote :)

# Directory settings

In [None]:
# ====================================================
# Directory settings
# ====================================================
import os

# Competition data is read from INPUT_DIR; logs are written under OUTPUT_DIR.
INPUT_DIR = '../input/us-patent-phrase-to-phrase-matching/'
OUTPUT_DIR = './'
# exist_ok=True removes the check-then-create race and makes re-running
# this cell a no-op when the directory is already there.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# CFG

In [None]:
# ====================================================
# CFG
# ====================================================
class CFG:
    """Settings shared by the inference cells of this notebook."""

    # --- trained artifacts -------------------------------------------
    # Directory holding the fold checkpoints and cpc_texts.pth.
    path = "../input/pppm-cocolm-large-exp-2/"
    config_path = path + 'config.pth'
    # Model identifier; also used to build the checkpoint file names.
    model = "microsoft/cocolm-large"
    # Folds whose checkpoints are loaded and averaged.
    trn_fold = [0, 1, 2, 4, 6, 10, 12, 13, 18, 19]

    # --- data loading ------------------------------------------------
    num_workers = 4
    batch_size = 32
    # Token sequences are padded up to this length.
    max_len = 190

    # --- model head --------------------------------------------------
    # Dropout probability applied before the final linear layer.
    fc_dropout = 0.1
    # Number of outputs of the final linear layer.
    target_size = 1

    seed = 42

# Library

In [None]:
# ====================================================
# Library
# ====================================================
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import shutil
import string
import pickle
import random
import joblib
import itertools
from pathlib import Path
import warnings
warnings.filterwarnings("ignore")


import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

import torch
print(f"torch.__version__: {torch.__version__}")
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

# Remove any installed transformers/tokenizers, then install both from the
# wheels in the attached dataset. --no-index stops pip from consulting a
# package index, so the install relies only on the --find-links directory.
os.system('pip uninstall -y transformers')
os.system('pip uninstall -y tokenizers')
os.system('python -m pip install --no-index --find-links=../input/pppm-pip-wheels-dataset transformers')
os.system('python -m pip install --no-index --find-links=../input/pppm-pip-wheels-dataset tokenizers')
import tokenizers
import datasets, transformers
from transformers import TrainingArguments, Trainer
from transformers import AutoModelForSequenceClassification, AutoTokenizer

#os.environ["WANDB_DISABLED"] = "true"

print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
%env TOKENIZERS_PARALLELISM=true

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
sys.path.insert(1, '../input/cocolm/huggingface/')
 
from cocolm.modeling_cocolm import COCOLMModel, COCOLMPreTrainedModel
from cocolm.configuration_cocolm import COCOLMConfig
from cocolm.tokenization_cocolm import COCOLMTokenizer

# Utils

In [None]:
# ====================================================
# Utils
# ====================================================
def get_score(y_true, y_pred):
    """Return the Pearson correlation coefficient between two sequences.

    Args:
        y_true: 1-D sequence of reference values.
        y_pred: 1-D sequence of predicted values, same length as ``y_true``.

    Returns:
        The correlation coefficient (first element of the ``pearsonr``
        result).
    """
    # Import the function explicitly: a bare ``import scipy as sp`` does not
    # guarantee on every SciPy version that ``sp.stats`` has been loaded.
    from scipy.stats import pearsonr

    return pearsonr(y_true, y_pred)[0]


def get_logger(filename=OUTPUT_DIR+'train'):
    """Return the module logger writing bare messages to the console and a file.

    Args:
        filename: path of the log file without extension; ``.log`` is
            appended.

    Returns:
        The logger registered under ``__name__``, set to INFO level, with
        exactly one stream handler and one file handler attached.
    """
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    # getLogger returns the same object for the same name, so handlers left
    # over from a previous call (e.g. re-running the cell) would otherwise
    # pile up and emit every message several times.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()
    formatter = Formatter("%(message)s")
    for handler in (StreamHandler(), FileHandler(filename=f"{filename}.log")):
        handler.setFormatter(formatter)
        logger.addHandler(handler)
    return logger

LOGGER = get_logger()

def seed_everything(seed=42):
    """Seed the Python, NumPy and PyTorch (CPU and CUDA) generators.

    Also stores the seed in the ``PYTHONHASHSEED`` environment variable and
    sets the cuDNN ``deterministic`` flag.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for apply_seed in seeders:
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True

seed_everything(seed=42)

# OOF

In [None]:
# oof_df = pd.read_pickle(CFG.path+'oof_df.pkl')
# labels = oof_df['score'].values
# preds = oof_df['pred'].values
# score = get_score(labels, preds)
# LOGGER.info(f'CV Score: {score:<.4f}')

# Data Loading

In [None]:
# ====================================================
# Data Loading
# ====================================================
# Read the test set and the sample submission, report their shapes and
# preview the first rows of each.
test = pd.read_csv(f"{INPUT_DIR}test.csv")
submission = pd.read_csv(f"{INPUT_DIR}sample_submission.csv")
print(f"test.shape: {test.shape}")
print(f"submission.shape: {submission.shape}")
for _preview_frame in (test, submission):
    display(_preview_frame.head())

In [None]:
# ====================================================
# CPC Data
# ====================================================
# Load the saved CPC lookup and use it to translate each row's `context`
# value into a `context_text` column.
# NOTE(review): presumably a dict keyed by CPC code - confirm against the
# training notebook that produced cpc_texts.pth.
cpc_texts_file = f"{CFG.path}cpc_texts.pth"
cpc_texts = torch.load(cpc_texts_file)
test['context_text'] = test['context'].map(cpc_texts)
display(test.head())

In [None]:
# Earlier layouts, kept for reference:
# test['text'] = test['anchor'] + '[SEP]' + test['target'] + '[SEP]'  + test['context_text']
# test['text'] = '[CLS]' + test['anchor'] + '[cpc]' + test['context_text'] + '[SEP]'  + test['target'] + '[SEP]'
# Current layout: [CLS] anchor [SEP] target [SEP] context_text
sep_marker = '[SEP]'
anchor_part = '[CLS]' + test['anchor'] + sep_marker
target_part = test['target'] + sep_marker
test['text'] = anchor_part + target_part + test['context_text']
display(test.head())

# tokenizer

In [None]:
# ====================================================
# tokenizer
# ====================================================
# CFG.tokenizer = AutoTokenizer.from_pretrained('../input/pppm-deberta-v3-large-baseline-w-w-b-train/tokenizer/')
# Load the COCO-LM tokenizer from the attached dataset and store it on CFG,
# where the input-preparation code reads it as cfg.tokenizer.
CFG.tokenizer = COCOLMTokenizer.from_pretrained('../input/coco-lm-large/tokenizer/')

# Dataset

In [None]:
# ====================================================
# Dataset
# ====================================================
def prepare_input_CoCoLM(cfg, anchor, target, context_text):
    """Encode one (anchor, target, context) triple as a fixed-length id tensor.

    The ids are laid out as
    ``[CLS] anchor [SEP] target [SEP] context_text [SEP]`` and right-padded
    with the tokenizer's pad id up to ``cfg.max_len``. A sequence longer than
    ``cfg.max_len`` is cut to ``cfg.max_len`` with ``[SEP]`` kept in the last
    position, so every sample has the same length and batches can be stacked.

    Args:
        cfg: object exposing ``tokenizer`` (with ``encode``,
            ``cls_token_id``, ``sep_token_id``, ``pad_token_id``) and
            ``max_len``.
        anchor: anchor phrase.
        target: target phrase.
        context_text: text describing the context.

    Returns:
        ``{'input_ids': tensor}`` where the tensor is 1-D, ``torch.long`` and
        has ``cfg.max_len`` elements.
    """
    tokenizer = cfg.tokenizer
    sep_id = tokenizer.sep_token_id

    token_ids = [tokenizer.cls_token_id]
    for text in (anchor, target, context_text):
        token_ids.extend(tokenizer.encode(text, add_special_tokens=False))
        token_ids.append(sep_id)

    if len(token_ids) > cfg.max_len:
        # Without truncation over-long samples would produce tensors of
        # differing length that cannot be collated into one batch.
        token_ids = token_ids[:cfg.max_len - 1] + [sep_id]
    else:
        token_ids.extend([tokenizer.pad_token_id] * (cfg.max_len - len(token_ids)))

    return {'input_ids': torch.tensor(token_ids, dtype=torch.long)}


# '[CLS]' + train['anchor'] + '[SEP]' + train['target'] + '[SEP]'  + train['context_text'] + '[SEP]'
class CoCoLMTestDataset(Dataset):
    """Dataset that tokenizes (anchor, target, context_text) rows on access."""

    def __init__(self, cfg, df):
        self.cfg = cfg
        # Keep the three text columns as arrays for positional lookup.
        self.anchor, self.target, self.context_text = (
            df[column].values for column in ('anchor', 'target', 'context_text')
        )

    def __len__(self):
        return len(self.anchor)

    def __getitem__(self, item):
        row = (self.anchor[item], self.target[item], self.context_text[item])
        return prepare_input_CoCoLM(self.cfg, *row)

# Model

In [None]:
# ====================================================
# Model
# ====================================================

# Register the local weight file under the 'microsoft/cocolm-large' key of
# both classes' archive maps.
# NOTE(review): presumably this is what lets from_pretrained() resolve the
# model name without network access - confirm in the cocolm package.
_COCOLM_LARGE_WEIGHTS = '../input/cocolmlargeweight/pytorch_model.bin'
for _cocolm_cls in (COCOLMModel, COCOLMPreTrainedModel):
    _cocolm_cls.supported_convert_pretrained_model_archive_map['cocolm']['microsoft/cocolm-large'] = _COCOLM_LARGE_WEIGHTS


class CustomCoCoLMModel(nn.Module):
    """COCO-LM encoder with attention pooling and a linear output head.

    The encoder's first output is weighted by a small attention network
    (Linear -> Tanh -> Linear -> Softmax over dim 1), summed over dim 1, and
    passed through dropout and a linear layer with ``cfg.target_size``
    outputs.
    """

    def __init__(self, cfg, config_path=None, pretrained=False):
        """Build the encoder, the attention pooling network and the head.

        Args:
            cfg: settings object; ``fc_dropout`` and ``target_size`` are read.
            config_path: not used by this implementation; accepted so that
                existing call sites keep working.
            pretrained: not used by this implementation; accepted so that
                existing call sites keep working.
        """
        super().__init__()
        self.cfg = cfg

        self.config = COCOLMConfig.from_pretrained("../input/cocolmlargeweight/")
        self.model = COCOLMModel.from_pretrained("microsoft/cocolm-large", config=self.config, local_files_only=True)

        self.fc_dropout = nn.Dropout(cfg.fc_dropout)
        self.fc = nn.Linear(self.config.hidden_size, self.cfg.target_size)
        self._init_weights(self.fc)
        self.attention = nn.Sequential(
            nn.Linear(self.config.hidden_size, 512),
            nn.Tanh(),
            nn.Linear(512, 1),
            nn.Softmax(dim=1)
        )
        # nn.Sequential matches none of the isinstance branches in
        # _init_weights, so calling it on the container directly does nothing.
        # Module.apply visits every child, which initializes both Linear layers.
        self.attention.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialize a single Linear, Embedding or LayerNorm module in place.

        Linear/Embedding weights are drawn from a normal distribution with
        std ``config.initializer_range``; biases and the embedding padding
        row are zeroed; LayerNorm is reset to weight 1 and bias 0. Other
        module types are left untouched.
        """
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs):
        """Return the attention-pooled encoder output for a batch.

        Args:
            inputs: dict of keyword arguments forwarded to the encoder.

        Returns:
            The encoder's first output, weighted by the attention network
            and summed over dim 1.
            NOTE(review): assumes that output is (batch, seq_len, hidden) -
            confirm against COCOLMModel.
        """
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        # feature = torch.mean(last_hidden_states, 1)
        weights = self.attention(last_hidden_states)
        feature = torch.sum(weights * last_hidden_states, dim=1)
        return feature

    def forward(self, inputs):
        """Return the head output for a batch (dropout, then linear layer)."""
        feature = self.feature(inputs)
        output = self.fc(self.fc_dropout(feature))
        return output

# inference

In [None]:
# ====================================================
# inference
# ====================================================
def inference_fn(test_loader, model, device):
    """Run ``model`` over every batch of ``test_loader`` and stack the outputs.

    The model is put in eval mode and moved to ``device``; each batch dict
    has its tensors moved to ``device`` in place before the forward pass,
    which runs under ``torch.no_grad()``.

    Returns:
        A NumPy array holding the raw model outputs of all batches
        concatenated along the first axis.
    """
    model.eval()
    model.to(device)
    batch_outputs = []
    progress = tqdm(test_loader, total=len(test_loader))
    for inputs in progress:
        inputs.update({name: tensor.to(device) for name, tensor in inputs.items()})
        with torch.no_grad():
            y_preds = model(inputs)
        # Raw outputs are kept as-is (no sigmoid is applied).
        batch_outputs.append(y_preds.to('cpu').numpy())
    return np.concatenate(batch_outputs)

In [None]:
test_dataset = CoCoLMTestDataset(CFG, test)
test_loader = DataLoader(
    test_dataset,
    batch_size=CFG.batch_size,
    shuffle=False,
    num_workers=CFG.num_workers,
    pin_memory=True,
    drop_last=False,
)

# Checkpoint files are named after the model id with '/' replaced by '-'.
checkpoint_stem = CFG.path + CFG.model.replace('/', '-')

predictions1 = []
for fold in CFG.trn_fold:
    model = CustomCoCoLMModel(CFG, config_path=CFG.config_path, pretrained=False)
    state = torch.load(f"{checkpoint_stem}_fold{fold}_best.pth", map_location=torch.device('cpu'))
    model.load_state_dict(state['model'])
    prediction = inference_fn(test_loader, model, device)
    predictions1.append(prediction)
    # Drop this fold's objects before the next checkpoint is loaded.
    del model, state, prediction
    gc.collect()
    torch.cuda.empty_cache()

# Average the per-fold predictions element-wise.
predictions1 = np.mean(predictions1, axis=0)

In [None]:
def winsorizing(preds):
    """Clip the first element of every prediction row to the range [0, 1].

    Args:
        preds: iterable of indexable rows; only ``row[0]`` is read.

    Returns:
        A list with one value per row: 0 for values below 0, 1 for values
        above 1, and the value itself otherwise.
    """
    # Values above the upper bound are capped at 1 (they were previously
    # mapped to 0, which turned the highest predictions into the lowest).
    return [min(max(p[0], 0), 1) for p in preds]

# predictions1 = winsorizing(predictions1)

# Submission

In [None]:
# print(predictions1)

In [None]:
# predictions = []
# for p1, p2 in zip(predictions1, predictions2):
#     predictions.append((p1[0] + p2) / 2)

In [None]:
# Reload the sample submission, fill in the averaged predictions
# (an earlier variant assigned a blended `predictions` list instead),
# preview the result and write the id/score columns to submission.csv.
submission = pd.read_csv(f"{INPUT_DIR}sample_submission.csv")
submission['score'] = predictions1
display(submission.head())
submission.to_csv('submission.csv', columns=['id', 'score'], index=False)

In [None]:
# Print the final submission frame.
print(submission)