## Iterate like a grandmaster

In [None]:
!pip install -q datasets

from torch.utils.data import DataLoader
import warnings,transformers,logging,torch
from transformers import TrainingArguments,Trainer
from transformers import AutoModelForSequenceClassification,AutoTokenizer
from transformers import DataCollatorWithPadding  
import os
import logging.config
from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
import datasets
from datasets import load_dataset, Dataset, DatasetDict
from fastai.imports import *

# Quieten the notebook output: ignore every Python warning, and drop all log
# records of severity WARNING and below for the whole process.
# NOTE: logging.disable() takes precedence over each logger's own level, so
# INFO records emitted through any logger are suppressed as well.
warnings.simplefilter(action='ignore')
logging.disable(logging.WARNING)

def seed_everything(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy's global generator and PyTorch (CPU and
    all CUDA devices), exports ``PYTHONHASHSEED`` and puts cuDNN into
    deterministic mode.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    # Only affects interpreters started after this point (e.g. worker processes).
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one; a no-op without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner may pick different kernels from run to run, which
    # defeats the deterministic flag above, so keep it switched off.
    torch.backends.cudnn.benchmark = False
    
def get_logger(filename='train'):
    """Return this module's logger, writing to the console and to a log file.

    The logging configuration is reset first (which disables every logger
    that already exists), then the module logger is set to INFO and given a
    stream handler and a file handler that share one format.

    The function is safe to call more than once: the logger is re-enabled
    after the reset and handlers from a previous call are replaced instead of
    being stacked, so each record is written exactly once.

    Args:
        filename: Log file name without extension; ``.log`` is appended.

    Returns:
        The configured ``logging.Logger`` for this module.
    """
    logging.config.dictConfig({
        'version': 1,
        'disable_existing_loggers': True
    })

    logger = getLogger(__name__)
    # On a repeat call this logger already existed when dictConfig ran and was
    # therefore disabled by it; switch it back on.
    logger.disabled = False
    logger.setLevel(INFO)

    # Replace handlers left over from an earlier call: keeping them would
    # duplicate every record and leak the previously opened log file.
    for stale in list(logger.handlers):
        logger.removeHandler(stale)
        stale.close()

    formatter = Formatter("%(asctime)s | %(levelname)s | %(message)s")
    for handler in (StreamHandler(), FileHandler(filename=f"{filename}.log")):
        handler.setFormatter(formatter)
        logger.addHandler(handler)
    return logger
    
# Fix all RNG seeds with the default seed, then create the shared logger only
# if this cell has not already defined it (re-running the cell keeps the
# existing LOGGER).
seed_everything()
if 'LOGGER' not in globals():
    LOGGER = get_logger()

# Optimisation hyper-parameters.
lr = 2e-5
bs = 32
wd = 0.01
epochs = 5

# Local path of the pretrained checkpoint used for both tokenizer and model.
model_nm = '../input/deberta-v3-large/deberta-v3-large'

# Raw text columns; they are dropped from the dataset once tokenized.
inps = ("anchor", "target", "context")

# Competition data files.
test_data_path = '../input/us-patent-phrase-to-phrase-matching/test.csv'
train_data_path = '../input/us-patent-phrase-to-phrase-matching/train.csv'

## Dataset Prep Methods

In [None]:
# Human-readable name for each section letter. process_context looks up the
# first character of a row's `context` code in this table and falls back to
# an empty string for letters that are not listed.
context_mapping = {
    "A": "Human Necessities",
    "B": "Operations and Transport",
    "C": "Chemistry and Metallurgy",
    "D": "Textiles",
    "E": "Fixed Constructions",
    "F": "Mechanical Engineering",
    "G": "Physics",
    "H": "Electricity",
    "Y": "Emerging Cross-Sectional Technologies",
}

def process_context(df, cpc_titles_path='../input/cpc-codes/titles.csv'):
    """Attach section names and code titles to every row of ``df``.

    A copy of ``df`` gains a lower-cased ``context_category`` column (looked
    up from the first character of ``context`` in ``context_mapping``; empty
    string when the letter is unknown) and the columns of the titles CSV,
    left-joined on ``context`` == ``code``. ``title`` is lower-cased, the
    join key ``code`` is dropped and ``class`` is cast to int.

    Args:
        df: Frame with a string ``context`` column. It is not modified.
        cpc_titles_path: CSV providing at least ``code``, ``title`` and
            ``class`` columns.

    Returns:
        The enriched copy of ``df``.

    Raises:
        ValueError: If some ``context`` values have no title after the join.
            Previously this surfaced as an opaque ``AttributeError`` from
            calling ``.lower()`` on NaN.
    """
    cpc_codes_df = pd.read_csv(cpc_titles_path)

    df = df.copy()
    # Vectorised string ops tolerate missing/empty values, unlike x[0] / x.lower().
    section_names = df['context'].str[0].map(context_mapping).fillna('')
    df['context_category'] = section_names.str.lower()

    df = df.merge(cpc_codes_df, left_on='context', right_on='code', how='left')

    # A left join leaves NaN titles for codes absent from the CSV; fail with
    # a message that names them instead of crashing further down.
    unmatched = df.loc[df['title'].isna(), 'context'].unique()
    if len(unmatched):
        raise ValueError(
            f"No title found in {cpc_titles_path!r} for context code(s): "
            f"{sorted(map(str, unmatched))}"
        )

    df['title'] = df['title'].str.lower()
    df = df.drop('code', axis=1)
    df['class'] = df['class'].astype('int')
    return df

def get_data_df(df_path):
    """Load a CSV and build the text column that is fed to the tokenizer.

    The frame is read from ``df_path``, enriched by ``process_context`` and
    extended with three columns:

    * ``section`` - first character of ``context``
    * ``sectok``  - the section wrapped in square brackets
    * ``inputs``  - sectok, context_category, title, anchor and target joined
      with the separator ``" [s] "``

    Args:
        df_path: Path of the CSV file to load.

    Returns:
        The processed DataFrame.
    """
    frame = process_context(pd.read_csv(df_path))

    frame['section'] = frame['context'].str[0]
    frame['sectok'] = '[' + frame['section'] + ']'

    sep = " [s] "
    text = frame['sectok']
    for column in ('context_category', 'title', 'anchor', 'target'):
        text = text + sep + frame[column]
    frame['inputs'] = text
    # A blanket `df.inputs.str.lower()` was left disabled in the original code.
    return frame

def get_train_test_split(df, val_prop=0.25):
    """Split row positions into train/validation sets grouped by anchor.

    Unique anchors are shuffled with NumPy's global random generator and the
    first ``int(n_anchors * val_prop)`` of them are assigned to validation,
    so all rows sharing an anchor land on the same side of the split.

    Args:
        df: Frame with an ``anchor`` column.
        val_prop: Fraction of unique anchors (not rows) used for validation.

    Returns:
        ``(trn_idxs, val_idxs)``: two arrays of positional row indices.

    Raises:
        ValueError: If ``val_prop`` is outside ``[0, 1]``.
    """
    if not 0 <= val_prop <= 1:
        raise ValueError(f"val_prop must be between 0 and 1, got {val_prop}")

    # Shuffle a private copy so the in-place shuffle always has a writable
    # array to work on, whatever unique() returned.
    anchors = np.array(df.anchor.unique())
    np.random.shuffle(anchors)

    val_sz = int(len(anchors) * val_prop)
    val_anchors = anchors[:val_sz]

    is_val = np.isin(df.anchor, val_anchors)
    idxs = np.arange(len(df))
    return idxs[~is_val], idxs[is_val]

def tok_func(x):
    """Tokenize the ``inputs`` field of ``x`` with the module-level ``tokz``.

    Padding and truncation are both enabled. ``tokz`` is read as a global, so
    it has to be assigned before this function is called.
    """
    texts = x["inputs"]
    return tokz(texts, truncation=True, padding=True)

def get_dds(df, trn_idxs=None, val_idxs=None):
    """Tokenize ``df`` and wrap the result in a ``DatasetDict``.

    A ``score`` column, when present, is renamed to ``label``. The raw text
    and helper columns are removed after tokenization.

    Args:
        df: Frame holding an ``inputs`` column (as built by ``get_data_df``).
        trn_idxs: Row positions for the training split, or ``None``.
        val_idxs: Row positions for the validation split; used only when
            ``trn_idxs`` is given.

    Returns:
        ``DatasetDict`` with ``train``/``val`` splits when ``trn_idxs`` is
        given, otherwise with a single ``test`` split holding every row.
    """
    ds = Dataset.from_pandas(df)
    if 'score' in df.columns:
        ds = ds.rename_column('score', 'label')

    droppable = inps + ('__index_level_0__', 'inputs', 'id', 'sectok', 'context_category',
                        'title', 'section', 'class', 'subclass', 'group', 'main_group')
    # Dataset.map raises when asked to remove a column that does not exist.
    # '__index_level_0__' in particular is only produced when the frame's
    # index is not a plain RangeIndex, so drop only what is actually there.
    present = set(ds.column_names)
    remove_columns = [col for col in droppable if col in present]
    tok_ds = ds.map(tok_func, batched=True, remove_columns=remove_columns)

    if trn_idxs is None:
        return DatasetDict({'test': tok_ds})
    return DatasetDict({"train": tok_ds.select(trn_idxs), "val": tok_ds.select(val_idxs)})

## Fit and Inference Methods

In [None]:
def corr(data):
    """Compute the Pearson correlation between two sets of values.

    Args:
        data: A pair that unpacks into two array-likes; size-1 axes are
            squeezed away before the correlation is taken.

    Returns:
        ``{'pearson': r}`` where ``r`` is the off-diagonal entry of the 2x2
        correlation matrix.
    """
    first, second = data
    matrix = np.corrcoef(np.squeeze(first), np.squeeze(second))
    return {'pearson': matrix[0, 1]}

def get_model():
    """Load the sequence-classification model stored at ``model_nm``.

    The model is created with ``num_labels=1``.
    """
    loaded = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels=1)
    return loaded

def get_tokz():
    """Load the tokenizer stored alongside the model at ``model_nm``."""
    tokenizer = AutoTokenizer.from_pretrained(model_nm)
    return tokenizer

def get_trainer(dds, model, tokz):
    """Build a ``Trainer`` for ``model`` on the train/val splits of ``dds``.

    Hyper-parameters come from the module-level ``lr``, ``bs``, ``wd`` and
    ``epochs``. Evaluation, checkpointing and logging all happen once per
    epoch; the checkpoint with the highest ``pearson`` metric (as computed by
    ``corr``) is reloaded at the end, and only one checkpoint is kept.

    Args:
        dds: Mapping with ``'train'`` and ``'val'`` datasets.
        model: Model to fine-tune.
        tokz: Tokenizer handed to the ``Trainer``.

    Returns:
        The configured ``Trainer`` (training is not started here).
    """
    settings = {
        # optimisation
        'learning_rate': lr,
        'weight_decay': wd,
        'optim': "adamw_torch",
        'warmup_ratio': 0.1,
        'lr_scheduler_type': 'cosine',
        'num_train_epochs': epochs,
        'fp16': True,
        # batching: evaluation uses twice the training batch size
        'per_device_train_batch_size': bs,
        'per_device_eval_batch_size': bs*2,
        # per-epoch bookkeeping
        'evaluation_strategy': "epoch",
        'save_strategy': "epoch",
        'logging_strategy': "epoch",
        'report_to': 'none',
        # model selection
        'load_best_model_at_end': True,
        'metric_for_best_model': "pearson",
        'greater_is_better': True,
        'save_total_limit': 1,
    }
    training_args = TrainingArguments('outputs', **settings)

    return Trainer(
        model,
        training_args,
        train_dataset=dds['train'],
        eval_dataset=dds['val'],
        tokenizer=tokz,
        compute_metrics=corr,
    )

def model_predict(dds, tokenizer, batch_size=32):
    """Predict with the module-level ``model`` on ``dds['test']``.

    Batches are padded by ``DataCollatorWithPadding(tokenizer)`` and moved to
    whichever device the model's parameters live on. The model is put into
    eval mode for the duration of the call and its previous train/eval mode
    is restored afterwards.

    Args:
        dds: Mapping with a ``'test'`` dataset of tokenized examples.
        tokenizer: Tokenizer used to pad each batch.
        batch_size: Number of examples per forward pass.

    Returns:
        1-D NumPy array with all logits flattened in dataset order.
    """
    loader = DataLoader(dds['test'], batch_size=batch_size,
                        collate_fn=DataCollatorWithPadding(tokenizer))

    # Follow the model instead of assuming 'cuda', so CPU-only runs work too.
    device = next(model.parameters()).device

    was_training = model.training
    # Make sure dropout is off while predicting, whatever ran before.
    model.eval()
    all_preds = []
    try:
        with torch.no_grad():
            for batch in loader:
                batch = {k: v.to(device) for k, v in batch.items()}
                all_preds.extend(model(**batch).logits.cpu().tolist())
    finally:
        model.train(was_training)

    return np.array(all_preds).reshape(-1)

## Run it

In [None]:
# End-to-end run: build features, tokenize, fine-tune, predict, write submission.
# Statement order matters here because tok_func and model_predict read the
# module-level names `tokz` and `model` respectively.
train_df = get_data_df(train_data_path)
test_df = get_data_df(test_data_path)

# `tokz` has to be assigned before get_dds is called (tok_func uses it as a global).
tokz = get_tokz()
# Register every bracketed section token seen in the training data as an
# additional special token. Only the training frame is consulted.
sectoks = list(train_df.sectok.unique())
tokz.add_special_tokens({'additional_special_tokens': sectoks})

# Split by anchor; the indices are positions into train_df.
trn_idxs, val_idxs = get_train_test_split(train_df)

dds = get_dds(train_df, trn_idxs, val_idxs)
test_dds = get_dds(test_df)

model = get_model()
# The tokenizer grew by the section tokens above, so the embedding matrix
# is resized to the new vocabulary size.
model.resize_token_embeddings(len(tokz))
    
trainer = get_trainer(dds, model=model,  tokz=tokz)
trainer.train()

# model_predict uses the module-level `model` assigned above.
submission_preds = model_predict(test_dds, tokz)

# Ids are re-read from the raw CSV rather than taken from test_df.
# NOTE(review): this lines up with submission_preds only if get_data_df kept
# the row order and row count of the file - confirm.
submission = datasets.Dataset.from_dict({
    'id': pd.read_csv(test_data_path).id.values.tolist(),
    'score': submission_preds,
})

submission.to_csv('submission.csv', index=False)

In [None]:
# Display the processed test frame as the cell output.
test_df

# Disabled alternative inference path: load tokenizer and model from a saved
# checkpoint directory and predict without training in this session.
#tokz = AutoTokenizer.from_pretrained('../input/condensed/outputs/checkpoint-4275')
#test_df = get_data_df(test_data_path)
#test_dds = get_dds(test_df)
#model = AutoModelForSequenceClassification.from_pretrained('../input/condensed/outputs/checkpoint-4275', num_labels=1).to('cuda')
#submission_preds = model_predict(test_dds, tokz)

In [None]:
# Share of null entries per column of the processed test frame, in percent,
# listed from the least to the most incomplete column.
percent_missing = test_df.isnull().sum() * 100 / len(test_df)
missing_value_df = pd.DataFrame(
    {
        'column_name': test_df.columns,
        'percent_missing': percent_missing,
    }
).sort_values('percent_missing')

missing_value_df