In [None]:
import pkgutil
# Offline environment bootstrap. "hydra" is used as the sentinel package: when it
# cannot be found, the wheels bundled under ../input/uspppm-pip-wheels are installed.
import importlib.util


def _install_offline_wheels():
    """Reinstall `transformers` and install the remaining wheels without network access.

    Failures are reported but do not raise, so the notebook keeps the original
    best-effort behaviour of continuing after a failed pip call.
    """
    import subprocess
    import sys

    wheels_dir = "../input/uspppm-pip-wheels"
    # Run pip through the kernel's own interpreter so that packages land in the
    # environment this notebook actually imports from.
    pip = [sys.executable, "-m", "pip"]
    commands = [pip + ["uninstall", "-y", "transformers"]]
    commands += [
        pip + ["install", "--no-index", f"--find-links={wheels_dir}", package]
        for package in ("transformers", "datasets", "sentencepiece", "hydra-core", "slackclient")
    ]
    for command in commands:
        result = subprocess.run(command)
        if result.returncode != 0:
            print(f"WARNING: `{' '.join(command)}` exited with code {result.returncode}")


# `pkgutil.find_loader` is deprecated (removed in Python 3.14);
# `importlib.util.find_spec` is the supported way to probe for a module.
check_module = importlib.util.find_spec("hydra") is not None
if not check_module:
    _install_offline_wheels()
else:
    print("Environment is already setup")

del check_module

# Configuration

In [None]:
import os

# Run-mode switch and inference settings.
DEBUG = False    # True: score a slice of train.csv instead of test.csv and skip writing submission.csv
BATCH_SIZE = 64  # forwarded to infer_one_experiment as `batch_size`
N_FOLDS = 5      # forwarded to infer_one_experiment as `n_folds`

# ----------
# Root paths
# ----------
_KAGGLE_INPUT = "/kaggle/input"
DATA_DIR = f"{_KAGGLE_INPUT}/us-patent-phrase-to-phrase-matching/"
INPUT_DIR = f"{_KAGGLE_INPUT}/uspppm-data"
CODE_DIR = f"{_KAGGLE_INPUT}/uspppm-source-code"

# Imports

In [None]:
import re
import gc
gc.enable()
import sys
import importlib
import logging
import warnings
from pprint import pprint
from typing import Dict
from omegaconf import OmegaConf

sys.path.append("../input/uspppm-source-code")
sys.path.append("../input/uspppm-source-code/src")

import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from pprint import pprint
import transformers
import datasets
import tokenizers
from datasets import Dataset
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)

# from source-code
from data.cpc_texts import get_cpc_texts
from data.dataset import tokenize_func, prepare_data, tokenize_func_transformer_head, create_folds
from extensions.scoring import pearsonr, post_process_predictions, scale_predictions
from utils import seed_everything

from src.cocolm.configuration_cocolm import COCOLMConfig
from src.cocolm.tokenization_cocolm import COCOLMTokenizer
from src.modeling.models.cocolm.cocolm import COCOLMForSequenceClassification

# Seed through the project's helper before any model code runs.
seed_everything(42)

# Report the versions of the Hugging Face libraries that were picked up.
for lib_label, lib_module in (
    ("transformers", transformers),
    ("datasets", datasets),
    ("tokenizers", tokenizers),
):
    print(f"-> {lib_label} version: {lib_module.__version__}")

# Read OOF files

In [None]:
import glob
# Every entry under ../input whose name contains "checkpoints" is a checkpoint root.
checkpoint_root_dirs = [
    f"../input/{entry}" for entry in os.listdir("../input") if "checkpoints" in entry
]

# Collect the experiment folders inside each root. Entries with a "." in their
# name are skipped (a quick heuristic; it does not actually test for directories).
CKPTS = np.sort(
    [
        f"{root}/{child}"
        for root in checkpoint_root_dirs
        for child in os.listdir(root)
        if "." not in child
    ]
)
# CKPTS = [ck for ck in CKPTS if "coco" not in ck]
print(f"We have {len(CKPTS)} experiments for ensemble\n")
pprint([ckpt.split('/')[-1] for ckpt in CKPTS])

# Map experiment name -> path of its OOF csv, then load every OOF file in that order.
OOF = {}
for ckpt in CKPTS:
    exp_id = ckpt.split('/')[-1]
    OOF[f"{exp_id}"] = f"{ckpt}/{exp_id}_oof.csv"
OOF_CSV = [pd.read_csv(csv_path) for csv_path in OOF.values()]

In [None]:
# OOF prediction matrix: one row per OOF sample, one column per experiment
# (columns follow the order of `OOF` / `OOF_CSV`).
n_rows, n_models = len(OOF_CSV[0]), len(OOF)
x = np.zeros((n_rows, n_models))
for col, oof_df in zip(range(n_models), OOF_CSV):
    x[:, col] = scale_predictions(oof_df['preds'].to_numpy())

# Targets are read from the first OOF file only.
# NOTE(review): this presumes every OOF file lists the rows in the same order — confirm.
TRUE = OOF_CSV[0]['score'].to_numpy()

In [None]:
# Score each experiment's OOF predictions on its own.
target = OOF_CSV[0]['score'].to_numpy()
all_scores = []
for k, model_preds in enumerate(x.T):
    pscore = pearsonr(target, model_preds)
    all_scores.append(pscore)
    print('Model %i has OOF Pearsonr = %.4f'%(k,pscore))

# The best single model seeds the ensemble; `w` collects the blend weight of
# every model added after it.
m = [np.argmax(all_scores)]
w = []

# Hill Climbing Ensemble on OOF

In [None]:
# Greedy forward selection ("hill climbing") over the OOF prediction matrix `x`.
# Starting from the best single model, each round tries blending every candidate
# model into the current ensemble over a grid of weights and keeps the candidate
# that yields the highest Pearson correlation with `TRUE`.
#
# State:
#   m   - column indices of the selected models, in the order they were added
#   w   - w[i] is the weight given to model m[i+1] when it was blended in
#   old - score of the current ensemble

RES = 200           # weight grid resolution: candidate weights are j/RES for j in range(RES)
PATIENCE = 10       # stop scanning a candidate's weights after more than this many non-improving steps
TOL = 0.0003        # a round must improve the score by more than this to be accepted
DUPLICATES = False  # if True, a model already in the ensemble may be selected again

# Re-seed the search state in this cell too. Without this, re-running the cell
# resets `old` but keeps the previous run's `m` / `w`, so the search continues
# from an already-built ensemble and reports bogus increases.
m = [np.argmax(all_scores)]; w = []
old = np.max(all_scores)

print('Ensemble Pearsonr = %.5f by beginning with model %i'%(old,m[0]))
print()

# At most one model is added per round, so len(OOF) rounds is an upper bound.
for kk in range(len(OOF)):
    
    # BUILD CURRENT ENSEMBLE
    # Replay the accepted blends in order: each step mixes the next model into
    # the running ensemble with its stored weight.
    md = x[:,m[0]]
    for i,k in enumerate(m[1:]):
        md = w[i]*x[:,k] + (1-w[i])*md
        
    # FIND MODEL TO ADD
    # mx / mx_k / mx_w: best score seen this round, and the model and weight that produced it.
    mx = 0; mx_k = 0; mx_w = 0
    print('Searching for best model to add... ')
    
    # TRY ADDING EACH MODEL
    for k in range(x.shape[1]):
        print(k,', ',end='')
        if not DUPLICATES and (k in m): continue
            
        # EVALUATE ADDING MODEL K WITH WEIGHTS W
        # `ct` counts every non-improving weight (not only consecutive ones).
        bst_j = 0; bst = 0; ct = 0
        for j in range(RES):
            tmp = j/RES*x[:,k] + (1-j/RES)*md
            pear_score = pearsonr(TRUE,tmp)
            if pear_score>bst:
                bst = pear_score
                bst_j = j/RES
            else: ct += 1
            if ct>PATIENCE: break
        if bst>mx:
            mx = bst
            mx_k = k
            mx_w = bst_j
            
    # STOP IF INCREASE IS LESS THAN TOL
    inc = mx-old
    if inc<=TOL: 
        print(); print('No increase. Stopping.')
        break
        
    # DISPLAY RESULTS
    print();
    print('Ensemble Pearsonr = %.4f after adding model %i with weight %.3f. Increase of %.4f'%(mx,mx_k,mx_w,inc))
    print()
    
    # Accept the winner of this round.
    old = mx; m.append(mx_k); w.append(mx_w)

In [None]:
# Recap of the hill-climbing result: selected model indices, their blend
# weights, the tolerance used and the final OOF score.
n_selected = len(m)
print(f'We are using models (total {n_selected})', m)
print('with weights', w)
print(f"with tolerance of {TOL}")
print('and achieve ensemble pearsonr = %.5f' % old)

In [None]:
# Pair every selected model index with the weight it was blended in with.
# The first (seed) model has no blend weight, so it is stored with None.
m_weights = dict(zip(m, [None] + w))

print("dictionary storing the index and weight of each oof/experiment")
m_weights # has all model's indices along with their weights

In [None]:
# Rebuild the ensemble's OOF predictions from `m_weights`, starting from the
# seed model (highest single-model CV) and replaying every blend in order.
md = x[:, m[0]]
for mod, weight in m_weights.items():
    if weight is None:
        # the seed model is already the starting point
        continue
    md = weight * x[:, mod] + (1-weight) * md

cv_score = np.round(pearsonr(md, TRUE), 5)
print(f"Ensemble CV Score: {cv_score}\n\n")
plt.hist(md, bins=100)
plt.title('Ensemble OOF predictions')
plt.show()

In [None]:
# Experiment name -> column index, restricted to the models the hill climb selected.
INFER_EXPS = {name: idx for idx, name in enumerate(OOF) if idx in m}

INFER_EXPS_NAMES = list(INFER_EXPS)

print(f"Inferencing selected experiments to generate submission file for ensemble (total {len(INFER_EXPS_NAMES)}):\n")
pprint(INFER_EXPS)

# Read data

In [None]:
def _read_competition_csv(filename):
    """Load one csv file from the competition data directory."""
    return pd.read_csv(os.path.join(DATA_DIR, filename))


# read data
submission = _read_competition_csv("sample_submission.csv")
test_df = _read_competition_csv("test.csv")

if DEBUG:
    # In debug mode the first 10000 training rows stand in for the test set.
    print("Running in DEBUG mode")
    test_df = _read_competition_csv("train.csv")[:10000]

display(test_df.head())

# Helper functions

In [None]:
def get_key(dicti, val):
    """Return the first key of `dicti` whose value equals `val`.

    Keys are checked in the dictionary's iteration order. When no value
    matches, the string "key doesn't exist" is returned instead of raising.
    """
    matching_keys = (key for key, value in dicti.items() if val == value)
    return next(matching_keys, "key doesn't exist")

def infer_one_experiment(
    checkpoint_dir: str,
    test_df: pd.DataFrame,
    batch_size: int = 64,
    n_folds: int = 5,
):
    """Run every fold checkpoint of one experiment on `test_df` and save the averaged scores.

    For each ``fold_<i>`` sub-directory of `checkpoint_dir` the fold's
    ``experiment_config.yaml`` is loaded, the matching tokenizer and model class
    are built from it, `test_df` is prepared and tokenized, and predictions are
    produced with a Hugging Face ``Trainer``. Fold predictions are post-processed,
    scaled and finally averaged.

    Args:
        checkpoint_dir: Experiment directory containing ``fold_0`` ... ``fold_{n_folds-1}``.
        test_df: Rows to score. Must carry an ``id`` column, which is copied
            into the output file.
        batch_size: Per-device evaluation batch size.
        n_folds: Number of fold checkpoints to run.

    Returns:
        None. Writes ``<experiment name>_sub.csv`` (columns ``id``, ``score``)
        into the current working directory, where the experiment name is the
        last path component of `checkpoint_dir`.
    """
    predictions = []
    for fold in range(n_folds):
        ckpt_dir = os.path.join(checkpoint_dir, f"fold_{fold}")
        cfg = OmegaConf.load(os.path.join(ckpt_dir, "experiment_config.yaml"))

        # tokenizer
        if "cocolm" in cfg.model.model_name:
            tokenizer = COCOLMTokenizer.from_pretrained(ckpt_dir)
        else:
            tokenizer = AutoTokenizer.from_pretrained(ckpt_dir)

        # Transformer-head models are tokenized with fixed max_length padding,
        # so dynamic (per-batch) padding is disabled for them further below.
        transformer_head_trainer = "TransformerHead" in cfg.model.class_name

        test_df_prep = prepare_data(df=test_df,
                              tokenizer=tokenizer,
                              cpc_scheme_xml_dir=os.path.join(INPUT_DIR, "CPCSchemeXML202105"),
                              cpc_title_list_dir=os.path.join(INPUT_DIR, "CPCTitleList202202"),
                              use_custom_seperator=cfg.data.use_custom_seperator)
        test_ds = Dataset.from_pandas(test_df_prep)

        if transformer_head_trainer:
            print(">>> Transformer head trainer")
            # max_length padding
            tokenized_ds = test_ds.map(
                lambda x: tokenize_func_transformer_head(
                    x, tokenizer=tokenizer, max_length=133
                ),
                batched=True,
            )
        else:
            tokenized_ds = test_ds.map(
                lambda x: tokenize_func(
                    x,
                    tokenizer=tokenizer,
                ),
                batched=True,
            )

        if fold == 0:
            print("\nSample tokenized text:")
            print(tokenizer.decode(tokenized_ds[0]["input_ids"]) + "\n")

        print(f"-> Loading checkpoint from {ckpt_dir}")

        # init model
        if "cocolm" in cfg.model.model_name:
            config = COCOLMConfig.from_pretrained(ckpt_dir)
        else:
            config = AutoConfig.from_pretrained(ckpt_dir, num_labels=1)
        # Pick the model class named by `cfg.model.class_name` in the hydra
        # configuration; custom classes are looked up in the `modeling` module.
        if "AutoModel" in cfg.model.class_name:
            print(f"-> Loading AutoModelForSequenceClassification class")
            model = AutoModelForSequenceClassification.from_pretrained(
                ckpt_dir, config=config
            )

        # Model class names containing `General` take extra options:
        # - different losses (mse, bce, pearson) with appropriate post-processing
        # - multi_sample dropout
        # - attention_pool
        elif "General" in cfg.model.class_name:
            # For cocolm checkpoints the weights file itself is passed instead of the directory.
            if "cocolm" in ckpt_dir:
                ckpt_dir = f"{ckpt_dir}/pytorch_model.bin"
            print("-> Loading general class")
            MODEL_CLASS = getattr(importlib.import_module("modeling"), cfg.model.class_name)
            model = MODEL_CLASS.from_pretrained(
                ckpt_dir,
                config=config,
                loss_type=cfg.model.loss_type,
                multi_sample_dropout=cfg.model.multi_sample_dropout,
                attention_pool=cfg.model.attention_pool,
            )
        else:
            # For cocolm checkpoints the weights file itself is passed instead of the directory.
            if "cocolm" in ckpt_dir:
                ckpt_dir = f"{ckpt_dir}/pytorch_model.bin"
            print("-> Loading class with loss_type key")
            MODEL_CLASS = getattr(importlib.import_module("modeling"), cfg.model.class_name)
            model = MODEL_CLASS.from_pretrained(
                ckpt_dir,
                config=config,
                loss_type=cfg.model.loss_type
            )

        trainer_args = TrainingArguments(
            output_dir="/kaggle/working/trainer_out",
            per_device_eval_batch_size=batch_size,
        )
        # init Trainer
        trainer = Trainer(
            model=model,
            args=trainer_args,
            # dynamic padding within batch
            data_collator=None if transformer_head_trainer else DataCollatorWithPadding(tokenizer),
            tokenizer=tokenizer,
        )

        # infer on test dataset
        logits, _, _ = trainer.predict(tokenized_ds)
        predictions.append(scale_predictions(post_process_predictions(logits.reshape(-1), cfg.model.loss_type)))

        # clean-up: release the model before the next fold is loaded
        del trainer, model, trainer_args, config, logits, tokenized_ds, cfg, tokenizer, test_ds
        gc.collect()
        torch.cuda.empty_cache()

    exp_sub_df = pd.DataFrame()
    # Take the ids from the frame that was actually scored rather than from the
    # global `submission`: in DEBUG mode `test_df` is a slice of train.csv, so the
    # sample submission has a different number of rows and the assignment of
    # `score` below would fail with a length mismatch.
    # NOTE(review): relies on prepare_data keeping the row order of `test_df` — confirm.
    exp_sub_df['id'] = test_df['id'].to_numpy()
    exp_sub_df["score"] = np.mean(predictions, axis=0)  # average over folds
    save_name = checkpoint_dir.split("/")[-1]
    exp_sub_df.to_csv(f"{save_name}_sub.csv", index=False)

    del exp_sub_df, save_name
    gc.collect()

# Inference

In [None]:
# Silence warnings, sub-ERROR log records and the datasets progress bars.
warnings.simplefilter('ignore')
logging.disable(logging.WARNING)
datasets.disable_progress_bar()

print("Inferencing following experiments:\n")
pprint(INFER_EXPS_NAMES)
print("\n\n")

# Run inference for every experiment picked by the hill climb, in `OOF` order.
for exp, oof_path in OOF.items():
    if exp not in INFER_EXPS_NAMES:
        continue

    # OOF paths look like ../input/<root>/<exp>/<exp>_oof.csv, so the third
    # path component is the checkpoint root the experiment lives in.
    root_dir = oof_path.split('/')[2]
    experiment_dir = os.path.join("../input", root_dir, exp)

    print("*" * 10 + f" Experiment {exp} " + "*" * 10)
    infer_one_experiment(
        checkpoint_dir=experiment_dir,
        test_df=test_df,
        batch_size=BATCH_SIZE,
        n_folds=N_FOLDS,
    )

    print("=" * 100)
    print()
    gc.collect()

# Submission

In [None]:
# Reverse lookup: model column index -> experiment name.
INFER_EXPS_REV = dict(zip(INFER_EXPS.values(), INFER_EXPS.keys()))
INFER_EXPS_REV

In [None]:
def _scaled_column(csv_path, column):
    """Read `column` from the csv at `csv_path` and pass it through scale_predictions."""
    return scale_predictions(pd.read_csv(csv_path)[column].to_numpy())


# The seed model is the one stored with weight None in `m_weights`.
high_cv_exp_name = INFER_EXPS_REV[get_key(m_weights, None)]
high_cv_preds = _scaled_column(OOF[high_cv_exp_name], 'preds')
high_inf_preds = _scaled_column(f"/kaggle/working/{high_cv_exp_name}_sub.csv", 'score')

# Replay the blends in selection order, applying identical weights to the OOF
# predictions and to the test-time predictions.
for mod_idx, weight in m_weights.items():
    if weight is None:
        continue
    exp_name = INFER_EXPS_REV[mod_idx]
    oof_preds = _scaled_column(OOF[exp_name], 'preds')
    inf_preds = _scaled_column(f"/kaggle/working/{exp_name}_sub.csv", 'score')
    high_cv_preds = weight * oof_preds + (1-weight) * high_cv_preds
    high_inf_preds = weight * inf_preds + (1-weight) * high_inf_preds

In [None]:
# Shapes of the blended OOF predictions and the blended test predictions.
tuple(preds.shape for preds in (high_cv_preds, high_inf_preds))

In [None]:
# SANITY CHECK CV: the blend rebuilt from the csv files should reproduce the
# ensemble score from the hill climb if all weights were assigned properly.
rebuilt_cv = pearsonr(high_cv_preds, TRUE)
np.round(rebuilt_cv, 5)

In [None]:
# Distribution of the blended OOF predictions.
plt.hist(x=high_cv_preds, bins=100)
plt.title(label='Ensemble OOF predictions')
plt.show()

In [None]:
# Distribution of the blended test predictions (default binning).
plt.hist(x=high_inf_preds)
plt.title(label='Ensemble predictions')
plt.show()

In [None]:
# The submission file is only written for real runs; in DEBUG mode the
# predictions were made on training rows.
if not DEBUG:
    # create submission
    submission["score"] = high_inf_preds
    submission.to_csv(path_or_buf="submission.csv", index=False)
    print(submission.dtypes)
    display(submission.head())

In [None]:
import shutil
# Remove the Trainer's output directory; a missing directory is not an error.
trainer_out_dir = "/kaggle/working/trainer_out"
shutil.rmtree(trainer_out_dir, ignore_errors=True)