In [None]:
# Offline install of the extra dependencies: --no-index disables the package index and
# --find-links makes pip resolve everything from the ../input/download-packages directory.
!pip install -U -q --no-index --find-links=../input/download-packages \
python-box \
pytorch-lightning \
datasets \
torchmetrics \
pytorch-pfn-extras \
kaleido \
mlflow \
iterative-stratification \
adabelief-pytorch \
omegaconf \
hydra-core

In [None]:
# Show what is available under ../input/uspppm-public (long listing, human-readable sizes).
!ls ../input/uspppm-public -lh

In [None]:
# Show what is available under ../input/uspppm-results (long listing, human-readable sizes).
!ls ../input/uspppm-results -lh

# inference of each run

In [None]:
%%writefile process.py

import argparse

import os
import sys
import gc
import glob
import time
import warnings
import joblib
import shutil
import psutil
import importlib
import yaml
from pathlib import Path

from tqdm.auto import tqdm

warnings.simplefilter("ignore")

from scipy.special import expit as sigmoid
from sklearn.decomposition import PCA
import numpy as np
import pandas as pd

import torch

import pytorch_pfn_extras as ppe
from pytorch_pfn_extras.config import Config

def parse_arguments():
    """Parse the command-line options of the inference script.

    Options:
        --dataset-name: str, defaults to "uspppm-results".
        --idx: int, defaults to 0.
        --run-id: kept as the raw command-line string, defaults to ``None``.

    Returns:
        argparse.Namespace with the attributes ``dataset_name``, ``idx`` and
        ``run_id``.
    """
    cli = argparse.ArgumentParser()
    # (flag, add_argument keyword arguments), registered in declaration order.
    option_specs = (
        ("--dataset-name", {"default": "uspppm-results", "type": str}),
        ('--idx', {"default": 0, "type": int}),
        ('--run-id', {"default": None}),
    )
    for flag, options in option_specs:
        cli.add_argument(flag, **options)

    return cli.parse_args()

def memory_used_to_str():
    """Describe the memory usage of the current process as a short string.

    Reads the first field of ``psutil.Process(<own pid>).memory_info()``,
    divides it by 2**30 and rounds the result to two decimals.

    Returns:
        str: "ram memory gb :" followed by the rounded value.
    """
    current_process = psutil.Process(os.getpid())
    used_gb = current_process.memory_info()[0] / 2.0**30
    rounded = np.round(used_gb, 2)
    return "ram memory gb :" + str(rounded)


def get_feature(CFG, model_path, df):
    """Extract the encoder's first-position hidden state for every row of ``df``.

    Args:
        CFG: Config-like mapping. The entries "/dataset/test",
            "/dataloader/test", "/model" and "/training/device" are read.
        model_path: Checkpoint readable by ``torch.load`` whose content is
            accepted by ``model.load_state_dict``.
        df: Data handed to the test dataset through ``lazy_init(df=df)``.

    Returns:
        numpy.ndarray: ``last_hidden_state[:, 0, :]`` of every batch,
        concatenated along the first axis.

    Raises:
        ValueError: From ``np.concatenate`` when the dataloader yields no batch.
    """
    CFG["/dataset/test"].lazy_init(df=df)
    dataloader = CFG["/dataloader/test"]

    model = CFG["/model"]
    device = CFG["/training/device"]
    # Deserialize onto the CPU first: without map_location torch.load restores the
    # tensors on the device they were saved from, which fails on hosts lacking that
    # device and keeps a second copy of the weights on the GPU while loading.
    state_dict = torch.load(model_path, map_location="cpu")
    model.load_state_dict(state_dict)
    del state_dict
    model.to(device)
    model.eval()
    pred_list = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="extracting..."):
            for k in batch.keys():
                batch[k] = batch[k].to(device)

            last_hidden_state = model.encoder(
                input_ids=batch["input_ids"],
                attention_mask=batch["attention_mask"],
            )["last_hidden_state"]
            # Keep only the first sequence position of each sample.
            pred_list.append(last_hidden_state[:, 0, :].detach().cpu().numpy())

    # NOTE(review): this only drops the local references; whether the model is
    # really released depends on CFG not holding on to it — confirm.
    del model, device
    torch.cuda.empty_cache()
    gc.collect()

    output = np.concatenate(pred_list)

    return output


def get_prediction(CFG, model_path, df, use_sigmoid=False):
    """Run the model over ``df`` and return the stacked predictions.

    Args:
        CFG: Config-like mapping. The entries "/dataset/test",
            "/dataloader/test", "/model" and "/training/device" are read.
        model_path: Checkpoint readable by ``torch.load`` whose content is
            accepted by ``model.load_state_dict``.
        df: Data handed to the test dataset through ``lazy_init(df=df)``.
        use_sigmoid: When true, the concatenated outputs are passed through
            the logistic sigmoid before being returned.

    Returns:
        numpy.ndarray: ``model(batch)`` of every batch, concatenated along the
        first axis (sigmoid-transformed when ``use_sigmoid`` is set).

    Raises:
        ValueError: From ``np.concatenate`` when the dataloader yields no batch.
    """
    CFG["/dataset/test"].lazy_init(df=df)
    dataloader = CFG["/dataloader/test"]

    model = CFG["/model"]
    device = CFG["/training/device"]
    # Deserialize onto the CPU first: without map_location torch.load restores the
    # tensors on the device they were saved from, which fails on hosts lacking that
    # device and keeps a second copy of the weights on the GPU while loading.
    state_dict = torch.load(model_path, map_location="cpu")
    model.load_state_dict(state_dict)
    del state_dict
    model.to(device)
    model.eval()
    pred_list = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="predicting..."):
            # Move the tensors in place so the model receives the original batch object.
            for k in batch.keys():
                batch[k] = batch[k].to(device)

            yhat = model(batch)
            pred_list.append(yhat.detach().cpu().numpy())

    # NOTE(review): this only drops the local references; whether the model is
    # really released depends on CFG not holding on to it — confirm.
    del model, device
    torch.cuda.empty_cache()
    gc.collect()

    output = np.concatenate(pred_list)

    if use_sigmoid:
        return sigmoid(output)

    return output


def run_inference(dataset_name="uspppm-results", idx=0, RUN_ID=None):
    """Run test-set inference for one training run and write its outputs.

    Reads ``results.pkl``, ``work.py``, ``oof.csv`` and optionally
    ``config.yaml`` from ``../input/<dataset_name>[/<RUN_ID>]``, predicts the
    test set with every checkpoint listed in ``result.model_paths`` and
    averages the predictions.

    Files written to the current working directory:
        * ``RUN<idx>/work.py``: copy of the run's training module, imported
          from there.
        * ``test_prediction_<idx>_fold<fold>.npy``: per-checkpoint predictions.
        * ``oof_<idx>.csv``: copy of the run's ``oof.csv``.
        * ``submission_<idx>.csv``: ``id`` and averaged ``score`` columns.

    Args:
        dataset_name: Directory name below ``../input``.
        idx: Integer tag used in the ``RUN<idx>`` package and all output names.
        RUN_ID: Optional sub-directory of the dataset holding the run's files.
    """
    print()
    print("#" * 30, f"idx={idx}", f"RUN_ID={RUN_ID}", "#" * 30)

    if RUN_ID is None:
        input_dir = f"../input/{dataset_name}"
    else:
        input_dir = f"../input/{dataset_name}/{RUN_ID}"

    # NOTE(review): joblib.load unpickles the file, which can execute arbitrary
    # code — only point this at datasets you trust.
    result = joblib.load(os.path.join(input_dir, "results.pkl"))
    result.model_paths = [os.path.join(input_dir, path) for path in result.model_paths]
    os.makedirs(f"RUN{idx}", exist_ok=True)
    shutil.copy(os.path.join(input_dir, "work.py"), f"RUN{idx}")

    # Import the copied module explicitly. The previous exec("from ... import ...")
    # followed by eval() relied on exec being able to add a local name inside a
    # function, which is not guaranteed (see the exec()/locals() notes for
    # Python 3.13). The directory was just created, so refresh the finder caches.
    importlib.invalidate_caches()
    work = importlib.import_module(f"RUN{idx}.work")

    # run inference code
    config_path = Path(input_dir, "config.yaml")
    if os.path.exists(config_path):
        with open(config_path) as f:
            PRE_EVAL_CFG = yaml.safe_load(f.read())
    else:
        # No config file shipped with the run: use the string embedded in work.py.
        PRE_EVAL_CFG = yaml.safe_load(work.CONFIG_STRING)

    data_dir = Path(PRE_EVAL_CFG["globals"]["input_dir"])
    # train_df is only consumed by the commented-out train steps below; outside a
    # competition rerun just the first 500 rows are read.
    if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
        train_df = pd.read_csv(data_dir / "train.csv")
    else:
        train_df = pd.read_csv(data_dir / "train.csv", nrows=500)
    test_df = pd.read_csv(data_dir / "test.csv")
    test_df["score"] = 0.0

    CFG = Config(PRE_EVAL_CFG, types=work.CONFIG_TYPES)

    # Apply a sigmoid to the model output when the run's metric is the
    # "with logits" Pearson metric.
    pred_sigmoid = isinstance(CFG["/metric/metric"], work.PearsonCorrCoefWithLogitsMetric)

    n_models = len(result.model_paths)
    for fold, model_path in enumerate(result.model_paths):
        print("#" * 30, f"fold: {fold}", "#" * 30)
        print("#" * 30, f"model path: {model_path}", "#" * 30)

        # train_predictions
        # train_prediction = get_prediction(CFG, model_path, train_df, use_sigmoid=pred_sigmoid)
        # np.save(f"train_prediction_{idx}_fold{fold}", train_prediction)

        # test_predictions
        test_prediction = get_prediction(CFG, model_path, test_df, use_sigmoid=pred_sigmoid)
        np.save(f"test_prediction_{idx}_fold{fold}", test_prediction)
        # Accumulate the equally weighted mean over all checkpoints.
        test_df["score"] += test_prediction / n_models

        # train_features
        # train_feature = get_feature(CFG, model_path, train_df)
        # np.save(f"train_feature_{idx}_fold{fold}", train_feature)

        # test_features
        # test_feature = get_feature(CFG, model_path, test_df)
        # np.save(f"test_feature_{idx}_fold{fold}", test_feature)

        gc.collect()
        print(memory_used_to_str())

    del PRE_EVAL_CFG, CFG
    gc.collect()

    shutil.copy(os.path.join(input_dir, "oof.csv"), f"oof_{idx}.csv")
    test_df[["id", "score"]].to_csv(f"submission_{idx}.csv", index=False)

# Entry point: only parse the command line and start inference when the file is
# executed as a script, so importing process.py has no side effects.
if __name__ == "__main__":
    args = parse_arguments()
    run_inference(dataset_name=args.dataset_name, idx=args.idx, RUN_ID=args.run_id)

In [None]:
# Each invocation handles one run: --dataset-name/--run-id select the input directory
# and --idx tags the files that process.py writes (oof_<idx>.csv, submission_<idx>.csv).
# !python -u process.py

!python -u process.py --dataset-name="uspppm-public" --run-id=018420c68d63460f9ad625d51bb3903f --idx=0
# !python -u process.py --dataset-name="uspppm-public" --run-id=187e25eeeed943f08f6b450e47455542 --idx=1
# !python -u process.py --dataset-name="uspppm-public" --run-id=56a4472423e54bd39ff7b02c0daa08ac --idx=2
# !python -u process.py --dataset-name="uspppm-public" --run-id=7923146adde64b6a9643eb163d8d223f --idx=3
# !python -u process.py --dataset-name="uspppm-public" --run-id=b02bfabced5345b689e6ac36e25a478c --idx=4
# !python -u process.py --dataset-name="uspppm-public" --run-id=b977b33e7f8547bcb1e62e42e021e627 --idx=5
# !python -u process.py --dataset-name="uspppm-public" --run-id=bc8145deac9d4376b708e8b1499da970 --idx=6
# !python -u process.py --dataset-name="uspppm-public" --run-id=e38625b57d3e4f00992c4f191a786c5d --idx=7

# !python -u process.py --dataset-name="uspppm-results" --run-id=9aa398aea22c4048b4e904c36bb3605b --idx=8
# !python -u process.py --dataset-name="uspppm-results" --run-id=c05c9ec899474ff58fb3552fe4a084ed --idx=9
# !python -u process.py --dataset-name="uspppm-results" --run-id=c54349cdef094923b1003ba22b394ce9 --idx=10
# !python -u process.py --dataset-name="uspppm-results" --run-id=dc95c61f15bf4ac0b9e1de3ac1299f45 --idx=11
# !python -u process.py --dataset-name="uspppm-results" --run-id=f87754b75cb54060b19527b551b3e6fe --idx=12

# make submission

## mean blending

In [None]:
import glob

from sklearn.preprocessing import MinMaxScaler
from scipy.stats import pearsonr

import numpy as np
import pandas as pd


def _scaled_scores(frame, preds, scaler):
    """Return preds["score"] joined to ``frame`` on "id", transformed by ``scaler``."""
    joined = frame[["id"]].merge(preds, on="id")["score"].to_numpy()
    return scaler.transform(joined.reshape(-1, 1)).reshape(-1)


train = pd.read_csv("../input/us-patent-phrase-to-phrase-matching/train.csv")
test = pd.read_csv("../input/us-patent-phrase-to-phrase-matching/test.csv")

oof_paths = sorted(glob.glob("oof_*.csv"))
sub_paths = sorted(glob.glob("submission_*.csv"))

print(list(zip(oof_paths, sub_paths)))

N = len(sub_paths)

# zip() silently drops unmatched files and N is taken from sub_paths only, so a
# missing or mismatched file used to surface later as an unrelated KeyError (or
# pair an OOF file with another run's submission). Fail early instead.
if N == 0:
    raise FileNotFoundError("no submission_*.csv found; run process.py first")
if len(oof_paths) != N:
    raise ValueError(f"found {len(oof_paths)} oof files but {N} submission files")
for oof_path, sub_path in zip(oof_paths, sub_paths):
    if oof_path[len("oof_"):] != sub_path[len("submission_"):]:
        raise ValueError(f"mismatched pair: {oof_path} / {sub_path}")

for idx, (oof_path, sub_path) in enumerate(zip(oof_paths, sub_paths)):
    print(f"oof_path: {oof_path}, sub_path: {sub_path}")
    oof = pd.read_csv(oof_path)
    sub = pd.read_csv(sub_path)

    # One scaler per run, fitted on its OOF and test scores together so both are
    # mapped to [0, 1] with the same transform.
    scaler = MinMaxScaler((0, 1))
    scaler.fit(np.concatenate([oof[["score"]].to_numpy(), sub[["score"]].to_numpy()]))

    train[f"score_{idx}"] = _scaled_scores(train, oof, scaler)
    value = pearsonr(train['score'], train[f'score_{idx}'])[0]
    print(f"pearsonr: {value}")

    test[f"score_{idx}"] = _scaled_scores(test, sub, scaler)

print("#" * 100)
cols = [f"score_{idx}" for idx in range(N)]
# Equal-weight mean over the runs, for both the OOF check and the submission.
train["oof_score"] = train[cols].mean(axis=1)
test["score"] = test[cols].mean(axis=1)
value = pearsonr(train['score'], train["oof_score"])[0]
print(f"pearsonr: {value}")
print("#" * 100)

test[["id", "score"]].to_csv("submission.csv", index=False)
pd.read_csv("submission.csv", nrows=30)

## Nelder-Mead

In [None]:
# import glob
# from sklearn.preprocessing import MinMaxScaler
# from scipy.optimize import minimize
# from scipy.stats import pearsonr

# import numpy as np
# import pandas as pd


# train = pd.read_csv("../input/us-patent-phrase-to-phrase-matching/train.csv")
# test = pd.read_csv("../input/us-patent-phrase-to-phrase-matching/test.csv")

# oof_paths = sorted(glob.glob("oof_*.csv"))
# sub_paths = sorted(glob.glob("submission_*.csv"))

# N = len(sub_paths)
# cols = [f"score_{idx}" for idx in range(N)]

# for idx, (oof_path, sub_path) in enumerate(zip(oof_paths, sub_paths)):
#     print(f"oof_path: {oof_path}, sub_path: {sub_path}")
#     oof = pd.read_csv(oof_path)
#     sub = pd.read_csv(sub_path)

    
#     scaler = MinMaxScaler((0, 1))
#     scaler.fit(np.concatenate([oof[["score"]].to_numpy(), sub[["score"]].to_numpy()]))

#     train[f"score_{idx}"] = scaler.transform(train[["id"]].merge(oof, on="id")["score"].to_numpy().reshape(-1, 1)).reshape(-1)
#     value = pearsonr(train['score'], train[f'score_{idx}'])[0]
#     print(f"pearsonr: {value}")
    
#     test[f"score_{idx}"] = scaler.transform(test[["id"]].merge(sub, on="id")["score"].to_numpy().reshape(-1, 1)).reshape(-1)



# def func(x):
#     pred = np.average(train[cols], weights=x, axis=1)
#     return -pearsonr(train["score"], pred)[0]

# x0 = [1] * N
# result = minimize(func, x0, method='Nelder-Mead')

# result_weights = result.x
# result_value = result.fun

# print("#" * 100)
# print(f"result_weights: {result_weights}")
# print(f"result_value: {-result_value}")
# print("#" * 100)


# test["score"] = np.average(test[cols], weights=result_weights, axis=1)

# test[["id", "score"]].to_csv("submission.csv", index=False)
# pd.read_csv("submission.csv", nrows=30)

## Nelder-Mead(gmean)

In [None]:
# import glob
# from sklearn.preprocessing import MinMaxScaler
# from scipy.optimize import minimize
# from scipy.stats import pearsonr, gmean

# import numpy as np
# import pandas as pd


# train = pd.read_csv("../input/us-patent-phrase-to-phrase-matching/train.csv")
# test = pd.read_csv("../input/us-patent-phrase-to-phrase-matching/test.csv")

# oof_paths = sorted(glob.glob("oof_*.csv"))
# sub_paths = sorted(glob.glob("submission_*.csv"))

# N = len(sub_paths)
# cols = [f"score_{idx}" for idx in range(N)]

# for idx, (oof_path, sub_path) in enumerate(zip(oof_paths, sub_paths)):
#     print(f"oof_path: {oof_path}, sub_path: {sub_path}")
#     oof = pd.read_csv(oof_path)
#     sub = pd.read_csv(sub_path)

    
#     scaler = MinMaxScaler((0.0001, 1))
#     scaler.fit(np.concatenate([oof[["score"]].to_numpy(), sub[["score"]].to_numpy()]))

#     train[f"score_{idx}"] = scaler.transform(train[["id"]].merge(oof, on="id")["score"].to_numpy().reshape(-1, 1)).reshape(-1)
#     value = pearsonr(train['score'], train[f'score_{idx}'])[0]
#     print(f"pearsonr: {value}")
    
#     test[f"score_{idx}"] = scaler.transform(test[["id"]].merge(sub, on="id")["score"].to_numpy().reshape(-1, 1)).reshape(-1)



# def func(x):
#     pred = gmean(train[cols], weights=x, axis=1)
#     return -pearsonr(train["score"], pred)[0]

# x0 = [1] * N
# result = minimize(func, x0, method='Nelder-Mead')

# result_weights = result.x
# result_value = result.fun

# print("#" * 100)
# print(f"result_weights: {result_weights}")
# print(f"result_value: {-result_value}")
# print("#" * 100)

# predictions = gmean(test[cols], weights=result_weights, axis=1)

# test["score"] = predictions

# test[["id", "score"]].to_csv("submission.csv", index=False)
# pd.read_csv("submission.csv", nrows=30)

## Stacking (sklearn)

In [None]:
# import glob
# import copy
# from sklearn.preprocessing import MinMaxScaler
# from scipy.optimize import minimize
# from scipy.stats import pearsonr

# import numpy as np
# import pandas as pd

# from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

# from sklearn.linear_model import LinearRegression, BayesianRidge, Lasso
# from sklearn.svm import SVR
# from sklearn.gaussian_process import GaussianProcessRegressor
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.decomposition import PCA

# def prepare_fold(df: pd.DataFrame, n_fold: int, seed: int):
#     dfx = (
#         pd.get_dummies(df, columns=["score"]).groupby(["anchor"], as_index=False).sum()
#     )
#     cols = [c for c in dfx.columns if c.startswith("score_") or c == "anchor"]
#     dfx = dfx[cols]

#     mskf = MultilabelStratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)
#     labels = [c for c in dfx.columns if c != "anchor"]
#     dfx_labels = dfx[labels]
#     dfx["fold"] = -1

#     for fold, (trn_, val_) in enumerate(mskf.split(dfx, dfx_labels)):
#         dfx.loc[val_, "fold"] = fold

#     fold_array = df.merge(dfx[["anchor", "fold"]], on="anchor", how="left")[
#         "fold"
#     ].to_numpy()
#     df["fold"] = fold_array

#     print("#" * 30, "folds", "#" * 30)
#     print(df["fold"].value_counts())


# train = pd.read_csv("../input/us-patent-phrase-to-phrase-matching/train.csv")
# test = pd.read_csv("../input/us-patent-phrase-to-phrase-matching/test.csv")
# test["score"] = 0.0

# oof_paths = sorted(glob.glob("oof_*.csv"))
# sub_paths = sorted(glob.glob("submission_*.csv"))

# N = len(sub_paths)
# cols = [f"score_{idx}" for idx in range(N)]

# for idx, (oof_path, sub_path) in enumerate(zip(oof_paths, sub_paths)):
#     print(f"oof_path: {oof_path}, sub_path: {sub_path}")
#     oof = pd.read_csv(oof_path)
#     sub = pd.read_csv(sub_path)

    
#     scaler = MinMaxScaler((0, 1))
#     scaler.fit(np.concatenate([oof[["score"]].to_numpy(), sub[["score"]].to_numpy()]))

#     train[f"score_{idx}"] = scaler.transform(train[["id"]].merge(oof, on="id")["score"].to_numpy().reshape(-1, 1)).reshape(-1)
#     value = pearsonr(train['score'], train[f'score_{idx}'])[0]
#     print(f"pearsonr: {value}")
    
#     test[f"score_{idx}"] = scaler.transform(test[["id"]].merge(sub, on="id")["score"].to_numpy().reshape(-1, 1)).reshape(-1)

    
# #     train[f"score_{idx}"] = train[["id"]].merge(oof, on="id")["score"].to_numpy()
# #     value = pearsonr(train['score'], train[f'score_{idx}'])[0]
# #     print(f"pearsonr: {value}")
    
# #     test[f"score_{idx}"] = test[["id"]].merge(sub, on="id")["score"].to_numpy()


# def stacking(model, df, n_fold, seed):
#     np.random.seed(seed)
#     prepare_fold(df, n_fold=n_fold, seed=seed)
#     oof = np.zeros(len(train))
#     models = []
#     for fold in range(n_fold):
#         tr, val = df.loc[df["fold"] != fold], df.loc[df["fold"] == fold]
#         model.fit(tr[cols], tr["score"])
        
#         oof[df["fold"] == fold] = model.predict(val[cols])
#         models.append(copy.deepcopy(model))

#     score = pearsonr(train["score"], oof)[0]
#     print("#" * 30)
#     print("OOF:", score)
#     print("#" * 30)
    
#     return models, oof

# oof = np.zeros(len(train))

# seeds = [42, 1221, 128, 256, 512, 1024]
# models = []

# for seed in seeds:
#     print(f"seed: {seed}")
#     model = BayesianRidge()
#     _models, _oof = stacking(model, train.copy(), n_fold=10, seed=seed)
#     oof += _oof
#     models.extend(_models)


# score = pearsonr(train["score"], oof)[0]
# print(f"ALL OOF: {score}")

# for idx, model in enumerate(models):
#     print(f"idx: {idx}, model: {model}")
#     predictions = model.predict(test[cols])
    
#     test["score"] += predictions

# test[["id", "score"]].to_csv("submission.csv", index=False)
# pd.read_csv("submission.csv", nrows=30)