In [None]:
from pathlib import Path
import numpy as np
import pandas as pd
import xgboost as xgb
from tqdm.notebook import tqdm

In [None]:
# List the entries of the input directory. Uses pathlib instead of the `ll`
# shell alias, which only exists under IPython automagic on POSIX systems.
sorted(entry.name for entry in Path("../input").iterdir())

In [None]:
# When True, the model-averaging loop stops after the first matching model
# file for each target, so each prediction comes from a single model.
# NOTE(review): left enabled here — confirm whether the final submission is
# meant to average all model files (DEBUG = False) before publishing.
DEBUG = True

In [None]:
# Everything is resolved relative to the parent of the current working directory.
ROOT = Path.cwd().parent
INPUT_ROOT = ROOT.joinpath("input")

# Input locations: kf prediction files, onodera prediction files, and the
# directory holding the XGBoost model files plus the training-column header.
KF_ROOT = INPUT_ROOT.joinpath("2nd-place-solution-inference-kf-new-sequences", "output")
ONODERA_ROOT = INPUT_ROOT.joinpath("covid-233-onodera-outputs-v2")
XGB_ROOT = INPUT_ROOT.joinpath("stanford-covid-vaccine-onodera-models", "xgb/xgb")

# Destination of the gzip-compressed submission CSV.
SUBMISSION_FILE = ROOT.joinpath("working", "2nd-place-233-seq.csv.gz")

# Target columns read from every prediction file and written to the submission.
COLS_TARGET = [
    "reactivity",
    "deg_Mg_pH10",
    "deg_pH10",
    "deg_Mg_50C",
    "deg_50C",
]


In [None]:
# ==============================================================================
# load kf
# ==============================================================================


# Read every "*.csv.gz" file under KF_ROOT into a dict keyed by the file
# name with the ".csv.gz" suffix removed.
file_kf = {}
files = sorted(KF_ROOT.glob('*.csv.gz'))
for file in files:
    # `file.name` is the final path component on every platform; splitting
    # str(file) on "/" only works where "/" is the path separator.
    file_kf[file.name[:-len(".csv.gz")]] = pd.read_csv(file)
    print(f"Loaded: {file}")

print('kf', len(file_kf))

In [None]:
# ==============================================================================
# load onodera
# ==============================================================================

# Read every "*.csv" file under ONODERA_ROOT into a dict keyed by the file
# name without its ".csv" extension.
file_onodera = {}
files = sorted(ONODERA_ROOT.glob('*.csv'))
for file in files:
    # `file.stem` drops the directory part and the final ".csv" suffix on
    # every platform, unlike splitting str(file) on "/".
    file_onodera[file.stem] = pd.read_csv(file)
    print(f"Loaded: {file}")

print('onodera', len(file_onodera))


In [None]:
# ==============================================================================
# xgb
# ==============================================================================

# Combine both prediction dictionaries. On a duplicate key the onodera entry
# replaces the kf one (same semantics as two successive dict.update calls).
files_all = {**file_kf, **file_onodera}

cols = ["id_seqpos"] + COLS_TARGET

# One column block per prediction file: index each frame by id_seqpos and
# prefix its target columns with the file key, then join the blocks side by side.
X_test = pd.concat(
    [
        frame[cols].set_index("id_seqpos").add_prefix(key + "_")
        for key, frame in files_all.items()
    ],
    axis=1,
)

# Keep the id_seqpos index for the submission; the feature matrix itself
# gets a plain positional index.
test_id = X_test.index
X_test = X_test.reset_index(drop=True)

X_train = pd.read_csv(XGB_ROOT / "X_train_head.csv")

# Restrict both frames to the columns they share. If X_train changes shape,
# some of its columns are absent from X_test, so fail with the missing names.
a, b = X_train.align(X_test, join='inner', axis=1)
if a.shape != X_train.shape:
    missing = X_train.columns.difference(X_test.columns).tolist()
    raise ValueError(
        f"X_test does not cover the X_train columns: aligned shape {a.shape} "
        f"!= X_train shape {X_train.shape}; missing columns: {missing}"
    )
X_train = a
X_test = b

dtest = xgb.DMatrix(X_test)
sub = pd.DataFrame(index=test_id)

# For each target, average the predictions of every matching model file
# (only the first one when DEBUG is set).
for target in COLS_TARGET:
    model_files = sorted(XGB_ROOT.glob(f'xgb_{target}*.json'))
    if not model_files:
        # Without this guard the average would divide by zero models and
        # silently write an all-NaN column into the submission.
        raise FileNotFoundError(
            f"No model files matching 'xgb_{target}*.json' found in {XGB_ROOT}"
        )
    if DEBUG:
        model_files = model_files[:1]

    # Accumulate in float64 so the running sum has a fixed dtype.
    pred_sum = np.zeros(len(test_id), dtype=np.float64)
    for file in tqdm(model_files):
        model = xgb.Booster({'nthread': 4})
        model.load_model(file)
        pred_sum += model.predict(dtest)

    sub[target] = pred_sum / len(model_files)
    print(f"[{target}] Done")

# Make sure the output directory exists before writing the submission.
SUBMISSION_FILE.parent.mkdir(parents=True, exist_ok=True)
sub.to_csv(SUBMISSION_FILE)

In [None]:
# Display the in-memory submission frame (one column per target).
sub

In [None]:
# Read the written submission file back from disk and display it.
pd.read_csv(SUBMISSION_FILE)

### The output is available [here](https://www.kaggle.com/onodera/covid-result-of-233-sequences)