In [1]:
import numpy as np
import pandas as pd

In [2]:
# Offset and scale used to map the stored target column back to its
# original range (applied below as: value * SC_STD + SC_MEAN).
SC_MEAN, SC_STD = 16, 35
# Name of the column that holds the prediction target.
TARGET_COL = 'scalar_coupling_constant'

# Directories (relative to the notebook) for inputs and outputs.
PROC_DATA_PATH = '../proc_data/'   # processed training data and CV fold indices
SUB_PATH = '../submissions/'       # per-model and ensembled submission files
OOF_PATH = '../oofs/'              # out-of-fold prediction files

In [3]:
# Run selection: these values are interpolated into the prediction file names.
# `fold_id` also picks the column read from the CV index file, and
# `epochs + 1` is the number of prediction columns expected per file.
version, fold_id, epochs = 1, 5, 40

In [6]:
# Processed training table; the first CSV column becomes the index.
train_df = pd.read_csv(f'{PROC_DATA_PATH}train_proc_df.csv', index_col=0)

# Validation ids of the chosen fold: read only the index column and the
# column at position `fold_id`, drop empty rows, cast to int and keep the
# single remaining column as a Series.
val_idxs = (
    pd.read_csv(
        f'{PROC_DATA_PATH}val_idxs_8_fold_cv.csv',
        usecols=[0, fold_id],
        index_col=0,
    )
    .dropna()
    .astype(int)
    .iloc[:, 0]
)

  mask |= (ar1 == a)


In [7]:
# File names of the base model and of its snapshots, for test-set
# submissions and for out-of-fold (OOF) predictions.
sub_base_file = 'mol_transformer_v{}_fold{}-submission.csv'.format(version, fold_id)
sub_file = 'mol_transformer_snapshots_v{}_fold{}-submission.csv'.format(version, fold_id)
oof_base_file = 'mol_transformer_v{}_fold{}-oof.csv'.format(version, fold_id)
oof_file = 'mol_transformer_snapshots_v{}_fold{}-oof.csv'.format(version, fold_id)

# Base submission keeps its id index; it is reused later as the output template.
base_sub = pd.read_csv(f'{SUB_PATH}{sub_base_file}', index_col=0)

# Test predictions: base model first, snapshot columns after it. The base
# frame's index is dropped so both frames are joined on a plain 0..n-1 index.
subs = pd.concat(
    [base_sub.reset_index(drop=True), pd.read_csv(f'{SUB_PATH}{sub_file}')],
    axis=1,
)

# OOF predictions: base model first, snapshot columns after it, joined on
# the index stored in the first CSV column of each file.
oofs = pd.concat(
    [pd.read_csv(f'{OOF_PATH}{name}', index_col=0)
     for name in (oof_base_file, oof_file)],
    axis=1,
)

# Relabel the columns '0'..str(epochs) in both frames so the same label
# addresses the same model in `subs` and `oofs`.
subs.columns = oofs.columns = list(map(str, range(epochs + 1)))

  mask |= (ar1 == a)


In [8]:
# Rows of the training table whose molecule id belongs to the validation fold.
val_df = train_df.loc[train_df['molecule_id'].isin(val_idxs)]

# Ground truth mapped back with the scale/offset constants, and the
# per-row type used for grouping in the metric.
y_true = val_df[TARGET_COL].mul(SC_STD).add(SC_MEAN)
types = val_df['type']

In [9]:
def group_mean_log_mae(y_true, y_pred, types, return_groups=False, floor=1e-9):
    """Mean over groups of the log of the per-group mean absolute error.

    Parameters
    ----------
    y_true : pandas.Series
        Ground-truth values.
    y_pred : array-like
        Predictions, subtracted from ``y_true``.
    types : pandas.Series or array-like
        Group label for every row; the absolute errors are grouped by it.
    return_groups : bool, default False
        Accepted for interface compatibility. NOTE(review): this flag is
        currently ignored and a scalar is always returned.
    floor : float, default 1e-9
        Lower bound applied to each group's MAE before taking the log, so
        that a group with zero error contributes ``log(floor)`` instead of
        ``-inf`` (which would also trigger a divide-by-zero warning).

    Returns
    -------
    float
        Mean of ``log(max(MAE_group, floor))`` over all groups.
    """
    maes = (y_true - y_pred).abs().groupby(types).mean()
    # Clamp before the log: log(0) would turn the whole score into -inf.
    return np.log(maes.clip(lower=floor)).mean()

In [10]:
# Greedy forward selection of models for the ensemble, scored on the
# out-of-fold predictions.
#
# Each pass of the `while` loop tries every column of `oofs` as the next
# member, averages it with the members chosen so far and keeps the candidate
# giving the lowest score. `avail_models` is never reduced, so the same
# model can be picked more than once, which gives it a larger weight in the
# plain mean. The search stops when no candidate improves on the best score.
used_models, avail_models = [], list(range(epochs+1))
# Start from +inf, not 0: the metric is a mean of logs and can be positive,
# in which case a 0 baseline would reject every candidate and leave
# `used_models` empty.
improved, best_score, model_to_add = True, np.inf, None
while improved:
    improved = False
    for i in avail_models:
        # `used_models` already holds string column labels.
        candidate_cols = used_models + [str(i)]
        y_pred = oofs[candidate_cols].values.mean(axis=1)
        gmlmae = group_mean_log_mae(y_true, y_pred, types)
        if gmlmae < best_score:
            improved = True
            model_to_add = str(i)
            best_score = gmlmae
            print(best_score, candidate_cols)
    if improved:
        used_models.append(model_to_add)

-2.8489835587157417 ['0']
-2.8504853284868727 ['8']
-2.852540849279369 ['15']
-2.853238272695925 ['16']
-2.8547816737673086 ['23']
-2.855946341923628 ['24']
-2.857300856513066 ['32']
-2.859050538286121 ['40']
-2.859066922437052 ['40', '23']
-2.859152005282061 ['40', '24']
-2.85920610889473 ['40', '31']
-2.859619202107832 ['40', '32']
-2.859754098415871 ['40', '32', '40']
-2.85981527424827 ['40', '32', '40', '23']
-2.8599723178828165 ['40', '32', '40', '30']
-2.8599913763488036 ['40', '32', '40', '30', '24']
-2.8600258941444103 ['40', '32', '40', '30', '31']
-2.8602183143148627 ['40', '32', '40', '30', '38']
-2.860219109346976 ['40', '32', '40', '30', '38', '24']
-2.8602995003877028 ['40', '32', '40', '30', '38', '39']
-2.8603162465221272 ['40', '32', '40', '30', '38', '39', '40']
-2.8603363412280576 ['40', '32', '40', '30', '38', '39', '40', '23']


In [11]:
# Ensembled submission: a copy of the base submission whose target column
# is the row-wise mean of the selected models' test predictions. A label
# listed several times in `used_models` is counted that many times.
sub_ens = base_sub.assign(**{TARGET_COL: subs[used_models].mean(axis=1).to_numpy()})
sub_ens.head()

Unnamed: 0_level_0,scalar_coupling_constant
id,Unnamed: 1_level_1
4658147,16.962313
4658148,186.399607
4658149,1.787979
4658150,184.165779
4658151,17.519956


In [12]:
# Write the ensembled submission next to the per-model submission files.
sub_ens.to_csv(SUB_PATH + f'mol_transformer_se_v{version}_fold{fold_id}-submission.csv')