In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold

def load_data(base_path = "data"):
    """Read the four competition CSV tables located under ``base_path``.

    Returns:
        Tuple ``(proteins, peptides, clinical, supplement)`` of DataFrames, in
        that order.
    """
    file_names = (
        "train_proteins.csv",
        "train_peptides.csv",
        "train_clinical_data.csv",
        "supplemental_clinical_data.csv",
    )
    tables = [pd.read_csv(f"{base_path}/{file_name}") for file_name in file_names]
    return tuple(tables)

# Load the four competition tables from the Kaggle input directory.
proteins, peptides, clinical, supplement = load_data("../input/amp-parkinsons-disease-progression-prediction")
# Supplemental visits recorded at month 5 are re-labelled as month 6.
supplement["visit_month"] = supplement["visit_month"].mask(supplement["visit_month"] == 5, 6)


In [None]:
def smape1p_ind(A, F):
    """Element-wise SMAPE+1 error (in percent) between actuals ``A`` and forecasts ``F``.

    Both values are shifted by one inside the denominator:
    ``200 * |F - A| / (|A + 1| + |F + 1|)``.
    """
    abs_error = np.abs(F - A)
    scale = np.abs(A + 1) + np.abs(F + 1)
    return 200 * abs_error / scale

def smape1p(A, F):
    """Mean of the element-wise SMAPE+1 errors between actuals ``A`` and forecasts ``F``."""
    per_element = smape1p_ind(A, F)
    return per_element.mean()

def max_dif(val, lst):
    """Distance from ``val`` to the largest element of ``lst`` that is strictly below it.

    Returns ``-1`` when no element of ``lst`` is smaller than ``val``.
    """
    earlier = list(filter(lambda item: item < val, lst))
    return val - max(earlier) if earlier else -1

def count_prev_visits(val, lst):
    """Count the elements of ``lst`` that are strictly smaller than ``val``."""
    return sum(1 for item in lst if item < val)

class DataPrep:
    """Turns the clinical / protein tables into flat (visit, target) samples.

    Every output row describes one prediction task: "for this patient, standing
    at ``visit_month``, predict UPDRS part ``target_i`` at ``target_month``"
    (``horizon = target_month - visit_month``). ``transform_train`` builds such
    rows with known targets; ``transform_test`` builds them from the
    ``prediction_id`` strings of a submission frame.
    """

    # Visit months for which the per-month indicator features are generated.
    FEATURE_MONTHS = (0, 6, 12, 18, 24, 36, 48, 60, 72, 84)
    # The four UPDRS parts; each becomes one value of ``target_i``.
    TARGET_IDS = (1, 2, 3, 4)

    def __init__(self, target_horizons=(0, 6, 12, 24), test_vmonths=(0, 6, 12, 18, 24, 36, 48, 60, 72, 84)):
        """Store the horizons and visit months that samples are restricted to.

        Args:
            target_horizons: allowed values of ``target_month - visit_month``.
            test_vmonths: visit months kept when building training samples.
        """
        # Copied into fresh lists so instances never share a default argument.
        self.target_horizons = list(target_horizons)
        self.test_vmonths = list(test_vmonths)

    def fit(self, proteins_df, peptides_df, clinical_df):
        """No-op; nothing is learned from the data."""
        pass

    def fe(self, sample, proteins_df, peptides_df, clinical_df):
        """Add the engineered feature columns to ``sample`` and return it.

        ``sample`` is modified in place; it must already contain ``patient_id``,
        ``visit_id``, ``visit_month``, ``target_month`` and ``horizon``.
        ``peptides_df`` is accepted for signature symmetry but not used.
        """
        for v_month in self.FEATURE_MONTHS:
            # A month-specific flag can only be known once that month is reached.
            reached = sample["visit_month"] >= v_month

            visited = clinical_df.loc[clinical_df["visit_month"] == v_month, "patient_id"].unique()
            sample[f"visit_{v_month}m"] = (sample["patient_id"].isin(visited) & reached).astype(int)

            tested = proteins_df.loc[proteins_df["visit_month"] == v_month, "patient_id"].unique()
            sample[f"btest_{v_month}m"] = (sample["patient_id"].isin(tested) & reached).astype(int)

            sample[f"t_month_eq_{v_month}"] = (sample["target_month"] == v_month).astype(int)
            sample[f"v_month_eq_{v_month}"] = (sample["visit_month"] == v_month).astype(int)

        for hor in self.target_horizons:
            sample[f"hor_eq_{hor}"] = (sample["horizon"] == hor).astype(int)

        sample["horizon_scaled"] = sample["horizon"] / 24.0

        # 1 when a protein measurement exists for exactly this visit.
        blood_samples = proteins_df["visit_id"].unique()
        sample["blood_taken"] = sample["visit_id"].isin(blood_samples).astype(int)

        # Number of the patient's visits up to (and including) the current one
        # that do not fall on a multiple of 12 months.
        visits_by_patient = clinical_df.groupby("patient_id")["visit_month"].unique().to_dict()
        counts = [
            sum(1 for month in visits_by_patient.get(patient_id, ()) if month <= visit_month and month % 12 != 0)
            for patient_id, visit_month in zip(sample["patient_id"], sample["visit_month"])
        ]
        # Assigned positionally as an array so the row order is what matters.
        sample["count_non12_visits"] = np.array(counts, dtype=np.int64)

        return sample

    def _add_target_indicators(self, sample):
        """Add one-hot ``target_n_<i>`` columns derived from ``target_i``."""
        for tgt_i in self.TARGET_IDS:
            sample[f"target_n_{tgt_i}"] = (sample["target_i"] == tgt_i).astype(int)
        return sample

    def transform_train(self, proteins_df, peptides_df, clinical_df):
        """Build training samples (with ``target`` / ``target_norm``) from clinical data.

        Each clinical row acts as a target and is paired with every visit of the
        same patient; pairs are kept when their horizon is in
        ``self.target_horizons`` and the visit month is in ``self.test_vmonths``.
        The result is stacked once per UPDRS part, so ``target`` may be NaN where
        the corresponding score is missing.
        """
        targets = clinical_df.rename(columns={"visit_month": "target_month", "visit_id": "visit_id_target"})
        sample = targets.merge(clinical_df[["patient_id", "visit_month", "visit_id"]], how="left", on="patient_id")
        sample["horizon"] = sample["target_month"] - sample["visit_month"]
        keep = sample["horizon"].isin(self.target_horizons) & sample["visit_month"].isin(self.test_vmonths)
        # Explicit copy: `fe` writes columns, which must not land on a slice.
        sample = sample[keep].copy()

        # Features
        sample = self.fe(sample,
            proteins_df[proteins_df["visit_month"].isin(self.test_vmonths)],
            peptides_df[peptides_df["visit_month"].isin(self.test_vmonths)],
            clinical_df[clinical_df["visit_month"].isin(self.test_vmonths)])

        # Targets reshape: one copy of the sample per UPDRS part.
        res = []
        for tgt_i in self.TARGET_IDS:
            delta_df = sample.copy()
            updrs_col = f"updrs_{tgt_i}"
            if updrs_col in delta_df.columns:
                delta_df["target"] = delta_df[updrs_col]
                delta_df["target_norm"] = delta_df["target"] / 100
            delta_df["target_i"] = tgt_i
            res.append(delta_df)

        sample = pd.concat(res, axis=0).reset_index(drop=True)
        if "updrs_1" in sample.columns:
            # Drop only the score columns that exist, so a partially populated
            # table does not raise.
            present = [f"updrs_{tgt_i}" for tgt_i in self.TARGET_IDS if f"updrs_{tgt_i}" in sample.columns]
            sample = sample.drop(columns=present)

        return self._add_target_indicators(sample)

    def transform_test(self, proteins_df, peptides_df, test_df, sub_df):
        """Build inference samples from the ``prediction_id`` column of ``sub_df``.

        ``prediction_id`` is split on ``"_"``; fields 0, 1, 3 and 5 are read as
        patient id, visit month, UPDRS part and horizon. The returned frame keeps
        the index of ``sub_df`` and ``sub_df`` itself is left untouched.
        """
        parts = [prediction_id.split("_") for prediction_id in sub_df["prediction_id"]]

        def _field(position):
            # Explicit dtype keeps the columns integer even for an empty frame.
            return np.array([int(p[position]) for p in parts], dtype=np.int64)

        sample = pd.DataFrame(
            {"patient_id": _field(0), "visit_month": _field(1)},
            index=sub_df.index,
        )
        sample["visit_id"] = sample["patient_id"].astype(str) + "_" + sample["visit_month"].astype(str)
        sample["horizon"] = _field(5)
        sample["target_i"] = _field(3)
        sample["target_month"] = sample["visit_month"] + sample["horizon"]

        # Features
        sample = self.fe(sample, proteins_df, peptides_df, test_df)

        return self._add_target_indicators(sample)

dp3 = DataPrep()
dp3.fit(proteins, peptides, clinical)

# Keep only rows with a known target. `.assign` returns a new frame, so the
# `is_suppl` flag is never written onto a filtered slice of the original.
sample3 = dp3.transform_train(proteins, peptides, clinical)
sample3 = sample3[sample3["target"].notnull()].assign(is_suppl=0)

# Same preparation for the supplemental clinical data, flagged with is_suppl=1.
sup_sample3 = dp3.transform_train(proteins, peptides, supplement)
sup_sample3 = sup_sample3[sup_sample3["target"].notnull()].assign(is_suppl=1)

print(sample3.shape)
print(sup_sample3.shape)

In [None]:
def split_df(df, folds_mapping, fold_id:int = 0):
    """Split ``df`` into train / validation parts by each patient's fold.

    Rows whose patient maps to ``fold_id`` form the validation part; all other
    rows form the training part. Rows with a null ``target`` are dropped from
    both, and both parts get a fresh 0..n-1 index.
    """
    fold_of_row = df["patient_id"].map(folds_mapping)
    has_target = df["target"].notnull()
    in_valid = fold_of_row == fold_id

    df_train = df[~in_valid & has_target].reset_index(drop=True)
    df_valid = df[in_valid & has_target].reset_index(drop=True)
    return df_train, df_valid

def create_folds_mapping(df, n_folds=5, random_state=42):
    """Assign every distinct ``patient_id`` in ``df`` to one of ``n_folds`` folds.

    Returns a Series named ``fold`` indexed by ``patient_id``; the assignment
    comes from a shuffled ``KFold`` seeded with ``random_state``.
    """
    folds_df = pd.DataFrame({"patient_id": df["patient_id"].unique()})
    folds_df["fold"] = -1

    splitter = KFold(n_splits=n_folds, shuffle=True, random_state=random_state)
    for fold_no, (_, held_out) in enumerate(splitter.split(folds_df)):
        folds_df.loc[held_out, "fold"] = fold_no

    return folds_df.set_index(["patient_id"])["fold"]

from joblib import Parallel, delayed

def smape1p_ind(A, F):
    """Element-wise SMAPE+1 error (in percent) between actuals ``A`` and forecasts ``F``.

    NOTE(review): a function with this name and behaviour is also defined in an
    earlier cell; this later definition is the one that stays bound.
    """
    abs_error = np.abs(F - A)
    scale = np.abs(A + 1) + np.abs(F + 1)
    return 200 * abs_error / scale

def smape1p(A, F):
    """Mean of the element-wise SMAPE+1 errors between actuals ``A`` and forecasts ``F``.

    NOTE(review): a function with this name and behaviour is also defined in an
    earlier cell; this later definition is the one that stays bound.
    """
    per_element = smape1p_ind(A, F)
    return per_element.mean()

def smape1p_opt(x):
    """Return the integer in 0..60 that minimises the mean SMAPE+1 against ``x``.

    Each candidate is scored as a constant forecast for all of ``x``; ties go
    to the smallest candidate.
    """
    candidates = np.arange(0, 61)
    losses = [smape1p(x, candidate) for candidate in candidates]
    best = np.argmin(losses)
    return candidates[best]

def split_df(df, folds_mapping, fold_id:int = 0):
    """Split ``df`` into train / validation parts by each patient's fold.

    Rows whose patient maps to ``fold_id`` form the validation part; all other
    rows form the training part. Rows with a null ``target`` are dropped from
    both, and both parts get a fresh 0..n-1 index.

    NOTE(review): a function with this name and behaviour is also defined
    earlier in the notebook; this later definition is the one that stays bound.
    """
    fold_of_row = df["patient_id"].map(folds_mapping)
    has_target = df["target"].notnull()
    in_valid = fold_of_row == fold_id

    df_train = df[~in_valid & has_target].reset_index(drop=True)
    df_valid = df[in_valid & has_target].reset_index(drop=True)
    return df_train, df_valid

def create_folds_mapping(df, n_folds=5, random_state=42):
    """Assign every distinct ``patient_id`` in ``df`` to one of ``n_folds`` folds.

    Returns a Series named ``fold`` indexed by ``patient_id``; the assignment
    comes from a shuffled ``KFold`` seeded with ``random_state``.

    NOTE(review): a function with this name and behaviour is also defined
    earlier in the notebook; this later definition is the one that stays bound.
    """
    folds_df = pd.DataFrame({"patient_id": df["patient_id"].unique()})
    folds_df["fold"] = -1

    splitter = KFold(n_splits=n_folds, shuffle=True, random_state=random_state)
    for fold_no, (_, held_out) in enumerate(splitter.split(folds_df)):
        folds_df.loc[held_out, "fold"] = fold_no

    return folds_df.set_index(["patient_id"])["fold"]

def run_single_fit(model, df_train, df_valid, fold_id, seed, probs):
    """Fit ``model`` on ``df_train`` and return its predictions for ``df_valid``.

    The result carries ``seed``, ``fold`` and the identifying / target columns
    of ``df_valid``. With ``probs`` falsy a ``preds`` column is added; otherwise
    one ``prob_<k>`` column per predicted class is appended and the index is
    reset to 0..n-1.
    """
    record = {
        "seed": seed,
        "fold": fold_id,
        "patient_id": df_valid["patient_id"],
        "visit_month": df_valid["visit_month"],
        "target_month": df_valid["target_month"],
        "target_i": df_valid["target_i"],
        "target": df_valid["target"],
    }

    if not probs:
        record["preds"] = model.fit_predict(df_train, df_valid)
        return pd.DataFrame(record)

    proba = model.fit_predict_proba(df_train, df_valid)
    prob_columns = [f"prob_{i}" for i in range(proba.shape[1])]
    proba_df = pd.DataFrame(proba, columns=prob_columns).reset_index(drop=True)
    meta_df = pd.DataFrame(record).reset_index(drop=True)
    return pd.concat([meta_df, proba_df], axis=1)

class BaseModel:
    """Base class providing fit/predict plumbing and evaluation loops.

    Subclasses implement ``fit`` and ``predict`` (and ``predict_proba`` when
    class probabilities are needed). ``cv``, ``cvx`` and ``loo`` then evaluate
    the model with patient-grouped folds.
    """

    def fit(self, df_train):
        """Train on ``df_train``; must be overridden."""
        raise NotImplementedError

    def predict(self, df_valid):
        """Return point predictions for ``df_valid``; must be overridden."""
        raise NotImplementedError

    def predict_proba(self, df_valid):
        """Return per-class probabilities for ``df_valid``; must be overridden."""
        raise NotImplementedError

    def fit_predict(self, df_train, df_valid):
        """Fit on ``df_train`` and return ``predict(df_valid)``."""
        self.fit(df_train)
        return self.predict(df_valid)

    def fit_predict_proba(self, df_train, df_valid):
        """Fit on ``df_train`` and return ``predict_proba(df_valid)``."""
        self.fit(df_train)
        return self.predict_proba(df_valid)

    def cv(self, sample, sup_sample=None, n_folds=5, random_state=42):
        """Sequential patient-grouped K-fold cross-validation.

        ``sup_sample``, when given, is appended to every training fold but is
        never validated on. Returns the out-of-fold predictions of all folds
        stacked in one DataFrame (``None`` if there were no folds).
        """
        folds_mapping = create_folds_mapping(sample, n_folds, random_state)

        fold_results = []
        for fold_id in sorted(folds_mapping.unique()):
            df_train, df_valid = split_df(sample, folds_mapping, fold_id)
            if sup_sample is not None:
                df_train = pd.concat([df_train, sup_sample], axis=0)
            p = self.fit_predict(df_train, df_valid)
            fold_results.append(pd.DataFrame({"fold": fold_id,  \
                    "patient_id":df_valid["patient_id"], "visit_month":df_valid["visit_month"], \
                    "target_month":df_valid["target_month"], "target_i":df_valid["target_i"], \
                    "target":df_valid["target"], "preds":p}))

        # Concatenate once at the end instead of re-copying the result per fold.
        return pd.concat(fold_results, axis=0) if fold_results else None

    def cvx(self, sample, sup_sample=None, n_runs=1, n_folds=5, random_state=42, probs=False):
        """Repeated patient-grouped K-fold CV, with the fits run through joblib.

        ``n_runs`` fold assignments are drawn from seeds generated after seeding
        NumPy's global RNG with ``random_state``. Returns the stacked outputs of
        ``run_single_fit`` for every (seed, fold) pair.
        """
        np.random.seed(random_state)
        seeds = np.random.randint(0, 1e6, n_runs)

        run_args = []
        for seed in seeds:
            folds_mapping = create_folds_mapping(sample, n_folds, seed)
            for fold_id in sorted(folds_mapping.unique()):
                df_train, df_valid = split_df(sample, folds_mapping, fold_id)
                if sup_sample is not None:
                    df_train = pd.concat([df_train, sup_sample], axis=0)
                run_args.append(dict(
                    df_train = df_train,
                    df_valid = df_valid,
                    fold_id = fold_id,
                    seed = seed,
                    probs = probs
                ))

        res = Parallel(-1)(delayed(run_single_fit)(self, **args) for args in run_args)
        return pd.concat(res, axis=0)

    def loo(self, sample, sup_sample=None, probs=False, sample2=None):
        """Leave-one-patient-out evaluation, with the fits run through joblib.

        For each patient in ``sample`` the model is trained on all other
        patients (plus ``sup_sample`` if given) and evaluated on that patient's
        rows from ``sample2`` (defaults to ``sample``).
        """
        if sample2 is None:
            sample2 = sample
        run_args = []
        for patient_id in sample["patient_id"].unique():
            df_train = sample[sample["patient_id"] != patient_id]
            df_valid = sample2[sample2["patient_id"] == patient_id]
            if sup_sample is not None:
                df_train = pd.concat([df_train, sup_sample], axis=0)
            run_args.append(dict(
                df_train = df_train,
                df_valid = df_valid,
                fold_id = None,
                seed = None,
                probs=probs
            ))

        res = Parallel(-1)(delayed(run_single_fit)(self, **args) for args in run_args)
        return pd.concat(res, axis=0)

def print_cvx_summary(res_df):
    """Print SMAPE+1 statistics for a repeated-CV result frame.

    Reports the number of (seed, fold) runs, the mean of the per-run scores and
    the overall score computed on all rows at once.
    """
    per_run = res_df.groupby(["seed", "fold"]).apply(lambda grp: smape1p(grp["target"], grp["preds"]))
    scores = per_run.values
    overall = smape1p(res_df["target"], res_df["preds"])
    print("# ", len(scores), " runs")
    print("# mean :", scores.mean())
    print("# ovrl :", overall)

def print_loo_summary(res_df):
    """Print SMAPE+1 statistics for a leave-one-patient-out result frame.

    Reports the number of patients scored, the mean of the per-patient scores
    and the overall score computed on all rows at once.
    """
    per_patient = res_df.groupby(["patient_id"]).apply(lambda grp: smape1p(grp["target"], grp["preds"]))
    scores = per_patient.values
    overall = smape1p(res_df["target"], res_df["preds"])
    print("# ", len(scores), " runs")
    print("# mean :", scores.mean())
    print("# ovrl :", overall)



In [None]:
import numpy as np
import pandas as pd
import sys
import random
from tqdm import tqdm
import gc
import torch
from torch import optim
from torch.cuda.amp import GradScaler, autocast
from collections import defaultdict
from copy import copy
import os
from transformers import get_cosine_schedule_with_warmup
from torch.utils.data import SequentialSampler, DataLoader
from sklearn.metrics import roc_auc_score, f1_score, cohen_kappa_score
from torch.utils.data import Dataset
from scipy.special import softmax

# Restrict PyTorch to one CPU thread. NOTE(review): presumably to avoid thread
# oversubscription when several fits run in parallel through joblib — confirm.
torch.set_num_threads(1)

def single_smape1p(preds, tgt):
    """Expected SMAPE+1-style loss of predicting ``tgt`` under class probabilities.

    ``preds`` is a 2-D array with one row per sample and one column per class
    value ``0..k-1``. For each row the per-class loss
    ``|c - tgt| / (2 + c + tgt)`` is weighted by the row's probabilities and
    summed.
    """
    n_rows, n_classes = preds.shape[0], preds.shape[1]
    class_values = np.tile(np.arange(n_classes), (n_rows, 1))
    loss_per_class = np.abs(class_values - tgt) / (2 + class_values + tgt)
    weighted = loss_per_class * preds
    return weighted.sum(axis=1)

def opt_smape1p(preds):
    """For each row of class probabilities, pick the class with the lowest expected loss.

    Every class index is scored with ``single_smape1p`` and the per-row argmin
    over those scores is returned.
    """
    n_classes = preds.shape[1]
    expected_losses = np.stack([single_smape1p(preds, cls) for cls in range(n_classes)], axis=1)
    return expected_losses.argmin(axis=1)


from types import SimpleNamespace
from torch import nn
import torch

class CustomDataset(Dataset):
    """Torch dataset serving feature rows and targets taken from a DataFrame.

    The feature columns are ``cfg.features``. Targets come from
    ``cfg.target_column`` (cast to float32), or are all zeros when ``mode`` is
    ``"test"``. ``aug`` is accepted but not used.
    """

    def __init__(self, df, cfg, aug, mode="train"):
        self.cfg = cfg
        self.mode = mode
        self.df = df.copy()
        self.features = df[cfg.features].values
        if mode == "test":
            # No labels at inference time; zeros act as placeholders.
            self.targets = np.zeros(len(df))
        else:
            self.targets = df[cfg.target_column].values.astype(np.float32)

    def __getitem__(self, idx):
        return {
            "input": torch.tensor(self.features[idx]),
            "target_norm": torch.tensor(self.targets[idx]),
        }

    def __len__(self):
        return len(self.df)


class Net(nn.Module):
    """Small MLP regressor trained directly on a SMAPE-like loss.

    Architecture: ``Linear(len(cfg.features), n_hidden)`` followed by
    ``cfg.n_layers`` hidden ``Linear(n_hidden, n_hidden)`` layers (LeakyReLU
    after each), then a ``Linear(n_hidden, n_classes)`` head with LeakyReLU.

    Each hidden layer has its own weights. The previous construction
    (``[Linear, LeakyReLU] * n_layers``) repeated one ``nn.Linear`` instance,
    which silently tied the weights of all hidden layers. Set
    ``cfg.share_hidden_weights = True`` to reproduce that legacy behaviour.
    """

    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        self.n_classes = cfg.n_classes

        layers = [nn.Linear(len(cfg.features), cfg.n_hidden), nn.LeakyReLU()]
        if getattr(cfg, "share_hidden_weights", False):
            # Legacy mode: one Linear module reused at every hidden position.
            shared_hidden = nn.Linear(cfg.n_hidden, cfg.n_hidden)
            for _ in range(cfg.n_layers):
                layers += [shared_hidden, nn.LeakyReLU()]
        else:
            for _ in range(cfg.n_layers):
                layers += [nn.Linear(cfg.n_hidden, cfg.n_hidden), nn.LeakyReLU()]
        self.cnn = nn.Sequential(*layers)

        self.head = nn.Sequential(
            nn.Linear(cfg.n_hidden, self.n_classes),
            nn.LeakyReLU(),
        )

    def forward(self, batch):
        """Run the network on ``batch["input"]`` and score against ``batch["target_norm"]``.

        Returns a dict with the scalar ``loss``, the predictions ``preds``
        (last dimension squeezed) and the targets passed through unchanged.
        """
        input = batch["input"].float()
        y = batch["target_norm"]
        x = input
        x = self.cnn(x)
        preds = self.head(x).squeeze(-1)
        # SMAPE-like loss; the 0.01 offsets keep the denominator away from zero
        # when both target and prediction are zero.
        loss = (torch.abs(y - preds) / (torch.abs(0.01 + y) + torch.abs(0.01 + preds))).mean()
        return {"loss": loss, "preds": preds, "target_norm": y}


def worker_init_fn(worker_id):
    """Re-seed NumPy's global RNG in a DataLoader worker.

    The new seed is the first word of the current RNG state plus ``worker_id``.
    """
    base_seed = np.random.get_state()[1][0]
    np.random.seed(base_seed + worker_id)

def get_train_dataloader(train_ds, cfg, verbose):
    """Create the shuffling training DataLoader configured from ``cfg``.

    Reads ``batch_size``, ``num_workers``, ``tr_collate_fn`` and ``drop_last``
    from ``cfg``; prints the dataset and loader sizes when ``verbose``.
    """
    loader_options = dict(
        sampler=None,
        shuffle=True,
        batch_size=cfg.batch_size,
        num_workers=cfg.num_workers,
        pin_memory=False,
        collate_fn=cfg.tr_collate_fn,
        drop_last=cfg.drop_last,
        worker_init_fn=worker_init_fn,
    )
    train_dataloader = DataLoader(train_ds, **loader_options)
    if verbose:
        print(f"train: dataset {len(train_ds)}, dataloader {len(train_dataloader)}")
    return train_dataloader


def get_val_dataloader(val_ds, cfg, verbose):
    """Create a sequential (non-shuffling) DataLoader for validation or test data.

    Uses ``cfg.batch_size_val`` when it is set, otherwise ``cfg.batch_size``;
    prints the dataset and loader sizes when ``verbose``.
    """
    batch_size = cfg.batch_size if cfg.batch_size_val is None else cfg.batch_size_val
    loader_options = dict(
        sampler=SequentialSampler(val_ds),
        batch_size=batch_size,
        num_workers=cfg.num_workers,
        pin_memory=False,
        collate_fn=cfg.val_collate_fn,
        worker_init_fn=worker_init_fn,
    )
    val_dataloader = DataLoader(val_ds, **loader_options)
    if verbose:
        print(f"valid: dataset {len(val_ds)}, dataloader {len(val_dataloader)}")
    return val_dataloader


def get_scheduler(cfg, optimizer, total_steps):
    """Build a cosine learning-rate schedule with warm-up for ``optimizer``.

    Args:
        cfg: needs ``batch_size``, ``warmup`` and ``epochs`` (both counted in
            epochs); ``drop_last`` is read when present.
        optimizer: the optimizer to schedule.
        total_steps: number of training samples (despite the name, this is the
            dataset size, not a step count).

    The number of optimizer steps per epoch is the number of batches the
    training loader yields: floor division only when the last partial batch is
    dropped, ceiling division otherwise. Plain floor division undercounts the
    steps and is 0 for datasets smaller than one batch, which makes the
    schedule run past its end.
    """
    if getattr(cfg, "drop_last", False):
        steps_per_epoch = total_steps // cfg.batch_size
    else:
        steps_per_epoch = (total_steps + cfg.batch_size - 1) // cfg.batch_size
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=cfg.warmup * steps_per_epoch,
        num_training_steps=cfg.epochs * steps_per_epoch,
    )
    return scheduler


def set_seed(seed=1234):
    """Seed Python's ``random``, NumPy and PyTorch (CPU and CUDA) with ``seed``.

    Also stores the seed in the ``PYTHONHASHSEED`` environment variable.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)

def batch_to_device(batch, device):
    """Return a new dict holding every value of ``batch`` moved to ``device``."""
    moved = {}
    for name in batch:
        moved[name] = batch[name].to(device)
    return moved

def run_eval(model, val_dataloader, cfg, pre="val", verbose=True):
    """Run ``model`` over ``val_dataloader`` and return predictions scaled by 100.

    The model is put in eval mode and the forward passes run without gradient
    tracking. Outputs of all batches are gathered per key (0-dim tensors are
    stacked, others concatenated along dim 0). When ``pre == "val"`` and
    ``verbose`` the SMAPE+1 of the raw and of the rounded predictions is
    printed.

    Returns:
        NumPy array ``100 * preds``; an empty array if the loader yields no
        batches.
    """
    model.eval()
    val_data = defaultdict(list)
    progress_bar = tqdm(val_dataloader) if verbose else val_dataloader

    # Scoped no_grad: autograd is restored on exit instead of being left
    # disabled for the rest of the process.
    with torch.no_grad():
        for data in progress_bar:
            batch = batch_to_device(data, cfg.device)
            if cfg.mixed_precision:
                with autocast():
                    output = model(batch)
            else:
                output = model(batch)
            for key, val in output.items():
                val_data[key].append(val)

    if not val_data:
        # Nothing to evaluate (empty loader).
        return np.zeros(0, dtype=np.float32)

    merged = {}
    for key, chunks in val_data.items():
        if chunks[0].dim() == 0:
            merged[key] = torch.stack(chunks)
        else:
            merged[key] = torch.cat(chunks, dim=0)

    preds = merged["preds"].cpu().numpy()
    if (pre == "val") and verbose:
        targets = 100*merged["target_norm"].cpu().numpy()
        metric = smape1p(targets, 100*preds)
        print(f"{pre}_metric 1 ", metric)
        metric = smape1p(targets, np.round(100*preds))
        print(f"{pre}_metric 2 ", metric)

    return 100*preds


def run_train(cfg, train_df, val_df, test_df=None, verbose=True):
    """Train a ``Net`` on ``train_df`` according to ``cfg``.

    Args:
        cfg: configuration namespace (model size, optimiser, device, ...).
            ``cfg.curr_step`` is reset to 0; ``cfg.seed`` is only read.
        train_df: training samples.
        val_df: optional validation samples, evaluated every
            ``cfg.eval_epochs`` epochs and after the last epoch.
        test_df: optional samples to predict after training.
        verbose: print progress and metrics.

    Returns:
        The predictions for ``test_df`` (as returned by ``run_eval``) when
        ``test_df`` is given, otherwise the trained model.
    """
    os.makedirs(str(cfg.output_dir + "/"), exist_ok=True)

    # A negative cfg.seed means "draw a random seed". It is kept in a local so
    # the shared cfg is not overwritten; otherwise every later call (e.g. the
    # other members of a bag) would silently reuse the first drawn seed.
    seed = cfg.seed
    if seed < 0:
        seed = np.random.randint(1_000_000)
    if verbose:
        print("seed", seed)
    set_seed(seed)

    train_dataset = CustomDataset(train_df, cfg, aug=None, mode="train")
    train_dataloader = get_train_dataloader(train_dataset, cfg, verbose)

    if val_df is not None:
        val_dataset = CustomDataset(val_df, cfg, aug=None, mode="val")
        val_dataloader = get_val_dataloader(val_dataset, cfg, verbose)

    if test_df is not None:
        test_dataset = CustomDataset(test_df, cfg, aug=None, mode="test")
        test_dataloader = get_val_dataloader(test_dataset, cfg, verbose)

    model = Net(cfg)
    model.to(cfg.device)

    total_steps = len(train_dataset)
    optimizer = optim.Adam(model.parameters(), lr=cfg.lr, weight_decay=0)
    scheduler = get_scheduler(cfg, optimizer, total_steps)

    scaler = GradScaler() if cfg.mixed_precision else None

    cfg.curr_step = 0
    optimizer.zero_grad()
    for epoch in range(cfg.epochs):
        set_seed(seed + epoch)
        if verbose:
            print("EPOCH:", epoch)
            batches = tqdm(train_dataloader)
        else:
            batches = train_dataloader
        gc.collect()

        for data in batches:
            model.train()
            # Evaluation may have switched gradients off; make sure they are on.
            torch.set_grad_enabled(True)
            batch = batch_to_device(data, cfg.device)
            if cfg.mixed_precision:
                with autocast():
                    output_dict = model(batch)
            else:
                output_dict = model(batch)
            loss = output_dict["loss"]
            if cfg.mixed_precision:
                scaler.scale(loss).backward()
                # Gradients must be unscaled before clipping, otherwise the
                # threshold is compared against loss-scaled values.
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), cfg.gradient_clip)
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), cfg.gradient_clip)
                optimizer.step()
                optimizer.zero_grad()
            if scheduler is not None:
                scheduler.step()
        if val_df is not None:
            if (epoch + 1) % cfg.eval_epochs == 0 or (epoch + 1) == cfg.epochs:
                run_eval(model, val_dataloader, cfg, pre="val", verbose=verbose)

    if test_df is not None:
        return run_eval(model, test_dataloader, cfg, pre="test", verbose=verbose)
    else:
        return model

def run_test(model, cfg, test_df):
    """Predict ``test_df`` with a trained ``model``, without progress output.

    Returns whatever ``run_eval`` returns for a test-mode dataset built from
    ``test_df``.
    """
    dataset = CustomDataset(test_df, cfg, aug=None, mode="test")
    loader = get_val_dataloader(dataset, cfg, verbose=False)
    predictions = run_eval(model, loader, cfg, pre="test", verbose=False)
    return predictions


In [None]:
import lightgbm as lgb

def single_smape1p(preds, tgt):
    """Expected SMAPE+1-style loss of predicting ``tgt`` under class probabilities.

    ``preds`` is a 2-D array with one row per sample and one column per class
    value ``0..k-1``. For each row the per-class loss
    ``|c - tgt| / (2 + c + tgt)`` is weighted by the row's probabilities and
    summed.

    NOTE(review): a function with this name and behaviour is also defined in an
    earlier cell; this later definition is the one that stays bound.
    """
    n_rows, n_classes = preds.shape[0], preds.shape[1]
    class_values = np.tile(np.arange(n_classes), (n_rows, 1))
    loss_per_class = np.abs(class_values - tgt) / (2 + class_values + tgt)
    weighted = loss_per_class * preds
    return weighted.sum(axis=1)

def opt_smape1p(preds):
    """For each row of class probabilities, pick the class with the lowest expected loss.

    Every class index is scored with ``single_smape1p`` and the per-row argmin
    over those scores is returned.

    NOTE(review): a function with this name and behaviour is also defined in an
    earlier cell; this later definition is the one that stays bound.
    """
    n_classes = preds.shape[1]
    expected_losses = np.stack([single_smape1p(preds, cls) for cls in range(n_classes)], axis=1)
    return expected_losses.argmin(axis=1)

class LGBClassModel1(BaseModel):
    """LightGBM model whose point prediction minimises the expected SMAPE+1 loss.

    ``predict_proba`` returns the booster's raw prediction output and
    ``predict`` converts it to a class index with ``opt_smape1p``.
    """

    def __init__(self, params, features) -> None:
        self.params = params
        self.features = features

    def fit(self, df_train):
        """Train a booster on ``df_train[self.features]`` against ``df_train["target"]``.

        When no feature list was given, every column starting with ``"v_"`` is
        used. ``n_estimators`` is removed from the parameter dict and passed as
        the number of boosting rounds.
        """
        if self.features is None:
            self.features = [name for name in df_train.columns if name.startswith("v_")]
        train_set = lgb.Dataset(df_train[self.features], df_train["target"])
        booster_params = {key: value for key, value in self.params.items() if key not in ["n_estimators"]}
        n_rounds = self.params["n_estimators"]
        self.m_gbm = lgb.train(booster_params, train_set, num_boost_round=n_rounds)
        return self

    def predict_proba(self, df_valid):
        return self.m_gbm.predict(df_valid[self.features])

    def predict(self, df_valid):
        class_probs = self.predict_proba(df_valid)
        return opt_smape1p(class_probs)



# LightGBM parameters for the multiclass model.
# NOTE(review): the high-precision values look like the output of a
# hyper-parameter search that is not part of this notebook — confirm provenance.
params = {
        'boosting_type': 'gbdt',
        'objective': 'multiclass',
        # Number of classes; presumably integer target values 0..86 — TODO confirm.
        'num_class': 87,
        # Not passed to LightGBM as a parameter: LGBClassModel1.fit strips it and
        # uses it as num_boost_round.
        "n_estimators": 300,

        'learning_rate': 0.019673004699536346,
        'num_leaves': 208,
        'max_depth': 14,
        'min_data_in_leaf': 850,
        'feature_fraction': 0.5190632906197453,
        'lambda_l1': 7.405660751699475e-08,
        'lambda_l2': 0.14583961675675494,
        'max_bin': 240,
    
        'verbose': -1,
        'force_col_wise': True,
        'n_jobs': -1,
    }

# Feature columns used by the LightGBM model.
features = [
    "target_i", "target_month", "horizon", "visit_month", "visit_6m", "blood_taken",
    "visit_18m", "is_suppl",
    "count_non12_visits",
    "visit_48m",
]

# Final model: trained on the main and the supplemental samples together.
lgb_train_df = pd.concat([sample3, sup_sample3], axis=0)
model_lgb = LGBClassModel1(params, features).fit(lgb_train_df)
    

In [None]:
class NNRegModel1(BaseModel):
    """Bag of neural-network regressors trained with ``run_train``.

    ``cfg.bag_size`` models are trained; their predictions are combined with
    the function named by ``cfg.bag_agg_function`` (``"max"``, ``"median"``,
    anything else means the mean). The ``features`` argument is accepted but
    ignored — the feature list is taken from ``cfg``.
    """

    def __init__(self, cfg, features=None) -> None:
        self.cfg = cfg

    def fit(self, df_train):
        self.models = []
        for _ in range(self.cfg.bag_size):
            self.models.append(run_train(self.cfg, df_train, None, None, verbose=False))
        return self

    def predict(self, df_valid):
        per_model = np.vstack([run_test(member, self.cfg, df_valid) for member in self.models])
        aggregators = {"max": np.max, "median": np.median}
        combine = aggregators.get(self.cfg.bag_agg_function, np.mean)
        return combine(per_model, axis=0)


# Configuration of the neural-network model, built in a single call.
cfg = SimpleNamespace(
    tr_collate_fn=None,
    val_collate_fn=None,

    target_column="target_norm",
    output_dir="results/nn_temp",
    seed=-1,  # negative: run_train draws a random seed
    eval_epochs=1,
    mixed_precision=False,
    device="cpu",  # alternative used during development: "cuda:2"

    n_classes=1,
    batch_size=128,
    batch_size_val=256,
    n_hidden=64,
    n_layers=2,  # 3
    num_workers=0,
    drop_last=False,
    gradient_clip=1.0,

    bag_size=1,
    bag_agg_function="mean",
    lr=2e-3,
    warmup=0,
    epochs=10,
)

# Input features: indicator columns are collected by prefix from sample3.
# ("blood_taken" was tried and left out.)
cfg.features = (
    ["visit_6m"]
    + [c for c in sample3.columns if c.startswith("t_month_eq_")]
    + [c for c in sample3.columns if c.startswith("v_month_eq_")]
    + [c for c in sample3.columns if c.startswith("hor_eq_")]
    + [c for c in sample3.columns if c.startswith("target_n_")]
    + ["visit_18m", "visit_48m", "is_suppl", "horizon_scaled"]
)

# Final model: trained on the main and the supplemental samples together.
nn_train_df = pd.concat([sample3, sup_sample3], axis=0)
model_nn = NNRegModel1(cfg)
model_nn = model_nn.fit(nn_train_df)


In [None]:
import sys
# Make the competition's API package (stored under the input directory) importable.
sys.path.append('/kaggle/input/amp-pd')

import amp_pd_peptide
# Create the competition environment and the iterator that yields test batches.
env = amp_pd_peptide.make_env()
iter_test = env.iter_test()

def repl(x1, x2, cond):
    """Return a copy of ``x1`` in which the entries selected by ``cond`` come from ``x2``.

    ``x1`` itself is left unchanged.
    """
    patched = x1.copy()
    patched[cond] = x2[cond]
    return patched

# Accumulators for everything the API has served so far; features for the
# current batch are computed from the full history, not just the latest batch.
all_test_peptides = None
all_test_proteins = None
all_test_df = None

for (test_df, test_peptides, test_proteins, sample_submission) in iter_test:
    # On the first iteration the accumulator is None, which pd.concat skips.
    all_test_df = pd.concat([all_test_df, test_df], axis=0)
    all_test_proteins = pd.concat([all_test_proteins, test_proteins], axis=0)
    all_test_peptides = pd.concat([all_test_peptides, test_peptides], axis=0)
    # One feature row per prediction_id of the current submission frame.
    sample_test = dp3.transform_test(all_test_proteins, all_test_peptides, all_test_df, sample_submission)
    # Test rows never come from the supplemental data set.
    sample_test["is_suppl"] = 0
    
    sample_test["preds_lgb"] = model_lgb.predict(sample_test)
    # NN outputs are clipped at zero and rounded to whole scores.
    sample_test["preds_nn"] = np.round(np.clip(model_nn.predict(sample_test), 0, None))
    
    # Blend: rounded average of both models (assignment aligns on the index).
    sample_submission["rating"] = np.round( (sample_test["preds_lgb"] + sample_test["preds_nn"]) / 2)
    
    # Hand the predictions for this batch back to the competition API.
    env.predict(sample_submission)


In [None]:
# Read back the submission file from the working directory and display it
# as a sanity check.
sub = pd.read_csv('/kaggle/working/submission.csv')
sub