# Dependencies

In [None]:
import sys
sys.path.append('../input/pytorch-image-models/pytorch-image-models-master')
import pandas as pd, numpy as np, os
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
import math
import os

import warnings
import random
from matplotlib import pyplot as plt
import seaborn as sns
from typing import *
import albumentations
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold
import cv2

import numpy as np
import pandas as pd
import timm
import torch
import torch.nn.functional as F
from albumentations.pytorch.transforms import ToTensorV2
from sklearn.preprocessing import LabelEncoder
from torch import nn
from torch.autograd import Variable
from torch.optim.lr_scheduler import _LRScheduler
from torch.optim.optimizer import Optimizer

from torchvision import models
from tqdm.notebook import tqdm
import pandas as pd
warnings.filterwarnings("ignore")
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts, CosineAnnealingLR, ReduceLROnPlateau
from IPython.display import clear_output 
clear_output()

# Config

In [None]:
import albumentations as A

# Single nested dictionary holding every setting of the experiment.
# It is a plain dict, so values are read with CONFIG["SECTION"]["KEY"].
CONFIG = {
    "COMPETITION_NAME": "SETI-ALIENS",
    "MODEL": {"MODEL_FACTORY": "timm", "MODEL_NAME": "resnet18d"},
    # Read by get_train_file_path to pick the data root ("Kaggle" or "Colab").
    "WORKSPACE": "Kaggle",
    "DATA": {
        # Label column; make_folds stratifies on it.
        "TARGET_COL_NAME": "target",
        # Identifier column; make_folds passes it as X to the splitter.
        "IMAGE_COL_NAME": "id",
        "NUM_CLASSES": 1,
        "CLASS_LIST": [0, 1],
        "IMAGE_SIZE": 512,
        "CHANNEL_MODE": "spatial_6ch",
        "USE_MIXUP": True
    },
    # make_folds branches on SCHEMA and creates NUM_FOLDS folds numbered from 1.
    "CROSS_VALIDATION": {"SCHEMA" : 'StratifiedKFold', "NUM_FOLDS": 4},
    "TRAIN": {
        "DATALOADER": {
            "batch_size": 32,
            "shuffle": True, #using random sampler
            "num_workers": 4,
            "drop_last": False,
        },
        "SETTINGS": {
            "IMAGE_SIZE": 512,
            "NUM_EPOCHS": 8,
            "USE_AMP": True,
            "USE_GRAD_ACCUM": False,
            "ACCUMULATION_STEP": 1,
            "DEBUG": False,
            "VERBOSE": True,
            "VERBOSE_STEP": 10,
        },
    },
    "VALIDATION": {
        "DATALOADER": {
            "batch_size": 32,
            "shuffle": False,
            "num_workers": 4,
            "drop_last": False,
        }
    },
    "TEST": {
        "DATALOADER": {
            "batch_size": 32,
            "shuffle": False,
            "num_workers": 4,
            "drop_last": False,
        }
    },
    "OPTIMIZER": {
        "NAME": "AdamW",
        "OPTIMIZER_PARAMS": {"lr": 1e-4, "eps": 1.0e-8, "weight_decay": 1.0e-3},
    },
    "SCHEDULER": {
        "NAME": "CosineAnnealingWarmRestarts",
        "SCHEDULER_PARAMS": {
            "T_0": 4,
            "T_mult": 1,
            "eta_min": 1.0e-7,
            "last_epoch": -1,
            "verbose": True,
        # Alternative scheduler kept for reference (commented out, so it is
        # NOT part of SCHEDULER_PARAMS even though it sits inside the braces):
        # "NAME": "CosineAnnealingLR",
        # "SCHEDULER_PARAMS": {
        #     "T_max": 16,
        #     "eta_min": 1.0e-7,
        #     "last_epoch": -1,
        #     "verbose": True,
        },
        "CUSTOM": "GradualWarmupSchedulerV2",
        "CUSTOM_PARAMS": {"multiplier": 10, "total_epoch": 1},
        "VAL_STEP": False,
    },
    "CRITERION_TRAIN": {
        "NAME": "BCEWithLogitsLoss",
        "LOSS_PARAMS": {
            "weight": None,
            "size_average": None,
            "reduce": None,
            "reduction": "mean",
            "pos_weight": None
        },
    },
    "CRITERION_VALIDATION": {
        "NAME": "BCEWithLogitsLoss",
        "LOSS_PARAMS": {
            "weight": None,
            "size_average": None,
            "reduce": None,
            "reduction": "mean",
            "pos_weight": None
        },
    },
    "TRAIN_TRANSFORMS": {
        # "RandomResizedCrop": {"height": 384, "width": 384, "scale": [0.9, 1.0], "p": 1},
        
        "VerticalFlip": {"p": 0.5},
        "HorizontalFlip": {"p": 0.5},
        "Resize": {"height": 512, "width": 512, "p": 1},
        #"Normalize": {"mean": (0.485, 0.456, 0.406), "std": (0.229, 0.224, 0.225)},
       
    },
    "VALID_TRANSFORMS": {
        "Resize": {"height": 512, "width": 512, "p": 1},
        #"Normalize": {"mean": (0.485, 0.456, 0.406), "std": (0.229, 0.224, 0.225)},
    },
    "TEST_TRANSFORMS": {
        # NOTE(review): 384x384 here differs from the 512x512 used by the
        # train/valid transforms and IMAGE_SIZE above - confirm it is intended.
        "Resize": {"height": 384, "width": 384, "p": 1},
        #"Normalize": {"mean": (0.485, 0.456, 0.406), "std": (0.229, 0.224, 0.225)},
    },
    "PATH": {
        "DATA_DIR": "/content/",
        # Read with pd.read_csv in the "Load Files" cell.
        "TRAIN_CSV": "../input/seti-breakthrough-listen/train_labels.csv",
#         "TRAIN_PATH": "/content/jpeg-melanoma-384x384/train",
        
#         "TEST_CSV": "/content/jpeg-melanoma-384x384/test.csv",
#         "TEST_PATH": "/content/jpeg-melanoma-384x384/test",
        "SAVE_WEIGHT_PATH": "./",
        "OOF_PATH": "./",
        "LOG_PATH": "./log.txt",
        # "SAVE_WEIGHT_PATH": "/content/drive/MyDrive/Kaggle Projects/[Kaggle] SETI Breakthrough Listen - E.T. Signal Search/JuneV1/",
        # "OOF_PATH": "/content/drive/MyDrive/Kaggle Projects/[Kaggle] SETI Breakthrough Listen - E.T. Signal Search/JuneV1/",
        # "LOG_PATH": "/content/drive/MyDrive/Kaggle Projects/[Kaggle] SETI Breakthrough Listen - E.T. Signal Search/JuneV1/log.txt",
    },
    # Passed to seed_all and used as random_state of the StratifiedKFold splitter.
    "SEED": 19921930,
    # NOTE: `device` below is derived from torch.cuda.is_available(), not from this key.
    "DEVICE": "cuda",
    "GPU": "V100",
}

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Lower-case alias: `config` and `CONFIG` refer to the same dict object.
config = CONFIG

# Seeding

In [None]:
def seed_all(seed: int = 1930):
    """Seed every random number generator used here with ``seed``.

    Prints the seed, stores it in the ``PYTHONHASHSEED`` environment
    variable, seeds PyTorch (CPU and CUDA), NumPy and the built-in ``random``
    module, and finally sets the cuDNN flags (deterministic on, benchmark
    off, cuDNN disabled).
    """
    print("Using Seed Number {}".format(seed))

    # Set the PYTHONHASHSEED environment variable to a fixed value.
    os.environ["PYTHONHASHSEED"] = str(seed)

    # Same seeding order as before: torch (CPU, all GPUs, current GPU),
    # then NumPy, then Python's built-in generator.
    seeders = (
        torch.manual_seed,
        torch.cuda.manual_seed_all,
        torch.cuda.manual_seed,
        np.random.seed,
        random.seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn_flags = (("deterministic", True), ("benchmark", False), ("enabled", False))
    for flag_name, flag_value in cudnn_flags:
        setattr(torch.backends.cudnn, flag_name, flag_value)


def seed_worker(_worker_id):
    """Re-seed NumPy and ``random`` from the current torch seed.

    The torch initial seed is reduced modulo 2**32 and that value is used for
    both generators. ``_worker_id`` is accepted but not used. The signature
    looks like a DataLoader ``worker_init_fn``; no call site is visible in
    this file, so confirm before relying on that.
    """
    worker_seed = torch.initial_seed() % (1 << 32)
    for reseed in (np.random.seed, random.seed):
        reseed(worker_seed)
    
# Seed all generators once, at definition time, with the configured experiment seed.
seed_all(config['SEED'])

# Load Files

In [None]:
# Training labels; the cells below read its 'id' and 'target' columns.
train = pd.read_csv(CONFIG['PATH']['TRAIN_CSV'])
# test = pd.read_csv('../input/seti-breakthrough-listen/sample_submission.csv')

def get_train_file_path(image_id, workspace=None):
    """Return the path of the training ``.npy`` file for ``image_id``.

    Files are grouped in sub-directories named after the first character of
    the id, below a root that depends on the workspace.

    Args:
        image_id: Identifier of the sample; must be a non-empty string.
        workspace: ``"Kaggle"`` or ``"Colab"``. When omitted, the value of
            ``config['WORKSPACE']`` is used, so existing one-argument callers
            (for example ``Series.apply``) behave as before.

    Returns:
        The file path as a string.

    Raises:
        ValueError: If the workspace is not supported. Previously the
            function fell through and returned ``None``, which only failed
            later when the path was used.
    """
    train_roots = {
        "Kaggle": "../input/seti-breakthrough-listen/train",
        "Colab": "/content/seti-breakthrough-listen",
    }
    if workspace is None:
        workspace = config['WORKSPACE']
    if workspace not in train_roots:
        raise ValueError(
            "Unsupported WORKSPACE {!r}; expected one of {}".format(
                workspace, sorted(train_roots)
            )
        )
    return "{}/{}/{}.npy".format(train_roots[workspace], image_id[0], image_id)

# def get_test_file_path(image_id):
#     return "../input/seti-breakthrough-listen/test/{}/{}.npy".format(image_id[0], image_id)

# Resolve every sample id to the location of its .npy file.
train['file_path'] = train['id'].apply(get_train_file_path)
# test['file_path'] = test['id'].apply(get_test_file_path)

# Notebook preview of the first rows, including the new column.
display(train.head())

In [None]:
def make_folds(train_csv: pd.DataFrame, config) -> pd.DataFrame:
    """Return a copy of ``train_csv`` with a 1-based integer ``fold`` column.

    The scheme is selected by ``config['CROSS_VALIDATION']['SCHEMA']``:

    * ``"StratifiedKFold"``: shuffled, stratified on the target column and
      seeded with ``config['SEED']``.
    * ``"GroupKfold"`` / ``"GroupKFold"``: grouped by the column named in
      ``config['CROSS_VALIDATION']['GROUP_KFOLD_SPLIT']``.
    * anything else: no split is computed; the frame is returned as copied
      and is expected to carry its own ``fold`` column.

    The per-fold class counts are printed; if they cannot be computed because
    a column is missing, the frame itself is displayed instead.

    Args:
        train_csv: Training table. It is not modified.
        config: Nested configuration dict with the keys
            ``CROSS_VALIDATION`` (``SCHEMA``, ``NUM_FOLDS``), ``DATA``
            (``IMAGE_COL_NAME``, ``TARGET_COL_NAME``) and ``SEED``.

    Returns:
        The copied DataFrame, with ``fold`` assigned when a scheme was applied.
    """
    # TODO: add options for cv_scheme as it is cumbersome here.
    cv_config = config['CROSS_VALIDATION']
    schema = cv_config['SCHEMA']
    target_col = config['DATA']['TARGET_COL_NAME']
    df_folds = train_csv.copy()

    # The config is a dict everywhere else in this file, so every branch uses
    # key access (the old attribute access raised AttributeError).
    if schema == "StratifiedKFold":
        splitter = StratifiedKFold(
            n_splits=cv_config['NUM_FOLDS'], shuffle=True, random_state=config['SEED']
        )
        splits = splitter.split(
            X=df_folds[config['DATA']['IMAGE_COL_NAME']], y=df_folds[target_col]
        )
    elif schema in ("GroupKfold", "GroupKFold"):
        groups = df_folds[cv_config['GROUP_KFOLD_SPLIT']].values
        splits = GroupKFold(n_splits=cv_config['NUM_FOLDS']).split(
            X=df_folds, y=df_folds[target_col], groups=groups
        )
    else:  # No CV schema used in this file, but a custom one
        splits = None

    if splits is not None:
        # The splitters yield POSITIONAL row indices, so the fold ids are
        # collected in a positional array instead of being written through
        # label-based .loc (which is only right for a default RangeIndex).
        fold_ids = np.zeros(len(df_folds), dtype=int)
        for fold, (_, val_idx) in enumerate(splits, start=1):
            fold_ids[val_idx] = fold
        df_folds["fold"] = fold_ids

    try:
        print(df_folds.groupby(["fold", target_col]).size())
    except KeyError:
        # "fold" or the target column is missing: show the frame instead.
        display(df_folds)

    return df_folds

In [None]:
df_folds = make_folds(train, config)

# Pull out the rows of each fold, then stack them in fold order (1, 2, 3, 4)
# so the row order of folds_df is grouped by fold.
F1, F2, F3, F4 = (df_folds[df_folds['fold'] == fold_id] for fold_id in (1, 2, 3, 4))
folds_df = pd.concat([F1, F2, F3, F4]).reset_index(drop=True)

# Ground-truth table (id + label) in that same row order; ForwardEnsemble reads it.
y_true_df = folds_df[['id', 'target']]

In [None]:

# my_folds_sequence = folds_df['id'].values
# oof_991 = pd.read_csv("../input/forwardensemble/oof_991.csv")
# oof_991_dict = dict(zip(oof_991.id, oof_991.target))        
# sorted_arr = []
# for ids in my_folds_sequence:
#     # print(ids)
#     sorted_arr.append(oof_991_dict[ids])
# print(roc_auc_score(folds_df['target'].values, sorted_arr))
# oof_991['target'] = sorted_arr
# oof_991.to_csv("oof_91.csv", index=False)

# Forward Ensembling

In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from typing import List
from sklearn.metrics import roc_auc_score


class ForwardEnsemble:
    """Greedy forward selection of out-of-fold (OOF) predictions by AUC.

    Starting from the single best OOF model, each round tries to blend one
    more model into the current ensemble as
    ``w * candidate + (1 - w) * ensemble`` for ``w`` on a grid of
    ``weight_interval`` steps in ``[0, 1)``. The (model, weight) pair with the
    highest AUC is kept, and the search stops once the gain of a round is not
    larger than ``min_increase``.

    Args:
        dir: Directory whose file names containing ``"oof"`` define how many
            models take part (``num_oofs``).
        oof: List of OOF DataFrames, one per model, in the sorted order of
            those file names. Every frame must contain ``pred_column_names``.
        weight_interval: Number of grid steps used for the blend weight.
        patience: Number of non-improving grid steps tolerated per candidate
            before its weight search stops. The counter is cumulative; it is
            not reset when a step improves.
        min_increase: Minimum AUC gain required to keep adding models.
        target_column_names: Names of the target columns; only their count
            is used (``col_len``).
        pred_column_names: Prediction columns read from every OOF frame.
        y_true: Ground-truth labels aligned row by row with the OOF frames.
            When omitted, ``y_true_df['target'].values`` from the enclosing
            notebook/module is used, as before.

    Raises:
        ValueError: If fewer OOF frames are passed than OOF files exist in
            ``dir`` (or none at all).
    """

    def __init__(
        self,
        dir: str,
        oof: List[pd.DataFrame],
        weight_interval: int,
        patience: int,
        min_increase: float,
        target_column_names: List[str],
        pred_column_names: List[str],
        y_true=None,
    ):
        super().__init__()
        self.dir = dir
        self.oof_list = np.sort([f for f in os.listdir(dir) if "oof" in f])
        self.num_oofs = len(self.oof_list)

        if len(oof) < max(self.num_oofs, 1):
            raise ValueError(
                "Got %d OOF frame(s) but %r contains %d OOF file(s)"
                % (len(oof), dir, self.num_oofs)
            )

        self.oof = oof
        self.weight_interval = weight_interval
        self.patience = patience
        self.min_increase = min_increase
        self.target_column_names = target_column_names
        self.pred_column_names = pred_column_names

        self.col_len = len(target_column_names)
        self.num_test_images = len(oof[0])

        # Ground truth: explicit argument, else the notebook-level y_true_df.
        if y_true is None:
            y_true = y_true_df['target'].values
        self.y_true = y_true

        # Column block k (width col_len) holds the predictions of model k.
        self.all_oof_preds = np.zeros(
            (self.num_test_images, self.num_oofs * self.col_len)
        )
        for k in range(self.num_oofs):
            start = k * self.col_len
            self.all_oof_preds[:, start : start + self.col_len] = oof[k][
                pred_column_names
            ].values

        print(self.all_oof_preds)
        print(self.num_oofs)

        # Best candidate of the current round (score, model index, weight).
        self.model_i_score, self.model_i_index, self.model_i_weights = 0, 0, 0

    def __len__(self):
        """Return the number of target columns (1 for a binary problem)."""
        return self.col_len

    def _model_preds(self, k):
        """Return the ``(n_rows, col_len)`` prediction block of model ``k``."""
        start = int(k * self.col_len)
        return self.all_oof_preds[:, start : start + self.col_len]

    def _blend(self, chosen_model, optimal_weights):
        """Rebuild the ensemble from the chosen models and their weights."""
        blended = self._model_preds(chosen_model[0])
        for weight, k in zip(optimal_weights, chosen_model[1:]):
            blended = weight * self._model_preds(k) + (1 - weight) * blended
        return blended

    def _best_weight_for(self, candidate, curr_model):
        """Grid-search the blend weight of ``candidate``; return (auc, weight)."""
        best_score, best_weight, patience_counter = 0, 0, 0
        for j in range(self.weight_interval):
            weight = j / self.weight_interval
            auc = self.macro_multilabel_auc(
                self.y_true, weight * candidate + (1 - weight) * curr_model
            )
            if auc > best_score:
                best_score, best_weight = auc, weight
            else:
                patience_counter += 1
            # Give up on this candidate after too many non-improving steps.
            if patience_counter > self.patience:
                break
        return best_score, best_weight

    def macro_multilabel_auc(self, label, pred):
        """Return ``roc_auc_score(label, pred)`` (also used for binary AUC)."""
        return roc_auc_score(label, pred)

    def compute_best_oof(self):
        """Score every model alone and return ``(best_auc, best_index)``.

        Raises:
            ValueError: If there is no OOF model to score.
        """
        scores = []
        for k in range(self.num_oofs):
            auc = self.macro_multilabel_auc(self.y_true, self._model_preds(k))
            scores.append(auc)
            print("Model %i has OOF AUC = %.4f" % (k, auc))
        if not scores:
            raise ValueError("No OOF files found in %r" % (self.dir,))
        return np.max(scores), np.argmax(scores)

    def forward_ensemble(self):
        """Run the greedy selection.

        Returns:
            ``(chosen_model, optimal_weights)``: the model indices in the
            order they were added (the first one is the best single model)
            and, for every model after the first, the weight it was blended
            in with.
        """
        DUPLICATES = False
        old_best_auc, best_oof_index = self.compute_best_oof()
        chosen_model = [best_oof_index]
        optimal_weights = []
        for _ in range(self.num_oofs):
            curr_model = self._blend(chosen_model, optimal_weights)

            # Start every round from scratch so a previous round (or a
            # previous call of this method) cannot leak its best candidate.
            self.model_i_score, self.model_i_index, self.model_i_weights = 0, 0, 0

            print("Searching for best model to add")

            # try add each model
            for i in range(self.num_oofs):
                print(i, ", ", end="")
                if not DUPLICATES and (i in chosen_model):
                    continue
                best_score, best_weight = self._best_weight_for(
                    self._model_preds(i), curr_model
                )
                if best_score > self.model_i_score:
                    self.model_i_score = best_score
                    self.model_i_index = i
                    self.model_i_weights = best_weight

            increment = self.model_i_score - old_best_auc
            if increment <= self.min_increase:
                print("No more significant increase")
                break
            # DISPLAY RESULTS
            print()
            print(
                "Ensemble AUC = %.4f after adding model %i with weight %.3f. Increase of %.4f"
                % (
                    self.model_i_score,
                    self.model_i_index,
                    self.model_i_weights,
                    increment,
                )
            )
            print()

            old_best_auc = self.model_i_score
            chosen_model.append(self.model_i_index)
            optimal_weights.append(self.model_i_weights)
            print(chosen_model)
        return chosen_model, optimal_weights


# Script part of the cell: load every OOF / submission CSV from PATH, run the
# forward selection once per (target, prediction) column pair and rebuild the
# blended OOF predictions. The names assigned here (PATH, FILES, OOF, OOF_CSV,
# target_cols, pred_cols, m, w, md, df, ...) are module-level and are read by
# the cells that follow.
if __name__ == "__main__":
    PATH = "../input/forwardensemble"
    FILES = os.listdir(PATH)
    # File names containing "oof", sorted so index k means the same model everywhere.
    OOF = np.sort([f for f in FILES if "oof" in f])
    OOF_CSV = [pd.read_csv(os.path.join(PATH,k)) for k in OOF]
    
    print("We have %i oof files..." % len(OOF))
    print()
    print(OOF)
    # Submission files are loaded here but only used by the commented-out
    # code at the end of this block.
    SUB = np.sort([f for f in FILES if "sub" in f])
    SUB_CSV = [pd.read_csv(os.path.join(PATH,k)) for k in SUB]

    print("We have %i submission files..." % len(SUB))
    print()
    print(SUB)
    target_cols = [
        "target"
    ]

    pred_cols = [
        "target"
    ]
    # One independent forward selection per column pair (a single pair here).
    for i,j in zip(target_cols, pred_cols):
        print(i,j)
        _target_cols = [i]
        _pred_cols = [j]
        forward_ens = ForwardEnsemble(
            dir=PATH,
            oof=OOF_CSV,
            weight_interval=1000, # 200
            patience=20, # 10
            min_increase=0.0003, # 0.00003
            target_column_names=_target_cols,
            pred_column_names=_pred_cols,
        )
        # m: chosen model indices (first = best single model); w: blend weight
        # of every model after the first.
        m, w = forward_ens.forward_ensemble()
        

        # x: one block of len(_pred_cols) columns per OOF file, in OOF order.
        x = np.zeros(( len(OOF_CSV[0]), len(OOF)*len(_pred_cols)))
        for k in range(len(OOF)):
            x[:, int(k*len(_pred_cols)):int((k+1)*len(_pred_cols))] = OOF_CSV[k][_pred_cols].values    
            
        _target_cols = [i]
        _pred_cols = [j]
        # Replay the selection: start from model m[0], then fold in each later
        # model with its weight.
        md = x[:, int(m[0]*len(_pred_cols)):int((m[0]+1)*len(_pred_cols))]
        # NOTE: this loop rebinds `i` (the outer loop variable) to an integer.
        for i, k in enumerate(m[1:]):
            md = w[i]*x[:, int(k*len(_pred_cols)):int((k+1)*len(_pred_cols))] + (1-w[i])*md
            
        plt.hist(md,bins=100)
        plt.title('Ensemble OOF predictions')
        plt.show()
        # Copy of the first OOF frame with its target column(s) replaced by the blend.
        df = OOF_CSV[0].copy()
        df[_target_cols] = md

# Submission blending kept for reference (disabled; done in later cells):
#         y = np.zeros((len(SUB_CSV[0]), len(SUB) * len(_pred_cols)))
#         for k in range(len(SUB)):
#             y[:, int(k * len(_pred_cols)) : int((k + 1) * len(_pred_cols))] = SUB_CSV[k][
#                 _target_cols
#             ].values

#         md2 = y[:, int(m[0] * len(_pred_cols)) : int((m[0] + 1) * len(_pred_cols))]
#         for i, k in enumerate(m[1:]):
#             md2 = (
#                 w[i] * y[:, int(k * len(_pred_cols)) : int((k + 1) * len(_pred_cols))]
#                 + (1 - w[i]) * md2
#             )
#         plt.hist(md2, bins=100)
#         plt.show()

#         df = SUB_CSV[0].copy()
#         df[_target_cols] = md2
#         df.to_csv("ensemble_sub.csv", index=False)
#         df.head()

In [None]:
# Report the selected model indices (m) and the blend weights (w) returned by
# forward_ensemble(); w has one entry for every model after the first.
print('We are using models',m)
print('with weights',w)
# print('and achieve ensemble AUC = %.4f'%old)

In [None]:
# Notebook cell output: show the blended OOF prediction array.
md

In [None]:
# Save the blended OOF predictions, using the first OOF frame as the template.
df = OOF_CSV[0].copy()
# `df.pred = md` only updates a column that already exists; without a "pred"
# column it sets a plain Python attribute and the CSV would still hold model
# 0's own predictions. Write through column indexing instead: keep "pred"
# when the frame has it, otherwise use the prediction column(s).
oof_pred_cols = ['pred'] if 'pred' in df.columns else pred_cols
df[oof_pred_cols] = md
df.to_csv('ensemble_oof.csv',index=False)
df.head()

# Load SUB Files

In [None]:
# Submission files: every name in FILES containing 'sub', in sorted order,
# each read into its own DataFrame.
SUB = np.sort([name for name in FILES if 'sub' in name])
SUB_CSV = [pd.read_csv(os.path.join(PATH, name)) for name in SUB]

print('We have %i submission files...'%len(SUB))
print()
print(SUB)

In [None]:
# VERIFY THAT SUBMISSION FILES MATCH OOF FILES
# The integer between the first '_' and the following '.' of each file name
# is taken as the model id; both lists must carry the same ids in order.
a = np.array([int(name.split('_')[1].split('.')[0]) for name in SUB])
b = np.array([int(name.split('_')[1].split('.')[0]) for name in OOF])
if len(a) == len(b):
    for k, (sub_id, oof_id) in enumerate(zip(a, b)):
        if sub_id != oof_id:
            print('ERROR submission files dont match oof files')
else:
    print('ERROR submission files dont match oof files')

In [None]:
# Submission matrix: one block of len(pred_cols) columns per submission file,
# laid out in SUB order.
cols_per_model = len(pred_cols)
y = np.zeros((len(SUB_CSV[0]), len(SUB) * cols_per_model))
for k in range(len(SUB)):
    start = k * cols_per_model
    y[:, start:start + cols_per_model] = SUB_CSV[k][target_cols].values

# Build SUB Ensemble

In [None]:
# Rebuild the submission matrix (one column block per submission file) ...
width = len(pred_cols)
y = np.zeros((len(SUB_CSV[0]), len(SUB) * width))
for k in range(len(SUB)):
    y[:, k * width:(k + 1) * width] = SUB_CSV[k][target_cols].values

# ... and apply the models / weights found on the OOF predictions: start from
# model m[0], then blend in every later model with its weight.
first = int(m[0] * width)
md2 = y[:, first:first + width]
for weight, k in zip(w, m[1:]):
    block_start = int(k * width)
    md2 = weight * y[:, block_start:block_start + width] + (1 - weight) * md2

plt.hist(md2, bins=100)
plt.show()

In [None]:
# Write the blended test predictions, using the first submission file as the
# template (its target column(s) are replaced by the blend).
df = SUB_CSV[0].copy()
df[target_cols] = md2
df.to_csv('submission.csv',index=False)
df.head()