In [None]:
import os
import sys
import shutil
import cv2
import numpy as np
import pandas as pd
import itertools
from tqdm.notebook import tqdm

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

import torch
from torch import nn, utils, optim
from torchvision import transforms, models
sys.path.append("../input/efficientnet-pytorch/EfficientNet-PyTorch/EfficientNet-PyTorch-master")
from efficientnet_pytorch import model as enet

# Initial settings

In [None]:
# Ask cuDNN for deterministic kernels and turn off its benchmark mode so
# that repeated runs are comparable.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Training is repeated many times while tuning parameters, so seeding is
# wrapped in a function that resets the RNGs to the same state before each run.
def init_seed(seed=0):
    """Reset the NumPy and PyTorch random number generators.

    Called before each training run so that every hyper-parameter
    combination starts from the same random state.

    Args:
        seed: Seed applied to both generators. Defaults to 0, the value
            that used to be hard-coded, so existing calls are unchanged.
    """
    np.random.seed(seed)
    torch.manual_seed(seed)

In [None]:
# Run configuration
USE_DEVICE = 'cuda:0' if torch.cuda.is_available() else 'cpu'
BATCH_SIZE = 16
BATCH_SIZE_VALID = 4
NUM_WORKERS = 2
NUM_EPOCHS = 3 # epochs trained per parameter combination during the search
LR_TESTS = [5e-3,1e-4,5e-4] # learning rates tried by the parameter search
WEIGHT_TESTS = [0.5,0.7,0.9] # loss weight given to class 1 (class 0 gets 1 - weight) to compensate for class imbalance
USE_TRAIN_SUBSET = True # for test run: MyDataset keeps only a 10% sample of the labelled rows

# Data model with data augmentation (DA)

In [None]:
# Labelled rows. The first 500 rows of each class are held out for scoring
# (df_test); everything after them is used for training (df_train).
df_origin = pd.read_csv("../input/seti-breakthrough-listen/train_labels.csv")
df_test = pd.concat([
    df_origin[df_origin.target == 1].iloc[:500],
    df_origin[df_origin.target == 0].iloc[:500],
])
df_train = pd.concat([
    df_origin[df_origin.target == 1].iloc[500:],
    df_origin[df_origin.target == 0].iloc[500:],
])
# Rows to predict for the submission. Note the naming: "valid" here is the
# sample-submission set, not a validation split.
df_valid = pd.read_csv("../input/seti-breakthrough-listen/sample_submission.csv")

In [None]:
class MyDataset:
    """Map-style dataset yielding ``(signal, label)`` tensor pairs.

    The constructor flags select the source rows:

    * ``valid=True`` -> ``df_valid`` (submission rows, files under ``test/``)
    * ``test=True``  -> ``df_test`` (held-out labelled rows)
    * neither        -> ``df_train``

    Random augmentation is applied to training rows only, so scores
    computed on the held-out rows are repeatable and comparable between
    epochs and parameter sets.
    """

    def __init__(self, test=False, valid=False):
        """Pick the source frame and, for labelled data, optionally sub-sample it.

        Args:
            test: Use the held-out labelled rows instead of the training rows.
            valid: Use the submission rows; takes precedence over ``test``.
        """
        df = df_valid if valid else (df_test if test else df_train)
        if (not valid) and USE_TRAIN_SUBSET:
            # Quick-run mode: keep a fixed 10% sample of the labelled rows.
            _, df = train_test_split(df, test_size=0.1, random_state=0)
        self.df = df
        self.valid = valid  # is prediction?
        # Augment only when actually training, never when scoring/predicting.
        self.augment = not (valid or test)

    def __len__(self):
        """Return the number of rows in the selected frame."""
        return len(self.df)

    def __getitem__(self, pos):
        """Load the signal of row ``pos`` and return it with its label.

        Returns:
            Tuple ``(signal, label)``: the array stored in the row's
            ``.npy`` file as a float32 tensor, and the row's ``target``
            value as an int64 scalar tensor.
        """
        row = self.df.iloc[pos]
        _id = row["id"]
        # Submission signals live under test/, labelled ones under train/.
        folder = "test" if self.valid else "train"
        fn = f"../input/seti-breakthrough-listen/{folder}/{_id[0]}/{_id}.npy"
        lb = int(row["target"])
        arr = np.load(fn)
        if self.augment and np.random.random() < 0.2:  # DA on 20% of draws
            arr = self._time_shift(arr)
        return torch.tensor(arr, dtype=torch.float32), torch.tensor(lb, dtype=torch.int64)

    @staticmethod
    def _time_shift(arr):
        """Circularly shift ``arr`` along its last axis by a random offset.

        The last ``shift`` samples are moved to the front and the rest is
        moved behind them; a linear ramp is subtracted from each moved
        segment. ``shift`` is drawn from ``[50, width - 1]``.
        The original author treats the last axis as time — TODO confirm.
        """
        width = arr.shape[2]
        shift = np.random.randint(width - 50) + 50
        out = arr.copy()
        # One ramp per leading-axes position, built in a single vectorised
        # call instead of a Python loop over every row. Ramps are computed
        # in float64 and cast back to arr's dtype on assignment into `out`.
        head_ramp = np.linspace(
            0.0, (arr[:, :, -1] - arr[:, :, -shift]).astype(np.float64), shift, axis=-1)
        tail_ramp = np.linspace(
            0.0, (arr[:, :, shift] - arr[:, :, 0]).astype(np.float64), width - shift, axis=-1)
        out[:, :, :shift] = arr[:, :, -shift:] - head_ramp
        out[:, :, shift:] = arr[:, :, :-shift] - tail_ramp
        return out

# PyTorch model

In [None]:
def get_model():
    """Build an EfficientNet-B0 adapted to 6-channel input and 2 output classes.

    The backbone is loaded from the bundled weight file; the input stem
    and the final linear layer are then replaced by freshly initialised
    modules, and the network is moved to ``USE_DEVICE``.
    """
    net = enet.EfficientNet.from_name('efficientnet-b0')
    pretrained = torch.load('../input/efficientnet-pytorch/efficientnet-b0-08094119.pth')
    net.load_state_dict(pretrained)
    # NOTE(review): this stem uses nn.Conv2d defaults for stride and padding;
    # confirm that is intended rather than mirroring the stem it replaces.
    net._conv_stem = nn.Conv2d(6, 32, kernel_size=3, bias=False)
    # Two logits: class 0 / class 1.
    net._fc = nn.Linear(1280, 2)
    return net.to(USE_DEVICE)

def get_optim(model, lr):
    """Return an Adam optimizer over all parameters of ``model``.

    Args:
        model: Module whose parameters are optimised.
        lr: Learning rate passed to Adam.
    """
    return optim.Adam(model.parameters(), lr=lr)

def get_loss(weight):
    """Return a class-weighted cross-entropy loss for the two-class problem.

    Args:
        weight: Weight of class 1; class 0 receives ``1.0 - weight``.
    """
    class_weights = torch.tensor([1.0 - weight, weight], dtype=torch.float)
    return nn.CrossEntropyLoss(weight=class_weights.to(USE_DEVICE))

def get_score(true_valid, pred_valid):
    """Return the ROC AUC of ``pred_valid`` scores against ``true_valid`` labels."""
    auc = roc_auc_score(true_valid, pred_valid)
    return auc

# Make dataset

In [None]:
# Training rows and held-out labelled rows (see MyDataset for the selection).
train_ds = MyDataset(test=False)
test_ds = MyDataset(test=True)
# Training batches are reshuffled every epoch.
data_loader = utils.data.DataLoader(
    train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS)
# Held-out batches keep a fixed order and use the smaller batch size.
data_loader_v = utils.data.DataLoader(
    test_ds, batch_size=BATCH_SIZE_VALID, shuffle=False, num_workers=NUM_WORKERS)

# Training: search for the best parameters

In [None]:
# Grid search over (learning rate, class weight). For every combination the
# model is trained for NUM_EPOCHS epochs, scored on the held-out loader after
# each epoch, and the checkpoint of its best epoch is kept.
best_scores = [] # best-epoch score of each parameter set, indexed by t

# Scratch directory for checkpoints
if not os.path.isdir('tmp'):
    os.mkdir('tmp')

for t, (lr, weight) in enumerate(itertools.product(LR_TESTS, WEIGHT_TESTS)):
    init_seed() # same starting random state for every parameter set
    model = get_model() # fresh model for this parameter set

    optimizer = get_optim(model, lr)
    loss = get_loss(weight) # class-weighted loss function

    print(f'test #{t} lr={lr} weight={weight}') # announce the parameter set being tried
    scores = [] # held-out score after each epoch of this parameter set

    for epoch in tqdm(range(NUM_EPOCHS)):
        total_loss = [] # per-batch training losses of this epoch
        model.train() # switch the model to training mode

        # Training pass
        for X, y in data_loader:
            X = X.to(USE_DEVICE)
            y = y.to(USE_DEVICE)

            losses = loss(model(X), y)

            optimizer.zero_grad()
            losses.backward()
            optimizer.step()

            total_loss.append(losses.detach().cpu().numpy())

        # Scoring pass on the held-out loader, without gradient tracking
        with torch.no_grad():
            total_loss_v = [] # per-batch held-out losses
            true_valid = [] # true labels, in loader order
            pred_valid = [] # predicted probability of class 1, in loader order

            model.eval() # switch the model to evaluation mode
            for i, (X, y) in enumerate(data_loader_v):
                X = X.to(USE_DEVICE)
                y = y.to(USE_DEVICE)

                res = model(X)
                losses = loss(res, y)

                y = y.detach().cpu().numpy()
                true_valid.extend(y.tolist())

                # Turn the two logits into probabilities and keep class 1
                res = torch.softmax(res, axis=1)
                res = res.detach().cpu().numpy()
                pred_valid.extend(res[:,1].tolist())

                total_loss_v.append(losses.detach().cpu().numpy())

        # Summarise the epoch; every epoch is scored so the best one can be picked
        total_loss = np.mean(total_loss)
        total_loss_v = np.mean(total_loss_v)
        score = get_score(true_valid, pred_valid)
        scores.append(score) # scores in this parameter set
        print(f'epoch #{epoch}: train_loss:{total_loss} valid_loss:{total_loss_v} score:{score}')
        # Per-epoch checkpoint; these file names are reused by the next parameter set
        torch.save(model.state_dict(), f'tmp/checkpoint{epoch}.pth') # save model

    # Keep the checkpoint of the best-scoring epoch under a per-test name
    # before the per-epoch files are overwritten.
    best_epoch = np.argmax(scores)
    shutil.copyfile(f'tmp/checkpoint{best_epoch}.pth',f'tmp/test{t}_best.pth')
    best_scores.append(scores[best_epoch])

    # Drop references from this parameter set, then release cached GPU memory
    del model, optimizer, loss, X, y, res, losses
    torch.cuda.empty_cache()

In [None]:
# Drop the datasets and loaders used by the parameter search
del train_ds, test_ds, data_loader, data_loader_v

# Load the model trained with the best parameters

In [None]:
# Index of the parameter set whose best epoch scored highest; its checkpoint
# was saved as tmp/test{index}_best.pth by the search loop.
best_of_best = np.argmax(best_scores) # best epoch in best parameter
model_name = f'tmp/test{best_of_best}_best.pth'

In [None]:
# Rebuild the architecture with get_model(), then overwrite its weights with
# the selected checkpoint, mapped onto the current device.
validmodel = get_model()
validmodel.load_state_dict(torch.load(model_name, map_location=torch.device(USE_DEVICE)))

# Make Submission

In [None]:
# Dataset and loader over the submission rows (df_valid, files under test/);
# fixed order so predictions line up with the rows of the submission file.
valid_ds = MyDataset(test=False, valid=True)
data_loader_v = utils.data.DataLoader(
    valid_ds, batch_size=BATCH_SIZE_VALID, shuffle=False, num_workers=NUM_WORKERS)

In [None]:
# Prediction: class-1 probability for every submission row, in loader order
validmodel.eval()
pred_valid = []
with torch.no_grad():
    # The labels yielded by this loader come from the sample submission's
    # target column and are not needed for inference, so they are neither
    # moved to the device nor used.
    for X, _labels in tqdm(data_loader_v, total=len(data_loader_v)):
        res = validmodel(X.to(USE_DEVICE))
        res = torch.softmax(res, dim=1)
        res = res.detach().cpu().numpy()
        pred_valid.extend(res[:, 1].tolist())

# Save Submission file

In [None]:
# Fill the sample submission's target column with the predictions and save it
csv = "../input/seti-breakthrough-listen/sample_submission.csv"
df = pd.read_csv(csv).assign(target=pred_valid)
df.to_csv("submission.csv", index=False)

# Train on all data with the parameters found

In [None]:
# MyDataset reads these globals when it is constructed, so changing them
# here affects the datasets created from now on.
df_train = df_origin # train on every labelled row, including the previously held-out ones
USE_TRAIN_SUBSET = False # disable the 10% sub-sampling inside MyDataset

In [None]:
# Training dataset and shuffled loader rebuilt over the full labelled data
train_ds = MyDataset(test=False)
data_loader = utils.data.DataLoader(
    train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS)

In [None]:
# Final training run on all labelled data.
# NOTE(review): the values below are hard-coded rather than read from the
# search results; confirm they match the best parameter set found.
lr = 0.0005 # learning rate chosen from the search
weight = 0.5 # class-1 loss weight chosen from the search
NUM_EPOCHS = 3 # number of epochs chosen from the search

init_seed() # same starting random state as the search runs
model = get_model() # fresh model

optimizer = get_optim(model, lr)
loss = get_loss(weight) # class-weighted loss function

print(f'train again lr={lr} weight={weight}') # train with the chosen parameters

for epoch in tqdm(range(NUM_EPOCHS)):
    total_loss = [] # per-batch training losses of this epoch
    model.train() # switch the model to training mode

    # Training pass
    for X, y in data_loader:
        X = X.to(USE_DEVICE)
        y = y.to(USE_DEVICE)

        losses = loss(model(X), y)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        total_loss.append(losses.detach().cpu().numpy())

    # No rows are held out in this run, so the mean training loss is the
    # only progress signal available.
    print(f'epoch #{epoch}: train_loss:{np.mean(total_loss)}')

# Make Submission file

In [None]:
# Predict with the model that was just trained on all labelled data
validmodel = model

In [None]:
# Prediction: class-1 probability for every row served by data_loader_v,
# in loader order
validmodel.eval()
pred_valid = []
with torch.no_grad():
    # The labels yielded by the loader are not needed for inference, so
    # they are neither moved to the device nor used.
    for X, _labels in tqdm(data_loader_v, total=len(data_loader_v)):
        res = validmodel(X.to(USE_DEVICE))
        res = torch.softmax(res, dim=1)
        res = res.detach().cpu().numpy()
        pred_valid.extend(res[:, 1].tolist())

In [None]:
# Fill the sample submission's target column with the predictions of the
# model trained on all data and save it under a second file name
csv = "../input/seti-breakthrough-listen/sample_submission.csv"
df = pd.read_csv(csv).assign(target=pred_valid)
df.to_csv("submission2.csv", index=False)