In [None]:
import cv2
import os
import glob
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import codecs
from tqdm import tqdm

In [None]:
# Load the training labels CSV and preview its first five rows.
train_labels = pd.read_csv("../input/seti-breakthrough-listen/train_labels.csv")
train_labels.head(5)

In [None]:
# Bar chart of how many training rows carry each value of the "target" column.
# NOTE(review): `ddt` holds the same counts as a frame but is not read anywhere
# else in the visible notebook.
ddt = train_labels["target"].value_counts().to_frame()
fig, ax = plt.subplots(1,1,figsize=(12,4))
sns.countplot(data = train_labels, x = "target", orient = "v", palette = "pastel", ax=ax)
plt.suptitle("Train target distribution")

> # Train set

In [None]:
# Collect every .npy file sitting one directory level below the train folder.
train_files = glob.glob("../input/seti-breakthrough-listen/train" + "/*/*.npy")
print(f"Number of train files:{len(train_files)}")

># Test set

In [None]:
# Collect every .npy file sitting one directory level below the test folder.
test_files = glob.glob("../input/seti-breakthrough-listen/test" + "/*/*.npy")
print(f"Number of test files:{len(test_files)}")

In [None]:
def get_train_filename_by_id(_id: str) -> str:
    """Return the path of the training ``.npy`` file for ``_id``.

    Files are grouped into sub-directories named after the first character
    of the id.
    """
    shard = _id[0]
    return "../input/seti-breakthrough-listen/train/{}/{}.npy".format(shard, _id)

def show_cadence(filename: str, label: int) -> None:
    """Plot the first six panels of a cadence file as a vertical stack of images.

    Args:
        filename: path of the ``.npy`` file to load.
        label: target value shown in the figure title.

    Panels are tagged "ON" / "OFF" alternately, starting with "ON"; only the
    bottom panel keeps its x ticks.
    """
    fig, axes = plt.subplots(6, 1, figsize = (16, 10))
    panels = axes.ravel()
    cadence = np.load(filename)

    for idx, panel in enumerate(panels):
        panel.imshow(cadence[idx].astype(float), interpolation='nearest', aspect='auto')
        tag = "OFF" if idx % 2 else "ON"
        panel.text(5, 100, tag, bbox={'facecolor': 'white'})
        # Every panel except the bottom one (index 5) loses its x ticks.
        if idx < 5:
            panel.set_xticks([])

    # Shared axis captions placed just outside the subplot grid.
    fig.text(0.5, -0.02, 'Frequency Range', ha='center', fontsize=18)
    fig.text(-0.02, 0.5, 'Seconds', va='center', rotation='vertical', fontsize=18)

    title = f"ID: {os.path.basename(filename)} TARGET: {label}"
    plt.suptitle(title, fontsize=18)
    fig.tight_layout()
    plt.show()

In [None]:
# Draw one random id with target==1 and one with target==0, then plot both cadences.
positive_target=train_labels.query("target==1").sample().id.item()
negative_target=train_labels.query("target==0").sample().id.item()
show_cadence(get_train_filename_by_id(positive_target), 1)
show_cadence(get_train_filename_by_id(negative_target), 0)

In [None]:
!pip install efficientnet_pytorch

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import random
import sys
from sklearn.metrics import roc_auc_score 
from sklearn.model_selection import StratifiedKFold
from efficientnet_pytorch import model as enet
from torch.utils.data import Dataset, DataLoader

In [None]:
def set_seed(seed = 0):
    '''Seed every random number generator the notebook relies on.

    Seeds NumPy's global generator, Python's ``random`` module and PyTorch
    (CPU plus all CUDA devices), and puts cuDNN into deterministic,
    non-benchmark mode so repeated runs are reproducible.

    Args:
        seed: integer seed applied to every generator.

    Returns:
        A ``np.random.RandomState`` seeded with ``seed``.
    '''
    np.random.seed(seed)
    random_state = np.random.RandomState(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # manual_seed only covers the current CUDA device; seed the others too so
    # multi-GPU runs are reproducible. This is a no-op when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # NOTE: string hashing in the running interpreter is fixed at start-up, so
    # this presumably only influences processes spawned afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed)
    return random_state

# Seed all generators once, up front, and keep the returned RandomState in `random_state`.
random_state = set_seed(2020)

In [None]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
DEVICE

> # DATASET

In [None]:
# Load one random positive cadence and look at its shape and per-panel peak magnitude.
sample_train_id = train_labels.query("target==1").sample().id.item()
sample = np.load(f"../input/seti-breakthrough-listen/train/{sample_train_id[0]}/{sample_train_id}.npy")
# Only the last expression of a cell is displayed, so print the shape explicitly.
print(sample.shape)
# Peak absolute value of each panel, shaped (n_panels, 1, 1) so it broadcasts
# against `sample`. The previous comprehension ignored its loop variable and
# repeated the global maximum for every panel.
np.abs(sample).max(axis=(1, 2), keepdims=True)

In [None]:
import albumentations as A
# Augmentation pipeline: a random shift / scale / rotate applied with probability 0.8,
# with border_mode=0 and fill values of 0.
# NOTE(review): in the visible notebook `ttransform` is only referenced from a
# commented-out line in ETDataSet.__getitem__, so it is currently inactive.
ttransform = A.Compose([
    A.ShiftScaleRotate(p=0.8, shift_limit_x=(-0.2, 0.2), shift_limit_y=(-0.2, 0.2), scale_limit=(-0.20, 0.20), rotate_limit=(-20, 20), interpolation=1, border_mode=0, value=0, mask_value=0)
#     A.RandomResizedCrop(p=1.0, height=320, width=320, scale=(0.9, 1.0), ratio=(0.75, 1.3333333333333333), interpolation=1)
])

class ETDataSet(Dataset):
    """Dataset serving one ``.npy`` file as a single-channel image plus its label.

    Each file is loaded as float32, its leading axis is stacked vertically into
    one 2-D array, that array is transposed, and a leading channel axis of
    size 1 is added.
    """

    def __init__(self, image_paths, targets):
        # Indexed in lock-step: image_paths[k] is paired with targets[k].
        self.targets = targets
        self.image_paths = image_paths

    def __len__(self):
        # The dataset length follows the label sequence.
        return len(self.targets)

    def __getitem__(self, item):
        cadence = np.load(self.image_paths[item]).astype(np.float32)
        stacked = np.vstack(cadence)
        image = np.transpose(stacked, (1, 0)).astype(np.float32)
        image = np.expand_dims(image, axis=0)
        # Augmentation hook, currently disabled:
        #     image = ttransform(image = image)["image"]
        # Normalisation, currently disabled:
        #     image = image / np.array([np.abs(image).max() for i in range(6)]).reshape(6,1,1)
        label = self.targets[item]
        return (
            torch.tensor(image, dtype=torch.float),
            torch.tensor(label, dtype=torch.float),
        )

In [None]:
# Read the training labels again into a separate frame, `df_train`, and preview it.
df_train = pd.read_csv("../input/seti-breakthrough-listen/train_labels.csv")
df_train.head(5)

In [None]:
# Add the on-disk path of each id; files live in a sub-directory named after
# the id's first character.
df_train["image_path"] = df_train["id"].apply(lambda x:f"../input/seti-breakthrough-listen/train/{x[0]}/{x}.npy")
df_train.head(5)

In [None]:
# df_train_pos = df_train[df_train["target"] == 1]
# df_train_pos_arr = []
# for i in range(df_train_pos.shape[0]):
#     df_train_pos_arr.append(np.load(df_train_pos["image_path"].iloc[i]))
# df_train_pos_arr = np.array(df_train_pos_arr)

In [None]:
# import os
# diff = df_train[df_train["target"] == 0].shape[0] - df_train_pos.shape[0]

# savePath = "./new_pos"
# if not os.path.exists(savePath):
#     os.makedirs(savePath)

# total_index = df_train.shape[0]
# for i in range(diff//5):
#     index = np.random.choice(df_train_pos_arr.shape[0], 5)
#     new_pos = (df_train_pos_arr[index] * 1/5).sum(axis = 0)
#     new_file_path = f"{savePath}/{i}.npy"
#     np.save(new_file_path,new_pos)
#     df_train.loc[total_index + i] = ["1",1,new_file_path]

># Focal loss

In [None]:
class FocalLoss(nn.Module):
    """Binary focal loss computed from raw logits.

    For logit ``x`` with ``p = sigmoid(x)`` and target ``t`` the per-element loss is

        -alpha * (1 - p)**gamma * t * log(p)
        - (1 - alpha) * p**gamma * (1 - t) * log(1 - p)

    Args:
        gamma: focusing exponent applied to the modulating factors.
        alpha: weight of the positive-class term; negatives get ``1 - alpha``.
        reduction: ``"mean"`` averages the per-element loss; any other value
            sums it.
    """

    def __init__(self, gamma = 2, alpha = 0.2, reduction = "mean"):
        super(FocalLoss, self).__init__()
        self.gamma = gamma
        self.alpha = alpha
        self.reduction = reduction

    def forward(self, preds, targets):
        """Return the reduced focal loss for logits ``preds`` and labels ``targets``.

        ``targets`` is reshaped to a column (``-1, 1``) before being combined
        with ``preds``.
        """
        targets = targets.reshape(-1,1)

        pt = torch.sigmoid(preds)
        # log(sigmoid(x)) and log(1 - sigmoid(x)) == log(sigmoid(-x)) are taken
        # through logsigmoid: with saturated logits the naive torch.log(pt) /
        # torch.log(1 - pt) yields -inf, and 0 * -inf turns the loss into NaN.
        log_pt = F.logsigmoid(preds)
        log_one_minus_pt = F.logsigmoid(-preds)

        positive_term = self.alpha * (1 - pt) ** self.gamma * targets * log_pt
        negative_term = (1 - self.alpha) * pt ** self.gamma * (1 - targets) * log_one_minus_pt
        loss = -positive_term - negative_term

        if self.reduction == "mean":
            loss = torch.mean(loss)
        else:
            loss = torch.sum(loss)
        return loss

># Mixup Data

In [None]:
def mixup_data(x, y, alpha=1.0, use_cuda=True):
    """Return mixed inputs, both sets of targets, and the mixing weight.

    Args:
        x: batch of inputs; the first dimension is the batch dimension.
        y: targets indexed along the same batch dimension.
        alpha: Beta(alpha, alpha) parameter for the mixing weight; when
            ``alpha <= 0`` the weight is fixed to 1 (no mixing).
        use_cuda: kept for backward compatibility. The permutation index now
            follows the device of ``x`` / ``y``, so the flag no longer forces a
            ``.cuda()`` call (which crashed on CPU-only machines).

    Returns:
        ``(mixed_x, y, y_shuffle, lam)`` where
        ``mixed_x = lam * x + (1 - lam) * x[index]`` and ``y_shuffle = y[index]``
        for one random permutation ``index`` of the batch.
    """
    if alpha > 0:
        lam = np.random.beta(alpha, alpha)
    else:
        lam = 1

    batch_size = x.size()[0]

    # Draw the permutation on the CPU generator, then move it next to the data.
    index = torch.randperm(batch_size)

    mixed_x = lam * x + (1 - lam) * x[index.to(x.device), :]
    y_shuffle = y[index.to(y.device)]
    return mixed_x, y, y_shuffle, lam

def mixup_criterion(criterion, pred, y, y_shuffle, lam):
    """Blend the loss against both target sets of a mixed batch.

    Both target tensors are viewed as columns (``-1, 1``); the result is
    ``lam * criterion(pred, y) + (1 - lam) * criterion(pred, y_shuffle)``,
    averaged with ``.mean()``.
    """
    primary = criterion(pred, y.view(-1, 1))
    partner = criterion(pred, y_shuffle.view(-1, 1))
    blended = lam * primary + (1 - lam) * partner
    return blended.mean()

># Backbone

In [None]:
class backbone(nn.Module):
    """EfficientNet feature extractor with a 1-to-3 channel stem and a linear head.

    Args:
        pretrain: model name handed to ``EfficientNet.from_pretrained``.
        out_dim: number of outputs of the final linear layer.
    """

    def __init__(self, pretrain, out_dim):
        super().__init__()
        self.enet = enet.EfficientNet.from_pretrained(pretrain)
        # Read the width of the stock classifier before it is replaced below.
        feature_dim = self.enet._fc.in_features
        self.myfc = nn.Linear(feature_dim, out_dim)
        self.enet._fc = nn.Identity()
        # Maps the single input channel to the three channels fed to EfficientNet.
        # NOTE(review): padding=3 with a 3x3 kernel grows each spatial dimension
        # by 4; confirm this is intended rather than padding=1.
        self.conv = nn.Conv2d(in_channels=1, out_channels=3, kernel_size=3, padding=3, bias=False)

    def forward(self, x):
        # stem conv -> EfficientNet (classifier replaced by identity) -> linear head
        return self.myfc(self.enet(self.conv(x)))

># Criterion and training loop

In [None]:
# Loss used by train(): binary cross-entropy on raw logits.
# FocalLoss is kept as a commented-out alternative.
criterion = nn.BCEWithLogitsLoss()
# criterion = FocalLoss()

def train(train_loader, val_loader, model, optimizer, device, fold_idx, epoch, scheduler):
    """Train ``model`` for ``epoch`` epochs, validating and checkpointing after each.

    Each batch is trained with mixup when ``np.random.randint(1, 10) >= 5``
    (draws are 1..9) and on the plain batch otherwise. ``scheduler.step()`` is
    called once per batch. After every epoch the model is scored on
    ``val_loader`` with ROC AUC, one line is appended to ``log.out``, and the
    weights are saved to ``./efnet-<fold_idx>.pth`` whenever the score improves.

    Args:
        train_loader: iterable of ``(inputs, labels)`` training batches.
        val_loader: iterable of ``(inputs, labels)`` validation batches.
        model: network producing one logit per sample.
        optimizer: optimizer stepped once per batch.
        device: device the batches are moved to for training and validation.
        fold_idx: fold number used in the log line and checkpoint file name.
        epoch: number of epochs to run.
        scheduler: learning-rate scheduler stepped once per batch.

    Returns:
        The best validation ROC AUC observed (0 if none exceeded 0).
    """
    model.train()
    with codecs.open('log.out','a') as f:
        f.write("\n\n")
    best_roc_auc  = 0
    check_point = 1
    loss = None  # exponentially smoothed training loss shown on the progress bar

    for epoch_idx in range(epoch):
        # evaluate() leaves the model in eval mode, so training mode has to be
        # restored at the start of every epoch, not just once before the loop.
        model.train()

        process_bar = tqdm(train_loader, position=0, leave=True, desc="Training")
        for datas,labels in process_bar:
            datas = datas.to(device).float()
            labels = labels.to(device).float()

            optimizer.zero_grad()

            if np.random.randint(1, 10) >= 5:
                # Follow the batch's actual device instead of assuming CUDA.
                mixed_x, y, y_shuffle, lam = mixup_data(datas, labels, use_cuda = datas.is_cuda)

                outputs = model(mixed_x)
                loss_now = mixup_criterion(criterion, outputs, y, y_shuffle, lam)

            else:
                outputs = model(datas)
                loss_now = criterion(outputs, labels.reshape(-1,1))

            loss_value = loss_now.item()
            loss = loss_value if loss is None else 0.9 * loss + 0.1 * loss_value

            # Iterating the bar already advances it; an extra update() would
            # count every batch twice.
            process_bar.set_postfix(loss = loss)

            loss_now.backward()
            optimizer.step()
            scheduler.step()

        # Validate on the device passed in, not on a module-level global.
        preds, valid_targets = evaluate(val_loader, model, device=device)
        # evaluate() returns one single-element list per sample; flatten to 1-D scores.
        roc_auc = roc_auc_score(valid_targets, np.asarray(preds).ravel())
        print(f"valid auc is:{roc_auc}")

        with codecs.open("log.out","a") as f:
            f.write(f"Fold{fold_idx}, Epoch={epoch_idx}, Check_point={check_point}, Valid_ROC_AUC={roc_auc}\n")

        if roc_auc > best_roc_auc:
            torch.save(model.state_dict(), "./efnet-" + str(fold_idx) + ".pth")
            best_roc_auc = roc_auc

    return best_roc_auc

            
def evaluate(data_loader, model, device):
    """Run ``model`` over ``data_loader`` without gradients.

    The model is switched to eval mode and left there.

    Returns:
        ``(outputs, labels)``: two Python lists built by concatenating, batch
        by batch, the model outputs and the labels converted with ``tolist()``.
    """
    model.eval()
    collected_outputs, collected_labels = [], []

    with torch.no_grad():
        for batch, batch_labels in tqdm(data_loader, position=0, leave=True, desc="Evaluating"):
            batch_outputs = model(batch.to(device).float())

            collected_labels += batch_labels.detach().cpu().numpy().tolist()
            collected_outputs += batch_outputs.detach().cpu().numpy().tolist()

    return collected_outputs, collected_labels

In [None]:
# Training configuration: pretrained EfficientNet variant, epochs per fold, and batch size.
backbone_name = "efficientnet-b0"

epochs = 4
batch_size = 16

In [None]:
# Arrays fed to the cross-validation split: file paths (X) and their targets (Y).
X = df_train.image_path.values
Y = df_train.target.values
print(X)
print(Y)

In [None]:
# 5-fold stratified cross-validation. Folds listed in SKIP_FOLDS are not trained.
SKIP_FOLDS = (0, 1, 2)

skf = StratifiedKFold(n_splits = 5)

for fold, (train_idx, val_idx) in enumerate(skf.split(X, Y)):
    if fold in SKIP_FOLDS:
        # Skip before building anything: creating the model, datasets and
        # loaders for a fold that is never trained is wasted work.
        print("")
        continue

    model = backbone(backbone_name, out_dim=1)
    model.to(DEVICE)
    model = nn.DataParallel(model)

    train_images, valid_images = X[train_idx], X[val_idx]
    train_targets, valid_targets = Y[train_idx], Y[val_idx]

    train_dataset = ETDataSet(image_paths = train_images, targets = train_targets)
    valid_dataset = ETDataSet(image_paths = valid_images, targets = valid_targets)

    train_loader = DataLoader(dataset = train_dataset,
                             batch_size = batch_size,
                             shuffle = True,
                             num_workers=3)
    valid_loader = DataLoader(dataset = valid_dataset,
                             batch_size = batch_size,
                             shuffle = False,
                             num_workers=3)
    optimizer = torch.optim.Adam(model.parameters(), lr = 1e-4)
    # NOTE(review): base_lr (1e-4) is larger than max_lr (5e-5), so the cycle
    # runs downwards from base_lr; confirm the two bounds are not swapped.
    # `gamma` is presumably unused in "triangular2" mode - verify.
    scheduler = torch.optim.lr_scheduler.CyclicLR(optimizer, base_lr=0.0001, max_lr=5e-5, gamma=0.9, cycle_momentum=False,
                                                  step_size_up=1400,step_size_down=1400, mode="triangular2")

    train(train_loader, valid_loader, model, optimizer, DEVICE, fold, epochs, scheduler)
    print("")

In [None]:
import pandas as pd
# Build the test-set loader from the sample submission file.
submission = pd.read_csv("../input/seti-breakthrough-listen/sample_submission.csv")
# Test files live in a sub-directory named after the id's first character.
submission["img_path"] = submission["id"].apply(lambda x:f"../input/seti-breakthrough-listen/test/{x[0]}/{x}.npy")

# ETDataSet requires a targets sequence; the sample file's `target` column is
# passed here (presumably placeholder values - they are not used for scoring).
test_dataset = ETDataSet(image_paths = submission.img_path.values, targets = submission.target.values)
# shuffle=False keeps predictions in the same order as the rows of `submission`.
test_loader = DataLoader(dataset = test_dataset,
                        batch_size = 32,
                        shuffle = False,
                        num_workers = 1)

In [None]:
# Load every saved fold checkpoint and average their sigmoid probabilities on the test set.
# Sorting makes the checkpoint order deterministic across runs.
checkpoint_paths = sorted(glob.glob("../input/et-model-pth/efnet-*"))
if not checkpoint_paths:
    # Without this check the mean over an empty list silently yields NaN.
    raise FileNotFoundError("No checkpoints matched ../input/et-model-pth/efnet-*")

models = []
for path in checkpoint_paths:
    model = backbone(backbone_name, out_dim = 1)
    model.to(DEVICE)
    model = nn.DataParallel(model)
    print(path)
    # map_location lets checkpoints saved on a GPU load on whatever DEVICE is in use.
    model.load_state_dict(torch.load(path, map_location=DEVICE))
    models.append(model)

sigmoid = torch.nn.Sigmoid()
outs = []
for model in models:
    # The second return value holds the targets passed to the test dataset; it is not needed here.
    predictions, _unused_targets = evaluate(test_loader,model, DEVICE)
    # evaluate() returns one single-element list per sample; keep the logit.
    predictions = np.array(predictions)[:,0]
    out = sigmoid(torch.from_numpy(predictions))
    out = out.detach().numpy()
    outs.append(out)

# Ensemble: plain average of the per-model probabilities.
pred = np.mean(np.array(outs), axis=0)

In [None]:
# Write the averaged predictions to the submission file.
submission["target"] = pred
# Drop the helper column on a copy rather than in place, so re-running this
# cell does not fail with a KeyError on the already-removed column.
submission.drop(columns=["img_path"]).to_csv('./submission.csv', index = False)